diff options
| author | Dhravya Shah <[email protected]> | 2024-06-17 20:17:35 -0500 |
|---|---|---|
| committer | GitHub <[email protected]> | 2024-06-17 20:17:35 -0500 |
| commit | 6aa8dc448936badbcd886b2e5fdd8d8368fb11f3 (patch) | |
| tree | a8ea78964b390e253064f81c34f4e79be2355f21 | |
| parent | Merge pull request #70 from CodeTorso/codetorso (diff) | |
| parent | include all selected spaces in the fetch call to backend (diff) | |
| download | supermemory-6aa8dc448936badbcd886b2e5fdd8d8368fb11f3.tar.xz supermemory-6aa8dc448936badbcd886b2e5fdd8d8368fb11f3.zip | |
Merge pull request #71 from Dhravya/vector-deduplication
Vector deduplication
| -rw-r--r-- | apps/cf-ai-backend/src/helper.ts | 88 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/index.test.ts | 13 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/index.ts | 13 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/types.ts | 2 | ||||
| -rw-r--r-- | apps/web/app/actions/doers.ts | 3 |
5 files changed, 82 insertions, 37 deletions
diff --git a/apps/cf-ai-backend/src/helper.ts b/apps/cf-ai-backend/src/helper.ts index 78ff86da..cef781be 100644 --- a/apps/cf-ai-backend/src/helper.ts +++ b/apps/cf-ai-backend/src/helper.ts @@ -49,7 +49,7 @@ export async function initQuery( selectedModel = openai.chat("gpt-4o"); break; } - + return { store, model: selectedModel }; } @@ -64,19 +64,46 @@ export async function deleteDocument({ c: Context<{ Bindings: Env }>; store: CloudflareVectorizeStore; }) { - const toBeDeleted = `${url}-${user}`; + const toBeDeleted = `${url}#supermemory-web`; const random = seededRandom(toBeDeleted); const uuid = random().toString(36).substring(2, 15) + random().toString(36).substring(2, 15); - await c.env.KV.list({ prefix: uuid }).then(async (keys) => { - for (const key of keys.keys) { - await c.env.KV.delete(key.name); - await store.delete({ ids: [key.name] }); + const allIds = await c.env.KV.list({ prefix: uuid }); + + if (allIds.keys.length > 0) { + const savedVectorIds = allIds.keys.map((key) => key.name); + const vectors = await c.env.VECTORIZE_INDEX.getByIds(savedVectorIds); + // We don't actually delete document directly, we just remove the user from the metadata. + // If there's no user left, we can delete the document. + const newVectors = vectors.map((vector) => { + delete vector.metadata[`user-${user}`]; + + // Get count of how many users are left + const userCount = Object.keys(vector.metadata).filter((key) => + key.startsWith("user-"), + ).length; + + // If there's no user left, we can delete the document. + // need to make sure that every chunk is deleted otherwise it would be problematic. + if (userCount === 0) { + store.delete({ ids: savedVectorIds }); + void Promise.all(savedVectorIds.map((id) => c.env.KV.delete(id))); + return null; + } + + return vector; + }); + + // If all vectors are null (deleted), we can delete the KV too. Otherwise, we update (upsert) the vectors. + if (newVectors.every((v) => v === null)) { + await c.env.KV.delete(uuid); + } else { + await c.env.VECTORIZE_INDEX.upsert(newVectors.filter((v) => v !== null)); } - }); + } } export async function batchCreateChunksAndEmbeddings({ @@ -90,15 +117,44 @@ export async function batchCreateChunksAndEmbeddings({ chunks: string[]; context: Context<{ Bindings: Env }>; }) { - const ourID = `${body.url}/#supermemory-${body.user}`; - - await deleteDocument({ url: body.url, user: body.user, c: context, store }); - + //! NOTE that we use #supermemory-web to ensure that + //! If a user saves it through the extension, we don't want other users to be able to see it. + // Requests from the extension should ALWAYS have a unique ID with the USERiD in it. + // I cannot stress this enough, important for security. + const ourID = `${body.url}#supermemory-web`; const random = seededRandom(ourID); const uuid = random().toString(36).substring(2, 15) + random().toString(36).substring(2, 15); + const allIds = await context.env.KV.list({ prefix: uuid }); + + // If some chunks for that content already exist, we'll just update the metadata to include + // the user. + if (allIds.keys.length > 0) { + const savedVectorIds = allIds.keys.map((key) => key.name); + const vectors = await context.env.VECTORIZE_INDEX.getByIds(savedVectorIds); + + // Now, we'll update all vector metadatas with one more userId and all spaceIds + const newVectors = vectors.map((vector) => { + vector.metadata = { + ...vector.metadata, + [`user-${body.user}`]: 1, + + // For each space in body, add the spaceId to the vector metadata + ...(body.spaces ?? [])?.reduce((acc, space) => { + acc[`space-${body.user}-${space}`] = 1; + return acc; + }, {}), + }; + + return vector; + }); + + await context.env.VECTORIZE_INDEX.upsert(newVectors); + return; + } + for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; const chunkId = `${uuid}-${i}`; @@ -112,11 +168,15 @@ export async function batchCreateChunksAndEmbeddings({ metadata: { title: body.title?.slice(0, 50) ?? "", description: body.description ?? "", - space: body.space ?? "", url: body.url, - user: body.user, type: body.type ?? "page", content: newPageContent, + + [`user-${body.user}`]: 1, + ...body.spaces?.reduce((acc, space) => { + acc[`space-${body.user}-${space}`] = 1; + return acc; + }, {}), }, }, ], @@ -127,6 +187,6 @@ export async function batchCreateChunksAndEmbeddings({ console.log("Docs added: ", docs); - await context.env.KV.put(uuid, ourID); + await context.env.KV.put(chunkId, ourID); } } diff --git a/apps/cf-ai-backend/src/index.test.ts b/apps/cf-ai-backend/src/index.test.ts deleted file mode 100644 index bbf66fb5..00000000 --- a/apps/cf-ai-backend/src/index.test.ts +++ /dev/null @@ -1,13 +0,0 @@ -import app from "."; - -// TODO: write more tests -describe("Test the application", () => { - it("Should return 200 response", async () => { - const res = await app.request("http://localhost/"); - expect(res.status).toBe(200); - }), - it("Should return 404 response", async () => { - const res = await app.request("http://localhost/404"); - expect(res.status).toBe(404); - }); -}); diff --git a/apps/cf-ai-backend/src/index.ts b/apps/cf-ai-backend/src/index.ts index 75a3b8e8..a4c5cbfd 100644 --- a/apps/cf-ai-backend/src/index.ts +++ b/apps/cf-ai-backend/src/index.ts @@ -87,7 +87,7 @@ app.post( .min(1, "At least one image is required") .optional(), text: z.string().optional(), - space: z.string().optional(), + spaces: z.array(z.string()).optional(), url: z.string(), user: z.string(), }), @@ -134,7 +134,7 @@ app.post( imageDescriptions.length > 1 ? `A group of ${imageDescriptions.length} images on ${body.url}` : imageDescriptions[0], - space: body.space, + spaces: body.spaces, pageContent: imageDescriptions.join("\n"), title: "Image content from the web", }, @@ -198,7 +198,9 @@ app.post( // Get the AI model maker and vector store const { model, store } = await initQuery(c, query.model); - const filter: VectorizeVectorMetadataFilter = { user: query.user }; + const filter: VectorizeVectorMetadataFilter = { + [`user-${query.user}`]: 1, + }; console.log("Spaces", spaces); // Converting the query to a vector so that we can search for similar vectors @@ -212,7 +214,7 @@ app.post( console.log("space", space); if (!space && spaces.length > 1) { // it's possible for space list to be [undefined] so we only add space filter conditionally - filter.space = space; + filter[`space-${query.user}-${space}`] = 1; } // Because there's no OR operator in the filter, we have to make multiple queries @@ -265,9 +267,6 @@ app.post( dataPoint.id.toString(), ); - // We are getting the content ID back, so that the frontend can show the actual sources properly. - // it IS a lot of DB calls, i completely agree. - // TODO: return metadata value here, so that the frontend doesn't have to re-fetch anything. const storedContent = await Promise.all( idsAsStrings.map(async (id) => await c.env.KV.get(id)), ); diff --git a/apps/cf-ai-backend/src/types.ts b/apps/cf-ai-backend/src/types.ts index 5f6d0583..417d6320 100644 --- a/apps/cf-ai-backend/src/types.ts +++ b/apps/cf-ai-backend/src/types.ts @@ -43,7 +43,7 @@ export const vectorObj = z.object({ pageContent: z.string(), title: z.string().optional(), description: z.string().optional(), - space: z.string().optional(), + spaces: z.array(z.string()).optional(), url: z.string(), user: z.string(), type: z.string().optional().default("page"), diff --git a/apps/web/app/actions/doers.ts b/apps/web/app/actions/doers.ts index f94ed8ec..6c7180d9 100644 --- a/apps/web/app/actions/doers.ts +++ b/apps/web/app/actions/doers.ts @@ -168,8 +168,7 @@ export const createMemory = async (input: { title: metadata.title, description: metadata.description, url: metadata.baseUrl, - // TODO: now, in the vector store, we are only saving the first space. We need to save all spaces. - space: storeToSpaces[0], + spaces: storeToSpaces, user: data.user.id, type, }), |