diff options
Diffstat (limited to 'apps/cf-ai-backend')
| -rw-r--r-- | apps/cf-ai-backend/src/helper.ts | 8 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/index.ts | 8 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/utils/chunkTweet.ts | 47 |
3 files changed, 46 insertions, 17 deletions
diff --git a/apps/cf-ai-backend/src/helper.ts b/apps/cf-ai-backend/src/helper.ts index 3a15ac4d..2a68879a 100644 --- a/apps/cf-ai-backend/src/helper.ts +++ b/apps/cf-ai-backend/src/helper.ts @@ -203,7 +203,7 @@ export async function batchCreateChunksAndEmbeddings({ { const commonMetaData = { type: body.type ?? "tweet", - title: body.title, + title: body.title?.slice(0, 50) ?? "", description: body.description ?? "", url: body.url, [sanitizeKey(`user-${body.user}`)]: 1, @@ -225,6 +225,7 @@ export async function batchCreateChunksAndEmbeddings({ return { pageContent: chunk, metadata: { + content: chunk, links: tweetLinks, videos: tweetVids, tweetId: tweetId, @@ -254,7 +255,7 @@ export async function batchCreateChunksAndEmbeddings({ { const commonMetaData = { type: body.type ?? "page", - title: body.title, + title: body.title?.slice(0, 50) ?? "", description: body.description ?? "", url: body.url, [sanitizeKey(`user-${body.user}`)]: 1, @@ -271,6 +272,7 @@ export async function batchCreateChunksAndEmbeddings({ return { pageContent: chunk, metadata: { + content: chunk, ...commonMetaData, ...spaceMetadata, }, @@ -290,6 +292,7 @@ export async function batchCreateChunksAndEmbeddings({ case "note": { const commonMetaData = { + title: body.title?.slice(0, 50) ?? "", type: body.type ?? "page", description: body.description ?? "", url: body.url, @@ -307,6 +310,7 @@ export async function batchCreateChunksAndEmbeddings({ return { pageContent: chunk, metadata: { + content: chunk, ...commonMetaData, ...spaceMetadata, }, diff --git a/apps/cf-ai-backend/src/index.ts b/apps/cf-ai-backend/src/index.ts index a3ac1380..1a118327 100644 --- a/apps/cf-ai-backend/src/index.ts +++ b/apps/cf-ai-backend/src/index.ts @@ -77,19 +77,19 @@ app.post("/api/add", zValidator("json", vectorObj), async (c) => { console.log(body.spaces); let chunks: TweetChunks | PageOrNoteChunks; // remove everything in <raw> tags - const newPageContent = body.pageContent?.replace(/<raw>.*?<\/raw>/g, ""); + // const newPageContent = body.pageContent?.replace(/<raw>.*?<\/raw>/g, ""); switch (body.type) { case "tweet": - chunks = chunkThread(newPageContent); + chunks = chunkThread(body.pageContent); break; case "page": - chunks = chunkPage(newPageContent); + chunks = chunkPage(body.pageContent); break; case "note": - chunks = chunkNote(newPageContent); + chunks = chunkNote(body.pageContent); break; } diff --git a/apps/cf-ai-backend/src/utils/chunkTweet.ts b/apps/cf-ai-backend/src/utils/chunkTweet.ts index 224c6c05..78f0f261 100644 --- a/apps/cf-ai-backend/src/utils/chunkTweet.ts +++ b/apps/cf-ai-backend/src/utils/chunkTweet.ts @@ -1,5 +1,6 @@ import { TweetChunks } from "../types"; import chunkText from "./chonker"; +import { getRawTweet } from "@repo/shared-types/utils"; interface Tweet { id: string; @@ -22,19 +23,43 @@ export interface ThreadTweetData { export function chunkThread(threadText: string): TweetChunks { const thread = JSON.parse(threadText); + if (typeof thread == "string") { + console.log("DA WORKER FAILED DO SOMEHTING FIX DA WROKER"); + const rawTweet = getRawTweet(thread); + const parsedTweet: any = JSON.parse(rawTweet); - const chunkedTweets = thread.map((tweet: Tweet) => { - const chunkedTweet = chunkText(tweet.text, 1536); - - const metadata = { - tweetId: tweet.id, - tweetLinks: tweet.links, - tweetVids: tweet.videos, - tweetImages: tweet.images, + const chunkedTweet = chunkText(parsedTweet.text, 1536); + const metadata: Metadata = { + tweetId: parsedTweet.id_str, + tweetLinks: parsedTweet.entities.urls.map((url: any) => url.expanded_url), + tweetVids: + parsedTweet.extended_entities?.media + .filter((media: any) => media.type === "video") + .map((media: any) => media.video_info!.variants[0].url) || [], + tweetImages: + parsedTweet.extended_entities?.media + .filter((media: any) => media.type === "photo") + .map((media: any) => media.media_url_https!) || [], }; - return { chunkedTweet, metadata }; - }); + const chunks = [{ chunkedTweet: chunkedTweet, metadata }]; + + return { type: "tweet", chunks }; + } else { + console.log(JSON.stringify(thread)); + const chunkedTweets = thread.map((tweet: Tweet) => { + const chunkedTweet = chunkText(tweet.text, 1536); + + const metadata = { + tweetId: tweet.id, + tweetLinks: tweet.links, + tweetVids: tweet.videos, + tweetImages: tweet.images, + }; + + return { chunkedTweet, metadata }; + }); - return { type: "tweet", chunks: chunkedTweets }; + return { type: "tweet", chunks: chunkedTweets }; + } } |