diff options
| -rw-r--r-- | apps/cf-ai-backend/src/helper.ts | 8 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/index.ts | 8 | ||||
| -rw-r--r-- | apps/cf-ai-backend/src/utils/chunkTweet.ts | 47 | ||||
| -rw-r--r-- | apps/web/app/actions/doers.ts | 9 |
4 files changed, 53 insertions, 19 deletions
diff --git a/apps/cf-ai-backend/src/helper.ts b/apps/cf-ai-backend/src/helper.ts index 3a15ac4d..2a68879a 100644 --- a/apps/cf-ai-backend/src/helper.ts +++ b/apps/cf-ai-backend/src/helper.ts @@ -203,7 +203,7 @@ export async function batchCreateChunksAndEmbeddings({ { const commonMetaData = { type: body.type ?? "tweet", - title: body.title, + title: body.title?.slice(0, 50) ?? "", description: body.description ?? "", url: body.url, [sanitizeKey(`user-${body.user}`)]: 1, @@ -225,6 +225,7 @@ export async function batchCreateChunksAndEmbeddings({ return { pageContent: chunk, metadata: { + content: chunk, links: tweetLinks, videos: tweetVids, tweetId: tweetId, @@ -254,7 +255,7 @@ export async function batchCreateChunksAndEmbeddings({ { const commonMetaData = { type: body.type ?? "page", - title: body.title, + title: body.title?.slice(0, 50) ?? "", description: body.description ?? "", url: body.url, [sanitizeKey(`user-${body.user}`)]: 1, @@ -271,6 +272,7 @@ export async function batchCreateChunksAndEmbeddings({ return { pageContent: chunk, metadata: { + content: chunk, ...commonMetaData, ...spaceMetadata, }, @@ -290,6 +292,7 @@ export async function batchCreateChunksAndEmbeddings({ case "note": { const commonMetaData = { + title: body.title?.slice(0, 50) ?? "", type: body.type ?? "page", description: body.description ?? "", url: body.url, @@ -307,6 +310,7 @@ export async function batchCreateChunksAndEmbeddings({ return { pageContent: chunk, metadata: { + content: chunk, ...commonMetaData, ...spaceMetadata, }, diff --git a/apps/cf-ai-backend/src/index.ts b/apps/cf-ai-backend/src/index.ts index a3ac1380..1a118327 100644 --- a/apps/cf-ai-backend/src/index.ts +++ b/apps/cf-ai-backend/src/index.ts @@ -77,19 +77,19 @@ app.post("/api/add", zValidator("json", vectorObj), async (c) => { console.log(body.spaces); let chunks: TweetChunks | PageOrNoteChunks; // remove everything in <raw> tags - const newPageContent = body.pageContent?.replace(/<raw>.*?<\/raw>/g, ""); + // const newPageContent = body.pageContent?.replace(/<raw>.*?<\/raw>/g, ""); switch (body.type) { case "tweet": - chunks = chunkThread(newPageContent); + chunks = chunkThread(body.pageContent); break; case "page": - chunks = chunkPage(newPageContent); + chunks = chunkPage(body.pageContent); break; case "note": - chunks = chunkNote(newPageContent); + chunks = chunkNote(body.pageContent); break; } diff --git a/apps/cf-ai-backend/src/utils/chunkTweet.ts b/apps/cf-ai-backend/src/utils/chunkTweet.ts index 224c6c05..78f0f261 100644 --- a/apps/cf-ai-backend/src/utils/chunkTweet.ts +++ b/apps/cf-ai-backend/src/utils/chunkTweet.ts @@ -1,5 +1,6 @@ import { TweetChunks } from "../types"; import chunkText from "./chonker"; +import { getRawTweet } from "@repo/shared-types/utils"; interface Tweet { id: string; @@ -22,19 +23,43 @@ export interface ThreadTweetData { export function chunkThread(threadText: string): TweetChunks { const thread = JSON.parse(threadText); + if (typeof thread == "string") { + console.log("DA WORKER FAILED DO SOMEHTING FIX DA WROKER"); + const rawTweet = getRawTweet(thread); + const parsedTweet: any = JSON.parse(rawTweet); - const chunkedTweets = thread.map((tweet: Tweet) => { - const chunkedTweet = chunkText(tweet.text, 1536); - - const metadata = { - tweetId: tweet.id, - tweetLinks: tweet.links, - tweetVids: tweet.videos, - tweetImages: tweet.images, + const chunkedTweet = chunkText(parsedTweet.text, 1536); + const metadata: Metadata = { + tweetId: parsedTweet.id_str, + tweetLinks: parsedTweet.entities.urls.map((url: any) => url.expanded_url), + tweetVids: + parsedTweet.extended_entities?.media + .filter((media: any) => media.type === "video") + .map((media: any) => media.video_info!.variants[0].url) || [], + tweetImages: + parsedTweet.extended_entities?.media + .filter((media: any) => media.type === "photo") + .map((media: any) => media.media_url_https!) || [], }; - return { chunkedTweet, metadata }; - }); + const chunks = [{ chunkedTweet: chunkedTweet, metadata }]; + + return { type: "tweet", chunks }; + } else { + console.log(JSON.stringify(thread)); + const chunkedTweets = thread.map((tweet: Tweet) => { + const chunkedTweet = chunkText(tweet.text, 1536); + + const metadata = { + tweetId: tweet.id, + tweetLinks: tweet.links, + tweetVids: tweet.videos, + tweetImages: tweet.images, + }; + + return { chunkedTweet, metadata }; + }); - return { type: "tweet", chunks: chunkedTweets }; + return { type: "tweet", chunks: chunkedTweets }; + } } diff --git a/apps/web/app/actions/doers.ts b/apps/web/app/actions/doers.ts index da2bfb5f..eaaaafbd 100644 --- a/apps/web/app/actions/doers.ts +++ b/apps/web/app/actions/doers.ts @@ -236,7 +236,6 @@ export const createMemory = async (input: { try { const cf_thread_endpoint = process.env.THREAD_CF_WORKER; const authKey = process.env.THREAD_CF_AUTH; - const threadRequest = await fetch(cf_thread_endpoint, { method: "POST", headers: { @@ -253,6 +252,12 @@ export const createMemory = async (input: { } thread = await threadRequest.text(); + if (thread.trim().length === 2) { + console.log("Thread is an empty array"); + throw new Error( + "[THREAD FETCHING SERVICE] Got no content form thread worker", + ); + } } catch (e) { console.log("[THREAD FETCHING SERVICE] Failed to fetch the thread", e); errorOccurred = true; @@ -263,7 +268,7 @@ export const createMemory = async (input: { pageContent = tweetToMd(tweet); console.log("THis ishte page content!!", pageContent); //@ts-ignore - vectorData = errorOccurred ? pageContent : thread; + vectorData = errorOccurred ? JSON.stringify(pageContent) : thread; metadata = { baseUrl: input.content, description: tweet.text.slice(0, 200), |