about | summary | refs | log | tree | commit | diff
path: root/apps/cf-ai-backend
diff options
context:
space:
mode:
Diffstat (limited to 'apps/cf-ai-backend')
-rw-r--r-- apps/cf-ai-backend/src/helper.ts | 8
-rw-r--r-- apps/cf-ai-backend/src/index.ts | 8
-rw-r--r-- apps/cf-ai-backend/src/utils/chunkTweet.ts | 47
3 files changed, 46 insertions, 17 deletions
diff --git a/apps/cf-ai-backend/src/helper.ts b/apps/cf-ai-backend/src/helper.ts
index 3a15ac4d..2a68879a 100644
--- a/apps/cf-ai-backend/src/helper.ts
+++ b/apps/cf-ai-backend/src/helper.ts
@@ -203,7 +203,7 @@ export async function batchCreateChunksAndEmbeddings({
{
const commonMetaData = {
type: body.type ?? "tweet",
- title: body.title,
+ title: body.title?.slice(0, 50) ?? "",
description: body.description ?? "",
url: body.url,
[sanitizeKey(`user-${body.user}`)]: 1,
@@ -225,6 +225,7 @@ export async function batchCreateChunksAndEmbeddings({
return {
pageContent: chunk,
metadata: {
+ content: chunk,
links: tweetLinks,
videos: tweetVids,
tweetId: tweetId,
@@ -254,7 +255,7 @@ export async function batchCreateChunksAndEmbeddings({
{
const commonMetaData = {
type: body.type ?? "page",
- title: body.title,
+ title: body.title?.slice(0, 50) ?? "",
description: body.description ?? "",
url: body.url,
[sanitizeKey(`user-${body.user}`)]: 1,
@@ -271,6 +272,7 @@ export async function batchCreateChunksAndEmbeddings({
return {
pageContent: chunk,
metadata: {
+ content: chunk,
...commonMetaData,
...spaceMetadata,
},
@@ -290,6 +292,7 @@ export async function batchCreateChunksAndEmbeddings({
case "note":
{
const commonMetaData = {
+ title: body.title?.slice(0, 50) ?? "",
type: body.type ?? "page",
description: body.description ?? "",
url: body.url,
@@ -307,6 +310,7 @@ export async function batchCreateChunksAndEmbeddings({
return {
pageContent: chunk,
metadata: {
+ content: chunk,
...commonMetaData,
...spaceMetadata,
},
diff --git a/apps/cf-ai-backend/src/index.ts b/apps/cf-ai-backend/src/index.ts
index a3ac1380..1a118327 100644
--- a/apps/cf-ai-backend/src/index.ts
+++ b/apps/cf-ai-backend/src/index.ts
@@ -77,19 +77,19 @@ app.post("/api/add", zValidator("json", vectorObj), async (c) => {
console.log(body.spaces);
let chunks: TweetChunks | PageOrNoteChunks;
// remove everything in <raw> tags
- const newPageContent = body.pageContent?.replace(/<raw>.*?<\/raw>/g, "");
+ // const newPageContent = body.pageContent?.replace(/<raw>.*?<\/raw>/g, "");
switch (body.type) {
case "tweet":
- chunks = chunkThread(newPageContent);
+ chunks = chunkThread(body.pageContent);
break;
case "page":
- chunks = chunkPage(newPageContent);
+ chunks = chunkPage(body.pageContent);
break;
case "note":
- chunks = chunkNote(newPageContent);
+ chunks = chunkNote(body.pageContent);
break;
}
diff --git a/apps/cf-ai-backend/src/utils/chunkTweet.ts b/apps/cf-ai-backend/src/utils/chunkTweet.ts
index 224c6c05..78f0f261 100644
--- a/apps/cf-ai-backend/src/utils/chunkTweet.ts
+++ b/apps/cf-ai-backend/src/utils/chunkTweet.ts
@@ -1,5 +1,6 @@
import { TweetChunks } from "../types";
import chunkText from "./chonker";
+import { getRawTweet } from "@repo/shared-types/utils";
interface Tweet {
id: string;
@@ -22,19 +23,43 @@ export interface ThreadTweetData {
export function chunkThread(threadText: string): TweetChunks {
const thread = JSON.parse(threadText);
+ if (typeof thread == "string") {
+ console.log("DA WORKER FAILED DO SOMEHTING FIX DA WROKER");
+ const rawTweet = getRawTweet(thread);
+ const parsedTweet: any = JSON.parse(rawTweet);
- const chunkedTweets = thread.map((tweet: Tweet) => {
- const chunkedTweet = chunkText(tweet.text, 1536);
-
- const metadata = {
- tweetId: tweet.id,
- tweetLinks: tweet.links,
- tweetVids: tweet.videos,
- tweetImages: tweet.images,
+ const chunkedTweet = chunkText(parsedTweet.text, 1536);
+ const metadata: Metadata = {
+ tweetId: parsedTweet.id_str,
+ tweetLinks: parsedTweet.entities.urls.map((url: any) => url.expanded_url),
+ tweetVids:
+ parsedTweet.extended_entities?.media
+ .filter((media: any) => media.type === "video")
+ .map((media: any) => media.video_info!.variants[0].url) || [],
+ tweetImages:
+ parsedTweet.extended_entities?.media
+ .filter((media: any) => media.type === "photo")
+ .map((media: any) => media.media_url_https!) || [],
};
- return { chunkedTweet, metadata };
- });
+ const chunks = [{ chunkedTweet: chunkedTweet, metadata }];
+
+ return { type: "tweet", chunks };
+ } else {
+ console.log(JSON.stringify(thread));
+ const chunkedTweets = thread.map((tweet: Tweet) => {
+ const chunkedTweet = chunkText(tweet.text, 1536);
+
+ const metadata = {
+ tweetId: tweet.id,
+ tweetLinks: tweet.links,
+ tweetVids: tweet.videos,
+ tweetImages: tweet.images,
+ };
+
+ return { chunkedTweet, metadata };
+ });
- return { type: "tweet", chunks: chunkedTweets };
+ return { type: "tweet", chunks: chunkedTweets };
+ }
}