aboutsummaryrefslogtreecommitdiff
path: root/apps/backend/src/utils
diff options
context:
space:
mode:
authorMahesh Sanikommmu <[email protected]>2025-08-16 18:50:10 -0700
committerMahesh Sanikommmu <[email protected]>2025-08-16 18:50:10 -0700
commit39003aff23d64ff1d96074d71521f6023c9bec01 (patch)
tree3f870c04b3dce315bba1b21aa2da158494e71774 /apps/backend/src/utils
parentMerge pull request #355 from supermemoryai/archive (diff)
downloadsupermemory-39003aff23d64ff1d96074d71521f6023c9bec01.tar.xz
supermemory-39003aff23d64ff1d96074d71521f6023c9bec01.zip
New Version of Supermemory Consumer App
Diffstat (limited to 'apps/backend/src/utils')
-rw-r--r--apps/backend/src/utils/chunkers.ts116
-rw-r--r--apps/backend/src/utils/cipher.ts79
-rw-r--r--apps/backend/src/utils/extractDocumentContent.ts87
-rw-r--r--apps/backend/src/utils/extractor.ts50
-rw-r--r--apps/backend/src/utils/fetchers.ts143
-rw-r--r--apps/backend/src/utils/notion.ts239
-rw-r--r--apps/backend/src/utils/tweetsToThreads.ts108
-rw-r--r--apps/backend/src/utils/typeDecider.ts41
8 files changed, 0 insertions, 863 deletions
diff --git a/apps/backend/src/utils/chunkers.ts b/apps/backend/src/utils/chunkers.ts
deleted file mode 100644
index ce345d29..00000000
--- a/apps/backend/src/utils/chunkers.ts
+++ /dev/null
@@ -1,116 +0,0 @@
-import nlp from "compromise";
-
-export default function chunkText(
- text: string,
- maxChunkSize: number,
- overlap: number = 0.2
-): string[] {
- // Pre-process text to remove excessive whitespace
- text = text.replace(/\s+/g, " ").trim();
-
- const sentences = nlp(text).sentences().out("array");
- const chunks: {
- text: string;
- start: number;
- end: number;
- metadata?: {
- position: string;
- context?: string;
- };
- }[] = [];
-
- let currentChunk: string[] = [];
- let currentSize = 0;
-
- for (let i = 0; i < sentences.length; i++) {
- const sentence = sentences[i].trim();
-
- // Skip empty sentences
- if (!sentence) continue;
-
- // If a single sentence is longer than maxChunkSize, split it
- if (sentence.length > maxChunkSize) {
- if (currentChunk.length > 0) {
- chunks.push({
- text: currentChunk.join(" "),
- start: i - currentChunk.length,
- end: i - 1,
- metadata: {
- position: `${i - currentChunk.length}-${i - 1}`,
- context: currentChunk[0].substring(0, 100), // First 100 chars for context
- },
- });
- currentChunk = [];
- currentSize = 0;
- }
-
- // Split long sentence into smaller chunks
- const words = sentence.split(" ");
- let tempChunk: string[] = [];
-
- for (const word of words) {
- if (tempChunk.join(" ").length + word.length > maxChunkSize) {
- chunks.push({
- text: tempChunk.join(" "),
- start: i,
- end: i,
- metadata: {
- position: `${i}`,
- context: "Split sentence",
- },
- });
- tempChunk = [];
- }
- tempChunk.push(word);
- }
-
- if (tempChunk.length > 0) {
- chunks.push({
- text: tempChunk.join(" "),
- start: i,
- end: i,
- metadata: {
- position: `${i}`,
- context: "Split sentence remainder",
- },
- });
- }
- continue;
- }
-
- currentChunk.push(sentence);
- currentSize += sentence.length;
-
- if (currentSize >= maxChunkSize) {
- const overlapSize = Math.floor(currentChunk.length * overlap);
- chunks.push({
- text: currentChunk.join(" "),
- start: i - currentChunk.length + 1,
- end: i,
- metadata: {
- position: `${i - currentChunk.length + 1}-${i}`,
- context: currentChunk[0].substring(0, 100),
- },
- });
-
- // Keep overlap sentences for next chunk
- currentChunk = currentChunk.slice(-overlapSize);
- currentSize = currentChunk.reduce((sum, s) => sum + s.length, 0);
- }
- }
-
- // Handle remaining sentences
- if (currentChunk.length > 0) {
- chunks.push({
- text: currentChunk.join(" "),
- start: sentences.length - currentChunk.length,
- end: sentences.length - 1,
- metadata: {
- position: `${sentences.length - currentChunk.length}-${sentences.length - 1}`,
- context: currentChunk[0].substring(0, 100),
- },
- });
- }
-
- return chunks.map((chunk) => chunk.text);
-}
diff --git a/apps/backend/src/utils/cipher.ts b/apps/backend/src/utils/cipher.ts
deleted file mode 100644
index 3ba2e905..00000000
--- a/apps/backend/src/utils/cipher.ts
+++ /dev/null
@@ -1,79 +0,0 @@
-async function encrypt(data: string, key: string): Promise<string> {
- try {
- const encoder = new TextEncoder();
- const encodedData = encoder.encode(data);
-
- const baseForIv = encoder.encode(data + key);
- const ivHash = await crypto.subtle.digest('SHA-256', baseForIv);
- const iv = new Uint8Array(ivHash).slice(0, 12);
-
- const cryptoKey = await crypto.subtle.importKey(
- "raw",
- encoder.encode(key),
- { name: "AES-GCM", length: 256 },
- false,
- ["encrypt", "decrypt"]
- );
-
- const encrypted = await crypto.subtle.encrypt(
- { name: "AES-GCM", iv: new Uint8Array(iv).buffer as ArrayBuffer },
- cryptoKey,
- encodedData
- );
-
- const combined = new Uint8Array([...iv, ...new Uint8Array(encrypted)]);
-
- // Convert to base64 safely
- const base64 = Buffer.from(combined).toString("base64");
-
- // Make URL-safe
- return base64.replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
- } catch (err) {
- console.error("Encryption error:", err);
- throw err;
- }
-}
-
-async function decrypt(encryptedData: string, key: string): Promise<string> {
- try {
- // Restore base64 padding and convert URL-safe chars
- const base64 = encryptedData
- .replace(/-/g, "+")
- .replace(/_/g, "/")
- .padEnd(
- encryptedData.length + ((4 - (encryptedData.length % 4)) % 4),
- "="
- );
-
- // Use Buffer for safer base64 decoding
- const combined = Buffer.from(base64, "base64");
- const combinedArray = new Uint8Array(combined);
-
- // Extract the IV that was used for encryption
- const iv = combinedArray.slice(0, 12);
- const encrypted = combinedArray.slice(12);
-
- // Import the same key used for encryption
- const cryptoKey = await crypto.subtle.importKey(
- "raw",
- new TextEncoder().encode(key),
- { name: "AES-GCM", length: 256 },
- false,
- ["encrypt", "decrypt"]
- );
-
- // Use the extracted IV and key to decrypt
- const decrypted = await crypto.subtle.decrypt(
- { name: "AES-GCM", iv: new Uint8Array(iv).buffer as ArrayBuffer },
- cryptoKey,
- encrypted.buffer as ArrayBuffer
- );
-
- return new TextDecoder().decode(decrypted);
- } catch (err) {
- console.error("Decryption error:", err);
- throw err;
- }
-}
-
-export { encrypt, decrypt }; \ No newline at end of file
diff --git a/apps/backend/src/utils/extractDocumentContent.ts b/apps/backend/src/utils/extractDocumentContent.ts
deleted file mode 100644
index 8b7d9256..00000000
--- a/apps/backend/src/utils/extractDocumentContent.ts
+++ /dev/null
@@ -1,87 +0,0 @@
-import * as mammoth from "mammoth";
-import { NonRetryableError } from "cloudflare:workflows";
-import { resolvePDFJS } from 'pdfjs-serverless';
-
-interface DocumentContent {
- content: string;
- error?: string;
-}
-
-export const extractDocumentContent = async (
- url: string
-): Promise<DocumentContent> => {
- try {
- const fileExtension = url.split(".").pop()?.toLowerCase();
-
- if (!fileExtension) {
- throw new Error("Invalid file URL");
- }
-
- console.log("file", fileExtension);
-
- switch (fileExtension) {
- case "pdf":
- return await extractPdfContent(url);
- case "md":
- case "txt":
- return await extractTextContent(url);
- case "doc":
- case "docx":
- return await extractWordContent(url);
- default:
- throw new NonRetryableError(`Unsupported file type: ${fileExtension}`);
- }
- } catch (error) {
- return {
- content: "",
- error: error instanceof Error ? error.message : "Unknown error occurred",
- };
- }
-};
-
-async function extractPdfContent(url: string): Promise<DocumentContent> {
- try {
- const response = await fetch(url);
- const arrayBuffer = await response.arrayBuffer();
-
- // Initialize PDF.js with serverless compatibility
- const { getDocument } = await resolvePDFJS();
-
- // Load the PDF document
- const pdf = await getDocument({
- data: arrayBuffer,
- useSystemFonts: true,
- }).promise;
-
- let fullText = "";
-
- // Extract text from each page
- for (let i = 1; i <= pdf.numPages; i++) {
- const page = await pdf.getPage(i);
- const textContent = await page.getTextContent();
- const pageText = textContent.items.map((item: any) => item.str).join(" ");
- fullText += pageText + "\n";
- }
-
- return { content: fullText };
- } catch (error) {
- console.error("Error extracting PDF content:", error);
- return {
- content: "",
- error: error instanceof Error ? error.message : "Failed to extract PDF content",
- };
- }
-}
-
-async function extractTextContent(url: string): Promise<DocumentContent> {
- const response = await fetch(url);
- const text = await response.text();
- return { content: text };
-}
-
-async function extractWordContent(url: string): Promise<DocumentContent> {
- const response = await fetch(url);
- const arrayBuffer = await response.arrayBuffer();
- const result = await mammoth.extractRawText({ arrayBuffer });
- return { content: result.value };
-}
diff --git a/apps/backend/src/utils/extractor.ts b/apps/backend/src/utils/extractor.ts
deleted file mode 100644
index f033f8e1..00000000
--- a/apps/backend/src/utils/extractor.ts
+++ /dev/null
@@ -1,50 +0,0 @@
-import { Env } from "../types";
-
-export const extractPageContent = async (content: string, env: Env) => {
- const resp = await fetch(`https://r.jina.ai/${content}`);
-
- if (!resp.ok) {
- throw new Error(
- `Failed to fetch ${content}: ${resp.statusText}` + (await resp.text())
- );
- }
-
- const metadataResp = await fetch(`https://md.dhr.wtf/metadata?url=${content}`);
-
- if (!metadataResp.ok) {
- throw new Error(
- `Failed to fetch metadata for ${content}: ${metadataResp.statusText}` +
- (await metadataResp.text())
- );
- }
-
- const metadata = await metadataResp.json() as {
- title?: string;
- description?: string;
- image?: string;
- favicon?: string;
- };
-
- const responseText = await resp.text();
-
- try {
- const json: {
- contentToVectorize: string;
- contentToSave: string;
- title?: string;
- description?: string;
- image?: string;
- favicon?: string;
- } = {
- contentToSave: responseText,
- contentToVectorize: responseText,
- title: metadata.title,
- description: metadata.description,
- image: metadata.image,
- favicon: metadata.favicon,
- };
- return json;
- } catch (e) {
- throw new Error(`Failed to parse JSON from ${content}: ${e}`);
- }
-};
diff --git a/apps/backend/src/utils/fetchers.ts b/apps/backend/src/utils/fetchers.ts
deleted file mode 100644
index 2329f48a..00000000
--- a/apps/backend/src/utils/fetchers.ts
+++ /dev/null
@@ -1,143 +0,0 @@
-import { WorkflowStep } from "cloudflare:workers";
-import { isErr, Ok } from "../errors/results";
-import { typeDecider } from "./typeDecider";
-import { Env, WorkflowParams } from "../types";
-import { unrollTweets } from "./tweetsToThreads";
-import { Tweet } from "react-tweet/api";
-import { NonRetryableError } from "cloudflare:workflows";
-import { extractPageContent } from "./extractor";
-import { extractDocumentContent } from "./extractDocumentContent";
-
-export const fetchContent = async (
- params: WorkflowParams,
- env: Env,
- step: WorkflowStep
-) => {
- const type = typeDecider(params.content);
-
- if (isErr(type)) {
- throw type.error;
- }
-
- switch (type.value) {
- case "page":
- const pageContent = await step?.do(
- "extract page content",
- async () => await extractPageContent(params.content, env)
- );
- return {
- ...pageContent,
- type: "page",
- };
-
- case "tweet":
- const tweetUrl = new URL(params.content);
- tweetUrl.search = ""; // Remove all search params
- const tweetId = tweetUrl.pathname.split("/").pop();
-
- const rawBaseTweetContent = await step.do(
- "extract tweet content",
- async () => {
- const url = `https://cdn.syndication.twimg.com/tweet-result?id=${tweetId}&lang=en&features=tfw_timeline_list%3A%3Btfw_follower_count_sunset%3Atrue%3Btfw_tweet_edit_backend%3Aon%3Btfw_refsrc_session%3Aon%3Btfw_fosnr_soft_interventions_enabled%3Aon%3Btfw_show_birdwatch_pivots_enabled%3Aon%3Btfw_show_business_verified_badge%3Aon%3Btfw_duplicate_scribes_to_settings%3Aon%3Btfw_use_profile_image_shape_enabled%3Aon%3Btfw_show_blue_verified_badge%3Aon%3Btfw_legacy_timeline_sunset%3Atrue%3Btfw_show_gov_verified_badge%3Aon%3Btfw_show_business_affiliate_badge%3Aon%3Btfw_tweet_edit_frontend%3Aon&token=4c2mmul6mnh`;
-
- const resp = await fetch(url, {
- headers: {
- "User-Agent":
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
- Accept: "application/json",
- "Accept-Language": "en-US,en;q=0.5",
- "Accept-Encoding": "gzip, deflate, br",
- Connection: "keep-alive",
- "Upgrade-Insecure-Requests": "1",
- "Cache-Control": "max-age=0",
- TE: "Trailers",
- },
- });
-
- const data = (await resp.json()) as Tweet;
- return data;
- }
- );
-
- let tweetContent: {
- text: string;
- metadata: {
- media?: string[] | undefined;
- links?: string[] | undefined;
- };
- raw: string;
- };
- const unrolledTweetContent = {
- value: [rawBaseTweetContent],
- };
- if (true) {
- console.error("Can't get thread, reverting back to single tweet");
- tweetContent = {
- text: rawBaseTweetContent.text,
- metadata: {
- media: [
- ...(rawBaseTweetContent.photos?.map((url) => url.expandedUrl) ??
- []),
- ...(rawBaseTweetContent.video?.variants[0].src ?? []),
- ],
- },
- raw: `<raw>${JSON.stringify(rawBaseTweetContent)}</raw>`,
- };
- } else {
- tweetContent = {
- text: unrolledTweetContent.value
- .map((tweet) => tweet.text)
- .join("\n"),
- metadata: {
- media: unrolledTweetContent.value.flatMap((tweet) => [
- ...tweet.videos,
- ...tweet.images,
- ]),
- links: unrolledTweetContent.value.flatMap((tweet) => tweet.links),
- },
- raw: `<raw>${JSON.stringify(rawBaseTweetContent)}</raw>`,
- };
- }
-
- // make it the same type as the page content
- const pageContentType: Awaited<ReturnType<typeof extractPageContent>> & {
- type: string;
- } = {
- contentToVectorize:
- tweetContent.text +
- "\n\nMetadata for this tweet:\n" +
- JSON.stringify(tweetContent.metadata) +
- "\n\nRaw tweet data:\n" +
- tweetContent.raw,
- contentToSave: tweetContent.raw,
- title: "",
- description: JSON.stringify(tweetContent.metadata),
- image: "",
- favicon: "",
- type: "tweet",
- };
- return pageContentType;
- case "note":
- const noteContent = {
- contentToVectorize: params.content,
- // TODO: different when using platejs
- contentToSave: params.content,
- // title is the first 30 characters of the first line
- title: params.content.split("\n")[0].slice(0, 30),
- type: "note",
- };
- return noteContent;
- case "document":
- const documentContent = await step.do(
- "extract document content",
- async () => await extractDocumentContent(params.content)
- );
- return {
- contentToVectorize: documentContent.content,
- contentToSave: documentContent.content,
- type: "document",
- };
- default:
- throw new NonRetryableError("Unknown content type");
- }
-};
diff --git a/apps/backend/src/utils/notion.ts b/apps/backend/src/utils/notion.ts
deleted file mode 100644
index ebe559e1..00000000
--- a/apps/backend/src/utils/notion.ts
+++ /dev/null
@@ -1,239 +0,0 @@
-interface PageContent {
- content: string;
- url: string;
- title: string;
- id: string;
- createdAt: string;
-}
-
-interface NotionBlock {
- type: string;
- [key: string]: any;
-}
-
-interface SearchResponse {
- results: {
- id: string;
- object: string;
- url: string;
- created_time: string;
- properties: {
- title?: {
- title: Array<{
- plain_text: string;
- }>;
- };
- Name?: {
- title: Array<{
- plain_text: string;
- }>;
- };
- };
- }[];
- next_cursor: string | undefined;
- has_more: boolean;
-}
-
-interface BlockResponse {
- results: NotionBlock[];
- next_cursor: string | undefined;
- has_more: boolean;
-}
-
-export const getAllNotionPageContents = async (
- token: string,
- onProgress: (progress: number) => Promise<void>
-): Promise<PageContent[]> => {
- const pages: PageContent[] = [];
- const NOTION_API_VERSION = "2022-06-28";
- const BASE_URL = "https://api.notion.com/v1";
- const MAX_RETRIES = 3;
- const BATCH_SIZE = 10; // Number of concurrent requests
- const PAGE_SIZE = 100; // Number of pages to fetch per search request
-
- const delay = (ms: number) =>
- new Promise((resolve) => setTimeout(resolve, ms));
-
- const notionFetch = async (
- endpoint: string,
- options: RequestInit = {},
- retries = 0
- ): Promise<any> => {
- try {
- const response = await fetch(`${BASE_URL}${endpoint}`, {
- ...options,
- headers: {
- Authorization: `Bearer ${token}`,
- "Notion-Version": NOTION_API_VERSION,
- "Content-Type": "application/json",
- ...((options.headers || {}) as Record<string, string>),
- },
- });
-
- if (response.status === 429) {
- // Rate limit error
- const retryAfter = parseInt(response.headers.get("Retry-After") || "5");
- if (retries < MAX_RETRIES) {
- await delay(retryAfter * 1000);
- return notionFetch(endpoint, options, retries + 1);
- }
- }
-
- if (!response.ok) {
- const errorText = await response.text();
- throw new Error(
- `Notion API error: ${response.statusText}\n${errorText}`
- );
- }
-
- return response.json();
- } catch (error) {
- if (retries < MAX_RETRIES) {
- await delay(2000 * (retries + 1)); // Exponential backoff
- return notionFetch(endpoint, options, retries + 1);
- }
- throw error;
- }
- };
-
- const convertBlockToMarkdown = (block: NotionBlock): string => {
- switch (block.type) {
- case "paragraph":
- return (
- block.paragraph?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("") || ""
- );
- case "heading_1":
- return `# ${block.heading_1?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- case "heading_2":
- return `## ${block.heading_2?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- case "heading_3":
- return `### ${block.heading_3?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- case "bulleted_list_item":
- return `* ${block.bulleted_list_item?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- case "numbered_list_item":
- return `1. ${block.numbered_list_item?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- case "to_do":
- const checked = block.to_do?.checked ? "x" : " ";
- return `- [${checked}] ${block.to_do?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- case "code":
- return `\`\`\`${block.code?.language || ""}\n${block.code?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n\`\`\`\n`;
- case "quote":
- return `> ${block.quote?.rich_text
- ?.map((text: any) => text.plain_text)
- .join("")}\n`;
- default:
- return "";
- }
- };
-
- const getAllBlocks = async (pageId: string): Promise<NotionBlock[]> => {
- const blocks: NotionBlock[] = [];
- let cursor: string | undefined = undefined;
-
- do {
- const endpoint = `/blocks/${pageId}/children${
- cursor ? `?start_cursor=${cursor}` : ""
- }`;
- const response = (await notionFetch(endpoint)) as BlockResponse;
- blocks.push(...response.results);
- cursor = response.next_cursor;
- } while (cursor);
-
- return blocks;
- };
-
- try {
- let hasMore = true;
- let cursor: string | undefined = undefined;
- let allPages: SearchResponse["results"] = [];
-
- // First, collect all pages
- while (hasMore) {
- const searchResponse = (await notionFetch("/search", {
- method: "POST",
- body: JSON.stringify({
- filter: {
- value: "page",
- property: "object",
- },
- sort: {
- direction: "ascending",
- timestamp: "last_edited_time",
- },
- start_cursor: cursor,
- page_size: PAGE_SIZE,
- }),
- })) as SearchResponse;
-
- allPages = [...allPages, ...searchResponse.results];
- cursor = searchResponse.next_cursor;
- hasMore = searchResponse.has_more;
-
- // Report progress for page collection (0-30%)
- const progressPercent = (allPages.length / (allPages.length + searchResponse.results.length)) * 30;
- await onProgress(progressPercent);
- }
-
- // Process pages in parallel batches
- for (let i = 0; i < allPages.length; i += BATCH_SIZE) {
- const batch = allPages.slice(i, i + BATCH_SIZE);
- const batchResults = await Promise.all(
- batch.map(async (page) => {
- try {
- const blocks = await getAllBlocks(page.id);
- const pageContent = {
- content: blocks.map(convertBlockToMarkdown).join("\n"),
- url: page.url || `https://notion.so/${page.id.replace(/-/g, "")}`,
- title:
- page.properties?.Name?.title?.[0]?.plain_text ||
- page.properties?.title?.title?.[0]?.plain_text ||
- "Untitled",
- id: page.id,
- createdAt: page.created_time,
- };
- return pageContent.content.length > 10 ? pageContent : null;
- } catch (error) {
- console.error(`Error processing page ${page.id}:`, error);
- return null;
- }
- })
- );
-
- pages.push(
- ...batchResults.filter(
- (result): result is PageContent => result !== null
- )
- );
-
- // Report progress for page processing (30-100%)
- const progressPercent = 30 + ((i + BATCH_SIZE) / allPages.length) * 70;
- await onProgress(Math.min(progressPercent, 100));
-
- // Add a small delay between batches to respect rate limits
- if (i + BATCH_SIZE < allPages.length) {
- await delay(1000);
- }
- }
-
- return pages.filter((page) => page.content.length > 10);
- } catch (error) {
- console.error("Error fetching Notion pages:", error);
- throw error;
- }
-};
diff --git a/apps/backend/src/utils/tweetsToThreads.ts b/apps/backend/src/utils/tweetsToThreads.ts
deleted file mode 100644
index 85f69b87..00000000
--- a/apps/backend/src/utils/tweetsToThreads.ts
+++ /dev/null
@@ -1,108 +0,0 @@
-import * as cheerio from "cheerio";
-import { BaseError } from "../errors/baseError";
-import { Ok, Result } from "../errors/results";
-
-interface Tweet {
- id: string;
- text: string;
- links: Array<string>;
- images: Array<string>;
- videos: Array<string>;
-}
-
-class ProcessTweetsError extends BaseError {
- constructor(message?: string, source?: string) {
- super("[Thread Proceessing Error]", message, source);
- }
-}
-
-type TweetProcessResult = Array<Tweet>;
-
-// there won't be a need for url caching right?
-export async function unrollTweets(
- url: string
-): Promise<Result<TweetProcessResult, ProcessTweetsError>> {
- const tweetId = url.split("/").pop();
- const response = await fetch(`https://unrollnow.com/status/${tweetId}`, {
- headers: {
- "User-Agent":
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
- "Cache-Control": "max-age=3600",
- },
- });
-
- if (!response.ok) {
- const error = await response.text();
- console.error(error);
- throw new Error(`HTTP error! status: ${response.status} - ${error}`);
- }
-
- const html = await response.text();
- const $ = cheerio.load(html);
- const tweets: Array<Tweet> = [];
-
- const urlRegex = /(https?:\/\/\S+)/g;
- const paragraphs = $(".mainarticle p").toArray();
-
- const processedTweets = await Promise.all(
- paragraphs.map(async (element, i) => {
- const $tweet = $(element);
- let tweetText = $tweet.text().trim();
- if (tweetText.length < 1) {
- return null;
- }
-
- if (i === paragraphs.length - 1 && tweetText.toLowerCase() === "yes") {
- return null;
- }
-
- const shortUrls = tweetText.match(urlRegex) || [];
- console.log("SHORT_URLS_LEN", shortUrls.length);
- console.log("SHORT_URLS", shortUrls);
-
- const expandedUrls = await Promise.all(shortUrls.map(expandShortUrl));
-
- tweetText = tweetText.replace(urlRegex, "").trim().replace(/\s+/g, " ");
-
- const images = $tweet
- .nextUntil("p")
- .find("img.tweetimg")
- .map((i, img) => $(img).attr("src"))
- .get();
-
- const videos = $tweet
- .nextUntil("p")
- .find("video > source")
- .map((i, vid) => $(vid).attr("src"))
- .get();
-
- return {
- id: `${tweetId}_${i}`,
- text: tweetText,
- links: expandedUrls,
- images: images,
- videos: videos,
- };
- })
- );
-
- tweets.push(
- ...processedTweets.filter((tweet): tweet is Tweet => tweet !== null)
- );
-
- return Ok(tweets);
-}
-
-async function expandShortUrl(shortUrl: string): Promise<string> {
- try {
- const response = await fetch(shortUrl, {
- method: "HEAD",
- redirect: "follow",
- });
- const expandedUrl = response.url;
- return expandedUrl;
- } catch (error) {
- console.error(`Failed to expand URL: ${shortUrl}`, error);
- return shortUrl;
- }
-}
diff --git a/apps/backend/src/utils/typeDecider.ts b/apps/backend/src/utils/typeDecider.ts
deleted file mode 100644
index 642b178e..00000000
--- a/apps/backend/src/utils/typeDecider.ts
+++ /dev/null
@@ -1,41 +0,0 @@
-import { Result, Ok, Err } from "../errors/results";
-import { BaseError } from "../errors/baseError";
-
-export type contentType = "page" | "tweet" | "note" | "document" | "notion";
-
-class GetTypeError extends BaseError {
- constructor(message?: string, source?: string) {
- super("[Decide Type Error]", message, source);
- }
-}
-export const typeDecider = (
- content: string
-): Result<contentType, GetTypeError> => {
- try {
- // if the content is a URL, then it's a page. if its a URL with https://x.com/user/status/123, then it's a tweet.
- // if it ends with .pdf etc then it's a document. else, it's a note.
- // do strict checking with regex
- if (
- content.match(/https?:\/\/(x\.com|twitter\.com)\/[\w]+\/[\w]+\/[\d]+/)
- ) {
- return Ok("tweet");
- } else if (content.match(/\.(pdf|doc|docx|txt|rtf|odt|md)/i)) {
- return Ok("document");
- } else if (
- content.match(/https?:\/\/(www\.)?notion\.so\/.*/)
- ) {
- return Ok("notion");
- } else if (
- content.match(
- /^(https?:\/\/)?(www\.)?[a-z0-9]+([-.]{1}[a-z0-9]+)*\.[a-z]{2,5}(\/.*)?$/i
- )
- ) {
- return Ok("page");
- } else {
- return Ok("note");
- }
- } catch (e) {
- console.error("[Decide Type Error]", e);
- return Err(new GetTypeError((e as Error).message, "typeDecider"));
- }
-};