diff options
| author | Mahesh Sanikommmu <[email protected]> | 2025-08-16 18:50:10 -0700 |
|---|---|---|
| committer | Mahesh Sanikommmu <[email protected]> | 2025-08-16 18:50:10 -0700 |
| commit | 39003aff23d64ff1d96074d71521f6023c9bec01 (patch) | |
| tree | 3f870c04b3dce315bba1b21aa2da158494e71774 /apps/backend/src/utils | |
| parent | Merge pull request #355 from supermemoryai/archive (diff) | |
| download | supermemory-39003aff23d64ff1d96074d71521f6023c9bec01.tar.xz supermemory-39003aff23d64ff1d96074d71521f6023c9bec01.zip | |
New Version of Supermemory Consumer App
Diffstat (limited to 'apps/backend/src/utils')
| -rw-r--r-- | apps/backend/src/utils/chunkers.ts | 116 | ||||
| -rw-r--r-- | apps/backend/src/utils/cipher.ts | 79 | ||||
| -rw-r--r-- | apps/backend/src/utils/extractDocumentContent.ts | 87 | ||||
| -rw-r--r-- | apps/backend/src/utils/extractor.ts | 50 | ||||
| -rw-r--r-- | apps/backend/src/utils/fetchers.ts | 143 | ||||
| -rw-r--r-- | apps/backend/src/utils/notion.ts | 239 | ||||
| -rw-r--r-- | apps/backend/src/utils/tweetsToThreads.ts | 108 | ||||
| -rw-r--r-- | apps/backend/src/utils/typeDecider.ts | 41 |
8 files changed, 0 insertions, 863 deletions
diff --git a/apps/backend/src/utils/chunkers.ts b/apps/backend/src/utils/chunkers.ts deleted file mode 100644 index ce345d29..00000000 --- a/apps/backend/src/utils/chunkers.ts +++ /dev/null @@ -1,116 +0,0 @@ -import nlp from "compromise"; - -export default function chunkText( - text: string, - maxChunkSize: number, - overlap: number = 0.2 -): string[] { - // Pre-process text to remove excessive whitespace - text = text.replace(/\s+/g, " ").trim(); - - const sentences = nlp(text).sentences().out("array"); - const chunks: { - text: string; - start: number; - end: number; - metadata?: { - position: string; - context?: string; - }; - }[] = []; - - let currentChunk: string[] = []; - let currentSize = 0; - - for (let i = 0; i < sentences.length; i++) { - const sentence = sentences[i].trim(); - - // Skip empty sentences - if (!sentence) continue; - - // If a single sentence is longer than maxChunkSize, split it - if (sentence.length > maxChunkSize) { - if (currentChunk.length > 0) { - chunks.push({ - text: currentChunk.join(" "), - start: i - currentChunk.length, - end: i - 1, - metadata: { - position: `${i - currentChunk.length}-${i - 1}`, - context: currentChunk[0].substring(0, 100), // First 100 chars for context - }, - }); - currentChunk = []; - currentSize = 0; - } - - // Split long sentence into smaller chunks - const words = sentence.split(" "); - let tempChunk: string[] = []; - - for (const word of words) { - if (tempChunk.join(" ").length + word.length > maxChunkSize) { - chunks.push({ - text: tempChunk.join(" "), - start: i, - end: i, - metadata: { - position: `${i}`, - context: "Split sentence", - }, - }); - tempChunk = []; - } - tempChunk.push(word); - } - - if (tempChunk.length > 0) { - chunks.push({ - text: tempChunk.join(" "), - start: i, - end: i, - metadata: { - position: `${i}`, - context: "Split sentence remainder", - }, - }); - } - continue; - } - - currentChunk.push(sentence); - currentSize += sentence.length; - - if (currentSize >= maxChunkSize) { - const overlapSize = Math.floor(currentChunk.length * overlap); - chunks.push({ - text: currentChunk.join(" "), - start: i - currentChunk.length + 1, - end: i, - metadata: { - position: `${i - currentChunk.length + 1}-${i}`, - context: currentChunk[0].substring(0, 100), - }, - }); - - // Keep overlap sentences for next chunk - currentChunk = currentChunk.slice(-overlapSize); - currentSize = currentChunk.reduce((sum, s) => sum + s.length, 0); - } - } - - // Handle remaining sentences - if (currentChunk.length > 0) { - chunks.push({ - text: currentChunk.join(" "), - start: sentences.length - currentChunk.length, - end: sentences.length - 1, - metadata: { - position: `${sentences.length - currentChunk.length}-${sentences.length - 1}`, - context: currentChunk[0].substring(0, 100), - }, - }); - } - - return chunks.map((chunk) => chunk.text); -} diff --git a/apps/backend/src/utils/cipher.ts b/apps/backend/src/utils/cipher.ts deleted file mode 100644 index 3ba2e905..00000000 --- a/apps/backend/src/utils/cipher.ts +++ /dev/null @@ -1,79 +0,0 @@ -async function encrypt(data: string, key: string): Promise<string> { - try { - const encoder = new TextEncoder(); - const encodedData = encoder.encode(data); - - const baseForIv = encoder.encode(data + key); - const ivHash = await crypto.subtle.digest('SHA-256', baseForIv); - const iv = new Uint8Array(ivHash).slice(0, 12); - - const cryptoKey = await crypto.subtle.importKey( - "raw", - encoder.encode(key), - { name: "AES-GCM", length: 256 }, - false, - ["encrypt", "decrypt"] - ); - - const encrypted = await crypto.subtle.encrypt( - { name: "AES-GCM", iv: new Uint8Array(iv).buffer as ArrayBuffer }, - cryptoKey, - encodedData - ); - - const combined = new Uint8Array([...iv, ...new Uint8Array(encrypted)]); - - // Convert to base64 safely - const base64 = Buffer.from(combined).toString("base64"); - - // Make URL-safe - return base64.replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, ""); - } catch (err) { - console.error("Encryption error:", err); - throw err; - } -} - -async function decrypt(encryptedData: string, key: string): Promise<string> { - try { - // Restore base64 padding and convert URL-safe chars - const base64 = encryptedData - .replace(/-/g, "+") - .replace(/_/g, "/") - .padEnd( - encryptedData.length + ((4 - (encryptedData.length % 4)) % 4), - "=" - ); - - // Use Buffer for safer base64 decoding - const combined = Buffer.from(base64, "base64"); - const combinedArray = new Uint8Array(combined); - - // Extract the IV that was used for encryption - const iv = combinedArray.slice(0, 12); - const encrypted = combinedArray.slice(12); - - // Import the same key used for encryption - const cryptoKey = await crypto.subtle.importKey( - "raw", - new TextEncoder().encode(key), - { name: "AES-GCM", length: 256 }, - false, - ["encrypt", "decrypt"] - ); - - // Use the extracted IV and key to decrypt - const decrypted = await crypto.subtle.decrypt( - { name: "AES-GCM", iv: new Uint8Array(iv).buffer as ArrayBuffer }, - cryptoKey, - encrypted.buffer as ArrayBuffer - ); - - return new TextDecoder().decode(decrypted); - } catch (err) { - console.error("Decryption error:", err); - throw err; - } -} - -export { encrypt, decrypt };
\ No newline at end of file diff --git a/apps/backend/src/utils/extractDocumentContent.ts b/apps/backend/src/utils/extractDocumentContent.ts deleted file mode 100644 index 8b7d9256..00000000 --- a/apps/backend/src/utils/extractDocumentContent.ts +++ /dev/null @@ -1,87 +0,0 @@ -import * as mammoth from "mammoth"; -import { NonRetryableError } from "cloudflare:workflows"; -import { resolvePDFJS } from 'pdfjs-serverless'; - -interface DocumentContent { - content: string; - error?: string; -} - -export const extractDocumentContent = async ( - url: string -): Promise<DocumentContent> => { - try { - const fileExtension = url.split(".").pop()?.toLowerCase(); - - if (!fileExtension) { - throw new Error("Invalid file URL"); - } - - console.log("file", fileExtension); - - switch (fileExtension) { - case "pdf": - return await extractPdfContent(url); - case "md": - case "txt": - return await extractTextContent(url); - case "doc": - case "docx": - return await extractWordContent(url); - default: - throw new NonRetryableError(`Unsupported file type: ${fileExtension}`); - } - } catch (error) { - return { - content: "", - error: error instanceof Error ? error.message : "Unknown error occurred", - }; - } -}; - -async function extractPdfContent(url: string): Promise<DocumentContent> { - try { - const response = await fetch(url); - const arrayBuffer = await response.arrayBuffer(); - - // Initialize PDF.js with serverless compatibility - const { getDocument } = await resolvePDFJS(); - - // Load the PDF document - const pdf = await getDocument({ - data: arrayBuffer, - useSystemFonts: true, - }).promise; - - let fullText = ""; - - // Extract text from each page - for (let i = 1; i <= pdf.numPages; i++) { - const page = await pdf.getPage(i); - const textContent = await page.getTextContent(); - const pageText = textContent.items.map((item: any) => item.str).join(" "); - fullText += pageText + "\n"; - } - - return { content: fullText }; - } catch (error) { - console.error("Error extracting PDF content:", error); - return { - content: "", - error: error instanceof Error ? error.message : "Failed to extract PDF content", - }; - } -} - -async function extractTextContent(url: string): Promise<DocumentContent> { - const response = await fetch(url); - const text = await response.text(); - return { content: text }; -} - -async function extractWordContent(url: string): Promise<DocumentContent> { - const response = await fetch(url); - const arrayBuffer = await response.arrayBuffer(); - const result = await mammoth.extractRawText({ arrayBuffer }); - return { content: result.value }; -} diff --git a/apps/backend/src/utils/extractor.ts b/apps/backend/src/utils/extractor.ts deleted file mode 100644 index f033f8e1..00000000 --- a/apps/backend/src/utils/extractor.ts +++ /dev/null @@ -1,50 +0,0 @@ -import { Env } from "../types"; - -export const extractPageContent = async (content: string, env: Env) => { - const resp = await fetch(`https://r.jina.ai/${content}`); - - if (!resp.ok) { - throw new Error( - `Failed to fetch ${content}: ${resp.statusText}` + (await resp.text()) - ); - } - - const metadataResp = await fetch(`https://md.dhr.wtf/metadata?url=${content}`); - - if (!metadataResp.ok) { - throw new Error( - `Failed to fetch metadata for ${content}: ${metadataResp.statusText}` + - (await metadataResp.text()) - ); - } - - const metadata = await metadataResp.json() as { - title?: string; - description?: string; - image?: string; - favicon?: string; - }; - - const responseText = await resp.text(); - - try { - const json: { - contentToVectorize: string; - contentToSave: string; - title?: string; - description?: string; - image?: string; - favicon?: string; - } = { - contentToSave: responseText, - contentToVectorize: responseText, - title: metadata.title, - description: metadata.description, - image: metadata.image, - favicon: metadata.favicon, - }; - return json; - } catch (e) { - throw new Error(`Failed to parse JSON from ${content}: ${e}`); - } -}; diff --git a/apps/backend/src/utils/fetchers.ts b/apps/backend/src/utils/fetchers.ts deleted file mode 100644 index 2329f48a..00000000 --- a/apps/backend/src/utils/fetchers.ts +++ /dev/null @@ -1,143 +0,0 @@ -import { WorkflowStep } from "cloudflare:workers"; -import { isErr, Ok } from "../errors/results"; -import { typeDecider } from "./typeDecider"; -import { Env, WorkflowParams } from "../types"; -import { unrollTweets } from "./tweetsToThreads"; -import { Tweet } from "react-tweet/api"; -import { NonRetryableError } from "cloudflare:workflows"; -import { extractPageContent } from "./extractor"; -import { extractDocumentContent } from "./extractDocumentContent"; - -export const fetchContent = async ( - params: WorkflowParams, - env: Env, - step: WorkflowStep -) => { - const type = typeDecider(params.content); - - if (isErr(type)) { - throw type.error; - } - - switch (type.value) { - case "page": - const pageContent = await step?.do( - "extract page content", - async () => await extractPageContent(params.content, env) - ); - return { - ...pageContent, - type: "page", - }; - - case "tweet": - const tweetUrl = new URL(params.content); - tweetUrl.search = ""; // Remove all search params - const tweetId = tweetUrl.pathname.split("/").pop(); - - const rawBaseTweetContent = await step.do( - "extract tweet content", - async () => { - const url = `https://cdn.syndication.twimg.com/tweet-result?id=${tweetId}&lang=en&features=tfw_timeline_list%3A%3Btfw_follower_count_sunset%3Atrue%3Btfw_tweet_edit_backend%3Aon%3Btfw_refsrc_session%3Aon%3Btfw_fosnr_soft_interventions_enabled%3Aon%3Btfw_show_birdwatch_pivots_enabled%3Aon%3Btfw_show_business_verified_badge%3Aon%3Btfw_duplicate_scribes_to_settings%3Aon%3Btfw_use_profile_image_shape_enabled%3Aon%3Btfw_show_blue_verified_badge%3Aon%3Btfw_legacy_timeline_sunset%3Atrue%3Btfw_show_gov_verified_badge%3Aon%3Btfw_show_business_affiliate_badge%3Aon%3Btfw_tweet_edit_frontend%3Aon&token=4c2mmul6mnh`; - - const resp = await fetch(url, { - headers: { - "User-Agent": - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3", - Accept: "application/json", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br", - Connection: "keep-alive", - "Upgrade-Insecure-Requests": "1", - "Cache-Control": "max-age=0", - TE: "Trailers", - }, - }); - - const data = (await resp.json()) as Tweet; - return data; - } - ); - - let tweetContent: { - text: string; - metadata: { - media?: string[] | undefined; - links?: string[] | undefined; - }; - raw: string; - }; - const unrolledTweetContent = { - value: [rawBaseTweetContent], - }; - if (true) { - console.error("Can't get thread, reverting back to single tweet"); - tweetContent = { - text: rawBaseTweetContent.text, - metadata: { - media: [ - ...(rawBaseTweetContent.photos?.map((url) => url.expandedUrl) ?? - []), - ...(rawBaseTweetContent.video?.variants[0].src ?? []), - ], - }, - raw: `<raw>${JSON.stringify(rawBaseTweetContent)}</raw>`, - }; - } else { - tweetContent = { - text: unrolledTweetContent.value - .map((tweet) => tweet.text) - .join("\n"), - metadata: { - media: unrolledTweetContent.value.flatMap((tweet) => [ - ...tweet.videos, - ...tweet.images, - ]), - links: unrolledTweetContent.value.flatMap((tweet) => tweet.links), - }, - raw: `<raw>${JSON.stringify(rawBaseTweetContent)}</raw>`, - }; - } - - // make it the same type as the page content - const pageContentType: Awaited<ReturnType<typeof extractPageContent>> & { - type: string; - } = { - contentToVectorize: - tweetContent.text + - "\n\nMetadata for this tweet:\n" + - JSON.stringify(tweetContent.metadata) + - "\n\nRaw tweet data:\n" + - tweetContent.raw, - contentToSave: tweetContent.raw, - title: "", - description: JSON.stringify(tweetContent.metadata), - image: "", - favicon: "", - type: "tweet", - }; - return pageContentType; - case "note": - const noteContent = { - contentToVectorize: params.content, - // TODO: different when using platejs - contentToSave: params.content, - // title is the first 30 characters of the first line - title: params.content.split("\n")[0].slice(0, 30), - type: "note", - }; - return noteContent; - case "document": - const documentContent = await step.do( - "extract document content", - async () => await extractDocumentContent(params.content) - ); - return { - contentToVectorize: documentContent.content, - contentToSave: documentContent.content, - type: "document", - }; - default: - throw new NonRetryableError("Unknown content type"); - } -}; diff --git a/apps/backend/src/utils/notion.ts b/apps/backend/src/utils/notion.ts deleted file mode 100644 index ebe559e1..00000000 --- a/apps/backend/src/utils/notion.ts +++ /dev/null @@ -1,239 +0,0 @@ -interface PageContent { - content: string; - url: string; - title: string; - id: string; - createdAt: string; -} - -interface NotionBlock { - type: string; - [key: string]: any; -} - -interface SearchResponse { - results: { - id: string; - object: string; - url: string; - created_time: string; - properties: { - title?: { - title: Array<{ - plain_text: string; - }>; - }; - Name?: { - title: Array<{ - plain_text: string; - }>; - }; - }; - }[]; - next_cursor: string | undefined; - has_more: boolean; -} - -interface BlockResponse { - results: NotionBlock[]; - next_cursor: string | undefined; - has_more: boolean; -} - -export const getAllNotionPageContents = async ( - token: string, - onProgress: (progress: number) => Promise<void> -): Promise<PageContent[]> => { - const pages: PageContent[] = []; - const NOTION_API_VERSION = "2022-06-28"; - const BASE_URL = "https://api.notion.com/v1"; - const MAX_RETRIES = 3; - const BATCH_SIZE = 10; // Number of concurrent requests - const PAGE_SIZE = 100; // Number of pages to fetch per search request - - const delay = (ms: number) => - new Promise((resolve) => setTimeout(resolve, ms)); - - const notionFetch = async ( - endpoint: string, - options: RequestInit = {}, - retries = 0 - ): Promise<any> => { - try { - const response = await fetch(`${BASE_URL}${endpoint}`, { - ...options, - headers: { - Authorization: `Bearer ${token}`, - "Notion-Version": NOTION_API_VERSION, - "Content-Type": "application/json", - ...((options.headers || {}) as Record<string, string>), - }, - }); - - if (response.status === 429) { - // Rate limit error - const retryAfter = parseInt(response.headers.get("Retry-After") || "5"); - if (retries < MAX_RETRIES) { - await delay(retryAfter * 1000); - return notionFetch(endpoint, options, retries + 1); - } - } - - if (!response.ok) { - const errorText = await response.text(); - throw new Error( - `Notion API error: ${response.statusText}\n${errorText}` - ); - } - - return response.json(); - } catch (error) { - if (retries < MAX_RETRIES) { - await delay(2000 * (retries + 1)); // Exponential backoff - return notionFetch(endpoint, options, retries + 1); - } - throw error; - } - }; - - const convertBlockToMarkdown = (block: NotionBlock): string => { - switch (block.type) { - case "paragraph": - return ( - block.paragraph?.rich_text - ?.map((text: any) => text.plain_text) - .join("") || "" - ); - case "heading_1": - return `# ${block.heading_1?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - case "heading_2": - return `## ${block.heading_2?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - case "heading_3": - return `### ${block.heading_3?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - case "bulleted_list_item": - return `* ${block.bulleted_list_item?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - case "numbered_list_item": - return `1. ${block.numbered_list_item?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - case "to_do": - const checked = block.to_do?.checked ? "x" : " "; - return `- [${checked}] ${block.to_do?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - case "code": - return `\`\`\`${block.code?.language || ""}\n${block.code?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n\`\`\`\n`; - case "quote": - return `> ${block.quote?.rich_text - ?.map((text: any) => text.plain_text) - .join("")}\n`; - default: - return ""; - } - }; - - const getAllBlocks = async (pageId: string): Promise<NotionBlock[]> => { - const blocks: NotionBlock[] = []; - let cursor: string | undefined = undefined; - - do { - const endpoint = `/blocks/${pageId}/children${ - cursor ? `?start_cursor=${cursor}` : "" - }`; - const response = (await notionFetch(endpoint)) as BlockResponse; - blocks.push(...response.results); - cursor = response.next_cursor; - } while (cursor); - - return blocks; - }; - - try { - let hasMore = true; - let cursor: string | undefined = undefined; - let allPages: SearchResponse["results"] = []; - - // First, collect all pages - while (hasMore) { - const searchResponse = (await notionFetch("/search", { - method: "POST", - body: JSON.stringify({ - filter: { - value: "page", - property: "object", - }, - sort: { - direction: "ascending", - timestamp: "last_edited_time", - }, - start_cursor: cursor, - page_size: PAGE_SIZE, - }), - })) as SearchResponse; - - allPages = [...allPages, ...searchResponse.results]; - cursor = searchResponse.next_cursor; - hasMore = searchResponse.has_more; - - // Report progress for page collection (0-30%) - const progressPercent = (allPages.length / (allPages.length + searchResponse.results.length)) * 30; - await onProgress(progressPercent); - } - - // Process pages in parallel batches - for (let i = 0; i < allPages.length; i += BATCH_SIZE) { - const batch = allPages.slice(i, i + BATCH_SIZE); - const batchResults = await Promise.all( - batch.map(async (page) => { - try { - const blocks = await getAllBlocks(page.id); - const pageContent = { - content: blocks.map(convertBlockToMarkdown).join("\n"), - url: page.url || `https://notion.so/${page.id.replace(/-/g, "")}`, - title: - page.properties?.Name?.title?.[0]?.plain_text || - page.properties?.title?.title?.[0]?.plain_text || - "Untitled", - id: page.id, - createdAt: page.created_time, - }; - return pageContent.content.length > 10 ? pageContent : null; - } catch (error) { - console.error(`Error processing page ${page.id}:`, error); - return null; - } - }) - ); - - pages.push( - ...batchResults.filter( - (result): result is PageContent => result !== null - ) - ); - - // Report progress for page processing (30-100%) - const progressPercent = 30 + ((i + BATCH_SIZE) / allPages.length) * 70; - await onProgress(Math.min(progressPercent, 100)); - - // Add a small delay between batches to respect rate limits - if (i + BATCH_SIZE < allPages.length) { - await delay(1000); - } - } - - return pages.filter((page) => page.content.length > 10); - } catch (error) { - console.error("Error fetching Notion pages:", error); - throw error; - } -}; diff --git a/apps/backend/src/utils/tweetsToThreads.ts b/apps/backend/src/utils/tweetsToThreads.ts deleted file mode 100644 index 85f69b87..00000000 --- a/apps/backend/src/utils/tweetsToThreads.ts +++ /dev/null @@ -1,108 +0,0 @@ -import * as cheerio from "cheerio"; -import { BaseError } from "../errors/baseError"; -import { Ok, Result } from "../errors/results"; - -interface Tweet { - id: string; - text: string; - links: Array<string>; - images: Array<string>; - videos: Array<string>; -} - -class ProcessTweetsError extends BaseError { - constructor(message?: string, source?: string) { - super("[Thread Proceessing Error]", message, source); - } -} - -type TweetProcessResult = Array<Tweet>; - -// there won't be a need for url caching right? -export async function unrollTweets( - url: string -): Promise<Result<TweetProcessResult, ProcessTweetsError>> { - const tweetId = url.split("/").pop(); - const response = await fetch(`https://unrollnow.com/status/${tweetId}`, { - headers: { - "User-Agent": - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Cache-Control": "max-age=3600", - }, - }); - - if (!response.ok) { - const error = await response.text(); - console.error(error); - throw new Error(`HTTP error! status: ${response.status} - ${error}`); - } - - const html = await response.text(); - const $ = cheerio.load(html); - const tweets: Array<Tweet> = []; - - const urlRegex = /(https?:\/\/\S+)/g; - const paragraphs = $(".mainarticle p").toArray(); - - const processedTweets = await Promise.all( - paragraphs.map(async (element, i) => { - const $tweet = $(element); - let tweetText = $tweet.text().trim(); - if (tweetText.length < 1) { - return null; - } - - if (i === paragraphs.length - 1 && tweetText.toLowerCase() === "yes") { - return null; - } - - const shortUrls = tweetText.match(urlRegex) || []; - console.log("SHORT_URLS_LEN", shortUrls.length); - console.log("SHORT_URLS", shortUrls); - - const expandedUrls = await Promise.all(shortUrls.map(expandShortUrl)); - - tweetText = tweetText.replace(urlRegex, "").trim().replace(/\s+/g, " "); - - const images = $tweet - .nextUntil("p") - .find("img.tweetimg") - .map((i, img) => $(img).attr("src")) - .get(); - - const videos = $tweet - .nextUntil("p") - .find("video > source") - .map((i, vid) => $(vid).attr("src")) - .get(); - - return { - id: `${tweetId}_${i}`, - text: tweetText, - links: expandedUrls, - images: images, - videos: videos, - }; - }) - ); - - tweets.push( - ...processedTweets.filter((tweet): tweet is Tweet => tweet !== null) - ); - - return Ok(tweets); -} - -async function expandShortUrl(shortUrl: string): Promise<string> { - try { - const response = await fetch(shortUrl, { - method: "HEAD", - redirect: "follow", - }); - const expandedUrl = response.url; - return expandedUrl; - } catch (error) { - console.error(`Failed to expand URL: ${shortUrl}`, error); - return shortUrl; - } -} diff --git a/apps/backend/src/utils/typeDecider.ts b/apps/backend/src/utils/typeDecider.ts deleted file mode 100644 index 642b178e..00000000 --- a/apps/backend/src/utils/typeDecider.ts +++ /dev/null @@ -1,41 +0,0 @@ -import { Result, Ok, Err } from "../errors/results"; -import { BaseError } from "../errors/baseError"; - -export type contentType = "page" | "tweet" | "note" | "document" | "notion"; - -class GetTypeError extends BaseError { - constructor(message?: string, source?: string) { - super("[Decide Type Error]", message, source); - } -} -export const typeDecider = ( - content: string -): Result<contentType, GetTypeError> => { - try { - // if the content is a URL, then it's a page. if its a URL with https://x.com/user/status/123, then it's a tweet. - // if it ends with .pdf etc then it's a document. else, it's a note. - // do strict checking with regex - if ( - content.match(/https?:\/\/(x\.com|twitter\.com)\/[\w]+\/[\w]+\/[\d]+/) - ) { - return Ok("tweet"); - } else if (content.match(/\.(pdf|doc|docx|txt|rtf|odt|md)/i)) { - return Ok("document"); - } else if ( - content.match(/https?:\/\/(www\.)?notion\.so\/.*/) - ) { - return Ok("notion"); - } else if ( - content.match( - /^(https?:\/\/)?(www\.)?[a-z0-9]+([-.]{1}[a-z0-9]+)*\.[a-z]{2,5}(\/.*)?$/i - ) - ) { - return Ok("page"); - } else { - return Ok("note"); - } - } catch (e) { - console.error("[Decide Type Error]", e); - return Err(new GetTypeError((e as Error).message, "typeDecider")); - } -}; |