aboutsummaryrefslogtreecommitdiff
path: root/apps/backend/src/utils/extractDocumentContent.ts
diff options
context:
space:
mode:
Diffstat (limited to 'apps/backend/src/utils/extractDocumentContent.ts')
-rw-r--r--apps/backend/src/utils/extractDocumentContent.ts87
1 files changed, 87 insertions, 0 deletions
diff --git a/apps/backend/src/utils/extractDocumentContent.ts b/apps/backend/src/utils/extractDocumentContent.ts
new file mode 100644
index 00000000..8b7d9256
--- /dev/null
+++ b/apps/backend/src/utils/extractDocumentContent.ts
@@ -0,0 +1,87 @@
+import * as mammoth from "mammoth";
+import { NonRetryableError } from "cloudflare:workflows";
+import { resolvePDFJS } from 'pdfjs-serverless';
+
+interface DocumentContent {
+ content: string;
+ error?: string;
+}
+
+export const extractDocumentContent = async (
+ url: string
+): Promise<DocumentContent> => {
+ try {
+ const fileExtension = url.split(".").pop()?.toLowerCase();
+
+ if (!fileExtension) {
+ throw new Error("Invalid file URL");
+ }
+
+ console.log("file", fileExtension);
+
+ switch (fileExtension) {
+ case "pdf":
+ return await extractPdfContent(url);
+ case "md":
+ case "txt":
+ return await extractTextContent(url);
+ case "doc":
+ case "docx":
+ return await extractWordContent(url);
+ default:
+ throw new NonRetryableError(`Unsupported file type: ${fileExtension}`);
+ }
+ } catch (error) {
+ return {
+ content: "",
+ error: error instanceof Error ? error.message : "Unknown error occurred",
+ };
+ }
+};
+
+async function extractPdfContent(url: string): Promise<DocumentContent> {
+ try {
+ const response = await fetch(url);
+ const arrayBuffer = await response.arrayBuffer();
+
+ // Initialize PDF.js with serverless compatibility
+ const { getDocument } = await resolvePDFJS();
+
+ // Load the PDF document
+ const pdf = await getDocument({
+ data: arrayBuffer,
+ useSystemFonts: true,
+ }).promise;
+
+ let fullText = "";
+
+ // Extract text from each page
+ for (let i = 1; i <= pdf.numPages; i++) {
+ const page = await pdf.getPage(i);
+ const textContent = await page.getTextContent();
+ const pageText = textContent.items.map((item: any) => item.str).join(" ");
+ fullText += pageText + "\n";
+ }
+
+ return { content: fullText };
+ } catch (error) {
+ console.error("Error extracting PDF content:", error);
+ return {
+ content: "",
+ error: error instanceof Error ? error.message : "Failed to extract PDF content",
+ };
+ }
+}
+
+async function extractTextContent(url: string): Promise<DocumentContent> {
+ const response = await fetch(url);
+ const text = await response.text();
+ return { content: text };
+}
+
+async function extractWordContent(url: string): Promise<DocumentContent> {
+ const response = await fetch(url);
+ const arrayBuffer = await response.arrayBuffer();
+ const result = await mammoth.extractRawText({ arrayBuffer });
+ return { content: result.value };
+}