aboutsummaryrefslogtreecommitdiff
path: root/apps/backend/src/utils/chunkers.ts
diff options
context:
space:
mode:
Diffstat (limited to 'apps/backend/src/utils/chunkers.ts')
-rw-r--r--apps/backend/src/utils/chunkers.ts116
1 files changed, 116 insertions, 0 deletions
diff --git a/apps/backend/src/utils/chunkers.ts b/apps/backend/src/utils/chunkers.ts
new file mode 100644
index 00000000..ce345d29
--- /dev/null
+++ b/apps/backend/src/utils/chunkers.ts
@@ -0,0 +1,116 @@
+import nlp from "compromise";
+
+export default function chunkText(
+ text: string,
+ maxChunkSize: number,
+ overlap: number = 0.2
+): string[] {
+ // Pre-process text to remove excessive whitespace
+ text = text.replace(/\s+/g, " ").trim();
+
+ const sentences = nlp(text).sentences().out("array");
+ const chunks: {
+ text: string;
+ start: number;
+ end: number;
+ metadata?: {
+ position: string;
+ context?: string;
+ };
+ }[] = [];
+
+ let currentChunk: string[] = [];
+ let currentSize = 0;
+
+ for (let i = 0; i < sentences.length; i++) {
+ const sentence = sentences[i].trim();
+
+ // Skip empty sentences
+ if (!sentence) continue;
+
+ // If a single sentence is longer than maxChunkSize, split it
+ if (sentence.length > maxChunkSize) {
+ if (currentChunk.length > 0) {
+ chunks.push({
+ text: currentChunk.join(" "),
+ start: i - currentChunk.length,
+ end: i - 1,
+ metadata: {
+ position: `${i - currentChunk.length}-${i - 1}`,
+ context: currentChunk[0].substring(0, 100), // First 100 chars for context
+ },
+ });
+ currentChunk = [];
+ currentSize = 0;
+ }
+
+ // Split long sentence into smaller chunks
+ const words = sentence.split(" ");
+ let tempChunk: string[] = [];
+
+ for (const word of words) {
+ if (tempChunk.join(" ").length + word.length > maxChunkSize) {
+ chunks.push({
+ text: tempChunk.join(" "),
+ start: i,
+ end: i,
+ metadata: {
+ position: `${i}`,
+ context: "Split sentence",
+ },
+ });
+ tempChunk = [];
+ }
+ tempChunk.push(word);
+ }
+
+ if (tempChunk.length > 0) {
+ chunks.push({
+ text: tempChunk.join(" "),
+ start: i,
+ end: i,
+ metadata: {
+ position: `${i}`,
+ context: "Split sentence remainder",
+ },
+ });
+ }
+ continue;
+ }
+
+ currentChunk.push(sentence);
+ currentSize += sentence.length;
+
+ if (currentSize >= maxChunkSize) {
+ const overlapSize = Math.floor(currentChunk.length * overlap);
+ chunks.push({
+ text: currentChunk.join(" "),
+ start: i - currentChunk.length + 1,
+ end: i,
+ metadata: {
+ position: `${i - currentChunk.length + 1}-${i}`,
+ context: currentChunk[0].substring(0, 100),
+ },
+ });
+
+ // Keep overlap sentences for next chunk
+ currentChunk = currentChunk.slice(-overlapSize);
+ currentSize = currentChunk.reduce((sum, s) => sum + s.length, 0);
+ }
+ }
+
+ // Handle remaining sentences
+ if (currentChunk.length > 0) {
+ chunks.push({
+ text: currentChunk.join(" "),
+ start: sentences.length - currentChunk.length,
+ end: sentences.length - 1,
+ metadata: {
+ position: `${sentences.length - currentChunk.length}-${sentences.length - 1}`,
+ context: currentChunk[0].substring(0, 100),
+ },
+ });
+ }
+
+ return chunks.map((chunk) => chunk.text);
+}