diff options
Diffstat (limited to 'apps/backend/src/utils/chunkers.ts')
| -rw-r--r-- | apps/backend/src/utils/chunkers.ts | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/apps/backend/src/utils/chunkers.ts b/apps/backend/src/utils/chunkers.ts new file mode 100644 index 00000000..ce345d29 --- /dev/null +++ b/apps/backend/src/utils/chunkers.ts @@ -0,0 +1,116 @@ +import nlp from "compromise"; + +export default function chunkText( + text: string, + maxChunkSize: number, + overlap: number = 0.2 +): string[] { + // Pre-process text to remove excessive whitespace + text = text.replace(/\s+/g, " ").trim(); + + const sentences = nlp(text).sentences().out("array"); + const chunks: { + text: string; + start: number; + end: number; + metadata?: { + position: string; + context?: string; + }; + }[] = []; + + let currentChunk: string[] = []; + let currentSize = 0; + + for (let i = 0; i < sentences.length; i++) { + const sentence = sentences[i].trim(); + + // Skip empty sentences + if (!sentence) continue; + + // If a single sentence is longer than maxChunkSize, split it + if (sentence.length > maxChunkSize) { + if (currentChunk.length > 0) { + chunks.push({ + text: currentChunk.join(" "), + start: i - currentChunk.length, + end: i - 1, + metadata: { + position: `${i - currentChunk.length}-${i - 1}`, + context: currentChunk[0].substring(0, 100), // First 100 chars for context + }, + }); + currentChunk = []; + currentSize = 0; + } + + // Split long sentence into smaller chunks + const words = sentence.split(" "); + let tempChunk: string[] = []; + + for (const word of words) { + if (tempChunk.join(" ").length + word.length > maxChunkSize) { + chunks.push({ + text: tempChunk.join(" "), + start: i, + end: i, + metadata: { + position: `${i}`, + context: "Split sentence", + }, + }); + tempChunk = []; + } + tempChunk.push(word); + } + + if (tempChunk.length > 0) { + chunks.push({ + text: tempChunk.join(" "), + start: i, + end: i, + metadata: { + position: `${i}`, + context: "Split sentence remainder", + }, + }); + } + continue; + } + + currentChunk.push(sentence); + currentSize += sentence.length; + + if (currentSize >= maxChunkSize) { + const overlapSize = Math.floor(currentChunk.length * overlap); + chunks.push({ + text: currentChunk.join(" "), + start: i - currentChunk.length + 1, + end: i, + metadata: { + position: `${i - currentChunk.length + 1}-${i}`, + context: currentChunk[0].substring(0, 100), + }, + }); + + // Keep overlap sentences for next chunk + currentChunk = currentChunk.slice(-overlapSize); + currentSize = currentChunk.reduce((sum, s) => sum + s.length, 0); + } + } + + // Handle remaining sentences + if (currentChunk.length > 0) { + chunks.push({ + text: currentChunk.join(" "), + start: sentences.length - currentChunk.length, + end: sentences.length - 1, + metadata: { + position: `${sentences.length - currentChunk.length}-${sentences.length - 1}`, + context: currentChunk[0].substring(0, 100), + }, + }); + } + + return chunks.map((chunk) => chunk.text); +} |