import nlp from "compromise";

/**
 * Split text into sentence-aligned chunks, repeating a fraction of each
 * chunk's trailing sentences at the start of the next one for continuity.
 *
 * Sentences are detected with `compromise` and are never split. A chunk is
 * closed as soon as the summed length of its sentences reaches
 * `maxChunkSize`, so `maxChunkSize` acts as a threshold rather than a hard
 * limit: the emitted text can be longer, both because the last sentence may
 * overshoot and because the single spaces used to join sentences are not
 * counted.
 *
 * @param text - The text to split.
 * @param maxChunkSize - Summed sentence length (in UTF-16 code units) at
 *   which the current chunk is closed.
 * @param overlap - Fraction of a closed chunk's sentences (rounded down) that
 *   is carried into the next chunk. The resulting count is clamped so that at
 *   least one sentence is always dropped; values `<= 0` disable overlap.
 * @returns The chunk texts in document order. Empty when no sentences are
 *   found. A trailing chunk is emitted only if it contains at least one
 *   sentence that has not already appeared in an earlier chunk.
 */
export default function chunkText(
  text: string,
  maxChunkSize: number,
  overlap: number = 0.2,
): string[] {
  const sentences: string[] = nlp(text).sentences().out("array");

  const chunks: string[] = [];
  let pending: string[] = [];
  let pendingSize = 0;
  // Number of sentences in `pending` that no emitted chunk contains yet.
  let unseenCount = 0;

  for (const sentence of sentences) {
    pending.push(sentence);
    pendingSize += sentence.length;
    unseenCount++;

    if (pendingSize >= maxChunkSize) {
      chunks.push(pending.join(" "));

      // Clamp to [0, length - 1] so every flush discards at least one
      // sentence, even when `overlap` is >= 1 or negative.
      const carryCount = Math.min(
        Math.max(Math.floor(pending.length * overlap), 0),
        pending.length - 1,
      );

      // `slice(-0)` is `slice(0)` and would keep the whole chunk, so the
      // zero-overlap case has to be handled explicitly.
      pending = carryCount > 0 ? pending.slice(-carryCount) : [];
      pendingSize = pending.reduce((sum, s) => sum + s.length, 0);
      unseenCount = 0;
    }
  }

  // Skip the final flush when only already-emitted overlap sentences remain.
  if (unseenCount > 0) {
    chunks.push(pending.join(" "));
  }

  return chunks;
}