aboutsummaryrefslogtreecommitdiff
path: root/apps/cf-ai-backend/src/utils/chonker.ts
blob: c63020be1505118db61423aa41598cd3bce58940 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import nlp from "compromise";

/**
 * Split text into sentence-aligned chunks with optional overlap for continuity.
 *
 * The text is split into sentences with `compromise`; sentences are then
 * accumulated until their combined length (in characters, not counting the
 * joining spaces) reaches `maxChunkSize`, at which point the chunk is emitted.
 * Because the check happens after a sentence is added, a chunk may exceed
 * `maxChunkSize` by up to one sentence.
 *
 * @param text - The text to split.
 * @param maxChunkSize - Target chunk size, measured as the summed `length` of
 *   the sentences in the chunk.
 * @param overlap - Fraction of a finished chunk's sentences (rounded down) that
 *   is repeated at the start of the next chunk. Values outside `[0, 1]` are
 *   clamped and non-finite values are treated as `0`; at least one sentence of
 *   every chunk is always dropped so that chunking makes progress.
 * @returns The chunk texts, each built by joining its sentences with a space.
 */
export default function chunkText(
  text: string,
  maxChunkSize: number,
  overlap: number = 0.2,
): string[] {
  const sentences: string[] = nlp(text).sentences().out("array");
  const overlapRatio = Number.isFinite(overlap)
    ? Math.min(Math.max(overlap, 0), 1)
    : 0;

  const chunks: string[] = [];
  let currentChunk: string[] = [];
  let currentSize = 0;
  // Number of sentences in currentChunk that no emitted chunk contains yet.
  let pendingSentences = 0;

  for (const sentence of sentences) {
    currentChunk.push(sentence);
    currentSize += sentence.length;
    pendingSentences++;

    if (currentSize >= maxChunkSize) {
      chunks.push(currentChunk.join(" "));

      // Never carry the whole chunk over, otherwise chunks grow without bound.
      const overlapSize = Math.min(
        Math.floor(currentChunk.length * overlapRatio),
        currentChunk.length - 1,
      );
      // slice(-0) is slice(0) and would keep every sentence, so a zero
      // overlap has to be handled explicitly.
      currentChunk = overlapSize > 0 ? currentChunk.slice(-overlapSize) : [];
      currentSize = currentChunk.reduce((sum, s) => sum + s.length, 0);
      pendingSentences = 0;
    }
  }

  // Emit the tail only when it holds new sentences; a tail made purely of
  // overlap would just repeat the end of the previous chunk.
  if (pendingSentences > 0) {
    chunks.push(currentChunk.join(" "));
  }

  return chunks;
}