aboutsummaryrefslogtreecommitdiff
path: root/apps/cf-ai-backend/src/queueConsumer/chunkers/chonker.ts
blob: 18788dabd2cde93f705cd1e7c0962a1097781b56 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import nlp from "compromise";

/**
 * Split text into sentence-aligned chunks, repeating the tail of each chunk at
 * the start of the next one for continuity.
 *
 * Sentences are detected with `compromise` and are never split. Sentences are
 * accumulated until the summed sentence lengths (the joining spaces are not
 * counted) reach or exceed `maxChunkSize`; the chunk is then emitted, so a
 * chunk may be longer than `maxChunkSize`.
 *
 * @param text - The text to split.
 * @param maxChunkSize - Size threshold, in characters, at which the current
 *   chunk is closed.
 * @param overlap - Fraction of the sentences of a closed chunk that are carried
 *   over into the next chunk. The resulting sentence count is rounded down and
 *   clamped so that at least one sentence is always dropped; non-positive or
 *   NaN values disable the overlap.
 * @returns The chunks in document order, each one being its sentences joined
 *   with a single space. Empty when no sentence is found.
 */
export default function chunkText(
	text: string,
	maxChunkSize: number,
	overlap: number = 0.2,
): string[] {
	const sentences: string[] = nlp(text).sentences().out("array");
	const chunks: string[] = [];
	let currentChunk: string[] = [];
	let currentSize = 0;
	// Sentences in currentChunk that are not part of any emitted chunk yet.
	let pendingSentences = 0;

	for (const sentence of sentences) {
		currentChunk.push(sentence);
		currentSize += sentence.length;
		pendingSentences++;

		if (currentSize >= maxChunkSize) {
			chunks.push(currentChunk.join(" "));

			// Number of trailing sentences to repeat in the next chunk. Clamp to
			// [0, length - 1]: carrying the whole chunk would make chunks grow
			// without bound.
			const requested = Math.floor(currentChunk.length * overlap);
			const overlapCount = Number.isNaN(requested)
				? 0
				: Math.min(Math.max(requested, 0), currentChunk.length - 1);

			// slice(-0) is slice(0) and would keep every sentence, so the
			// zero-overlap case has to be handled explicitly.
			currentChunk = overlapCount > 0 ? currentChunk.slice(-overlapCount) : [];
			currentSize = currentChunk.reduce((sum, s) => sum + s.length, 0);
			pendingSentences = 0;
		}
	}

	// Flush the remainder only when it holds sentences that were not emitted
	// yet; a remainder made solely of carried-over overlap is a duplicate of
	// the end of the previous chunk.
	if (pendingSentences > 0) {
		chunks.push(currentChunk.join(" "));
	}

	return chunks;
}