diff options
| author | Fuwn <[email protected]> | 2025-07-29 13:30:06 +0200 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2025-07-29 13:30:06 +0200 |
| commit | 7facf12273094f29f517f0be4b7ac662c18e0abb (patch) | |
| tree | efa7f74a0463ab2ae2e92ebcb9a43cb8dd12f497 | |
| parent | feat(umapyai): Improve chunk search (diff) | |
| download | umapyai-7facf12273094f29f517f0be4b7ac662c18e0abb.tar.xz umapyai-7facf12273094f29f517f0be4b7ac662c18e0abb.zip | |
refactor(umapyai): Move NLP functions to module
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/umapyai/__init__.py | 51 |
| -rw-r--r-- | src/umapyai/language.py | 52 |
2 files changed, 53 insertions, 50 deletions
diff --git a/src/umapyai/__init__.py b/src/umapyai/__init__.py index fd1a2c7..a5b3c3d 100644 --- a/src/umapyai/__init__.py +++ b/src/umapyai/__init__.py @@ -13,9 +13,8 @@ from .constants import (ARTICLES_DIRECTORY, CHROMA_DIRECTORY, CHROMA_COLLECTION, CHUNK_SIZE, EMBEDDING_MODEL, OLLAMA_MODEL, TOP_K, OLLAMA_URL) from .ollama import start_ollama_server, is_ollama_live, ensure_model_pulled, kill_ollama -import spacy -import re from collections import defaultdict +from .language import clean_for_match, get_query_phrases logger.remove() logger.add( @@ -27,55 +26,10 @@ logger.add( app = Flask(__name__) socket = Sock(app) -language = spacy.load("en_core_web_sm") CORS(app) -def normalize(text): - return text.replace('_', ' ').replace('-', ' ').replace('.txt', - '').lower().strip() - - -def get_significant_filename_parts(filename): - normalised = normalize(filename) - words = normalised.split() - n_grams = [' '.join(words[i:i + 2]) for i in range(len(words) - 1)] - - if len(words) >= 2: - n_grams.append(' '.join(words[:2])) - - if len(words) >= 3: - n_grams.append(' '.join(words[:3])) - - n_grams.append(words[0]) - - return set(n_grams) - - -def get_query_phrases(query): - document = language(normalize(query)) - words = [ - token.text for token in document if not token.is_stop and token.is_alpha - ] - phrases = set() - - for chunk in document.noun_chunks: - phrases.add(chunk.text.lower().strip()) - - for ent in document.ents: - phrases.add(ent.text.lower().strip()) - - for n in [2, 3]: - for i in range(len(words) - n + 1): - phrases.add(' '.join(words[i:i + n])) - - for word in words: - phrases.add(word) - - return {phrase for phrase in phrases if len(phrase) > 2} - - def prompt(rag_context, user_query, is_first_turn=True): if is_first_turn: system_prompt = ( @@ -197,9 +151,6 @@ def main(): "source": chunk["source"] }]) - def clean_for_match(text): - return re.sub(r'\W+', ' ', text.lower()).strip() - def find_relevant_chunks(query, top_k=TOP_K): query_phrases = 
get_query_phrases(query) document_match_count = defaultdict(int) diff --git a/src/umapyai/language.py b/src/umapyai/language.py new file mode 100644 index 0000000..1a3c970 --- /dev/null +++ b/src/umapyai/language.py @@ -0,0 +1,52 @@ +import spacy +import re + +language = spacy.load("en_core_web_sm") + + +def normalize(text): + return text.replace('_', ' ').replace('-', ' ').replace('.txt', + '').lower().strip() + + +def get_significant_filename_parts(filename): + normalised = normalize(filename) + words = normalised.split() + n_grams = [' '.join(words[i:i + 2]) for i in range(len(words) - 1)] + + if len(words) >= 2: + n_grams.append(' '.join(words[:2])) + + if len(words) >= 3: + n_grams.append(' '.join(words[:3])) + + n_grams.append(words[0]) + + return set(n_grams) + + +def get_query_phrases(query): + document = language(normalize(query)) + words = [ + token.text for token in document if not token.is_stop and token.is_alpha + ] + phrases = set() + + for chunk in document.noun_chunks: + phrases.add(chunk.text.lower().strip()) + + for ent in document.ents: + phrases.add(ent.text.lower().strip()) + + for n in [2, 3]: + for i in range(len(words) - n + 1): + phrases.add(' '.join(words[i:i + n])) + + for word in words: + phrases.add(word) + + return {phrase for phrase in phrases if len(phrase) > 2} + + +def clean_for_match(text): + return re.sub(r'\W+', ' ', text.lower()).strip() |