about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Fuwn <[email protected]> 2025-07-29 13:30:06 +0200
committer Fuwn <[email protected]> 2025-07-29 13:30:06 +0200
commit 7facf12273094f29f517f0be4b7ac662c18e0abb (patch)
tree efa7f74a0463ab2ae2e92ebcb9a43cb8dd12f497
parent feat(umapyai): Improve chunk search (diff)
download umapyai-7facf12273094f29f517f0be4b7ac662c18e0abb.tar.xz
umapyai-7facf12273094f29f517f0be4b7ac662c18e0abb.zip
refactor(umapyai): Move NLP functions to module
-rw-r--r-- src/umapyai/__init__.py 51
-rw-r--r-- src/umapyai/language.py 52
2 files changed, 53 insertions, 50 deletions
diff --git a/src/umapyai/__init__.py b/src/umapyai/__init__.py
index fd1a2c7..a5b3c3d 100644
--- a/src/umapyai/__init__.py
+++ b/src/umapyai/__init__.py
@@ -13,9 +13,8 @@ from .constants import (ARTICLES_DIRECTORY, CHROMA_DIRECTORY, CHROMA_COLLECTION,
CHUNK_SIZE, EMBEDDING_MODEL, OLLAMA_MODEL, TOP_K,
OLLAMA_URL)
from .ollama import start_ollama_server, is_ollama_live, ensure_model_pulled, kill_ollama
-import spacy
-import re
from collections import defaultdict
+from .language import clean_for_match, get_query_phrases
logger.remove()
logger.add(
@@ -27,55 +26,10 @@ logger.add(
app = Flask(__name__)
socket = Sock(app)
-language = spacy.load("en_core_web_sm")
CORS(app)
-def normalize(text):
- return text.replace('_', ' ').replace('-', ' ').replace('.txt',
- '').lower().strip()
-
-
-def get_significant_filename_parts(filename):
- normalised = normalize(filename)
- words = normalised.split()
- n_grams = [' '.join(words[i:i + 2]) for i in range(len(words) - 1)]
-
- if len(words) >= 2:
- n_grams.append(' '.join(words[:2]))
-
- if len(words) >= 3:
- n_grams.append(' '.join(words[:3]))
-
- n_grams.append(words[0])
-
- return set(n_grams)
-
-
-def get_query_phrases(query):
- document = language(normalize(query))
- words = [
- token.text for token in document if not token.is_stop and token.is_alpha
- ]
- phrases = set()
-
- for chunk in document.noun_chunks:
- phrases.add(chunk.text.lower().strip())
-
- for ent in document.ents:
- phrases.add(ent.text.lower().strip())
-
- for n in [2, 3]:
- for i in range(len(words) - n + 1):
- phrases.add(' '.join(words[i:i + n]))
-
- for word in words:
- phrases.add(word)
-
- return {phrase for phrase in phrases if len(phrase) > 2}
-
-
def prompt(rag_context, user_query, is_first_turn=True):
if is_first_turn:
system_prompt = (
@@ -197,9 +151,6 @@ def main():
"source": chunk["source"]
}])
- def clean_for_match(text):
- return re.sub(r'\W+', ' ', text.lower()).strip()
-
def find_relevant_chunks(query, top_k=TOP_K):
query_phrases = get_query_phrases(query)
document_match_count = defaultdict(int)
diff --git a/src/umapyai/language.py b/src/umapyai/language.py
new file mode 100644
index 0000000..1a3c970
--- /dev/null
+++ b/src/umapyai/language.py
@@ -0,0 +1,52 @@
+import spacy
+import re
+
+language = spacy.load("en_core_web_sm")
+
+
+def normalize(text):
+ return text.replace('_', ' ').replace('-', ' ').replace('.txt',
+ '').lower().strip()
+
+
+def get_significant_filename_parts(filename):
+ normalised = normalize(filename)
+ words = normalised.split()
+ n_grams = [' '.join(words[i:i + 2]) for i in range(len(words) - 1)]
+
+ if len(words) >= 2:
+ n_grams.append(' '.join(words[:2]))
+
+ if len(words) >= 3:
+ n_grams.append(' '.join(words[:3]))
+
+ n_grams.append(words[0])
+
+ return set(n_grams)
+
+
+def get_query_phrases(query):
+ document = language(normalize(query))
+ words = [
+ token.text for token in document if not token.is_stop and token.is_alpha
+ ]
+ phrases = set()
+
+ for chunk in document.noun_chunks:
+ phrases.add(chunk.text.lower().strip())
+
+ for ent in document.ents:
+ phrases.add(ent.text.lower().strip())
+
+ for n in [2, 3]:
+ for i in range(len(words) - n + 1):
+ phrases.add(' '.join(words[i:i + n]))
+
+ for word in words:
+ phrases.add(word)
+
+ return {phrase for phrase in phrases if len(phrase) > 2}
+
+
+def clean_for_match(text):
+ return re.sub(r'\W+', ' ', text.lower()).strip()