1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
import qdrant_client
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.storage import StorageContext
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
import logging
import threading
import sys
import time
def _wait_animation(stop_event: threading.Event, interval: float = 0.35):
frames = ["", ".", "..", "..."]
max_length = max(len(frame) for frame in frames)
index = 0
while not stop_event.is_set():
frame = frames[index % len(frames)]
sys.stdout.write("\r" + frame + " " * (max_length - len(frame)))
index += 1
time.sleep(interval)
sys.stdout.write("\r" + " " * max_length + "\r")
sys.stdout.flush()
def main():
    """Run an interactive streaming RAG chat over the uma_articles_clean corpus.

    Loads the document set, indexes it into a local Qdrant collection,
    configures an Ollama-hosted LLM plus a HuggingFace embedding model,
    then serves a read-eval-print loop: each question is answered as a
    token stream, with a dots spinner shown until the first token lands.
    """
    logging.basicConfig(level=logging.WARNING)

    # Corpus -> Qdrant-backed vector store (persisted on local disk).
    documents = SimpleDirectoryReader("./uma_articles_clean").load_data()
    client = qdrant_client.QdrantClient(path="./qdrant_data")
    vector_store = QdrantVectorStore(client=client, collection_name="umamusume")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Register the models globally before building the index.
    Settings.llm = Ollama(model="gpt-oss:20b", request_timeout=120.0)
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2")

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
    )
    query_engine = index.as_query_engine(streaming=True)

    try:
        while True:
            try:
                question = input("> ").strip()
            except EOFError:
                # Ctrl-D: finish the line, then leave the loop.
                print()
                break
            if not question:
                continue
            if question.lower() in {"quit", "exit", "q"}:
                break

            stream = query_engine.query(question)

            # Spin a background dots animation while we wait for the
            # first streamed token.
            waiting = threading.Event()
            print()
            spinner = threading.Thread(
                target=_wait_animation, args=(waiting,), daemon=True)
            spinner.start()

            streaming_started = False
            try:
                for chunk in stream.response_gen:
                    if not streaming_started:
                        # First token arrived: retire the spinner before
                        # printing so the line is clean.
                        waiting.set()
                        spinner.join()
                        streaming_started = True
                    print(chunk, end="", flush=True)
            finally:
                # If the stream failed before producing anything, the
                # spinner is still running — stop it here.
                if not streaming_started:
                    waiting.set()
                    spinner.join()
            print("\n")
    except KeyboardInterrupt:
        print()
    print()
|