-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchatbot.py
More file actions
110 lines (86 loc) · 3.51 KB
/
chatbot.py
File metadata and controls
110 lines (86 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from ollama import Client
import json
import chromadb
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
# --- Setup: local vector store, remote Ollama host, and ingest checkpoint ---
client = chromadb.PersistentClient()

# NOTE(review): the host string had a leading space (" http://...") which is an
# invalid URL for the HTTP client — removed. Plain literal; no f-string needed.
remote_client = Client(host="http://172.16.8.170:11434")

collection = client.get_or_create_collection(name="articles_demo")

# Sentence-level chunking: ~200-char pieces split on "." with 20-char overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=20, separators=["."]
)

# Resume checkpoint: number of articles already ingested on previous runs.
# Start from 0 when counter.txt does not exist yet (the original crashed with
# FileNotFoundError on a fresh run).
if os.path.exists("counter.txt"):
    with open("counter.txt", "r") as f:
        count = int(f.read().strip())
else:
    count = 0
# --- Ingest: embed each article sentence-by-sentence into the collection ---
print("Reading articles.jsonl and generating embeddings...")
with open("news.jsonl", "r") as f:
    for i, line in enumerate(f):
        # Skip articles already ingested on a previous run (resume checkpoint).
        if i < count:
            continue
        article = json.loads(line)
        content = article["content"]
        sentences = text_splitter.split_text(content)
        print(f"Processing article {i}: {article.get('title', 'No Title')}")

        # Embed and store each sentence chunk individually so retrieval can
        # return fine-grained passages. The "search_document:" prefix is the
        # document-side marker for nomic-embed-text's asymmetric search.
        for j, each_sentence in enumerate(sentences):
            response = remote_client.embed(
                model="nomic-embed-text",
                input=f"search_document: {each_sentence}",
            )
            embedding = response["embeddings"][0]
            collection.add(
                ids=[f"article_{i}_sentence_{j}"],
                embeddings=[embedding],
                documents=[each_sentence],
                metadatas=[{"title": article["title"]}],
            )

        # Advance the checkpoint only AFTER the article is fully stored (the
        # original incremented before processing, so a failed article could
        # still be counted as done once the counter was persisted).
        count += 1

print("Database built successfully!")
# Persist the resume checkpoint so the next run skips already-ingested articles.
with open("counter.txt", "w") as checkpoint_file:
    checkpoint_file.write(str(count))
# --- Interactive query loop: retrieve nearest chunks, answer with the LLM ---
while True:
    print("-----------------------------")
    # Strip whitespace so e.g. "break " still exits the loop.
    query = input("Enter your question: ").strip()
    if query == "break":
        break

    # Embed the question with the "query:" prefix — the query-side counterpart
    # to the "search_document:" prefix used at ingest time.
    query_embed = remote_client.embed(
        model="nomic-embed-text",
        input=f"query: {query}",
    )["embeddings"][0]

    # Retrieve the 4 nearest sentence chunks and merge them into one context
    # string for the prompt.
    results = collection.query(query_embeddings=[query_embed], n_results=4)
    context = "\n".join(results["documents"][0])

    # NOTE(review): the original rules were numbered 1, 3, 4, 5 with a stray
    # trailing "-" on rule 1 — renumbered 1-4 and cleaned up.
    prompt = f"""You are a helpful assistant. Answer the question based on the context provided. Use the information in the context to form your answer.
rules:
1. Use only the information provided in the context to answer the question.
2. Be precise while answering the question, and avoid adding any information that is not present in the context.
3. No need to restate the question in your answer. Just provide the answer based on the context.
4. If context does not have any information just say "I don't know"
Context: {context}
Question: {query}
Answer:"""

    # (Removed a leftover debug print(prompt) that dumped the full context to
    # the console before every answer.)
    response = remote_client.generate(
        model="qwen3:4b-q4_K_M",
        prompt=prompt,
        options={
            "temperature": 0.4,
        },
    )
    answer = response["response"]
    print(answer)