-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchatbot.py
More file actions
110 lines (86 loc) · 3.51 KB
/
chatbot.py
File metadata and controls
110 lines (86 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from ollama import Client
import json
import chromadb
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
# --- Setup: local vector store, remote Ollama host, and ingest checkpoint ---
client = chromadb.PersistentClient()

# NOTE(review): the host string had a leading space (" http://...") which is an
# invalid URL for the HTTP client — removed. Plain literal; no f-string needed.
remote_client = Client(host="http://172.16.8.170:11434")

collection = client.get_or_create_collection(name="articles_demo")

# Sentence-level chunking: ~200-char pieces split on "." with 20-char overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=20, separators=["."]
)

# Resume checkpoint: number of articles already ingested on previous runs.
# Start from 0 when counter.txt does not exist yet (the original crashed with
# FileNotFoundError on a fresh run).
if os.path.exists("counter.txt"):
    with open("counter.txt", "r") as f:
        count = int(f.read().strip())
else:
    count = 0
# --- Ingest: embed each article sentence-by-sentence into the collection ---
print("Reading articles.jsonl and generating embeddings...")
with open("news.jsonl", "r") as f:
    for i, line in enumerate(f):
        # Skip articles already ingested on a previous run (resume checkpoint).
        if i < count:
            continue
        article = json.loads(line)
        content = article["content"]
        sentences = text_splitter.split_text(content)
        print(f"Processing article {i}: {article.get('title', 'No Title')}")

        # Embed and store each sentence chunk individually so retrieval can
        # return fine-grained passages. The "search_document:" prefix is the
        # document-side marker for nomic-embed-text's asymmetric search.
        for j, each_sentence in enumerate(sentences):
            response = remote_client.embed(
                model="nomic-embed-text",
                input=f"search_document: {each_sentence}",
            )
            embedding = response["embeddings"][0]
            collection.add(
                ids=[f"article_{i}_sentence_{j}"],
                embeddings=[embedding],
                documents=[each_sentence],
                metadatas=[{"title": article["title"]}],
            )

        # Advance the checkpoint only AFTER the article is fully stored (the
        # original incremented before processing, so a failed article could
        # still be counted as done once the counter was persisted).
        count += 1

print("Database built successfully!")
# Persist the resume checkpoint so the next run skips already-ingested articles.
with open("counter.txt", "w") as checkpoint_file:
    checkpoint_file.write(str(count))
# --- Interactive query loop: retrieve nearest chunks, answer with the LLM ---
while True:
    print("-----------------------------")
    # Strip whitespace so e.g. "break " still exits the loop.
    query = input("Enter your question: ").strip()
    if query == "break":
        break

    # Embed the question with the "query:" prefix — the query-side counterpart
    # to the "search_document:" prefix used at ingest time.
    query_embed = remote_client.embed(
        model="nomic-embed-text",
        input=f"query: {query}",
    )["embeddings"][0]

    # Retrieve the 4 nearest sentence chunks and merge them into one context
    # string for the prompt.
    results = collection.query(query_embeddings=[query_embed], n_results=4)
    context = "\n".join(results["documents"][0])

    # NOTE(review): the original rules were numbered 1, 3, 4, 5 with a stray
    # trailing "-" on rule 1 — renumbered 1-4 and cleaned up.
    prompt = f"""You are a helpful assistant. Answer the question based on the context provided. Use the information in the context to form your answer.
rules:
1. Use only the information provided in the context to answer the question.
2. Be precise while answering the question, and avoid adding any information that is not present in the context.
3. No need to restate the question in your answer. Just provide the answer based on the context.
4. If context does not have any information just say "I don't know"
Context: {context}
Question: {query}
Answer:"""

    # (Removed a leftover debug print(prompt) that dumped the full context to
    # the console before every answer.)
    response = remote_client.generate(
        model="qwen3:4b-q4_K_M",
        prompt=prompt,
        options={
            "temperature": 0.4,
        },
    )
    answer = response["response"]
    print(answer)