# ConQuer/wiki.py at main · sofyc/ConQuer · GitHub

# Module setup: imports, API clients, input data and llama_index settings.

# Standard library
import ast
import collections
import json
import os
import time

# Third-party
import chromadb
import numpy as np
import wikipedia
import wikipediaapi
from google import genai
from google.genai import types
# The OpenAI SDK client class is imported under an alias because the
# llama_index LLM wrapper imported below is also named ``OpenAI`` and would
# otherwise shadow it.
from openai import OpenAI as OpenAIClient
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm

from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import Document
from llama_index.core.storage.storage_context import StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.readers.wikipedia import WikipediaReader
from llama_index.vector_stores.chroma import ChromaVectorStore

# OpenAI SDK client used for the chat-completion calls further down.
# Reads the key from the environment; raises KeyError if OPENAI_API_KEY is unset.
# (Created only after ``os`` has been imported.)
client = OpenAIClient(api_key=os.environ["OPENAI_API_KEY"])

# NOTE(review): 'your_email' looks like an unfilled placeholder for the first
# argument -- confirm what wikipedia-api expects there before a real run.
wiki_wiki = wikipediaapi.Wikipedia('your_email', 'en')

# Input: the loop below reads it as {level: {area: {"questions": [...], "concepts": [...]}}}.
with open("concepts.json", "r", encoding="utf-8") as f:
    data = json.load(f)


# llama_index models: ``llm`` is the llama_index wrapper, not the SDK client above.
llm = OpenAI(model="gpt-4o-mini")
embed_model_name = "text-embedding-3-large"
embed_model = OpenAIEmbedding(model=embed_model_name)

Settings.embed_model = embed_model
Settings.llm = llm
top_k = 5
chunk_size = 128
chunk_overlap = 50
# NOTE(review): verify that llama_index's Settings actually consumes ``top_k``;
# the retriever in the main loop is given its depth explicitly.
Settings.top_k = top_k
Settings.chunk_size = chunk_size
Settings.chunk_overlap = chunk_overlap

# Main pipeline: for every (level, area, question) in ``data`` fetch the
# Wikipedia pages of the question's concepts, retrieve the chunks closest to
# the question, summarise them with the chat model and generate quizzes from
# that summary.  Quizzes are stored back into ``data``; the retrieved text and
# the summaries are collected in ``wiki``.
wiki = {}


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=30),
    reraise=True,
)
def _chat_completion(prompt):
    """Send *prompt* as one user message to gpt-4o-mini and return the reply text.

    The call is retried with exponential back-off (three attempts in total) so
    a transient API failure does not discard the results gathered so far; the
    last exception is re-raised unchanged if every attempt fails.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content


def _load_wiki_documents(concepts):
    """Return one ``Document`` per distinct, non-empty Wikipedia page in *concepts*.

    Loading is best-effort: a page that raises while being fetched is reported
    and skipped, as before.
    """
    docs = []
    seen_ids = set()
    for concept in concepts:
        try:
            page_py = wiki_wiki.page(concept)
            page_content = page_py.text
            # A page with no text has nothing to retrieve (presumably what a
            # title unknown to Wikipedia yields -- confirm against wikipedia-api).
            if not page_content.strip():
                print(f"Skipping page {concept}: no text returned")
                continue
            page_id = str(page_py.pageid)
            # Two concepts can resolve to the same page; indexing it twice
            # would only fill the top-k results with duplicate chunks.
            if page_id in seen_ids:
                continue
            seen_ids.add(page_id)
            docs.append(Document(id_=page_id, text=page_content))
        except Exception as e:
            print(f"Error loading page {concept}: {e}")
    return docs


def _retrieve_wiki_information(question, wiki_docs):
    """Return the ``top_k`` chunks of *wiki_docs* most similar to *question*.

    The chunks are rendered as numbered "Information N:" sections separated by
    blank lines.  An empty string is returned when there are no documents.
    """
    if not wiki_docs:
        return ""
    index = VectorStoreIndex.from_documents(wiki_docs)
    # Use the configured depth rather than a second hard-coded literal.
    retriever = index.as_retriever(similarity_top_k=top_k)
    nodes = retriever.retrieve(question)
    sections = (
        f"Information {rank}:\n{node.node.get_content()}"
        for rank, node in enumerate(nodes, start=1)
    )
    return "\n\n".join(sections).strip()


def _summary_prompt(area, level, wiki_information, question):
    """Build the summarisation prompt (text kept verbatim)."""
    return f"""You are a summary generator. The students are currently studying {area} at the {level} level and have asked a question. You have access to reference information from Wikipedia. Your task is to condense this information into a single, clear paragraph that highlights the key points and aids the students in better understanding their question.

Reference Wikipedia Information:
{wiki_information}

Student Question: {question}"""


def _quiz_prompt(area, level, summary, question):
    """Build the quiz-generation prompt (text kept verbatim)."""
    return f"""You are a quiz generator. The students are currently studying {area} at the {level} level and have asked a question. Your task is to create 3 quizzes that helps the student better understand the question. You have access to summarized reference information from Wikipedia. The quizzes should accurately reflect reference information, and the correct answer must be well-supported by reference information. The quiz should consist of one question, one correct answer, and three incorrect options. The correct answer must always be placed in option A. The difficulty level should align with the knowledge and reasoning complexity appropriate for {level} education.

Example:

Student Question: Where is Beijing located?
[Quiz]
Quiz: What is the capital city of China?
A. Beijing
B. Chengdu
C. Shanghai
D. Hangzhou

[Quiz]
Quiz: What continent is Beijing located?
A. Asia
B. Europe
C. Africa
D. North America

Now, please generate 3 quizzes following the format, each quiz should follow thw sign of [Quiz]:
Reference Wikipedia Information:
{summary}

Student Question: {question}"""


def _parse_quizzes(reply):
    """Split a model reply on the ``[Quiz]`` marker and keep the parts starting with "Quiz"."""
    parts = (part.strip() for part in str(reply).split('[Quiz]'))
    return [part for part in parts if part.startswith("Quiz")]


for level, areas in data.items():
    wiki[level] = {}

    for area, qs in tqdm(areas.items()):
        wiki[level][area] = {}
        wiki[level][area]["summary"] = []
        wiki[level][area]["wiki"] = []
        # ``qs`` is data[level][area]; this adds a value to an inner dict and
        # does not resize the dict being iterated.
        data[level][area]["quiz"] = []

        for question, concepts in zip(qs["questions"], qs["concepts"]):
            wiki_docs = _load_wiki_documents(concepts)
            wiki_information = _retrieve_wiki_information(question, wiki_docs)

            summary = _chat_completion(
                _summary_prompt(area, level, wiki_information, question)
            )
            quiz = _parse_quizzes(
                _chat_completion(_quiz_prompt(area, level, summary, question))
            )

            data[level][area]["quiz"].append(quiz)
            wiki[level][area]["summary"].append(summary)
            wiki[level][area]["wiki"].append(wiki_information)

# Persist the results: first the input data with the generated quizzes added,
# then the retrieved Wikipedia text and summaries.  Both are written as JSON
# with a four-space indent.
for out_path, payload in (("quiz_concept_wiki.json", data), ("wiki.json", wiki)):
    with open(out_path, "w") as out_file:
        serialized = json.dumps(payload, indent=4)
        out_file.write(serialized)