# generate_embedding.py
import logging as logger
import os

import ollama
import openai
import pandas as pd
from flask_cors import CORS
from openai.embeddings_utils import get_embedding, cosine_similarity

openai.api_key = os.getenv('OPENAI_API_KEY')
class Chatbot():

    def parse_paper(self, pdf):
        logger.info("Parsing paper")
        number_of_pages = len(pdf.pages)
        logger.info(f"Total number of pages: {number_of_pages}")
        paper_text = []
        for i in range(number_of_pages):
            page = pdf.pages[i]
            page_text = []

            def visitor_body(text, cm, tm, fontDict, fontSize):
                x = tm[4]
                y = tm[5]
                # ignore header/footer regions and empty fragments
                if (y > 50 and y < 720) and (len(text.strip()) > 1):
                    page_text.append({
                        'fontsize': fontSize,
                        'text': text.strip().replace('\x03', ''),
                        'x': x,
                        'y': y
                    })

            _ = page.extract_text(visitor_text=visitor_body)

            # merge consecutive fragments that share a font size into blobs of
            # up to ~2000 characters, so each chunk stays a manageable size
            blob_font_size = None
            blob_text = ''
            processed_text = []
            for t in page_text:
                if t['fontsize'] == blob_font_size:
                    blob_text += f" {t['text']}"
                    if len(blob_text) >= 2000:
                        processed_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i
                        })
                        blob_font_size = None
                        blob_text = ''
                else:
                    if blob_font_size is not None and len(blob_text) >= 1:
                        processed_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i
                        })
                    blob_font_size = t['fontsize']
                    blob_text = t['text']
            paper_text += processed_text
        logger.info("Done parsing paper")
        return paper_text
    def paper_df(self, pdf):
        logger.info('Creating dataframe')
        filtered_pdf = []
        for row in pdf:
            if len(row['text']) < 30:
                continue
            if len(row['text']) > 8000:
                row['text'] = row['text'][:8000]
            filtered_pdf.append(row)
        df = pd.DataFrame(filtered_pdf)
        # remove elements with identical df[text] and df[page] values
        df = df.drop_duplicates(subset=['text', 'page'], keep='first')
        df['length'] = df['text'].apply(lambda x: len(x))
        logger.info('Done creating dataframe')
        return df
    def calculate_embeddings(self, df):
        logger.info('Calculating embeddings')
        embedding_model = "text-embedding-ada-002"
        # apply expects a single callable here; wrapping it in a list would
        # return a DataFrame instead of a Series of embeddings
        embeddings = df.text.apply(lambda x: get_embedding(x, engine=embedding_model))
        df["embeddings"] = embeddings
        logger.info('Done calculating embeddings')
        return df
    def search_embeddings(self, df, query, n=2, pprint=True):
        query_embedding = get_embedding(
            query,
            engine="text-embedding-ada-002"
        )
        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
        results = df.sort_values("similarity", ascending=False, ignore_index=True)
        results = results.head(n)
        global sources
        sources = []
        for i in range(n):
            # append the page number and the text as a dict to the sources list
            sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'})
        logger.info(sources)
        return results
    def create_prompt(self, df, user_input, strategy=None):
        result = self.search_embeddings(df, user_input)
        if strategy == "paper":
            prompt = """You are a large language model whose expertise is reading and summarizing scientific papers.
You are given a query and a series of text embeddings from a paper in order of their cosine similarity to the query.
You must take the given embeddings and return a very detailed summary of the paper that answers the query.
Given the question: """ + user_input + """
and the following embeddings as data:
1.""" + str(result.iloc[0]['text']) + """
2.""" + str(result.iloc[1]['text']) + """
Return a concise and accurate answer:"""
        elif strategy == "handbook":
            prompt = """You are a large language model whose expertise is reading and summarizing financial handbooks.
You are given a query and a series of text embeddings from a handbook in order of their cosine similarity to the query.
You must take the given embeddings and return a very detailed answer in Chinese, based on the handbook, that answers the query.
Unless necessary, your answer should reuse the original text as much as possible.
You should also ensure that your response is written in clear and concise Chinese, using appropriate grammar and vocabulary.
Additionally, your response should focus on answering the specific query provided.
Given the question: """ + user_input + """
and the following embeddings as data:
1.""" + str(result.iloc[0]['text']) + """
2.""" + str(result.iloc[1]['text']) + """
Return a concise and accurate answer:"""
        elif strategy == "contract":
            prompt = """As a large language model specializing in reading and summarizing, your task is to read a query and a sequence of text inputs sorted by their cosine similarity to the query.
Your goal is to provide a Chinese answer to the query using the given text inputs. If possible, please use the original text in your answer.
Please ensure that your response adheres to the terms of the agreement. Your response should focus on addressing the specific query provided,
providing relevant information and details based on the input texts' content. You should also strive for clarity and conciseness in your response,
summarizing key points while maintaining accuracy and relevance. Please note that you should prioritize understanding the context and meaning
behind both the query and input texts before generating a response.
Given the question: """ + user_input + """
and the following embeddings as data:
1.""" + str(result.iloc[0]['text']) + """
2.""" + str(result.iloc[1]['text']) + """
Return a concise and accurate answer:"""
        else:
            prompt = """As a language model specialized in reading and summarizing documents, your task is to provide a concise answer in Chinese based on a given query and a series of text embeddings from the document.
The embeddings are provided in order of their cosine similarity to the query. Your response should use as much original text as possible.
Your answer should be highly concise and accurate, providing relevant information that directly answers the query.
You should also ensure that your response is written in clear and concise Chinese, using appropriate grammar and vocabulary.
Please note that you must use the provided text embeddings to generate your response, which means you will need to understand how they relate to the original document.
Additionally, your response should focus on answering the specific query provided.
Given the question: """ + user_input + """
and the following embeddings as data:
1.""" + str(result.iloc[0]['text']) + """
2.""" + str(result.iloc[1]['text']) + """
Return a concise and accurate answer:"""
        logger.info('Done creating prompt')
        return prompt
    def response(self, df, prompt):
        logger.info('Sending request to gpt-3.5-turbo')
        prompt = self.create_prompt(df, prompt)
        r = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        answer = r.choices[0]['message']['content']
        logger.info('Done sending request to gpt-3.5-turbo')
        response = {'answer': answer, 'sources': sources}
        return response
class OllamaChatbot(Chatbot):

    def __init__(self):
        self.ollama_api_key = os.getenv('OLLAMA_API_KEY')
        ollama.api_key = self.ollama_api_key

    def get_ollama_embedding(self, text):
        # ollama.embed returns its vectors under the 'embeddings' key
        # (one embedding per input), so take the first one
        response = ollama.embed(model='llama3.1', input=text)
        return response['embeddings'][0]

    def calculate_ollama_embeddings(self, df):
        logger.info('Calculating embeddings using Ollama')
        embeddings = df.text.apply(lambda x: self.get_ollama_embedding(x))
        df["embeddings"] = embeddings
        logger.info('Done calculating embeddings')
        return df

    def ollama_response(self, df, prompt):
        logger.info('Sending request to Ollama')
        prompt = self.create_prompt(df, prompt)
        response = ollama.chat(model='llama3.1', messages=[{"role": "user", "content": prompt}])
        answer = response['message']['content']
        logger.info('Done sending request to Ollama')
        response = {'answer': answer, 'sources': sources}
        return response
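

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal
# end-to-end run assuming a local PDF named 'example.pdf' and the pypdf
# package, whose PdfReader provides the .pages and
# extract_text(visitor_text=...) interface that parse_paper() relies on.
# The question string below is a hypothetical example.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from pypdf import PdfReader  # assumed PDF backend

    chatbot = Chatbot()
    reader = PdfReader('example.pdf')           # hypothetical input file
    paper_text = chatbot.parse_paper(reader)    # per-page text blobs
    df = chatbot.paper_df(paper_text)           # filtered chunk dataframe
    df = chatbot.calculate_embeddings(df)       # OpenAI embedding per chunk
    result = chatbot.response(df, 'What is the main contribution of this paper?')
    print(result['answer'])
    print(result['sources'])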