From 6b4758ac6b53d5e78280f6a493428ad863c96fa4 Mon Sep 17 00:00:00 2001 From: Tony Okeke Date: Mon, 26 Feb 2024 00:19:26 -0500 Subject: [PATCH 1/7] add .gitignore --- .gitignore | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ad8c20e..a2a8dea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1 @@ -dist/ -build/ -*.pyc +.env/ From b1a1a832dfb8c8eb10dab0ffdd908597984e6a33 Mon Sep 17 00:00:00 2001 From: Tony Okeke Date: Mon, 26 Feb 2024 00:19:50 -0500 Subject: [PATCH 2/7] migrate to openai 1.0.0 API --- .env | 1 + gpthistory.egg-info/PKG-INFO | 6 ++ gpthistory.egg-info/entry_points.txt | 6 +- gpthistory.egg-info/requires.txt | 6 +- .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 152 bytes .../__pycache__/gpthistory.cpython-310.pyc | Bin 0 -> 3048 bytes .../__pycache__/helpers.cpython-310.pyc | Bin 0 -> 3067 bytes gpthistory/gpthistory.py | 91 +++++++++--------- gpthistory/helpers.py | 48 ++++----- setup.py | 22 ++--- 10 files changed, 94 insertions(+), 86 deletions(-) create mode 100644 .env create mode 100644 gpthistory/__pycache__/__init__.cpython-310.pyc create mode 100644 gpthistory/__pycache__/gpthistory.cpython-310.pyc create mode 100644 gpthistory/__pycache__/helpers.cpython-310.pyc diff --git a/.env b/.env new file mode 100644 index 0000000..a390e60 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +OPENAI_API_KEY="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" \ No newline at end of file diff --git a/gpthistory.egg-info/PKG-INFO b/gpthistory.egg-info/PKG-INFO index 5cbb48a..da0adaa 100644 --- a/gpthistory.egg-info/PKG-INFO +++ b/gpthistory.egg-info/PKG-INFO @@ -2,6 +2,12 @@ Metadata-Version: 2.1 Name: gpthistory Version: 0.3 Summary: A tool for searching through your chatgpt conversation history +Home-page: UNKNOWN Author: Shrikar Archak Author-email: shrikar84@gmail.com +License: UNKNOWN +Platform: UNKNOWN License-File: LICENSE.md + +UNKNOWN + diff --git a/gpthistory.egg-info/entry_points.txt b/gpthistory.egg-info/entry_points.txt index ce90c75..d13f257 100644 --- a/gpthistory.egg-info/entry_points.txt +++ b/gpthistory.egg-info/entry_points.txt @@ -1,2 +1,4 @@ -[console_scripts] -gpthistory = gpthistory.gpthistory:main + + [console_scripts] + gpthistory=gpthistory.gpthistory:app + \ No newline at end of file diff --git a/gpthistory.egg-info/requires.txt b/gpthistory.egg-info/requires.txt index dbd3260..2cbb3f1 100644 --- a/gpthistory.egg-info/requires.txt +++ b/gpthistory.egg-info/requires.txt @@ -1,5 +1,5 @@ -Click -python-dotenv +numpy openai pandas -numpy +python-dotenv +typer diff --git a/gpthistory/__pycache__/__init__.cpython-310.pyc b/gpthistory/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7e5b61ee4a144c6d9cf2c3d3062a8e7eacd0cbc GIT binary patch literal 152 zcmd1j<>g`kg8NmsQ$h4&5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Huer{fgeqvEk zVs>V+esN}MNpgmMQEEYcv3`0%Nk(RINq$i!GBZ9tGcU6wK3=b&@)n0pZhlH>PO2Tq L&|)Sa!NLFl32!91 literal 0 HcmV?d00001 diff --git a/gpthistory/__pycache__/gpthistory.cpython-310.pyc b/gpthistory/__pycache__/gpthistory.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d1fd228ab56d8520127ce9d9afd7c8ee66725b7 GIT binary patch literal 3048 zcma)8&2JmW72nw}E|;ICBwMl(bYVbAh$K`{6lkl;ZW60<4HU9l324D2Al5rWa;g2G zXNLM9mPG;iszc9h0rD82mtK3yAJD&G&|6PA_vRkd_lAlk6$#KK=Iy-Md2i-@{T}mL z%{Sor*)p~`k!)le*^y&s%q4gt^f6glB27&t?=+SO`1NS%f+m>pv zQ^~_f_9RSt!7|UTS zWX{i~wP+a1IHT;e6N%$#d-MHJL^4iUTLd2ko9`#=m?c^GmQR@w;Q+q2P37p1%I)Vo z4W+XCal#^k7rlC#@OEK5fg%0@U!*Z2FwD6nZSWnTCi+S`bGI-SWJU(Y%!FaVunV$l zeEIRrDI98T81vG?0RFu%%$YkYNv|lKlbI*Wg?A1yII{WE!lU+vL7gKD=H}c7Id=oJ z5~HwybM?%Incj!otKikTtjw#lB<(*Le=%`Q^QhN6CvVqk^Lk+xR$dE z{KD6|e67pBsS7l(!q*1)t$AV1YDI0nM(-@h=q_#1l?yVfWM)yGRf;PA8Z4+>dFE?v z)2g=aUQvO2uP+SnUcIOn6}q-#z}+^AO5qQ{1G9$SeHY{#*B&Suqx<@XReJBjg1lTm zFxnf_pZEbhpH1R~24NQHGz$7WPlGTR#K$ZPFuMX8$}qs>2u@E=wa8(cD36^&4)*dR zRZ7G07;=_l?kGcnBIGmXUB{S$lN>=%X1ms4xl_r zcw#T6$`=gsIL~lyW3BKSXoqu4aW+oEi0!>0#aBdGf{m4vPbA}z($+9PQDrbBjzTza zy4Fh>AF$xjqu^n}`f%MWIeVzw&75_BfP7Rf0Jd ztn#1j@BZR=0|Zfyc9?QrO&C8@Hs$%4TPW+wJb{2K z7rX$@MIHs(P3~#cm?g@)3ZGDw>-Y)fU;yF8G6_n_ks`;ctN~C&s@4>vm z^An*%1=qxdN}NTUrSLgSR9WAU!aCo;lf8bNg$b}<8T?aNc=Kp0&E!_dd3Y3yEfF&r z4YxQO=VI#`Ilp__8K0>NCM{h$XdU#47eMa1zGa&>v57}kVXTw7>61D!|6smpmR3oN zG)WbB&+=eT6-Hh2P183`;$7M{%t#%+)XL|U(ygKZ&$t*1SAMh zmxBc@;7oG*zsGpr%HtvY8IbAv_Z^|@VCrrP^xO2uPXI!nzBmkSqHK`F+0k||lyWS# zx3;iMJ2{|67D6ilN%y^b*S3@&76zkP>q zq7m;Rd4S{{Bp)F80h0HSe27FBv;%mSoGlT{5|RU|CO8IBm030c41j+|04!%? zL_8J^^R~x#QOD144}d}fErC(~BV_&SzfjA$h@c$_MPmLGWe_)R#MoVW`ZusD9s;R4 zSlcfAyvvr=w3=p1BY}Ncwe0`+C7bZy!ZzjgdMVtq*K6PBzebe-5;R%4W9WjR(6*`) zA~oc^YFz_TXUGz;WeMyWa+lURhf5Ptv349ZgFMcl-<+b#iC|oN5pXLMaRfo{#{*?Q z+uwVx$^#~UwQLG-HS8+Ao{k0fy)*@JsTQpzH$|H^aNMf4nhGss11^eon}DD>qZK7C zNzaTFYr!pA$U)je4!!jQ1++&4J@oIH9CGNTI_Ks?-S=i!yOQ9L;G3_<$9cc^dv91Q zECd9uFNUA#BSo4>;iH&RS9&jn+KCE^v?V3gH?G$1VtAyePrk4Y)2hTltRM9d0Eg1eX(v)gCD#rb(5|rn{@8u4vXI)1-Xh{dC z6B!jz14o1p;yDq^(7bqWeUPj5ScrH(mFqI)D%o8Zye#B;zf`-aRE0P?-`(X|$%R}i zk8}&{q8HKp1J}T&Tm<1b0d;6VTW~v6v|uW+K%8ARbdatIJaP@hABh5J6+nDUD|&3- zCm%HcW>Nvv6}?VAu{z|}K*}d|W&x1)@NI+8qcTfX80R6na1^GwD#FJQ%3Ure5DEk%4_rbfSs`$Yh)KeU}TbzX`a zFu1@RbQ9TQn1lL??9n}NrB4}U)@K&*h%&oFm;;a7A%++48+Sh7ITx|w=n)@0=8UCz zKLmP)aroqbi=*&B;>?4R=Nnrg>2( z*ceE3eWna>cm)`YQ(fOY`>6I3!C@zeocXf$1(#)!OMV%q8~y37BcmA24a3JImMqV4 zs71a1Ej!{Z7+!!2lWdk~e}w@inj**qMvtvy3QTzGLu+J>Y-OF;4EVt8>*OEgpC+x8 zv*+%4mCYRH0<;eESnV099Oie(ec;l_t!U-G3pStBRSlR$S$*FTo5ofEGoUkuvSr4N zi3W}_UCmr`mb?qYjf+8=ISSbeA%e_2gw%j)1Sow)piUWhIDB&&XAI>G@re<`v-iSN zq^;H@1r~=O16fjB&*i#aY(}dgccMlM5!pw3ikAJ{Q_abHMuR=-M|>geVY%uBR6+ zL_i1gy^HIhp^)J5Ubsy+4BuGvmx=(6U$6%n1!=^e0$p+{(fNI`TgyI}}gqXMW|I z^r(MIVWxg=Ca7u^B+sZ()vKUt?ASo2VE9u*{d;ki9AsFaR8fX1RT-bqHV*sgAyk^(wZ1<@Ut)(HN%MMEn4H zu!p7D9ws&~?chVy#~pNQ+R96D6}s9}1!kA2liG%VAULs#MAu@`mxWN>rncCQcoS7! z+-4QmC|R$YUsR1~?u>;nwT zG}`A!+9_kT3zmyCha&M5?H<6frh~2R` 0 and text_data[0] != '': + if len(text_data) > 0 and text_data[0] != "": # Add relevant chat information to the index - chat_ids.append(entry['id']) + chat_ids.append(entry["id"]) section_ids.append(k) texts.append(text_data[0]) logger.info(f"Index built and stored at: {INDEX_PATH}") logger.info(f"Conversations indexed: {len(chat_ids)}") - df = pd.DataFrame({'chat_id': chat_ids, 'section_id': section_ids, 'text': texts}) - df = df[~df.text.isna()] - df['id'] = df['chat_id'] + df = pd.DataFrame({"chat_id": chat_ids, "section_id": section_ids, "text": texts}) + df = df[~df.text.isna()] + df["id"] = df["chat_id"] df.set_index("id", inplace=True) # Handle incremental index updates - current_df = pd.DataFrame() + current_df = pd.DataFrame() rows_only_in_df = pd.DataFrame() incremental = False if os.path.exists(INDEX_PATH): incremental = True - current_df = pd.read_csv(INDEX_PATH, sep='|') - current_df['id'] = current_df['chat_id'] + current_df = pd.read_csv(INDEX_PATH, sep="|") + current_df["id"] = current_df["chat_id"] current_df.set_index("id", inplace=True) # Use merge with indicator=True to find rows present in one DataFrame but not the other - merged_df = df.merge(current_df, how='outer', indicator=True) + merged_df = df.merge(current_df, how="outer", indicator=True) # Query rows only present in df1 - rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge') + rows_only_in_df = merged_df.query('_merge == "left_only"').drop( + columns="_merge" + ) else: rows_only_in_df = df - + if incremental and len(rows_only_in_df) > 0: logger.info("Only generating embeddings for new conversations to save money.") - + # Generate and add embeddings to the index embeddings = generate_embeddings(rows_only_in_df.text.tolist()) - rows_only_in_df['embeddings'] = embeddings + rows_only_in_df["embeddings"] = embeddings final_df = pd.concat([rows_only_in_df, current_df]) logger.info(f"Total conversations: {len(final_df)}") - final_df.to_csv(INDEX_PATH, sep='|', index=False) + final_df.to_csv(INDEX_PATH, sep="|", index=False) + @main.command() -@click.argument('keyword', required=True) -def search(keyword): +def search(keyword: str): """ Search a keyword within the index """ - # TODO: Implement search function - # Load the index from the predefined path logger.info("Searching for keyword: %s", keyword) if os.path.exists(INDEX_PATH): - df = pd.read_csv(INDEX_PATH, sep='|') - df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)]) + df = pd.read_csv(INDEX_PATH, sep="|") + df["embeddings"] = df.embeddings.apply( + lambda x: [float(t) for t in json.loads(x)] + ) filtered = df[df.text.str.contains(keyword)] - + # Calculate top titles and their corresponding chat IDs chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword) - + for i, t in enumerate(top_titles): logger.info("%s: %s", chat_ids[i], t) - logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i]) + logger.info( + "ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i] + ) logger.info("--------------------------------------") else: - click.echo("Index not found. Please build the index first.") + typer.echo("Index not found. Please build the index first.") return + if __name__ == "__main__": main() diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py index 56bf08f..156d29a 100644 --- a/gpthistory/helpers.py +++ b/gpthistory/helpers.py @@ -2,7 +2,9 @@ import os import pandas as pd import numpy as np -import openai +from openai import OpenAI + +client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) from dotenv import load_dotenv import logging @@ -10,56 +12,55 @@ load_dotenv() # Set up OpenAI API key -openai.api_key = os.environ.get('OPENAI_API_KEY') # Define the path to the index file in the user's home directory -INDEX_PATH = os.path.join(os.path.expanduser('~'), '.chatsearch', 'chatindex.csv') +INDEX_PATH = os.path.join(os.path.expanduser("~"), ".chatsearch", "chatindex.csv") # Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) + def extract_text_parts(data): """ Extract text parts from chat data. """ text_parts = [] - message = data.get('message') + message = data.get("message") if message: - content = message.get('content') - if content and content.get('content_type') == 'text': - text_parts.extend(content.get('parts', [])) + content = message.get("content") + if content and content.get("content_type") == "text": + text_parts.extend(content.get("parts", [])) return text_parts + def split_into_batches(array, batch_size): """ Split an array into batches. """ for i in range(0, len(array), batch_size): - yield array[i:i + batch_size] + yield array[i : i + batch_size] + def generate_query_embedding(query): """ Generate an embedding for a query using OpenAI API. """ - response = openai.Embedding.create( - input=[query], - model="text-embedding-ada-002" - ) - return response['data'][0]['embedding'] + response = client.embeddings.create(input=[query], model="text-embedding-ada-002") + return response.data[0].embedding + -def generate_embeddings(conversations): +def genoerate_embeddings(conversations): """ Generate embeddings for conversations using OpenAI API. """ embeddings = [] for i, batch in enumerate(split_into_batches(conversations, 100)): logger.info(f"Generating Embeddings for batch: {i + 1}") - response = openai.Embedding.create( - input=batch, - model="text-embedding-ada-002" - ) - tmp_embedding = [row['embedding'] for row in response['data']] + response = client.embeddings.create(input=batch, model="text-embedding-ada-002") + tmp_embedding = [row["embedding"] for row in response.data] embeddings += tmp_embedding if len(embeddings) > 0: logger.info("Conversations (Chunks) = %d", len(conversations)) @@ -68,12 +69,13 @@ def generate_embeddings(conversations): logger.info("No new conversations detected") return embeddings + def calculate_top_titles(df, query, top_n=1000): """ Calculate top titles for a given query using embeddings. """ # Extract the embeddings from the DataFrame - embedding_array = np.array(df['embeddings'].tolist()) + embedding_array = np.array(df["embeddings"].tolist()) query_embedding = generate_query_embedding(query) # Calculate the dot product between the query embedding and all embeddings in the DataFrame dot_scores = np.dot(embedding_array, query_embedding) @@ -81,8 +83,8 @@ def calculate_top_titles(df, query, top_n=1000): # Filter out titles with dot scores below the threshold mask = dot_scores >= 0.8 filtered_dot_scores = dot_scores[mask] - filtered_titles = df.loc[mask, 'text'].tolist() - filtered_chat_ids = df.loc[mask, 'chat_id'].tolist() + filtered_titles = df.loc[mask, "text"].tolist() + filtered_chat_ids = df.loc[mask, "chat_id"].tolist() # Sort the filtered titles based on the dot scores (in descending order) sorted_indices = np.argsort(filtered_dot_scores)[::-1][:top_n] diff --git a/setup.py b/setup.py index 93a0f68..2d91494 100644 --- a/setup.py +++ b/setup.py @@ -1,22 +1,16 @@ from setuptools import setup, find_packages setup( - name='gpthistory', - version='0.3', - description='A tool for searching through your chatgpt conversation history', - author='Shrikar Archak', - author_email='shrikar84@gmail.com', + name="gpthistory", + version="0.3", + description="A tool for searching through your chatgpt conversation history", + author="Shrikar Archak", + author_email="shrikar84@gmail.com", packages=find_packages(), include_package_data=True, - install_requires=[ - 'Click', - 'python-dotenv', - 'openai', - 'pandas', - 'numpy' - ], - entry_points=''' + install_requires=["typer", "python-dotenv", "openai", "pandas", "numpy"], + entry_points=""" [console_scripts] gpthistory=gpthistory.gpthistory:main - ''', + """, ) From 0f14e4f676556303660f4802d93ffd9e5acf6a34 Mon Sep 17 00:00:00 2001 From: Tony Okeke Date: Mon, 26 Feb 2024 00:19:50 -0500 Subject: [PATCH 3/7] migrate to openai 1.0.0 API --- .env | 1 + gpthistory.egg-info/PKG-INFO | 6 ++ gpthistory.egg-info/entry_points.txt | 6 +- gpthistory.egg-info/requires.txt | 6 +- .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 152 bytes .../__pycache__/gpthistory.cpython-310.pyc | Bin 0 -> 3048 bytes .../__pycache__/helpers.cpython-310.pyc | Bin 0 -> 3082 bytes gpthistory/gpthistory.py | 91 +++++++++--------- gpthistory/helpers.py | 46 ++++----- setup.py | 22 ++--- 10 files changed, 93 insertions(+), 85 deletions(-) create mode 100644 .env create mode 100644 gpthistory/__pycache__/__init__.cpython-310.pyc create mode 100644 gpthistory/__pycache__/gpthistory.cpython-310.pyc create mode 100644 gpthistory/__pycache__/helpers.cpython-310.pyc diff --git a/.env b/.env new file mode 100644 index 0000000..a390e60 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +OPENAI_API_KEY="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" \ No newline at end of file diff --git a/gpthistory.egg-info/PKG-INFO b/gpthistory.egg-info/PKG-INFO index 5cbb48a..da0adaa 100644 --- a/gpthistory.egg-info/PKG-INFO +++ b/gpthistory.egg-info/PKG-INFO @@ -2,6 +2,12 @@ Metadata-Version: 2.1 Name: gpthistory Version: 0.3 Summary: A tool for searching through your chatgpt conversation history +Home-page: UNKNOWN Author: Shrikar Archak Author-email: shrikar84@gmail.com +License: UNKNOWN +Platform: UNKNOWN License-File: LICENSE.md + +UNKNOWN + diff --git a/gpthistory.egg-info/entry_points.txt b/gpthistory.egg-info/entry_points.txt index ce90c75..d13f257 100644 --- a/gpthistory.egg-info/entry_points.txt +++ b/gpthistory.egg-info/entry_points.txt @@ -1,2 +1,4 @@ -[console_scripts] -gpthistory = gpthistory.gpthistory:main + + [console_scripts] + gpthistory=gpthistory.gpthistory:app + \ No newline at end of file diff --git a/gpthistory.egg-info/requires.txt b/gpthistory.egg-info/requires.txt index dbd3260..2cbb3f1 100644 --- a/gpthistory.egg-info/requires.txt +++ b/gpthistory.egg-info/requires.txt @@ -1,5 +1,5 @@ -Click -python-dotenv +numpy openai pandas -numpy +python-dotenv +typer diff --git a/gpthistory/__pycache__/__init__.cpython-310.pyc b/gpthistory/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7e5b61ee4a144c6d9cf2c3d3062a8e7eacd0cbc GIT binary patch literal 152 zcmd1j<>g`kg8NmsQ$h4&5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Huer{fgeqvEk zVs>V+esN}MNpgmMQEEYcv3`0%Nk(RINq$i!GBZ9tGcU6wK3=b&@)n0pZhlH>PO2Tq L&|)Sa!NLFl32!91 literal 0 HcmV?d00001 diff --git a/gpthistory/__pycache__/gpthistory.cpython-310.pyc b/gpthistory/__pycache__/gpthistory.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d1fd228ab56d8520127ce9d9afd7c8ee66725b7 GIT binary patch literal 3048 zcma)8&2JmW72nw}E|;ICBwMl(bYVbAh$K`{6lkl;ZW60<4HU9l324D2Al5rWa;g2G zXNLM9mPG;iszc9h0rD82mtK3yAJD&G&|6PA_vRkd_lAlk6$#KK=Iy-Md2i-@{T}mL z%{Sor*)p~`k!)le*^y&s%q4gt^f6glB27&t?=+SO`1NS%f+m>pv zQ^~_f_9RSt!7|UTS zWX{i~wP+a1IHT;e6N%$#d-MHJL^4iUTLd2ko9`#=m?c^GmQR@w;Q+q2P37p1%I)Vo z4W+XCal#^k7rlC#@OEK5fg%0@U!*Z2FwD6nZSWnTCi+S`bGI-SWJU(Y%!FaVunV$l zeEIRrDI98T81vG?0RFu%%$YkYNv|lKlbI*Wg?A1yII{WE!lU+vL7gKD=H}c7Id=oJ z5~HwybM?%Incj!otKikTtjw#lB<(*Le=%`Q^QhN6CvVqk^Lk+xR$dE z{KD6|e67pBsS7l(!q*1)t$AV1YDI0nM(-@h=q_#1l?yVfWM)yGRf;PA8Z4+>dFE?v z)2g=aUQvO2uP+SnUcIOn6}q-#z}+^AO5qQ{1G9$SeHY{#*B&Suqx<@XReJBjg1lTm zFxnf_pZEbhpH1R~24NQHGz$7WPlGTR#K$ZPFuMX8$}qs>2u@E=wa8(cD36^&4)*dR zRZ7G07;=_l?kGcnBIGmXUB{S$lN>=%X1ms4xl_r zcw#T6$`=gsIL~lyW3BKSXoqu4aW+oEi0!>0#aBdGf{m4vPbA}z($+9PQDrbBjzTza zy4Fh>AF$xjqu^n}`f%MWIeVzw&75_BfP7Rf0Jd ztn#1j@BZR=0|Zfyc9?QrO&C8@Hs$%4TPW+wJb{2K z7rX$@MIHs(P3~#cm?g@)3ZGDw>-Y)fU;yF8G6_n_ks`;ctN~C&s@4>vm z^An*%1=qxdN}NTUrSLgSR9WAU!aCo;lf8bNg$b}<8T?aNc=Kp0&E!_dd3Y3yEfF&r z4YxQO=VI#`Ilp__8K0>NCM{h$XdU#47eMa1zGa&>v57}kVXTw7>61D!|6smpmR3oN zG)WbB&+=eT6-Hh2P183`;$7M{%t#%+)XL|U(ygKZ&$t*1SAMh zmxBc@;7oG*zsGpr%HtvY8IbAv_Z^|@VCrrP^xO2uPXI!nzBmkSqHK`F+0k||lyWS# zx3;iMJ2{|67D6ilN%y^b*S3@&76zkP>q zq7m;Rd4S{{Bp)F80h0HSe27FBv;%mSoGlT{5|RU|CO8IBm030c41j+|04!%? zL_8J^^R~x#QOD144}d}fErC(~BV_&SzfjA$h@c$_MPmLGWe_)R#MoVW`ZusD9s;R4 zSlcfAyvvr=w3=p1BY}Ncwe0`+C7bZy!ZzjgdMVtq*K6PBzebe-5;R%4W9WjR(6*`) zA~oc^YFz_TXUGz;WeMyWa+lURhf5Ptv349ZgFMcl-<+b#iC|oN5pXLMaRfo{#{*?Q z+uwVx$^#~UwQLG-HS8+Ao{k0fy)*@JURP@Qx0#r?fmY4rhT2)_Ax^Jd!cI?MyJ6>5}KW+XdD>eWxdJ6DNU!j3;r7Sj;ziV(d@PGU*&b~ zKR3bGJe(149I?(UpWuyWMrV#U`6TLVe2Pz_p66G13-z0Pme24vp0Un#&K?@=xnrE( z*l085-XDl;WnE3)f3UW(vL3EHSPy@-_9$n+DXR6)1|oB9ky>SL`7UCebFw0#o*>bDt1MhMdN;76h+jcB; zF8fg#Gdk5tu~EDD@8K_2LAu61tBgaK{D|$Fl_`VDM9)Oesx0&@^t5reX^tH-J>qnY zBk(YzBE{X|_x%86?P)2axC~11v)(>cr*bs8x>4lu{q;&bKdw3u=%9UO&HP| zzNanpaF8Zt5M=?Wuoom*nFo*I$}LfhOeo2z8{}&lWnH1HRAg;SIhtSjnl~(xp-?Q* zCK-{dSfyG8Ne@YAcp~(DIjr>k5Cu;{uJ(YRnGimh>c{S6(8uhJj@aCTrGFaY4) z-QjvBv59fSHijz~y)#x2Aq(Fi9Cr~9NI|g9H+bln zL%-oWCk`ebuATaKBt(Pjc5qBsmDh>f0J&hVQ&c^~L_y)yH7etgOIYH@yzy7_355ld z@6u`tZTQYyx}_|ce|Fht{+$$o9q0YQ-H*ZfHOZbah6`|It|55l#bW7Bn#WOE+(qMb zkw((+`n?OPFWlSO$+nAj@KJD+52wy)s0Y5akq4Rh?2^5>C`DWf-mU|6*-l^Yu0u+> zY2NJ$sq7?!o!=(40wV7aA)LxtkhZIu7hEu6n#YuKv{~hMsNF34gEOrz8)W8nntzCO z#WfJyo@Fy^8lTS^y0$I<6|&j~m^}U;Qdx|DTySmc_@}YrE0EH_T>wu_`1*>@|aa_5cxT4nUm;?^b-AFcZfWatrlJVKJ)s)h^N!RLOC zjKZDDq*S@@ePL|si~E^Zc{(QQUoec+xi=p1t6JrOccQ9Sel@Xa0YU!oryB3~qBPz~ zDIJyhASjbE6(g)hK{weI*@cvJ7855)H-OsRFaJ*E_+v>H!3~DHEll{v@hgi<;mHogh?S(t?}9v{#`i#m3qbNns}Nd+ zBduPC@NYbx{2>XW_!en8a?W%r6aQ)Z6aXb@#QDwH|k%0X5b?8)1-jho}K7$uVOm8#(} zn8-Yn^xV+-LAf|6MkPHgRE_cna;>rlQMm=_d6MDIdrGo*aE~c}ePea)x8Z}8Pkx~$ z9!EtIBcW|3Sg^kFv-_%!x74pkDaZ>$dO(JuT%^4Sp*lxt*Xb;vRXGO>4G?V_?+z~C nomAYV++Mr~;u#*)he=q=Y@l|nYmRNX%zfosFjCV)nQZ+Zcf=zE literal 0 HcmV?d00001 diff --git a/gpthistory/gpthistory.py b/gpthistory/gpthistory.py index 8adc06e..c9570d6 100644 --- a/gpthistory/gpthistory.py +++ b/gpthistory/gpthistory.py @@ -1,107 +1,110 @@ -import click +import typer import json import os import pandas as pd import logging -from gpthistory.helpers import extract_text_parts, generate_embeddings, calculate_top_titles +from gpthistory.helpers import ( + extract_text_parts, + generate_embeddings, + calculate_top_titles, +) + +main = typer.Typer() # Define the path to the index file in the user's home directory -INDEX_PATH = os.path.join(os.path.expanduser('~'), '.gpthistory', 'chatindex.csv') +INDEX_PATH = os.path.join(os.path.expanduser("~"), ".gpthistory", "chatindex.csv") # Configure the logger -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) -@click.group() -def main(): - """ - Simple CLI for searching within a chat data - """ - pass @main.command() -@click.option('--file', type=click.Path(exists=True), help='Input file') -def build_index(file): +def build_index(file: typer.FileText): """ - Build an index from a given chat data file + Build an index from a given chat data file xxx """ - # TODO: Implement index building - # Write the index to the predefined path # Make sure the directory exists os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True) - + # Load the chat data from the given file - with open(file) as f: - data = json.load(f) - + data = json.load(file) + chat_ids = [] section_ids = [] texts = [] for entry in data: - for k, v in entry['mapping'].items(): + for k, v in entry["mapping"].items(): text_data = extract_text_parts(v) - if len(text_data) > 0 and text_data[0] != '': + if len(text_data) > 0 and text_data[0] != "": # Add relevant chat information to the index - chat_ids.append(entry['id']) + chat_ids.append(entry["id"]) section_ids.append(k) texts.append(text_data[0]) logger.info(f"Index built and stored at: {INDEX_PATH}") logger.info(f"Conversations indexed: {len(chat_ids)}") - df = pd.DataFrame({'chat_id': chat_ids, 'section_id': section_ids, 'text': texts}) - df = df[~df.text.isna()] - df['id'] = df['chat_id'] + df = pd.DataFrame({"chat_id": chat_ids, "section_id": section_ids, "text": texts}) + df = df[~df.text.isna()] + df["id"] = df["chat_id"] df.set_index("id", inplace=True) # Handle incremental index updates - current_df = pd.DataFrame() + current_df = pd.DataFrame() rows_only_in_df = pd.DataFrame() incremental = False if os.path.exists(INDEX_PATH): incremental = True - current_df = pd.read_csv(INDEX_PATH, sep='|') - current_df['id'] = current_df['chat_id'] + current_df = pd.read_csv(INDEX_PATH, sep="|") + current_df["id"] = current_df["chat_id"] current_df.set_index("id", inplace=True) # Use merge with indicator=True to find rows present in one DataFrame but not the other - merged_df = df.merge(current_df, how='outer', indicator=True) + merged_df = df.merge(current_df, how="outer", indicator=True) # Query rows only present in df1 - rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge') + rows_only_in_df = merged_df.query('_merge == "left_only"').drop( + columns="_merge" + ) else: rows_only_in_df = df - + if incremental and len(rows_only_in_df) > 0: logger.info("Only generating embeddings for new conversations to save money.") - + # Generate and add embeddings to the index embeddings = generate_embeddings(rows_only_in_df.text.tolist()) - rows_only_in_df['embeddings'] = embeddings + rows_only_in_df["embeddings"] = embeddings final_df = pd.concat([rows_only_in_df, current_df]) logger.info(f"Total conversations: {len(final_df)}") - final_df.to_csv(INDEX_PATH, sep='|', index=False) + final_df.to_csv(INDEX_PATH, sep="|", index=False) + @main.command() -@click.argument('keyword', required=True) -def search(keyword): +def search(keyword: str): """ Search a keyword within the index """ - # TODO: Implement search function - # Load the index from the predefined path logger.info("Searching for keyword: %s", keyword) if os.path.exists(INDEX_PATH): - df = pd.read_csv(INDEX_PATH, sep='|') - df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)]) + df = pd.read_csv(INDEX_PATH, sep="|") + df["embeddings"] = df.embeddings.apply( + lambda x: [float(t) for t in json.loads(x)] + ) filtered = df[df.text.str.contains(keyword)] - + # Calculate top titles and their corresponding chat IDs chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword) - + for i, t in enumerate(top_titles): logger.info("%s: %s", chat_ids[i], t) - logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i]) + logger.info( + "ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i] + ) logger.info("--------------------------------------") else: - click.echo("Index not found. Please build the index first.") + typer.echo("Index not found. Please build the index first.") return + if __name__ == "__main__": main() diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py index 56bf08f..003203a 100644 --- a/gpthistory/helpers.py +++ b/gpthistory/helpers.py @@ -2,7 +2,9 @@ import os import pandas as pd import numpy as np -import openai +from openai import OpenAI + +client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) from dotenv import load_dotenv import logging @@ -10,43 +12,45 @@ load_dotenv() # Set up OpenAI API key -openai.api_key = os.environ.get('OPENAI_API_KEY') # Define the path to the index file in the user's home directory -INDEX_PATH = os.path.join(os.path.expanduser('~'), '.chatsearch', 'chatindex.csv') +INDEX_PATH = os.path.join(os.path.expanduser("~"), ".chatsearch", "chatindex.csv") # Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) + def extract_text_parts(data): """ Extract text parts from chat data. """ text_parts = [] - message = data.get('message') + message = data.get("message") if message: - content = message.get('content') - if content and content.get('content_type') == 'text': - text_parts.extend(content.get('parts', [])) + content = message.get("content") + if content and content.get("content_type") == "text": + text_parts.extend(content.get("parts", [])) return text_parts + def split_into_batches(array, batch_size): """ Split an array into batches. """ for i in range(0, len(array), batch_size): - yield array[i:i + batch_size] + yield array[i : i + batch_size] + def generate_query_embedding(query): """ Generate an embedding for a query using OpenAI API. """ - response = openai.Embedding.create( - input=[query], - model="text-embedding-ada-002" - ) - return response['data'][0]['embedding'] + response = client.embeddings.create(input=[query], model="text-embedding-ada-002") + return response.data[0].embedding + def generate_embeddings(conversations): """ @@ -55,11 +59,8 @@ def generate_embeddings(conversations): embeddings = [] for i, batch in enumerate(split_into_batches(conversations, 100)): logger.info(f"Generating Embeddings for batch: {i + 1}") - response = openai.Embedding.create( - input=batch, - model="text-embedding-ada-002" - ) - tmp_embedding = [row['embedding'] for row in response['data']] + response = client.embeddings.create(input=batch, model="text-embedding-ada-002") + tmp_embedding = [row["embedding"] for row in response.data] embeddings += tmp_embedding if len(embeddings) > 0: logger.info("Conversations (Chunks) = %d", len(conversations)) @@ -68,12 +69,13 @@ def generate_embeddings(conversations): logger.info("No new conversations detected") return embeddings + def calculate_top_titles(df, query, top_n=1000): """ Calculate top titles for a given query using embeddings. """ # Extract the embeddings from the DataFrame - embedding_array = np.array(df['embeddings'].tolist()) + embedding_array = np.array(df["embeddings"].tolist()) query_embedding = generate_query_embedding(query) # Calculate the dot product between the query embedding and all embeddings in the DataFrame dot_scores = np.dot(embedding_array, query_embedding) @@ -81,8 +83,8 @@ def calculate_top_titles(df, query, top_n=1000): # Filter out titles with dot scores below the threshold mask = dot_scores >= 0.8 filtered_dot_scores = dot_scores[mask] - filtered_titles = df.loc[mask, 'text'].tolist() - filtered_chat_ids = df.loc[mask, 'chat_id'].tolist() + filtered_titles = df.loc[mask, "text"].tolist() + filtered_chat_ids = df.loc[mask, "chat_id"].tolist() # Sort the filtered titles based on the dot scores (in descending order) sorted_indices = np.argsort(filtered_dot_scores)[::-1][:top_n] diff --git a/setup.py b/setup.py index 93a0f68..2d91494 100644 --- a/setup.py +++ b/setup.py @@ -1,22 +1,16 @@ from setuptools import setup, find_packages setup( - name='gpthistory', - version='0.3', - description='A tool for searching through your chatgpt conversation history', - author='Shrikar Archak', - author_email='shrikar84@gmail.com', + name="gpthistory", + version="0.3", + description="A tool for searching through your chatgpt conversation history", + author="Shrikar Archak", + author_email="shrikar84@gmail.com", packages=find_packages(), include_package_data=True, - install_requires=[ - 'Click', - 'python-dotenv', - 'openai', - 'pandas', - 'numpy' - ], - entry_points=''' + install_requires=["typer", "python-dotenv", "openai", "pandas", "numpy"], + entry_points=""" [console_scripts] gpthistory=gpthistory.gpthistory:main - ''', + """, ) From 40603bb639cf45b4410efac0e0aa411d69792200 Mon Sep 17 00:00:00 2001 From: Tony Okeke Date: Mon, 26 Feb 2024 00:22:35 -0500 Subject: [PATCH 4/7] fix key leak --- .env | 1 - .gitignore | 3 ++- gpthistory/__pycache__/__init__.cpython-310.pyc | Bin 152 -> 0 bytes .../__pycache__/gpthistory.cpython-310.pyc | Bin 3048 -> 0 bytes gpthistory/__pycache__/helpers.cpython-310.pyc | Bin 3082 -> 0 bytes 5 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 .env delete mode 100644 gpthistory/__pycache__/__init__.cpython-310.pyc delete mode 100644 gpthistory/__pycache__/gpthistory.cpython-310.pyc delete mode 100644 gpthistory/__pycache__/helpers.cpython-310.pyc diff --git a/.env b/.env deleted file mode 100644 index a390e60..0000000 --- a/.env +++ /dev/null @@ -1 +0,0 @@ -OPENAI_API_KEY="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" \ No newline at end of file diff --git a/.gitignore b/.gitignore index a2a8dea..d50a09f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.env/ +.env +__pycache__/ diff --git a/gpthistory/__pycache__/__init__.cpython-310.pyc b/gpthistory/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index c7e5b61ee4a144c6d9cf2c3d3062a8e7eacd0cbc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 152 zcmd1j<>g`kg8NmsQ$h4&5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Huer{fgeqvEk zVs>V+esN}MNpgmMQEEYcv3`0%Nk(RINq$i!GBZ9tGcU6wK3=b&@)n0pZhlH>PO2Tq L&|)Sa!NLFl32!91 diff --git a/gpthistory/__pycache__/gpthistory.cpython-310.pyc b/gpthistory/__pycache__/gpthistory.cpython-310.pyc deleted file mode 100644 index 2d1fd228ab56d8520127ce9d9afd7c8ee66725b7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3048 zcma)8&2JmW72nw}E|;ICBwMl(bYVbAh$K`{6lkl;ZW60<4HU9l324D2Al5rWa;g2G zXNLM9mPG;iszc9h0rD82mtK3yAJD&G&|6PA_vRkd_lAlk6$#KK=Iy-Md2i-@{T}mL z%{Sor*)p~`k!)le*^y&s%q4gt^f6glB27&t?=+SO`1NS%f+m>pv zQ^~_f_9RSt!7|UTS zWX{i~wP+a1IHT;e6N%$#d-MHJL^4iUTLd2ko9`#=m?c^GmQR@w;Q+q2P37p1%I)Vo z4W+XCal#^k7rlC#@OEK5fg%0@U!*Z2FwD6nZSWnTCi+S`bGI-SWJU(Y%!FaVunV$l zeEIRrDI98T81vG?0RFu%%$YkYNv|lKlbI*Wg?A1yII{WE!lU+vL7gKD=H}c7Id=oJ z5~HwybM?%Incj!otKikTtjw#lB<(*Le=%`Q^QhN6CvVqk^Lk+xR$dE z{KD6|e67pBsS7l(!q*1)t$AV1YDI0nM(-@h=q_#1l?yVfWM)yGRf;PA8Z4+>dFE?v z)2g=aUQvO2uP+SnUcIOn6}q-#z}+^AO5qQ{1G9$SeHY{#*B&Suqx<@XReJBjg1lTm zFxnf_pZEbhpH1R~24NQHGz$7WPlGTR#K$ZPFuMX8$}qs>2u@E=wa8(cD36^&4)*dR zRZ7G07;=_l?kGcnBIGmXUB{S$lN>=%X1ms4xl_r zcw#T6$`=gsIL~lyW3BKSXoqu4aW+oEi0!>0#aBdGf{m4vPbA}z($+9PQDrbBjzTza zy4Fh>AF$xjqu^n}`f%MWIeVzw&75_BfP7Rf0Jd ztn#1j@BZR=0|Zfyc9?QrO&C8@Hs$%4TPW+wJb{2K z7rX$@MIHs(P3~#cm?g@)3ZGDw>-Y)fU;yF8G6_n_ks`;ctN~C&s@4>vm z^An*%1=qxdN}NTUrSLgSR9WAU!aCo;lf8bNg$b}<8T?aNc=Kp0&E!_dd3Y3yEfF&r z4YxQO=VI#`Ilp__8K0>NCM{h$XdU#47eMa1zGa&>v57}kVXTw7>61D!|6smpmR3oN zG)WbB&+=eT6-Hh2P183`;$7M{%t#%+)XL|U(ygKZ&$t*1SAMh zmxBc@;7oG*zsGpr%HtvY8IbAv_Z^|@VCrrP^xO2uPXI!nzBmkSqHK`F+0k||lyWS# zx3;iMJ2{|67D6ilN%y^b*S3@&76zkP>q zq7m;Rd4S{{Bp)F80h0HSe27FBv;%mSoGlT{5|RU|CO8IBm030c41j+|04!%? zL_8J^^R~x#QOD144}d}fErC(~BV_&SzfjA$h@c$_MPmLGWe_)R#MoVW`ZusD9s;R4 zSlcfAyvvr=w3=p1BY}Ncwe0`+C7bZy!ZzjgdMVtq*K6PBzebe-5;R%4W9WjR(6*`) zA~oc^YFz_TXUGz;WeMyWa+lURhf5Ptv349ZgFMcl-<+b#iC|oN5pXLMaRfo{#{*?Q z+uwVx$^#~UwQLG-HS8+Ao{k0fy)*@JURP@Qx0#r?fmY4rhT2)_Ax^Jd!cI?MyJ6>5}KW+XdD>eWxdJ6DNU!j3;r7Sj;ziV(d@PGU*&b~ zKR3bGJe(149I?(UpWuyWMrV#U`6TLVe2Pz_p66G13-z0Pme24vp0Un#&K?@=xnrE( z*l085-XDl;WnE3)f3UW(vL3EHSPy@-_9$n+DXR6)1|oB9ky>SL`7UCebFw0#o*>bDt1MhMdN;76h+jcB; zF8fg#Gdk5tu~EDD@8K_2LAu61tBgaK{D|$Fl_`VDM9)Oesx0&@^t5reX^tH-J>qnY zBk(YzBE{X|_x%86?P)2axC~11v)(>cr*bs8x>4lu{q;&bKdw3u=%9UO&HP| zzNanpaF8Zt5M=?Wuoom*nFo*I$}LfhOeo2z8{}&lWnH1HRAg;SIhtSjnl~(xp-?Q* zCK-{dSfyG8Ne@YAcp~(DIjr>k5Cu;{uJ(YRnGimh>c{S6(8uhJj@aCTrGFaY4) z-QjvBv59fSHijz~y)#x2Aq(Fi9Cr~9NI|g9H+bln zL%-oWCk`ebuATaKBt(Pjc5qBsmDh>f0J&hVQ&c^~L_y)yH7etgOIYH@yzy7_355ld z@6u`tZTQYyx}_|ce|Fht{+$$o9q0YQ-H*ZfHOZbah6`|It|55l#bW7Bn#WOE+(qMb zkw((+`n?OPFWlSO$+nAj@KJD+52wy)s0Y5akq4Rh?2^5>C`DWf-mU|6*-l^Yu0u+> zY2NJ$sq7?!o!=(40wV7aA)LxtkhZIu7hEu6n#YuKv{~hMsNF34gEOrz8)W8nntzCO z#WfJyo@Fy^8lTS^y0$I<6|&j~m^}U;Qdx|DTySmc_@}YrE0EH_T>wu_`1*>@|aa_5cxT4nUm;?^b-AFcZfWatrlJVKJ)s)h^N!RLOC zjKZDDq*S@@ePL|si~E^Zc{(QQUoec+xi=p1t6JrOccQ9Sel@Xa0YU!oryB3~qBPz~ zDIJyhASjbE6(g)hK{weI*@cvJ7855)H-OsRFaJ*E_+v>H!3~DHEll{v@hgi<;mHogh?S(t?}9v{#`i#m3qbNns}Nd+ zBduPC@NYbx{2>XW_!en8a?W%r6aQ)Z6aXb@#QDwH|k%0X5b?8)1-jho}K7$uVOm8#(} zn8-Yn^xV+-LAf|6MkPHgRE_cna;>rlQMm=_d6MDIdrGo*aE~c}ePea)x8Z}8Pkx~$ z9!EtIBcW|3Sg^kFv-_%!x74pkDaZ>$dO(JuT%^4Sp*lxt*Xb;vRXGO>4G?V_?+z~C nomAYV++Mr~;u#*)he=q=Y@l|nYmRNX%zfosFjCV)nQZ+Zcf=zE From 4038a50cd8f8bed7607cf8781f086525c11574bf Mon Sep 17 00:00:00 2001 From: Tony Okeke Date: Mon, 26 Feb 2024 00:25:41 -0500 Subject: [PATCH 5/7] remove .env --- .env | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .env diff --git a/.env b/.env deleted file mode 100644 index a390e60..0000000 --- a/.env +++ /dev/null @@ -1 +0,0 @@ -OPENAI_API_KEY="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" \ No newline at end of file From 5adbc26919b19535b2c606236ecfa2bdb07cf91f Mon Sep 17 00:00:00 2001 From: Tony Okeke Date: Mon, 26 Feb 2024 00:26:21 -0500 Subject: [PATCH 6/7] fix typo --- .../__pycache__/helpers.cpython-310.pyc | Bin 3067 -> 3082 bytes gpthistory/helpers.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/gpthistory/__pycache__/helpers.cpython-310.pyc b/gpthistory/__pycache__/helpers.cpython-310.pyc index 36d2d91438589b7b190375ee38d6c080771841c5..a6a1f8cf61d3e0fd3ab51b17dc96e55d5f40669e 100644 GIT binary patch delta 1362 zcmZ8g&2QX96rbm>wY_$-**N>r4@gT>vQ!k5N(4}q5|Ky*L>nbsSX5Tqo$1D|*WQes zqEs7&YI{NoQe_FLoU#{0B_xD;;lP7ZX8byXWEe$5$>NDxuLUGz&mjfk9v%TT~qZ)sCQPQ$qEy*mIf=5R@Tw zn=W&P1i27Q9Z%<2q6YE{)EueJ8RiCM{AQ^|?IQ(w?y<+_M$oL#B6S{-W|g{h3fCH) zrX^hKTXcq&Q8+^@)T48cpm~-*NNniB=!8CtQem^+9Y(1ibram?)6u0 zAoCvu!6OeAF%5AR8w=SJW9!hu97cP+JTG~%gb~KwF5^P;qq{MGUH)iZ!UYL3>%4-{ zFofHQqx*~}?bOF~k4bej3H?qx?d6YOeAQI@1Yh6Mv!4rfn4U*Y_s1u4ytu)g>@RbDZSo|aDtr;`T)Kphedewo;lL*xh}#*T0MTvk~Wu%mq*M_hJpUPEDw`<&~!UDg=e zDf^`OdGNO6coJl@%J)zBk_73(|Dw%Exng=|H3Npcy%Bqqq|Kv{8cHPo7} RsRkG)wrZ-TYvPzH{|iUlIXeIV delta 1302 zcmZ8g&2Jk;6rZ=d-YzSBM*SpT_ zrl^slRJbRk9;&$jr}$E-7eMM?z#qUFF&sGe%#BL4Z`P`*nAN;_@6GS+ym`NQ-?d&e zz0~tug5$4)KiTN9w+z2LK6~Z}VzX;fLbcc$+Fct6>JU3!huISXgu$+RT&DUIm>E{k z@{X&8tWjeMv_#FR-mNod_XN%cwW$3_!zkx?f%#MDE>efO4@q~4mT3joCiQ3)*A}hO zIAZGIv(H&CXMrY5d;5&vmo{hFC`~dpyTj}v z;wNKyF;KZECQZWt4a|S*2454>SZ;lf&H*`uoa8V@8YkKWu9Gj zX$<4iC4}ucxiy9fQ0K0}JGpkOQx`pmnIeaUU7jJ7Jw|#8Uu6n2EAWIU?U`*nm7f%@ zdHL$HQdO#W4{iN@sq^%kH}l6G>0XJWEbpho(Z&Yaf2$Jvf5&yaXJ_(QNl+Nn9}Q~$vwuiULIlfq&u&B#+!(c zrGk8}xR+i&9EEeh{}SDh3z$-I`hS@01B}bm$%e56KA1>=qM%{8P=PvrhVH&JfS-Hz zv(<*$IwA)@{-DP5tK1d8+nvVzVLsLP8QkYhvF^0}`BUZZq4T!d2*hpY#?b|(&V%?x zG_gHuJ5h{vVKmI6JZ9Mpp`LmX|3wk6pqA8<5nn`Ant7^f&)1ZpV{1u0PW#g8@j;gI zJg7*G?(kK$TguZ>n8+F%idFYPbH*vZgj@Ww5=Bq>;w#688{#*2)!RgG0Q|gK-V!g| zQx`u}I8`cX&bpW@s->k!#6FM~Hg?3*gjWi8Zxn@h*?qBH{&eT83T-H%I Date: Mon, 26 Feb 2024 15:24:54 -0500 Subject: [PATCH 7/7] feat: update code to work with OpenAI v1.0.0 API feat: make search more flexible by exposing topk and thr parameters --- gpthistory.egg-info/PKG-INFO | 12 ++-- gpthistory.egg-info/entry_points.txt | 6 +- gpthistory.egg-info/requires.txt | 7 ++- .../__pycache__/__init__.cpython-310.pyc | Bin 152 -> 0 bytes .../__pycache__/gpthistory.cpython-310.pyc | Bin 3048 -> 0 bytes .../__pycache__/helpers.cpython-310.pyc | Bin 3082 -> 0 bytes gpthistory/gpthistory.py | 56 ++++++++++------- gpthistory/helpers.py | 58 ++++++++++++------ setup.py | 9 ++- 9 files changed, 94 insertions(+), 54 deletions(-) delete mode 100644 gpthistory/__pycache__/__init__.cpython-310.pyc delete mode 100644 gpthistory/__pycache__/gpthistory.cpython-310.pyc delete mode 100644 gpthistory/__pycache__/helpers.cpython-310.pyc diff --git a/gpthistory.egg-info/PKG-INFO b/gpthistory.egg-info/PKG-INFO index da0adaa..af91d66 100644 --- a/gpthistory.egg-info/PKG-INFO +++ b/gpthistory.egg-info/PKG-INFO @@ -2,12 +2,12 @@ Metadata-Version: 2.1 Name: gpthistory Version: 0.3 Summary: A tool for searching through your chatgpt conversation history -Home-page: UNKNOWN Author: Shrikar Archak Author-email: shrikar84@gmail.com -License: UNKNOWN -Platform: UNKNOWN License-File: LICENSE.md - -UNKNOWN - +Requires-Dist: typer +Requires-Dist: python-dotenv +Requires-Dist: openai +Requires-Dist: pandas +Requires-Dist: numpy +Requires-Dist: loguru diff --git a/gpthistory.egg-info/entry_points.txt b/gpthistory.egg-info/entry_points.txt index d13f257..ce90c75 100644 --- a/gpthistory.egg-info/entry_points.txt +++ b/gpthistory.egg-info/entry_points.txt @@ -1,4 +1,2 @@ - - [console_scripts] - gpthistory=gpthistory.gpthistory:app - \ No newline at end of file +[console_scripts] +gpthistory = gpthistory.gpthistory:main diff --git a/gpthistory.egg-info/requires.txt b/gpthistory.egg-info/requires.txt index 2cbb3f1..2f2931b 100644 --- a/gpthistory.egg-info/requires.txt +++ b/gpthistory.egg-info/requires.txt @@ -1,5 +1,6 @@ -numpy +typer +python-dotenv openai pandas -python-dotenv -typer +numpy +loguru diff --git a/gpthistory/__pycache__/__init__.cpython-310.pyc b/gpthistory/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index c7e5b61ee4a144c6d9cf2c3d3062a8e7eacd0cbc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 152 zcmd1j<>g`kg8NmsQ$h4&5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Huer{fgeqvEk zVs>V+esN}MNpgmMQEEYcv3`0%Nk(RINq$i!GBZ9tGcU6wK3=b&@)n0pZhlH>PO2Tq L&|)Sa!NLFl32!91 diff --git a/gpthistory/__pycache__/gpthistory.cpython-310.pyc b/gpthistory/__pycache__/gpthistory.cpython-310.pyc deleted file mode 100644 index 2d1fd228ab56d8520127ce9d9afd7c8ee66725b7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3048 zcma)8&2JmW72nw}E|;ICBwMl(bYVbAh$K`{6lkl;ZW60<4HU9l324D2Al5rWa;g2G zXNLM9mPG;iszc9h0rD82mtK3yAJD&G&|6PA_vRkd_lAlk6$#KK=Iy-Md2i-@{T}mL z%{Sor*)p~`k!)le*^y&s%q4gt^f6glB27&t?=+SO`1NS%f+m>pv zQ^~_f_9RSt!7|UTS zWX{i~wP+a1IHT;e6N%$#d-MHJL^4iUTLd2ko9`#=m?c^GmQR@w;Q+q2P37p1%I)Vo z4W+XCal#^k7rlC#@OEK5fg%0@U!*Z2FwD6nZSWnTCi+S`bGI-SWJU(Y%!FaVunV$l zeEIRrDI98T81vG?0RFu%%$YkYNv|lKlbI*Wg?A1yII{WE!lU+vL7gKD=H}c7Id=oJ z5~HwybM?%Incj!otKikTtjw#lB<(*Le=%`Q^QhN6CvVqk^Lk+xR$dE z{KD6|e67pBsS7l(!q*1)t$AV1YDI0nM(-@h=q_#1l?yVfWM)yGRf;PA8Z4+>dFE?v z)2g=aUQvO2uP+SnUcIOn6}q-#z}+^AO5qQ{1G9$SeHY{#*B&Suqx<@XReJBjg1lTm zFxnf_pZEbhpH1R~24NQHGz$7WPlGTR#K$ZPFuMX8$}qs>2u@E=wa8(cD36^&4)*dR zRZ7G07;=_l?kGcnBIGmXUB{S$lN>=%X1ms4xl_r zcw#T6$`=gsIL~lyW3BKSXoqu4aW+oEi0!>0#aBdGf{m4vPbA}z($+9PQDrbBjzTza zy4Fh>AF$xjqu^n}`f%MWIeVzw&75_BfP7Rf0Jd ztn#1j@BZR=0|Zfyc9?QrO&C8@Hs$%4TPW+wJb{2K z7rX$@MIHs(P3~#cm?g@)3ZGDw>-Y)fU;yF8G6_n_ks`;ctN~C&s@4>vm z^An*%1=qxdN}NTUrSLgSR9WAU!aCo;lf8bNg$b}<8T?aNc=Kp0&E!_dd3Y3yEfF&r z4YxQO=VI#`Ilp__8K0>NCM{h$XdU#47eMa1zGa&>v57}kVXTw7>61D!|6smpmR3oN zG)WbB&+=eT6-Hh2P183`;$7M{%t#%+)XL|U(ygKZ&$t*1SAMh zmxBc@;7oG*zsGpr%HtvY8IbAv_Z^|@VCrrP^xO2uPXI!nzBmkSqHK`F+0k||lyWS# zx3;iMJ2{|67D6ilN%y^b*S3@&76zkP>q zq7m;Rd4S{{Bp)F80h0HSe27FBv;%mSoGlT{5|RU|CO8IBm030c41j+|04!%? zL_8J^^R~x#QOD144}d}fErC(~BV_&SzfjA$h@c$_MPmLGWe_)R#MoVW`ZusD9s;R4 zSlcfAyvvr=w3=p1BY}Ncwe0`+C7bZy!ZzjgdMVtq*K6PBzebe-5;R%4W9WjR(6*`) zA~oc^YFz_TXUGz;WeMyWa+lURhf5Ptv349ZgFMcl-<+b#iC|oN5pXLMaRfo{#{*?Q z+uwVx$^#~UwQLG-HS8+Ao{k0fy)*@JSjqH}J(xW6|mtv=uk!8c!T-pqUN_ufq2 zZ2AVu?}Gn|PaB5uKkA$>4mv->UwjF|4bD=dGpbpK>Au;qbZvJW)Mo1TYn>W1L`|@s zcUb4v5##oA*6}gxq!ax{r@=-Nnw_R-92nqby~)EVO{cjF{u=j=tj-nD?6vS;<#p~q zH^J9DoDpvvvCb@?;EiWSXO1`dB%vCehQ9vbbrW1QaD zXfx&BABb#aT}|G9u(q+X97Qv>>`vK_^@TiQIG@GiOoL1H1JcIhBYMeB} za&I7%O#+moB|smmD@x^JNMyWiE1M)KpLjyeBI{5@62e{>vW4p9?d5(}E=N*E+exup zB%+MBmZccv#d3F0ZY4#T%f0i>Es+j{ES3g)ss*{EMR>Nsn~+&dg4njtZ055TKAXuI z48w2d;vM6CZR_o=i6wMMvwKNN|$lQ&F3a? z#CjID=8c{WRqWvhT9sdjOvtDdq>bo57Mv$pHvlw)D0s3XngV@90o`l?DNTvjg*h0RV6R z-rP6$t`AsSq_gJZ&~yiVi>$OU_yqUs?g3JRyLQ5lb1!V)*;jlY>sC@h$K zn^se3!*}M=EoI64v&%m7@1zLqIPVYcehAL5N%o8}T!1Tc4Z$-n7E5>1JdV=hE*htc zG?Iqb?_E%R;ojCxwq3M?4}zO~ICV}#J@BoKJjldnm+ZwwDdJM_b{(k8cKUjE9a73o z^KMs2WhWWz{2fv&Ao3Ow!l|4EX}hXXn(%3}QEf@@pHKaCat0VxgK1wi$@acCYHbYGMf zVRGNbj9pokePjZ$4mY=*eYbKdciz~qRYngjZfzp*(Yn7vh90rUBP7|aYPg^seD3GS zDBP(`N|pQG7sjT(xSx5Ir(>f21;a?4d*czms#P9%C#rhoR}-5S5abVkuJL{^O5>fB z(ovZYf-)&nF~VvTbdz0?T}Vl1F>#V~1E}5o=Rc?%e<;Z!xWRC@g$ds{eueT{Y*W(x z`aHJ(f`~lD_Lw!5OB;XBf%6bb>z+Kb@dmQ)V`hb7(5J%qgODYE%|pH@KVf z81Ek0Epl158_MLH@*QGz=(L%d8rwM>U0+j|a<1|rCM@C{*;?8CsMuCl$0s{CHAx%5 zV0bnqF<8A*%c+Lm>BKT>del2xsr71`@+YLwPl=F~)y(Vcx{etgKP|^&4b!tc{TjGo zwX7ENm~YCD!GVQ?%-2_LL{`b#zO zI4Y7D32igMg7uA$?yEZ9QlE@ckQavZfDA*qNP7`Nb&k@m(^){Pat;<6AlfwE9bCRU msklqIy?7VIGd!pdldzWAK 0: - logger.info("Only generating embeddings for new conversations to save money.") + print( + "[yellow]Only generating embeddings for new conversations to save money.[/yellow]" + ) + + import pickle + + with open("convos.pkl", "wb") as f: + pickle.dump(rows_only_in_df, f) # Generate and add embeddings to the index embeddings = generate_embeddings(rows_only_in_df.text.tolist()) rows_only_in_df["embeddings"] = embeddings final_df = pd.concat([rows_only_in_df, current_df]) - logger.info(f"Total conversations: {len(final_df)}") + print(f"[cyan]Total conversations:[/cyan] {len(final_df)}") final_df.to_csv(INDEX_PATH, sep="|", index=False) @main.command() -def search(keyword: str): +def search(keyword: str, topk: int = 5, thr: float | None = None): """ - Search a keyword within the index + Search a keyword within the index with an optional threshold argument. """ - logger.info("Searching for keyword: %s", keyword) + print(f"[cyan]Searching for:[/cyan] '{keyword}'") if os.path.exists(INDEX_PATH): df = pd.read_csv(INDEX_PATH, sep="|") df["embeddings"] = df.embeddings.apply( @@ -92,17 +93,30 @@ def search(keyword: str): ) filtered = df[df.text.str.contains(keyword)] - # Calculate top titles and their corresponding chat IDs - chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword) + if filtered.shape[0] == 0: + print( + "[yellow]No exact matches found. Performing solely embedding search.[/yellow]" + ) + filtered = df.copy() + + # Calculate top titles and their corresponding chat IDs based on the threshold + chat_ids, top_titles, top_scores = calculate_top_titles( + filtered, keyword, thr, topk + ) for i, t in enumerate(top_titles): - logger.info("%s: %s", chat_ids[i], t) - logger.info( - "ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i] + print( + f"""\ +-------------------------------------------------------------------------------- +[cyan bold]url:[/cyan bold] [green]https://chat.openai.com/c/{chat_ids[i]}[/green] +[cyan bold]score:[/cyan bold] {top_scores[i]:.2f} + +{t} +-------------------------------------------------------------------------------\ +""" ) - logger.info("--------------------------------------") else: - typer.echo("Index not found. Please build the index first.") + print("Index not found. Please build the index first.") return diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py index 003203a..7218aee 100644 --- a/gpthistory/helpers.py +++ b/gpthistory/helpers.py @@ -1,26 +1,38 @@ -import json import os -import pandas as pd +import tiktoken import numpy as np from openai import OpenAI - -client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) from dotenv import load_dotenv -import logging # Load environment variables load_dotenv() # Set up OpenAI API key +client = OpenAI( + api_key=os.environ.get("OPENAI_API_KEY"), + base_url="http://oai.hconeai.com/v1", + default_headers={ + "Helicone-Auth": f"Bearer {os.environ.get('HELICONE_API_KEY')}", + "Helicone-Property-project": "gpthistory", + }, +) + +# Load model +tokenizer = tiktoken.get_encoding("cl100k_base") +EMBEDDING_MODEL = "text-embedding-3-small" # Define the path to the index file in the user's home directory INDEX_PATH = os.path.join(os.path.expanduser("~"), ".chatsearch", "chatindex.csv") -# Set up logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) + +def count_tokens(text): + return len(tokenizer.encode(text)) + + +def get_first_n_tokens(text: str, n: int) -> str: + tokens = tokenizer.encode(text) + first_n_tokens = tokens[:n] + return tokenizer.decode(first_n_tokens) def extract_text_parts(data): @@ -48,7 +60,7 @@ def generate_query_embedding(query): """ Generate an embedding for a query using OpenAI API. """ - response = client.embeddings.create(input=[query], model="text-embedding-ada-002") + response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL) return response.data[0].embedding @@ -58,22 +70,26 @@ def generate_embeddings(conversations): """ embeddings = [] for i, batch in enumerate(split_into_batches(conversations, 100)): - logger.info(f"Generating Embeddings for batch: {i + 1}") - response = client.embeddings.create(input=batch, model="text-embedding-ada-002") - tmp_embedding = [row["embedding"] for row in response.data] + # Suppressing logging of individual batch processing for OpenAI requests + for i, text in enumerate(batch): + if count_tokens(text) > 8000: + batch[i] = get_first_n_tokens(text, 8000) + response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL) + tmp_embedding = [r.embedding for r in response.data] embeddings += tmp_embedding if len(embeddings) > 0: - logger.info("Conversations (Chunks) = %d", len(conversations)) - logger.info("Embeddings = %d", len(embeddings)) + print(f"[cyan]Conversations (Chunks):[/cyan] {len(conversations)}") + print(f"[cyan]Embeddings:[/cyan] {len(embeddings)}") else: - logger.info("No new conversations detected") + print("[yellow]No new conversations detected[/yellow]") return embeddings -def calculate_top_titles(df, query, top_n=1000): +def calculate_top_titles(df, query, thr=0.8, top_n=1000): """ Calculate top titles for a given query using embeddings. """ + # Extract the embeddings from the DataFrame embedding_array = np.array(df["embeddings"].tolist()) query_embedding = generate_query_embedding(query) @@ -81,7 +97,11 @@ def calculate_top_titles(df, query, top_n=1000): dot_scores = np.dot(embedding_array, query_embedding) # Filter out titles with dot scores below the threshold - mask = dot_scores >= 0.8 + if thr is not None: + mask = dot_scores >= thr + else: + mask = np.ones_like(dot_scores, dtype=bool) + filtered_dot_scores = dot_scores[mask] filtered_titles = df.loc[mask, "text"].tolist() filtered_chat_ids = df.loc[mask, "chat_id"].tolist() diff --git a/setup.py b/setup.py index 2d91494..23a6e1d 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,14 @@ author_email="shrikar84@gmail.com", packages=find_packages(), include_package_data=True, - install_requires=["typer", "python-dotenv", "openai", "pandas", "numpy"], + install_requires=[ + "typer", + "python-dotenv", + "openai", + "pandas", + "numpy", + "rich", + ], entry_points=""" [console_scripts] gpthistory=gpthistory.gpthistory:main