diff --git a/.gitignore b/.gitignore index ad8c20e..d50a09f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ -dist/ -build/ -*.pyc +.env +__pycache__/ diff --git a/gpthistory.egg-info/PKG-INFO b/gpthistory.egg-info/PKG-INFO index 5cbb48a..af91d66 100644 --- a/gpthistory.egg-info/PKG-INFO +++ b/gpthistory.egg-info/PKG-INFO @@ -5,3 +5,9 @@ Summary: A tool for searching through your chatgpt conversation history Author: Shrikar Archak Author-email: shrikar84@gmail.com License-File: LICENSE.md +Requires-Dist: typer +Requires-Dist: python-dotenv +Requires-Dist: openai +Requires-Dist: pandas +Requires-Dist: numpy +Requires-Dist: loguru diff --git a/gpthistory.egg-info/requires.txt b/gpthistory.egg-info/requires.txt index dbd3260..2f2931b 100644 --- a/gpthistory.egg-info/requires.txt +++ b/gpthistory.egg-info/requires.txt @@ -1,5 +1,6 @@ -Click +typer python-dotenv openai pandas numpy +loguru diff --git a/gpthistory/gpthistory.py b/gpthistory/gpthistory.py index 8adc06e..20fd4d4 100644 --- a/gpthistory/gpthistory.py +++ b/gpthistory/gpthistory.py @@ -1,107 +1,124 @@ -import click +import typer import json import os import pandas as pd -import logging -from gpthistory.helpers import extract_text_parts, generate_embeddings, calculate_top_titles +from rich import print +from gpthistory.helpers import ( + extract_text_parts, + generate_embeddings, + calculate_top_titles, +) -# Define the path to the index file in the user's home directory -INDEX_PATH = os.path.join(os.path.expanduser('~'), '.gpthistory', 'chatindex.csv') +main = typer.Typer() -# Configure the logger -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) +# Define the path to the index file in the user's home directory +INDEX_PATH = os.path.join(os.path.expanduser("~"), ".gpthistory", "chatindex.csv") -@click.group() -def main(): - """ - Simple CLI for searching within a chat data - """ - pass @main.command() -@click.option('--file', type=click.Path(exists=True), help='Input file') -def build_index(file): +def build_index(file: typer.FileText): """ - Build an index from a given chat data file + Build an index from a given chat data file xxx """ - # TODO: Implement index building - # Write the index to the predefined path # Make sure the directory exists os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True) - + # Load the chat data from the given file - with open(file) as f: - data = json.load(f) - + data = json.load(file) + chat_ids = [] section_ids = [] texts = [] for entry in data: - for k, v in entry['mapping'].items(): + for k, v in entry["mapping"].items(): text_data = extract_text_parts(v) - if len(text_data) > 0 and text_data[0] != '': + if len(text_data) > 0 and text_data[0] != "": # Add relevant chat information to the index - chat_ids.append(entry['id']) + chat_ids.append(entry["id"]) section_ids.append(k) texts.append(text_data[0]) - logger.info(f"Index built and stored at: {INDEX_PATH}") - logger.info(f"Conversations indexed: {len(chat_ids)}") - df = pd.DataFrame({'chat_id': chat_ids, 'section_id': section_ids, 'text': texts}) - df = df[~df.text.isna()] - df['id'] = df['chat_id'] + print(f"[cyan]Index built and stored at:[/cyan] {INDEX_PATH}") + print(f"[cyan]Conversations indexed:[/cyan] {len(chat_ids)}") + df = pd.DataFrame({"chat_id": chat_ids, "section_id": section_ids, "text": texts}) + df = df[~df.text.isna()] + df["id"] = df["chat_id"] df.set_index("id", inplace=True) # Handle incremental index updates - current_df = pd.DataFrame() + current_df = pd.DataFrame() rows_only_in_df = pd.DataFrame() incremental = False if os.path.exists(INDEX_PATH): incremental = True - current_df = pd.read_csv(INDEX_PATH, sep='|') - current_df['id'] = current_df['chat_id'] + current_df = pd.read_csv(INDEX_PATH, sep="|") + current_df["id"] = current_df["chat_id"] current_df.set_index("id", inplace=True) # Use merge with indicator=True to find rows present in one DataFrame but not the other - merged_df = df.merge(current_df, how='outer', indicator=True) + merged_df = df.merge(current_df, how="outer", indicator=True) # Query rows only present in df1 - rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge') + rows_only_in_df = merged_df.query('_merge == "left_only"').drop( + columns="_merge" + ) else: rows_only_in_df = df - + if incremental and len(rows_only_in_df) > 0: - logger.info("Only generating embeddings for new conversations to save money.") - + print( + "[yellow]Only generating embeddings for new conversations to save money.[/yellow]" + ) + + import pickle + + with open("convos.pkl", "wb") as f: + pickle.dump(rows_only_in_df, f) + # Generate and add embeddings to the index embeddings = generate_embeddings(rows_only_in_df.text.tolist()) - rows_only_in_df['embeddings'] = embeddings + rows_only_in_df["embeddings"] = embeddings final_df = pd.concat([rows_only_in_df, current_df]) - logger.info(f"Total conversations: {len(final_df)}") - final_df.to_csv(INDEX_PATH, sep='|', index=False) + print(f"[cyan]Total conversations:[/cyan] {len(final_df)}") + final_df.to_csv(INDEX_PATH, sep="|", index=False) + @main.command() -@click.argument('keyword', required=True) -def search(keyword): +def search(keyword: str, topk: int = 5, thr: float | None = None): """ - Search a keyword within the index + Search a keyword within the index with an optional threshold argument. """ - # TODO: Implement search function - # Load the index from the predefined path - logger.info("Searching for keyword: %s", keyword) + print(f"[cyan]Searching for:[/cyan] '{keyword}'") if os.path.exists(INDEX_PATH): - df = pd.read_csv(INDEX_PATH, sep='|') - df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)]) + df = pd.read_csv(INDEX_PATH, sep="|") + df["embeddings"] = df.embeddings.apply( + lambda x: [float(t) for t in json.loads(x)] + ) filtered = df[df.text.str.contains(keyword)] - - # Calculate top titles and their corresponding chat IDs - chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword) - + + if filtered.shape[0] == 0: + print( + "[yellow]No exact matches found. Performing solely embedding search.[/yellow]" + ) + filtered = df.copy() + + # Calculate top titles and their corresponding chat IDs based on the threshold + chat_ids, top_titles, top_scores = calculate_top_titles( + filtered, keyword, thr, topk + ) + for i, t in enumerate(top_titles): - logger.info("%s: %s", chat_ids[i], t) - logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i]) - logger.info("--------------------------------------") + print( + f"""\ +-------------------------------------------------------------------------------- +[cyan bold]url:[/cyan bold] [green]https://chat.openai.com/c/{chat_ids[i]}[/green] +[cyan bold]score:[/cyan bold] {top_scores[i]:.2f} + +{t} +-------------------------------------------------------------------------------\ +""" + ) else: - click.echo("Index not found. Please build the index first.") + print("Index not found. Please build the index first.") return + if __name__ == "__main__": main() diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py index 56bf08f..7218aee 100644 --- a/gpthistory/helpers.py +++ b/gpthistory/helpers.py @@ -1,52 +1,68 @@ -import json import os -import pandas as pd +import tiktoken import numpy as np -import openai +from openai import OpenAI from dotenv import load_dotenv -import logging # Load environment variables load_dotenv() # Set up OpenAI API key -openai.api_key = os.environ.get('OPENAI_API_KEY') +client = OpenAI( + api_key=os.environ.get("OPENAI_API_KEY"), + base_url="http://oai.hconeai.com/v1", + default_headers={ + "Helicone-Auth": f"Bearer {os.environ.get('HELICONE_API_KEY')}", + "Helicone-Property-project": "gpthistory", + }, +) + +# Load model +tokenizer = tiktoken.get_encoding("cl100k_base") +EMBEDDING_MODEL = "text-embedding-3-small" # Define the path to the index file in the user's home directory -INDEX_PATH = os.path.join(os.path.expanduser('~'), '.chatsearch', 'chatindex.csv') +INDEX_PATH = os.path.join(os.path.expanduser("~"), ".chatsearch", "chatindex.csv") + + +def count_tokens(text): + return len(tokenizer.encode(text)) + + +def get_first_n_tokens(text: str, n: int) -> str: + tokens = tokenizer.encode(text) + first_n_tokens = tokens[:n] + return tokenizer.decode(first_n_tokens) -# Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) def extract_text_parts(data): """ Extract text parts from chat data. """ text_parts = [] - message = data.get('message') + message = data.get("message") if message: - content = message.get('content') - if content and content.get('content_type') == 'text': - text_parts.extend(content.get('parts', [])) + content = message.get("content") + if content and content.get("content_type") == "text": + text_parts.extend(content.get("parts", [])) return text_parts + def split_into_batches(array, batch_size): """ Split an array into batches. """ for i in range(0, len(array), batch_size): - yield array[i:i + batch_size] + yield array[i : i + batch_size] + def generate_query_embedding(query): """ Generate an embedding for a query using OpenAI API. """ - response = openai.Embedding.create( - input=[query], - model="text-embedding-ada-002" - ) - return response['data'][0]['embedding'] + response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL) + return response.data[0].embedding + def generate_embeddings(conversations): """ @@ -54,35 +70,41 @@ def generate_embeddings(conversations): """ embeddings = [] for i, batch in enumerate(split_into_batches(conversations, 100)): - logger.info(f"Generating Embeddings for batch: {i + 1}") - response = openai.Embedding.create( - input=batch, - model="text-embedding-ada-002" - ) - tmp_embedding = [row['embedding'] for row in response['data']] + # Suppressing logging of individual batch processing for OpenAI requests + for i, text in enumerate(batch): + if count_tokens(text) > 8000: + batch[i] = get_first_n_tokens(text, 8000) + response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL) + tmp_embedding = [r.embedding for r in response.data] embeddings += tmp_embedding if len(embeddings) > 0: - logger.info("Conversations (Chunks) = %d", len(conversations)) - logger.info("Embeddings = %d", len(embeddings)) + print(f"[cyan]Conversations (Chunks):[/cyan] {len(conversations)}") + print(f"[cyan]Embeddings:[/cyan] {len(embeddings)}") else: - logger.info("No new conversations detected") + print("[yellow]No new conversations detected[/yellow]") return embeddings -def calculate_top_titles(df, query, top_n=1000): + +def calculate_top_titles(df, query, thr=0.8, top_n=1000): """ Calculate top titles for a given query using embeddings. """ + # Extract the embeddings from the DataFrame - embedding_array = np.array(df['embeddings'].tolist()) + embedding_array = np.array(df["embeddings"].tolist()) query_embedding = generate_query_embedding(query) # Calculate the dot product between the query embedding and all embeddings in the DataFrame dot_scores = np.dot(embedding_array, query_embedding) # Filter out titles with dot scores below the threshold - mask = dot_scores >= 0.8 + if thr is not None: + mask = dot_scores >= thr + else: + mask = np.ones_like(dot_scores, dtype=bool) + filtered_dot_scores = dot_scores[mask] - filtered_titles = df.loc[mask, 'text'].tolist() - filtered_chat_ids = df.loc[mask, 'chat_id'].tolist() + filtered_titles = df.loc[mask, "text"].tolist() + filtered_chat_ids = df.loc[mask, "chat_id"].tolist() # Sort the filtered titles based on the dot scores (in descending order) sorted_indices = np.argsort(filtered_dot_scores)[::-1][:top_n] diff --git a/setup.py b/setup.py index 93a0f68..23a6e1d 100644 --- a/setup.py +++ b/setup.py @@ -1,22 +1,23 @@ from setuptools import setup, find_packages setup( - name='gpthistory', - version='0.3', - description='A tool for searching through your chatgpt conversation history', - author='Shrikar Archak', - author_email='shrikar84@gmail.com', + name="gpthistory", + version="0.3", + description="A tool for searching through your chatgpt conversation history", + author="Shrikar Archak", + author_email="shrikar84@gmail.com", packages=find_packages(), include_package_data=True, install_requires=[ - 'Click', - 'python-dotenv', - 'openai', - 'pandas', - 'numpy' + "typer", + "python-dotenv", + "openai", + "pandas", + "numpy", + "rich", ], - entry_points=''' + entry_points=""" [console_scripts] gpthistory=gpthistory.gpthistory:main - ''', + """, )