sarchak · Kabilan108 · Feb 26, 2024 · Feb 26, 2024 · Feb 26, 2024 · Feb 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,2 @@
-dist/
-build/
-*.pyc
+.env
+__pycache__/
diff --git a/gpthistory.egg-info/PKG-INFO b/gpthistory.egg-info/PKG-INFO
@@ -5,3 +5,9 @@ Summary: A tool for searching through your chatgpt conversation history
 Author: Shrikar Archak
 Author-email: shrikar84@gmail.com
 License-File: LICENSE.md
+Requires-Dist: typer
+Requires-Dist: python-dotenv
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: loguru
diff --git a/gpthistory.egg-info/requires.txt b/gpthistory.egg-info/requires.txt
@@ -1,5 +1,6 @@
-Click
+typer
 python-dotenv
 openai
 pandas
 numpy
+loguru
diff --git a/gpthistory/gpthistory.py b/gpthistory/gpthistory.py
@@ -1,107 +1,124 @@
-import click
+import typer
 import json
 import os
 import pandas as pd
-import logging
-from gpthistory.helpers import extract_text_parts, generate_embeddings, calculate_top_titles
+from rich import print
+from gpthistory.helpers import (
+    extract_text_parts,
+    generate_embeddings,
+    calculate_top_titles,
+)
 
-# Define the path to the index file in the user's home directory
-INDEX_PATH = os.path.join(os.path.expanduser('~'), '.gpthistory', 'chatindex.csv')
+main = typer.Typer()
 
-# Configure the logger
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
+# Define the path to the index file in the user's home directory
+INDEX_PATH = os.path.join(os.path.expanduser("~"), ".gpthistory", "chatindex.csv")
 
-@click.group()
-def main():
-    """
-    Simple CLI for searching within a chat data
-    """
-    pass
 
 @main.command()
-@click.option('--file', type=click.Path(exists=True), help='Input file')
-def build_index(file):
+def build_index(file: typer.FileText):
     """
     Build an index from a given chat data file
     """
-    # TODO: Implement index building
-    # Write the index to the predefined path
     # Make sure the directory exists
     os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
-    
+
     # Load the chat data from the given file
-    with open(file) as f:
-        data = json.load(f)
-
+    data = json.load(file)
+
     chat_ids = []
     section_ids = []
     texts = []
     for entry in data:
-        for k, v in entry['mapping'].items():
+        for k, v in entry["mapping"].items():
             text_data = extract_text_parts(v)
-            if len(text_data) > 0 and text_data[0] != '':
+            # Only the first text part of a node is indexed; empty ones are skipped
+            if len(text_data) > 0 and text_data[0] != "":
                 # Add relevant chat information to the index
-                chat_ids.append(entry['id'])
+                chat_ids.append(entry["id"])
                 section_ids.append(k)
                 texts.append(text_data[0])
-    logger.info(f"Index built and stored at: {INDEX_PATH}")
-    logger.info(f"Conversations indexed: {len(chat_ids)}")
-    df = pd.DataFrame({'chat_id': chat_ids, 'section_id': section_ids, 'text': texts})
-    df = df[~df.text.isna()] 
-    df['id'] = df['chat_id']
+    # NOTE: one entry per indexed message node, not per distinct conversation
+    print(f"[cyan]Conversations indexed:[/cyan] {len(chat_ids)}")
+    df = pd.DataFrame({"chat_id": chat_ids, "section_id": section_ids, "text": texts})
+    df = df[~df.text.isna()]
+    df["id"] = df["chat_id"]
     df.set_index("id", inplace=True)
 
     # Handle incremental index updates
-    current_df = pd.DataFrame()    
+    current_df = pd.DataFrame()
     rows_only_in_df = pd.DataFrame()
     incremental = False
     if os.path.exists(INDEX_PATH):
         incremental = True
-        current_df = pd.read_csv(INDEX_PATH, sep='|')
-        current_df['id'] = current_df['chat_id']
+        current_df = pd.read_csv(INDEX_PATH, sep="|")
+        current_df["id"] = current_df["chat_id"]
         current_df.set_index("id", inplace=True)
         # Use merge with indicator=True to find rows present in one DataFrame but not the other
-        merged_df = df.merge(current_df, how='outer', indicator=True)
+        merged_df = df.merge(current_df, how="outer", indicator=True)
         # Query rows only present in df1
-        rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge')
+        rows_only_in_df = merged_df.query('_merge == "left_only"').drop(
+            columns="_merge"
+        )
     else:
         rows_only_in_df = df
-    
+
     if incremental and len(rows_only_in_df) > 0:
-        logger.info("Only generating embeddings for new conversations to save money.")
-
+        print(
+            "[yellow]Only generating embeddings for new conversations to save money.[/yellow]"
+        )
+
     # Generate and add embeddings to the index
     embeddings = generate_embeddings(rows_only_in_df.text.tolist())
-    rows_only_in_df['embeddings'] = embeddings
+    # One embedding per new row, in the same order as the text list passed above
+    rows_only_in_df["embeddings"] = embeddings
     final_df = pd.concat([rows_only_in_df, current_df])
-    logger.info(f"Total conversations: {len(final_df)}")
-    final_df.to_csv(INDEX_PATH, sep='|', index=False)
+    print(f"[cyan]Total conversations:[/cyan] {len(final_df)}")
+    # Write the index before announcing it, so the success message is never
+    # printed for a build that failed part-way (e.g. during the embedding call)
+    final_df.to_csv(INDEX_PATH, sep="|", index=False)
+    print(f"[cyan]Index built and stored at:[/cyan] {INDEX_PATH}")
+
 
 @main.command()
-@click.argument('keyword', required=True)
-def search(keyword):
+def search(keyword: str, topk: int = 5, thr: float | None = None):
     """
-    Search a keyword within the index
+    Search a keyword within the index with an optional threshold argument.
     """
-    # TODO: Implement search function
-    # Load the index from the predefined path
-    logger.info("Searching for keyword: %s", keyword)
+    print(f"[cyan]Searching for:[/cyan] '{keyword}'")
     if os.path.exists(INDEX_PATH):
-        df = pd.read_csv(INDEX_PATH, sep='|')
-        df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)])
-        filtered = df[df.text.str.contains(keyword)]
-
-        # Calculate top titles and their corresponding chat IDs
-        chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword)
-
+        df = pd.read_csv(INDEX_PATH, sep="|")
+        df["embeddings"] = df.embeddings.apply(
+            lambda x: [float(t) for t in json.loads(x)]
+        )
+        # Literal, NaN-safe match ("c++" is not a regex; NaN text rows never match)
+        filtered = df[df.text.str.contains(keyword, regex=False, na=False)]
+
+        if filtered.shape[0] == 0:
+            print(
+                "[yellow]No exact matches found. Performing solely embedding search.[/yellow]"
+            )
+            filtered = df.copy()
+
+        # Calculate top titles and their corresponding chat IDs based on the threshold
+        chat_ids, top_titles, top_scores = calculate_top_titles(
+            filtered, keyword, thr, topk
+        )
+
         for i, t in enumerate(top_titles):
-            logger.info("%s: %s", chat_ids[i], t)
-            logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i])
-            logger.info("--------------------------------------")
+            print(
+                f"""\
+--------------------------------------------------------------------------------
+[cyan bold]url:[/cyan bold] [green]https://chat.openai.com/c/{chat_ids[i]}[/green]
+[cyan bold]score:[/cyan bold] {top_scores[i]:.2f}
+
+{t}
+-------------------------------------------------------------------------------\
+"""
+            )
     else:
-        click.echo("Index not found. Please build the index first.")
-        return
+        print("Index not found. Please build the index first.")
 
+
 if __name__ == "__main__":
     main()
diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py
@@ -1,88 +1,110 @@
-import json
 import os
-import pandas as pd
+import tiktoken
 import numpy as np
-import openai
+from openai import OpenAI
 from dotenv import load_dotenv
-import logging
 
 # Load environment variables
 load_dotenv()
 
 # Set up OpenAI API key
-openai.api_key = os.environ.get('OPENAI_API_KEY')
+client = OpenAI(
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="https://oai.hconeai.com/v1",  # TLS: API keys must not travel in cleartext
+    default_headers={
+        "Helicone-Auth": f"Bearer {os.environ.get('HELICONE_API_KEY')}",
+        "Helicone-Property-project": "gpthistory",
+    },
+)
+
+# Load model
+tokenizer = tiktoken.get_encoding("cl100k_base")
+EMBEDDING_MODEL = "text-embedding-3-small"
 
 # Define the path to the index file in the user's home directory
-INDEX_PATH = os.path.join(os.path.expanduser('~'), '.chatsearch', 'chatindex.csv')
+INDEX_PATH = os.path.join(os.path.expanduser("~"), ".chatsearch", "chatindex.csv")
+
+
+def count_tokens(text):
+    """Return how many tokens the module-level tokenizer yields for *text*."""
+    return len(tokenizer.encode(text))
+
+
+def get_first_n_tokens(text: str, n: int) -> str:
+    """Return *text* cut down to at most its first *n* tokens."""
+    return tokenizer.decode(tokenizer.encode(text)[:n])
 
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
 
 def extract_text_parts(data):
     """
     Extract text parts from chat data.
     """
     text_parts = []
-    message = data.get('message')
+    message = data.get("message")
     if message:
-        content = message.get('content')
-        if content and content.get('content_type') == 'text':
-            text_parts.extend(content.get('parts', []))
+        content = message.get("content")
+        if content and content.get("content_type") == "text":
+            text_parts.extend(content.get("parts", []))
     return text_parts
 
+
 def split_into_batches(array, batch_size):
     """
     Split an array into batches.
     """
     for i in range(0, len(array), batch_size):
-        yield array[i:i + batch_size]
+        yield array[i : i + batch_size]
+
 
 def generate_query_embedding(query):
     """
     Generate an embedding for a query using OpenAI API.
     """
-    response = openai.Embedding.create(
-        input=[query],
-        model="text-embedding-ada-002"
-    )
-    return response['data'][0]['embedding']
+    response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL)
+    return response.data[0].embedding
+
 
 def generate_embeddings(conversations):
     """
     Generate embeddings for conversations using OpenAI API.
     """
+    from rich import print  # builtin print shows the [cyan] tags verbatim
     embeddings = []
-    for i, batch in enumerate(split_into_batches(conversations, 100)):
-        logger.info(f"Generating Embeddings for batch: {i + 1}")
-        response = openai.Embedding.create(
-            input=batch,
-            model="text-embedding-ada-002"
-        )
-        tmp_embedding = [row['embedding'] for row in response['data']]
-        embeddings += tmp_embedding
+    for batch in split_into_batches(conversations, 100):
+        # Clip texts longer than 8000 tokens before sending the batch
+        for idx, text in enumerate(batch):
+            if count_tokens(text) > 8000:
+                batch[idx] = get_first_n_tokens(text, 8000)
+        response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
+        embeddings += [r.embedding for r in response.data]
     if len(embeddings) > 0:
-        logger.info("Conversations (Chunks) = %d", len(conversations))
-        logger.info("Embeddings = %d", len(embeddings))
+        print(f"[cyan]Conversations (Chunks):[/cyan] {len(conversations)}")
+        print(f"[cyan]Embeddings:[/cyan] {len(embeddings)}")
     else:
-        logger.info("No new conversations detected")
+        print("[yellow]No new conversations detected[/yellow]")
     return embeddings
 
-def calculate_top_titles(df, query, top_n=1000):
+
+def calculate_top_titles(df, query, thr=0.8, top_n=1000):
     """
     Calculate top titles for a given query using embeddings.
     """
+
     # Extract the embeddings from the DataFrame
-    embedding_array = np.array(df['embeddings'].tolist())
+    embedding_array = np.array(df["embeddings"].tolist())
     query_embedding = generate_query_embedding(query)
     # Calculate the dot product between the query embedding and all embeddings in the DataFrame
     dot_scores = np.dot(embedding_array, query_embedding)
 
     # Filter out titles with dot scores below the threshold
-    mask = dot_scores >= 0.8
+    if thr is not None:
+        mask = dot_scores >= thr
+    else:
+        mask = np.ones_like(dot_scores, dtype=bool)
+
     filtered_dot_scores = dot_scores[mask]
-    filtered_titles = df.loc[mask, 'text'].tolist()
-    filtered_chat_ids = df.loc[mask, 'chat_id'].tolist()
+    filtered_titles = df.loc[mask, "text"].tolist()
+    filtered_chat_ids = df.loc[mask, "chat_id"].tolist()
 
     # Sort the filtered titles based on the dot scores (in descending order)
     sorted_indices = np.argsort(filtered_dot_scores)[::-1][:top_n]

diff --git a/setup.py b/setup.py
@@ -1,22 +1,24 @@
 from setuptools import setup, find_packages
 
 setup(
-    name='gpthistory',
-    version='0.3',
-    description='A tool for searching through your chatgpt conversation history',
-    author='Shrikar Archak',
-    author_email='shrikar84@gmail.com',
+    name="gpthistory",
+    version="0.3",
+    description="A tool for searching through your chatgpt conversation history",
+    author="Shrikar Archak",
+    author_email="shrikar84@gmail.com",
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        'Click',
-        'python-dotenv',
-        'openai',
-        'pandas',
-        'numpy'
+        "typer",
+        "python-dotenv",
+        "openai",
+        "pandas",
+        "numpy",
+        "rich",
+        "tiktoken",  # imported by gpthistory/helpers.py
     ],
-    entry_points='''
+    entry_points="""
         [console_scripts]
         gpthistory=gpthistory.gpthistory:main
-    ''',
+    """,
 )