Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
dist/
build/
*.pyc
.env
__pycache__/
6 changes: 6 additions & 0 deletions gpthistory.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ Summary: A tool for searching through your chatgpt conversation history
Author: Shrikar Archak
Author-email: shrikar84@gmail.com
License-File: LICENSE.md
Requires-Dist: typer
Requires-Dist: python-dotenv
Requires-Dist: openai
Requires-Dist: pandas
Requires-Dist: numpy
Requires-Dist: loguru
3 changes: 2 additions & 1 deletion gpthistory.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Click
typer
python-dotenv
openai
pandas
numpy
loguru
133 changes: 75 additions & 58 deletions gpthistory/gpthistory.py
Original file line number Diff line number Diff line change
@@ -1,107 +1,124 @@
import click
import typer
import json
import os
import pandas as pd
import logging
from gpthistory.helpers import extract_text_parts, generate_embeddings, calculate_top_titles
from rich import print
from gpthistory.helpers import (
extract_text_parts,
generate_embeddings,
calculate_top_titles,
)

# Typer application object. Sub-commands register themselves through
# ``@main.command()`` and the script entry point calls ``main()``.
# Only this definition of ``main`` is kept: the command functions declare
# typer-style typed parameters, which a click group would not populate.
main = typer.Typer()

# Define the path to the index file in the user's home directory
INDEX_PATH = os.path.join(os.path.expanduser("~"), ".gpthistory", "chatindex.csv")

@main.command()
def build_index(file: typer.FileText):
    """
    Build an index from a given chat data file.

    Reads the exported conversations (JSON) from ``file``, keeps the first
    text part of every entry in each conversation's ``mapping``, generates
    embeddings for the rows that are not already present in the index at
    ``INDEX_PATH`` and writes the combined index back as a ``|``-separated
    CSV.
    """
    # Make sure the directory that will hold the index exists
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)

    # Load the chat data from the already opened file handle
    data = json.load(file)

    chat_ids = []
    section_ids = []
    texts = []
    for entry in data:
        for section_id, node in entry["mapping"].items():
            text_data = extract_text_parts(node)
            if text_data and text_data[0] != "":
                # Add relevant chat information to the index
                chat_ids.append(entry["id"])
                section_ids.append(section_id)
                texts.append(text_data[0])
    print(f"[cyan]Conversations indexed:[/cyan] {len(chat_ids)}")

    df = pd.DataFrame({"chat_id": chat_ids, "section_id": section_ids, "text": texts})
    # Copy so the column assignments below never act on a view of a filtered frame
    df = df[~df.text.isna()].copy()
    df["id"] = df["chat_id"]
    df.set_index("id", inplace=True)

    # Handle incremental index updates
    current_df = pd.DataFrame()
    incremental = os.path.exists(INDEX_PATH)
    if incremental:
        current_df = pd.read_csv(INDEX_PATH, sep="|")
        current_df["id"] = current_df["chat_id"]
        current_df.set_index("id", inplace=True)
        # Use merge with indicator=True to find rows present in one DataFrame
        # but not the other, then keep only the rows that are new in ``df``
        merged_df = df.merge(current_df, how="outer", indicator=True)
        rows_only_in_df = merged_df.query('_merge == "left_only"').drop(
            columns="_merge"
        )
    else:
        rows_only_in_df = df

    if incremental and len(rows_only_in_df) > 0:
        print(
            "[yellow]Only generating embeddings for new conversations to save money.[/yellow]"
        )

    # Generate and add embeddings to the index
    embeddings = generate_embeddings(rows_only_in_df.text.tolist())
    rows_only_in_df["embeddings"] = embeddings
    final_df = pd.concat([rows_only_in_df, current_df])
    print(f"[cyan]Total conversations:[/cyan] {len(final_df)}")
    final_df.to_csv(INDEX_PATH, sep="|", index=False)
    # Report success only once the index file has actually been written
    print(f"[cyan]Index built and stored at:[/cyan] {INDEX_PATH}")


@main.command()
def search(keyword: str, topk: int = 5, thr: float | None = None):
    """
    Search a keyword within the index with an optional threshold argument.

    Rows whose text contains ``keyword`` as a literal, case-sensitive
    substring are ranked by ``calculate_top_titles``; when no row matches,
    the whole index is ranked instead. ``topk`` is forwarded as the maximum
    number of results and ``thr`` as the score threshold (``None`` is passed
    through unchanged).
    """
    print(f"[cyan]Searching for:[/cyan] '{keyword}'")
    if not os.path.exists(INDEX_PATH):
        print("Index not found. Please build the index first.")
        return

    # Load the index from the predefined path
    df = pd.read_csv(INDEX_PATH, sep="|")
    # Embeddings are stored as JSON-style lists in the CSV; parse them back
    df["embeddings"] = df.embeddings.apply(
        lambda x: [float(t) for t in json.loads(x)]
    )

    # regex=False: the keyword is user text, so characters such as "+" or "("
    # must not be interpreted as a pattern. na=False: rows without text are
    # treated as non-matching instead of producing an invalid mask.
    filtered = df[df.text.str.contains(keyword, regex=False, na=False)]
    if filtered.shape[0] == 0:
        print(
            "[yellow]No exact matches found. Performing solely embedding search.[/yellow]"
        )
        filtered = df.copy()

    # Calculate top titles and their corresponding chat IDs based on the threshold
    chat_ids, top_titles, top_scores = calculate_top_titles(
        filtered, keyword, thr, topk
    )

    for chat_id, title, score in zip(chat_ids, top_titles, top_scores):
        # The literal is kept flush-left so no indentation leaks into the output
        print(
            f"""\
--------------------------------------------------------------------------------
[cyan bold]url:[/cyan bold] [green]https://chat.openai.com/c/{chat_id}[/green]
[cyan bold]score:[/cyan bold] {score:.2f}

{title}
-------------------------------------------------------------------------------\
"""
        )


if __name__ == "__main__":
    # Start the CLI only when this module is executed as a script
    main()
88 changes: 55 additions & 33 deletions gpthistory/helpers.py
Original file line number Diff line number Diff line change
@@ -1,88 +1,110 @@
import json
import os
import pandas as pd
import tiktoken
import numpy as np
import openai
from openai import OpenAI
from dotenv import load_dotenv
import logging

# Load environment variables
load_dotenv()

# Set up OpenAI API key
openai.api_key = os.environ.get('OPENAI_API_KEY')
# OpenAI client whose requests are routed through the Helicone proxy.
# The endpoint must be HTTPS: every request carries the OpenAI API key and
# the Helicone bearer token, which plain HTTP would expose in transit.
# NOTE(review): if HELICONE_API_KEY is unset the header value becomes
# "Bearer None" -- confirm whether a direct (non-proxied) fallback is wanted.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://oai.hconeai.com/v1",
    default_headers={
        "Helicone-Auth": f"Bearer {os.environ.get('HELICONE_API_KEY')}",
        "Helicone-Property-project": "gpthistory",
    },
)

# Tokenizer used by count_tokens / get_first_n_tokens to measure and cut text
tokenizer = tiktoken.get_encoding("cl100k_base")
# Embedding model name shared by the query and the conversation embedding calls
EMBEDDING_MODEL = "text-embedding-3-small"

# Path to an index file in the user's home directory
# NOTE(review): gpthistory/gpthistory.py defines its own INDEX_PATH under
# ".gpthistory"; nothing visible here reads this ".chatsearch" path -- confirm
# whether it is still needed.
INDEX_PATH = os.path.join(os.path.expanduser("~"), ".chatsearch", "chatindex.csv")


def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for ``text``."""
    encoded = tokenizer.encode(text)
    return len(encoded)


def get_first_n_tokens(text: str, n: int) -> str:
    """Return ``text`` cut to its first ``n`` tokens, decoded back to a string."""
    # Slice in token space so the cut follows the tokenizer's own boundaries
    return tokenizer.decode(tokenizer.encode(text)[:n])

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def extract_text_parts(data):
    """
    Extract text parts from chat data.

    Args:
        data: A mapping that may hold a ``"message"`` entry whose
            ``"content"`` has a ``"content_type"`` and a ``"parts"`` list.

    Returns:
        A new list with the message's ``parts`` when the content type is
        ``"text"``; an empty list when the message or content is missing,
        the content type differs, or ``parts`` is absent or null.
    """
    message = data.get("message")
    if not message:
        return []

    content = message.get("content")
    if not content or content.get("content_type") != "text":
        return []

    # ``parts`` can be present but null; treat that the same as "no parts"
    # instead of failing while building the list.
    return list(content.get("parts") or [])


def split_into_batches(array, batch_size):
    """
    Split an array into batches.

    Args:
        array: A sliceable sequence (list, tuple, string, ...).
        batch_size: Maximum number of items per batch; must be positive.

    Yields:
        Consecutive slices of ``array`` holding ``batch_size`` items each;
        the last slice may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised when iteration
            starts, since this is a generator).
    """
    # A non-positive size would either fail inside range() or silently yield
    # nothing, dropping every item; reject it explicitly.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(array), batch_size):
        yield array[start : start + batch_size]


def generate_query_embedding(query):
    """
    Generate an embedding for a query using OpenAI API.

    Sends ``query`` as a single-item input to the embeddings endpoint of the
    module-level ``client`` with ``EMBEDDING_MODEL`` and returns the
    embedding of the first (only) result.
    """
    response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL)
    return response.data[0].embedding


def generate_embeddings(conversations):
    """
    Generate embeddings for conversations using OpenAI API.

    The texts are sent in batches of 100. Any text longer than 8000 tokens is
    cut to its first 8000 tokens before being sent.

    Args:
        conversations: Sequence of text chunks to embed.

    Returns:
        A list with one embedding per input text, in input order; an empty
        list when ``conversations`` is empty.
    """
    max_tokens = 8000  # per-text cap applied before a text is sent for embedding
    embeddings = []
    for batch in split_into_batches(conversations, 100):
        # Suppressing logging of individual batch processing for OpenAI requests
        # Build a new list rather than assigning into the batch, so the
        # truncation also works when ``conversations`` is not a list.
        trimmed = [
            get_first_n_tokens(text, max_tokens)
            if count_tokens(text) > max_tokens
            else text
            for text in batch
        ]
        response = client.embeddings.create(input=trimmed, model=EMBEDDING_MODEL)
        embeddings += [r.embedding for r in response.data]

    # NOTE(review): the [cyan]/[yellow] tags are rich markup; the imports
    # visible for this module do not show rich's print -- confirm.
    if len(embeddings) > 0:
        print(f"[cyan]Conversations (Chunks):[/cyan] {len(conversations)}")
        print(f"[cyan]Embeddings:[/cyan] {len(embeddings)}")
    else:
        print("[yellow]No new conversations detected[/yellow]")
    return embeddings

def calculate_top_titles(df, query, top_n=1000):

def calculate_top_titles(df, query, thr=0.8, top_n=1000):
"""
Calculate top titles for a given query using embeddings.
"""

# Extract the embeddings from the DataFrame
embedding_array = np.array(df['embeddings'].tolist())
embedding_array = np.array(df["embeddings"].tolist())
query_embedding = generate_query_embedding(query)
# Calculate the dot product between the query embedding and all embeddings in the DataFrame
dot_scores = np.dot(embedding_array, query_embedding)

# Filter out titles with dot scores below the threshold
mask = dot_scores >= 0.8
if thr is not None:
mask = dot_scores >= thr
else:
mask = np.ones_like(dot_scores, dtype=bool)

filtered_dot_scores = dot_scores[mask]
filtered_titles = df.loc[mask, 'text'].tolist()
filtered_chat_ids = df.loc[mask, 'chat_id'].tolist()
filtered_titles = df.loc[mask, "text"].tolist()
filtered_chat_ids = df.loc[mask, "chat_id"].tolist()

# Sort the filtered titles based on the dot scores (in descending order)
sorted_indices = np.argsort(filtered_dot_scores)[::-1][:top_n]
Expand Down
25 changes: 13 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
from setuptools import setup, find_packages

setup(
    name="gpthistory",
    version="0.3",
    description="A tool for searching through your chatgpt conversation history",
    author="Shrikar Archak",
    author_email="shrikar84@gmail.com",
    packages=find_packages(),
    include_package_data=True,
    # Runtime dependencies: every third-party package imported by the
    # gpthistory modules has to be listed here.
    install_requires=[
        "typer",
        "python-dotenv",
        # helpers.py uses ``from openai import OpenAI``, the 1.x client class
        "openai>=1.0.0",
        "pandas",
        "numpy",
        "rich",
        # helpers.py imports tiktoken for token counting and truncation
        "tiktoken",
    ],
    # Console command ``gpthistory`` -> ``main`` in gpthistory/gpthistory.py
    entry_points={
        "console_scripts": [
            "gpthistory=gpthistory.gpthistory:main",
        ],
    },
)