Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
dist/
build/
*.pyc
incoming/
23 changes: 19 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,34 @@ We use openai embeddings to find semantic similarity. Hence before building inde
export OPENAI_API_KEY='your open ai key'

## Indexing and Search
[![asciicast](https://asciinema.org/a/ht0KVofl1GZwLgP1SEHKwKzX8.svg)](https://asciinema.org/a/ht0KVofl1GZwLgP1SEHKwKzX8)

### Indexing

The `build_index` command allows you to build an index from your chat data files. The tool extracts relevant text parts from each chat entry and stores them in the index along with their associated chat IDs and section IDs.
The `build-index` command allows you to build an index from your chat data files. The tool extracts relevant text parts from each chat entry and stores them in the index along with their associated chat IDs and section IDs.

#### Basic Usage

To build an index, run:

```bash
gpthistory build_index --file /path/to/conversations.json
gpthistory build-index --file /path/to/conversations.json
```

Replace `/path/to/conversations.json` with the path to your chat data file in JSON format.

#### Rate Limiting

You can optionally throttle requests to the OpenAI API. The `--rate-limit` option sets the sleep time, in seconds, between API calls, which helps you avoid hitting the OpenAI API rate limits. The default of `0` disables the delay.

Here's how to use it:

```bash
gpthistory build-index --rate-limit 0.002 --file /path/to/conversations.json
```

Replace `0.002` with the desired sleep time in seconds.


### Searching

Once you have built the index, you can perform searches using the `search` command. The tool takes a keyword as input and returns the top matching conversations from the index, together with a link to each conversation so that you can open it directly.
Expand All @@ -62,7 +77,7 @@ The search algorithm uses embeddings to efficiently match the keyword against th

```bash
# Build the index from conversations.json
gpthistory build_index --file conversations.json
gpthistory build-index --file conversations.json

# Search for conversations related to "chatbot"
gpthistory search "chatbot"
Expand Down
31 changes: 20 additions & 11 deletions gpthistory/gpthistory.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,23 @@ def main():
pass

@main.command()
# Add click option for rate-limiting
@click.option('--rate-limit', type=float, default=0, help='Rate limit for OpenAI API in seconds')
@click.option('--file', type=click.Path(exists=True), help='Input file')
def build_index(file):
# define the build_index function
def build_index(rate_limit, file):
"""
Build an index from a given chat data file
"""
# TODO: Implement index building
# Write the index to the predefined path
# Make sure the directory exists
os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)

# Load the chat data from the given file
with open(file) as f:
data = json.load(f)

chat_ids = []
section_ids = []
texts = []
Expand Down Expand Up @@ -67,16 +70,22 @@ def build_index(file):
rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge')
else:
rows_only_in_df = df

if incremental and len(rows_only_in_df) > 0:
logger.info("Only generating embeddings for new conversations to save money.")

# Generate and add embeddings to the index
embeddings = generate_embeddings(rows_only_in_df.text.tolist())
embeddings = generate_embeddings(rows_only_in_df.text.tolist(), rate_limit)
# ensure that the generate_embeddings function returns an array of the same length as the input and log warning if not
if len(embeddings) != len(rows_only_in_df):
logger.warning("Number of embeddings generated does not match the number of conversations.")
rows_only_in_df['embeddings'] = embeddings
final_df = pd.concat([rows_only_in_df, current_df])
logger.info(f"Total conversations: {len(final_df)}")
final_df.to_csv(INDEX_PATH, sep='|', index=False)
rows_only_in_df.to_csv(INDEX_PATH, sep='|', mode='a', header=not incremental, index=False)
# Log to inform the user about the number of conversations indexed
logger.info(f"Conversations indexed: {len(rows_only_in_df)}")




@main.command()
@click.argument('keyword', required=True)
Expand All @@ -91,10 +100,10 @@ def search(keyword):
df = pd.read_csv(INDEX_PATH, sep='|')
df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)])
filtered = df[df.text.str.contains(keyword)]

# Calculate top titles and their corresponding chat IDs
chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword)

for i, t in enumerate(top_titles):
logger.info("%s: %s", chat_ids[i], t)
logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i])
Expand Down
29 changes: 24 additions & 5 deletions gpthistory/helpers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import time
import pandas as pd
import numpy as np
import openai
Expand Down Expand Up @@ -38,36 +39,54 @@ def split_into_batches(array, batch_size):
for i in range(0, len(array), batch_size):
yield array[i:i + batch_size]

def generate_query_embedding(query, rate_limit=0):
    """
    Generate an embedding for a query using OpenAI API.

    Args:
        query: Text to embed; it is sent to the API as a single-element
            input list.
        rate_limit: Seconds to sleep before calling the API. Values that
            are not greater than zero (including the default ``0``) skip
            the delay.

    Returns:
        The ``embedding`` field of the first entry in the response's
        ``data`` list.
    """
    # `time` is imported at module level, so no function-local import is needed.
    if rate_limit > 0:
        time.sleep(rate_limit)

    response = openai.Embedding.create(
        input=[query],
        model="text-embedding-ada-002",
    )
    return response['data'][0]['embedding']

def generate_embeddings(conversations, rate_limit=0):
    """
    Generate embeddings for conversations using OpenAI API.

    The input is split into batches of 100 via ``split_into_batches`` and
    one API request is made per batch.

    Args:
        conversations: List of texts to embed.
        rate_limit: Seconds to sleep between consecutive API calls. Values
            that are not greater than zero (including the default ``0``)
            disable the delay.

    Returns:
        A flat list holding the ``embedding`` field of every entry in each
        response's ``data`` list, concatenated in batch order. The list is
        empty when ``conversations`` is empty.
    """
    embeddings = []
    for i, batch in enumerate(split_into_batches(conversations, 100)):
        logger.info("Generating Embeddings for batch: %d", i + 1)

        # Throttle only *between* requests: the first batch has no preceding
        # call to space out, so sleeping before it would just waste time.
        if rate_limit > 0 and i > 0:
            time.sleep(rate_limit)
            # Log the configured delay so the user can verify throttling is active.
            logger.info("Rate Limit: time between API calls: %s seconds", rate_limit)

        response = openai.Embedding.create(
            input=batch,
            model="text-embedding-ada-002",
        )
        embeddings.extend(row['embedding'] for row in response['data'])

    # Report how many embeddings came back relative to the number of inputs.
    if embeddings:
        logger.info("Conversations (Chunks) = %d", len(conversations))
        logger.info("Embeddings = %d", len(embeddings))
    else:
        logger.info("No new conversations detected")

    return embeddings



def calculate_top_titles(df, query, top_n=1000):
"""
Calculate top titles for a given query using embeddings.
Expand Down