diff --git a/.gitignore b/.gitignore index ad8c20e..c5ffa9a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ dist/ build/ *.pyc +incoming/ diff --git a/README.md b/README.md index ce3ee92..464f7cb 100644 --- a/README.md +++ b/README.md @@ -31,19 +31,34 @@ We use openai embeddings to find semantic similarity. Hence before building inde export OPENAI_API_KEY='your open ai key' ## Indexing and Search -[![asciicast](https://asciinema.org/a/ht0KVofl1GZwLgP1SEHKwKzX8.svg)](https://asciinema.org/a/ht0KVofl1GZwLgP1SEHKwKzX8) + ### Indexing -The `build_index` command allows you to build an index from your chat data files. The tool extracts relevant text parts from each chat entry and stores them in the index along with their associated chat IDs and section IDs. +The `build-index` command allows you to build an index from your chat data files. The tool extracts relevant text parts from each chat entry and stores them in the index along with their associated chat IDs and section IDs. + +#### Basic Usage To build an index, run: ```bash -gpthistory build_index --file /path/to/conversations.json +gpthistory build-index --file /path/to/conversations.json ``` Replace `/path/to/conversations.json` with the path to your chat data file in JSON format. +#### Rate Limiting + +You can optionally add a rate-limiting mechanism to control the frequency of API calls. The `--rate-limit` option lets you specify the sleep time in seconds between API calls, which is useful to prevent hitting rate limits on the OpenAI API. + +Here's how to use it: + +```bash +gpthistory build-index --rate-limit 0.002 --file /path/to/conversations.json +``` + +Replace `0.002` with the desired sleep time in seconds. + + ### Searching Once you have built the index, you can perform searches using the `search` command. The tool takes a keyword as input and returns the top matching conversations from the index and also the conversation history link so that you can directly go to that link. @@ -62,7 +77,7 @@ The search algorithm uses embeddings to efficiently match the keyword against th ```bash # Build the index from conversations.json -gpthistory build_index --file conversations.json +gpthistory build-index --file conversations.json # Search for conversations related to "chatbot" gpthistory search "chatbot" diff --git a/gpthistory/gpthistory.py b/gpthistory/gpthistory.py index 8adc06e..5558a33 100644 --- a/gpthistory/gpthistory.py +++ b/gpthistory/gpthistory.py @@ -20,8 +20,11 @@ def main(): pass @main.command() +# Add click option for rate-limiting +@click.option('--rate-limit', type=float, default=0, help='Rate limit for OpenAI API in seconds') @click.option('--file', type=click.Path(exists=True), help='Input file') -def build_index(file): +# define the build_index function +def build_index(rate_limit, file): """ Build an index from a given chat data file """ @@ -29,11 +32,11 @@ def build_index(file): # Write the index to the predefined path # Make sure the directory exists os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True) - + # Load the chat data from the given file with open(file) as f: data = json.load(f) - + chat_ids = [] section_ids = [] texts = [] @@ -67,16 +70,22 @@ def build_index(file): rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge') else: rows_only_in_df = df - + if incremental and len(rows_only_in_df) > 0: logger.info("Only generating embeddings for new conversations to save money.") - + # Generate and add embeddings to the index - embeddings = generate_embeddings(rows_only_in_df.text.tolist()) + embeddings = generate_embeddings(rows_only_in_df.text.tolist(), rate_limit) + # ensure that the generate_embeddings function returns an array of the same length as the input and log warning if not + if len(embeddings) != len(rows_only_in_df): + logger.warning("Number of embeddings generated does not match the number of conversations.") rows_only_in_df['embeddings'] = embeddings - final_df = pd.concat([rows_only_in_df, current_df]) - logger.info(f"Total conversations: {len(final_df)}") - final_df.to_csv(INDEX_PATH, sep='|', index=False) + rows_only_in_df.to_csv(INDEX_PATH, sep='|', mode='a', header=not incremental, index=False) + # Log to inform the user about the number of conversations indexed + logger.info(f"Conversations indexed: {len(rows_only_in_df)}") + + + @main.command() @click.argument('keyword', required=True) @@ -91,10 +100,10 @@ def search(keyword): df = pd.read_csv(INDEX_PATH, sep='|') df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)]) filtered = df[df.text.str.contains(keyword)] - + # Calculate top titles and their corresponding chat IDs chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword) - + for i, t in enumerate(top_titles): logger.info("%s: %s", chat_ids[i], t) logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i]) diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py index 56bf08f..3569374 100644 --- a/gpthistory/helpers.py +++ b/gpthistory/helpers.py @@ -1,5 +1,6 @@ import json import os +import time import pandas as pd import numpy as np import openai @@ -38,36 +39,54 @@ def split_into_batches(array, batch_size): for i in range(0, len(array), batch_size): yield array[i:i + batch_size] -def generate_query_embedding(query): +def generate_query_embedding(query, rate_limit=0): """ Generate an embedding for a query using OpenAI API. """ + + import time + if rate_limit > 0: + time.sleep(rate_limit) response = openai.Embedding.create( input=[query], model="text-embedding-ada-002" ) return response['data'][0]['embedding'] -def generate_embeddings(conversations): +def generate_embeddings(conversations, rate_limit=0): """ Generate embeddings for conversations using OpenAI API. """ embeddings = [] for i, batch in enumerate(split_into_batches(conversations, 100)): logger.info(f"Generating Embeddings for batch: {i + 1}") + + if rate_limit > 0: + time.sleep(rate_limit) + # log the time between API calls to verify if your rate-limiting code is functioning as expected + logger.info(f"Rate Limit: time between API calls: {rate_limit} seconds") + + + response = openai.Embedding.create( - input=batch, - model="text-embedding-ada-002" - ) + input=batch, + model="text-embedding-ada-002" + ) + tmp_embedding = [row['embedding'] for row in response['data']] embeddings += tmp_embedding + + # Debugging: print the number of embeddings generated and the number of conversations if len(embeddings) > 0: logger.info("Conversations (Chunks) = %d", len(conversations)) logger.info("Embeddings = %d", len(embeddings)) else: logger.info("No new conversations detected") + return embeddings + + def calculate_top_titles(df, query, top_n=1000): """ Calculate top titles for a given query using embeddings.