sarchak · claytondukes · Oct 4, 2023 · Oct 4, 2023
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 dist/
 build/
 *.pyc
+incoming/
diff --git a/README.md b/README.md
@@ -31,19 +31,34 @@ We use openai embeddings to find semantic similarity. Hence before building inde
 export OPENAI_API_KEY='your open ai key'
 
 ## Indexing and Search
-[![asciicast](https://asciinema.org/a/ht0KVofl1GZwLgP1SEHKwKzX8.svg)](https://asciinema.org/a/ht0KVofl1GZwLgP1SEHKwKzX8)
+
 ### Indexing
 
-The `build_index` command allows you to build an index from your chat data files. The tool extracts relevant text parts from each chat entry and stores them in the index along with their associated chat IDs and section IDs.
+The `build-index` command allows you to build an index from your chat data files. The tool extracts relevant text parts from each chat entry and stores them in the index along with their associated chat IDs and section IDs.
+
+#### Basic Usage
 
 To build an index, run:
 
 ```bash
-gpthistory build_index --file /path/to/conversations.json
+gpthistory build-index --file /path/to/conversations.json
 ```
 
 Replace `/path/to/conversations.json` with the path to your chat data file in JSON format.
 
+#### Rate Limiting
+
+Embeddings are requested from the OpenAI API in batches of 100 text chunks. The optional `--rate-limit` option makes the tool sleep for the given number of seconds before each batch request, which helps avoid hitting OpenAI API rate limits. The default is `0` (no pause).
+
+Here's how to use it:
+
+```bash
+gpthistory build-index --rate-limit 0.002 --file /path/to/conversations.json
+```
+
+Replace `0.002` with the desired sleep time in seconds.
+
+
 ### Searching
 
 Once you have built the index, you can perform searches using the `search` command. The tool takes a keyword as input and returns the top matching conversations from the index and also the conversation history link so that you can directly go to that link.
@@ -62,7 +77,7 @@ The search algorithm uses embeddings to efficiently match the keyword against th
 
 ```bash
 # Build the index from conversations.json
-gpthistory build_index --file conversations.json
+gpthistory build-index --file conversations.json
 
 # Search for conversations related to "chatbot"
 gpthistory search "chatbot"

diff --git a/gpthistory/gpthistory.py b/gpthistory/gpthistory.py
@@ -20,20 +20,23 @@ def main():
     pass
 
 @main.command()
+# Optional pause, in seconds, applied before each embedding API request (0 disables it)
+@click.option('--rate-limit', type=float, default=0, help='Rate limit for OpenAI API in seconds')
 @click.option('--file', type=click.Path(exists=True), help='Input file')
-def build_index(file):
+# click passes the options above as keyword arguments: --rate-limit -> rate_limit, --file -> file
+def build_index(rate_limit, file):
     """
     Build an index from a given chat data file
     """
     # TODO: Implement index building
     # Write the index to the predefined path
     # Make sure the directory exists
     os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
-    
+
     # Load the chat data from the given file
     with open(file) as f:
         data = json.load(f)
-    
+
     chat_ids = []
     section_ids = []
     texts = []
@@ -67,16 +70,22 @@ def build_index(file):
         rows_only_in_df = merged_df.query('_merge == "left_only"').drop(columns='_merge')
     else:
         rows_only_in_df = df
-    
+
     if incremental and len(rows_only_in_df) > 0:
         logger.info("Only generating embeddings for new conversations to save money.")
-    
+
     # Generate and add embeddings to the index
-    embeddings = generate_embeddings(rows_only_in_df.text.tolist())
+    embeddings = generate_embeddings(rows_only_in_df.text.tolist(), rate_limit)
+    # Sanity check: warn (without aborting) if the number of embeddings differs from the number of rows being indexed
+    if len(embeddings) != len(rows_only_in_df):
+        logger.warning("Number of embeddings generated does not match the number of conversations.")
     rows_only_in_df['embeddings'] = embeddings
-    final_df = pd.concat([rows_only_in_df, current_df])
-    logger.info(f"Total conversations: {len(final_df)}")
-    final_df.to_csv(INDEX_PATH, sep='|', index=False)
+    rows_only_in_df.to_csv(INDEX_PATH, sep='|', mode='a', header=not incremental, index=False)
+    # Log to inform the user about the number of conversations indexed
+    logger.info(f"Conversations indexed: {len(rows_only_in_df)}")
+
+
+
 
 @main.command()
 @click.argument('keyword', required=True)
@@ -91,10 +100,10 @@ def search(keyword):
         df = pd.read_csv(INDEX_PATH, sep='|')
         df['embeddings'] = df.embeddings.apply(lambda x: [float(t) for t in json.loads(x)])
         filtered = df[df.text.str.contains(keyword)]
-        
+
         # Calculate top titles and their corresponding chat IDs
         chat_ids, top_titles, top_scores = calculate_top_titles(df, keyword)
-        
+
         for i, t in enumerate(top_titles):
             logger.info("%s: %s", chat_ids[i], t)
             logger.info("ChatGPT Conversation link: https://chat.openai.com/c/%s", chat_ids[i])

diff --git a/gpthistory/helpers.py b/gpthistory/helpers.py
@@ -1,5 +1,6 @@
 import json
 import os
+import time
 import pandas as pd
 import numpy as np
 import openai
@@ -38,36 +39,54 @@ def split_into_batches(array, batch_size):
     for i in range(0, len(array), batch_size):
         yield array[i:i + batch_size]
 
-def generate_query_embedding(query):
+def generate_query_embedding(query, rate_limit=0):
     """
     Generate an embedding for a query using OpenAI API.
     """
+
+    import time
+    if rate_limit > 0:
+        time.sleep(rate_limit)
     response = openai.Embedding.create(
         input=[query],
         model="text-embedding-ada-002"
     )
     return response['data'][0]['embedding']
 
-def generate_embeddings(conversations):
+def generate_embeddings(conversations, rate_limit=0):
     """
     Generate embeddings for conversations using OpenAI API.
     """
     embeddings = []
     for i, batch in enumerate(split_into_batches(conversations, 100)):
         logger.info(f"Generating Embeddings for batch: {i + 1}")
+
+        if rate_limit > 0:
+            time.sleep(rate_limit)
+            # Log the configured pause (not a measured interval) so rate limiting can be verified from the output
+            logger.info(f"Rate Limit: time between API calls: {rate_limit} seconds")
+
+
+
         response = openai.Embedding.create(
-            input=batch,
-            model="text-embedding-ada-002"
-        )
+                input=batch,
+                model="text-embedding-ada-002"
+            )
+
         tmp_embedding = [row['embedding'] for row in response['data']]
         embeddings += tmp_embedding
+
+    # Log how many text chunks were submitted and how many embeddings were returned
     if len(embeddings) > 0:
         logger.info("Conversations (Chunks) = %d", len(conversations))
         logger.info("Embeddings = %d", len(embeddings))
     else:
         logger.info("No new conversations detected")
+
     return embeddings
 
+
+
 def calculate_top_titles(df, query, top_n=1000):
     """
     Calculate top titles for a given query using embeddings.