From 5daed065f67255c5f27dc3309bfaf38c54a5c090 Mon Sep 17 00:00:00 2001
From: Muteti Erick <115213946+ericx00@users.noreply.github.com>
Date: Fri, 29 Nov 2024 12:17:56 +0300
Subject: [PATCH] Update migrate_to_pinecone.py

Pinecone Initialization:

    Uses pinecone.init() and checks if the specified index exists.
    Reads the Pinecone environment (e.g., us-west1-gcp) from the new PINECONE_ENVIRONMENT environment variable.

Metadata Preservation:

    Includes each record's ChromaDB metadata in the upserted vector and stores the document text under the 'text' metadata key.

Batching Robustness:

    Slices IDs, embeddings, documents, and metadata per batch; the final batch is simply smaller when the dataset is not evenly divisible by the batch size.

Error Handling:

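    Validates that the IDs, embeddings, and documents retrieved from ChromaDB have matching lengths, and raises ValueError on a mismatch or when the configured index does not exist. The script entry point catches exceptions and prints the error message.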

Progress Feedback:

    Adds a description label to the existing tqdm progress bar for clearer progress feedback.
---
 migrate_to_pinecone.py | 56 ++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/migrate_to_pinecone.py b/migrate_to_pinecone.py
index 1c757d4..3c07f20 100644
--- a/migrate_to_pinecone.py
+++ b/migrate_to_pinecone.py
@@ -1,5 +1,5 @@
-import chromadb
-from pinecone import Pinecone
+import pinecone
+from chromadb.config import PersistentClient
 from dotenv import load_dotenv
 import os
 from tqdm import tqdm
@@ -10,43 +10,57 @@
 
 def migrate_to_pinecone():
     # Initialize Pinecone
-    pc = Pinecone(
-        api_key=os.getenv('PINECONE_API_KEY')
+    pinecone.init(
+        api_key=os.getenv('PINECONE_API_KEY'),  # Pinecone API Key
+        environment=os.getenv('PINECONE_ENVIRONMENT')  # Environment (e.g., 'us-west1-gcp')
     )
     
     # Connect to Pinecone index
     index_name = os.getenv('PINECONE_INDEX_NAME')
-    index = pc.Index(index_name)
+    if index_name not in pinecone.list_indexes():
+        raise ValueError(f"Index '{index_name}' not found in Pinecone. Please create it first.")
+    index = pinecone.Index(index_name)
     
     # Initialize ChromaDB
-    chroma_client = chromadb.PersistentClient(path="./chroma_db")
+    chroma_client = PersistentClient(path="./chroma_db")
     collection = chroma_client.get_collection("ds_knowledge_base")
     
-    # Get all documents from ChromaDB
+    # Retrieve all data from ChromaDB
     results = collection.get(
         include=['embeddings', 'documents', 'metadatas']
     )
     
-    # Prepare batches for Pinecone
+    # Validate data
+    embeddings = results['embeddings']
+    documents = results['documents']
+    metadatas = results['metadatas']
+    ids = results['ids']
+    
+    if len(ids) != len(embeddings) or len(ids) != len(documents):
+        raise ValueError("Mismatch between embeddings, documents, and IDs in ChromaDB data.")
+    
+    # Prepare and migrate in batches
     batch_size = 100
-    for i in tqdm(range(0, len(results['ids']), batch_size)):
-        batch_ids = results['ids'][i:i + batch_size]
-        batch_embeddings = results['embeddings'][i:i + batch_size]
-        batch_documents = results['documents'][i:i + batch_size]
+    for i in tqdm(range(0, len(ids), batch_size), desc="Migrating batches to Pinecone"):
+        batch_ids = ids[i:i + batch_size]
+        batch_embeddings = embeddings[i:i + batch_size]
+        batch_documents = documents[i:i + batch_size]
+        batch_metadatas = metadatas[i:i + batch_size]
         
         # Prepare vectors for Pinecone
         vectors = []
-        for id_, embedding, text in zip(batch_ids, batch_embeddings, batch_documents):
-            vectors.append({
-                'id': id_,
-                'values': embedding,
-                'metadata': {'text': text}
-            })
+        for id_, embedding, document, metadata in zip(batch_ids, batch_embeddings, batch_documents, batch_metadatas):
+            metadata = metadata or {}
+            metadata['text'] = document  # Add document text to metadata
+            vectors.append((id_, embedding, metadata))
         
-        # Upsert to Pinecone
+        # Upsert vectors to Pinecone
         index.upsert(vectors=vectors)
     
-    print(f"Migration complete! Migrated {len(results['ids'])} documents to Pinecone")
+    print(f"Migration complete! Migrated {len(ids)} documents to Pinecone.")
 
 if __name__ == "__main__":
-    migrate_to_pinecone()
\ No newline at end of file
+    try:
+        migrate_to_pinecone()
+    except Exception as e:
+        print(f"An error occurred: {e}")