From 5daed065f67255c5f27dc3309bfaf38c54a5c090 Mon Sep 17 00:00:00 2001
From: Muteti Erick <115213946+ericx00@users.noreply.github.com>
Date: Fri, 29 Nov 2024 12:17:56 +0300
Subject: [PATCH] Update migrate_to_pinecone.py

Pinecone Initialization:
Uses pinecone.init() and checks if the specified index exists.
Adds an environment variable for the Pinecone environment (e.g., us-west1-gcp).

Metadata Preservation:
Includes all metadata from ChromaDB in the migration process and appends the document text to the metadata.

Batching Robustness:
Handles cases where the dataset is not evenly divisible by the batch size.

Error Handling:
Validates data consistency and raises errors for missing index or improper configurations.

Progress Feedback:
Adds a progress bar using tqdm for better user experience.
---
 migrate_to_pinecone.py | 56 ++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/migrate_to_pinecone.py b/migrate_to_pinecone.py
index 1c757d4..3c07f20 100644
--- a/migrate_to_pinecone.py
+++ b/migrate_to_pinecone.py
@@ -1,5 +1,5 @@
-import chromadb
-from pinecone import Pinecone
+import pinecone
+from chromadb.config import PersistentClient
 from dotenv import load_dotenv
 import os
 from tqdm import tqdm
@@ -10,43 +10,57 @@ def migrate_to_pinecone():
 
 
     # Initialize Pinecone
-    pc = Pinecone(
-        api_key=os.getenv('PINECONE_API_KEY')
+    pinecone.init(
+        api_key=os.getenv('PINECONE_API_KEY'),  # Pinecone API Key
+        environment=os.getenv('PINECONE_ENVIRONMENT')  # Environment (e.g., 'us-west1-gcp')
     )
 
     # Connect to Pinecone index
     index_name = os.getenv('PINECONE_INDEX_NAME')
-    index = pc.Index(index_name)
+    if index_name not in pinecone.list_indexes():
+        raise ValueError(f"Index '{index_name}' not found in Pinecone. Please create it first.")
+    index = pinecone.Index(index_name)
 
     # Initialize ChromaDB
-    chroma_client = chromadb.PersistentClient(path="./chroma_db")
+    chroma_client = PersistentClient(path="./chroma_db")
     collection = chroma_client.get_collection("ds_knowledge_base")
 
-    # Get all documents from ChromaDB
+    # Retrieve all data from ChromaDB
     results = collection.get(
         include=['embeddings', 'documents', 'metadatas']
     )
 
-    # Prepare batches for Pinecone
+    # Validate data
+    embeddings = results['embeddings']
+    documents = results['documents']
+    metadatas = results['metadatas']
+    ids = results['ids']
+
+    if len(ids) != len(embeddings) or len(ids) != len(documents):
+        raise ValueError("Mismatch between embeddings, documents, and IDs in ChromaDB data.")
+
+    # Prepare and migrate in batches
     batch_size = 100
-    for i in tqdm(range(0, len(results['ids']), batch_size)):
-        batch_ids = results['ids'][i:i + batch_size]
-        batch_embeddings = results['embeddings'][i:i + batch_size]
-        batch_documents = results['documents'][i:i + batch_size]
+    for i in tqdm(range(0, len(ids), batch_size), desc="Migrating batches to Pinecone"):
+        batch_ids = ids[i:i + batch_size]
+        batch_embeddings = embeddings[i:i + batch_size]
+        batch_documents = documents[i:i + batch_size]
+        batch_metadatas = metadatas[i:i + batch_size]
 
         # Prepare vectors for Pinecone
         vectors = []
-        for id_, embedding, text in zip(batch_ids, batch_embeddings, batch_documents):
-            vectors.append({
-                'id': id_,
-                'values': embedding,
-                'metadata': {'text': text}
-            })
+        for id_, embedding, document, metadata in zip(batch_ids, batch_embeddings, batch_documents, batch_metadatas):
+            metadata = metadata or {}
+            metadata['text'] = document  # Add document text to metadata
+            vectors.append((id_, embedding, metadata))
 
-        # Upsert to Pinecone
+        # Upsert vectors to Pinecone
         index.upsert(vectors=vectors)
 
-    print(f"Migration complete! Migrated {len(results['ids'])} documents to Pinecone")
+    print(f"Migration complete! Migrated {len(ids)} documents to Pinecone.")
 
 if __name__ == "__main__":
-    migrate_to_pinecone()
\ No newline at end of file
+    try:
+        migrate_to_pinecone()
+    except Exception as e:
+        print(f"An error occurred: {e}")