-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_chroma.py
More file actions
57 lines (47 loc) · 2.25 KB
/
debug_chroma.py
File metadata and controls
57 lines (47 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Quick diagnostic script to check what's in ChromaDB and test retrieval."""
import chromadb
import os
from dotenv import load_dotenv
# Load .env before reading any setting below. override=True lets values from
# .env replace variables that are already set in the process environment.
load_dotenv(override=True)

# Store location and collection to inspect. The defaults are the values this
# script always used; setting CHROMA_DB_PATH / CHROMA_COLLECTION_NAME (in the
# environment or in .env) points the script at another store without editing
# the file. `or` keeps the default when the variable is set but empty.
CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH") or "./chroma_db"
COLLECTION_NAME = os.getenv("CHROMA_COLLECTION_NAME") or "confluence_docs"

# Open the on-disk store and look up the existing collection by name.
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
col = client.get_collection(COLLECTION_NAME)

# `total` is reused later in the script for the keyword-scan summary line.
total = col.count()
print(f"Total docs in ChromaDB: {total}")
# Check sample docs: print the first few IDs, then the metadata keys, source
# and a text preview for the first two records.
sample = col.get(limit=3, include=["documents", "metadatas"])
print(f"\nSample IDs: {sample['ids'][:3]}")
sample_pairs = zip(sample["documents"][:2], sample["metadatas"][:2])
for i, (doc, meta) in enumerate(sample_pairs):
    # A record stored without metadata may come back as None rather than {};
    # normalise so the lookups below cannot raise AttributeError.
    meta = meta or {}
    print(f"\n--- Sample {i} metadata keys: {list(meta.keys())}")
    print(f" source: {meta.get('source', 'N/A')}")
    # Prefer a "text" metadata entry when present (original behaviour) and
    # fall back to the stored document so the preview is not silently empty
    # when the text lives only in the document field.
    text = meta.get("text") or doc or ""
    print(f" text preview: {text[:200]}...")
# Search for 'mantis' in documents (case-insensitive substring match).
print("\n\n=== Searching stored documents for 'mantis' ===")

# Page through the collection instead of pulling every record in one call, so
# memory stays bounded on large stores. Only documents are requested because
# the scan never looks at metadata.
SCAN_BATCH_SIZE = 1000
mantis_count = 0
offset = 0
while True:
    batch = col.get(limit=SCAN_BATCH_SIZE, offset=offset, include=["documents"])
    batch_ids = batch["ids"]
    if not batch_ids:
        break
    for doc_id, doc in zip(batch_ids, batch["documents"]):
        # `doc` can be None/empty for records stored without a document.
        if doc and "mantis" in doc.lower():
            mantis_count += 1
            # Count every match but only print the first five to keep the
            # output readable.
            if mantis_count <= 5:
                print(f" Found in doc {doc_id}: {doc[:150]}...")
    if len(batch_ids) < SCAN_BATCH_SIZE:
        # A short page means the end was reached; skip the extra empty fetch.
        break
    offset += len(batch_ids)

print(f"\nTotal docs containing 'mantis': {mantis_count} out of {total}")
# NOTE(review): this import sits after the plain ChromaDB checks, presumably so
# those run before the heavier embedding stack is loaded — confirm before
# moving it to the top of the file.
from langchain_community.embeddings.huggingface import HuggingFaceBgeEmbeddings

# Use the GPU when one is usable and fall back to CPU otherwise, so the script
# also runs on machines without CUDA instead of failing at model load.
try:
    import torch

    _embed_device = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    _embed_device = "cpu"

# Embedding model used to encode the test query below.
# NOTE(review): model name and normalisation presumably need to match what was
# used when the collection was populated — verify against the ingestion code.
embed_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs={"device": _embed_device},
    encode_kwargs={"normalize_embeddings": True},
)
# Test vector similarity search: embed one fixed question, ask the collection
# for its 5 nearest records, and report whether each hit mentions 'mantis'.
print("\n\n=== Vector similarity search for 'Mantis DAG' ===")
query_embedding = embed_model.embed_query("tell me all you know about Mantis DAG")
print(f"Query embedding dimension: {len(query_embedding)}")
results = col.query(
    query_embeddings=[query_embedding],
    n_results=5,
    include=["documents", "metadatas", "distances"],
)

# Each result field holds one list per query embedding; a single query was
# sent, so index 0 selects its hits.
hits = zip(
    results["ids"][0],
    results["documents"][0],
    results["metadatas"][0],
    results["distances"][0],
)
for rank, (doc_id, doc, meta, dist) in enumerate(hits, start=1):
    # Either field may be None for a record stored without it; normalise both
    # so the lookups below cannot raise.
    doc = doc or ""
    meta = meta or {}
    print(f"\n Result {rank} (distance={dist:.4f}):")
    print(f" ID: {doc_id}")
    print(f" source: {meta.get('source', 'N/A')}")
    has_mantis = "mantis" in doc.lower()
    print(f" contains 'mantis': {has_mantis}")
    print(f" preview: {doc[:200]}...")