Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
e3abd90
initial scaffolding for adding vector store / vector database integra…
AmoghTantradi Jan 12, 2025
bd1e8fd
fixed linting, ruff checks pass
AmoghTantradi Jan 12, 2025
880c31f
added changes to requirements.txt file and added additional abstract …
AmoghTantradi Jan 12, 2025
7b5dfd3
refactored
AmoghTantradi Jan 12, 2025
08dfaba
added tests for clustering and filtering
AmoghTantradi Jan 13, 2025
f3a82c1
made edits to test_filter
AmoghTantradi Jan 13, 2025
fc62846
added implementations for weaviate and pinecone vs
AmoghTantradi Jan 14, 2025
3e89b5f
fixed merge conflicts
AmoghTantradi Jan 14, 2025
f2937ad
added extra refactoring and added implementations for qdrant and chro…
AmoghTantradi Jan 14, 2025
a4c7418
fixed some type errors
AmoghTantradi Jan 14, 2025
1357fb3
made further corrections
AmoghTantradi Jan 15, 2025
c76b658
edit uuid type
AmoghTantradi Jan 15, 2025
9f257f7
changed uuid type
AmoghTantradi Jan 15, 2025
99cb535
made type changes to weaviate file
AmoghTantradi Jan 15, 2025
3c8a742
made another change
AmoghTantradi Jan 15, 2025
ccd9e48
typecheck passes for weaviate?
AmoghTantradi Jan 15, 2025
89bf974
type changes for weaviate and qdrant files
AmoghTantradi Jan 16, 2025
a76adb7
made changes to weaviate file
AmoghTantradi Jan 16, 2025
c3e0f0c
made changes to weaviate file
AmoghTantradi Jan 16, 2025
1782281
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
0621b9b
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
b568d1e
type checks all pass locally
AmoghTantradi Jan 16, 2025
9b33a1f
fixed linting errors
AmoghTantradi Jan 16, 2025
820f3be
made refactors to allow for testing
AmoghTantradi Jan 17, 2025
a0a70d2
made changes to tests
AmoghTantradi Jan 22, 2025
6dbd1db
fixed
AmoghTantradi Jan 22, 2025
bea1d19
changed setattr to getattr
AmoghTantradi Jan 22, 2025
f93f7ed
fixed a test
AmoghTantradi Jan 22, 2025
38ff87d
over
AmoghTantradi Jan 25, 2025
c885dbc
another change
AmoghTantradi Jan 25, 2025
8eefac0
fixed type check errors
AmoghTantradi Jan 25, 2025
23bafa5
second refactor (removed index_dir)
AmoghTantradi Jan 27, 2025
75d11ea
fixed type checks
AmoghTantradi Jan 27, 2025
0b0bf38
fixed retriever module errors
AmoghTantradi Jan 27, 2025
6bf7926
fixed key error
AmoghTantradi Jan 27, 2025
f7071a2
added fixes to failing rm tests
AmoghTantradi Jan 28, 2025
6ebe407
fixed chroma
AmoghTantradi Jan 28, 2025
e588bee
removed dynamic indexing for weaviatevs
AmoghTantradi Jan 28, 2025
d6a86e1
fixed type errors
AmoghTantradi Jan 28, 2025
ddfd549
changed weaviate index config
AmoghTantradi Jan 28, 2025
20206e1
changed rm tests index name to avoid pinecone failures
AmoghTantradi Jan 28, 2025
e7ea24f
fixed naming convention for index_dir and fixed serverless spec for p…
AmoghTantradi Jan 28, 2025
f152b54
changed serverless spec for pc index due to free plan
AmoghTantradi Jan 29, 2025
2e21a97
added debug statement
AmoghTantradi Jan 29, 2025
524b501
made changes to errors
AmoghTantradi Jan 29, 2025
87f57e1
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Jan 29, 2025
e995996
added some fixes to collection upload error handling
AmoghTantradi Jan 29, 2025
1a75486
made some other change
AmoghTantradi Jan 29, 2025
c5f50f6
fixed type errors for qdrant vs
AmoghTantradi Feb 9, 2025
85daf51
changed endpoint
AmoghTantradi Feb 9, 2025
6b80fd3
added changes
AmoghTantradi Feb 9, 2025
4bafdb7
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Feb 9, 2025
f90ff0f
added fixes
AmoghTantradi Feb 9, 2025
cccfa39
added some changes
AmoghTantradi Feb 9, 2025
6cf4f0a
added some change
AmoghTantradi Feb 9, 2025
0438b18
another set of changes
AmoghTantradi Feb 9, 2025
43e9bc3
added other logs
AmoghTantradi Feb 9, 2025
90d07d0
added logging
AmoghTantradi Feb 9, 2025
6bf69ff
chroma_vs implementation
AmoghTantradi Feb 14, 2025
38580ce
removed unused imports
AmoghTantradi Feb 14, 2025
988e072
removed pinecone reference
AmoghTantradi Feb 14, 2025
d3071f1
merged with main
AmoghTantradi Feb 16, 2025
c58e479
removed merge conflicts
AmoghTantradi Feb 16, 2025
3ffd039
modified chroma_vs call function to include optional filtering with ids
AmoghTantradi Feb 16, 2025
9eca851
fixed where filter
AmoghTantradi Feb 16, 2025
63a378c
fixed typing errs
AmoghTantradi Feb 16, 2025
d376a49
fixed linting
AmoghTantradi Feb 16, 2025
6aef446
changed threshhold
AmoghTantradi Feb 16, 2025
720e552
added additional test for chroma_vs
AmoghTantradi Feb 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions .github/tests/rm_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import lotus
from lotus.models import CrossEncoderReranker, LiteLLMRM, SentenceTransformersRM
from lotus.vector_store import FaissVS
from lotus.vector_store import ChromaVS, FaissVS

################################################################################
# Setup
Expand Down Expand Up @@ -33,6 +33,7 @@

VECTOR_STORE_TO_CLS = {
'local': FaissVS,
'chroma': ChromaVS,
}


Expand Down Expand Up @@ -254,6 +255,9 @@ def test_vs_sim_join(setup_models, setup_vs, vs, model):
)
@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
def test_vs_dedup(setup_models, setup_vs, vs):
curr_threshold = 0.85
if vs == "chroma":
curr_threshold = 0.9
rm = setup_models["intfloat/e5-small-v2"]
my_vs = setup_vs[vs]
lotus.settings.configure(rm=rm, vs=my_vs)
Expand All @@ -266,7 +270,7 @@ def test_vs_dedup(setup_models, setup_vs, vs):
]
}
df = pd.DataFrame(data)
df = df.sem_index("Text", "fourthindexdir").sem_dedup("Text", threshold=0.85)
df = df.sem_index("Text", "fourthindexdir").sem_dedup("Text", threshold=curr_threshold)
kept = df["Text"].tolist()
kept.sort()
assert len(kept) == 2, kept
Expand Down Expand Up @@ -320,8 +324,9 @@ def test_search(setup_models):
df = df.sem_search("Course Name", "Optimization", K=2, n_rerank=1)
assert df["Course Name"].tolist() == ["Optimization Methods in Engineering"]

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_filtered_vector_search(setup_models, model):
def test_filtered_vector_search(setup_models, setup_vs, vs, model):
"""
Test filtered vector search.

Expand All @@ -336,7 +341,7 @@ def test_filtered_vector_search(setup_models, model):
expected to pick out the culinary course "Gourmet Cooking Advanced".
"""
rm = setup_models[model]
vs = FaissVS()
vs = setup_vs[vs]
lotus.settings.configure(rm=rm, vs=vs)

data = {
Expand Down
2 changes: 1 addition & 1 deletion lotus/models/rm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ def convert_query_to_query_vector(self, queries: Union[pd.Series, str, Image.Ima
queries = queries.tolist()
# Create embeddings for text queries
query_vectors = self._embed(queries)
return query_vectors
return query_vectors
1 change: 0 additions & 1 deletion lotus/sem_ops/sem_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def __call__(

df_idxs = self._obj.index
cur_min = len(df_idxs)

K = min(K, cur_min)

search_K = K
Expand Down
3 changes: 2 additions & 1 deletion lotus/vector_store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from lotus.vector_store.vs import VS
from lotus.vector_store.faiss_vs import FaissVS
from lotus.vector_store.chroma_vs import ChromaVS

__all__ = ["VS", "FaissVS"]
__all__ = ["VS", "FaissVS", "ChromaVS"]
181 changes: 181 additions & 0 deletions lotus/vector_store/chroma_vs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
from typing import Any, List, Mapping, Optional, Union, cast

import numpy as np
import pandas as pd
from chromadb import Where
from numpy.typing import NDArray
from tqdm import tqdm

from lotus.types import RMOutput
from lotus.vector_store.vs import VS

try:
from chromadb import Client, ClientAPI
from chromadb.api import Collection
from chromadb.api.types import IncludeEnum
from chromadb.errors import InvalidDimensionException
except ImportError as err:
raise ImportError(
"The chromadb library is required to use ChromaVS. Install it with `pip install chromadb`"
) from err

class ChromaVS(VS):
def __init__(self, max_batch_size: int = 64):

client: ClientAPI = Client()

"""Initialize with ChromaDB client and embedding model"""
super()
self.client = client
self.collection: Collection | None = None
self.index_dir = None
self.max_batch_size = max_batch_size

def __del__(self):
return

def index(self, docs: Any, embeddings: Any, index_dir: str, **kwargs: dict[str, Any]):
"""Create a collection and add documents with their embeddings"""
self.index_dir = index_dir

# Create collection without embedding function (we'll provide embeddings directly)
self.collection = self.client.get_or_create_collection(
name=index_dir,
metadata={"hnsw:space": "cosine"} # Use cosine similarity for consistency
)

# Convert docs to list if it's a pandas Series
docs_list = docs.tolist() if isinstance(docs, pd.Series) else docs

# Prepare documents for addition
ids = [str(i) for i in range(len(docs_list))]
metadatas: list[Mapping[str, Union[str, int, float, bool]]] = [{"doc_id": int(i)} for i in range(len(docs_list))]

# Add documents in batches
batch_size = 100
for i in tqdm(range(0, len(docs_list), batch_size), desc="Uploading to ChromaDB"):
end_idx = min(i + batch_size, len(docs_list))
try:
self.collection.add(
ids=ids[i:end_idx],
documents=docs_list[i:end_idx],
embeddings=embeddings[i:end_idx].tolist(),
metadatas=metadatas[i:end_idx]
)
except InvalidDimensionException:
# delete, recreate, then add
self.client.delete_collection(index_dir)
# Create collection without embedding function (we'll provide embeddings directly)
self.collection = self.client.get_or_create_collection(
name=index_dir,
metadata={"hnsw:space": "cosine"} # Use cosine similarity for consistency
)
self.collection.add(
ids=ids[i:end_idx],
documents=docs_list[i:end_idx],
embeddings=embeddings[i:end_idx].tolist(),
metadatas=metadatas[i:end_idx]
)

def load_index(self, index_dir: str):
"""Load an existing collection"""
try:
self.collection = self.client.get_collection(index_dir)
self.index_dir = index_dir
except ValueError as e:
raise ValueError(f"Collection {index_dir} not found") from e

def __call__(
self,
query_vectors,
K: int,
ids: Optional[list[int]] = None,
**kwargs: dict[str, Any]
) -> RMOutput:
"""
Perform vector search using ChromaDB with optional filtering by document IDs.

Args:
query_vectors: Pre-embedded query vectors.
K (int): Number of nearest neighbors to retrieve.
ids (Optional[list[Any]]): If provided, the search will be limited to documents with these ids.
**kwargs: Additional parameters.

Returns:
RMOutput: Contains the distances and indices of the nearest neighbors.
"""
if self.collection is None:
raise ValueError("No collection loaded. Call load_index first.")

all_distances: list[list[float]] = []
all_indices: list[list[int]] = []

# Process each query vector.
for query_vector in query_vectors:
# Prepare the where clause by casting ids to a list of allowed types.
where_clause: Optional[dict[str, Union[dict[str, List[Union[str, int, float, bool]]]]]] = None
if ids:
where_clause = {"doc_id": {"$in": cast(List[Union[str, int, float, bool]], ids)}}

results = self.collection.query(
query_embeddings=[query_vector.tolist()],
n_results=K,
include=[IncludeEnum.metadatas, IncludeEnum.distances],
where=cast(Where, where_clause),
)

distances: list[float] = []
indices: list[int] = []

# Retrieve and cast search results to help the type checker.
metadatas = results.get("metadatas")
dists = results.get("distances")
if metadatas is not None and dists is not None:
metadatas = cast(
List[List[Mapping[str, Union[str, int, float, bool]]]], metadatas
)
dists = cast(List[List[float]], dists)
for metadata, distance in zip(metadatas[0], dists[0]):
if metadata is not None and distance is not None:
indices.append(int(metadata["doc_id"]))
# Convert squared L2 distances to cosine similarity.
distances.append(1 - (distance / 2))

# Pad results if fewer than K matches are returned.
while len(indices) < K:
indices.append(-1)
distances.append(0.0)

all_indices.append(indices)
all_distances.append(distances)

return RMOutput(
distances=np.array(all_distances, dtype=np.float32).tolist(),
indices=np.array(all_indices, dtype=np.int64).tolist()
)

def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
"""Retrieve vectors for specific document IDs"""
if self.collection is None or self.index_dir != index_dir:
self.load_index(index_dir)


if self.collection is None: # Add this check after load_index
raise ValueError(f"Failed to load collection {index_dir}")


# Convert integer ids to strings for ChromaDB
str_ids = [str(id) for id in ids]

# Get embeddings from ChromaDB
results = self.collection.get(
ids=str_ids,
include=[IncludeEnum.embeddings]
)

if results['embeddings'] is None:
raise ValueError("No vectors found for the given ids", results['embeddings'])

return np.array(results['embeddings'], dtype=np.float64)


4 changes: 2 additions & 2 deletions lotus/vector_store/vs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __call__(
self,
query_vectors: Any,
K: int,
ids: Optional[list[Any]] = None,
ids: Optional[list[int]] = None,
**kwargs: dict[str, Any],
) -> RMOutput:
"""
Expand All @@ -52,7 +52,7 @@ def __call__(
pass

@abstractmethod
def get_vectors_from_index(self, index_dir: str, ids: list[Any]) -> NDArray[np.float64]:
def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
"""
Retrieve vectors from a stored index given specific ids.
"""
Expand Down