Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
e3abd90
initial scaffolding for adding vector store / vector database integra…
AmoghTantradi Jan 12, 2025
bd1e8fd
fixed linting, ruff checks pass
AmoghTantradi Jan 12, 2025
880c31f
added changes to requirements.txt file and added additional abstract …
AmoghTantradi Jan 12, 2025
7b5dfd3
refactored
AmoghTantradi Jan 12, 2025
08dfaba
added tests for clustering and filtering
AmoghTantradi Jan 13, 2025
f3a82c1
made edits to test_filter
AmoghTantradi Jan 13, 2025
fc62846
added implementations for weaviate and pinecone vs
AmoghTantradi Jan 14, 2025
3e89b5f
fixed merge conflicts
AmoghTantradi Jan 14, 2025
f2937ad
added extra refactoring and added implementations for qdrant and chro…
AmoghTantradi Jan 14, 2025
a4c7418
fixed some type errors
AmoghTantradi Jan 14, 2025
1357fb3
made further corrections
AmoghTantradi Jan 15, 2025
c76b658
edit uuid type
AmoghTantradi Jan 15, 2025
9f257f7
changed uuid type
AmoghTantradi Jan 15, 2025
99cb535
made type changes to weaviate file
AmoghTantradi Jan 15, 2025
3c8a742
made another change
AmoghTantradi Jan 15, 2025
ccd9e48
typecheck passes for weaviate?
AmoghTantradi Jan 15, 2025
89bf974
type changes for weaviate and qdrant files
AmoghTantradi Jan 16, 2025
a76adb7
made changes to weaviate file
AmoghTantradi Jan 16, 2025
c3e0f0c
made changes to weaviate file
AmoghTantradi Jan 16, 2025
1782281
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
0621b9b
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
b568d1e
type checks all pass locally
AmoghTantradi Jan 16, 2025
9b33a1f
fixed linting errors
AmoghTantradi Jan 16, 2025
820f3be
made refactors to allow for testing
AmoghTantradi Jan 17, 2025
a0a70d2
made changes to tests
AmoghTantradi Jan 22, 2025
6dbd1db
fixed
AmoghTantradi Jan 22, 2025
bea1d19
changed setattr to getattr
AmoghTantradi Jan 22, 2025
f93f7ed
fixed a test
AmoghTantradi Jan 22, 2025
38ff87d
over
AmoghTantradi Jan 25, 2025
c885dbc
another change
AmoghTantradi Jan 25, 2025
8eefac0
fixed type check errors
AmoghTantradi Jan 25, 2025
23bafa5
second refactor (removed index_dir)
AmoghTantradi Jan 27, 2025
75d11ea
fixed type checks
AmoghTantradi Jan 27, 2025
0b0bf38
fixed retriever module errors
AmoghTantradi Jan 27, 2025
6bf7926
fixed key error
AmoghTantradi Jan 27, 2025
f7071a2
added fixes to failing rm tests
AmoghTantradi Jan 28, 2025
6ebe407
fixed chroma
AmoghTantradi Jan 28, 2025
e588bee
removed dynamic indexing for weaviatevs
AmoghTantradi Jan 28, 2025
d6a86e1
fixed type errors
AmoghTantradi Jan 28, 2025
ddfd549
changed weaviate index config
AmoghTantradi Jan 28, 2025
20206e1
changed rm tests index name to avoid pinecone failures
AmoghTantradi Jan 28, 2025
e7ea24f
fixed naming convention for index_dir and fixed serverless spec for p…
AmoghTantradi Jan 28, 2025
f152b54
changed serverless spec for pc index due to free plan
AmoghTantradi Jan 29, 2025
2e21a97
added debug statement
AmoghTantradi Jan 29, 2025
524b501
made changes to errors
AmoghTantradi Jan 29, 2025
87f57e1
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Jan 29, 2025
e995996
added some fixes to collection upload error handling
AmoghTantradi Jan 29, 2025
1a75486
made some other change
AmoghTantradi Jan 29, 2025
c5f50f6
fixed type errors for qdrant vs
AmoghTantradi Feb 9, 2025
85daf51
changed endpoint
AmoghTantradi Feb 9, 2025
6b80fd3
added changes
AmoghTantradi Feb 9, 2025
4bafdb7
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Feb 9, 2025
f90ff0f
added fixes
AmoghTantradi Feb 9, 2025
cccfa39
added some changes
AmoghTantradi Feb 9, 2025
6cf4f0a
added some change
AmoghTantradi Feb 9, 2025
0438b18
another set of changes
AmoghTantradi Feb 9, 2025
43e9bc3
added other logs
AmoghTantradi Feb 9, 2025
90d07d0
added logging
AmoghTantradi Feb 9, 2025
94c9854
initial pinecone_vs commit
AmoghTantradi Feb 14, 2025
26e0708
added edits
AmoghTantradi Feb 14, 2025
70487c5
removed unused import
AmoghTantradi Feb 14, 2025
7063f46
fixed merge conflicts
AmoghTantradi Feb 16, 2025
98f4265
removed pinecone tests for testing clustering
AmoghTantradi Feb 16, 2025
d884e15
added additional test to pinecone index
AmoghTantradi Feb 16, 2025
d622d6d
fix for pinecone vs
AmoghTantradi Feb 16, 2025
80e613b
fixed filter condition and added change to __call__ function
AmoghTantradi Feb 17, 2025
5a0d79a
added small fix
AmoghTantradi Feb 17, 2025
8923e0e
added small fix
AmoghTantradi Feb 17, 2025
d54ca69
added small fix
AmoghTantradi Feb 17, 2025
697dfa8
changed index_dir name to adhere to pinecone standards
AmoghTantradi Feb 17, 2025
ef9f936
added debug statement
AmoghTantradi Feb 17, 2025
c0cb5c3
added print statement
AmoghTantradi Feb 17, 2025
8e78479
added print statement
AmoghTantradi Feb 17, 2025
86548aa
added changes
AmoghTantradi Feb 17, 2025
75b7f3e
added changes
AmoghTantradi Feb 17, 2025
e1051ab
fixed topK in pc
AmoghTantradi Feb 17, 2025
d0387cd
modified tests
AmoghTantradi Feb 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions .github/tests/rm_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import lotus
from lotus.models import CrossEncoderReranker, LiteLLMRM, SentenceTransformersRM
from lotus.vector_store import FaissVS
from lotus.vector_store import FaissVS, PineconeVS

################################################################################
# Setup
Expand Down Expand Up @@ -33,6 +33,7 @@

VECTOR_STORE_TO_CLS = {
'local': FaissVS,
'pinecone': PineconeVS,
}


Expand Down Expand Up @@ -169,7 +170,7 @@ def test_dedup(setup_models):
################################################################################


@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("vs", [key for key in VECTOR_STORE_TO_CLS.keys() if key != "pinecone"])
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_vs_cluster_by(setup_models, setup_vs, vs, model):
rm = setup_models[model]
Expand Down Expand Up @@ -219,7 +220,7 @@ def test_vs_search_rm_only(setup_models, setup_vs, vs, model):
df = df.sem_search("Course Name", "Optimization", K=1)
assert df["Course Name"].tolist() == ["Optimization Methods in Engineering"]

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("vs", [vs for vs in VECTOR_STORE_TO_CLS.keys() if vs != 'pinecone'])
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_vs_sim_join(setup_models, setup_vs, vs, model):
rm = setup_models[model]
Expand Down Expand Up @@ -252,7 +253,7 @@ def test_vs_sim_join(setup_models, setup_vs, vs, model):
"intfloat/e5-small-v2" not in ENABLED_MODEL_NAMES,
reason="Skipping test because intfloat/e5-small-v2 is not enabled",
)
@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("vs", [key for key in VECTOR_STORE_TO_CLS.keys() if key != 'pinecone'])
def test_vs_dedup(setup_models, setup_vs, vs):
rm = setup_models["intfloat/e5-small-v2"]
my_vs = setup_vs[vs]
Expand Down Expand Up @@ -320,8 +321,9 @@ def test_search(setup_models):
df = df.sem_search("Course Name", "Optimization", K=2, n_rerank=1)
assert df["Course Name"].tolist() == ["Optimization Methods in Engineering"]

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_filtered_vector_search(setup_models, model):
def test_filtered_vector_search(setup_models, setup_vs, vs, model):
"""
Test filtered vector search.

Expand All @@ -336,7 +338,7 @@ def test_filtered_vector_search(setup_models, model):
expected to pick out the culinary course "Gourmet Cooking Advanced".
"""
rm = setup_models[model]
vs = FaissVS()
vs = setup_vs[vs]
lotus.settings.configure(rm=rm, vs=vs)

data = {
Expand All @@ -355,7 +357,7 @@ def test_filtered_vector_search(setup_models, model):
}
df = pd.DataFrame(data)
# Index the 'Course Name' column to generate semantic embeddings.
df = df.sem_index("Course Name", "filtered_index_dir")
df = df.sem_index("Course Name", "filteredindexdir")
# Filter the DataFrame to only include Culinary courses.
df_filtered = df[df["Category"] == "Culinary"]
# Perform semantic search on the filtered DataFrame.
Expand Down
2 changes: 1 addition & 1 deletion lotus/models/rm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ def convert_query_to_query_vector(self, queries: Union[pd.Series, str, Image.Ima
queries = queries.tolist()
# Create embeddings for text queries
query_vectors = self._embed(queries)
return query_vectors
return query_vectors
1 change: 0 additions & 1 deletion lotus/sem_ops/sem_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def __call__(

df_idxs = self._obj.index
cur_min = len(df_idxs)

K = min(K, cur_min)

search_K = K
Expand Down
1 change: 1 addition & 0 deletions lotus/sem_ops/sem_sim_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __call__(
right_ids = list(other.index)

vs_output: RMOutput = vs(query_vectors, K, ids=right_ids)
print(f'vs_output: {vs_output}')
distances = vs_output.distances
indices = vs_output.indices

Expand Down
4 changes: 2 additions & 2 deletions lotus/vector_store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from lotus.vector_store.vs import VS
from lotus.vector_store.faiss_vs import FaissVS

__all__ = ["VS", "FaissVS"]
from lotus.vector_store.pinecone_vs import PineconeVS
__all__ = ["VS", "FaissVS", "PineconeVS"]
150 changes: 150 additions & 0 deletions lotus/vector_store/pinecone_vs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from typing import Any, Optional

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from tqdm import tqdm

from lotus.types import RMOutput
from lotus.vector_store.vs import VS

try:
from pinecone import Index, Pinecone, ServerlessSpec
except ImportError as err:
raise ImportError(
"The pinecone library is required to use PineconeVS. Install it with `pip install pinecone`",
) from err

class PineconeVS(VS):
    """Vector store that keeps embeddings in a Pinecone serverless index.

    Documents are stored under their positional id (``0..n-1``, as a string)
    with the document content and the numeric ``doc_id`` attached as metadata,
    so that searches can be restricted to a subset of ids via a metadata
    filter.
    """

    # Number of vectors sent to Pinecone per upsert request.
    _UPSERT_BATCH_SIZE = 100
    # Upper bound applied to the requested K before querying.
    _MAX_TOP_K = 10000

    def __init__(self, max_batch_size: int = 64, api_key: Optional[str] = None):
        """Create the Pinecone client.

        Args:
            max_batch_size: Stored on the instance as ``max_batch_size``.
            api_key: Pinecone API key. When omitted, the value of the
                ``PINECONE_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no API key is supplied and the environment
                variable is unset or empty.
        """
        # Local import keeps this block self-contained; `os` is stdlib.
        import os

        # NOTE(review): assumes VS.__init__ takes no arguments -- confirm
        # against lotus/vector_store/vs.py.
        super().__init__()

        # Credentials must never live in source control: take the key from
        # the caller or from the environment instead of a literal.
        resolved_key = api_key or os.environ.get("PINECONE_API_KEY")
        if not resolved_key:
            raise ValueError(
                "A Pinecone API key is required: pass `api_key` or set the "
                "PINECONE_API_KEY environment variable."
            )

        self.pinecone = Pinecone(api_key=resolved_key)
        # Handle to the currently connected index; set by index()/load_index().
        self.pc_index: Index | None = None
        self.max_batch_size = max_batch_size

    def __del__(self):
        # Intentionally a no-op: nothing is released when the store is
        # garbage collected.
        return

    def _create_index(self, name: str, dimension: int) -> None:
        """Create a cosine-metric serverless index called ``name``."""
        self.pinecone.create_index(
            name=name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )

    def index(self, docs: pd.Series, embeddings: Any, index_dir: str, **kwargs: dict[str, Any]):
        """Create (or reuse) the index ``index_dir`` and upload ``docs`` to it.

        Args:
            docs: Documents to store; a ``pd.Series`` is converted to a list,
                any other value is iterated as given.
            embeddings: 2-D array-like exposing ``.shape``; row ``i`` is the
                embedding of document ``i`` and ``shape[1]`` is used as the
                index dimension.
            index_dir: Name of the Pinecone index.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        self.index_dir = index_dir
        dimension = embeddings.shape[1]

        if index_dir not in self.pinecone.list_indexes().names():
            self._create_index(index_dir, dimension)
        elif self.pinecone.describe_index(index_dir).dimension != dimension:
            # An index with the wrong dimension cannot accept these vectors,
            # so it is dropped and recreated.
            self.pinecone.delete_index(index_dir)
            self._create_index(index_dir, dimension)
        # Otherwise the existing index is reused as-is; vectors already in it
        # are not cleared before the upsert below.

        self.pc_index = self.pinecone.Index(index_dir)

        docs_list = docs.tolist() if isinstance(docs, pd.Series) else docs

        vectors = [
            {
                "id": str(idx),
                # Pinecone expects plain lists, not numpy arrays.
                "values": embedding.tolist(),
                "metadata": {"content": doc, "doc_id": idx},
            }
            for idx, (embedding, doc) in enumerate(zip(embeddings, docs_list))
        ]

        batch_size = self._UPSERT_BATCH_SIZE
        for start in tqdm(range(0, len(vectors), batch_size), desc="Uploading to Pinecone"):
            self.pc_index.upsert(vectors=vectors[start:start + batch_size])

    def load_index(self, index_dir: str):
        """Connect to the existing Pinecone index named ``index_dir``.

        Raises:
            ValueError: If no index with that name exists.
        """
        # Compare against the index *names*, exactly as index() does; the raw
        # listing is a collection of index descriptions, not of strings.
        if index_dir not in self.pinecone.list_indexes().names():
            raise ValueError(f"Index {index_dir} not found")

        self.index_dir = index_dir
        self.pc_index = self.pinecone.Index(index_dir)

    def __call__(
        self,
        query_vectors,
        K: int,
        ids: Optional[list[int]] = None,
        **kwargs: dict[str, Any]
    ) -> RMOutput:
        """Run one Pinecone query per query vector.

        Args:
            query_vectors: Iterable of vectors, each exposing ``.tolist()``.
            K: Number of neighbours requested (capped at 10000).
            ids: When given, only documents whose ``doc_id`` metadata is in
                this list are considered.
            **kwargs: Forwarded unchanged to the Pinecone ``query`` call.

        Returns:
            RMOutput whose ``distances`` hold the match scores and whose
            ``indices`` hold the matching ``doc_id`` values, one row per
            query. Rows with fewer than ``K`` matches are padded with index
            ``-1`` and score ``0.0``.

        Raises:
            ValueError: If no index has been connected yet.
        """
        if self.pc_index is None:
            raise ValueError("No index loaded. Call load_index first.")
        K = min(K, self._MAX_TOP_K)

        # The metadata filter is the same for every query, so build it once.
        metadata_filter = {"doc_id": {"$in": ids}} if ids is not None else None

        all_distances = []
        all_indices = []

        for query_vector in query_vectors:
            # NOTE(review): at least 2 results are always requested while
            # padding below only fills up to K, so for K < 2 a row can be
            # longer than K (and rows may differ in length) -- confirm this
            # is what callers expect.
            results = self.pc_index.query(
                vector=query_vector.tolist(),
                top_k=max(K, 2),
                include_metadata=True,
                filter=metadata_filter,
                **kwargs
            )

            matches = results.matches
            indices = [int(match.metadata["doc_id"]) for match in matches]
            distances = [match.score for match in matches]

            # Pad short result lists up to K entries.
            missing = K - len(indices)
            if missing > 0:
                indices.extend([-1] * missing)
                distances.extend([0.0] * missing)

            all_distances.append(distances)
            all_indices.append(indices)

        return RMOutput(
            distances=np.array(all_distances, dtype=np.float32).tolist(),
            indices=np.array(all_indices, dtype=np.int64).tolist()
        )

    def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
        """Unsupported for this store.

        Raises:
            ValueError: Always.
        """
        raise ValueError('Not a Pinecone supported operation!')