Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
e3abd90
initial scaffolding for adding vector store / vector database integra…
AmoghTantradi Jan 12, 2025
bd1e8fd
fixed linting, ruff checks pass
AmoghTantradi Jan 12, 2025
880c31f
added changes to requirements.txt file and added additional abstract …
AmoghTantradi Jan 12, 2025
7b5dfd3
refactored
AmoghTantradi Jan 12, 2025
08dfaba
added tests for clustering and filtering
AmoghTantradi Jan 13, 2025
f3a82c1
made edits to test_filter
AmoghTantradi Jan 13, 2025
fc62846
added implementations for weaviate and pinecone vs
AmoghTantradi Jan 14, 2025
3e89b5f
fixed merge conflicts
AmoghTantradi Jan 14, 2025
f2937ad
added extra refactoring and added implementations for qdrant and chro…
AmoghTantradi Jan 14, 2025
a4c7418
fixed some type errors
AmoghTantradi Jan 14, 2025
1357fb3
made further corrections
AmoghTantradi Jan 15, 2025
c76b658
edit uuid type
AmoghTantradi Jan 15, 2025
9f257f7
changed uuid type
AmoghTantradi Jan 15, 2025
99cb535
made type changes to weaviate file
AmoghTantradi Jan 15, 2025
3c8a742
made another change
AmoghTantradi Jan 15, 2025
ccd9e48
typecheck passes for weaviate?
AmoghTantradi Jan 15, 2025
89bf974
type changes for weaviate and qdrant files
AmoghTantradi Jan 16, 2025
a76adb7
made changes to weaviate file
AmoghTantradi Jan 16, 2025
c3e0f0c
made changes to weaviate file
AmoghTantradi Jan 16, 2025
1782281
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
0621b9b
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
b568d1e
type checks all pass locally
AmoghTantradi Jan 16, 2025
9b33a1f
fixed linting errors
AmoghTantradi Jan 16, 2025
820f3be
made refactors to allow for testing
AmoghTantradi Jan 17, 2025
a0a70d2
made changes to tests
AmoghTantradi Jan 22, 2025
6dbd1db
fixed
AmoghTantradi Jan 22, 2025
bea1d19
changed setattr to getattr
AmoghTantradi Jan 22, 2025
f93f7ed
fixed a test
AmoghTantradi Jan 22, 2025
38ff87d
over
AmoghTantradi Jan 25, 2025
c885dbc
another change
AmoghTantradi Jan 25, 2025
8eefac0
fixed type check errors
AmoghTantradi Jan 25, 2025
23bafa5
second refactor (removed index_dir)
AmoghTantradi Jan 27, 2025
75d11ea
fixed type checks
AmoghTantradi Jan 27, 2025
0b0bf38
fixed retriever module errors
AmoghTantradi Jan 27, 2025
6bf7926
fixed key error
AmoghTantradi Jan 27, 2025
f7071a2
added fixes to failing rm tests
AmoghTantradi Jan 28, 2025
6ebe407
fixed chroma
AmoghTantradi Jan 28, 2025
e588bee
removed dynamic indexing for weaviatevs
AmoghTantradi Jan 28, 2025
d6a86e1
fixed type errors
AmoghTantradi Jan 28, 2025
ddfd549
changed weaviate index config
AmoghTantradi Jan 28, 2025
20206e1
changed rm tests index name to avoid pinecone failures
AmoghTantradi Jan 28, 2025
e7ea24f
fixed naming convention for index_dir and fixed serverless spec for p…
AmoghTantradi Jan 28, 2025
f152b54
changed serverless spec for pc index due to free plan
AmoghTantradi Jan 29, 2025
2e21a97
added debug statement
AmoghTantradi Jan 29, 2025
524b501
made changes to errors
AmoghTantradi Jan 29, 2025
87f57e1
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Jan 29, 2025
e995996
added some fixes to collection upload error handling
AmoghTantradi Jan 29, 2025
1a75486
made some other change
AmoghTantradi Jan 29, 2025
c5f50f6
fixed type errors for qdrant vs
AmoghTantradi Feb 9, 2025
85daf51
changed endpoint
AmoghTantradi Feb 9, 2025
6b80fd3
added changes
AmoghTantradi Feb 9, 2025
4bafdb7
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Feb 9, 2025
f90ff0f
added fixes
AmoghTantradi Feb 9, 2025
cccfa39
added some changes
AmoghTantradi Feb 9, 2025
6cf4f0a
added some change
AmoghTantradi Feb 9, 2025
0438b18
another set of changes
AmoghTantradi Feb 9, 2025
43e9bc3
added other logs
AmoghTantradi Feb 9, 2025
90d07d0
added logging
AmoghTantradi Feb 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/tests/lm_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from lotus.cache import CacheConfig, CacheFactory, CacheType
from lotus.models import LM, SentenceTransformersRM
from lotus.types import CascadeArgs
from lotus.vector_store import FaissVS

################################################################################
# Setup
Expand Down Expand Up @@ -289,7 +290,8 @@ def test_filter_cascade(setup_models):
def test_join_cascade(setup_models):
models = setup_models
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
lotus.settings.configure(lm=models["gpt-4o-mini"], rm=rm)
vs = FaissVS()
lotus.settings.configure(lm=models["gpt-4o-mini"], rm=rm, vs=vs)

data1 = {
"School": [
Expand Down
10 changes: 7 additions & 3 deletions .github/tests/multimodality_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import lotus
from lotus.dtype_extensions import ImageArray
from lotus.models import LM, SentenceTransformersRM
from lotus.vector_store import FaissVS

################################################################################
# Setup
Expand Down Expand Up @@ -160,7 +161,8 @@ def test_topk_with_groupby_operation(setup_models, model):
@pytest.mark.parametrize("model", get_enabled("clip-ViT-B-32"))
def test_search_operation(setup_models, model):
rm = setup_models[model]
lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)

image_url = [
"https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
Expand All @@ -180,7 +182,8 @@ def test_search_operation(setup_models, model):
@pytest.mark.parametrize("model", get_enabled("clip-ViT-B-32"))
def test_sim_join_operation_image_index(setup_models, model):
rm = setup_models[model]
lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)

image_url = [
"https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
Expand All @@ -205,7 +208,8 @@ def test_sim_join_operation_image_index(setup_models, model):
@pytest.mark.parametrize("model", get_enabled("clip-ViT-B-32"))
def test_sim_join_operation_text_index(setup_models, model):
rm = setup_models[model]
lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)

image_url = [
"https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
Expand Down
143 changes: 138 additions & 5 deletions .github/tests/rm_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import lotus
from lotus.models import CrossEncoderReranker, LiteLLMRM, SentenceTransformersRM
from lotus.vector_store import ChromaVS, FaissVS, PineconeVS, QdrantVS, WeaviateVS

################################################################################
# Setup
Expand All @@ -30,6 +31,14 @@
"text-embedding-3-small": LiteLLMRM,
}

# Short backend name -> vector store class, instantiated with no arguments by the tests.
VECTOR_STORE_TO_CLS = {
    "local": FaissVS,
    "weaviate": WeaviateVS,
    "pinecone": PineconeVS,
    "chroma": ChromaVS,
    "qdrant": QdrantVS,
}


def get_enabled(*candidate_models: str) -> list[str]:
    """Return the candidates found in ENABLED_MODEL_NAMES, in the order given."""
    return [
        name
        for name in candidate_models
        if name in ENABLED_MODEL_NAMES
    ]
Expand All @@ -41,16 +50,28 @@ def setup_models():

for model_name in ENABLED_MODEL_NAMES:
models[model_name] = MODEL_NAME_TO_CLS[model_name](model=model_name)


return models


@pytest.fixture(scope='session')
def setup_vs():
    """Instantiate every class in VECTOR_STORE_TO_CLS once and return them keyed by name."""
    return {name: store_cls() for name, store_cls in VECTOR_STORE_TO_CLS.items()}

################################################################################
# RM Only Tests
################################################################################
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_cluster_by(setup_models, model):
rm = setup_models[model]
lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)

data = {
"Course Name": [
Expand Down Expand Up @@ -79,7 +100,9 @@ def test_cluster_by(setup_models, model):
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_search_rm_only(setup_models, model):
rm = setup_models[model]
lotus.settings.configure(rm=rm)
vs = FaissVS()

lotus.settings.configure(rm=rm, vs=vs)

data = {
"Course Name": [
Expand All @@ -98,7 +121,8 @@ def test_search_rm_only(setup_models, model):
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_sim_join(setup_models, model):
rm = setup_models[model]
lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)

data1 = {
"Course Name": [
Expand All @@ -124,7 +148,8 @@ def test_sim_join(setup_models, model):
)
def test_dedup(setup_models):
rm = setup_models["intfloat/e5-small-v2"]
lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm,vs=vs)
data = {
"Text": [
"Probability and Random Processes",
Expand All @@ -142,6 +167,113 @@ def test_dedup(setup_models):
assert "Probability" in kept[1], kept



################################################################################
# VS Only Tests
################################################################################


Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An important test to add is filtered vector search — i.e., the program starts with some df, embeds/indexes a column, applies any filter op (it can be a structured filter), and then calls a sem op that searches over the indexed column.

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_vs_cluster_by(setup_models, setup_vs, vs, model):
    """Index four course names, cluster them into two groups, and check the grouping."""
    lotus.settings.configure(rm=setup_models[model], vs=setup_vs[vs])

    courses = pd.DataFrame(
        {
            "Course Name": [
                "Probability and Random Processes",
                "Cooking",
                "Food Sciences",
                "Optimization Methods in Engineering",
            ]
        }
    )
    indexed = courses.sem_index("Course Name", "indexdir")
    clustered = indexed.sem_cluster_by("Course Name", 2)
    groups = clustered.groupby("cluster_id")["Course Name"].apply(set).to_dict()
    assert len(groups) == 2, groups

    # The cluster holding "Cooking" may be labelled 0 or 1, so pick groups by membership.
    cooking_group, probability_group = (
        (groups[0], groups[1]) if "Cooking" in groups[0] else (groups[1], groups[0])
    )

    assert cooking_group == {"Cooking", "Food Sciences"}, groups
    assert probability_group == {"Probability and Random Processes", "Optimization Methods in Engineering"}, groups

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_vs_search_rm_only(setup_models, setup_vs, vs, model):
    """Index the course names and check that a K=1 search for "Optimization" returns the one matching course."""
    lotus.settings.configure(rm=setup_models[model], vs=setup_vs[vs])

    courses = pd.DataFrame(
        {
            "Course Name": [
                "Probability and Random Processes",
                "Cooking",
                "Food Sciences",
                "Optimization Methods in Engineering",
            ]
        }
    )
    indexed = courses.sem_index("Course Name", "secondindexdir")
    top_hit = indexed.sem_search("Course Name", "Optimization", K=1)
    assert top_hit["Course Name"].tolist() == ["Optimization Methods in Engineering"]

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_vs_sim_join(setup_models, setup_vs, vs, model):
    """Similarity-join two courses against an indexed skill column and check the K=1 matches."""
    lotus.settings.configure(rm=setup_models[model], vs=setup_vs[vs])

    course_rows = {
        "Course Name": [
            "History of the Atlantic World",
            "Riemannian Geometry",
        ]
    }
    skill_rows = {"Skill": ["Math", "History"]}

    courses = pd.DataFrame(course_rows)
    # Only the right-hand frame is indexed before the join.
    skills = pd.DataFrame(skill_rows).sem_index("Skill", "thirdindexdir")

    joined = courses.sem_sim_join(skills, left_on="Course Name", right_on="Skill", K=1)
    joined_pairs = set(zip(joined["Course Name"], joined["Skill"]))
    assert joined_pairs == {
        ("History of the Atlantic World", "History"),
        ("Riemannian Geometry", "Math"),
    }, joined_pairs


# TODO: threshold is hardcoded for intfloat/e5-small-v2
@pytest.mark.skipif(
    "intfloat/e5-small-v2" not in ENABLED_MODEL_NAMES,
    reason="Skipping test because intfloat/e5-small-v2 is not enabled",
)
@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
def test_vs_dedup(setup_models, setup_vs, vs):
    """Deduplicate four texts at threshold 0.85 and expect one "Harry" and one "Probability" entry."""
    lotus.settings.configure(rm=setup_models["intfloat/e5-small-v2"], vs=setup_vs[vs])

    texts = pd.DataFrame(
        {
            "Text": [
                "Probability and Random Processes",
                "Probability and Markov Chains",
                "Harry Potter",
                "Harry James Potter",
            ]
        }
    )
    deduped = texts.sem_index("Text", "fourthindexdir").sem_dedup("Text", threshold=0.85)

    # Sort so the surviving "Harry ..." entry precedes the "Probability ..." entry.
    kept = sorted(deduped["Text"].tolist())
    assert len(kept) == 2, kept
    assert "Harry" in kept[0], kept
    assert "Probability" in kept[1], kept


################################################################################
# Reranker Only Tests
################################################################################
Expand Down Expand Up @@ -171,8 +303,9 @@ def test_search_reranker_only(setup_models, model):
def test_search(setup_models):
models = setup_models
rm = models["intfloat/e5-small-v2"]
vs = FaissVS()
reranker = models["mixedbread-ai/mxbai-rerank-xsmall-v1"]
lotus.settings.configure(rm=rm, reranker=reranker)
lotus.settings.configure(rm=rm, vs = vs, reranker=reranker)

data = {
"Course Name": [
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ jobs:
rm_test:
name: Retrieval Model Tests
runs-on: ubuntu-latest
timeout-minutes: 5
timeout-minutes: 10

steps:
- name: Checkout code
Expand Down
4 changes: 3 additions & 1 deletion examples/op_examples/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.vector_store import FaissVS

lm = LM(model="gpt-4o-mini")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()

lotus.settings.configure(lm=lm, rm=rm)
lotus.settings.configure(lm=lm, rm=rm, vs=vs)
data = {
"Course Name": [
"Probability and Random Processes",
Expand Down
5 changes: 3 additions & 2 deletions examples/op_examples/dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

import lotus
from lotus.models import SentenceTransformersRM
from lotus.vector_store import FaissVS

rm = SentenceTransformersRM(model="intfloat/e5-base-v2")

lotus.settings.configure(rm=rm)
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)
data = {
"Text": [
"Probability and Random Processes",
Expand Down
4 changes: 3 additions & 1 deletion examples/op_examples/join_cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.types import CascadeArgs
from lotus.vector_store import FaissVS

lm = LM(model="gpt-4o-mini")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()

lotus.settings.configure(lm=lm, rm=rm)
lotus.settings.configure(lm=lm, rm=rm, vs=vs)
data = {
"Course Name": [
"Digital Design and Integrated Circuits",
Expand Down
4 changes: 3 additions & 1 deletion examples/op_examples/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.vector_store import FaissVS

lm = LM(max_tokens=2048)
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()

lotus.settings.configure(lm=lm, rm=rm)
lotus.settings.configure(lm=lm, rm=rm, vs=vs)
data = {
"Course Name": [
"Probability and Random Processes",
Expand Down
6 changes: 4 additions & 2 deletions examples/op_examples/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

import lotus
from lotus.models import LM, CrossEncoderReranker, SentenceTransformersRM
from lotus.vector_store import FaissVS

lm = LM(model="gpt-4o-mini")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1")
# Cross-encoder passed to lotus.settings.configure as the reranker.
reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1")
vs = FaissVS()

lotus.settings.configure(lm=lm, rm=rm, reranker=reranker)
lotus.settings.configure(lm=lm, rm=rm, reranker=reranker, vs=vs)
data = {
"Course Name": [
"Probability and Random Processes",
Expand Down
4 changes: 3 additions & 1 deletion examples/op_examples/sim_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import lotus
from lotus.models import LM, LiteLLMRM
from lotus.vector_store import FaissVS

lm = LM(model="gpt-4o-mini")
rm = LiteLLMRM(model="text-embedding-3-small")
vs = FaissVS()

lotus.settings.configure(lm=lm, rm=rm)
lotus.settings.configure(lm=lm, rm=rm, vs=vs)
data = {
"Course Name": [
"History of the Atlantic World",
Expand Down
6 changes: 4 additions & 2 deletions lotus/models/colbertv2_rm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from numpy.typing import NDArray
from PIL import Image

from lotus.models.rm import RM
from lotus.types import RMOutput

try:
Expand All @@ -16,7 +15,7 @@
pass


class ColBERTv2RM(RM):
class ColBERTv2RM():
def __init__(self) -> None:
self.docs: list[str] | None = None
self.kwargs: dict[str, Any] = {"doc_maxlen": 300, "nbits": 2}
Expand Down Expand Up @@ -46,6 +45,9 @@ def load_index(self, index_dir: str) -> None:
def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
raise NotImplementedError("This method is not implemented for ColBERTv2RM")



# this should be called in vs.py if it's
def __call__(
self,
queries: str | Image.Image | list | NDArray[np.float64],
Expand Down
Loading