Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ LOTUS implements the semantic operator programming model and provides an optimiz
retriever_models
reranker_models
multimodal_models
vector_store
usage

.. toctree::
Expand Down
124 changes: 124 additions & 0 deletions docs/vector_store.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
Vector Stores
=====================

Lotus supports multiple vector store backends for efficient semantic indexing and search. This document describes how to use and configure the available vector stores, including Qdrant, Faiss, and Weaviate.

Supported Vector Stores
-----------------------
- QdrantVS
- FaissVS
- WeaviateVS

QdrantVS
--------

**Installation**
^^^^^^^^^^^^^^^^
Install the Qdrant client and Lotus with Qdrant support:

.. code-block:: bash

pip install qdrant-client lotus[qdrant]

**Running Qdrant**
^^^^^^^^^^^^^^^^^^
You can run Qdrant locally using Docker:

.. code-block:: bash

docker run -p 6333:6333 -p 6334:6334 \
-v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
qdrant/qdrant

**Example Usage**
^^^^^^^^^^^^^^^^^

.. code-block:: python

import pandas as pd
from qdrant_client import QdrantClient
import lotus
from lotus.models import LiteLLMRM # or SentenceTransformersRM
from lotus.vector_store import QdrantVS

# Start Qdrant server before running this code
client = QdrantClient(url="http://localhost:6333")
rm = LiteLLMRM(model="text-embedding-3-small")
vs = QdrantVS(client)
lotus.settings.configure(rm=rm, vs=vs)

data = {"Course Name": ["Machine Learning 101", "Introduction to Cooking"]}
df = pd.DataFrame(data)
df = df.sem_index("Course Name", "my_qdrant_index")
result = df.sem_search("Course Name", "Find the course about machine learning", K=1)
print(result)

FaissVS
-------

**Installation**
^^^^^^^^^^^^^^^^

.. code-block:: bash

pip install faiss-cpu lotus

**Example Usage**
^^^^^^^^^^^^^^^^^

.. code-block:: python

import pandas as pd
import lotus
from lotus.models import LiteLLMRM
from lotus.vector_store import FaissVS

rm = LiteLLMRM(model="text-embedding-3-small")
vs = FaissVS()
lotus.settings.configure(rm=rm, vs=vs)

data = {"Course Name": ["Machine Learning 101", "Introduction to Cooking"]}
df = pd.DataFrame(data)
df = df.sem_index("Course Name", "my_faiss_index")
result = df.sem_search("Course Name", "Find the course about machine learning", K=1)
print(result)

WeaviateVS
----------

**Installation**
^^^^^^^^^^^^^^^^

.. code-block:: bash

pip install weaviate-client lotus[weaviate]

**Running Weaviate**
^^^^^^^^^^^^^^^^^^^^
You can run Weaviate locally using Docker:

.. code-block:: bash

docker run -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.29.1

**Example Usage**
^^^^^^^^^^^^^^^^^

.. code-block:: python

import pandas as pd
import weaviate
import lotus
from lotus.models import LiteLLMRM
from lotus.vector_store import WeaviateVS

client = weaviate.connect_to_local()  # defaults to localhost:8080 (HTTP) and 50051 (gRPC)
rm = LiteLLMRM(model="text-embedding-3-small")
vs = WeaviateVS(client)
lotus.settings.configure(rm=rm, vs=vs)

data = {"Course Name": ["Machine Learning 101", "Introduction to Cooking"]}
df = pd.DataFrame(data)
df = df.sem_index("Course Name", "my_weaviate_index")
result = df.sem_search("Course Name", "Find the course about machine learning", K=1)
print(result)
42 changes: 42 additions & 0 deletions examples/vs_examples/search_qdrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pandas as pd
from qdrant_client import QdrantClient

import lotus
from lotus.models import SentenceTransformersRM
from lotus.vector_store import QdrantVS

# A Qdrant server must be reachable on localhost:6333 before this script runs.
# Start one with:
#   docker run -p 6333:6333 -p 6334:6334 \
#       -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
#       qdrant/qdrant
qdrant = QdrantClient(url="http://localhost:6333")
embedder = SentenceTransformersRM(model="intfloat/e5-base-v2")
store = QdrantVS(qdrant)

# Register the embedding model and the vector store with LOTUS.
lotus.settings.configure(rm=embedder, vs=store)

column = "Course Name"
course_names = [
    "Probability and Random Processes",
    "Optimization Methods in Engineering",
    "Digital Design and Integrated Circuits",
    "Computer Security",
    "Introduction to Computer Science",
    "Introduction to Data Science",
    "Introduction to Machine Learning",
    "Introduction to Artificial Intelligence",
    "Introduction to Robotics",
    "Introduction to Computer Vision",
    "Introduction to Natural Language Processing",
    "Introduction to Reinforcement Learning",
    "Introduction to Deep Learning",
    "Introduction to Computer Networks",
]
courses = pd.DataFrame({column: course_names})

# Index the column into the "index_dir" collection, then fetch the 8 closest rows.
indexed = courses.sem_index(column, "index_dir")
matches = indexed.sem_search(
    column,
    "Which course name is most related to machine learning?",
    K=8,
)
print(matches)
3 changes: 2 additions & 1 deletion lotus/vector_store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from lotus.vector_store.vs import VS
from lotus.vector_store.faiss_vs import FaissVS
from lotus.vector_store.weaviate_vs import WeaviateVS
from lotus.vector_store.qdrant_vs import QdrantVS

__all__ = ["VS", "FaissVS", "WeaviateVS"]
__all__ = ["VS", "FaissVS", "WeaviateVS", "QdrantVS"]
169 changes: 169 additions & 0 deletions lotus/vector_store/qdrant_vs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
from typing import Any

import numpy as np
from numpy.typing import NDArray

from lotus.types import RMOutput
from lotus.vector_store.vs import VS

try:
from qdrant_client import QdrantClient
from qdrant_client.http import models
except ImportError:
QdrantClient = None


class QdrantVS(VS):
    """Vector store backed by a Qdrant collection.

    Every index lives in one Qdrant collection named after ``index_dir``.
    Points use the document's position in ``docs`` as both the point id and
    the ``doc_id`` payload field, and the collection is created with cosine
    distance.
    """

    def __init__(self, client: "QdrantClient", max_batch_size: int = 128):
        """Wrap an existing Qdrant client.

        Args:
            client: Client used for every collection operation.
            max_batch_size: Maximum number of points sent per ``upsert`` call.

        Raises:
            ImportError: If ``qdrant_client`` could not be imported.
            ValueError: If ``max_batch_size`` is not positive.
        """
        if QdrantClient is None:
            raise ImportError("Please install the qdrant client using `pip install lotus[qdrant]`")
        # A zero step makes range() fail later and a negative step silently
        # uploads nothing, so reject both up front.
        if max_batch_size <= 0:
            raise ValueError(f"max_batch_size must be positive, got {max_batch_size}")

        super().__init__()
        self.client = client
        self.max_batch_size = max_batch_size

        # Name of the active collection and its vector size; both stay None
        # until index() or load_index() succeeds.
        self.index_dir: str | None = None
        self.embedding_dim: int | None = None

    def _collection_exists(self, collection_name: str) -> bool:
        """Return True when the server lists a collection called ``collection_name``."""
        collections = self.client.get_collections().collections
        return any(collection.name == collection_name for collection in collections)

    def index(self, docs: list[str], embeddings: NDArray[np.float64], index_dir: str, **kwargs: dict[str, Any]):
        """Create (or recreate) a collection and upload the documents with their embeddings.

        Args:
            docs: Document texts, stored in each point's ``content`` payload field.
            embeddings: One embedding per document, in the same order as ``docs``.
            index_dir: Name of the Qdrant collection to (re)create.
            **kwargs: Accepted for interface compatibility; unused here.

        Raises:
            ValueError: If ``docs`` and ``embeddings`` have different lengths.
        """
        # zip() would silently drop the unmatched tail, so fail loudly instead.
        if len(docs) != len(embeddings):
            raise ValueError(
                f"docs and embeddings must have the same length, got {len(docs)} and {len(embeddings)}"
            )

        self.index_dir = index_dir
        self.embedding_dim = np.reshape(embeddings, (len(embeddings), -1)).shape[1]

        # Drop a stale collection of the same name so the index is rebuilt from
        # scratch. Only the "does not exist" case is skipped; connection or
        # server errors propagate instead of being swallowed.
        if self._collection_exists(index_dir):
            self.client.delete_collection(collection_name=index_dir)

        self.client.create_collection(
            collection_name=index_dir,
            vectors_config=models.VectorParams(
                size=self.embedding_dim,
                distance=models.Distance.COSINE,
            ),
        )

        # The position in ``docs`` doubles as point id and ``doc_id`` payload.
        points = [
            models.PointStruct(
                id=idx,
                vector=embedding.tolist(),
                payload={"content": doc, "doc_id": idx},
            )
            for idx, (doc, embedding) in enumerate(zip(docs, embeddings))
        ]

        # Upload in batches of at most ``max_batch_size`` points; wait=True is
        # passed so each upsert call is awaited before the next batch is sent.
        for start in range(0, len(points), self.max_batch_size):
            self.client.upsert(
                collection_name=index_dir,
                points=points[start : start + self.max_batch_size],
                wait=True,
            )

    def load_index(self, index_dir: str):
        """Select an existing collection as the active index.

        Args:
            index_dir: Name of the Qdrant collection to use.

        Raises:
            ValueError: If no collection named ``index_dir`` exists.
        """
        if not self._collection_exists(index_dir):
            raise ValueError(f"Collection {index_dir} not found")

        # ``vectors`` is either a single params object or a dict of named
        # vector params; for a dict the first entry's size is used.
        collection_info = self.client.get_collection(collection_name=index_dir)
        vectors = collection_info.config.params.vectors
        if isinstance(vectors, dict):
            embedding_dim = next(iter(vectors.values())).size
        else:
            embedding_dim = vectors.size

        # Commit state only after every lookup succeeded, so a failed load
        # never leaves the store pointing at a missing collection.
        self.index_dir = index_dir
        self.embedding_dim = embedding_dim

    def __call__(
        self, query_vectors: NDArray[np.float64], K: int, ids: list[int] | None = None, **kwargs: dict[str, Any]
    ) -> RMOutput:
        """Search the active collection with pre-computed query vectors.

        Args:
            query_vectors: One query embedding per row.
            K: Number of results requested per query.
            ids: Optional ``doc_id`` values to restrict the search to.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            An ``RMOutput`` whose ``distances`` and ``indices`` hold one list
            of length ``K`` per query. Missing results are padded with index
            ``-1`` and score ``0.0``.

        Raises:
            ValueError: If no collection has been indexed or loaded yet.
        """
        if self.index_dir is None:
            raise ValueError("No collection loaded. Call load_index first.")

        # The filter does not depend on the query, so build it once.
        id_filter = None
        if ids is not None:
            id_filter = models.Filter(
                must=[
                    models.FieldCondition(
                        key="doc_id",
                        match=models.MatchAny(any=ids),
                    )
                ]
            )

        all_distances = []
        all_indices = []
        for query_vector in query_vectors:
            hits = self.client.search(
                collection_name=self.index_dir,
                query_vector=query_vector.tolist(),
                limit=K,
                query_filter=id_filter,
                with_payload=True,
            )

            indices = [hit.payload.get("doc_id", -1) for hit in hits]
            # The score is used as-is (Qdrant returns a similarity score already).
            distances = [hit.score if hit.score is not None else 0.0 for hit in hits]

            # Pad to exactly K entries so the per-query rows stay rectangular.
            missing = K - len(indices)
            if missing > 0:
                indices.extend([-1] * missing)
                distances.extend([0.0] * missing)

            all_distances.append(distances)
            all_indices.append(indices)

        return RMOutput(
            distances=np.array(all_distances, dtype=np.float32).tolist(),  # type: ignore
            indices=np.array(all_indices, dtype=np.int64).tolist(),  # type: ignore
        )

    def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
        """Fetch stored vectors for the given point ids.

        Args:
            index_dir: Name of the collection to read from; loaded first if it
                is not the active one.
            ids: Point ids to fetch.

        Returns:
            Array of shape ``(len(ids), embedding_dim)`` whose rows follow the
            order of ``ids``. Rows for ids the server did not return stay zero.

        Raises:
            ValueError: If the collection does not exist or its vector size is
                unknown.
        """
        if self.index_dir != index_dir:
            self.load_index(index_dir)

        # Explicit check rather than ``assert`` so it survives ``python -O``.
        if self.embedding_dim is None:
            raise ValueError(f"Embedding dimension for collection {index_dir} is unknown")

        points = self.client.retrieve(
            collection_name=index_dir,
            ids=ids,
            with_vectors=True,
        )

        # Place each returned point on the row of its id in ``ids``; nothing
        # here relies on the order in which the server returns points.
        vectors = np.zeros((len(ids), self.embedding_dim), dtype=np.float64)
        row_of = {point_id: row for row, point_id in enumerate(ids)}
        for point in points:
            if point.id in row_of:
                vectors[row_of[point.id]] = np.array(point.vector, dtype=np.float64)

        return vectors
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ file_extractor = [
weaviate = [
"weaviate-client",
]
qdrant = [
"qdrant-client",
]
data_connectors = [
"sqlalchemy",
"boto3",
Expand Down
Loading