-
Notifications
You must be signed in to change notification settings - Fork 140
Fix duplicate index creation in sem_join #218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import hashlib | ||
| from typing import Any | ||
|
|
||
| import pandas as pd | ||
|
|
@@ -9,12 +10,13 @@ | |
| @pd.api.extensions.register_dataframe_accessor("sem_index") | ||
| class SemIndexDataframe: | ||
| """ | ||
| Create a vecgtor similarity index over a column in the DataFrame. Indexing is required for columns used in sem_search, sem_cluster_by, and sem_sim_join. | ||
| Create a vector similarity index over a column in the DataFrame. Indexing is required for columns used in sem_search, sem_cluster_by, and sem_sim_join. | ||
| When using retrieval-based cascades for sem_filter and sem_join, indexing is required for the columns used in the semantic operation. | ||
|
|
||
| Args: | ||
| col_name (str): The column name to index. | ||
| index_dir (str): The directory to save the index. | ||
| index_dir (str): The directory to save the index. Required to prevent column name collisions. | ||
| override (bool): If True, recreate index even if it exists and data is consistent. Defaults to False. | ||
|
|
||
| Returns: | ||
| pd.DataFrame: The DataFrame with the index directory saved. | ||
|
|
@@ -46,6 +48,9 @@ class SemIndexDataframe: | |
| title | ||
| 0 Machine learning tutorial | ||
| 1 Data science guide | ||
|
|
||
| # Example 3: force recreation of index with override=True | ||
| >>> df.sem_index('title', 'title_index', override=True) ## recreates index even if it exists | ||
| """ | ||
|
|
||
| def __init__(self, pandas_obj: Any) -> None: | ||
|
|
@@ -59,7 +64,18 @@ def _validate(obj: Any) -> None: | |
| raise AttributeError("Must be a DataFrame") | ||
|
|
||
| @operator_cache | ||
| def __call__(self, col_name: str, index_dir: str) -> pd.DataFrame: | ||
| def __call__(self, col_name: str, index_dir: str, override: bool = False) -> pd.DataFrame: | ||
| """ | ||
| Create or load a semantic index for the specified column. | ||
|
|
||
| Args: | ||
| col_name: Name of the column to index | ||
| index_dir: Directory where the index should be stored/loaded from | ||
| override: If True, recreate index even if it exists and data is consistent | ||
|
|
||
| Returns: | ||
| DataFrame with index directory stored in attrs | ||
| """ | ||
| lotus.logger.warning( | ||
| "Do not reset the dataframe index to ensure proper functionality of get_vectors_from_index" | ||
| ) | ||
|
|
@@ -71,7 +87,46 @@ def __call__(self, col_name: str, index_dir: str) -> pd.DataFrame: | |
| "The retrieval model must be an instance of RM, and the vector store must be an instance of VS. Please configure a valid retrieval model using lotus.settings.configure()" | ||
| ) | ||
|
|
||
| embeddings = rm(self._obj[col_name].tolist()) | ||
| vs.index(self._obj[col_name], embeddings, index_dir) | ||
| # Get data from column | ||
| data = self._obj[col_name].tolist() | ||
| model_name = getattr(rm, "model", None) | ||
|
|
||
| # Check if index exists and data is consistent. | ||
| index_exists = vs.index_exists(index_dir) | ||
| data_consistent = False | ||
|
|
||
| if index_exists: | ||
| data_consistent = vs.is_data_consistent(index_dir, data, model_name) | ||
|
|
||
| # Determine if we need to create a new index. | ||
| should_create_index = not index_exists or not data_consistent or override | ||
|
|
||
| if should_create_index: | ||
| # Index does not exist, data is inconsistent, or override requested. Creating new index. | ||
| if index_exists and not data_consistent and not override: | ||
| raise ValueError( | ||
| f"Index exists at {index_dir} but data is inconsistent. " | ||
| f"Set override=True to recreate the index or use a different index_dir." | ||
| ) | ||
|
|
||
| # Create data hash for consistency checking. | ||
| content = str(sorted(data)) | ||
| if model_name: | ||
| content += f"__{model_name}" | ||
| data_hash = hashlib.sha256(content.encode()).hexdigest()[:32] | ||
|
|
||
| # Create new index. | ||
| embeddings = rm(data) | ||
| vs.index(self._obj[col_name], embeddings, index_dir) | ||
|
|
||
| # Store metadata for data consistency checking (FAISS only). | ||
| if hasattr(vs, "_store_metadata"): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should be called directly in |
||
| vs._store_metadata(index_dir, data_hash) | ||
| lotus.logger.info(f"Created new index at {index_dir}") | ||
| else: | ||
| # Load existing index. | ||
| vs.load_index(index_dir) | ||
| lotus.logger.info(f"Loaded existing index from {index_dir}") | ||
|
|
||
| self._obj.attrs["index_dirs"][col_name] = index_dir | ||
| return self._obj | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,7 @@ | ||
| import hashlib | ||
| import json | ||
| from abc import ABC, abstractmethod | ||
| from pathlib import Path | ||
| from typing import Any | ||
|
|
||
| import numpy as np | ||
|
|
@@ -56,3 +59,36 @@ def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.f | |
| Retrieve vectors from a stored index given specific ids. | ||
| """ | ||
| pass | ||
|
|
||
| def index_exists(self, index_dir: str) -> bool: | ||
| """ | ||
| Check if an index exists at the given directory. | ||
| Default implementation checks for common index files. | ||
| Subclasses can override for vector store specific checks. | ||
| """ | ||
| index_path = Path(index_dir) | ||
| return index_path.exists() and (index_path / "index").exists() | ||
|
|
||
| def is_data_consistent(self, index_dir: str, data: list, model_name: str | None = None) -> bool: | ||
| """ | ||
| Check if the cached index is consistent with the current data. | ||
| Default implementation compares data hash stored in metadata. | ||
| Subclasses can override for more sophisticated consistency checks. | ||
| """ | ||
| metadata_path = Path(index_dir) / "metadata.json" | ||
| if not metadata_path.exists(): | ||
| return False | ||
|
|
||
| try: | ||
| with open(metadata_path, "r") as f: | ||
| metadata = json.load(f) | ||
|
|
||
| # Create hash of current data | ||
| content = str(sorted(data)) | ||
| if model_name: | ||
| content += f"__{model_name}" | ||
| current_hash = hashlib.sha256(content.encode()).hexdigest()[:32] | ||
|
|
||
| return metadata.get("data_hash") == current_hash | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should verify the index configs as well. The data in vector db depends on the original data, model to create embeddings, and the vector store configs. |
||
| except (json.JSONDecodeError, KeyError): | ||
| return False | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hash creation is better as a function in the vector store class so that its not repeated everywhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we can just pass the data in the index function and handle hash creation inside the class itself.
Creating a common function like
hash_datathat is used in both store_metadata and is_data_consistent would simplify things