diff --git a/huf/ai/knowledge/backends/__init__.py b/huf/ai/knowledge/backends/__init__.py index fa78252b..6c67154d 100644 --- a/huf/ai/knowledge/backends/__init__.py +++ b/huf/ai/knowledge/backends/__init__.py @@ -2,8 +2,7 @@ Knowledge Backend Abstraction This module provides a unified interface for knowledge storage backends. -Phase 1: SQLite FTS only -Future: Chroma, pgvector, managed vector DBs +Supported: SQLite FTS (keyword search), SQLite Vec (vector search) """ from abc import ABC, abstractmethod @@ -65,7 +64,6 @@ def get_backend(backend_type: str) -> type: """Get backend class by type.""" backends = { "sqlite_fts": "huf.ai.knowledge.backends.sqlite_fts.SQLiteFTSBackend", - "zvec": "huf.ai.knowledge.backends.zvec_backend.ZvecBackend", "sqlite_vec": "huf.ai.knowledge.backends.sqlite_vec_backend.SQLiteVecBackend", } diff --git a/huf/ai/knowledge/backends/zvec_backend.py b/huf/ai/knowledge/backends/zvec_backend.py deleted file mode 100644 index ea7ebc28..00000000 --- a/huf/ai/knowledge/backends/zvec_backend.py +++ /dev/null @@ -1,300 +0,0 @@ -""" -Zvec Vector Database Backend for Knowledge System. - -Provides semantic (vector similarity) search using Zvec, an in-process -vector database built on Alibaba's Proxima engine. Stores embeddings -alongside text and metadata as a portable .zvec collection file. -""" - -import os -import uuid -from typing import List, Dict, Any, Optional - -import frappe -from frappe.utils import get_files_path - -from . import KnowledgeBackend, ChunkResult - - -class ZvecBackend(KnowledgeBackend): - """Zvec vector database backend for semantic search.""" - - # Default vector field name in the zvec collection - DEFAULT_VECTOR_FIELD = "embedding" - - # Scalar fields stored alongside each chunk - SCALAR_FIELDS = [ - ("text", str), - ("source_title", str), - ("input_id", str), - ("input_type", str), - ("chunk_index", int), - ("metadata_json", str), - ] - - def __init__(self): - self.collection = None - self.knowledge_source = None - self.db_path = None - self.vector_field = self.DEFAULT_VECTOR_FIELD - self.dimension = None - self._config = {} - - def initialize(self, knowledge_source: str, config: Dict[str, Any]) -> None: - """Initialize Zvec collection for knowledge source.""" - import zvec - - self.knowledge_source = knowledge_source - self._config = config - self.dimension = config.get("vector_dimension") or 1536 - self.vector_field = self.DEFAULT_VECTOR_FIELD - - # Determine database path - files_path = get_files_path(is_private=True) - knowledge_dir = os.path.join(files_path, "knowledge") - os.makedirs(knowledge_dir, exist_ok=True) - - safe_name = frappe.scrub(knowledge_source) - self.db_path = os.path.join(knowledge_dir, f"{safe_name}.zvec") - - # Build schema: scalar fields + one dense vector field - field_schemas = [ - zvec.FieldSchema(name, zvec.DataType.STRING if dtype is str else zvec.DataType.INT64) - for name, dtype in self.SCALAR_FIELDS - ] - - vector_schema = zvec.VectorSchema( - self.vector_field, - zvec.DataType.VECTOR_FP32, - self.dimension, - ) - - schema = zvec.CollectionSchema( - name=knowledge_source, - fields=field_schemas, - vectors=vector_schema, - ) - - # Create or open collection - try: - self.collection = zvec.create_and_open(path=self.db_path, schema=schema) - except (RuntimeError, Exception): - # Collection already exists — open it - self.collection = zvec.open(self.db_path) - - def add_chunks(self, chunks: List[Dict[str, Any]]) -> int: - """ - Add chunks to the Zvec collection. - - Generates embeddings for each chunk text and stores them alongside - the original text and metadata as zvec documents. - """ - import zvec - import json - - if not chunks: - return 0 - - # Get embedding configuration - embedding_model = self._config.get("embedding_model") - if not embedding_model: - frappe.throw("Embedding model is required for zvec backend") - - # Gather texts for batch embedding - texts = [chunk["text"] for chunk in chunks] - - # Generate embeddings in batch - from huf.ai.knowledge.embedding import get_embeddings, resolve_embedding_config - - embed_config = resolve_embedding_config(self.knowledge_source) - vectors = get_embeddings( - texts=texts, - model=embed_config["model"], - api_key=embed_config.get("api_key"), - api_base=embed_config.get("api_base"), - ) - - # Build zvec documents - docs = [] - for chunk, vector in zip(chunks, vectors): - chunk_id = chunk.get("chunk_id") or str(uuid.uuid4()) - metadata = chunk.get("metadata", {}) - - doc = zvec.Doc( - id=chunk_id, - vectors={self.vector_field: vector}, - fields={ - "text": chunk["text"], - "source_title": chunk.get("source_title") or "", - "input_id": chunk["input_id"], - "input_type": chunk.get("input_type", ""), - "chunk_index": chunk["chunk_index"], - "metadata_json": json.dumps(metadata) if metadata else "{}", - }, - ) - docs.append(doc) - - # Insert documents (use upsert, to handle re-indexing gracefully) - result = self.collection.upsert(docs) - - # Count successes - if isinstance(result, list): - return sum(1 for status in result if status.ok()) - # Single doc result - return 1 if result.ok() else 0 - - def delete_chunks(self, input_id: str) -> int: - """Delete all chunks for a given input_id.""" - try: - self.collection.delete_by_filter(filter=f"input_id == '{input_id}'") - # delete_by_filter doesn't return count in all zvec versions, - # so we can't reliably report count - return 0 - except Exception as e: - frappe.log_error( - f"Zvec delete_chunks error for input_id={input_id}", - str(e), - ) - return 0 - - def search( - self, - query: str, - top_k: int = 5, - filters: Optional[Dict[str, Any]] = None, - ) -> List[ChunkResult]: - """ - Search for relevant chunks using vector similarity. - - Embeds the query text, then performs approximate nearest-neighbor - search against the stored chunk embeddings. - """ - import zvec - import json - - if not query or not query.strip(): - return [] - - # Embed the query - from huf.ai.knowledge.embedding import get_embedding, resolve_embedding_config - - embed_config = resolve_embedding_config(self.knowledge_source) - query_vector = get_embedding( - text=query, - model=embed_config["model"], - api_key=embed_config.get("api_key"), - api_base=embed_config.get("api_base"), - ) - - # Build vector query - vector_query = zvec.VectorQuery( - field_name=self.vector_field, - vector=query_vector, - ) - - # Build optional filter expression - filter_expr = None - if filters: - clauses = [] - for key, value in filters.items(): - if isinstance(value, str): - clauses.append(f"{key} == '{value}'") - else: - clauses.append(f"{key} == {value}") - filter_expr = " AND ".join(clauses) - - # Execute query - query_kwargs = { - "vectors": vector_query, - "topk": top_k, - "output_fields": ["text", "source_title", "input_id", "chunk_index", "metadata_json"], - } - if filter_expr: - query_kwargs["filter"] = filter_expr - - results = self.collection.query(**query_kwargs) - - # Convert to ChunkResult objects - chunk_results = [] - for doc in results: - metadata = {} - metadata_json = doc.fields.get("metadata_json", "{}") - if metadata_json: - try: - metadata = json.loads(metadata_json) - except (json.JSONDecodeError, TypeError): - pass - - chunk_index = doc.fields.get("chunk_index") - if chunk_index is not None: - metadata["chunk_index"] = chunk_index - - chunk_results.append( - ChunkResult( - chunk_id=doc.id, - text=doc.fields.get("text", ""), - title=doc.fields.get("source_title"), - score=doc.score if hasattr(doc, "score") else 0.0, - source=doc.fields.get("input_id"), - metadata=metadata, - ) - ) - - return chunk_results - - def clear(self) -> None: - """Clear all documents from the collection.""" - import zvec - - if self.collection is None: - return - - # Close current collection, remove files, recreate - try: - self.collection.close() - except Exception: - pass - - # Remove the zvec directory/file and reinitialize - import shutil - - if os.path.exists(self.db_path): - if os.path.isdir(self.db_path): - shutil.rmtree(self.db_path) - else: - os.remove(self.db_path) - - # Reinitialize with same config - self.initialize(self.knowledge_source, self._config) - - def get_stats(self) -> Dict[str, Any]: - """Get collection statistics.""" - stats = { - "chunk_count": 0, - "input_count": 0, - "size_bytes": 0, - } - - if not self.db_path or not os.path.exists(self.db_path): - return stats - - # Calculate size (zvec may store data in a directory) - if os.path.isdir(self.db_path): - total_size = 0 - for dirpath, _dirnames, filenames in os.walk(self.db_path): - for f in filenames: - fp = os.path.join(dirpath, f) - total_size += os.path.getsize(fp) - stats["size_bytes"] = total_size - else: - stats["size_bytes"] = os.path.getsize(self.db_path) - - # Try to get document count from collection stats - if self.collection: - try: - collection_stats = self.collection.stats() - stats["chunk_count"] = getattr(collection_stats, "doc_count", 0) - except Exception: - pass - - return stats diff --git a/huf/ai/knowledge/backends/zvec_llamaindex.py b/huf/ai/knowledge/backends/zvec_llamaindex.py deleted file mode 100644 index 579840f7..00000000 --- a/huf/ai/knowledge/backends/zvec_llamaindex.py +++ /dev/null @@ -1,204 +0,0 @@ -""" -LlamaIndex-compatible VectorStore adapter for Zvec. - -Wraps ZvecBackend as a LlamaIndex BasePydanticVectorStore, enabling -Zvec collections to be used in standard LlamaIndex index/query pipelines. - -This is an optional integration layer — the primary path for Huf's -knowledge system is through the KnowledgeBackend ABC. -""" - -from typing import Any, List, Optional, Dict - -try: - from llama_index.core.vector_stores.types import ( - BasePydanticVectorStore, - VectorStoreQuery, - VectorStoreQueryResult, - ) - from llama_index.core.schema import BaseNode, TextNode - from pydantic import ConfigDict - - LLAMAINDEX_AVAILABLE = True -except ImportError: - LLAMAINDEX_AVAILABLE = False - - -def _check_llamaindex(): - if not LLAMAINDEX_AVAILABLE: - raise ImportError( - "llama-index-core is required for the LlamaIndex VectorStore adapter. " - "Install it with: pip install llama-index-core" - ) - - -if LLAMAINDEX_AVAILABLE: - - class ZvecVectorStore(BasePydanticVectorStore): - """ - LlamaIndex VectorStore backed by Zvec. - - Usage: - vector_store = ZvecVectorStore( - db_path="/path/to/collection.zvec", - dimension=1536, - collection_name="my_knowledge", - ) - index = VectorStoreIndex.from_vector_store(vector_store) - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - db_path: str - dimension: int = 1536 - collection_name: str = "knowledge" - vector_field: str = "embedding" - - _collection: Any = None - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._init_collection() - - def _init_collection(self): - """Initialize or open the zvec collection.""" - import zvec - import os - - field_schemas = [ - zvec.FieldSchema("text", zvec.DataType.STRING), - zvec.FieldSchema("ref_doc_id", zvec.DataType.STRING), - zvec.FieldSchema("metadata_json", zvec.DataType.STRING), - ] - - vector_schema = zvec.VectorSchema( - self.vector_field, - zvec.DataType.VECTOR_FP32, - self.dimension, - ) - - schema = zvec.CollectionSchema( - name=self.collection_name, - fields=field_schemas, - vectors=vector_schema, - ) - - try: - self._collection = zvec.create_and_open(path=self.db_path, schema=schema) - except (RuntimeError, Exception): - self._collection = zvec.open(self.db_path) - - @property - def client(self) -> Any: - """Return the underlying zvec collection.""" - return self._collection - - def add(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]: - """ - Add nodes to the vector store. - - Each node must have an embedding already set. - """ - import zvec - import json - - docs = [] - ids = [] - - for node in nodes: - node_id = node.node_id - embedding = node.get_embedding() - text = node.get_content() - metadata = node.metadata or {} - ref_doc_id = node.ref_doc_id or "" - - doc = zvec.Doc( - id=node_id, - vectors={self.vector_field: embedding}, - fields={ - "text": text, - "ref_doc_id": ref_doc_id, - "metadata_json": json.dumps(metadata), - }, - ) - docs.append(doc) - ids.append(node_id) - - if docs: - self._collection.upsert(docs) - - return ids - - def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: - """Delete all nodes associated with a reference document ID.""" - self._collection.delete_by_filter( - filter=f"ref_doc_id == '{ref_doc_id}'" - ) - - def query( - self, - query: VectorStoreQuery, - **kwargs: Any, - ) -> VectorStoreQueryResult: - """ - Query the vector store with a VectorStoreQuery. - - Uses the query embedding to perform similarity search. - """ - import zvec - import json - - if query.query_embedding is None: - return VectorStoreQueryResult(nodes=[], similarities=[], ids=[]) - - vector_query = zvec.VectorQuery( - field_name=self.vector_field, - vector=query.query_embedding, - ) - - topk = query.similarity_top_k or 5 - - results = self._collection.query( - vectors=vector_query, - topk=topk, - output_fields=["text", "ref_doc_id", "metadata_json"], - ) - - nodes = [] - similarities = [] - ids = [] - - for doc in results: - metadata = {} - metadata_json = doc.fields.get("metadata_json", "{}") - if metadata_json: - try: - metadata = json.loads(metadata_json) - except (json.JSONDecodeError, TypeError): - pass - - node = TextNode( - id_=doc.id, - text=doc.fields.get("text", ""), - metadata=metadata, - ) - nodes.append(node) - similarities.append(doc.score if hasattr(doc, "score") else 0.0) - ids.append(doc.id) - - return VectorStoreQueryResult( - nodes=nodes, - similarities=similarities, - ids=ids, - ) - - def get_nodes( - self, - node_ids: Optional[List[str]] = None, - filters: Optional[Any] = None, - ) -> List[BaseNode]: - """Retrieve nodes by ID. Limited implementation.""" - # Zvec doesn't support direct ID-based fetching easily, - # so we return empty for now. Full implementation would need - # per-ID vector queries or a separate lookup table. - return [] diff --git a/huf/ai/knowledge/indexer.py b/huf/ai/knowledge/indexer.py index 43f4640e..157d06fe 100644 --- a/huf/ai/knowledge/indexer.py +++ b/huf/ai/knowledge/indexer.py @@ -14,15 +14,14 @@ def _build_backend_config(source) -> dict: """Build configuration dict for backend initialization. Includes chunking settings for all backends and adds embedding - configuration for vector backends (e.g. zvec). + configuration for the sqlite_vec vector backend. """ config = { "chunk_size": source.chunk_size, "chunk_overlap": source.chunk_overlap, } - # Vector backends need embedding configuration - if source.knowledge_type in {"zvec", "sqlite_vec"}: + if source.knowledge_type == "sqlite_vec": config["embedding_model"] = source.embedding_model config["vector_dimension"] = source.vector_dimension config["embedding_provider"] = getattr(source, "embedding_provider", None) diff --git a/huf/huf/doctype/knowledge_source/knowledge_source.js b/huf/huf/doctype/knowledge_source/knowledge_source.js index 45641863..e000ac11 100644 --- a/huf/huf/doctype/knowledge_source/knowledge_source.js +++ b/huf/huf/doctype/knowledge_source/knowledge_source.js @@ -52,7 +52,7 @@ frappe.ui.form.on("Knowledge Source", { knowledge_type(frm) { // Toggle visibility of vector-specific settings - const uses_vectors = ["zvec", "sqlite_vec"].includes(frm.doc.knowledge_type); + const uses_vectors = frm.doc.knowledge_type === "sqlite_vec"; frm.toggle_reqd("embedding_model", uses_vectors); frm.toggle_reqd("vector_dimension", uses_vectors); } diff --git a/huf/huf/doctype/knowledge_source/knowledge_source.json b/huf/huf/doctype/knowledge_source/knowledge_source.json index 3aae5692..e088c2dd 100644 --- a/huf/huf/doctype/knowledge_source/knowledge_source.json +++ b/huf/huf/doctype/knowledge_source/knowledge_source.json @@ -169,22 +169,22 @@ "fieldname": "knowledge_type", "fieldtype": "Select", "label": "Knowledge Type", - "options": "sqlite_fts\nzvec\nsqlite_vec", + "options": "sqlite_fts\nsqlite_vec", "reqd": 1 }, { "fieldname": "vector_settings_section", "fieldtype": "Section Break", "label": "Vector Settings", - "depends_on": "eval:['zvec','sqlite_vec'].includes(doc.knowledge_type)" + "depends_on": "eval:doc.knowledge_type === 'sqlite_vec'" }, { "fieldname": "embedding_model", "fieldtype": "Data", "label": "Embedding Model", "description": "LiteLLM model identifier, e.g. openai/text-embedding-3-small", - "depends_on": "eval:['zvec','sqlite_vec'].includes(doc.knowledge_type)", - "mandatory_depends_on": "eval:['zvec','sqlite_vec'].includes(doc.knowledge_type)" + "depends_on": "eval:doc.knowledge_type === 'sqlite_vec'", + "mandatory_depends_on": "eval:doc.knowledge_type === 'sqlite_vec'" }, { "fieldname": "vector_dimension", @@ -193,8 +193,8 @@ "default": 1536, "non_negative": 1, "description": "Must match the embedding model output dimensionality", - "depends_on": "eval:['zvec','sqlite_vec'].includes(doc.knowledge_type)", - "mandatory_depends_on": "eval:['zvec','sqlite_vec'].includes(doc.knowledge_type)" + "depends_on": "eval:doc.knowledge_type === 'sqlite_vec'", + "mandatory_depends_on": "eval:doc.knowledge_type === 'sqlite_vec'" }, { "fieldname": "column_break_vector", @@ -206,7 +206,7 @@ "label": "Embedding Provider", "options": "AI Provider", "description": "AI Provider for API key resolution", - "depends_on": "eval:['zvec','sqlite_vec'].includes(doc.knowledge_type)" + "depends_on": "eval:doc.knowledge_type === 'sqlite_vec'" } ], "index_web_pages_for_search": 1, diff --git a/huf/huf/doctype/knowledge_source/knowledge_source.py b/huf/huf/doctype/knowledge_source/knowledge_source.py index ae99c84f..8bcf2563 100644 --- a/huf/huf/doctype/knowledge_source/knowledge_source.py +++ b/huf/huf/doctype/knowledge_source/knowledge_source.py @@ -18,7 +18,7 @@ def validate_chunk_settings(self): frappe.throw(_("Chunk overlap must be less than chunk size")) def validate_vector_settings(self): - if self.knowledge_type in {"zvec", "sqlite_vec"}: + if self.knowledge_type == "sqlite_vec": if not self.embedding_model: frappe.throw(_("Embedding Model is required for vector knowledge types")) if not self.vector_dimension or self.vector_dimension <= 0: diff --git a/pyproject.toml b/pyproject.toml index 83cfdaae..7ab692d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "openai-agents", "litellm>=1.0.0", "llama-index-core>=0.10.0", - "tridz-zvec", "sqlite-vec", ]