diff --git a/.gitignore b/.gitignore index 41c7ff58..c3d0228a 100755 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ env-* .coverage htmlcov/ *.html +vdb_benchmark/vdbbench/benchmark/results/* # OS files .DS_Store diff --git a/vdb_benchmark/README.md b/vdb_benchmark/README.md index 880430d9..e1128b04 100644 --- a/vdb_benchmark/README.md +++ b/vdb_benchmark/README.md @@ -25,16 +25,18 @@ pip3 install -e ./ ## Deploying a Standalone Milvus Instance Stand-alone instances are available via Docker containers in the stacks directory. -> stacks -> └── milvus -> ├── cluster -> └── standalone -> ├── minio -> │   ├── .env.example -> │   └── docker-compose.yml -> └── s3 -> ├── .env.example -> └── docker-compose-s3.yml +``` + stacks + └── milvus + ├── cluster + └── standalone + ├── minio + │   ├── .env.example + │   └── docker-compose.yml + └── s3 + ├── .env.example + └── docker-compose-s3.yml +``` For each specific instance, copy the `.env.example` file to `.env` and update the values as needed. ```bash diff --git a/vdb_benchmark/vdbbench/benchmark/.env.example b/vdb_benchmark/vdbbench/benchmark/.env.example new file mode 100644 index 00000000..2e4d658f --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/.env.example @@ -0,0 +1,37 @@ +# VDB Benchmark -- backend connection parameters +# ================================================ +# +# Copy this file to .env and uncomment / edit the values you need. +# The benchmark CLI loads this file automatically (requires python-dotenv). +# +# Naming convention: +# {BACKEND}__{PARAM} +# +# Both parts are UPPER-CASED and separated by a double underscore (__). +# The PARAM name matches the backend's connection_params (see --help). +# +# Precedence (highest wins): +# CLI flags > environment / .env > YAML config > built-in defaults +# +# To verify which source each parameter comes from, run: +# python -m vdbbench.benchmark --backend milvus --config ... --what-if + + +# ── Milvus ──────────────────────────────────────────────────────── +# MILVUS__HOST=127.0.0.1 +# MILVUS__PORT=19530 +# MILVUS__MAX_MESSAGE_LENGTH=514983574 + + +# ── pgvector (PostgreSQL) ───────────────────────────────────────── +# PGVECTOR__HOST=127.0.0.1 +# PGVECTOR__PORT=5432 +# PGVECTOR__DBNAME=postgres +# PGVECTOR__USER=postgres +# PGVECTOR__PASSWORD= + + +# ── Elasticsearch ───────────────────────────────────────────────── +# ELASTICSEARCH__HOST=http://localhost:9200 +# ELASTICSEARCH__API_KEY= +# ELASTICSEARCH__CLOUD_ID= diff --git a/vdb_benchmark/vdbbench/benchmark/README.md b/vdb_benchmark/vdbbench/benchmark/README.md new file mode 100644 index 00000000..e355feb9 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/README.md @@ -0,0 +1,557 @@ +# VDB Benchmark Framework + +A modular, backend-agnostic benchmarking framework for vector databases. It +generates synthetic vectors, ingests them into a pluggable database backend, +computes brute-force ground truth, and runs ANN search benchmarks that report +QPS, recall, and latency percentiles. 
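+
+For orientation, the core programmatic flow condenses to a few lines. This is
+a minimal sketch: connection values are placeholders, and the Programmatic
+Usage section at the end of this document shows the full set of options,
+including index and search parameters.
+
+```python
+from vdbbench.benchmark import BenchmarkConfig, BenchmarkOrchestrator, get_backend
+
+backend = get_backend("milvus")                  # any registered backend name
+backend.connect(host="127.0.0.1", port="19530")  # placeholder connection values
+
+cfg = BenchmarkConfig(
+    mode="both",                  # load vectors, then run the search benchmark
+    collection_name="quickstart",
+    num_vectors=100_000,
+    dimension=768,
+    index_type="HNSW",
+    metric_type="COSINE",
+    search_k=10,
+    force=True,
+)
+
+summary = BenchmarkOrchestrator(config=cfg, backend=backend).run()
+backend.disconnect()
+
+print(summary["search_qps"], summary["search_recall_at_k"])
+```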
+ +## Supported Backends + +| Backend | `--backend` | Supported Indexes | Supported Metrics | Required Packages | +|---------|-------------|-------------------|-------------------|-------------------| +| Milvus | `milvus` | HNSW, DISKANN, AISAQ, FLAT | COSINE, L2, IP | `pymilvus` | +| pgvector (PostgreSQL) | `pgvector` | HNSW, IVFFLAT, FLAT | COSINE, L2, IP | `psycopg2-binary`, `pgvector` | +| Elasticsearch | `elasticsearch` | HNSW, FLAT | COSINE, L2, IP | `elasticsearch` | + +All backends implement the same abstract interface (`VectorDBBackend`), so +the benchmark orchestrator, data generation, ground-truth computation, and +search pipeline are completely database-agnostic. + +## Directory Layout + +``` +benchmark/ +├── __init__.py # Public API exports +├── __main__.py # python -m vdbbench.benchmark entry point +├── run_benchmark.py # CLI: argument parsing, config resolution +├── orchestrator.py # BenchmarkOrchestrator + BenchmarkConfig +├── generator.py # VectorGenerator (producer thread) +├── ground_truth.py # GroundTruthBuilder (brute-force exact NN) +├── search_runner.py # SearchRunner (latency / recall measurement) +├── collection_admin.py # CLI: collection admin + interactive manager +├── .env.example # Template for backend connection env vars +├── backends/ # Pluggable database adapters +│ ├── __init__.py # BackendRegistry + auto-discovery +│ ├── base.py # Abstract VectorDBBackend + descriptors +│ ├── _env.py # Environment variable loading +│ ├── _help.py # CLI help formatting +│ ├── elasticsearch/ # Elasticsearch adapter +│ ├── milvus/ # Milvus adapter +│ └── pgvector/ # PostgreSQL + pgvector adapter +└── configs/ # Example YAML configuration files + ├── 1m_diskann.yaml + ├── 1m_hnsw.yaml + ├── elasticsearch_1m_hnsw.yaml + └── pgvector_1m_hnsw.yaml +``` + +## Modular Backend Interface + +### Abstract Base Class + +Every database adapter subclasses `VectorDBBackend` (defined in +`backends/base.py`) and implements the following abstract methods: + +#### Lifecycle + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `connect` | `(**kwargs) -> None` | Open a connection using params from the backend descriptor. | +| `disconnect` | `() -> None` | Close the connection and release resources. | + +#### Collection Management + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `create_collection` | `(name, dimension, metric_type, index_type, index_params, num_shards, force) -> CollectionInfo` | Create a collection and its index. Drops first when `force=True`. | +| `collection_exists` | `(name) -> bool` | Check whether a collection exists. | +| `drop_collection` | `(name) -> None` | Drop a collection if it exists. | + +#### Data Ingestion + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `insert_batch` | `(name, ids, vectors) -> int` | Insert vectors. `ids` is `(n,)` int64, `vectors` is `(n, dim)` float32. | +| `flush` | `(name) -> None` | Commit pending writes to durable storage. | + +#### Search + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `search` | `(name, query_vectors, top_k, search_params) -> List[List[int]]` | ANN or exact search. Returns `top_k` IDs per query, closest-first. | + +#### Status / Info + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `row_count` | `(name) -> int` | Number of vectors in the collection. | +| `get_index_progress` | `(name) -> IndexProgress` | Point-in-time index build snapshot. 
| + +#### Administration / Introspection + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `list_collections` | `() -> List[str]` | All collection names on the server. | +| `get_collection_info` | `(name) -> Dict` | Detailed metadata (rows, dimension, metric, index, schema). | +| `list_indexes` | `(name) -> List[Dict]` | All indexes on a collection. | +| `drop_index` | `(name, index_name=None) -> None` | Drop an index. Default raises `NotImplementedError`. | +| `get_collection_stats` | `(name) -> Dict` | Operational stats. Default returns row count + index progress. | + +#### Concrete Methods (provided by base class) + +| Method | Purpose | +|--------|---------| +| `wait_for_index(name, interval, timeout, compacted)` | Polls `get_index_progress()` with unified progress logging, rates, and ETA. | +| `compact(name)` | Trigger segment compaction. Default is a no-op. | + +### Descriptor System + +Each backend exposes a `BackendDescriptor` that declares its capabilities. +This drives CLI help, argument validation, and execution planning. + +```python +@dataclass +class BackendDescriptor: + name: str # "milvus" -- used in --backend + display_name: str # "Milvus" -- shown in help + description: str # one-paragraph overview + backend_class: Type[VectorDBBackend] + supported_metrics: List[str] # ["COSINE", "L2", "IP"] + supported_indexes: List[IndexDescriptor] + connection_params: List[ParamDescriptor] + active: bool = True # False hides from CLI/registry +``` + +Supporting dataclasses: + +```python +@dataclass +class ParamDescriptor: + name: str # e.g. "M", "host" + description: str # shown in --help + type: str = "int" # "int" | "float" | "str" | "bool" + default: Any = None + required: bool = False + +@dataclass +class IndexDescriptor: + name: str # e.g. "HNSW" + description: str + build_params: List[ParamDescriptor] + search_params: List[ParamDescriptor] +``` + +### Auto-Discovery + +Backend packages are discovered automatically when the `backends` package is +imported: + +1. Walk every sub-directory of `backends/` that is a Python package. +2. Import the package and look for a `backend_descriptor` attribute. +3. If callable, call it; otherwise use it directly. +4. If the result is a `BackendDescriptor`, register it in the global `registry`. +5. If import fails (missing dependency), log a warning and skip. + +No manual wiring is needed. Drop a new package into `backends/` and it will be +picked up on the next import. + +### Backend Registry + +The `registry` singleton (`backends/__init__.py`) provides: + +| Method | Returns | Description | +|--------|---------|-------------| +| `registry.names()` | `List[str]` | Active backend names, sorted. | +| `registry.list_backends()` | `List[BackendDescriptor]` | Active descriptors, sorted. | +| `registry.get(name)` | `BackendDescriptor` or `None` | Look up by name. | +| `registry.create_backend(name)` | `VectorDBBackend` | Instantiate (disconnected). | +| `get_backend(name)` | `VectorDBBackend` | Module-level shortcut. | + +## Environment Variable Configuration + +Connection parameters can be set via environment variables or a `.env` file +using the naming convention: + +``` +{BACKEND}__{PARAM} +``` + +Both parts are upper-cased, separated by a double underscore. 
Examples: + +```bash +MILVUS__HOST=10.0.0.5 +MILVUS__PORT=19530 +PGVECTOR__PASSWORD=s3cret +ELASTICSEARCH__API_KEY=abc123 +``` + +Precedence (highest wins): + +``` +CLI flags > environment variables / .env > YAML config > built-in defaults +``` + +See `.env.example` for a full template. + +## Collection Admin CLI + +`collection_admin.py` provides both non-interactive commands and an interactive +menu-driven mode for managing collections across any registered backend. + +### Non-Interactive Commands + +Require `--backend` to specify which database to operate on: + +```bash +# List all collections +collection-admin --backend milvus list + +# Detailed collection metadata +collection-admin --backend milvus info my_collection + +# List indexes +collection-admin --backend pgvector indexes my_collection + +# Collection statistics +collection-admin --backend elasticsearch stats my_collection + +# Drop a collection (requires --yes) +collection-admin --backend milvus drop my_collection --yes + +# Drop an index +collection-admin --backend pgvector drop-index my_collection --yes + +# JSON output +collection-admin --backend milvus --json list +collection-admin --backend milvus --json info my_collection + +# Override connection parameters +collection-admin --backend milvus --param host=10.0.0.5 --param port=19530 list +``` + +### Interactive Mode + +Discovers all active backends, health-checks each one, and presents +menu-driven navigation: + +```bash +# Enter interactive mode (either form works) +collection-admin interactive +collection-admin # defaults to interactive when no command given +``` + +Interactive mode flow: + +1. **Backend discovery** -- probes every active backend from the registry. + For each, loads connection params from `.env` / environment variables, + falls back to descriptor defaults, and attempts a `connect()` / + `disconnect()` health-check ping. + +2. **Backend picker** -- displays a table of all backends with health status: + ``` + | Idx | Backend | Configured | Status | Details | + |-----|----------------------|------------|-------------|-----------------------| + | 0 | Milvus | Yes | Healthy | host=10.0.0.5, port=… | + | 1 | pgvector (PostgreSQL) | defaults | Unreachable | connection refused | + | 2 | Elasticsearch | Yes | Healthy | host=http://local… | + ``` + Only healthy backends are selectable. Passwords are hidden. + +3. **Collection picker** -- lists collections on the selected backend with + row count, dimension, index type, and metric: + ``` + | Idx | Collection | Rows | Dim | Index | Metric | + |-----|------------|---------|------|---------|--------| + | 0 | bench_1m | 1,000,000 | 1536 | HNSW | COSINE | + | 1 | test_100k | 100,000 | 768 | FLAT | L2 | + ``` + +4. **Operations menu** -- run commands against the selected collection: + - `i` -- info (detailed schema, partitions) + - `s` -- stats (row count, index progress) + - `x` -- indexes (list all indexes) + - `c` -- compact (trigger compaction) + - `di` -- drop-index (with confirmation) + - `d` -- delete/drop collection (with confirmation) + - `b` -- back to collection list + - `q` -- quit + +Navigation: `b` goes back one level (operations -> collections -> backends), +`q` exits at any point. 
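+
+Every interactive operation maps onto a method of the backend interface, so
+the same information is available programmatically. A minimal sketch
+(connection values are placeholders; the exact keys in the returned dicts
+are backend-specific):
+
+```python
+from vdbbench.benchmark import get_backend
+
+backend = get_backend("milvus")
+backend.connect(host="127.0.0.1", port="19530")   # placeholder values
+
+for name in backend.list_collections():          # what the 'list' command shows
+    info = backend.get_collection_info(name)     # rows, dimension, metric, index, schema
+    stats = backend.get_collection_stats(name)   # row count + index progress by default
+    print(name, info, stats)
+
+backend.disconnect()
+```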
+ +## Architecture Overview + +``` + BenchmarkOrchestrator + ┌──────────────────────────────────────────────┐ + │ │ + YAML / CLI ──────────>│ BenchmarkConfig (all tunables) │ + │ │ + │ ┌── LOAD PHASE ──────────────────────────┐ │ + │ │ │ │ + │ │ VectorGenerator (background thread) │ │ + │ │ │ │ │ + │ │ │ queue.Queue[VectorBlock] │ │ + │ │ │ │ │ + │ │ ├──> backend.insert_batch() │ │ + │ │ └──> GroundTruthBuilder.update() │ │ + │ │ │ │ + │ │ backend.flush() │ │ + │ │ backend.compact() (optional) │ │ + │ │ backend.get_index_progress() → wait │ │ + │ │ gt_builder.build() → truth_table │ │ + │ └────────────────────────────────────────┘ │ + │ │ + │ ┌── SEARCH PHASE ────────────────────────┐ │ + │ │ │ │ + │ │ SearchRunner │ │ + │ │ for each round x each batch: │ │ + │ │ backend.search() [timed] │ │ + │ │ compute recall vs truth_table │ │ + │ │ record latency │ │ + │ │ → SearchResult (QPS, recall, P50…) │ │ + │ └────────────────────────────────────────┘ │ + │ │ + │ save(output_dir) → artifacts on disk │ + └──────────────────────────────────────────────┘ +``` + +### Key Components + +| Component | File | Responsibility | +|-----------|------|----------------| +| **BenchmarkConfig** | `orchestrator.py` | Dataclass holding every tunable. Built from YAML + CLI. | +| **BenchmarkOrchestrator** | `orchestrator.py` | Top-level coordinator for load and search phases. | +| **VectorGenerator** | `generator.py` | Background thread producing L2-normalized `VectorBlock` objects. | +| **GroundTruthBuilder** | `ground_truth.py` | Incrementally computes exact nearest neighbors as blocks arrive. | +| **SearchRunner** | `search_runner.py` | Sends queries, measures latency, computes recall against truth table. | +| **VectorDBBackend** | `backends/base.py` | Abstract interface every database adapter implements. | +| **BackendRegistry** | `backends/__init__.py` | Auto-discovers and registers backend packages. | +| **collection_admin** | `collection_admin.py` | CLI for collection management (non-interactive + interactive). | + +## Metrics & Measurement + +### Load Phase Timings + +Every stage of the load phase is timed independently with `time.time()` and +stored in `benchmark_meta.json` under the `timings` key: + +| Metric | What is timed | +|--------|---------------| +| `query_gen_sec` | Generating random query vectors (CPU only). | +| `create_collection_sec` | Creating the collection and its primary index on the server. | +| `pipeline_sec` | The entire insert pipeline -- consuming vector blocks from the generator thread and calling `backend.insert_batch()` for each batch. Ground-truth computation runs in parallel on a background thread and does **not** inflate this number. | +| `flush_sec` | `backend.flush()` -- committing pending writes to durable storage. | +| `compact_sec` | `backend.compact()` -- merging small segments (optional, backend-dependent). | +| `index_build_sec` | Polling `backend.get_index_progress()` until the ANN index is fully built. | +| `truth_build_sec` | Finalising the brute-force ground-truth table. | + +Per-block insert and ground-truth timings are logged during the run but are +not persisted as aggregate statistics. + +### Search Phase Metrics + +Each query batch is timed with `time.perf_counter()` (high-resolution, +monotonic). Recall is computed **after** timing stops so it does not inflate +latency numbers. + +Final metrics (written to `search_results.json`): + +| Metric | Description | +|--------|-------------| +| `qps` | Queries per second -- `total_queries / wall_elapsed`. 
| +| `recall_at_k` | Fraction of true nearest neighbors returned, averaged across all queries. | +| `latency_p50_ms` | 50th-percentile per-query latency (ms). | +| `latency_p90_ms` | 90th-percentile per-query latency (ms). | +| `latency_p99_ms` | 99th-percentile per-query latency (ms). | +| `latency_mean_ms` | Mean per-query latency (ms). | +| `total_queries` | Total number of queries executed across all rounds. | +| `total_wall_sec` | Wall-clock duration of the search phase. | +| `intervals` | Per-interval snapshots (every `log_interval` queries) of all the above, plus `qps_interval` for the most recent window. | + +### What "I/O" Includes + +The benchmark measures **end-to-end I/O latency** including network +round-trips to the database server, not isolated disk I/O: + +| Timing | What is in the measurement | +|--------|----------------------------| +| Insert (`pipeline_sec`) | Network send + server-side WAL writes. | +| Flush (`flush_sec`) | Durable commit to storage. | +| Compact (`compact_sec`) | Server-side segment merges. | +| Index build (`index_build_sec`) | Server-side index construction. | +| Search (`latency_*_ms`) | Network query + server-side ANN search + result transfer. | + +CPU-only work -- vector generation, ground-truth computation, recall +calculation -- is either executed on a separate thread or measured outside +the timing window, so it does not contaminate I/O numbers. + +### Concurrency During Measurement + +The load phase uses a three-way producer-consumer pipeline: + +1. **VectorGenerator** (background thread) -- produces `VectorBlock` objects + into a bounded queue. +2. **Main thread** -- consumes blocks, calls `backend.insert_batch()` (network + I/O that releases the GIL). +3. **GroundTruthBuilder** (background thread via `ThreadPoolExecutor`) -- + computes brute-force nearest neighbors for each block (BLAS matmul, + also releases the GIL). + +The search phase is single-threaded: one query batch at a time, timed +individually. + +## Modes + +| Mode | What it does | Required inputs | +|------|-------------|-----------------| +| **load** (default) | Generate vectors, ingest, build ground truth, save artifacts | `collection_name`, `dimension`, `num_vectors` | +| **search** | Load artifacts from a prior run, benchmark ANN queries | `collection_name`, `artifacts_dir` | +| **both** | Run load then search in a single invocation | Same as load | + +## Configuration + +The benchmark is config-driven. All parameters live in a YAML file. The CLI +provides operational flags (`--config`, `--backend`, `--mode`, `--force`, +`--output-dir`, `--artifacts-dir`) plus introspection (`--what-if`, `--plan`). 
+ +### YAML Structure + +```yaml +backend: milvus +mode: both + +database: + host: 127.0.0.1 + port: 19530 + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + distribution: uniform + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +query: + num_query_vectors: 10_000 + query_seed: 99 + +ground_truth: + truth_k: 100 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + M: 64 + efConstruction: 200 + num_shards: 1 + +search: + search_k: 10 + num_search_rounds: 1 + search_batch_size: 1 + search_params: + ef: 128 + +workflow: + force: false + compact: true + monitor_interval: 5 +``` + +### CLI Examples + +```bash +# Load and search (backend set in YAML) +python -m vdbbench.benchmark --config configs/1m_hnsw.yaml + +# Override mode +python -m vdbbench.benchmark --config configs/1m_hnsw.yaml --mode load + +# Search using artifacts from a prior run +python -m vdbbench.benchmark \ + --config configs/1m_diskann.yaml \ + --mode search \ + --artifacts-dir results/bench_1m_diskann_20250120_143022 + +# Override backend +python -m vdbbench.benchmark \ + --config configs/pgvector_1m_hnsw.yaml --backend pgvector + +# Preview execution plan +python -m vdbbench.benchmark --config configs/1m_hnsw.yaml --plan + +# Dump resolved config (shows env-var sources) +python -m vdbbench.benchmark --config configs/1m_diskann.yaml --what-if +``` + +### CLI Flags + +| Flag | Description | +|------|-------------| +| `--config PATH` | YAML configuration file (required) | +| `--backend NAME` | Override backend from config | +| `--mode {load,search,both}` | Override runtime mode | +| `--force` | Drop existing collection before load | +| `--output-dir PATH` | Directory for output artifacts | +| `--artifacts-dir PATH` | Directory with prior load artifacts (search mode) | +| `--what-if` | Print resolved config and exit | +| `--plan` | Print execution plan and exit | +| `--debug` | Enable DEBUG logging | + +## Output Artifacts + +| File | Content | When | +|------|---------|------| +| `query_vectors.npy` | Query vectors `(nq, dim)` float32 | load / both | +| `ground_truth.npz` | `truth_table` `(nq, truth_k)` int64 | load / both | +| `search_results.json` | QPS, recall, latencies, intervals | search / both | +| `benchmark_meta.json` | Full config + per-phase timing | always | + +## Adding a New Backend + +1. Create `backends/mydb/__init__.py` and `backends/mydb/backend.py`. +2. Subclass `VectorDBBackend` and implement all abstract methods. +3. Write a `backend_descriptor()` function returning a `BackendDescriptor`. +4. That's it -- auto-discovery registers it on the next import. + +See `backends/README.md` for a complete walkthrough with code examples. 
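+
+As a quick preview of that walkthrough, the two files reduce to roughly the
+skeleton below. Method bodies are elided, and `mydb` / `MyDBBackend` are
+placeholder names -- see `backends/README.md` for the full, working example.
+
+```python
+# backends/mydb/backend.py -- skeleton only; every abstract method needs a body
+import numpy as np
+
+from ..base import CollectionInfo, IndexProgress, VectorDBBackend
+
+
+class MyDBBackend(VectorDBBackend):
+    def connect(self, **kwargs) -> None: ...
+    def disconnect(self) -> None: ...
+    def create_collection(self, name, dimension, metric_type="COSINE",
+                          index_type="HNSW", index_params=None,
+                          num_shards=1, force=False) -> CollectionInfo: ...
+    def collection_exists(self, name) -> bool: ...
+    def drop_collection(self, name) -> None: ...
+    def insert_batch(self, name, ids: np.ndarray, vectors: np.ndarray) -> int: ...
+    def flush(self, name) -> None: ...
+    def search(self, name, query_vectors, top_k, search_params=None): ...
+    def row_count(self, name) -> int: ...
+    def get_index_progress(self, name) -> IndexProgress: ...
+
+
+# backends/mydb/__init__.py -- descriptor picked up by auto-discovery
+from ..base import BackendDescriptor
+from .backend import MyDBBackend
+
+
+def backend_descriptor() -> BackendDescriptor:
+    return BackendDescriptor(
+        name="mydb",
+        display_name="MyDB",
+        description="Placeholder adapter used for illustration.",
+        backend_class=MyDBBackend,
+        supported_metrics=["COSINE", "L2", "IP"],
+        supported_indexes=[],     # fill in IndexDescriptor entries
+        connection_params=[],     # fill in ParamDescriptor entries
+    )
+```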
+ +## Programmatic Usage + +```python +from vdbbench.benchmark import ( + BenchmarkConfig, + BenchmarkOrchestrator, + get_backend, +) + +backend = get_backend("milvus") +backend.connect(host="127.0.0.1", port="19530") + +cfg = BenchmarkConfig( + mode="both", + num_vectors=100_000, + dimension=768, + collection_name="my_bench", + index_type="HNSW", + metric_type="COSINE", + index_params={"M": 32, "efConstruction": 128}, + search_k=10, + search_params={"ef": 64}, + num_search_rounds=3, + force=True, +) + +orch = BenchmarkOrchestrator(config=cfg, backend=backend) +summary = orch.run() +paths = orch.save("./results/my_run") + +backend.disconnect() + +print(f"QPS: {summary['search_qps']:.1f}") +print(f"Recall@10: {summary['search_recall_at_k']:.4f}") +``` diff --git a/vdb_benchmark/vdbbench/benchmark/__init__.py b/vdb_benchmark/vdbbench/benchmark/__init__.py new file mode 100644 index 00000000..c88606ad --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/__init__.py @@ -0,0 +1,46 @@ +"""Producer-consumer vector-DB benchmark framework. + +Key entry points: + +* :class:`BenchmarkOrchestrator` -- runs the full pipeline. +* :class:`BenchmarkConfig` -- all tunables. +* :mod:`backends` -- pluggable, auto-discovered database adapters. +""" + +from .backends import ( + BackendDescriptor, + BackendRegistry, + CollectionInfo, + IndexDescriptor, + ParamDescriptor, + VectorDBBackend, + get_backend, + registry, +) +from .generator import VectorBlock, VectorGenerator, generate_query_vectors +from .ground_truth import GroundTruthBuilder +from .orchestrator import BenchmarkConfig, BenchmarkOrchestrator +from .search_runner import SearchResult, SearchRunner + +__all__ = [ + # Config & orchestration + "BenchmarkConfig", + "BenchmarkOrchestrator", + # Backend framework + "BackendDescriptor", + "BackendRegistry", + "CollectionInfo", + "IndexDescriptor", + "ParamDescriptor", + "VectorDBBackend", + "get_backend", + "registry", + # Data pipeline + "GroundTruthBuilder", + "VectorBlock", + "VectorGenerator", + "generate_query_vectors", + # Search benchmark + "SearchResult", + "SearchRunner", +] diff --git a/vdb_benchmark/vdbbench/benchmark/__main__.py b/vdb_benchmark/vdbbench/benchmark/__main__.py new file mode 100644 index 00000000..84738da6 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/__main__.py @@ -0,0 +1,7 @@ +"""Allow running the benchmark as ``python -m vdbbench.benchmark``.""" + +import sys + +from .run_benchmark import main + +sys.exit(main()) diff --git a/vdb_benchmark/vdbbench/benchmark/backends/README.md b/vdb_benchmark/vdbbench/benchmark/backends/README.md new file mode 100644 index 00000000..2318a7f2 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/README.md @@ -0,0 +1,567 @@ +# Vector Database Backends + +This package provides a **pluggable backend system** for the VDB benchmark +framework. Every database adapter implements the same abstract interface +(`VectorDBBackend`), and the framework discovers and registers backends +automatically at import time -- no manual wiring required. 
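+
+A quick way to see which adapters were discovered in the current environment
+(the exact list depends on which client libraries are installed):
+
+```python
+from vdbbench.benchmark.backends import get_backend, registry
+
+print(registry.names())              # e.g. ['elasticsearch', 'milvus', 'pgvector']
+
+desc = registry.get("milvus")        # BackendDescriptor, or None if unknown/inactive
+if desc is not None:
+    print(desc.display_name, desc.supported_metrics, desc.index_names())
+
+backend = get_backend("milvus")      # instantiated, but not yet connected
+```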
+ +## Directory Layout + +``` +backends/ +├── __init__.py # BackendRegistry + auto-discovery +├── base.py # Abstract VectorDBBackend + descriptor dataclasses +├── _env.py # Environment variable loading for connection params +├── _help.py # CLI help formatting utilities +├── elasticsearch/ # Elasticsearch adapter +│ ├── __init__.py # backend_descriptor() + exports +│ ├── backend.py # ElasticsearchBackend implementation +│ └── README.md # Elasticsearch-specific documentation +├── milvus/ # Milvus / Zilliz Cloud adapter +│ ├── __init__.py # backend_descriptor() + exports +│ ├── backend.py # MilvusBackend implementation +│ └── README.md # Milvus-specific documentation +└── pgvector/ # PostgreSQL + pgvector adapter + ├── __init__.py # backend_descriptor() + exports + ├── backend.py # PGVectorBackend implementation + └── README.md # pgvector-specific documentation +``` + +## Abstract Interface + +`VectorDBBackend` (defined in `base.py`) is the contract that every adapter +must satisfy. The benchmark orchestrator only calls methods on this interface, +so adding a new database requires **zero changes** to the generation, +ground-truth, or search pipelines. + +### Method Reference + +#### Lifecycle + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `connect` | `connect(self, **kwargs) -> None` | Open a connection. Keyword arguments come from the backend's `connection_params`. | +| `disconnect` | `disconnect(self) -> None` | Close the connection and release resources. | + +#### Collection Management + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `create_collection` | `create_collection(self, name, dimension, metric_type="COSINE", index_type="HNSW", index_params=None, num_shards=1, force=False) -> CollectionInfo` | Create a collection (or drop + recreate when `force=True`) and build its index. | +| `collection_exists` | `collection_exists(self, name: str) -> bool` | Check whether a collection already exists. | +| `drop_collection` | `drop_collection(self, name: str) -> None` | Drop a collection if it exists. | + +#### Data Ingestion + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `insert_batch` | `insert_batch(self, name, ids: np.ndarray, vectors: np.ndarray) -> int` | Insert a batch of vectors. `ids` is `(n,)` int64; `vectors` is `(n, dim)` float32. Returns the number of vectors inserted. | +| `flush` | `flush(self, name: str) -> None` | Commit pending writes to durable storage. | + +#### Search + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `search` | `search(self, name, query_vectors: np.ndarray, top_k: int, search_params=None) -> List[List[int]]` | Run an ANN (or exact) search. Returns a list of `top_k` primary-key IDs per query, ordered closest-first. | + +#### Status / Info + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `row_count` | `row_count(self, name: str) -> int` | Return the number of vectors currently in the collection. | +| `get_index_progress` | `get_index_progress(self, name: str) -> IndexProgress` | **(Abstract)** Return a point-in-time snapshot of the index build. Each backend fills in whatever fields it can (see `IndexProgress` below). | + +#### Concrete (provided by base class) + +| Method | Signature | Purpose | +|--------|-----------|---------| +| `wait_for_index` | `wait_for_index(self, name, interval=5.0, timeout=0, compacted=False) -> None` | Polls `get_index_progress()` in a loop with unified progress logging. When row counts are available (e.g. 
Milvus) it logs percentage, overall/recent rates, and ETA; otherwise it logs a simpler status line. Raises `TimeoutError` if `timeout > 0` is exceeded. **Do not override** -- implement `get_index_progress()` instead. | +| `compact` | `compact(self, name: str) -> None` | Trigger segment compaction. Default is a no-op; override if your backend needs it (e.g. Milvus). | + +## Descriptor System + +Every backend exposes a `BackendDescriptor` that tells the framework what the +backend supports. This descriptor drives: + +- CLI `--help` output and argument validation +- Index type and metric validation before a run starts +- The `--plan` execution planner + +### Descriptor Dataclasses + +```python +@dataclass +class ParamDescriptor: + name: str # e.g. "M", "ef", "host" + description: str # shown in --help + type: str = "int" # "int" | "float" | "str" | "bool" + default: Any = None + required: bool = False + +@dataclass +class IndexDescriptor: + name: str # e.g. "HNSW", "DISKANN" + description: str + build_params: List[ParamDescriptor] # used during create_collection + search_params: List[ParamDescriptor] # used during search + +@dataclass +class BackendDescriptor: + name: str # short key used in --backend flag + display_name: str # human-readable name + description: str # one-paragraph overview + backend_class: Type[VectorDBBackend] + supported_metrics: List[str] # e.g. ["COSINE", "L2", "IP"] + supported_indexes: List[IndexDescriptor] + connection_params: List[ParamDescriptor] + active: bool = True # set False to hide from CLI / help + +@dataclass +class CollectionInfo: + name: str + dimension: int + metric_type: str + index_type: str + row_count: int = 0 + extra: Dict[str, Any] = field(default_factory=dict) + +@dataclass +class IndexProgress: + """Snapshot of index-build progress returned by get_index_progress().""" + is_ready: bool = False # True when the build is complete + total_rows: int = 0 # total rows to index (0 if unknown) + indexed_rows: int = 0 # rows indexed so far + pending_rows: int = 0 # rows waiting to be indexed + status: str = "" # free-form backend status (e.g. "yellow") +``` + +When `total_rows > 0` the base-class `wait_for_index()` logs detailed +progress: + +``` +Building index: 55.17% complete... (551,660/1,000,000 rows) | Pending rows: 681,000 | Overall rate: 227.28 rows/sec | Recent rate: 4065.85 rows/sec | ETA: 2026-03-31 17:45:23 | Est. remaining: 0:32:52 +``` + +When only `status` is available (e.g. Elasticsearch health), a simpler +line is shown: + +``` +Waiting for index on 'my_collection' ... (status: yellow) [5s elapsed] +``` + +## Auto-Discovery + +Backend packages are discovered automatically when the `backends` package is +imported. The mechanism (in `__init__.py`) works as follows: + +1. Walk every sub-directory of `backends/` that is a Python package. +2. Import the package. +3. Look for a module-level `backend_descriptor` attribute. +4. If it is callable, call it; otherwise use it directly. +5. If the result is a `BackendDescriptor`, register it in the global + `registry`. +6. If import fails (missing dependency, etc.), log a warning and skip. + +This means installing a new backend is as simple as dropping a package into +`backends/` -- the framework will pick it up on the next import. 
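+
+Stripped of logging and error reporting, the discovery loop in
+`backends/__init__.py` is essentially the following (condensed from
+`_discover_backends()`):
+
+```python
+import importlib
+import os
+import pkgutil
+
+pkg_dir = os.path.dirname(os.path.abspath(__file__))
+for _finder, subpkg_name, is_pkg in pkgutil.iter_modules([pkg_dir]):
+    if not is_pkg:
+        continue                                   # skip plain modules like base.py
+    try:
+        mod = importlib.import_module(f"{__name__}.{subpkg_name}")
+    except Exception:
+        continue                                   # missing dependency: warn and skip
+    descriptor = getattr(mod, "backend_descriptor", None)
+    if descriptor is None:
+        continue
+    desc = descriptor() if callable(descriptor) else descriptor
+    if isinstance(desc, BackendDescriptor):
+        registry.register(desc)
+```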
+ +## Existing Backends + +| Backend | `--backend` name | Supported Indexes | Supported Metrics | Active | Required packages | +|---------|-------------------|-------------------|-------------------|--------|-------------------| +| Milvus | `milvus` | HNSW, DISKANN, AISAQ, FLAT | COSINE, L2, IP | Yes | `pymilvus` | +| pgvector | `pgvector` | HNSW, IVFFLAT, FLAT | COSINE, L2, IP | Yes | `psycopg2-binary`, `pgvector` | +| Elasticsearch | `elasticsearch` | HNSW, FLAT | COSINE, L2, IP | Yes | `elasticsearch` | + +### Active vs Inactive Backends + +A backend can be present in the source tree but hidden from users by setting +`active=False` in its `BackendDescriptor`. Inactive backends: + +- Are **not** listed in `--help` or `help backends` output. +- Are **not** returned by `registry.names()`, `registry.list_backends()`, + or `registry.get()`. +- **Cannot** be selected via `--backend` (the CLI will report "unknown + backend"). +- **Are** still registered internally and can be inspected via + `registry.all_backends(include_inactive=True)`. + +This is useful for backends that are under development or not yet ready for +general use. To activate a backend, simply change `active=False` to +`active=True` in its `backend_descriptor()` function. + +## Environment Variable Configuration + +Backend connection parameters can be set via **environment variables** or a +**`.env` file** instead of (or in addition to) CLI flags and YAML configs. + +### Naming Convention + +``` +{BACKEND}__{PARAM} +``` + +Both parts are **upper-cased** and separated by a **double underscore** (`__`). +`PARAM` matches the `name` field of the backend's `connection_params` +descriptors. + +| Backend | Example variables | +|---------|-------------------| +| Milvus | `MILVUS__HOST`, `MILVUS__PORT`, `MILVUS__MAX_MESSAGE_LENGTH` | +| pgvector | `PGVECTOR__HOST`, `PGVECTOR__PORT`, `PGVECTOR__DBNAME`, `PGVECTOR__USER`, `PGVECTOR__PASSWORD` | +| Elasticsearch | `ELASTICSEARCH__HOST`, `ELASTICSEARCH__API_KEY`, `ELASTICSEARCH__CLOUD_ID` | + +### .env File + +If the [`python-dotenv`](https://pypi.org/project/python-dotenv/) package +is installed, the benchmark CLI automatically loads a `.env` file from the +current working directory on startup. See `.env.example` in the benchmark +directory for a template. + +```bash +pip install python-dotenv # optional; enables .env file support +cp benchmark/.env.example .env +# edit .env with your values +``` + +When `python-dotenv` is not installed, only real shell environment variables +are read. + +### Precedence + +Connection parameters are resolved with the following precedence (highest +wins): + +``` +CLI flags > environment variables / .env > YAML config > built-in defaults +``` + +For example, if `MILVUS__HOST=10.0.0.5` is set in `.env` and +`host: 127.0.0.1` is in the YAML config, the env value `10.0.0.5` wins. +But `--host 192.168.1.1` on the CLI overrides both. + +### Debugging + +Use `--what-if` to see where each connection parameter came from: + +```bash +python -m vdbbench.benchmark \ + --backend milvus --config configs/1m_hnsw.yaml --what-if +``` + +Output includes a "Connection parameters (source)" section showing each +parameter's resolved value and whether it came from CLI, env, YAML, or +default. + +### Type Coercion + +Environment variables are always strings. 
The framework automatically +coerces them to the type declared in `ParamDescriptor.type`: + +| `type` | Conversion | +|--------|-----------| +| `"str"` | Used as-is | +| `"int"` | `int(value)` | +| `"float"` | `float(value)` | +| `"bool"` | `true` / `1` / `yes` / `on` → `True`; everything else → `False` | + +Invalid conversions (e.g. `MILVUS__PORT=abc`) are logged as warnings and +skipped. + +--- + +## Creating a New Backend + +Follow these steps to add support for a new vector database. + +### 1. Create the package directory + +``` +backends/ +└── mydb/ + ├── __init__.py + └── backend.py +``` + +### 2. Implement the backend class (`backend.py`) + +Subclass `VectorDBBackend` and implement every abstract method: + +```python +"""MyDB backend implementation.""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional + +import numpy as np + +from ..base import CollectionInfo, IndexProgress, VectorDBBackend + +logger = logging.getLogger(__name__) + + +class MyDBBackend(VectorDBBackend): + """Concrete backend for MyDB.""" + + def __init__(self) -> None: + self._client = None + + # -- Lifecycle -------------------------------------------------------- + + def connect(self, host: str = "127.0.0.1", port: str = "6333", **kwargs) -> None: + from mydb_client import Client # import here to keep it optional + self._client = Client(host=host, port=int(port)) + logger.info("Connected to MyDB at %s:%s", host, port) + + def disconnect(self) -> None: + if self._client is not None: + self._client.close() + self._client = None + logger.info("Disconnected from MyDB") + + # -- Collection management -------------------------------------------- + + def create_collection( + self, + name: str, + dimension: int, + metric_type: str = "COSINE", + index_type: str = "HNSW", + index_params: Optional[Dict[str, Any]] = None, + num_shards: int = 1, + force: bool = False, + ) -> CollectionInfo: + if self.collection_exists(name): + if force: + self.drop_collection(name) + else: + raise ValueError(f"Collection '{name}' already exists") + + params = index_params or {} + # ... create the collection and index using your DB client ... 
+ + return CollectionInfo( + name=name, + dimension=dimension, + metric_type=metric_type, + index_type=index_type, + row_count=0, + extra={"index_params": params}, + ) + + def collection_exists(self, name: str) -> bool: + return self._client.has_collection(name) + + def drop_collection(self, name: str) -> None: + if self.collection_exists(name): + self._client.delete_collection(name) + logger.info("Dropped collection '%s'", name) + + # -- Data ingestion --------------------------------------------------- + + def insert_batch(self, name: str, ids: np.ndarray, vectors: np.ndarray) -> int: + # ids: (n,) int64, vectors: (n, dim) float32 + self._client.upsert( + collection=name, + ids=ids.tolist(), + vectors=vectors.tolist(), + ) + return len(ids) + + def flush(self, name: str) -> None: + self._client.flush(collection=name) + logger.info("Flushed '%s'", name) + + # -- Search ----------------------------------------------------------- + + def search( + self, + name: str, + query_vectors: np.ndarray, + top_k: int, + search_params: Optional[Dict[str, Any]] = None, + ) -> List[List[int]]: + results = [] + for qvec in query_vectors: + hits = self._client.search( + collection=name, + vector=qvec.tolist(), + limit=top_k, + **(search_params or {}), + ) + results.append([hit.id for hit in hits]) + return results + + # -- Status ----------------------------------------------------------- + + def row_count(self, name: str) -> int: + return self._client.count(collection=name) + + def get_index_progress(self, name: str) -> IndexProgress: + info = self._client.index_status(collection=name) + return IndexProgress( + is_ready=info.get("ready", False), + total_rows=info.get("total", 0), + indexed_rows=info.get("indexed", 0), + pending_rows=info.get("pending", 0), + status=info.get("state", ""), + ) + + # -- Optional overrides ----------------------------------------------- + + def load_collection(self, name: str) -> None: + """Load collection into memory (if your DB requires it).""" + self._client.load(collection=name) + logger.info("Loaded collection '%s' into memory", name) +``` + +**Guidelines:** + +- Import your database client library **inside** `connect()` (not at + module level). This keeps the dependency optional -- the framework can + still import the package and show help text even when the client library + is not installed. +- Always accept `**kwargs` in `connect()` so the framework can pass + connection parameters defined in your descriptor. +- `search()` must return results sorted **closest-first**. +- `insert_batch()` receives NumPy arrays. Convert to lists or native types + as needed by your client library. +- Implement `get_index_progress()` -- **not** `wait_for_index()`. The + base class owns the polling loop and all progress logging. Your method + just returns a single `IndexProgress` snapshot. If your database has a + synchronous index build (like pgvector), simply return + `IndexProgress(is_ready=True)` once the index exists. + +### 3. Write the descriptor (`__init__.py`) + +The `__init__.py` must expose a `backend_descriptor` attribute -- either a +callable (function) that returns a `BackendDescriptor`, or a +`BackendDescriptor` instance directly. 
+ +```python +"""MyDB backend package.""" + +from ..base import BackendDescriptor, IndexDescriptor, ParamDescriptor +from .backend import MyDBBackend + +__all__ = ["MyDBBackend", "backend_descriptor"] + + +def backend_descriptor() -> BackendDescriptor: + """Return the capability descriptor for the MyDB backend.""" + return BackendDescriptor( + name="mydb", # used in --backend mydb + display_name="MyDB", # shown in CLI help + description=( + "A scalable vector database with support for HNSW " + "and brute-force search. Requires the mydb-client " + "Python package." + ), + backend_class=MyDBBackend, + supported_metrics=["COSINE", "L2", "IP"], + supported_indexes=[ + IndexDescriptor( + name="HNSW", + description="Graph-based approximate search.", + build_params=[ + ParamDescriptor( + name="M", + description="Max connections per node.", + type="int", + default=16, + ), + ParamDescriptor( + name="efConstruction", + description="Build-time search width.", + type="int", + default=200, + ), + ], + search_params=[ + ParamDescriptor( + name="ef", + description="Query-time search width.", + type="int", + default=128, + ), + ], + ), + IndexDescriptor( + name="FLAT", + description="Brute-force exact search.", + build_params=[], + search_params=[], + ), + ], + connection_params=[ + ParamDescriptor( + name="host", + description="Server hostname or IP.", + type="str", + default="127.0.0.1", + ), + ParamDescriptor( + name="port", + description="Server port.", + type="str", + default="6333", + ), + ], + ) +``` + +**Key rules for the descriptor:** + +- `name` must be a unique, lower-case identifier. This is used as the + `--backend` CLI value. +- `supported_indexes` must list every index algorithm your backend + supports. `build_params` describe the parameters passed to + `create_collection(index_params=...)`. `search_params` describe the + parameters passed to `search(search_params=...)`. +- `connection_params` should list every keyword accepted by your + `connect()` method so the framework can generate the correct CLI flags. +- Set `active=False` to keep the backend in the tree but hidden from + users. This is useful during development. Omit the field or set + `active=True` (the default) to make it available. + +### 4. Verify + +No manual registration code is needed. Simply restart Python and the +auto-discovery will find your package: + +```bash +# Confirm the backend is discovered +python -c " +from vdbbench.benchmark.backends import registry +print(registry.names()) # should include 'mydb' +print(registry.get('mydb')) # should show your BackendDescriptor +" + +# Check CLI help +python -m vdbbench.benchmark help backend mydb + +# Run a benchmark +python -m vdbbench.benchmark \ + --backend mydb \ + --config configs/1m_hnsw.yaml \ + --mode both +``` + +### 5. Checklist + +- [ ] `backend.py` subclasses `VectorDBBackend` and implements all abstract + methods. +- [ ] `__init__.py` exposes a `backend_descriptor` callable returning a + `BackendDescriptor`. +- [ ] Client library imported inside `connect()`, not at module top level. +- [ ] `connect()` accepts `**kwargs`. +- [ ] `create_collection()` respects the `force` flag (drop + recreate). +- [ ] `search()` returns IDs sorted closest-first. +- [ ] `get_index_progress()` returns an `IndexProgress` snapshot. + `wait_for_index()` is provided by the base class -- do **not** + override it. +- [ ] `supported_indexes` lists every index type the backend handles. +- [ ] `connection_params` matches the keyword arguments of `connect()`. 
+- [ ] The backend appears in `registry.names()` after import. diff --git a/vdb_benchmark/vdbbench/benchmark/backends/__init__.py b/vdb_benchmark/vdbbench/benchmark/backends/__init__.py new file mode 100644 index 00000000..7a0af32d --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/__init__.py @@ -0,0 +1,183 @@ +"""Backend registry -- auto-discovers backend packages at import time. + +Every sub-directory of ``backends/`` that contains an ``__init__.py`` +with a module-level ``backend_descriptor`` attribute (a callable +returning :class:`BackendDescriptor`) is loaded and registered +automatically. + +Public API consumed by the rest of the benchmark: + +* ``registry`` -- the singleton :class:`BackendRegistry`. +* ``get_backend(name)`` -- shortcut to instantiate a backend by name. +""" + +from __future__ import annotations + +import importlib +import logging +import os +import pkgutil +from typing import Dict, List, Optional, Type + +from .base import ( + BackendDescriptor, + CollectionInfo, + IndexDescriptor, + IndexProgress, + ParamDescriptor, + VectorDBBackend, +) + +logger = logging.getLogger(__name__) + +__all__ = [ + # Data model + "BackendDescriptor", + "CollectionInfo", + "IndexDescriptor", + "IndexProgress", + "ParamDescriptor", + "VectorDBBackend", + # Registry + "BackendRegistry", + "registry", + "get_backend", +] + + +class BackendRegistry: + """Collects :class:`BackendDescriptor` instances from backend packages. + + Only **active** backends (``descriptor.active is True``) are visible + through the public query methods (``get``, ``names``, + ``list_backends``, ``create_backend``). Inactive backends are still + stored internally so they can be reactivated at runtime if needed. + """ + + def __init__(self) -> None: + self._backends: Dict[str, BackendDescriptor] = {} + + # ------------------------------------------------------------------ + # Registration + # ------------------------------------------------------------------ + def register(self, descriptor: BackendDescriptor) -> None: + """Register a backend descriptor (idempotent for the same name).""" + key = descriptor.name.lower() + if key in self._backends: + logger.debug("Backend '%s' already registered; skipping.", key) + return + self._backends[key] = descriptor + status = "active" if descriptor.active else "inactive" + logger.debug("Registered backend: %s (%s)", key, status) + + # ------------------------------------------------------------------ + # Querying (only active backends) + # ------------------------------------------------------------------ + def get(self, name: str) -> Optional[BackendDescriptor]: + """Return the descriptor for *name*, or ``None``. + + Returns ``None`` for inactive backends. 
+ """ + desc = self._backends.get(name.lower()) + if desc is not None and not desc.active: + return None + return desc + + def list_backends(self) -> List[BackendDescriptor]: + """Return all **active** registered descriptors, sorted by name.""" + return sorted( + (d for d in self._backends.values() if d.active), + key=lambda d: d.name, + ) + + def names(self) -> List[str]: + """Return **active** registered backend names, sorted.""" + return sorted(k for k, d in self._backends.items() if d.active) + + def __contains__(self, name: str) -> bool: + desc = self._backends.get(name.lower()) + return desc is not None and desc.active + + # ------------------------------------------------------------------ + # Convenience + # ------------------------------------------------------------------ + def create_backend(self, name: str) -> VectorDBBackend: + """Instantiate and return a (disconnected) backend by name. + + Raises :class:`ValueError` for unknown or inactive backends. + """ + desc = self.get(name) + if desc is None: + available = ", ".join(self.names()) or "(none)" + raise ValueError( + f"Unknown backend '{name}'. Available: {available}" + ) + return desc.backend_class() + + # ------------------------------------------------------------------ + # Introspection (includes inactive) + # ------------------------------------------------------------------ + def all_backends(self, include_inactive: bool = True) -> List[BackendDescriptor]: + """Return every registered descriptor, optionally including inactive ones.""" + return sorted( + (d for d in self._backends.values() if include_inactive or d.active), + key=lambda d: d.name, + ) + + +# Singleton used by the rest of the package. +registry = BackendRegistry() + + +def get_backend(name: str) -> VectorDBBackend: + """Convenience: instantiate a backend by name from the global registry.""" + return registry.create_backend(name) + + +# ------------------------------------------------------------------ +# Auto-discovery +# ------------------------------------------------------------------ + +def _discover_backends() -> None: + """Walk sub-packages of this directory and register any that expose + a ``backend_descriptor`` callable. 
+ """ + pkg_dir = os.path.dirname(os.path.abspath(__file__)) + for finder, subpkg_name, is_pkg in pkgutil.iter_modules([pkg_dir]): + if not is_pkg: + continue # skip plain .py files like base.py + fqn = f"{__name__}.{subpkg_name}" + try: + mod = importlib.import_module(fqn) + except Exception: + logger.warning( + "Failed to import backend package '%s'; skipping.", + fqn, exc_info=True, + ) + continue + + descriptor_fn = getattr(mod, "backend_descriptor", None) + if descriptor_fn is None: + logger.debug( + "Package '%s' has no backend_descriptor(); skipping.", fqn + ) + continue + + try: + desc = descriptor_fn() if callable(descriptor_fn) else descriptor_fn + if isinstance(desc, BackendDescriptor): + registry.register(desc) + else: + logger.warning( + "backend_descriptor in '%s' did not return a " + "BackendDescriptor; got %s", + fqn, type(desc).__name__, + ) + except Exception: + logger.warning( + "Error calling backend_descriptor() in '%s'; skipping.", + fqn, exc_info=True, + ) + + +_discover_backends() diff --git a/vdb_benchmark/vdbbench/benchmark/backends/_env.py b/vdb_benchmark/vdbbench/benchmark/backends/_env.py new file mode 100644 index 00000000..8852e2eb --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/_env.py @@ -0,0 +1,151 @@ +"""Load backend connection parameters from environment variables and ``.env`` files. + +Variable naming convention:: + + {BACKEND_NAME}__{PARAM_NAME} + +Both parts are **upper-cased** and separated by a **double underscore**. +The ``PARAM_NAME`` corresponds to a ``ParamDescriptor.name`` from the +backend's ``connection_params``, also upper-cased. + +Examples:: + + MILVUS__HOST=10.0.0.5 + MILVUS__PORT=19530 + PGVECTOR__PASSWORD=s3cret + ELASTICSEARCH__API_KEY=abc123 + +If the `python-dotenv`_ package is installed, a ``.env`` file in the +current working directory (or the path given to :func:`load_env_file`) is +loaded automatically so that the variables are available via +``os.environ``. When ``python-dotenv`` is not installed the module +falls back to reading ``os.environ`` directly (i.e. only real shell +environment variables are considered). + +.. _python-dotenv: https://pypi.org/project/python-dotenv/ +""" + +from __future__ import annotations + +import logging +import os +from typing import Any, Dict, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from .base import BackendDescriptor + +logger = logging.getLogger(__name__) + +# Double underscore separates the backend name from the parameter name. +_SEP = "__" + + +# ------------------------------------------------------------------ +# .env file loading +# ------------------------------------------------------------------ + +def load_env_file(path: Optional[str] = None) -> bool: + """Load a ``.env`` file into ``os.environ``. + + Parameters + ---------- + path : str, optional + Explicit path to the ``.env`` file. When *None*, ``python-dotenv`` + searches upward from the current working directory. + + Returns + ------- + bool + ``True`` if a ``.env`` file was loaded, ``False`` otherwise + (including when ``python-dotenv`` is not installed). + """ + try: + from dotenv import load_dotenv, find_dotenv # type: ignore[import-untyped] + except ImportError: + logger.debug( + "python-dotenv is not installed; skipping .env file loading. 
" + "Install it with: pip install python-dotenv" + ) + return False + + dotenv_path = path or find_dotenv(usecwd=True) + if not dotenv_path or not os.path.isfile(dotenv_path): + logger.debug("No .env file found") + return False + + load_dotenv(dotenv_path, override=False) + logger.info("Loaded .env file: %s", dotenv_path) + return True + + +# ------------------------------------------------------------------ +# Type coercion +# ------------------------------------------------------------------ + +def _coerce(value: str, type_hint: str) -> Any: + """Convert a string *value* to the Python type indicated by *type_hint*. + + Supported hints (matching ``ParamDescriptor.type``): + ``"int"``, ``"float"``, ``"str"``, ``"bool"``. + """ + type_hint = type_hint.lower() + if type_hint == "int": + return int(value) + if type_hint == "float": + return float(value) + if type_hint == "bool": + return value.lower() in ("1", "true", "yes", "on") + return value # "str" or anything else + + +# ------------------------------------------------------------------ +# Read env vars for a backend +# ------------------------------------------------------------------ + +def env_for_backend( + backend_name: str, + desc: "BackendDescriptor", +) -> Dict[str, Any]: + """Return a dict of connection parameters sourced from the environment. + + For each ``ParamDescriptor`` in *desc.connection_params*, the function + looks for an environment variable named + ``{BACKEND_NAME}__{PARAM_NAME}`` (both upper-cased, separated by a + double underscore). + + Values are coerced to the type declared in ``ParamDescriptor.type``. + Variables that are not set in the environment are omitted from the + returned dict. + + Parameters + ---------- + backend_name : str + Short backend key (e.g. ``"milvus"``). + desc : BackendDescriptor + The backend's descriptor (used to enumerate connection params and + their types). + + Returns + ------- + dict[str, Any] + Mapping of ``param_name -> coerced_value`` for every env var that + was found. + """ + prefix = backend_name.upper() + _SEP + result: Dict[str, Any] = {} + + for param in desc.connection_params: + env_key = prefix + param.name.upper() + raw = os.environ.get(env_key) + if raw is None: + continue + try: + result[param.name] = _coerce(raw, param.type) + logger.debug("Env var %s -> %s = %r", env_key, param.name, result[param.name]) + except (ValueError, TypeError) as exc: + logger.warning( + "Ignoring env var %s: could not coerce %r to %s: %s", + env_key, raw, param.type, exc, + ) + + return result diff --git a/vdb_benchmark/vdbbench/benchmark/backends/_help.py b/vdb_benchmark/vdbbench/benchmark/backends/_help.py new file mode 100644 index 00000000..6d69f74a --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/_help.py @@ -0,0 +1,141 @@ +"""Human-readable help formatter for backend capabilities. + +Usage from CLI:: + + help backends -- list all registered backends + help backend milvus -- detailed info for one backend + +Usage from Python:: + + from benchmark.backends._help import format_backend_help, format_backends_list + print(format_backends_list(registry)) + print(format_backend_help(registry, "milvus")) +""" + +from __future__ import annotations + +import textwrap +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from . 
import BackendRegistry + from .base import BackendDescriptor, IndexDescriptor + + +def format_backends_list(reg: "BackendRegistry") -> str: + """One-line summary of every registered backend.""" + backends = reg.list_backends() + if not backends: + return "No backends registered." + + lines = ["Registered vector-database backends:", ""] + name_width = max(len(d.display_name) for d in backends) + for desc in backends: + first_line = desc.description.split(".")[0].strip() + "." + metrics = ", ".join(desc.supported_metrics) + indexes = ", ".join(desc.index_names()) + lines.append( + f" {desc.display_name:<{name_width}} " + f"(name: {desc.name})" + ) + lines.append( + f" {'':<{name_width}} " + f"metrics: {metrics}" + ) + lines.append( + f" {'':<{name_width}} " + f"indexes: {indexes}" + ) + lines.append("") + + lines.append( + "Use 'help backend ' for detailed parameters. " + "Example: help backend milvus" + ) + return "\n".join(lines) + + +def format_backend_help(reg: "BackendRegistry", name: str) -> str: + """Detailed help for one backend, including every parameter.""" + desc = reg.get(name) + if desc is None: + available = ", ".join(reg.names()) or "(none)" + return f"Unknown backend '{name}'. Available: {available}" + return _render_descriptor(desc) + + +# ------------------------------------------------------------------ +# Internal renderers +# ------------------------------------------------------------------ + +_SEPARATOR = "-" * 64 + + +def _render_descriptor(desc: "BackendDescriptor") -> str: + parts: list[str] = [] + + # Header + parts.append("=" * 64) + parts.append(f"Backend: {desc.display_name} (--backend {desc.name})") + parts.append("=" * 64) + parts.append("") + parts.append(textwrap.fill(desc.description, width=64)) + parts.append("") + + # Metrics + parts.append("Supported distance metrics:") + for m in desc.supported_metrics: + parts.append(f" - {m}") + parts.append("") + + # Connection params + if desc.connection_params: + parts.append(_SEPARATOR) + parts.append("Connection parameters:") + parts.append(_SEPARATOR) + parts.append("") + for p in desc.connection_params: + parts.append(_render_param(p)) + parts.append("") + + # Index types + if desc.supported_indexes: + parts.append(_SEPARATOR) + parts.append("Index types:") + parts.append(_SEPARATOR) + for idx in desc.supported_indexes: + parts.append("") + parts.extend(_render_index(idx)) + + return "\n".join(parts) + + +def _render_index(idx: "IndexDescriptor") -> list[str]: + lines: list[str] = [] + lines.append(f" [{idx.name}]") + lines.append(f" {idx.description}") + lines.append("") + + if idx.build_params: + lines.append(" Build parameters:") + for p in idx.build_params: + lines.append(" " + _render_param(p)) + else: + lines.append(" Build parameters: (none)") + + lines.append("") + + if idx.search_params: + lines.append(" Search parameters:") + for p in idx.search_params: + lines.append(" " + _render_param(p)) + else: + lines.append(" Search parameters: (none)") + + return lines + + +def _render_param(p) -> str: + req = " (required)" if p.required else "" + default = f" [default: {p.default}]" if p.default is not None else "" + return f" --{p.name} <{p.type}>{req}{default}\n {p.description}" diff --git a/vdb_benchmark/vdbbench/benchmark/backends/base.py b/vdb_benchmark/vdbbench/benchmark/backends/base.py new file mode 100644 index 00000000..27139d32 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/base.py @@ -0,0 +1,487 @@ +"""Abstract base class for vector database backends. 
+ +Every concrete backend (Milvus, Qdrant, Weaviate, ...) must subclass +``VectorDBBackend`` and implement the abstract methods below. The +benchmark orchestrator only talks through this interface, so swapping +databases requires zero changes to the generation / ground-truth pipeline. + +Each backend lives in its own sub-package (e.g. ``backends/milvus/``) +and exposes a :func:`backend_descriptor` function that returns a +:class:`BackendDescriptor`. The registry discovers these packages +automatically at import time. +""" + +from __future__ import annotations + +import abc +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Any, Dict, List, Optional, Type + +import numpy as np + +logger = logging.getLogger(__name__) + + +# ===================================================================== +# Capability / descriptor data model +# ===================================================================== + +@dataclass +class ParamDescriptor: + """One tunable parameter for an index or a connection.""" + name: str + description: str + type: str = "int" # "int", "float", "str", "bool" + default: Any = None + required: bool = False + + +@dataclass +class IndexDescriptor: + """Everything the benchmark needs to know about one index algorithm.""" + name: str # e.g. "HNSW" + description: str + build_params: List[ParamDescriptor] = field(default_factory=list) + search_params: List[ParamDescriptor] = field(default_factory=list) + + +@dataclass +class BackendDescriptor: + """Self-description returned by every backend package. + + The registry collects these and uses them for CLI help, validation, + and dynamic argument generation. + + Set *active* to ``False`` to keep a backend in the tree without + exposing it to users (it will be hidden from ``--help``, CLI + validation, and ``registry.names()``). + """ + name: str # short, lower-case key ("milvus") + display_name: str # human-readable ("Milvus") + description: str # one-paragraph overview + backend_class: Type["VectorDBBackend"] + supported_metrics: List[str] = field(default_factory=list) + supported_indexes: List[IndexDescriptor] = field(default_factory=list) + connection_params: List[ParamDescriptor] = field(default_factory=list) + active: bool = True + + # ------------------------------------------------------------------ + # Convenience look-ups + # ------------------------------------------------------------------ + def index_names(self) -> List[str]: + """Return the list of supported index algorithm names.""" + return [idx.name for idx in self.supported_indexes] + + def get_index(self, name: str) -> Optional[IndexDescriptor]: + """Return the :class:`IndexDescriptor` for *name*, or ``None``.""" + for idx in self.supported_indexes: + if idx.name.upper() == name.upper(): + return idx + return None + + +# ===================================================================== +# Collection metadata (unchanged) +# ===================================================================== + +@dataclass +class CollectionInfo: + """Metadata returned after a collection is created or connected to.""" + name: str + dimension: int + metric_type: str + index_type: str + row_count: int = 0 + extra: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class IndexProgress: + """Snapshot of index-build progress returned by backends. + + Backends fill in as much as they know: + + * **Milvus** – has ``total_rows``, ``indexed_rows``, and ``pending_rows``. 
+ * **pgvector** – ``CREATE INDEX`` is synchronous; simply sets ``is_ready``. + * **Elasticsearch** – sets ``status`` (red/yellow/green) and ``is_ready``. + + The base-class ``wait_for_index`` handles all logging, adapting + the detail level to whatever fields the backend provides. + """ + is_ready: bool = False + total_rows: int = 0 + indexed_rows: int = 0 + pending_rows: int = 0 + status: str = "" # free-form backend status (e.g. "yellow") + + +class VectorDBBackend(abc.ABC): + """Thin, storage-only contract that every vector DB must satisfy.""" + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + @abc.abstractmethod + def connect(self, **kwargs) -> None: + """Establish a connection to the database server.""" + + @abc.abstractmethod + def disconnect(self) -> None: + """Cleanly disconnect from the server.""" + + # ------------------------------------------------------------------ + # Collection management + # ------------------------------------------------------------------ + @abc.abstractmethod + def create_collection( + self, + name: str, + dimension: int, + metric_type: str = "COSINE", + index_type: str = "HNSW", + index_params: Optional[Dict[str, Any]] = None, + num_shards: int = 1, + force: bool = False, + ) -> CollectionInfo: + """Create (or re-create if *force*) a collection and its index. + + Parameters + ---------- + name : str + Collection / table / index name. + dimension : int + Dimensionality of the vectors. + metric_type : str + Distance metric (``COSINE``, ``L2``, ``IP``). + index_type : str + Index algorithm (``HNSW``, ``DISKANN``, ``FLAT``, ...). + index_params : dict, optional + Backend-specific index build parameters (e.g. ``M``, + ``efConstruction`` for HNSW). + num_shards : int + Number of shards / partitions. + force : bool + If *True*, drop any existing collection with the same name first. + + Returns + ------- + CollectionInfo + """ + + @abc.abstractmethod + def collection_exists(self, name: str) -> bool: + """Return *True* if the collection already exists.""" + + @abc.abstractmethod + def drop_collection(self, name: str) -> None: + """Drop a collection if it exists.""" + + # ------------------------------------------------------------------ + # Data ingestion + # ------------------------------------------------------------------ + @abc.abstractmethod + def insert_batch( + self, + name: str, + ids: np.ndarray, + vectors: np.ndarray, + ) -> int: + """Insert a batch of vectors. + + Parameters + ---------- + name : str + Target collection name. + ids : np.ndarray + 1-D array of integer primary keys (int64). + vectors : np.ndarray + 2-D float32 array of shape ``(n, dim)``. + + Returns + ------- + int + Number of vectors successfully inserted. + """ + + @abc.abstractmethod + def flush(self, name: str) -> None: + """Flush / commit pending writes for the collection.""" + + def compact(self, name: str) -> None: + """Trigger segment compaction and wait for it to finish. + + Compaction merges many small segments into fewer large ones so + the index builder can process them efficiently. The default + implementation is a no-op (not every backend needs compaction). 
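        Backends that do need it override this hook. The Milvus
        adapter, for instance, does roughly::

            def compact(self, name: str) -> None:
                col = self._get_collection(name)   # cached pymilvus Collection
                col.compact()                      # trigger segment merge
                col.wait_for_compaction_completed()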
+ """ + + # ------------------------------------------------------------------ + # Search + # ------------------------------------------------------------------ + @abc.abstractmethod + def search( + self, + name: str, + query_vectors: np.ndarray, + top_k: int, + search_params: Optional[Dict[str, Any]] = None, + ) -> List[List[int]]: + """Run an ANN (or exact) search. + + Parameters + ---------- + name : str + Collection to search. + query_vectors : np.ndarray + 2-D float32 array of shape ``(nq, dim)``. + top_k : int + Number of nearest neighbors to return per query. + search_params : dict, optional + Backend-specific search parameters (e.g. ``ef`` for HNSW). + + Returns + ------- + list[list[int]] + For each query vector, a list of ``top_k`` primary-key IDs + ordered by distance (closest first). + """ + + # ------------------------------------------------------------------ + # Status / info + # ------------------------------------------------------------------ + @abc.abstractmethod + def row_count(self, name: str) -> int: + """Return the current number of vectors in the collection.""" + + @abc.abstractmethod + def get_index_progress(self, name: str) -> IndexProgress: + """Return a point-in-time snapshot of the index build. + + Each backend fills in whatever it can. Milvus can report row + counts; pgvector simply returns ``is_ready=True`` once the + synchronous ``CREATE INDEX`` finishes; Elasticsearch checks + cluster health status. + + The base class ``wait_for_index`` calls this in a loop and + handles all progress logging. + """ + + # ------------------------------------------------------------------ + # Administration / introspection + # ------------------------------------------------------------------ + @abc.abstractmethod + def list_collections(self) -> List[str]: + """Return names of all collections (tables / indexes) on the server.""" + + @abc.abstractmethod + def get_collection_info(self, name: str) -> Dict[str, Any]: + """Return detailed metadata about a single collection. + + The returned dict should include at least: + + * ``name`` (str) + * ``row_count`` (int) + * ``dimension`` (int or None) + * ``metric_type`` (str or None) + * ``index_type`` (str or None) + * ``schema`` (list[dict] -- one entry per field/column) + + Backends may add extra keys. + """ + + @abc.abstractmethod + def list_indexes(self, name: str) -> List[Dict[str, Any]]: + """Return info about every index on *name*. + + Each dict should include at least ``index_name``, + ``index_type``, and ``params``. + """ + + def drop_index(self, name: str, index_name: Optional[str] = None) -> None: + """Drop an index from the collection. + + Parameters + ---------- + name : str + Collection name. + index_name : str, optional + Specific index to drop. When *None* the backend drops the + primary / only vector index. + + The default implementation raises :class:`NotImplementedError`. + """ + raise NotImplementedError( + f"{type(self).__name__} does not implement drop_index" + ) + + def get_collection_stats(self, name: str) -> Dict[str, Any]: + """Return operational statistics for a collection. + + The default implementation returns the row count and index + progress; backends may override to add richer metrics. 
+ """ + prog = self.get_index_progress(name) + return { + "name": name, + "row_count": self.row_count(name), + "index_ready": prog.is_ready, + "index_status": prog.status, + "indexed_rows": prog.indexed_rows, + "total_rows": prog.total_rows, + "pending_rows": prog.pending_rows, + } + + # ------------------------------------------------------------------ + # Unified index-wait with progress logging + # ------------------------------------------------------------------ + _STALL_LOG_EVERY: int = 6 # stall reminder every N unchanged polls + + def wait_for_index( + self, + name: str, + interval: float = 5.0, + timeout: float = 0, + compacted: bool = False, + ) -> None: + """Block until the index build finishes. + + Polls :meth:`get_index_progress` every *interval* seconds and + emits unified progress logs. When the backend provides row + counts the output includes overall/recent rates and an ETA; + otherwise a simpler status line is shown. + + Parameters + ---------- + interval : float + Polling interval in seconds. + timeout : float + Maximum seconds to wait (0 = forever). + compacted : bool + Hint from the orchestrator — used only in stall warnings. + """ + start = time.time() + prev_indexed = -1 + prev_time = start + stall_polls = 0 + eta_deadline = float("inf") + warned = False + + while True: + try: + prog = self.get_index_progress(name) + now = time.time() + elapsed = now - start + + # ---------- done? ---------- + if prog.is_ready: + if prog.total_rows: + logger.info( + "Index build complete for '%s' " + "(%s rows in %.1fs)", + name, f"{prog.total_rows:,}", elapsed, + ) + else: + msg = f"Index ready for '{name}'" + if prog.status: + msg += f" (status: {prog.status})" + msg += f" [{elapsed:.1f}s]" + logger.info(msg) + return + + # ---------- row-level progress (Milvus-style) ---------- + if prog.total_rows > 0: + pct = prog.indexed_rows / prog.total_rows * 100 + + if prog.indexed_rows != prev_indexed: + delta = prog.indexed_rows - max(prev_indexed, 0) + dt = now - prev_time + recent_rate = delta / dt if dt > 0 else 0 + overall_rate = ( + prog.indexed_rows / elapsed if elapsed > 0 else 0 + ) + remaining = prog.total_rows - prog.indexed_rows + eta_secs = ( + remaining / recent_rate if recent_rate > 0 else 0 + ) + eta_deadline = now + eta_secs + eta_dt = datetime.now() + timedelta(seconds=eta_secs) + remaining_td = str(timedelta(seconds=int(eta_secs))) + logger.info( + "Building index: %.2f%% complete... " + "(%s/%s rows) | Pending rows: %s | " + "Overall rate: %.2f rows/sec | " + "Recent rate: %.2f rows/sec | " + "ETA: %s | Est. remaining: %s", + pct, + f"{prog.indexed_rows:,}", + f"{prog.total_rows:,}", + f"{prog.pending_rows:,}", + overall_rate, + recent_rate, + eta_dt.strftime("%Y-%m-%d %H:%M:%S"), + remaining_td, + ) + stall_polls = 0 + warned = False + prev_indexed = prog.indexed_rows + prev_time = now + else: + stall_polls += 1 + if not warned and now > eta_deadline: + warned = True + if compacted: + logger.warning( + "Index build has exceeded ETA by " + "%.0fs (compaction was already " + "performed). This may be normal " + "for large indexes -- waiting. " + "[%.0fs elapsed]", + now - eta_deadline, elapsed, + ) + else: + logger.warning( + "Index build has exceeded ETA by " + "%.0fs. Set 'compact: true' in " + "your config so small segments " + "are merged before index build. 
" + "[%.0fs elapsed]", + now - eta_deadline, elapsed, + ) + elif stall_polls % self._STALL_LOG_EVERY == 0: + overall_rate = ( + prog.indexed_rows / elapsed + if elapsed > 0 else 0 + ) + logger.info( + "Building index: %.2f%% complete... " + "(%s/%s rows) | Pending rows: %s | " + "Overall rate: %.2f rows/sec | " + "No progress for %.0fs " + "[%.0fs elapsed]", + pct, + f"{prog.indexed_rows:,}", + f"{prog.total_rows:,}", + f"{prog.pending_rows:,}", + overall_rate, + stall_polls * interval, + elapsed, + ) + # ---------- status-only (ES / pgvector-style) ---------- + else: + status_str = prog.status or "waiting" + logger.info( + "Waiting for index on '%s' … (status: %s) " + "[%.0fs elapsed]", + name, status_str, elapsed, + ) + except Exception as exc: + logger.warning("Index progress check failed: %s", exc) + + if timeout > 0 and (time.time() - start) > timeout: + raise TimeoutError( + f"Index build did not finish within {timeout}s" + ) + time.sleep(interval) diff --git a/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/README.md b/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/README.md new file mode 100644 index 00000000..df947b17 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/README.md @@ -0,0 +1,210 @@ +# Elasticsearch Backend + +Adapter for [Elasticsearch](https://www.elastic.co/elasticsearch/) 8.x+ +with native dense-vector kNN search. + +## Requirements + +```bash +pip install elasticsearch +``` + +A running Elasticsearch 8.x cluster is required. The backend uses the +[kNN search API](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html) +introduced in Elasticsearch 8.0. + +## Connection + +| Parameter | Env Variable | Default | Description | +|-----------|-------------|---------|-------------| +| `host` | `ELASTICSEARCH__HOST` | `http://localhost:9200` | Elasticsearch server URL | +| `api_key` | `ELASTICSEARCH__API_KEY` | *(none)* | API key for authentication (optional) | +| `cloud_id` | `ELASTICSEARCH__CLOUD_ID` | *(none)* | Elastic Cloud deployment ID (optional, alternative to `host`) | + +Connection precedence: +1. If `cloud_id` is set, connect via Elastic Cloud with optional `api_key`. +2. If only `api_key` is set, connect to `host` with API key authentication. +3. Otherwise, connect to `host` without authentication. + +## Supported Indexes + +### HNSW + +Default dense-vector index type in Elasticsearch 8.x. Segments are built +during refresh/merge operations. + +| Build Parameter | Type | Default | Description | +|----------------|------|---------|-------------| +| `m` | int | 16 | Max connections per node. Higher values improve recall at the cost of memory | +| `ef_construction` | int | 100 | Search width during index construction | + +| Search Parameter | Type | Default | Description | +|-----------------|------|---------|-------------| +| `num_candidates` | int | 100 | Candidate vectors to consider per shard during kNN search | + +### FLAT + +Brute-force exact search via Elasticsearch's flat index type. Perfect +recall but O(n) per query. No build or search parameters. 
+ +## Supported Metrics + +| Metric | ES Similarity | Notes | +|--------|--------------|-------| +| `COSINE` | `cosine` | Default | +| `L2` | `l2_norm` | Euclidean distance | +| `IP` | `dot_product` | Inner product | + +## Class Structure + +``` +ElasticsearchBackend(VectorDBBackend) +│ +│ # Lifecycle +├── connect(host, **kwargs) +├── disconnect() +│ +│ # Collection (index) management +├── create_collection(name, dimension, metric_type, index_type, +│ index_params, num_shards, force) +├── collection_exists(name) -> bool +├── drop_collection(name) +│ +│ # Data ingestion +├── insert_batch(name, ids, vectors) -> int +├── flush(name) # triggers ES refresh +│ +│ # Search +├── search(name, query_vectors, top_k, search_params) +│ +│ # Status (implements abstract) +├── row_count(name) -> int +├── get_index_progress(name) -> IndexProgress +│ +│ # Optional +└── load_collection(name) # no-op +``` + +### Index Mapping + +Each Elasticsearch index is created with a single `dense_vector` field: + +```json +{ + "mappings": { + "properties": { + "vector": { + "type": "dense_vector", + "dims": 1536, + "similarity": "cosine", + "index": true, + "index_options": { + "type": "hnsw", + "m": 16, + "ef_construction": 200 + } + } + } + }, + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0 + } +} +``` + +Document IDs are stored as the Elasticsearch `_id` field (string +representation of the int64 primary key). + +### Data Ingestion + +`insert_batch()` uses the Elasticsearch +[Bulk API](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html) +with `refresh=False` for maximum throughput. Partial failures are logged +as warnings and the count of successfully inserted documents is returned. + +### Flush / Refresh + +`flush()` calls `indices.refresh()` which forces Elasticsearch to make +all recently indexed documents searchable. This is distinct from the +Elasticsearch "flush" API (which syncs the translog to disk). + +### Index Progress + +Elasticsearch builds HNSW segments during refresh/merge, so there is no +separate "index build" phase to monitor. `get_index_progress()` checks +cluster health for the index: + +- **yellow** or **green** = ready (`IndexProgress(is_ready=True)`) +- **red** = not ready, the base-class `wait_for_index()` continues polling + +The base-class progress log shows the simpler status-only format: + +``` +Waiting for index on 'bench_1m_hnsw' ... (status: yellow) [5s elapsed] +``` + +### Search + +Each query is sent individually via the kNN search API: + +```python +client.search( + index=name, + knn={ + "field": "vector", + "query_vector": [...], + "k": top_k, + "num_candidates": 100, # from search_params + }, + size=top_k, + _source=False, +) +``` + +The `num_candidates` parameter controls the per-shard candidate pool +size. Higher values improve recall at the cost of latency. + +### Load Collection + +`load_collection()` is a no-op. Elasticsearch indexes are always +queryable once refreshed -- there is no separate "load into memory" step. 
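As described under "Data Ingestion" above, each batch is sent through the Bulk
API as alternating action and document lines. A condensed sketch of how one
batch is assembled (`bulk_actions` is an illustrative helper, not part of the
adapter's public API; `ids` and `vectors` are the NumPy arrays passed to
`insert_batch()`):

```python
def bulk_actions(index_name, ids, vectors):
    """Build the action/source pairs for one client.bulk() call."""
    actions = []
    for pk, vec in zip(ids, vectors):
        # Primary keys are stored as the string _id of each document.
        actions.append({"index": {"_index": index_name, "_id": str(int(pk))}})
        actions.append({"vector": vec.tolist()})
    return actions

# client.bulk(operations=bulk_actions("bench_1m_hnsw", ids, vectors), refresh=False)
```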
+ +## Example YAML Config + +```yaml +backend: elasticsearch +mode: both + +database: + host: http://localhost:9200 + # api_key: "" # set via ELASTICSEARCH__API_KEY env var + # cloud_id: "" # set via ELASTICSEARCH__CLOUD_ID env var + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + m: 16 + ef_construction: 200 + +search: + search_k: 10 + search_params: + num_candidates: 128 +``` + +## Files + +| File | Purpose | +|------|---------| +| `__init__.py` | `backend_descriptor()` -- registers the backend with supported indexes, metrics, and connection params | +| `backend.py` | `ElasticsearchBackend` -- full implementation of `VectorDBBackend` | diff --git a/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/__init__.py b/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/__init__.py new file mode 100644 index 00000000..3badd5af --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/__init__.py @@ -0,0 +1,103 @@ +"""Elasticsearch backend package. + +Exposes :class:`ElasticsearchBackend` and :func:`backend_descriptor` for +automatic registration by the backend registry. + +Requires the ``elasticsearch`` Python package:: + + pip install elasticsearch +""" + +from ..base import BackendDescriptor, IndexDescriptor, ParamDescriptor +from .backend import ElasticsearchBackend + +__all__ = ["ElasticsearchBackend", "backend_descriptor"] + + +def backend_descriptor() -> BackendDescriptor: + """Return the capability descriptor for the Elasticsearch backend.""" + return BackendDescriptor( + name="elasticsearch", + display_name="Elasticsearch", + description=( + "Elasticsearch with dense vector support for approximate and " + "exact k-nearest-neighbor search. Uses the kNN search API " + "introduced in Elasticsearch 8.x with HNSW and brute-force " + "(exact) retrieval. Requires a running Elasticsearch cluster " + "and the elasticsearch-py Python package." + ), + backend_class=ElasticsearchBackend, + supported_metrics=["COSINE", "L2", "IP"], + supported_indexes=[ + IndexDescriptor( + name="HNSW", + description=( + "Hierarchical Navigable Small World graph index. " + "Default dense-vector index type in Elasticsearch 8.x." + ), + build_params=[ + ParamDescriptor( + name="m", + description=( + "Max number of connections per node. Higher " + "values improve recall at the cost of memory." + ), + type="int", + default=16, + ), + ParamDescriptor( + name="ef_construction", + description=( + "Search width during index construction. " + "Higher values improve recall at the cost of " + "build time." + ), + type="int", + default=100, + ), + ], + search_params=[ + ParamDescriptor( + name="num_candidates", + description=( + "Number of candidate vectors to consider per " + "shard during kNN search. Higher values improve " + "recall at the cost of latency." + ), + type="int", + default=100, + ), + ], + ), + IndexDescriptor( + name="FLAT", + description=( + "Brute-force exact search via script_score queries. " + "Perfect recall but O(n) per query." + ), + build_params=[], + search_params=[], + ), + ], + connection_params=[ + ParamDescriptor( + name="host", + description="Elasticsearch server URL (e.g. 
http://localhost:9200).", + type="str", + default="http://localhost:9200", + ), + ParamDescriptor( + name="api_key", + description="API key for authentication (optional).", + type="str", + default=None, + ), + ParamDescriptor( + name="cloud_id", + description="Elastic Cloud deployment ID (optional, alternative to host).", + type="str", + default=None, + ), + ], + active=True, + ) diff --git a/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/backend.py b/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/backend.py new file mode 100644 index 00000000..dc1012a0 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/elasticsearch/backend.py @@ -0,0 +1,343 @@ +"""Elasticsearch implementation of :class:`VectorDBBackend`. + +This wraps the ``elasticsearch`` Python client behind the abstract backend +interface. The implementation targets Elasticsearch 8.x dense-vector +fields with native kNN search. + +Requirements:: + + pip install elasticsearch +""" + +from __future__ import annotations + +import logging +import time +from typing import Any, Dict, List, Optional + +import numpy as np + +from ..base import CollectionInfo, IndexProgress, VectorDBBackend + +logger = logging.getLogger(__name__) + +# Elasticsearch similarity names mapped from our canonical metric names. +_METRIC_TO_ES_SIMILARITY: Dict[str, str] = { + "COSINE": "cosine", + "L2": "l2_norm", + "IP": "dot_product", +} + + +class ElasticsearchBackend(VectorDBBackend): + """Concrete backend for Elasticsearch (8.x+ with dense vectors).""" + + def __init__(self) -> None: + self._client = None # type: Any # elasticsearch.Elasticsearch + self._index_meta: Dict[str, Dict[str, Any]] = {} # name -> {metric, dim, …} + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + def connect( + self, + host: str = "http://localhost:9200", + **kwargs, + ) -> None: + from elasticsearch import Elasticsearch + + api_key = kwargs.get("api_key") + cloud_id = kwargs.get("cloud_id") + + if cloud_id: + self._client = Elasticsearch(cloud_id=cloud_id, api_key=api_key) + elif api_key: + self._client = Elasticsearch(host, api_key=api_key) + else: + self._client = Elasticsearch(host) + + info = self._client.info() + logger.info( + "Connected to Elasticsearch %s at %s", + info["version"]["number"], + host, + ) + + def disconnect(self) -> None: + if self._client is not None: + self._client.close() + self._client = None + self._index_meta.clear() + logger.info("Disconnected from Elasticsearch") + + # ------------------------------------------------------------------ + # Collection (index) management + # ------------------------------------------------------------------ + def create_collection( + self, + name: str, + dimension: int, + metric_type: str = "COSINE", + index_type: str = "HNSW", + index_params: Optional[Dict[str, Any]] = None, + num_shards: int = 1, + force: bool = False, + ) -> CollectionInfo: + if self.collection_exists(name): + if force: + self.drop_collection(name) + else: + raise ValueError( + f"Index '{name}' already exists. Use force=True to drop it." 
+ ) + + params = index_params or {} + similarity = _METRIC_TO_ES_SIMILARITY.get(metric_type.upper(), "cosine") + + # Build the dense_vector mapping + vector_field: Dict[str, Any] = { + "type": "dense_vector", + "dims": dimension, + "similarity": similarity, + } + + if index_type.upper() == "HNSW": + vector_field["index"] = True + vector_field["index_options"] = { + "type": "hnsw", + "m": params.get("m", 16), + "ef_construction": params.get("ef_construction", 100), + } + elif index_type.upper() == "FLAT": + vector_field["index"] = True + vector_field["index_options"] = { + "type": "flat", + } + else: + # Default to HNSW for unknown types + logger.warning( + "Unknown index type '%s'; falling back to HNSW", index_type + ) + vector_field["index"] = True + vector_field["index_options"] = {"type": "hnsw"} + + mappings = { + "properties": { + "vector": vector_field, + } + } + settings = { + "number_of_shards": num_shards, + "number_of_replicas": 0, + } + + self._client.indices.create( + index=name, + mappings=mappings, + settings=settings, + ) + logger.info( + "Created index '%s' (%d-d, %s, %s, %d shards)", + name, dimension, similarity, index_type, num_shards, + ) + + self._index_meta[name] = { + "dimension": dimension, + "metric_type": metric_type, + "index_type": index_type, + "similarity": similarity, + } + + return CollectionInfo( + name=name, + dimension=dimension, + metric_type=metric_type, + index_type=index_type, + row_count=0, + extra={"index_params": params, "similarity": similarity}, + ) + + def collection_exists(self, name: str) -> bool: + return self._client.indices.exists(index=name).body + + def drop_collection(self, name: str) -> None: + if self.collection_exists(name): + self._client.indices.delete(index=name) + self._index_meta.pop(name, None) + logger.info("Deleted index: %s", name) + + # ------------------------------------------------------------------ + # Data ingestion + # ------------------------------------------------------------------ + def insert_batch( + self, + name: str, + ids: np.ndarray, + vectors: np.ndarray, + ) -> int: + actions = [] + for i in range(len(ids)): + actions.append({"index": {"_index": name, "_id": str(int(ids[i]))}}) + actions.append({"vector": vectors[i].tolist()}) + + resp = self._client.bulk(operations=actions, refresh=False) + if resp.get("errors"): + failed = sum( + 1 for item in resp["items"] + if item.get("index", {}).get("error") + ) + logger.warning("Bulk insert had %s errors", f"{failed:,}") + return len(ids) - failed + return len(ids) + + def flush(self, name: str) -> None: + t0 = time.time() + self._client.indices.refresh(index=name) + logger.info("Refresh completed in %.2f s", time.time() - t0) + + # ------------------------------------------------------------------ + # Search + # ------------------------------------------------------------------ + def search( + self, + name: str, + query_vectors: np.ndarray, + top_k: int, + search_params: Optional[Dict[str, Any]] = None, + ) -> List[List[int]]: + params = search_params or {} + num_candidates = params.get("num_candidates", 100) + + results: List[List[int]] = [] + for qvec in query_vectors: + resp = self._client.search( + index=name, + knn={ + "field": "vector", + "query_vector": qvec.tolist(), + "k": top_k, + "num_candidates": num_candidates, + }, + size=top_k, + _source=False, + ) + ids = [int(hit["_id"]) for hit in resp["hits"]["hits"]] + results.append(ids) + + return results + + # ------------------------------------------------------------------ + # Status / info + # 
------------------------------------------------------------------ + def row_count(self, name: str) -> int: + self._client.indices.refresh(index=name) + resp = self._client.count(index=name) + return resp["count"] + + def get_index_progress(self, name: str) -> IndexProgress: + """Check Elasticsearch cluster health for this index. + + Elasticsearch builds HNSW segments during refresh/merge, so + after a bulk ingest + refresh the index is queryable. Health + status of *yellow* or *green* means the index is ready. + """ + health = self._client.cluster.health( + index=name, wait_for_status="yellow", timeout="5s" + ) + status = health["status"] + is_ready = status in ("yellow", "green") + return IndexProgress(is_ready=is_ready, status=status) + + # ------------------------------------------------------------------ + # Optional: load_collection (no-op for Elasticsearch) + # ------------------------------------------------------------------ + def load_collection(self, name: str) -> None: + """No-op -- Elasticsearch indexes are always queryable once refreshed.""" + logger.debug("load_collection is a no-op for Elasticsearch") + + # ------------------------------------------------------------------ + # Administration / introspection + # ------------------------------------------------------------------ + def list_collections(self) -> List[str]: + resp = self._client.cat.indices(format="json") + return sorted( + entry["index"] + for entry in resp + if not entry["index"].startswith(".") + ) + + def get_collection_info(self, name: str) -> Dict[str, Any]: + mapping = self._client.indices.get_mapping(index=name) + props = mapping[name]["mappings"].get("properties", {}) + + # Parse vector field + dimension = None + metric_type = None + index_type = None + schema: List[Dict[str, Any]] = [] + for field_name, field_def in props.items(): + entry: Dict[str, Any] = { + "name": field_name, + "dtype": field_def.get("type", "unknown"), + } + if field_def.get("type") == "dense_vector": + dimension = field_def.get("dims") + entry["dim"] = dimension + # Reverse-map similarity back to our canonical metric + sim = field_def.get("similarity", "") + for canonical, es_sim in _METRIC_TO_ES_SIMILARITY.items(): + if es_sim == sim: + metric_type = canonical + break + idx_opts = field_def.get("index_options", {}) + index_type = idx_opts.get("type", "hnsw").upper() + schema.append(entry) + + row_count = self.row_count(name) + + return { + "name": name, + "row_count": row_count, + "dimension": dimension, + "metric_type": metric_type, + "index_type": index_type, + "schema": schema, + } + + def list_indexes(self, name: str) -> List[Dict[str, Any]]: + mapping = self._client.indices.get_mapping(index=name) + props = mapping[name]["mappings"].get("properties", {}) + + results: List[Dict[str, Any]] = [] + for field_name, field_def in props.items(): + if field_def.get("type") != "dense_vector": + continue + idx_opts = field_def.get("index_options", {}) + results.append({ + "index_name": field_name, + "field_name": field_name, + "index_type": idx_opts.get("type", "hnsw").upper(), + "similarity": field_def.get("similarity", ""), + "params": { + k: v for k, v in idx_opts.items() if k != "type" + }, + }) + return results + + def get_collection_stats(self, name: str) -> Dict[str, Any]: + stats = self._client.indices.stats(index=name) + idx_stats = stats["indices"].get(name, {}).get("primaries", {}) + docs = idx_stats.get("docs", {}) + store = idx_stats.get("store", {}) + health = self._client.cluster.health(index=name) + return { + "name": 
name, + "row_count": docs.get("count", 0), + "deleted_docs": docs.get("deleted", 0), + "store_size_bytes": store.get("size_in_bytes", 0), + "index_ready": health["status"] in ("yellow", "green"), + "index_status": health["status"], + "indexed_rows": 0, + "total_rows": 0, + "pending_rows": 0, + } diff --git a/vdb_benchmark/vdbbench/benchmark/backends/milvus/README.md b/vdb_benchmark/vdbbench/benchmark/backends/milvus/README.md new file mode 100644 index 00000000..11ac7455 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/milvus/README.md @@ -0,0 +1,186 @@ +# Milvus Backend + +Adapter for [Milvus](https://milvus.io/) / [Zilliz Cloud](https://zilliz.com/) +-- an open-source vector database built for scalable similarity search. + +## Requirements + +```bash +pip install pymilvus +``` + +A running Milvus server (standalone or cluster) is required. See the +[Milvus quickstart](https://milvus.io/docs/install_standalone-docker.md) +for Docker-based setup. + +## Connection + +| Parameter | Env Variable | Default | Description | +|-----------|-------------|---------|-------------| +| `host` | `MILVUS__HOST` | `127.0.0.1` | Milvus server hostname or IP | +| `port` | `MILVUS__PORT` | `19530` | Milvus gRPC port | +| `max_message_length` | `MILVUS__MAX_MESSAGE_LENGTH` | `514983574` | Max gRPC message size in bytes (~491 MB) | + +Connection uses the `pymilvus.connections.connect()` API with the +`"default"` alias. The `max_message_length` parameter controls both +`max_receive_message_length` and `max_send_message_length` on the gRPC +channel. + +## Supported Indexes + +### HNSW + +Hierarchical Navigable Small World graph index. Good general-purpose choice +balancing recall and speed. + +| Build Parameter | Type | Default | Description | +|----------------|------|---------|-------------| +| `M` | int | 16 | Max connections per node | +| `efConstruction` | int | 200 | Search width during index construction | + +| Search Parameter | Type | Default | Description | +|-----------------|------|---------|-------------| +| `ef` | int | 128 | Search width at query time (higher = better recall) | + +### DiskANN + +Microsoft DiskANN -- SSD-friendly graph index for large-scale datasets +that exceed RAM. + +| Build Parameter | Type | Default | Description | +|----------------|------|---------|-------------| +| `MaxDegree` | int | 64 | Maximum out-degree of each graph node | +| `SearchListSize` | int | 200 | Candidate-list size during index build | + +| Search Parameter | Type | Default | Description | +|-----------------|------|---------|-------------| +| `search_list` | int | 200 | Candidate-list size at query time | + +### AISAQ + +Approximate Inference with Scalar and Additive Quantization -- a +compressed index format. + +| Build Parameter | Type | Default | Description | +|----------------|------|---------|-------------| +| `inline_pq` | int | 16 | Product-quantization sub-vector count | +| `max_degree` | int | 32 | Maximum out-degree of each graph node | +| `search_list_size` | int | 100 | Candidate-list size during build | + +No search-time parameters. + +### FLAT + +Brute-force exact search. Perfect recall but O(n) per query. No +build or search parameters. 
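Internally the adapter wraps the build parameters from the tables above into
the structure `pymilvus` expects and passes it to `create_index()`. A condensed
sketch for HNSW with the default values (see `_build_index_params` in
`backend.py`):

```python
index_params = {
    "index_type": "HNSW",
    "metric_type": "COSINE",
    "params": {"M": 16, "efConstruction": 200},
}
# collection.create_index("vector", index_params)
```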
+ +## Supported Metrics + +`COSINE`, `L2`, `IP` + +## Class Structure + +``` +MilvusBackend(VectorDBBackend) +│ +│ # Lifecycle +├── connect(host, port, **kwargs) +├── disconnect() +│ +│ # Collection management +├── create_collection(name, dimension, metric_type, index_type, +│ index_params, num_shards, force) +├── collection_exists(name) -> bool +├── drop_collection(name) +│ +│ # Data ingestion +├── insert_batch(name, ids, vectors) -> int +├── flush(name) +├── compact(name) # overrides base no-op +│ +│ # Search +├── search(name, query_vectors, top_k, search_params) +│ +│ # Status (implements abstract) +├── row_count(name) -> int +├── get_index_progress(name) -> IndexProgress +│ +│ # Internal helpers +├── _get_collection(name) -> Collection # lazy pymilvus Collection cache +└── _build_index_params(index_type, metric_type, params) -> dict +``` + +### Schema + +Every collection uses a fixed two-field schema: + +| Field | Type | Notes | +|-------|------|-------| +| `id` | `INT64` | Primary key, not auto-generated | +| `vector` | `FLOAT_VECTOR` | Dimensionality set at creation | + +### Compaction + +Milvus is the only backend that overrides `compact()`. After batch +inserts, Milvus may have many small segments that slow down index +building. `compact()` calls `Collection.compact()` followed by +`Collection.wait_for_compaction_completed()` to merge segments before +the index build begins. + +### Index Progress + +`get_index_progress()` calls `pymilvus.utility.index_building_progress()` +which returns `total_rows`, `indexed_rows`, and `pending_index_rows`. +These feed into the base-class `wait_for_index()` progress logging with +percentage, rates, and ETA. + +### Search Parameter Handling + +The `search()` method accepts `search_params` in two formats: + +1. **Raw keys** (preferred from YAML configs): `{"ef": 128}` -- wrapped + automatically into the `{"metric_type": ..., "params": {...}}` structure + that `pymilvus` expects. +2. **pymilvus format**: `{"metric_type": "COSINE", "params": {"ef": 128}}` + -- passed through as-is. + +## Example YAML Config + +```yaml +backend: milvus +mode: both + +database: + host: 127.0.0.1 + port: 19530 + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + M: 64 + efConstruction: 200 + +search: + search_k: 10 + search_params: + ef: 128 + +workflow: + compact: true +``` + +## Files + +| File | Purpose | +|------|---------| +| `__init__.py` | `backend_descriptor()` -- registers the backend with supported indexes, metrics, and connection params | +| `backend.py` | `MilvusBackend` -- full implementation of `VectorDBBackend` | diff --git a/vdb_benchmark/vdbbench/benchmark/backends/milvus/__init__.py b/vdb_benchmark/vdbbench/benchmark/backends/milvus/__init__.py new file mode 100644 index 00000000..da6b53e9 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/milvus/__init__.py @@ -0,0 +1,144 @@ +"""Milvus backend package. + +Exposes :class:`MilvusBackend` and :func:`backend_descriptor` for +automatic registration by the backend registry. 
+""" + +from ..base import BackendDescriptor, IndexDescriptor, ParamDescriptor +from .backend import MilvusBackend + +__all__ = ["MilvusBackend", "backend_descriptor"] + + +def backend_descriptor() -> BackendDescriptor: + """Return the capability descriptor for the Milvus backend.""" + return BackendDescriptor( + name="milvus", + display_name="Milvus", + description=( + "Open-source vector database built for scalable similarity " + "search. Supports HNSW, DiskANN, AISAQ, and FLAT index types " + "with COSINE, L2, and IP distance metrics. Requires a running " + "Milvus server (standalone or cluster) and the pymilvus Python " + "package." + ), + backend_class=MilvusBackend, + supported_metrics=["COSINE", "L2", "IP"], + supported_indexes=[ + IndexDescriptor( + name="HNSW", + description=( + "Hierarchical Navigable Small World graph index. " + "Good general-purpose choice balancing recall and speed." + ), + build_params=[ + ParamDescriptor( + name="M", + description="Max number of connections per node.", + type="int", + default=16, + ), + ParamDescriptor( + name="efConstruction", + description="Search width during index construction.", + type="int", + default=200, + ), + ], + search_params=[ + ParamDescriptor( + name="ef", + description="Search width at query time (higher = better recall).", + type="int", + default=128, + ), + ], + ), + IndexDescriptor( + name="DISKANN", + description=( + "Microsoft DiskANN -- SSD-friendly graph index for " + "large-scale datasets that exceed RAM." + ), + build_params=[ + ParamDescriptor( + name="MaxDegree", + description="Maximum out-degree of each graph node.", + type="int", + default=64, + ), + ParamDescriptor( + name="SearchListSize", + description="Candidate-list size during index build.", + type="int", + default=200, + ), + ], + search_params=[ + ParamDescriptor( + name="search_list", + description="Candidate-list size at query time.", + type="int", + default=200, + ), + ], + ), + IndexDescriptor( + name="AISAQ", + description=( + "Approximate Inference with Scalar and Additive " + "Quantization -- a compressed index format." + ), + build_params=[ + ParamDescriptor( + name="inline_pq", + description="Product-quantization sub-vector count.", + type="int", + default=16, + ), + ParamDescriptor( + name="max_degree", + description="Maximum out-degree of each graph node.", + type="int", + default=32, + ), + ParamDescriptor( + name="search_list_size", + description="Candidate-list size during build.", + type="int", + default=100, + ), + ], + search_params=[], + ), + IndexDescriptor( + name="FLAT", + description=( + "Brute-force exact search (no indexing). " + "Perfect recall but O(n) per query." + ), + build_params=[], + search_params=[], + ), + ], + connection_params=[ + ParamDescriptor( + name="host", + description="Milvus server hostname or IP.", + type="str", + default="127.0.0.1", + ), + ParamDescriptor( + name="port", + description="Milvus gRPC port.", + type="str", + default="19530", + ), + ParamDescriptor( + name="max_message_length", + description="Max gRPC message size in bytes.", + type="int", + default=514_983_574, + ), + ], + ) diff --git a/vdb_benchmark/vdbbench/benchmark/backends/milvus/backend.py b/vdb_benchmark/vdbbench/benchmark/backends/milvus/backend.py new file mode 100644 index 00000000..f21a7aaf --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/milvus/backend.py @@ -0,0 +1,314 @@ +"""Milvus implementation of :class:`VectorDBBackend`. 
+ +This wraps ``pymilvus`` behind the abstract backend interface so the +benchmark pipeline is completely database-agnostic. The implementation +mirrors the conventions used by the existing ``load_vdb.py`` script +(schema, index params, connection options). +""" + +from __future__ import annotations + +import logging +import time +from typing import Any, Dict, List, Optional + +import numpy as np +from pymilvus import ( + Collection, + CollectionSchema, + DataType, + FieldSchema, + connections, + utility, +) + +from ..base import CollectionInfo, IndexProgress, VectorDBBackend + +logger = logging.getLogger(__name__) + + +class MilvusBackend(VectorDBBackend): + """Concrete backend for Milvus / Zilliz Cloud.""" + + def __init__(self) -> None: + self._collections: Dict[str, Collection] = {} + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + def connect( + self, + host: str = "127.0.0.1", + port: str = "19530", + **kwargs, + ) -> None: + max_msg = kwargs.get("max_message_length", 514_983_574) + connections.connect( + "default", + host=host, + port=port, + max_receive_message_length=max_msg, + max_send_message_length=max_msg, + ) + logger.info("Connected to Milvus at %s:%s", host, port) + + def disconnect(self) -> None: + connections.disconnect("default") + self._collections.clear() + logger.info("Disconnected from Milvus") + + # ------------------------------------------------------------------ + # Collection helpers + # ------------------------------------------------------------------ + def _get_collection(self, name: str) -> Collection: + if name not in self._collections: + self._collections[name] = Collection(name=name) + return self._collections[name] + + @staticmethod + def _build_index_params( + index_type: str, + metric_type: str, + params: Optional[Dict[str, Any]], + ) -> Dict[str, Any]: + params = params or {} + ip: Dict[str, Any] = { + "index_type": index_type, + "metric_type": metric_type, + "params": {}, + } + if index_type == "HNSW": + ip["params"] = { + "M": params.get("M", 16), + "efConstruction": params.get("efConstruction", 200), + } + elif index_type == "DISKANN": + ip["params"] = { + "MaxDegree": params.get("MaxDegree", 64), + "SearchListSize": params.get("SearchListSize", 200), + } + elif index_type == "AISAQ": + ip["params"] = { + "inline_pq": params.get("inline_pq", 16), + "max_degree": params.get("max_degree", 32), + "search_list_size": params.get("search_list_size", 100), + } + elif index_type == "FLAT": + pass # no extra params + else: + ip["params"] = params + return ip + + # ------------------------------------------------------------------ + # Collection management + # ------------------------------------------------------------------ + def create_collection( + self, + name: str, + dimension: int, + metric_type: str = "COSINE", + index_type: str = "HNSW", + index_params: Optional[Dict[str, Any]] = None, + num_shards: int = 1, + force: bool = False, + ) -> CollectionInfo: + if utility.has_collection(name): + if force: + Collection(name=name).drop() + logger.info("Dropped existing collection: %s", name) + else: + raise ValueError( + f"Collection '{name}' already exists. Use force=True to drop it." 
+ ) + + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, + is_primary=True, auto_id=False), + FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension), + ] + schema = CollectionSchema(fields, description="Benchmark Collection") + col = Collection(name=name, schema=schema, num_shards=num_shards) + logger.info("Created collection '%s' (%s-d, %s shards)", name, f"{dimension:,}", num_shards) + + ip = self._build_index_params(index_type, metric_type, index_params) + col.create_index("vector", ip) + logger.info("Index created: %s / %s", index_type, metric_type) + + self._collections[name] = col + return CollectionInfo( + name=name, + dimension=dimension, + metric_type=metric_type, + index_type=index_type, + row_count=0, + extra={"index_params": ip}, + ) + + def collection_exists(self, name: str) -> bool: + return utility.has_collection(name) + + def drop_collection(self, name: str) -> None: + if utility.has_collection(name): + Collection(name=name).drop() + self._collections.pop(name, None) + logger.info("Dropped collection: %s", name) + + # ------------------------------------------------------------------ + # Data ingestion + # ------------------------------------------------------------------ + def insert_batch( + self, + name: str, + ids: np.ndarray, + vectors: np.ndarray, + ) -> int: + col = self._get_collection(name) + col.insert([ids.tolist(), vectors]) + return len(ids) + + def flush(self, name: str) -> None: + col = self._get_collection(name) + t0 = time.time() + col.flush() + logger.info("Flush completed in %.2f s", time.time() - t0) + + def compact(self, name: str) -> None: + """Trigger Milvus segment compaction and block until done.""" + col = self._get_collection(name) + logger.info("Triggering compaction for '%s' ...", name) + t0 = time.time() + col.compact() + col.wait_for_compaction_completed() + elapsed = time.time() - t0 + logger.info("Compaction completed in %.2f s", elapsed) + + # ------------------------------------------------------------------ + # Search + # ------------------------------------------------------------------ + def search( + self, + name: str, + query_vectors: np.ndarray, + top_k: int, + search_params: Optional[Dict[str, Any]] = None, + ) -> List[List[int]]: + col = self._get_collection(name) + col.load() + raw = search_params or {} + if "params" in raw: + # Already in pymilvus format (has metric_type + params wrapper) + sp = raw + else: + # Wrap raw keys into the structure pymilvus expects + sp = { + "metric_type": raw.get("metric_type", "COSINE"), + "params": {k: v for k, v in raw.items() + if k != "metric_type"}, + } + results = col.search( + data=query_vectors.tolist(), + anns_field="vector", + param=sp, + limit=top_k, + ) + return [[hit.id for hit in hits] for hits in results] + + # ------------------------------------------------------------------ + # Status / info + # ------------------------------------------------------------------ + def row_count(self, name: str) -> int: + col = self._get_collection(name) + col.flush() + return col.num_entities + + def get_index_progress(self, name: str) -> IndexProgress: + """Query Milvus ``index_building_progress`` and return a snapshot.""" + progress = utility.index_building_progress(name) + total = progress.get("total_rows", 0) + indexed = progress.get("indexed_rows", 0) + pending = progress.get("pending_index_rows", 0) + is_ready = total > 0 and indexed >= total and pending == 0 + return IndexProgress( + is_ready=is_ready, + total_rows=total, + indexed_rows=indexed, + 
pending_rows=pending, + ) + + # ------------------------------------------------------------------ + # Administration / introspection + # ------------------------------------------------------------------ + def list_collections(self) -> List[str]: + return utility.list_collections() + + def get_collection_info(self, name: str) -> Dict[str, Any]: + col = self._get_collection(name) + col.flush() + + # Extract schema fields + schema = [] + dimension = None + for field in col.schema.fields: + entry: Dict[str, Any] = { + "name": field.name, + "dtype": field.dtype.name if hasattr(field.dtype, "name") else str(field.dtype), + "is_primary": field.is_primary, + } + if field.params.get("dim"): + entry["dim"] = field.params["dim"] + dimension = field.params["dim"] + schema.append(entry) + + # Extract index info + index_type = None + metric_type = None + if col.indexes: + idx = col.indexes[0] + index_type = idx.params.get("index_type") + metric_type = idx.params.get("metric_type") + + return { + "name": name, + "row_count": col.num_entities, + "dimension": dimension, + "metric_type": metric_type, + "index_type": index_type, + "schema": schema, + "num_partitions": len(col.partitions), + "partitions": [p.name for p in col.partitions], + } + + def list_indexes(self, name: str) -> List[Dict[str, Any]]: + col = self._get_collection(name) + results: List[Dict[str, Any]] = [] + for idx in col.indexes: + results.append({ + "index_name": idx.field_name, + "field_name": idx.field_name, + "index_type": idx.params.get("index_type", "UNKNOWN"), + "metric_type": idx.params.get("metric_type", "UNKNOWN"), + "params": idx.params.get("params", {}), + }) + return results + + def drop_index(self, name: str, index_name: Optional[str] = None) -> None: + col = self._get_collection(name) + field = index_name or "vector" + col.drop_index(field_name=field) + logger.info("Dropped index on field '%s' from '%s'", field, name) + + def get_collection_stats(self, name: str) -> Dict[str, Any]: + col = self._get_collection(name) + col.flush() + prog = self.get_index_progress(name) + stats: Dict[str, Any] = { + "name": name, + "row_count": col.num_entities, + "index_ready": prog.is_ready, + "index_status": prog.status, + "indexed_rows": prog.indexed_rows, + "total_rows": prog.total_rows, + "pending_rows": prog.pending_rows, + "num_partitions": len(col.partitions), + } + return stats diff --git a/vdb_benchmark/vdbbench/benchmark/backends/pgvector/README.md b/vdb_benchmark/vdbbench/benchmark/backends/pgvector/README.md new file mode 100644 index 00000000..f50c2a4a --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/pgvector/README.md @@ -0,0 +1,182 @@ +# pgvector Backend + +Adapter for [pgvector](https://github.com/pgvector/pgvector) -- a PostgreSQL +extension for vector similarity search using standard SQL. + +## Requirements + +```bash +pip install psycopg2-binary pgvector +``` + +The target PostgreSQL server must have the `vector` extension installed: + +```sql +CREATE EXTENSION IF NOT EXISTS vector; +``` + +The backend runs this command automatically on `connect()`. 
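If you want to verify the setup by hand, the adapter's connection sequence can
be reproduced in a few lines (the connection values below are placeholders;
the benchmark normally does all of this for you inside `connect()`):

```python
import psycopg2
from pgvector.psycopg2 import register_vector

conn = psycopg2.connect(host="127.0.0.1", port=5432,
                        dbname="postgres", user="postgres", password="")
conn.autocommit = True                 # matches the adapter's behavior
register_vector(conn)                  # transparent NumPy <-> vector conversion
with conn.cursor() as cur:
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
```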
+ +## Connection + +| Parameter | Env Variable | Default | Description | +|-----------|-------------|---------|-------------| +| `host` | `PGVECTOR__HOST` | `127.0.0.1` | PostgreSQL server hostname or IP | +| `port` | `PGVECTOR__PORT` | `5432` | PostgreSQL server port | +| `dbname` | `PGVECTOR__DBNAME` | `postgres` | Database name | +| `user` | `PGVECTOR__USER` | `postgres` | Database user | +| `password` | `PGVECTOR__PASSWORD` | `""` | Database password | + +Connection uses `psycopg2.connect()` with `autocommit = True`. The +`pgvector.psycopg2.register_vector()` call enables transparent +NumPy-to-vector conversion. + +## Supported Indexes + +### HNSW + +Hierarchical Navigable Small World graph index. Built-in to +pgvector >= 0.5.0. + +| Build Parameter | Type | Default | Description | +|----------------|------|---------|-------------| +| `M` (or `m`) | int | 16 | Max connections per node | +| `efConstruction` (or `ef_construction`) | int | 200 | Search width during index construction | + +| Search Parameter | Type | Default | Description | +|-----------------|------|---------|-------------| +| `ef_search` | int | 40 | Search width at query time. Set via `SET LOCAL hnsw.ef_search` | + +### IVFFLAT + +Inverted-file flat index. Partitions vectors into lists and searches a +subset. Lower build time than HNSW but typically lower recall at the same +speed. + +| Build Parameter | Type | Default | Description | +|----------------|------|---------|-------------| +| `lists` (or `nlist`) | int | 100 | Number of inverted-file lists (clusters) | + +| Search Parameter | Type | Default | Description | +|-----------------|------|---------|-------------| +| `probes` | int | 10 | Number of lists to probe at query time. Set via `SET LOCAL ivfflat.probes` | + +### FLAT + +No index -- exact brute-force sequential scan via PostgreSQL `ORDER BY`. +Perfect recall but O(n) per query. No build or search parameters. Selected +by setting `index_type: FLAT` (or `NONE`) in the config. + +## Supported Metrics + +| Metric | pgvector Operator | Operator Class | +|--------|-------------------|---------------| +| `COSINE` | `<=>` | `vector_cosine_ops` | +| `L2` | `<->` | `vector_l2_ops` | +| `IP` | `<#>` | `vector_ip_ops` | + +## Class Structure + +``` +PGVectorBackend(VectorDBBackend) +│ +│ # Lifecycle +├── connect(host, port, dbname, user, password, **kwargs) +├── disconnect() +│ +│ # Collection management +├── create_collection(name, dimension, metric_type, index_type, +│ index_params, num_shards, force) +├── collection_exists(name) -> bool +├── drop_collection(name) +│ +│ # Data ingestion +├── insert_batch(name, ids, vectors) -> int +├── flush(name) # no-op (autocommit) +│ +│ # Search +├── search(name, query_vectors, top_k, search_params) +│ +│ # Status (implements abstract) +├── row_count(name) -> int +├── get_index_progress(name) -> IndexProgress +│ +│ # Internal helpers +├── _cur() -> cursor # new cursor with connection check +├── _table(name) -> str # SQL-safe identifier quoting +├── _index_name(table, suffix) -> str # deterministic index name +└── _create_index(name, dim, metric, type, params) +``` + +### Schema + +Every table uses a fixed two-column schema: + +| Column | Type | Notes | +|--------|------|-------| +| `id` | `BIGINT PRIMARY KEY` | Not auto-generated | +| `vector` | `vector(dim)` | pgvector `vector` type with fixed dimensionality | + +### Synchronous Index Build + +Unlike Milvus, `CREATE INDEX` in PostgreSQL is **synchronous** -- the +call blocks until the index is fully built. 
As a result: + +- `get_index_progress()` simply checks `pg_indexes` for the table and + returns `IndexProgress(is_ready=True)` once an index exists. +- The base-class `wait_for_index()` typically completes on the first + poll since the index is already built by the time inserts finish. + +### Search Parameter Handling + +Search-time GUCs (`hnsw.ef_search`, `ivfflat.probes`) require a +transaction block. The `search()` method temporarily exits `autocommit` +mode, runs `SET LOCAL` inside a transaction, executes all queries, then +commits and restores `autocommit`. When no search-time parameters are +set, queries run directly without a transaction wrapper. + +### Flush + +`flush()` is a no-op because the connection runs in `autocommit = True` +mode -- every `INSERT` is committed immediately. + +## Example YAML Config + +```yaml +backend: pgvector +mode: both + +database: + host: 127.0.0.1 + port: 5432 + dbname: postgres + user: postgres + password: "" + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + m: 64 + ef_construction: 200 + +search: + search_k: 10 + search_params: + ef_search: 128 +``` + +## Files + +| File | Purpose | +|------|---------| +| `__init__.py` | `backend_descriptor()` -- registers the backend with supported indexes, metrics, and connection params | +| `backend.py` | `PGVectorBackend` -- full implementation of `VectorDBBackend` | diff --git a/vdb_benchmark/vdbbench/benchmark/backends/pgvector/__init__.py b/vdb_benchmark/vdbbench/benchmark/backends/pgvector/__init__.py new file mode 100644 index 00000000..b759ab78 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/pgvector/__init__.py @@ -0,0 +1,124 @@ +"""pgvector backend package. + +Exposes :class:`PGVectorBackend` and :func:`backend_descriptor` for +automatic registration by the backend registry. +""" + +from ..base import BackendDescriptor, IndexDescriptor, ParamDescriptor +from .backend import PGVectorBackend + +__all__ = ["PGVectorBackend", "backend_descriptor"] + + +def backend_descriptor() -> BackendDescriptor: + """Return the capability descriptor for the pgvector backend.""" + return BackendDescriptor( + name="pgvector", + display_name="pgvector (PostgreSQL)", + description=( + "PostgreSQL extension for vector similarity search. Uses " + "standard SQL with the pgvector extension for HNSW and IVFFlat " + "indexes. Supports COSINE, L2, and IP distance metrics. " + "Requires a PostgreSQL server with the vector extension " + "installed and the psycopg2-binary + pgvector Python packages." + ), + backend_class=PGVectorBackend, + supported_metrics=["COSINE", "L2", "IP"], + supported_indexes=[ + IndexDescriptor( + name="HNSW", + description=( + "Hierarchical Navigable Small World graph index. " + "Built-in to pgvector >= 0.5.0. Good general-purpose " + "choice balancing recall and speed." + ), + build_params=[ + ParamDescriptor( + name="M", + description="Max number of connections per node.", + type="int", + default=16, + ), + ParamDescriptor( + name="efConstruction", + description="Search width during index construction.", + type="int", + default=200, + ), + ], + search_params=[ + ParamDescriptor( + name="ef_search", + description="Search width at query time (higher = better recall).", + type="int", + default=40, + ), + ], + ), + IndexDescriptor( + name="IVFFLAT", + description=( + "Inverted-file flat index. 
Partitions vectors into " + "lists and searches a subset. Lower build time than " + "HNSW but typically lower recall at the same speed." + ), + build_params=[ + ParamDescriptor( + name="lists", + description="Number of inverted-file lists (clusters).", + type="int", + default=100, + ), + ], + search_params=[ + ParamDescriptor( + name="probes", + description="Number of lists to probe at query time.", + type="int", + default=10, + ), + ], + ), + IndexDescriptor( + name="FLAT", + description=( + "No index -- exact brute-force sequential scan. " + "Perfect recall but O(n) per query." + ), + build_params=[], + search_params=[], + ), + ], + connection_params=[ + ParamDescriptor( + name="host", + description="PostgreSQL server hostname or IP.", + type="str", + default="127.0.0.1", + ), + ParamDescriptor( + name="port", + description="PostgreSQL server port.", + type="str", + default="5432", + ), + ParamDescriptor( + name="dbname", + description="Database name to connect to.", + type="str", + default="postgres", + ), + ParamDescriptor( + name="user", + description="Database user.", + type="str", + default="postgres", + ), + ParamDescriptor( + name="password", + description="Database password.", + type="str", + default="", + ), + ], + ) diff --git a/vdb_benchmark/vdbbench/benchmark/backends/pgvector/backend.py b/vdb_benchmark/vdbbench/benchmark/backends/pgvector/backend.py new file mode 100644 index 00000000..c2c9d4b0 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/backends/pgvector/backend.py @@ -0,0 +1,439 @@ +"""pgvector (PostgreSQL) implementation of :class:`VectorDBBackend`. + +This wraps ``psycopg2`` and the ``pgvector`` extension behind the abstract +backend interface so the benchmark pipeline is completely database-agnostic. + +Requirements:: + + pip install psycopg2-binary pgvector + +The target PostgreSQL server must have the ``vector`` extension installed:: + + CREATE EXTENSION IF NOT EXISTS vector; +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional + +import numpy as np + +from ..base import CollectionInfo, IndexProgress, VectorDBBackend + +logger = logging.getLogger(__name__) + +# Mapping from the generic metric names used by the benchmark framework +# to the pgvector operator classes required by each index type. +_METRIC_TO_HNSW_OPS: Dict[str, str] = { + "L2": "vector_l2_ops", + "COSINE": "vector_cosine_ops", + "IP": "vector_ip_ops", +} + +_METRIC_TO_IVFFLAT_OPS: Dict[str, str] = { + "L2": "vector_l2_ops", + "COSINE": "vector_cosine_ops", + "IP": "vector_ip_ops", +} + +# The SQL distance operator used at query time for each metric. +_METRIC_TO_OPERATOR: Dict[str, str] = { + "L2": "<->", + "COSINE": "<=>", + "IP": "<#>", +} + + +class PGVectorBackend(VectorDBBackend): + """Concrete backend for PostgreSQL + pgvector.""" + + def __init__(self) -> None: + self._conn = None # type: Any # psycopg2 connection + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + def connect( + self, + host: str = "127.0.0.1", + port: str = "5432", + dbname: str = "postgres", + user: str = "postgres", + password: str = "", + **kwargs, + ) -> None: + import psycopg2 + from pgvector.psycopg2 import register_vector + + self._conn = psycopg2.connect( + host=host, + port=port, + dbname=dbname, + user=user, + password=password, + ) + self._conn.autocommit = True + register_vector(self._conn) + + # Ensure the vector extension exists. 
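+        # `CREATE EXTENSION IF NOT EXISTS` is idempotent, but it still needs
+        # the pgvector package installed on the server and a role with
+        # sufficient privileges; if either is missing, an administrator may
+        # have to run `CREATE EXTENSION vector;` once by hand.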
+ with self._conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS vector") + logger.info("Connected to PostgreSQL at %s:%s (db=%s)", host, port, dbname) + + def disconnect(self) -> None: + if self._conn and not self._conn.closed: + self._conn.close() + self._conn = None + logger.info("Disconnected from PostgreSQL") + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + def _cur(self): + """Return a new cursor, raising if not connected.""" + if self._conn is None or self._conn.closed: + raise RuntimeError("Not connected to PostgreSQL") + return self._conn.cursor() + + @staticmethod + def _table(name: str) -> str: + """Sanitize a collection name for use as a SQL identifier.""" + import psycopg2.extensions + return psycopg2.extensions.quote_ident(name) if hasattr( + psycopg2.extensions, "quote_ident" + ) else f'"{name}"' + + @staticmethod + def _index_name(table: str, suffix: str = "vec_idx") -> str: + return f"{table}_{suffix}" + + # ------------------------------------------------------------------ + # Collection management + # ------------------------------------------------------------------ + def create_collection( + self, + name: str, + dimension: int, + metric_type: str = "COSINE", + index_type: str = "HNSW", + index_params: Optional[Dict[str, Any]] = None, + num_shards: int = 1, + force: bool = False, + ) -> CollectionInfo: + table = self._table(name) + idx_name = self._index_name(name) + + if self.collection_exists(name): + if force: + self.drop_collection(name) + else: + raise ValueError( + f"Table '{name}' already exists. Use force=True to drop it." + ) + + with self._cur() as cur: + cur.execute( + f"CREATE TABLE {table} (" + f" id BIGINT PRIMARY KEY," + f" vector vector({dimension})" + f")" + ) + logger.info("Created table '%s' (%s-d)", name, f"{dimension:,}") + + # Build the index (unless FLAT / no index requested). 
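+        # NOTE: the index is created while the table is still empty, so every
+        # later INSERT maintains it incrementally. pgvector generally builds
+        # HNSW/IVFFlat faster when the index is created after bulk loading
+        # (and IVFFlat picks better list centroids that way), but indexing up
+        # front keeps the workflow identical across backends.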
+ index_params = index_params or {} + if index_type.upper() not in ("FLAT", "NONE"): + self._create_index( + name, dimension, metric_type, index_type, index_params + ) + + return CollectionInfo( + name=name, + dimension=dimension, + metric_type=metric_type, + index_type=index_type, + row_count=0, + extra={"index_params": index_params}, + ) + + def _create_index( + self, + name: str, + dimension: int, + metric_type: str, + index_type: str, + index_params: Dict[str, Any], + ) -> None: + table = self._table(name) + idx_name = self._index_name(name) + upper = index_type.upper() + + if upper == "HNSW": + ops = _METRIC_TO_HNSW_OPS.get(metric_type.upper(), "vector_cosine_ops") + m = index_params.get("M", index_params.get("m", 16)) + ef_construction = index_params.get( + "efConstruction", + index_params.get("ef_construction", 200), + ) + with_clause = f"(m = {m}, ef_construction = {ef_construction})" + sql = ( + f"CREATE INDEX {idx_name} ON {table} " + f"USING hnsw (vector {ops}) WITH {with_clause}" + ) + elif upper == "IVFFLAT": + ops = _METRIC_TO_IVFFLAT_OPS.get(metric_type.upper(), "vector_cosine_ops") + nlist = index_params.get("nlist", index_params.get("lists", 100)) + with_clause = f"(lists = {nlist})" + sql = ( + f"CREATE INDEX {idx_name} ON {table} " + f"USING ivfflat (vector {ops}) WITH {with_clause}" + ) + else: + logger.warning( + "Unknown index type '%s' for pgvector; skipping index creation.", + index_type, + ) + return + + logger.info("Creating index: %s", sql) + with self._cur() as cur: + cur.execute(sql) + logger.info("Index '%s' created (%s / %s)", idx_name, index_type, metric_type) + + def collection_exists(self, name: str) -> bool: + with self._cur() as cur: + cur.execute( + "SELECT EXISTS (" + " SELECT 1 FROM information_schema.tables" + " WHERE table_name = %s" + ")", + (name,), + ) + return cur.fetchone()[0] + + def drop_collection(self, name: str) -> None: + table = self._table(name) + with self._cur() as cur: + cur.execute(f"DROP TABLE IF EXISTS {table} CASCADE") + logger.info("Dropped table: %s", name) + + # ------------------------------------------------------------------ + # Data ingestion + # ------------------------------------------------------------------ + def insert_batch( + self, + name: str, + ids: np.ndarray, + vectors: np.ndarray, + ) -> int: + import psycopg2.extras + + table = self._table(name) + n = len(ids) + # Build a list of tuples for execute_values. + rows = [(int(ids[i]), vectors[i].tolist()) for i in range(n)] + with self._cur() as cur: + psycopg2.extras.execute_values( + cur, + f"INSERT INTO {table} (id, vector) VALUES %s " + f"ON CONFLICT (id) DO NOTHING", + rows, + template="(%s, %s::vector)", + page_size=1000, + ) + return n + + def flush(self, name: str) -> None: + # With autocommit = True every statement is already committed. + logger.info("Flush (no-op with autocommit) for table '%s'", name) + + # ------------------------------------------------------------------ + # Search + # ------------------------------------------------------------------ + def search( + self, + name: str, + query_vectors: np.ndarray, + top_k: int, + search_params: Optional[Dict[str, Any]] = None, + ) -> List[List[int]]: + table = self._table(name) + search_params = search_params or {} + + # Determine distance operator from metric_type in search_params. + metric = search_params.get("metric_type", "COSINE").upper() + op = _METRIC_TO_OPERATOR.get(metric, "<=>") + + # Apply runtime search params (e.g. ef_search for HNSW, probes for IVFFlat). 
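+        # Both knobs are PostgreSQL GUCs; `SET LOCAL` only takes effect inside
+        # an explicit transaction block (hence the autocommit toggle below)
+        # and reverts automatically when the transaction ends.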
+ ef_search = search_params.get("ef_search", search_params.get("ef")) + probes = search_params.get("probes") + + results: List[List[int]] = [] + + # SET LOCAL requires a transaction block, so temporarily leave + # autocommit mode when we need to apply search-time GUCs. + need_txn = ef_search is not None or probes is not None + if need_txn: + self._conn.autocommit = False + + try: + with self._cur() as cur: + if ef_search is not None: + cur.execute( + f"SET LOCAL hnsw.ef_search = {int(ef_search)}" + ) + if probes is not None: + cur.execute( + f"SET LOCAL ivfflat.probes = {int(probes)}" + ) + + for qvec in query_vectors: + vec_literal = "[" + ",".join(str(float(v)) for v in qvec) + "]" + cur.execute( + f"SELECT id FROM {table} " + f"ORDER BY vector {op} %s::vector " + f"LIMIT %s", + (vec_literal, top_k), + ) + results.append([row[0] for row in cur.fetchall()]) + + if need_txn: + self._conn.commit() + except Exception: + if need_txn: + self._conn.rollback() + raise + finally: + if need_txn: + self._conn.autocommit = True + + return results + + # ------------------------------------------------------------------ + # Status / info + # ------------------------------------------------------------------ + def row_count(self, name: str) -> int: + table = self._table(name) + with self._cur() as cur: + cur.execute(f"SELECT COUNT(*) FROM {table}") + return cur.fetchone()[0] + + def get_index_progress(self, name: str) -> IndexProgress: + """In PostgreSQL ``CREATE INDEX`` is synchronous, so by the time + control returns the index is already built. This simply checks + whether any index exists on the table. + """ + with self._cur() as cur: + cur.execute( + "SELECT indexname FROM pg_indexes WHERE tablename = %s", + (name,), + ) + indexes = [row[0] for row in cur.fetchall()] + if indexes: + return IndexProgress( + is_ready=True, + status=", ".join(indexes), + ) + return IndexProgress(is_ready=False, status="waiting") + + # ------------------------------------------------------------------ + # Administration / introspection + # ------------------------------------------------------------------ + def list_collections(self) -> List[str]: + with self._cur() as cur: + cur.execute( + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = 'public' " + "AND table_type = 'BASE TABLE' " + "ORDER BY table_name" + ) + return [row[0] for row in cur.fetchall()] + + def get_collection_info(self, name: str) -> Dict[str, Any]: + table = self._table(name) + + # Columns + schema: List[Dict[str, Any]] = [] + dimension = None + with self._cur() as cur: + cur.execute( + "SELECT column_name, data_type, udt_name " + "FROM information_schema.columns " + "WHERE table_name = %s ORDER BY ordinal_position", + (name,), + ) + for col_name, data_type, udt_name in cur.fetchall(): + entry: Dict[str, Any] = { + "name": col_name, + "dtype": udt_name if udt_name != data_type else data_type, + } + if udt_name == "vector": + # Retrieve dimension from atttypmod + cur.execute( + "SELECT atttypmod FROM pg_attribute " + "WHERE attrelid = %s::regclass AND attname = %s", + (name, col_name), + ) + row = cur.fetchone() + if row and row[0] > 0: + dimension = row[0] + entry["dim"] = dimension + schema.append(entry) + + # Index info + indexes = self.list_indexes(name) + index_type = indexes[0]["index_type"] if indexes else None + + # Metric type from operator class + metric_type = None + if indexes: + ops = indexes[0].get("params", {}).get("opclass", "") + for metric, op_cls in _METRIC_TO_HNSW_OPS.items(): + if op_cls == ops: + 
metric_type = metric + break + + row_count = self.row_count(name) + + return { + "name": name, + "row_count": row_count, + "dimension": dimension, + "metric_type": metric_type, + "index_type": index_type, + "schema": schema, + } + + def list_indexes(self, name: str) -> List[Dict[str, Any]]: + results: List[Dict[str, Any]] = [] + with self._cur() as cur: + cur.execute( + "SELECT indexname, indexdef FROM pg_indexes " + "WHERE tablename = %s", + (name,), + ) + for idx_name, idx_def in cur.fetchall(): + # Skip primary-key indexes + if "_pkey" in idx_name: + continue + idx_type = "UNKNOWN" + idx_def_upper = idx_def.upper() + if "USING HNSW" in idx_def_upper: + idx_type = "HNSW" + elif "USING IVFFLAT" in idx_def_upper: + idx_type = "IVFFLAT" + results.append({ + "index_name": idx_name, + "index_type": idx_type, + "definition": idx_def, + "params": {}, + }) + return results + + def drop_index(self, name: str, index_name: Optional[str] = None) -> None: + if index_name is None: + index_name = self._index_name(name) + with self._cur() as cur: + cur.execute(f"DROP INDEX IF EXISTS {index_name}") + logger.info("Dropped index '%s' from table '%s'", index_name, name) diff --git a/vdb_benchmark/vdbbench/benchmark/collection_admin.py b/vdb_benchmark/vdbbench/benchmark/collection_admin.py new file mode 100755 index 00000000..52a9dd37 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/collection_admin.py @@ -0,0 +1,884 @@ +#!/usr/bin/env python3 +"""Backend-agnostic collection administration CLI. + +Provides subcommands for inspecting and managing collections across +any registered vector-database backend (Milvus, pgvector, Elasticsearch, +etc.) All heavy lifting delegates to the :class:`VectorDBBackend` +admin methods so behaviour is consistent across databases. + +Usage examples:: + + # Interactive mode -- discover backends, pick one, browse collections + collection-admin interactive + + # List all collections on a Milvus server + collection-admin --backend milvus list + + # Detailed info for one collection + collection-admin --backend milvus info my_collection + + # Show indexes + collection-admin --backend pgvector indexes my_collection + + # Collection statistics + collection-admin --backend elasticsearch stats my_collection + + # Drop a collection (requires --yes for safety) + collection-admin --backend milvus drop my_collection --yes + + # Drop an index + collection-admin --backend pgvector drop-index my_collection + +Connection parameters are sourced from environment variables using the +``{BACKEND}__{PARAM}`` convention (see ``_env.py``), from a ``.env`` +file, or from ``--param key=value`` CLI flags. +""" + +from __future__ import annotations + +import sys + +# ------------------------------------------------------------------ +# Direct-execution bootstrap (same pattern as run_benchmark.py) +# ------------------------------------------------------------------ +if __name__ == "__main__": + import importlib + import pathlib + + _this = pathlib.Path(__file__).resolve() + _pkg_root = str(_this.parent.parent.parent) + if _pkg_root not in sys.path: + sys.path.insert(0, _pkg_root) + + _mod = importlib.import_module("vdbbench.benchmark.collection_admin") + raise SystemExit(_mod.main()) + +# ------------------------------------------------------------------ +# Normal imports (only reached when loaded as a package member). 
+# ------------------------------------------------------------------ + +import argparse +import json +import logging +import os +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +from tabulate import tabulate as _tabulate + +from .backends import registry, get_backend +from .backends._env import load_env_file, env_for_backend +from .backends.base import BackendDescriptor, VectorDBBackend + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)-8s %(name)s %(message)s", +) +logger = logging.getLogger(__name__) + + +# ===================================================================== +# Output formatting helpers +# ===================================================================== + +def _json_out(data: Any) -> None: + """Print *data* as indented JSON to stdout.""" + print(json.dumps(data, indent=2, default=str)) + + +def _table_out(rows: List[Dict[str, Any]], keys: Optional[List[str]] = None) -> None: + """Print rows as a simple aligned table.""" + if not rows: + print("(no results)") + return + + keys = keys or list(rows[0].keys()) + # Column widths + widths = {k: len(k) for k in keys} + for row in rows: + for k in keys: + widths[k] = max(widths[k], len(str(row.get(k, "")))) + + header = " ".join(k.ljust(widths[k]) for k in keys) + sep = " ".join("-" * widths[k] for k in keys) + print(header) + print(sep) + for row in rows: + print(" ".join(str(row.get(k, "")).ljust(widths[k]) for k in keys)) + + +# ===================================================================== +# Backend connection helper +# ===================================================================== + +def _connect_backend( + backend_name: str, + extra_params: Optional[Dict[str, str]] = None, +) -> VectorDBBackend: + """Instantiate, connect, and return a backend. + + Connection parameters come from (highest-precedence-first): + 1. ``--param key=value`` CLI flags (*extra_params*). + 2. Environment variables (``{BACKEND}__{PARAM}``). + 3. Defaults from the backend descriptor. + """ + load_env_file() + + desc = registry.get(backend_name) + if desc is None: + available = ", ".join(registry.names()) or "(none)" + print(f"Unknown backend '{backend_name}'. 
Available: {available}", + file=sys.stderr) + sys.exit(1) + + # Merge env + CLI overrides + conn = env_for_backend(backend_name, desc) + if extra_params: + conn.update(extra_params) + + backend = desc.backend_class() + backend.connect(**conn) + return backend + + +# ===================================================================== +# Non-interactive subcommand handlers +# ===================================================================== + +def _cmd_list(backend: VectorDBBackend, args: argparse.Namespace) -> None: + """``list`` -- show all collections.""" + names = backend.list_collections() + if args.json: + _json_out(names) + return + if not names: + print("(no collections found)") + return + for n in sorted(names): + print(n) + + +def _cmd_info(backend: VectorDBBackend, args: argparse.Namespace) -> None: + """``info`` -- detailed metadata for one collection.""" + info = backend.get_collection_info(args.collection) + if args.json: + _json_out(info) + return + + print(f"\nCollection: {info['name']}") + print(f" Rows: {info.get('row_count', '?'):,}") + print(f" Dimension: {info.get('dimension') or '?'}") + print(f" Metric: {info.get('metric_type') or '?'}") + print(f" Index type: {info.get('index_type') or '?'}") + + schema = info.get("schema", []) + if schema: + print("\n Schema:") + for fld in schema: + extras = [] + if fld.get("dim"): + extras.append(f"dim={fld['dim']}") + if fld.get("is_primary"): + extras.append("PK") + suffix = f" ({', '.join(extras)})" if extras else "" + print(f" - {fld['name']}: {fld.get('dtype', '?')}{suffix}") + + for key in ("num_partitions", "partitions"): + if key in info: + print(f" {key}: {info[key]}") + print() + + +def _cmd_indexes(backend: VectorDBBackend, args: argparse.Namespace) -> None: + """``indexes`` -- list indexes on a collection.""" + indexes = backend.list_indexes(args.collection) + if args.json: + _json_out(indexes) + return + if not indexes: + print(f"No indexes found on '{args.collection}'") + return + _table_out(indexes) + + +def _cmd_stats(backend: VectorDBBackend, args: argparse.Namespace) -> None: + """``stats`` -- operational statistics for a collection.""" + stats = backend.get_collection_stats(args.collection) + if args.json: + _json_out(stats) + return + for k, v in stats.items(): + label = k.replace("_", " ").title() + if isinstance(v, int) and v > 999: + print(f" {label}: {v:,}") + else: + print(f" {label}: {v}") + + +def _cmd_drop(backend: VectorDBBackend, args: argparse.Namespace) -> None: + """``drop`` -- drop a collection (destructive!).""" + name = args.collection + if not backend.collection_exists(name): + print(f"Collection '{name}' does not exist.", file=sys.stderr) + sys.exit(1) + + if not args.yes: + try: + answer = input(f"Really DROP collection '{name}'? (yes/[no]) > ").strip() + except (EOFError, KeyboardInterrupt): + answer = "" + if answer.lower() != "yes": + print("Aborted.") + return + + backend.drop_collection(name) + print(f"Dropped: {name}") + + +def _cmd_drop_index(backend: VectorDBBackend, args: argparse.Namespace) -> None: + """``drop-index`` -- drop an index from a collection.""" + name = args.collection + idx = getattr(args, "index_name", None) + + if not args.yes: + target = f"index '{idx}'" if idx else "the vector index" + try: + answer = input( + f"Really DROP {target} on '{name}'? 
(yes/[no]) > " + ).strip() + except (EOFError, KeyboardInterrupt): + answer = "" + if answer.lower() != "yes": + print("Aborted.") + return + + backend.drop_index(name, index_name=idx) + print(f"Dropped index on '{name}'") + + +# ===================================================================== +# Interactive mode -- backend discovery, health-check, menus +# ===================================================================== + +@dataclass +class BackendStatus: + """Result of probing one backend.""" + name: str + display_name: str + configured: bool = False + healthy: bool = False + error: str = "" + conn_params: Dict[str, Any] = field(default_factory=dict) + descriptor: Optional[BackendDescriptor] = None + + +def discover_backends(env_path: Optional[str] = None) -> List[BackendStatus]: + """Probe every active backend and return their status. + + For each active backend registered in the global registry: + + 1. Load connection params from ``.env`` / environment variables. + 2. If at least one connection parameter is configured, attempt + ``connect()`` followed by ``disconnect()`` as a health check. + 3. If no env vars are set, fall back to the defaults declared in the + backend descriptor and try to connect anyway -- but mark it as + *not explicitly configured*. + """ + load_env_file(env_path) + + results: List[BackendStatus] = [] + for desc in registry.list_backends(): + status = BackendStatus( + name=desc.name, + display_name=desc.display_name, + descriptor=desc, + ) + + # Gather connection params from env + env_params = env_for_backend(desc.name, desc) + status.configured = bool(env_params) + + # Build full param set: defaults + env overrides + conn: Dict[str, Any] = {} + for p in desc.connection_params: + if p.default is not None: + conn[p.name] = p.default + conn.update(env_params) + status.conn_params = conn + + # Attempt ping + try: + backend = desc.backend_class() + backend.connect(**conn) + backend.disconnect() + status.healthy = True + except Exception as exc: + status.healthy = False + status.error = str(exc) + + results.append(status) + + return results + + +def _sep(text: str) -> str: + """Return a ``─`` line matching the widest line in *text*.""" + width = max((len(l) for l in text.splitlines()), default=0) + return "─" * width + + +def pick_backend(statuses: List[BackendStatus]) -> Optional[BackendStatus]: + """Display a table of backends and let the user choose one. + + Only healthy backends are selectable. Returns ``None`` if the user + cancels or no healthy backends exist. + """ + headers = ["Idx", "Backend", "Configured", "Status", "Details"] + rows = [] + for i, s in enumerate(statuses): + configured = "Yes" if s.configured else "defaults" + if s.healthy: + status_str = "Healthy" + detail = ", ".join(f"{k}={v}" for k, v in s.conn_params.items() + if v is not None and k != "password") + else: + status_str = "Unreachable" + detail = s.error[:60] if s.error else "" + rows.append([i, s.display_name, configured, status_str, detail]) + + table = _tabulate(rows, headers=headers, tablefmt="github") + sep = _sep(table) + print(f"\n{sep}") + print(table) + print(sep) + + healthy_ids = [i for i, s in enumerate(statuses) if s.healthy] + if not healthy_ids: + print("\nNo healthy backends found. 
Check your .env configuration.") + return None + + print(f"\nHealthy backends: {', '.join(str(i) for i in healthy_ids)}") + while True: + try: + choice = input("Select backend idx (or q to quit) > ").strip() + except (EOFError, KeyboardInterrupt): + return None + if choice.lower() == "q": + return None + try: + idx = int(choice) + except ValueError: + print(f"Invalid input '{choice}'. Enter a backend idx or q to quit.") + continue + if idx < 0 or idx >= len(statuses): + print(f"Index {idx} out of range. Select an idx between 0 and {len(statuses) - 1}.") + continue + if not statuses[idx].healthy: + print(f"Backend '{statuses[idx].display_name}' is not healthy. Select a healthy idx.") + continue + return statuses[idx] + + +def _connect_from_status(status: BackendStatus) -> VectorDBBackend: + """Instantiate and connect a backend from its discovered status.""" + backend = status.descriptor.backend_class() + backend.connect(**status.conn_params) + return backend + + +def pick_collection( + backend: VectorDBBackend, + backend_name: str, +) -> Optional[str]: + """List collections on the backend and let the user choose one. + + Returns the collection *name* or ``None`` if cancelled. + """ + try: + names = backend.list_collections() + except Exception as exc: + print(f"Failed to list collections: {exc}") + return None + + if not names: + print(f"\nNo collections found on '{backend_name}'.") + return None + + headers = ["Idx", "Collection", "Rows", "Dim", "Index", "Metric"] + rows = [] + for i, name in enumerate(sorted(names)): + try: + info = backend.get_collection_info(name) + row_count = (f"{info.get('row_count', '?'):,}" + if isinstance(info.get('row_count'), int) else "?") + dim = info.get("dimension") or "?" + idx_type = info.get("index_type") or "?" + metric = info.get("metric_type") or "?" + except Exception: + row_count = "?" + dim = "?" + idx_type = "?" + metric = "?" + rows.append([i, name, row_count, dim, idx_type, metric]) + + table = _tabulate(rows, headers=headers, tablefmt="github") + sep = _sep(table) + print(f"\n{sep}") + print(table) + print(sep) + + while True: + try: + choice = input("\nSelect collection idx (or b=back, q=quit) > ").strip() + except (EOFError, KeyboardInterrupt): + return None + if choice.lower() == "b": + return None + if choice.lower() == "q": + print("Bye.") + sys.exit(0) + try: + idx = int(choice) + except ValueError: + print(f"Invalid input '{choice}'. Enter a collection idx, b, or q.") + continue + if idx < 0 or idx >= len(rows): + print(f"Index {idx} out of range. 
Select an idx between 0 and {len(rows) - 1}.") + continue + return rows[idx][1] # collection name + + +# ── Interactive operation helpers ────────────────────────────────── + +def _iop_info(backend: VectorDBBackend, collection: str) -> None: + """Display detailed collection info.""" + try: + info = backend.get_collection_info(collection) + except Exception as exc: + print(f"Failed to get info: {exc}") + return + + print(f"\n{'='*70}") + print(f"Collection: {info['name']}") + print(f"{'='*70}") + row_count = info.get("row_count", "?") + if isinstance(row_count, int): + print(f"Rows: {row_count:,}") + else: + print(f"Rows: {row_count}") + print(f"Dimension: {info.get('dimension') or '?'}") + print(f"Metric: {info.get('metric_type') or '?'}") + print(f"Index type: {info.get('index_type') or '?'}") + + schema = info.get("schema", []) + if schema: + print("\nSchema:") + for fld in schema: + extras = [] + if fld.get("dim"): + extras.append(f"dim={fld['dim']}") + if fld.get("is_primary"): + extras.append("PK") + suffix = f" ({', '.join(extras)})" if extras else "" + print(f" - {fld['name']}: {fld.get('dtype', '?')}{suffix}") + + if "num_partitions" in info: + print(f"\nPartitions: {info['num_partitions']}") + for p in info.get("partitions", []): + print(f" - {p}") + print(f"{'='*70}\n") + + +def _iop_stats(backend: VectorDBBackend, collection: str) -> None: + """Display operational statistics.""" + try: + stats = backend.get_collection_stats(collection) + except Exception as exc: + print(f"Failed to get stats: {exc}") + return + + print(f"\nStats for '{collection}':") + for k, v in stats.items(): + label = k.replace("_", " ").title() + if isinstance(v, int) and v > 999: + print(f" {label}: {v:,}") + else: + print(f" {label}: {v}") + print() + + +def _iop_indexes(backend: VectorDBBackend, collection: str) -> None: + """List indexes on a collection.""" + try: + indexes = backend.list_indexes(collection) + except Exception as exc: + print(f"Failed to list indexes: {exc}") + return + + if not indexes: + print(f"No indexes on '{collection}'.") + return + + print(f"\nIndexes on '{collection}':") + print(_tabulate( + [{k: v for k, v in idx.items()} for idx in indexes], + headers="keys", + tablefmt="github", + )) + print() + + +def _iop_compact(backend: VectorDBBackend, collection: str) -> None: + """Trigger compaction (if supported).""" + try: + print(f"Starting compaction on '{collection}'...") + backend.compact(collection) + print("Compaction completed.") + except NotImplementedError: + print("Compaction is not supported by this backend.") + except Exception as exc: + print(f"Compact failed: {exc}") + + +def _iop_drop_index(backend: VectorDBBackend, collection: str) -> None: + """Drop the vector index from a collection.""" + try: + confirm = input( + f"Really DROP the index on '{collection}'? (yes/[no]) > " + ).strip() + except (EOFError, KeyboardInterrupt): + confirm = "" + if confirm.lower() != "yes": + print("Aborted.") + return + + try: + backend.drop_index(collection) + print(f"Index dropped on '{collection}'.") + except NotImplementedError: + print("drop_index is not supported by this backend.") + except Exception as exc: + print(f"Drop index failed: {exc}") + + +def _iop_delete(backend: VectorDBBackend, collection: str) -> None: + """Drop (delete) a collection entirely.""" + try: + confirm = input( + f"Really DROP collection '{collection}'? " + "This is irreversible. 
(yes/[no]) > " + ).strip() + except (EOFError, KeyboardInterrupt): + confirm = "" + if confirm.lower() != "yes": + print("Aborted; collection kept.") + return + + try: + backend.drop_collection(collection) + print(f"Collection '{collection}' dropped.") + except Exception as exc: + print(f"Delete failed: {exc}") + + +_INTERACTIVE_OPS = { + "i": ("info", "Detailed collection info", _iop_info), + "s": ("stats", "Operational statistics", _iop_stats), + "x": ("indexes", "List indexes", _iop_indexes), + "c": ("compact", "Trigger compaction", _iop_compact), + "di": ("drop-index", "Drop the vector index", _iop_drop_index), + "d": ("delete", "Drop the collection", _iop_delete), + "b": ("back", "Back to collection list", None), + "q": ("quit", "Exit", None), +} + + +def operations_menu( + backend: VectorDBBackend, + collection: str, + backend_name: str, +) -> bool: + """Run the operations loop for a single collection. + + Returns ``True`` to go back to the collection picker, + ``False`` to exit. + """ + while True: + header = f" [{backend_name}] Collection: '{collection}'" + cmd_lines = [f" {key:<4} {name:<12} {desc}" + for key, (name, desc, _) in _INTERACTIVE_OPS.items()] + body = "\n".join([header, " Available commands:"] + cmd_lines) + sep = _sep(body) + print(f"\n{sep}") + print(body) + print(sep) + + try: + choice = input("Enter command > ").strip().lower() + except (EOFError, KeyboardInterrupt): + return False + + if choice == "q": + print("Bye.") + sys.exit(0) + + if choice == "b": + return True + + entry = _INTERACTIVE_OPS.get(choice) + if entry is None: + print(f"Unknown command '{choice}'. Enter one of: " + f"{', '.join(_INTERACTIVE_OPS.keys())}") + continue + + _, _, handler = entry + if handler is not None: + handler(backend, collection) + + # If the collection was deleted, return to the picker + if choice == "d": + return True + + +def _cmd_interactive(args: argparse.Namespace) -> int: + """``interactive`` -- menu-driven backend and collection manager.""" + env_path = getattr(args, "env_file", None) + + print("Discovering backends...") + statuses = discover_backends(env_path=env_path) + + if not statuses: + print("No backends registered. Is the benchmark package installed?") + return 1 + + backend: Optional[VectorDBBackend] = None + current_status: Optional[BackendStatus] = None + + while True: + # ── backend picker ──────────────────────────────────────── + if backend is not None: + print(f"\nCurrently connected to: {current_status.display_name}") + try: + switch = input("Switch backend? 
(y/[n]) > ").strip().lower() + except (EOFError, KeyboardInterrupt): + break + if switch == "y": + try: + backend.disconnect() + except Exception: + pass + backend = None + + if backend is None: + chosen = pick_backend(statuses) + if chosen is None: + print("Bye.") + break + try: + backend = _connect_from_status(chosen) + current_status = chosen + print(f"\nConnected to {chosen.display_name}.") + except Exception as exc: + print(f"Connection failed: {exc}") + continue + + # ── collection picker ───────────────────────────────────── + col_name = pick_collection(backend, current_status.display_name) + if col_name is None: + try: + backend.disconnect() + except Exception: + pass + backend = None + continue + + # ── operations menu ─────────────────────────────────────── + go_back = operations_menu(backend, col_name, current_status.display_name) + if not go_back: + break + + # Cleanup + if backend is not None: + try: + backend.disconnect() + except Exception: + pass + + return 0 + + +# ===================================================================== +# Argument parser +# ===================================================================== + +_EPILOG = """\ +concepts: + collection The data container that holds vectors and their metadata + (IDs, dimensions, schema). Mapped to a Milvus Collection, + a PostgreSQL table (pgvector), or an Elasticsearch index. + Dropping a collection permanently destroys all stored data. + + index A search-acceleration structure (e.g. HNSW, IVF_FLAT, + DISKANN) built on a collection's vector field. Enables + fast approximate nearest-neighbor (ANN) queries. Created + automatically with the collection. Dropping an index + removes only the search structure -- the underlying data + remains intact and can be re-indexed. +""" + + +def _build_parser() -> argparse.ArgumentParser: + """Build the argparse parser with subcommands.""" + parser = argparse.ArgumentParser( + prog="collection_admin", + description="Backend-agnostic vector-DB collection administration.", + epilog=_EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--backend", "-b", + default=None, + help="Backend name (e.g. milvus, pgvector, elasticsearch). 
" + "Required for non-interactive commands.", + ) + parser.add_argument( + "--param", "-p", + action="append", + default=[], + metavar="KEY=VALUE", + help="Extra connection parameter (repeatable).", + ) + parser.add_argument( + "--json", "-j", + action="store_true", + default=False, + help="Output results as JSON.", + ) + + sub = parser.add_subparsers(dest="command") + + # -- interactive -- + p_ia = sub.add_parser( + "interactive", + help="Menu-driven interactive mode: discover backends, browse " + "collections, run operations.", + ) + p_ia.add_argument( + "--env-file", + default=None, + help="Path to .env file (default: auto-detect).", + ) + + # -- list -- + sub.add_parser("list", help="List all collections on the server.") + + # -- info -- + p_info = sub.add_parser("info", help="Show detailed collection metadata.") + p_info.add_argument("collection", help="Collection name.") + + # -- indexes -- + p_idx = sub.add_parser("indexes", help="List indexes on a collection.") + p_idx.add_argument("collection", help="Collection name.") + + # -- stats -- + p_stats = sub.add_parser("stats", help="Show collection statistics.") + p_stats.add_argument("collection", help="Collection name.") + + # -- drop -- + p_drop = sub.add_parser( + "drop", + help="Drop a collection -- permanently deletes all data and indexes.", + ) + p_drop.add_argument("collection", help="Collection name.") + p_drop.add_argument( + "--yes", "-y", + action="store_true", + default=False, + help="Skip confirmation prompt.", + ) + + # -- drop-index -- + p_di = sub.add_parser( + "drop-index", + help="Drop an index from a collection -- data is kept and can be re-indexed.", + ) + p_di.add_argument("collection", help="Collection name.") + p_di.add_argument( + "--index-name", "-i", + default=None, + help="Specific index to drop (default: primary vector index).", + ) + p_di.add_argument( + "--yes", "-y", + action="store_true", + default=False, + help="Skip confirmation prompt.", + ) + + return parser + + +def _parse_params(raw: List[str]) -> Dict[str, str]: + """Parse ``--param KEY=VALUE`` arguments into a dict.""" + result: Dict[str, str] = {} + for item in raw: + if "=" not in item: + print(f"Invalid --param format (expected KEY=VALUE): {item}", + file=sys.stderr) + sys.exit(1) + key, _, value = item.partition("=") + result[key.strip()] = value.strip() + return result + + +# ===================================================================== +# Main entry point +# ===================================================================== + +_DISPATCH = { + "list": _cmd_list, + "info": _cmd_info, + "indexes": _cmd_indexes, + "stats": _cmd_stats, + "drop": _cmd_drop, + "drop-index": _cmd_drop_index, +} + + +def main(argv: Optional[List[str]] = None) -> int: + """Parse arguments, connect to the backend, and dispatch.""" + parser = _build_parser() + args = parser.parse_args(argv) + + # Default to interactive when no subcommand given + if not args.command: + args.command = "interactive" + + # ── Interactive mode (no --backend required) ────────────────── + if args.command == "interactive": + return _cmd_interactive(args) + + # ── Non-interactive commands require --backend ──────────────── + if not args.backend: + parser.error("--backend/-b is required for non-interactive commands.") + + extra = _parse_params(args.param) + backend = _connect_backend(args.backend, extra) + + try: + handler = _DISPATCH[args.command] + handler(backend, args) + except NotImplementedError as exc: + print(f"Not supported: {exc}", file=sys.stderr) + return 1 + except 
Exception as exc: + logger.error("Error: %s", exc, exc_info=True) + return 1 + finally: + backend.disconnect() + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/vdb_benchmark/vdbbench/benchmark/configs/1m_diskann.yaml b/vdb_benchmark/vdbbench/benchmark/configs/1m_diskann.yaml new file mode 100644 index 00000000..fbe3db27 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/configs/1m_diskann.yaml @@ -0,0 +1,45 @@ +# --------------------------------------------------------------- +# 1M-vector DiskANN benchmark (Milvus, producer-consumer pipeline) +# --------------------------------------------------------------- +backend: milvus +mode: both + +database: + host: 127.0.0.1 + port: 19530 + +dataset: + collection_name: bench_1m_diskann + num_vectors: 1_000_000 + dimension: 1536 + distribution: uniform + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +query: + num_query_vectors: 10_000 + query_seed: 99 + +ground_truth: + truth_k: 100 + +index: + index_type: DISKANN + metric_type: COSINE + index_params: + MaxDegree: 64 + SearchListSize: 200 + num_shards: 1 + +search: + search_k: 10 + num_search_rounds: 1 + search_batch_size: 1 + search_params: + search_list: 128 + +workflow: + force: false + compact: true + monitor_interval: 5 diff --git a/vdb_benchmark/vdbbench/benchmark/configs/1m_hnsw.yaml b/vdb_benchmark/vdbbench/benchmark/configs/1m_hnsw.yaml new file mode 100644 index 00000000..24d9ea6e --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/configs/1m_hnsw.yaml @@ -0,0 +1,45 @@ +# --------------------------------------------------------------- +# 1M-vector HNSW benchmark (Milvus, producer-consumer pipeline) +# --------------------------------------------------------------- +backend: milvus +mode: both + +database: + host: 127.0.0.1 + port: 19530 + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + distribution: uniform + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +query: + num_query_vectors: 10_000 + query_seed: 99 + +ground_truth: + truth_k: 100 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + M: 64 + efConstruction: 200 + num_shards: 1 + +search: + search_k: 10 + num_search_rounds: 1 + search_batch_size: 1 + search_params: + ef: 128 + +workflow: + force: false + compact: true + monitor_interval: 5 diff --git a/vdb_benchmark/vdbbench/benchmark/configs/elasticsearch_1m_hnsw.yaml b/vdb_benchmark/vdbbench/benchmark/configs/elasticsearch_1m_hnsw.yaml new file mode 100644 index 00000000..6568ebed --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/configs/elasticsearch_1m_hnsw.yaml @@ -0,0 +1,46 @@ +# --------------------------------------------------------------- +# 1M-vector HNSW benchmark (Elasticsearch) +# --------------------------------------------------------------- +backend: elasticsearch +mode: both + +database: + host: http://localhost:9200 + # api_key: "" # set via ELASTICSEARCH__API_KEY env var + # cloud_id: "" # set via ELASTICSEARCH__CLOUD_ID env var + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + distribution: uniform + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +query: + num_query_vectors: 10_000 + query_seed: 99 + +ground_truth: + truth_k: 100 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + m: 16 + ef_construction: 200 + num_shards: 1 + +search: + search_k: 10 + num_search_rounds: 1 + search_batch_size: 1 + search_params: + num_candidates: 128 + +workflow: + force: false + compact: true + 
monitor_interval: 5 diff --git a/vdb_benchmark/vdbbench/benchmark/configs/pgvector_1m_hnsw.yaml b/vdb_benchmark/vdbbench/benchmark/configs/pgvector_1m_hnsw.yaml new file mode 100644 index 00000000..cc3095ba --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/configs/pgvector_1m_hnsw.yaml @@ -0,0 +1,48 @@ +# --------------------------------------------------------------- +# 1M-vector HNSW benchmark (pgvector / PostgreSQL) +# --------------------------------------------------------------- +backend: pgvector +mode: both + +database: + host: 127.0.0.1 + port: 5432 + dbname: postgres + user: postgres + password: "" + +dataset: + collection_name: bench_1m_hnsw + num_vectors: 1_000_000 + dimension: 1536 + distribution: uniform + block_size: 100_000 + batch_size: 10_000 + seed: 42 + +query: + num_query_vectors: 10_000 + query_seed: 99 + +ground_truth: + truth_k: 100 + +index: + index_type: HNSW + metric_type: COSINE + index_params: + m: 64 + ef_construction: 200 + num_shards: 1 + +search: + search_k: 10 + num_search_rounds: 1 + search_batch_size: 1 + search_params: + ef_search: 128 + +workflow: + force: false + compact: true + monitor_interval: 5 diff --git a/vdb_benchmark/vdbbench/benchmark/generator.py b/vdb_benchmark/vdbbench/benchmark/generator.py new file mode 100644 index 00000000..b9e5fe72 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/generator.py @@ -0,0 +1,169 @@ +"""Vector generator -- the *producer* side of the pipeline. + +Generates random vectors in configurable blocks and pushes them onto a +:class:`queue.Queue`. Each block is a :class:`VectorBlock` containing: + +* ``ids`` -- int64 primary keys (globally unique, monotonically increasing) +* ``vectors`` -- float32 array of shape ``(block_size, dimension)`` + +The generator also produces a separate set of **query vectors** that are +held aside for benchmarking and ground-truth computation. + +Supported distributions: ``uniform``, ``normal``. +All vectors are L2-normalized so that COSINE distance is meaningful. +""" + +from __future__ import annotations + +import logging +import queue +import threading +from dataclasses import dataclass +from typing import Optional + +import numpy as np + +logger = logging.getLogger(__name__) + +# Sentinel pushed onto the queue after the last block. +_DONE = None + + +@dataclass +class VectorBlock: + """A batch of vectors ready for consumption.""" + ids: np.ndarray # shape (n,), dtype int64 + vectors: np.ndarray # shape (n, dim), dtype float32 + block_index: int # ordinal of this block (0-based) + + +def _generate_block( + num_vectors: int, + dimension: int, + distribution: str, + rng: np.random.RandomState, +) -> np.ndarray: + """Return a normalized float32 array of shape ``(num_vectors, dimension)``.""" + if distribution == "normal": + vectors = rng.normal(0, 1, (num_vectors, dimension)).astype(np.float32) + else: # uniform (default) + vectors = rng.random((num_vectors, dimension)).astype(np.float32) + + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + norms[norms == 0] = 1.0 # avoid division by zero + vectors /= norms + return vectors + + +def generate_query_vectors( + num_queries: int, + dimension: int, + distribution: str = "uniform", + seed: int = 99, +) -> np.ndarray: + """Deterministically generate a set of query vectors. + + Uses a *separate* seed from the database vectors so that the query + set is independent of the dataset. + + Returns + ------- + np.ndarray + Shape ``(num_queries, dimension)``, dtype float32, L2-normalized. 
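+
+    Examples
+    --------
+    >>> q = generate_query_vectors(num_queries=4, dimension=8, seed=99)
+    >>> q.shape
+    (4, 8)
+    >>> bool(np.allclose(np.linalg.norm(q, axis=1), 1.0))
+    True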
+ """ + rng = np.random.RandomState(seed) + return _generate_block(num_queries, dimension, distribution, rng) + + +class VectorGenerator: + """Producer that feeds vector blocks into a queue. + + Parameters + ---------- + total_vectors : int + How many database vectors to produce in total. + dimension : int + Dimensionality of each vector. + block_size : int + Vectors per block (the last block may be smaller). + distribution : str + ``"uniform"`` or ``"normal"``. + seed : int + Random seed for reproducibility. + max_queue_depth : int + Backpressure limit -- producer blocks when queue is this full. + """ + + def __init__( + self, + total_vectors: int, + dimension: int, + block_size: int = 100_000, + distribution: str = "uniform", + seed: int = 42, + max_queue_depth: int = 4, + ) -> None: + self.total_vectors = total_vectors + self.dimension = dimension + self.block_size = block_size + self.distribution = distribution + self.seed = seed + self.queue: queue.Queue[Optional[VectorBlock]] = queue.Queue( + maxsize=max_queue_depth + ) + self._thread: Optional[threading.Thread] = None + self._error: Optional[Exception] = None + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + def start(self) -> None: + """Spawn the producer thread. Non-blocking.""" + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def join(self) -> None: + """Wait for the producer to finish. Raises if it errored.""" + if self._thread is not None: + self._thread.join() + if self._error is not None: + raise self._error + + @property + def num_blocks(self) -> int: + return (self.total_vectors + self.block_size - 1) // self.block_size + + # ------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------ + def _run(self) -> None: + try: + rng = np.random.RandomState(self.seed) + remaining = self.total_vectors + block_idx = 0 + next_id = 0 + + while remaining > 0: + n = min(self.block_size, remaining) + vectors = _generate_block(n, self.dimension, self.distribution, rng) + ids = np.arange(next_id, next_id + n, dtype=np.int64) + + block = VectorBlock( + ids=ids, vectors=vectors, block_index=block_idx + ) + self.queue.put(block) + logger.info( + "Producer: block %d (%s vectors, ids %s..%s)", + block_idx, f"{n:,}", f"{next_id:,}", f"{next_id + n - 1:,}", + ) + + next_id += n + remaining -= n + block_idx += 1 + + # Sentinel signals consumers that production is done. + self.queue.put(_DONE) + except Exception as exc: + logger.exception("Producer thread failed") + self._error = exc + self.queue.put(_DONE) diff --git a/vdb_benchmark/vdbbench/benchmark/ground_truth.py b/vdb_benchmark/vdbbench/benchmark/ground_truth.py new file mode 100644 index 00000000..66f86f45 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/ground_truth.py @@ -0,0 +1,241 @@ +"""Ground-truth builder -- incremental nearest-neighbor tracking. + +As each :class:`VectorBlock` arrives from the producer, this module +computes the distances between the **query vectors** and the new block, +then merges those distances into a running top-K table. + +At the end of ingestion the result is a truth table:: + + query_index -> [id_1, id_2, ..., id_K] + +where *id_1* is the nearest database vector to that query, *id_2* the +second-nearest, etc. 
This is computed entirely in NumPy using +brute-force inner product / cosine distance -- no database calls needed. + +The approach is streaming-friendly: memory usage is O(num_queries * K) +for the truth table plus O(num_queries * block_size) transiently per +block. For 10 000 queries, K=100, and block_size=100 000 this is very +manageable. + +Performance notes +----------------- +* The dominant cost is the matrix multiply (BLAS ``sgemm``), which is + O(Q * B * D) per block and cannot be reduced without approximate + methods. +* Because all vectors are L2-normalized, inner-product ranking is + equivalent to L2 and cosine ranking. We therefore use a single + "higher is better" code path for every metric, which also avoids + allocating a second (Q, B) distance matrix for L2. +* The matmul is **sub-blocked** along the database-vector dimension so + that the transient similarity matrix stays within a configurable + memory budget (default 512 MiB) instead of growing to Q * B * 4 bytes + (3.8 GiB at the default config). Because the smaller tiles fit in L3 + cache, this is also marginally faster than the single large ``sgemm``. +* After the first sub-block, a per-query **threshold filter** is applied + before the expensive ``argpartition``: ``flatnonzero(row > thresh)`` + is a simple comparison+gather (~30 us / 100 K floats) vs introselect + (~230 us). Only the few candidates that beat the current worst in the + top-K need to be partially sorted, giving a ~4x merge speedup on + subsequent sub-blocks. +* The final merge (running top-K + block top-K -> new top-K) is a + single vectorized ``argpartition`` over the small ``(Q, 2K)`` matrix. +""" + +from __future__ import annotations + +import logging +from typing import Optional + +import numpy as np + +from .generator import VectorBlock + +logger = logging.getLogger(__name__) + +# Target memory budget for the transient (Q, sub_B) similarity matrix. +# The actual sub-block size is: sub_B = budget // (num_queries * 4). +# 512 MiB ⇒ sub_B ≈ 13 000 for Q = 10 000. +_SIMS_MEM_BUDGET: int = 512 << 20 # 512 MiB + + +class GroundTruthBuilder: + """Incrementally build a nearest-neighbor truth table. + + Parameters + ---------- + query_vectors : np.ndarray + Shape ``(num_queries, dimension)``, dtype float32, L2-normalized. + k : int + Number of nearest neighbors to track per query. + metric : str + ``"COSINE"`` (or ``"IP"``). Both reduce to inner-product on + L2-normalized vectors. ``"L2"`` is also supported. + """ + + def __init__( + self, + query_vectors: np.ndarray, + k: int = 100, + metric: str = "COSINE", + ) -> None: + self.query_vectors = np.ascontiguousarray(query_vectors, dtype=np.float32) + self.num_queries, self.dimension = self.query_vectors.shape + self.k = k + self.metric = metric.upper() + + # Running top-K state -- always "higher is better" internally. + # + # For L2-normalized vectors the inner product (IP) preserves the + # ranking of all three supported metrics: + # COSINE = IP (identical by definition for unit vecs) + # L2^2 = 2 - 2 * IP (monotone decreasing transform of IP) + # + # So we store IP similarities and use a single merge path. 
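+        #
+        # The -1 ids / -inf scores below are "empty" slots: they lose every
+        # comparison during merging, and if fewer than k database vectors are
+        # ever seen they simply sort to the tail in build().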
+ self._top_ids: np.ndarray = np.full( + (self.num_queries, k), -1, dtype=np.int64 + ) + self._top_dist: np.ndarray = np.full( + (self.num_queries, k), -np.inf, dtype=np.float32 + ) + + self._blocks_processed = 0 + self._topk_initialized = False + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + def update(self, block: VectorBlock) -> None: + """Incorporate a new block of database vectors. + + For each query vector *q*, compute the similarity to every + vector in *block*, then merge the best results into the running + top-K. The matmul is sub-blocked along the database-vector axis + to keep the transient similarity matrix within + ``_SIMS_MEM_BUDGET``. + """ + db_vecs = np.ascontiguousarray(block.vectors, dtype=np.float32) + db_ids = block.ids # shape (n,) + B = len(db_ids) + + # Sub-block size: keep the (Q, sub_b) similarity matrix under budget. + sub_b = max(1, _SIMS_MEM_BUDGET // (self.num_queries * 4)) + + for sb in range(0, B, sub_b): + se = min(sb + sub_b, B) + # Inner product: higher = more similar = closer for all + # metrics on L2-normalized vectors. + sub_sims = self.query_vectors @ db_vecs[sb:se].T # (Q, se-sb) + sub_ids = db_ids[sb:se] + + if not self._topk_initialized: + self._merge_first_block(sub_sims, sub_ids) + self._topk_initialized = True + else: + self._merge_with_threshold(sub_sims, sub_ids) + + self._blocks_processed += 1 + logger.debug( + "GroundTruth: processed block %d (%d vectors, %d sub-blocks)", + block.block_index, B, (B + sub_b - 1) // sub_b, + ) + + def build(self) -> np.ndarray: + """Return the final truth table. + + Returns + ------- + np.ndarray + Shape ``(num_queries, k)``, dtype int64. + ``result[q]`` contains the IDs of the *k* nearest database + vectors to query *q*, ordered closest-first. + """ + # Descending similarity -- highest (closest) first. + order = np.argsort(-self._top_dist, axis=1) + sorted_ids = np.take_along_axis(self._top_ids, order, axis=1) + return sorted_ids + + # ------------------------------------------------------------------ + # Internals + # ------------------------------------------------------------------ + def _merge_first_block( + self, sims: np.ndarray, db_ids: np.ndarray, + ) -> None: + """Merge the very first sub-block (no useful threshold yet). + + Uses per-row ``argpartition`` on the full sub-block, which is + the fastest NumPy path when there is no threshold to exploit. + """ + k = self.k + Q, B = sims.shape + + if B <= k: + block_top_sims = sims + block_top_ids = np.broadcast_to(db_ids, sims.shape).copy() + else: + block_top_sims = np.empty((Q, k), dtype=np.float32) + block_top_ids = np.empty((Q, k), dtype=np.int64) + for q in range(Q): + idx = np.argpartition(sims[q], -k)[-k:] + block_top_sims[q] = sims[q, idx] + block_top_ids[q] = db_ids[idx] + + self._vectorized_merge(block_top_sims, block_top_ids) + + def _merge_with_threshold( + self, sims: np.ndarray, db_ids: np.ndarray, + ) -> None: + """Merge a sub-block using per-query threshold filtering. + + For each query, only the entries whose similarity exceeds the + current worst score in the running top-K are considered. With + high-dimensional random vectors this typically reduces the + candidate set from *B* to ~0.1--1 % of *B*, making the per-row + ``argpartition`` (and even the need for one) much cheaper. + """ + k = self.k + Q, B = sims.shape + + # Per-query threshold: worst similarity currently in the top-K. 
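+        # While the running top-K still holds -inf placeholders (fewer than K
+        # candidates seen so far) the threshold is -inf, so every finite
+        # similarity passes the filter and nothing is lost.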
+ thresh = self._top_dist.min(axis=1) # (Q,) + + block_top_sims = np.full((Q, k), -np.inf, dtype=np.float32) + block_top_ids = np.full((Q, k), -1, dtype=np.int64) + + for q in range(Q): + cand_idx = np.flatnonzero(sims[q] > thresh[q]) + nc = len(cand_idx) + if nc == 0: + continue + if nc <= k: + block_top_sims[q, :nc] = sims[q, cand_idx] + block_top_ids[q, :nc] = db_ids[cand_idx] + else: + vals = sims[q, cand_idx] + sub = np.argpartition(vals, -k)[-k:] + block_top_sims[q] = vals[sub] + block_top_ids[q] = db_ids[cand_idx[sub]] + + self._vectorized_merge(block_top_sims, block_top_ids) + + def _vectorized_merge( + self, + block_top_sims: np.ndarray, + block_top_ids: np.ndarray, + ) -> None: + """Merge block top-K into running top-K (single vectorized op). + + Concatenates ``(Q, K)`` running state with ``(Q, K_block)`` + block candidates, then selects the overall top-K via a + single ``argpartition`` along ``axis=1``. + """ + k = self.k + cand_sims = np.concatenate( + [self._top_dist, block_top_sims], axis=1, + ) + cand_ids = np.concatenate( + [self._top_ids, block_top_ids], axis=1, + ) + + best = np.argpartition(cand_sims, -k, axis=1)[:, -k:] + self._top_dist = np.take_along_axis(cand_sims, best, axis=1) + self._top_ids = np.take_along_axis(cand_ids, best, axis=1) diff --git a/vdb_benchmark/vdbbench/benchmark/orchestrator.py b/vdb_benchmark/vdbbench/benchmark/orchestrator.py new file mode 100644 index 00000000..35da4041 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/orchestrator.py @@ -0,0 +1,566 @@ +"""Benchmark orchestrator -- producer / consumer pipeline. + +Coordinates three concerns during the **load** phase: + +1. **Producer** (:class:`VectorGenerator`) -- generates random vectors in + blocks on a background thread. +2. **VDB consumer** (:class:`VectorDBBackend`) -- inserts each block into + the target database (main thread, network I/O). +3. **Ground-truth consumer** (:class:`GroundTruthBuilder`) -- computes + brute-force nearest neighbors for each block against the query set + (background thread, runs in parallel with insert). + +And during the **search** phase: + +4. **SearchRunner** -- queries the VDB in batches, computes recall + against the truth table, and logs QPS / latency percentiles. + +Three runtime modes are supported via ``BenchmarkConfig.mode``: + +* ``load`` -- generate vectors, ingest, compute ground truth. +* ``search`` -- run search queries against an already-loaded collection. +* ``both`` -- load then search. + +After all blocks have been processed the orchestrator writes artifacts +to ``output_dir``: + +* **Vectors in the database** -- already stored by the VDB consumer. +* **query_vectors.npy** -- the query-vector matrix. +* **ground_truth.npz** -- the truth table (``truth_table``) and the + query vectors (``query_vectors``). ``truth_table[q]`` is a length-K + array of database IDs ordered closest-first to query *q*. +* **search_results.json** -- search benchmark results (search/both modes). 
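+* **benchmark_meta.json** -- the resolved config plus per-phase timings.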
+ +Usage:: + + from benchmark.orchestrator import BenchmarkOrchestrator + + orch = BenchmarkOrchestrator(config, backend) + orch.run() # blocking -- runs load, search, or both + orch.save(output_dir) # write artifacts +""" + +from __future__ import annotations + +import json +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, Optional + +import numpy as np + +from .backends.base import VectorDBBackend +from .generator import VectorBlock, VectorGenerator, generate_query_vectors +from .ground_truth import GroundTruthBuilder +from .search_runner import ( + SearchResult, + SearchRunner, + build_truth_from_flat, + ensure_flat_collection, +) + +logger = logging.getLogger(__name__) + +# Valid mode values +MODES = ("load", "search", "both") +# Valid truth_mode values +TRUTH_MODES = ("precomputed", "flat_index") + + +@dataclass +class BenchmarkConfig: + """All tunables for a single benchmark run.""" + + # Run mode + mode: str = "load" # "load", "search", or "both" + + # Database vectors + num_vectors: int = 1_000_000 + dimension: int = 1536 + distribution: str = "uniform" + seed: int = 42 + block_size: int = 100_000 + batch_size: int = 10_000 + + # Query vectors + num_query_vectors: int = 10_000 + query_seed: int = 99 + + # Ground truth + truth_k: int = 100 + truth_mode: str = "precomputed" # "precomputed" or "flat_index" + + # Index + collection_name: str = "bench_vectors" + metric_type: str = "COSINE" + index_type: str = "HNSW" + index_params: Dict[str, Any] = field(default_factory=dict) + num_shards: int = 1 + force: bool = False + + # Connection (used by Milvus backend) + host: str = "127.0.0.1" + port: str = "19530" + + # Pipeline tuning + max_queue_depth: int = 4 + + # Post-load + compact: bool = False + monitor_interval: int = 5 + + # Search benchmark + search_k: int = 10 + search_params: Dict[str, Any] = field(default_factory=dict) + num_search_rounds: int = 1 + search_batch_size: int = 1 + log_interval: int = 1000 + + # Artifacts directory (for search mode -- where to load from) + artifacts_dir: str = "" + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "BenchmarkConfig": + """Build from a flat or sectioned dict (like the YAML configs). + + Nested dicts that correspond to known dict-typed fields + (e.g. ``search_params``, ``index_params``) are preserved as-is. + Other nested dicts (YAML sections like ``database``, ``dataset``) + are flattened into the top level. + """ + known = {f.name for f in cls.__dataclass_fields__.values()} + # Fields that are Dict-typed and should stay as dicts + dict_fields = { + f.name for f in cls.__dataclass_fields__.values() + if f.default_factory is dict # type: ignore[comparison-overlap] + } + flat: Dict[str, Any] = {} + for key, val in d.items(): + if isinstance(val, dict) and key not in dict_fields: + # YAML section -- flatten its contents + flat.update(val) + else: + flat[key] = val + return cls(**{k: v for k, v in flat.items() if k in known}) + + +class BenchmarkOrchestrator: + """Wire everything together and drive the pipeline. + + Parameters + ---------- + config : BenchmarkConfig + Benchmark tunables. + backend : VectorDBBackend + A connected backend instance (``connect()`` already called). 
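+
+    Example (a minimal sketch; ``backend`` must already be connected)::
+
+        cfg = BenchmarkConfig.from_dict({
+            "mode": "both",
+            "dataset": {"num_vectors": 100_000, "dimension": 768},
+            "index": {"index_type": "HNSW", "metric_type": "COSINE"},
+        })
+        orch = BenchmarkOrchestrator(cfg, backend)
+        summary = orch.run()
+        orch.save("results/example_run")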
+ """ + + def __init__( + self, + config: BenchmarkConfig, + backend: VectorDBBackend, + ) -> None: + self.cfg = config + self.backend = backend + + self.query_vectors: Optional[np.ndarray] = None + self.truth_table: Optional[np.ndarray] = None + self.search_result: Optional[SearchResult] = None + + # Timing bookkeeping + self._timings: Dict[str, float] = {} + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + def run(self) -> Dict[str, Any]: + """Execute the benchmark in the configured mode. + + Returns a summary dict with timings and counts. + """ + mode = self.cfg.mode.lower() + if mode not in MODES: + raise ValueError( + f"Invalid mode '{mode}'. Must be one of {MODES}" + ) + + summary: Dict[str, Any] = {} + + if mode in ("load", "both"): + summary.update(self._run_load()) + + if mode in ("search", "both"): + summary.update(self._run_search()) + + logger.info("Pipeline complete (%s mode). Summary: %s", mode, summary) + return summary + + def save(self, output_dir: str) -> Dict[str, str]: + """Persist artifacts to *output_dir*. + + Returns a dict mapping artifact name to file path. + """ + os.makedirs(output_dir, exist_ok=True) + paths: Dict[str, str] = {} + + # Query vectors + if self.query_vectors is not None: + qpath = os.path.join(output_dir, "query_vectors.npy") + np.save(qpath, self.query_vectors) + paths["query_vectors"] = qpath + + # Ground-truth table + if self.truth_table is not None: + gtpath = os.path.join(output_dir, "ground_truth.npz") + np.savez_compressed( + gtpath, + truth_table=self.truth_table, + query_vectors=self.query_vectors, + ) + paths["ground_truth"] = gtpath + + # Search results + if self.search_result is not None: + spath = os.path.join(output_dir, "search_results.json") + with open(spath, "w") as f: + json.dump(self.search_result.to_dict(), f, indent=2, default=str) + paths["search_results"] = spath + + # Config + timings + meta = { + "config": self.cfg.to_dict(), + "timings": self._timings, + } + mpath = os.path.join(output_dir, "benchmark_meta.json") + with open(mpath, "w") as f: + json.dump(meta, f, indent=2, default=str) + paths["meta"] = mpath + + logger.info("Artifacts saved to %s", output_dir) + for name, p in paths.items(): + logger.info(" %s -> %s", name, p) + return paths + + # ------------------------------------------------------------------ + # Load phase + # ------------------------------------------------------------------ + def _run_load(self) -> Dict[str, Any]: + """Execute the full load pipeline (blocking).""" + cfg = self.cfg + + # ---- 1. Generate query vectors --------------------------------- + logger.info( + "Generating %s query vectors (%s-d, seed=%d) ...", + f"{cfg.num_query_vectors:,}", f"{cfg.dimension:,}", cfg.query_seed, + ) + t0 = time.time() + self.query_vectors = generate_query_vectors( + num_queries=cfg.num_query_vectors, + dimension=cfg.dimension, + distribution=cfg.distribution, + seed=cfg.query_seed, + ) + self._timings["query_gen_sec"] = time.time() - t0 + logger.info( + "%s query vectors generated in %.2f s", + f"{cfg.num_query_vectors:,}", self._timings["query_gen_sec"], + ) + + # ---- 2. 
Create the collection ---------------------------------- + logger.info( + "Creating collection '%s' (%s / %s) ...", + cfg.collection_name, cfg.index_type, cfg.metric_type, + ) + t0 = time.time() + self.backend.create_collection( + name=cfg.collection_name, + dimension=cfg.dimension, + metric_type=cfg.metric_type, + index_type=cfg.index_type, + index_params=cfg.index_params, + num_shards=cfg.num_shards, + force=cfg.force, + ) + self._timings["create_collection_sec"] = time.time() - t0 + + # ---- 2b. Create FLAT companion (if flat_index truth mode) ------ + flat_name = f"{cfg.collection_name}_flat" + if cfg.truth_mode == "flat_index": + ensure_flat_collection( + backend=self.backend, + source_name=cfg.collection_name, + flat_name=flat_name, + dimension=cfg.dimension, + metric_type=cfg.metric_type, + ) + + # ---- 3. Set up producer and ground-truth builder --------------- + generator = VectorGenerator( + total_vectors=cfg.num_vectors, + dimension=cfg.dimension, + block_size=cfg.block_size, + distribution=cfg.distribution, + seed=cfg.seed, + max_queue_depth=cfg.max_queue_depth, + ) + # Only build brute-force GT when in precomputed mode + gt_builder: Optional[GroundTruthBuilder] = None + if cfg.truth_mode == "precomputed": + gt_builder = GroundTruthBuilder( + query_vectors=self.query_vectors, + k=cfg.truth_k, + metric=cfg.metric_type, + ) + + # ---- 4. Run the pipeline --------------------------------------- + # Insert (network I/O) and GT update (BLAS matmul) both release + # the GIL, so they run truly in parallel when overlapped. + logger.info( + "Starting pipeline: %s vectors, block_size=%s, batch_size=%s", + f"{cfg.num_vectors:,}", f"{cfg.block_size:,}", f"{cfg.batch_size:,}", + ) + t_pipeline = time.time() + total_inserted = 0 + blocks_consumed = 0 + + def _timed_gt_update(builder, blk): + """Run GT update and return its wall-clock time.""" + t0 = time.time() + builder.update(blk) + return time.time() - t0 + + generator.start() + + with ThreadPoolExecutor(max_workers=1, + thread_name_prefix="gt") as gt_pool: + while True: + block: Optional[VectorBlock] = generator.queue.get() + if block is None: + break # sentinel + + n = len(block.ids) + t_wall = time.time() + + # -- kick off GT in background thread -------------------- + gt_future = None + if gt_builder is not None: + gt_future = gt_pool.submit( + _timed_gt_update, gt_builder, block, + ) + + # -- consumer 1: insert into VDB (main thread) ----------- + t_insert = time.time() + for off in range(0, n, cfg.batch_size): + end = min(off + cfg.batch_size, n) + self.backend.insert_batch( + name=cfg.collection_name, + ids=block.ids[off:end], + vectors=block.vectors[off:end], + ) + insert_elapsed = time.time() - t_insert + total_inserted += n + + # -- consumer 1b: mirror into FLAT collection ------------ + if cfg.truth_mode == "flat_index": + for off in range(0, n, cfg.batch_size): + end = min(off + cfg.batch_size, n) + self.backend.insert_batch( + name=flat_name, + ids=block.ids[off:end], + vectors=block.vectors[off:end], + ) + + # -- wait for GT to finish ------------------------------- + gt_elapsed = gt_future.result() if gt_future else 0.0 + wall_elapsed = time.time() - t_wall + + blocks_consumed += 1 + logger.info( + "Block %d/%d consumed: %s vectors " + "(insert=%.2fs | GT=%.2fs | wall=%.2fs). 
" + "Total: %s / %s", + blocks_consumed, generator.num_blocks, f"{n:,}", + insert_elapsed, gt_elapsed, wall_elapsed, + f"{total_inserted:,}", f"{cfg.num_vectors:,}", + ) + + generator.join() # propagate any producer error + + self._timings["pipeline_sec"] = time.time() - t_pipeline + logger.info( + "%s vectors inserted in %.2f s", + f"{total_inserted:,}", self._timings["pipeline_sec"], + ) + + # ---- 5. Flush + optional compaction + wait for index -------------- + logger.info("Flushing collection ...") + t0 = time.time() + self.backend.flush(cfg.collection_name) + if cfg.truth_mode == "flat_index": + self.backend.flush(flat_name) + self._timings["flush_sec"] = time.time() - t0 + logger.info("Flush completed in %.2f s", self._timings["flush_sec"]) + + if cfg.compact: + logger.info("Compacting segments ...") + t0 = time.time() + self.backend.compact(cfg.collection_name) + self.backend.flush(cfg.collection_name) + self._timings["compact_sec"] = time.time() - t0 + logger.info("Compaction completed in %.2f s", self._timings["compact_sec"]) + + logger.info("Waiting for index build ...") + t0 = time.time() + self.backend.wait_for_index( + cfg.collection_name, interval=cfg.monitor_interval, + compacted=cfg.compact, + ) + self._timings["index_build_sec"] = time.time() - t0 + + # ---- 7. Finalize ground truth ---------------------------------- + if gt_builder is not None: + logger.info("Building final truth table (k=%d) ...", cfg.truth_k) + t0 = time.time() + self.truth_table = gt_builder.build() + self._timings["truth_build_sec"] = time.time() - t0 + logger.info( + "Ground truth built in %.2f s (%s queries x k=%s)", + self._timings["truth_build_sec"], + f"{cfg.num_query_vectors:,}", f"{cfg.truth_k:,}", + ) + elif cfg.truth_mode == "flat_index": + logger.info( + "Building truth table from FLAT collection (k=%d) ...", + cfg.truth_k, + ) + t0 = time.time() + self.truth_table = build_truth_from_flat( + backend=self.backend, + flat_collection_name=flat_name, + query_vectors=self.query_vectors, + truth_k=cfg.truth_k, + metric_type=cfg.metric_type, + ) + self._timings["truth_build_sec"] = time.time() - t0 + logger.info( + "Ground truth (FLAT) built in %.2f s (%s queries x k=%s)", + self._timings["truth_build_sec"], + f"{cfg.num_query_vectors:,}", f"{cfg.truth_k:,}", + ) + + return self._load_summary(total_inserted, blocks_consumed) + + # ------------------------------------------------------------------ + # Search phase + # ------------------------------------------------------------------ + def _run_search(self) -> Dict[str, Any]: + """Execute the search benchmark (blocking).""" + cfg = self.cfg + + # ---- 1. Load query vectors + truth table ----------------------- + if self.query_vectors is None or self.truth_table is None: + self._load_artifacts() + + # ---- 2. Build search params ------------------------------------ + search_params = cfg.search_params + if not search_params: + search_params = { + "metric_type": cfg.metric_type, + "params": {}, + } + + # ---- 3. 
Run the search benchmark ------------------------------- + runner = SearchRunner( + backend=self.backend, + collection_name=cfg.collection_name, + query_vectors=self.query_vectors, + truth_table=self.truth_table, + search_k=cfg.search_k, + search_params=search_params, + metric_type=cfg.metric_type, + num_rounds=cfg.num_search_rounds, + batch_size=cfg.search_batch_size, + log_interval=cfg.log_interval, + ) + + t0 = time.time() + self.search_result = runner.run() + self._timings["search_sec"] = time.time() - t0 + + return self._search_summary() + + def _load_artifacts(self) -> None: + """Load query vectors and truth table from a previous run.""" + d = self.cfg.artifacts_dir + if not d: + raise ValueError( + "In 'search' mode, either run 'load' first (mode=both) " + "or provide --artifacts-dir pointing to a previous run." + ) + qpath = os.path.join(d, "query_vectors.npy") + gtpath = os.path.join(d, "ground_truth.npz") + + if not os.path.isfile(qpath) or not os.path.isfile(gtpath): + raise FileNotFoundError( + f"Expected artifacts not found in '{d}'. " + f"Looking for query_vectors.npy and ground_truth.npz" + ) + + self.query_vectors = np.load(qpath) + gt = np.load(gtpath) + self.truth_table = gt["truth_table"] + + logger.info( + "Loaded artifacts from '%s': queries=%s, truth=%s", + d, self.query_vectors.shape, self.truth_table.shape, + ) + + # If truth_mode is flat_index and we don't have precomputed truth, + # build it on-the-fly + if (self.cfg.truth_mode == "flat_index" + and self.truth_table is None): + flat_name = f"{self.cfg.collection_name}_flat" + self.truth_table = build_truth_from_flat( + backend=self.backend, + flat_collection_name=flat_name, + query_vectors=self.query_vectors, + truth_k=self.cfg.truth_k, + metric_type=self.cfg.metric_type, + ) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + def _load_summary(self, total_inserted: int, blocks: int) -> Dict[str, Any]: + return { + "total_vectors_inserted": total_inserted, + "blocks_processed": blocks, + "num_query_vectors": self.cfg.num_query_vectors, + "truth_k": self.cfg.truth_k, + "truth_table_shape": list(self.truth_table.shape) + if self.truth_table is not None + else None, + "timings": dict(self._timings), + } + + def _search_summary(self) -> Dict[str, Any]: + r = self.search_result + if r is None: + return {} + return { + "search_total_queries": r.total_queries, + "search_qps": r.qps, + "search_recall_at_k": r.recall_at_k, + "search_latency_p50_ms": r.latency_p50_ms, + "search_latency_p90_ms": r.latency_p90_ms, + "search_latency_p99_ms": r.latency_p99_ms, + "search_latency_mean_ms": r.latency_mean_ms, + "search_wall_sec": r.total_wall_sec, + "timings": dict(self._timings), + } diff --git a/vdb_benchmark/vdbbench/benchmark/run_benchmark.py b/vdb_benchmark/vdbbench/benchmark/run_benchmark.py new file mode 100755 index 00000000..e9a463ab --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/run_benchmark.py @@ -0,0 +1,581 @@ +#!/usr/bin/env python3 +"""CLI entry point for the producer-consumer vector-DB benchmark. 
+ +Usage examples:: + + # List available backends + python -m vdbbench.benchmark.run_benchmark help backends + + # Show detailed help for a specific backend + python -m vdbbench.benchmark.run_benchmark help backend milvus + + # Run a benchmark (config-driven) + python -m vdbbench.benchmark.run_benchmark --config configs/1m_hnsw.yaml + + # Override mode or backend on the CLI + python -m vdbbench.benchmark.run_benchmark --config configs/1m_hnsw.yaml --mode both + python -m vdbbench.benchmark.run_benchmark --config configs/1m_hnsw.yaml --backend pgvector + + # Dry-run (print resolved config and exit) + python -m vdbbench.benchmark.run_benchmark --config configs/1m_hnsw.yaml --what-if + + # Direct script execution also works: + python benchmark/run_benchmark.py help backend milvus + +All dataset, index, search, and connection parameters are set in the YAML +config file. The CLI is intentionally minimal -- only operational switches +(``--mode``, ``--backend``, ``--force``, ``--output-dir``, etc.) may be +given on the command line. +""" + +from __future__ import annotations + +import sys + +# ------------------------------------------------------------------ +# Direct-execution bootstrap. When someone runs this file as a script +# (``python run_benchmark.py …``), Python sets __name__ = "__main__" +# and relative imports are impossible. We detect that case *before* +# any relative imports, fix sys.path, re-import ourselves as a proper +# package member, and delegate to main(). +# ------------------------------------------------------------------ +if __name__ == "__main__": + import importlib + import pathlib + + _this = pathlib.Path(__file__).resolve() + # …/vdb_benchmark/vdbbench/benchmark/run_benchmark.py + # parent.parent.parent → …/vdb_benchmark (contains vdbbench/) + _pkg_root = str(_this.parent.parent.parent) + if _pkg_root not in sys.path: + sys.path.insert(0, _pkg_root) + + _mod = importlib.import_module("vdbbench.benchmark.run_benchmark") + raise SystemExit(_mod.main()) + +# ------------------------------------------------------------------ +# Normal imports (only reached when loaded as a package member). 
+# ------------------------------------------------------------------
+
+import argparse
+import json
+import logging
+import math
+import os
+import sys
+import time
+from datetime import datetime
+
+import yaml
+
+from .backends import registry, get_backend
+from .backends._env import load_env_file, env_for_backend
+from .backends._help import format_backend_help, format_backends_list
+from .orchestrator import BenchmarkConfig, BenchmarkOrchestrator, MODES, TRUTH_MODES
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+# ------------------------------------------------------------------
+# YAML helpers (mirrors existing config_loader.py pattern)
+# ------------------------------------------------------------------
+
+def _load_yaml(path: str) -> dict:
+    """Try *path* directly, then under ``configs/``."""
+    for candidate in [path, os.path.join("configs", path)]:
+        if os.path.isfile(candidate):
+            with open(candidate) as fh:
+                cfg = yaml.safe_load(fh)
+            logger.info("Loaded config from %s", candidate)
+            return cfg or {}
+    # Also try relative to this file's directory
+    pkg_dir = os.path.dirname(os.path.abspath(__file__))
+    candidate = os.path.join(pkg_dir, "configs", path)
+    if os.path.isfile(candidate):
+        with open(candidate) as fh:
+            cfg = yaml.safe_load(fh)
+        logger.info("Loaded config from %s", candidate)
+        return cfg or {}
+    logger.error("Config file not found: %s", path)
+    return {}
+
+# ------------------------------------------------------------------
+# Help sub-commands
+# ------------------------------------------------------------------
+
+def _handle_help(argv: list[str]) -> bool:
+    """If *argv* starts with ``help ...``, print the requested info
+    and return ``True`` (meaning: handled, exit). Otherwise return
+    ``False``.
+    """
+    if not argv or argv[0].lower() != "help":
+        return False
+
+    rest = [a.lower() for a in argv[1:]]
+
+    # help backends
+    if rest == ["backends"]:
+        print(format_backends_list(registry))
+        return True
+
+    # help backend <name>
+    if len(rest) == 2 and rest[0] == "backend":
+        print(format_backend_help(registry, rest[1]))
+        return True
+
+    # Bare "help" or unknown
+    print("Usage:")
+    print("  help backends        -- list all registered backends")
+    print("  help backend <name>  -- show parameters for a backend")
+    print()
+    print(format_backends_list(registry))
+    return True
+
+# ------------------------------------------------------------------
+# CLI
+# ------------------------------------------------------------------
+
+def _build_parser() -> argparse.ArgumentParser:
+    available = ", ".join(registry.names()) or "(none)"
+    p = argparse.ArgumentParser(
+        description="Vector-DB benchmark: generate, ingest, build ground truth, and search",
+        epilog=(
+            "All dataset, index, search, and connection parameters live in "
+            "the YAML config file. Run 'help backends' or "
+            "'help backend <name>' for backend-specific details."
+ ), + ) + + # Config file (the primary input) + p.add_argument("--config", type=str, required=False, + help="Path to YAML config file (required for benchmark runs)") + + # Operational overrides (take precedence over YAML values) + p.add_argument( + "--mode", type=str, dest="mode", + choices=list(MODES), + help="Override runtime mode: 'load', 'search', or 'both'", + ) + p.add_argument( + "--backend", type=str, dest="backend", + help=f"Override backend ({available})", + ) + p.add_argument("--force", action="store_true", default=None, + help="Drop collection if it already exists") + p.add_argument("--output-dir", type=str, dest="output_dir", + help="Directory for artifacts (default: auto-timestamped)") + p.add_argument("--artifacts-dir", type=str, dest="artifacts_dir", + help="Load query/truth artifacts from this directory " + "(required for --mode search without prior load)") + + # Introspection + p.add_argument("--what-if", action="store_true", + help="Print resolved config and exit") + p.add_argument("--plan", action="store_true", + help="Show the full execution plan (steps, sizes, " + "estimates) without running anything") + p.add_argument("--debug", action="store_true", + help="Enable DEBUG logging") + + return p + + +def _merge_cli_over_yaml(yaml_cfg: dict, cli_ns: argparse.Namespace) -> dict: + """Flatten YAML sections and overlay non-None CLI values.""" + flat: dict = {} + for key, val in yaml_cfg.items(): + if isinstance(val, dict): + flat.update(val) + else: + flat[key] = val + + skip = {"config", "what_if", "plan", "debug", "output_dir", "artifacts_dir"} + for key, val in vars(cli_ns).items(): + if key in skip: + continue + if val is not None: + flat[key] = val + + return flat + + +def _collect_index_params(flat: dict) -> dict: + """Pull index-specific keys into the nested ``index_params`` dict.""" + ip = flat.get("index_params", {}) + if isinstance(ip, dict): + ip = dict(ip) + else: + ip = {} + for k in ("M", "efConstruction", "MaxDegree", "SearchListSize", + "inline_pq", "max_degree", "search_list_size", + "lists", "ef_search", "probes"): + if k in flat and flat[k] is not None: + ip[k] = flat[k] + flat["index_params"] = ip + return flat + + +def _resolve_backend_name(flat: dict, cli_ns: argparse.Namespace) -> str: + """Determine which backend to use. + + Precedence: ``--backend`` CLI flag > ``backend`` key in YAML config + > ``"milvus"`` (default). + """ + if cli_ns.backend: + return cli_ns.backend.lower() + if "backend" in flat: + return str(flat["backend"]).lower() + return "milvus" + + +# ------------------------------------------------------------------ +# Plan formatter +# ------------------------------------------------------------------ + +def _sizeof_fmt(num_bytes: float) -> str: + """Human-readable byte size (e.g. ``5.86 GB``).""" + for unit in ("B", "KB", "MB", "GB", "TB"): + if abs(num_bytes) < 1024: + return f"{num_bytes:.2f} {unit}" + num_bytes /= 1024 + return f"{num_bytes:.2f} PB" + + +def _format_plan(cfg: BenchmarkConfig, desc) -> str: + """Build a human-readable execution plan from *cfg* and the backend + *desc* (:class:`BackendDescriptor`). No database connection needed. 
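+
+    Illustrative call (assumes a backend registered under ``milvus``)::
+
+        desc = registry.get("milvus")
+        print(_format_plan(BenchmarkConfig(), desc))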
+ """ + W = 64 + SEP = "-" * W + lines: list[str] = [] + + def heading(title: str) -> None: + lines.append("") + lines.append("=" * W) + lines.append(f" {title}") + lines.append("=" * W) + + def step(num: int, title: str) -> None: + lines.append("") + lines.append(SEP) + lines.append(f" Step {num}: {title}") + lines.append(SEP) + + def kv(key: str, val, indent: int = 4) -> None: + pad = " " * indent + lines.append(f"{pad}{key:<32s}: {val}") + + # -- Sizes ----------------------------------------------------------- + bytes_per_vector = cfg.dimension * 4 # float32 + db_vector_bytes = cfg.num_vectors * bytes_per_vector + query_vector_bytes = cfg.num_query_vectors * bytes_per_vector + # truth table: int64 ids per query + truth_bytes = cfg.num_query_vectors * cfg.truth_k * 8 + num_blocks = math.ceil(cfg.num_vectors / cfg.block_size) + inserts_per_block = math.ceil(cfg.block_size / cfg.batch_size) + total_inserts = num_blocks * inserts_per_block + + # Ground-truth working memory: the builder keeps a running top-K + # matrix of shape (num_queries, K) for IDs and distances (both float64). + gt_working_bytes = cfg.num_query_vectors * cfg.truth_k * 8 * 2 + + # Per-block GT compute: cosine/IP needs (num_queries x block_size) + # distance matrix in float32. + gt_block_bytes = cfg.num_query_vectors * cfg.block_size * 4 + + # -- Header ---------------------------------------------------------- + heading("BENCHMARK EXECUTION PLAN") + lines.append("") + kv("Backend", f"{desc.display_name} (--backend {desc.name})") + kv("Mode", cfg.mode) + kv("Collection", cfg.collection_name) + kv("Force recreate", "yes" if cfg.force else "no") + + # -- Step 1: Query vector generation --------------------------------- + step(1, "Generate query vectors") + kv("Num query vectors", f"{cfg.num_query_vectors:,}") + kv("Dimension", f"{cfg.dimension:,}") + kv("Distribution", cfg.distribution) + kv("Query seed", cfg.query_seed) + kv("Memory", _sizeof_fmt(query_vector_bytes)) + kv("Output", "held in memory (saved to query_vectors.npy later)") + + # -- Step 2: Create collection + index ------------------------------- + step(2, "Create collection and index") + kv("Index type", cfg.index_type) + kv("Metric type", cfg.metric_type) + kv("Num shards", cfg.num_shards) + idx_desc = desc.get_index(cfg.index_type) + if idx_desc and cfg.index_params: + for p in idx_desc.build_params: + val = cfg.index_params.get(p.name, p.default) + kv(f" {p.name}", val) + elif idx_desc: + for p in idx_desc.build_params: + kv(f" {p.name}", f"{p.default} (default)") + + # -- Step 3: Vector generation + ingestion + GT ---------------------- + step(3, "Generate, ingest, and compute ground truth") + lines.append("") + lines.append(" Producer (background thread):") + kv("Total database vectors", f"{cfg.num_vectors:,}") + kv("Dimension", f"{cfg.dimension:,}") + kv("Distribution", cfg.distribution) + kv("Vector seed", cfg.seed) + kv("Block size", f"{cfg.block_size:,} vectors") + kv("Num blocks", f"{num_blocks:,}") + kv("Queue depth", f"{cfg.max_queue_depth} blocks") + kv("Per-block memory", _sizeof_fmt(cfg.block_size * bytes_per_vector)) + kv("Total vector data", _sizeof_fmt(db_vector_bytes)) + + lines.append("") + lines.append(" Consumer 1 -- Database ingestion:") + kv("Batch size", f"{cfg.batch_size:,} vectors/insert") + kv("Inserts per block", f"{inserts_per_block:,}") + kv("Total insert calls", f"{total_inserts:,}") + + lines.append("") + lines.append(" Consumer 2 -- Ground-truth builder:") + kv("Query vectors", f"{cfg.num_query_vectors:,}") + kv("K 
(neighbors)", f"{cfg.truth_k:,}") + kv("Metric", cfg.metric_type) + kv("Per-block distance matrix", _sizeof_fmt(gt_block_bytes)) + kv("Running top-K memory", _sizeof_fmt(gt_working_bytes)) + + # -- Step 4: Flush --------------------------------------------------- + step(4, "Flush collection") + kv("Action", "commit pending writes to storage") + + # -- Step 5: Optional compaction ------------------------------------- + if cfg.compact: + step(5, "Compact collection") + kv("Action", "merge small segments before index build") + else: + lines.append("") + lines.append(f" (Step 5: Compact -- skipped, compact not set)") + + # -- Step 6: Wait for index build ------------------------------------ + step(6, "Wait for index build") + kv("Poll interval", f"{cfg.monitor_interval}s") + + # -- Step 7: Finalize ground truth ----------------------------------- + step(7, "Finalize ground truth") + kv("Truth table shape", f"({cfg.num_query_vectors:,}, {cfg.truth_k:,})") + kv("Truth table size", _sizeof_fmt(truth_bytes)) + + # -- Step 8: Save artifacts ------------------------------------------ + step(8, "Save artifacts") + kv("query_vectors.npy", _sizeof_fmt(query_vector_bytes)) + kv("ground_truth.npz", f"~{_sizeof_fmt(truth_bytes + query_vector_bytes)}" + " (compressed)") + kv("benchmark_meta.json", "config + timings") + + # -- Search steps (when mode is 'search' or 'both') ------------------ + mode = cfg.mode.lower() + if mode in ("search", "both"): + step(9, "Load collection into memory") + kv("Collection", cfg.collection_name) + kv("Action", "ensure collection is loaded for search") + + step(10, "Run search benchmark") + kv("Search K (top-K)", cfg.search_k) + kv("Query vectors", f"{cfg.num_query_vectors:,}") + kv("Rounds", cfg.num_search_rounds) + kv("Batch size", cfg.search_batch_size) + kv("Log interval", f"every {cfg.log_interval} queries") + kv("Truth K", cfg.truth_k) + kv("Search params", cfg.search_params or "(backend defaults)") + kv("Total queries", f"{cfg.num_query_vectors * cfg.num_search_rounds:,}") + + # -- Summary --------------------------------------------------------- + heading("RESOURCE ESTIMATES") + lines.append("") + peak_mem = ( + query_vector_bytes # query vectors + + cfg.max_queue_depth * cfg.block_size * bytes_per_vector # queue + + gt_working_bytes # GT top-K state + + gt_block_bytes # GT distance matrix + ) + kv("Peak memory (estimate)", _sizeof_fmt(peak_mem)) + kv("Total vector data generated", _sizeof_fmt(db_vector_bytes)) + kv("Disk artifacts (approx)", _sizeof_fmt( + query_vector_bytes + truth_bytes + query_vector_bytes + 4096)) + lines.append("") + + return "\n".join(lines) +# Main +# ------------------------------------------------------------------ + +def main(argv: list[str] | None = None) -> int: + raw_argv = argv if argv is not None else sys.argv[1:] + + # No arguments at all → show usage and exit. + if not raw_argv: + _build_parser().print_help() + print() + print(format_backends_list(registry)) + return 0 + + # Intercept "help" sub-commands before argparse runs. + if _handle_help(raw_argv): + return 0 + + parser = _build_parser() + args = parser.parse_args(raw_argv) + + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + + # A config file is required for any real work. 
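+    # (--what-if and --plan are still allowed without a config file; they
+    #  simply print the resolved defaults / execution plan and exit.)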
+ if not args.config and not (args.what_if or args.plan): + parser.error("--config is required (or use --what-if / --plan)") + + # Load .env file (if python-dotenv is installed and .env exists) + load_env_file() + + # Build resolved config: defaults <- YAML <- CLI overrides + yaml_cfg = _load_yaml(args.config) if args.config else {} + flat = _merge_cli_over_yaml(yaml_cfg, args) + flat = _collect_index_params(flat) + + # Inject CLI-only overrides that are not part of YAML sections + if args.artifacts_dir is not None: + flat["artifacts_dir"] = args.artifacts_dir + + # Resolve backend + backend_name = _resolve_backend_name(flat, args) + desc = registry.get(backend_name) + if desc is None: + available = ", ".join(registry.names()) or "(none)" + parser.error( + f"Unknown backend '{backend_name}'. Available: {available}" + ) + + cfg = BenchmarkConfig.from_dict(flat) + + # --what-if: show config and exit + if args.what_if: + print(f"\nBackend: {desc.display_name} (--backend {desc.name})") + print("\nResolved benchmark configuration:") + print("=" * 60) + display = {k: v for k, v in cfg.to_dict().items() + if not (k == "compact" and v)} + print(json.dumps(display, indent=2, default=str)) + print("=" * 60) + + # Show resolved connection parameters with sources + _env = env_for_backend(backend_name, desc) + if desc.connection_params: + print("\nConnection parameters (source):") + for p in desc.connection_params: + k = p.name + env_val = _env.get(k) + yaml_val = flat.get(k) + if env_val is not None: + print(f" {k}: {env_val!r} (env: {backend_name.upper()}__{k.upper()})") + elif yaml_val is not None: + print(f" {k}: {yaml_val!r} (config)") + else: + print(f" {k}: {p.default!r} (default)") + return 0 + + # --plan: show step-by-step execution plan and exit + if args.plan: + print(_format_plan(cfg, desc)) + return 0 + + # Validate essentials + mode = cfg.mode.lower() + if mode in ("load", "both"): + if not cfg.collection_name or not cfg.dimension or not cfg.num_vectors: + parser.error( + "collection_name, dimension, and num_vectors are required " + "for load/both modes (set them in the config file)." + ) + elif mode == "search": + if not cfg.collection_name: + parser.error( + "collection_name is required for search mode " + "(set it in the config file)." + ) + if not cfg.artifacts_dir: + parser.error( + "--artifacts-dir is required for search mode to load " + "query vectors and ground truth." + ) + + # Validate index type against backend capabilities + if cfg.index_type and cfg.index_type.upper() not in ( + n.upper() for n in desc.index_names() + ): + parser.error( + f"Backend '{desc.name}' does not support index type " + f"'{cfg.index_type}'. Supported: {', '.join(desc.index_names())}" + ) + + # Output directory + output_dir = args.output_dir or os.path.join( + "results", + f"{cfg.collection_name}_{datetime.now():%Y%m%d_%H%M%S}", + ) + + # Connect backend. 
+ # Precedence: environment variables (.env / shell) > YAML config > defaults + backend = desc.backend_class() + env_kwargs = env_for_backend(backend_name, desc) + conn_kwargs: dict = {} + for p in desc.connection_params: + k = p.name + env_val = env_kwargs.get(k) # env var / .env file + yaml_val = flat.get(k) # YAML config + if env_val is not None: + conn_kwargs[k] = env_val + elif yaml_val is not None: + conn_kwargs[k] = yaml_val + # else: omitted → backend.connect() uses its own default + backend.connect(**conn_kwargs) + + try: + orch = BenchmarkOrchestrator(config=cfg, backend=backend) + summary = orch.run() + paths = orch.save(output_dir) + + mode = cfg.mode.lower() + + print("\n" + "=" * 60) + print(f"BENCHMARK COMPLETE (backend: {desc.display_name}, mode: {mode})") + print("=" * 60) + + if mode in ("load", "both"): + print(f" Vectors inserted : {summary.get('total_vectors_inserted', 'N/A'):,}") + print(f" Query vectors : {cfg.num_query_vectors:,}") + print(f" Truth table : {summary.get('truth_table_shape', 'N/A')}") + print(f" Truth mode : {cfg.truth_mode}") + + if mode in ("search", "both"): + print(f"\n --- Search Results ---") + print(f" Total queries : {summary.get('search_total_queries', 'N/A'):,}") + print(f" QPS : {summary.get('search_qps', 0):.1f}") + print(f" Recall@{cfg.search_k:<9d}: {summary.get('search_recall_at_k', 0):.4f}") + print(f" Latency P50 : {summary.get('search_latency_p50_ms', 0):.2f} ms") + print(f" Latency P90 : {summary.get('search_latency_p90_ms', 0):.2f} ms") + print(f" Latency P99 : {summary.get('search_latency_p99_ms', 0):.2f} ms") + print(f" Latency mean : {summary.get('search_latency_mean_ms', 0):.2f} ms") + print(f" Wall time : {summary.get('search_wall_sec', 0):.2f} s") + + print(f"\n Output dir : {output_dir}") + for name, p in paths.items(): + print(f" {name:20s} -> {p}") + print("=" * 60) + print("\nTimings:") + for k, v in summary.get("timings", {}).items(): + print(f" {k:30s} : {v:>10.2f} s") + print() + + finally: + backend.disconnect() + + return 0 diff --git a/vdb_benchmark/vdbbench/benchmark/search_runner.py b/vdb_benchmark/vdbbench/benchmark/search_runner.py new file mode 100644 index 00000000..016b9f69 --- /dev/null +++ b/vdb_benchmark/vdbbench/benchmark/search_runner.py @@ -0,0 +1,463 @@ +"""Search benchmark runner -- query the VDB and measure performance. + +Sends query vectors to the vector database in batches, measures +latency per batch, computes recall against a ground-truth table, +and periodically logs aggregate statistics. + +Two ground-truth modes are supported: + +* **precomputed** -- a truth table (``num_queries × K`` array of IDs) + is provided up-front (e.g. from the load phase). +* **flat_index** -- a second collection with a ``FLAT`` index is + queried at the start of the run to build the truth table on-the-fly. 
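+
+Recall is reported as recall@k: for each query, the fraction of the k
+exact nearest IDs that also appear in the k IDs returned by the ANN
+search, averaged over all queries (see :func:`_recall_at_k` below).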
+ +Usage:: + + runner = SearchRunner(cfg, backend, query_vectors, truth_table) + result = runner.run() + runner.save(output_dir) +""" + +from __future__ import annotations + +import json +import logging +import os +import time +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List, Optional + +import numpy as np + +from .backends.base import VectorDBBackend + +logger = logging.getLogger(__name__) + + +# ===================================================================== +# Result data model +# ===================================================================== + +@dataclass +class IntervalStats: + """Stats captured every *log_interval* queries.""" + interval_index: int + wall_clock_sec: float + total_queries: int + interval_queries: int + qps_cumulative: float + qps_interval: float + recall_at_k: float + latency_p50_ms: float + latency_p90_ms: float + latency_p99_ms: float + latency_mean_ms: float + + +@dataclass +class SearchResult: + """Final result of a search benchmark run.""" + total_queries: int + total_wall_sec: float + qps: float + recall_at_k: float + search_k: int + truth_k: int + + # Aggregate latency (all queries) + latency_p50_ms: float + latency_p90_ms: float + latency_p99_ms: float + latency_mean_ms: float + + # Per-interval snapshots + intervals: List[Dict[str, Any]] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +# ===================================================================== +# Recall helpers +# ===================================================================== + +def _recall_at_k( + predicted_ids: np.ndarray, + truth_ids: np.ndarray, + k: int, +) -> float: + """Compute mean recall@k across all queries. + + Parameters + ---------- + predicted_ids : np.ndarray + Shape ``(nq, pred_k)`` -- IDs returned by ANN search. + truth_ids : np.ndarray + Shape ``(nq, truth_k)`` -- ground-truth nearest IDs. + k : int + Evaluate recall using the top-*k* of the truth table. + + Returns + ------- + float + Mean recall in [0, 1]. + """ + nq = predicted_ids.shape[0] + truth_top_k = truth_ids[:, :k] + hits = 0 + for q in range(nq): + gt_set = set(truth_top_k[q].tolist()) + pred_set = set(predicted_ids[q].tolist()) + hits += len(gt_set & pred_set) + return hits / (nq * k) + + +# ===================================================================== +# Ground-truth via FLAT index +# ===================================================================== + +def build_truth_from_flat( + backend: VectorDBBackend, + flat_collection_name: str, + query_vectors: np.ndarray, + truth_k: int, + metric_type: str = "COSINE", +) -> np.ndarray: + """Query a FLAT-index collection to produce a truth table. + + Parameters + ---------- + backend : + Connected backend instance. + flat_collection_name : + Name of a collection that already has a FLAT index and + contains the same vectors as the ANN collection. + query_vectors : + Shape ``(nq, dim)``, dtype float32. + truth_k : + Number of neighbors per query. + metric_type : + Distance metric used by the collection. + + Returns + ------- + np.ndarray + Shape ``(nq, truth_k)``, dtype int64. 
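+
+    Example (illustrative; ``bench_vectors_flat`` follows the
+    ``<collection>_flat`` naming convention used by the orchestrator)::
+
+        truth = build_truth_from_flat(
+            backend, "bench_vectors_flat", query_vectors, truth_k=100,
+        )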
+ """ + logger.info( + "Building truth table from FLAT collection '%s' (k=%d) ...", + flat_collection_name, truth_k, + ) + t0 = time.time() + + # Search in small batches to avoid overwhelming the server + batch = 100 + nq = query_vectors.shape[0] + all_ids: list[list[int]] = [] + + search_params = { + "metric_type": metric_type, + "params": {}, + } + + for start in range(0, nq, batch): + end = min(start + batch, nq) + batch_results = backend.search( + name=flat_collection_name, + query_vectors=query_vectors[start:end], + top_k=truth_k, + search_params=search_params, + ) + all_ids.extend(batch_results) + + truth = np.array(all_ids, dtype=np.int64) + elapsed = time.time() - t0 + logger.info( + "Truth table built from FLAT index in %.2f s (shape %s)", + elapsed, truth.shape, + ) + return truth + + +def ensure_flat_collection( + backend: VectorDBBackend, + source_name: str, + flat_name: str, + dimension: int, + metric_type: str, +) -> bool: + """Create the FLAT companion collection if it does not exist. + + Returns True if the collection already exists, False if it must + be populated by the caller (e.g. during the load phase). + """ + if backend.collection_exists(flat_name): + logger.info("FLAT collection '%s' already exists", flat_name) + return True + + logger.info("Creating FLAT collection '%s' ...", flat_name) + backend.create_collection( + name=flat_name, + dimension=dimension, + metric_type=metric_type, + index_type="FLAT", + index_params={}, + num_shards=1, + force=False, + ) + return False + + +# ===================================================================== +# Search runner +# ===================================================================== + +class SearchRunner: + """Execute a search benchmark against a loaded VDB collection. + + Parameters + ---------- + backend : + Connected backend (collection must already be loaded with data). + collection_name : + Name of the ANN collection to search. + query_vectors : + Shape ``(nq, dim)``, dtype float32. + truth_table : + Shape ``(nq, truth_k)``, dtype int64 -- ground-truth IDs. + search_k : + Number of neighbors to retrieve per query. + search_params : + Backend-specific search parameters (e.g. ``ef`` for HNSW). + metric_type : + Distance metric (for ``search_params`` wrapper). + num_rounds : + How many times to cycle through the full query set. + batch_size : + Number of query vectors per ``backend.search()`` call. + log_interval : + Log aggregate stats every *log_interval* queries. + """ + + def __init__( + self, + backend: VectorDBBackend, + collection_name: str, + query_vectors: np.ndarray, + truth_table: np.ndarray, + search_k: int = 10, + search_params: Optional[Dict[str, Any]] = None, + metric_type: str = "COSINE", + num_rounds: int = 1, + batch_size: int = 1, + log_interval: int = 1000, + ) -> None: + self.backend = backend + self.collection_name = collection_name + self.query_vectors = np.ascontiguousarray(query_vectors, dtype=np.float32) + self.truth_table = truth_table + self.search_k = search_k + self.metric_type = metric_type + self.num_rounds = num_rounds + self.batch_size = batch_size + self.log_interval = log_interval + + # Build search params in the format backends expect + if search_params is not None: + self.search_params = search_params + else: + self.search_params = { + "metric_type": metric_type, + "params": {}, + } + + self.result: Optional[SearchResult] = None + + def run(self) -> SearchResult: + """Run the search benchmark. + + Returns + ------- + SearchResult + Aggregate and per-interval statistics. 
+ """ + nq = self.query_vectors.shape[0] + total_queries_planned = nq * self.num_rounds + k = self.search_k + + logger.info( + "Starting search benchmark: %s queries x %d rounds = %s total, " + "k=%d, batch_size=%d, log every %s queries", + f"{nq:,}", self.num_rounds, f"{total_queries_planned:,}", + k, self.batch_size, f"{self.log_interval:,}", + ) + + all_latencies: list[float] = [] + all_predicted: list[np.ndarray] = [] + all_truth: list[np.ndarray] = [] + intervals: list[IntervalStats] = [] + + # Latencies for the current logging interval + interval_latencies: list[float] = [] + interval_predicted: list[np.ndarray] = [] + interval_truth: list[np.ndarray] = [] + interval_idx = 0 + + total_queries = 0 + wall_start = time.time() + interval_start = wall_start + + for round_num in range(self.num_rounds): + # Shuffle query order each round (except the first) for + # realistic cache behavior + if round_num == 0: + order = np.arange(nq) + else: + order = np.random.permutation(nq) + + for batch_start in range(0, nq, self.batch_size): + batch_end = min(batch_start + self.batch_size, nq) + batch_idx = order[batch_start:batch_end] + batch_queries = self.query_vectors[batch_idx] + batch_truth = self.truth_table[batch_idx] + + # Timed search + t0 = time.perf_counter() + result_ids = self.backend.search( + name=self.collection_name, + query_vectors=batch_queries, + top_k=k, + search_params=self.search_params, + ) + elapsed_ms = (time.perf_counter() - t0) * 1000.0 + + batch_n = batch_end - batch_start + per_query_ms = elapsed_ms / batch_n + + # Record per-query latency + for _ in range(batch_n): + all_latencies.append(per_query_ms) + interval_latencies.append(per_query_ms) + + predicted_arr = np.array(result_ids, dtype=np.int64) + all_predicted.append(predicted_arr) + all_truth.append(batch_truth) + interval_predicted.append(predicted_arr) + interval_truth.append(batch_truth) + + total_queries += batch_n + + # Check if we should log an interval + if total_queries >= (interval_idx + 1) * self.log_interval: + stats = self._compute_interval( + interval_idx=interval_idx, + wall_start=wall_start, + interval_start=interval_start, + total_queries=total_queries, + interval_latencies=interval_latencies, + interval_predicted=interval_predicted, + interval_truth=interval_truth, + ) + intervals.append(stats) + self._log_stats(stats) + + # Reset interval accumulators + interval_latencies = [] + interval_predicted = [] + interval_truth = [] + interval_start = time.time() + interval_idx += 1 + + wall_elapsed = time.time() - wall_start + + # Final stats across all queries + lat_arr = np.array(all_latencies) + pred_all = np.concatenate(all_predicted, axis=0) + truth_all = np.concatenate(all_truth, axis=0) + recall = _recall_at_k(pred_all, truth_all, k) + + self.result = SearchResult( + total_queries=total_queries, + total_wall_sec=wall_elapsed, + qps=total_queries / wall_elapsed if wall_elapsed > 0 else 0, + recall_at_k=recall, + search_k=k, + truth_k=self.truth_table.shape[1], + latency_p50_ms=float(np.percentile(lat_arr, 50)), + latency_p90_ms=float(np.percentile(lat_arr, 90)), + latency_p99_ms=float(np.percentile(lat_arr, 99)), + latency_mean_ms=float(np.mean(lat_arr)), + intervals=[asdict(s) for s in intervals], + ) + + logger.info( + "Search benchmark complete: %s queries in %.2f s " + "(%.1f QPS, recall@%d=%.4f)", + f"{total_queries:,}", wall_elapsed, self.result.qps, + k, recall, + ) + return self.result + + def save(self, output_dir: str) -> str: + """Save search results to *output_dir*. 
+ + Returns the path to the JSON results file. + """ + os.makedirs(output_dir, exist_ok=True) + path = os.path.join(output_dir, "search_results.json") + with open(path, "w") as f: + json.dump(self.result.to_dict(), f, indent=2, default=str) + logger.info("Search results saved to %s", path) + return path + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + def _compute_interval( + self, + interval_idx: int, + wall_start: float, + interval_start: float, + total_queries: int, + interval_latencies: list[float], + interval_predicted: list[np.ndarray], + interval_truth: list[np.ndarray], + ) -> IntervalStats: + now = time.time() + wall_elapsed = now - wall_start + interval_elapsed = now - interval_start + + lat_arr = np.array(interval_latencies) + pred = np.concatenate(interval_predicted, axis=0) + truth = np.concatenate(interval_truth, axis=0) + recall = _recall_at_k(pred, truth, self.search_k) + iq = len(interval_latencies) + + return IntervalStats( + interval_index=interval_idx, + wall_clock_sec=wall_elapsed, + total_queries=total_queries, + interval_queries=iq, + qps_cumulative=total_queries / wall_elapsed if wall_elapsed > 0 else 0, + qps_interval=iq / interval_elapsed if interval_elapsed > 0 else 0, + recall_at_k=recall, + latency_p50_ms=float(np.percentile(lat_arr, 50)), + latency_p90_ms=float(np.percentile(lat_arr, 90)), + latency_p99_ms=float(np.percentile(lat_arr, 99)), + latency_mean_ms=float(np.mean(lat_arr)), + ) + + @staticmethod + def _log_stats(stats: IntervalStats) -> None: + logger.info( + "[Interval %d] queries=%s cumQPS=%.1f intQPS=%.1f " + "recall@k=%.4f P50=%.2fms P90=%.2fms P99=%.2fms", + stats.interval_index, + f"{stats.total_queries:,}", + stats.qps_cumulative, + stats.qps_interval, + stats.recall_at_k, + stats.latency_p50_ms, + stats.latency_p90_ms, + stats.latency_p99_ms, + )