diff --git a/packages/ducpy/src/ducpy/__init__.py b/packages/ducpy/src/ducpy/__init__.py index bfa1831..8b24000 100644 --- a/packages/ducpy/src/ducpy/__init__.py +++ b/packages/ducpy/src/ducpy/__init__.py @@ -14,5 +14,6 @@ from .enums import * from .parse import (DucData, get_external_file, list_external_files, parse_duc, parse_duc_lazy) +from .search import * from .serialize import DUC_SCHEMA_VERSION, serialize_duc from .utils import * diff --git a/packages/ducpy/src/ducpy/builders/sql_builder.py b/packages/ducpy/src/ducpy/builders/sql_builder.py index e3f3c9f..8697adb 100644 --- a/packages/ducpy/src/ducpy/builders/sql_builder.py +++ b/packages/ducpy/src/ducpy/builders/sql_builder.py @@ -29,6 +29,7 @@ from __future__ import annotations import os +import re import sqlite3 import tempfile from pathlib import Path @@ -37,6 +38,9 @@ __all__ = ["DucSQL"] +_MIGRATION_RE = re.compile(r"^(\d+)_to_(\d+)$") + + def _find_schema_dir() -> Optional[Path]: current = Path(__file__).resolve() for parent in current.parents: @@ -46,6 +50,61 @@ def _find_schema_dir() -> Optional[Path]: return None +def _get_current_schema_version() -> int: + """Read the target user_version from duc.sql — mirrors Rust's CURRENT_VERSION.""" + schema_dir = _find_schema_dir() + if schema_dir is None: + return 0 + duc_sql = (schema_dir / "duc.sql").read_text(encoding="utf-8") + m = re.search(r"PRAGMA\s+user_version\s*=\s*(\d+)", duc_sql, re.IGNORECASE) + return int(m.group(1)) if m else 0 + + +def _read_migrations() -> list[tuple[int, int, str]]: + """Load all migration SQL files from schema/migrations/, sorted by from_version.""" + schema_dir = _find_schema_dir() + if schema_dir is None: + return [] + migrations_dir = schema_dir / "migrations" + if not migrations_dir.exists(): + return [] + result: list[tuple[int, int, str]] = [] + for path in sorted(migrations_dir.glob("*.sql")): + m = _MIGRATION_RE.match(path.stem) + if m: + result.append((int(m.group(1)), int(m.group(2)), path.read_text(encoding="utf-8"))) + result.sort(key=lambda x: x[0]) + return result + + +def _apply_migrations(conn: sqlite3.Connection) -> None: + """Walk the migration chain until user_version reaches the current schema version. + + Mirrors the migration logic in Rust's ``bootstrap.rs``: + reads ``schema/migrations/{from}_to_{to}.sql`` files in order and executes + them until ``PRAGMA user_version`` matches the version declared in ``duc.sql``. + Safe to call on already-current or brand-new databases. + """ + user_version: int = conn.execute("PRAGMA user_version").fetchone()[0] + if user_version == 0: + return # unversioned / brand-new DB — schema applied by DucSQL.new() + current_version = _get_current_schema_version() + if user_version >= current_version: + return # already up to date + migrations = _read_migrations() + current = user_version + while current < current_version: + migration = next(((f, t, sql) for f, t, sql in migrations if f == current), None) + if migration is None: + raise RuntimeError( + f"No migration path from schema version {current} to {current_version}. " + "Upgrade the ducpy package." 
+            )
+        _, to_v, sql = migration
+        conn.executescript(sql)
+        new_version: int = conn.execute("PRAGMA user_version").fetchone()[0]
+        if new_version <= current:
+            # Guard against a stuck chain: a migration script that forgets to
+            # bump PRAGMA user_version would otherwise make this loop forever.
+            raise RuntimeError(
+                f"Migration {current}_to_{to_v} did not advance PRAGMA user_version."
+            )
+        current = new_version
+
+
 def _read_schema_sql() -> str:
     schema_dir = _find_schema_dir()
     if schema_dir is None:
@@ -83,6 +142,7 @@ def __init__(self, path: Union[str, Path]):
         self.conn: sqlite3.Connection = sqlite3.connect(path)
         self.conn.row_factory = sqlite3.Row
         _apply_pragmas(self.conn)
+        _apply_migrations(self.conn)
         self._path: Optional[str] = path
         self._temp: Optional[str] = None
         self._closed = False
@@ -114,6 +174,7 @@ def from_bytes(cls, data: bytes) -> DucSQL:
         inst.conn = sqlite3.connect(tmp.name)
         inst.conn.row_factory = sqlite3.Row
         _apply_pragmas(inst.conn)
+        _apply_migrations(inst.conn)
         inst._path = tmp.name
         inst._temp = tmp.name
         inst._closed = False
diff --git a/packages/ducpy/src/ducpy/search/__init__.py b/packages/ducpy/src/ducpy/search/__init__.py
new file mode 100644
index 0000000..23f20df
--- /dev/null
+++ b/packages/ducpy/src/ducpy/search/__init__.py
@@ -0,0 +1,13 @@
+"""Search helpers for DUC SQLite databases."""
+
+from .search_elements import (DucElementSearchResult, DucFileSearchResult,
+                              DucSearchResponse, DucSearchResult,
+                              search_duc_elements)
+
+__all__ = [
+    "DucElementSearchResult",
+    "DucFileSearchResult",
+    "DucSearchResponse",
+    "DucSearchResult",
+    "search_duc_elements",
+]
\ No newline at end of file
diff --git a/packages/ducpy/src/ducpy/search/search_elements.py b/packages/ducpy/src/ducpy/search/search_elements.py
new file mode 100644
index 0000000..8e56f85
--- /dev/null
+++ b/packages/ducpy/src/ducpy/search/search_elements.py
@@ -0,0 +1,679 @@
+"""Element search helpers for ``.duc`` SQLite databases.
+
+This module uses a hybrid strategy:
+
+1. SQLite FTS5 narrows candidates quickly from the searchable DUC tables.
+2. Python applies a second ranking pass that combines:
+   - FTS rank
+   - exact / prefix / substring behavior
+   - token coverage
+   - string similarity
+
+The JSON output is shaped for downstream consumers that need:
+
+- the original query
+- total raw element hits
+- the ordered list of all matching element ids
+- result rows for individual elements or grouped file-backed elements
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import sqlite3
+import unicodedata
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from pathlib import Path
+from typing import Any
+
+from ..builders.sql_builder import DucSQL
+from ..parse import parse_duc_lazy
+
+__all__ = [
+    "DucElementSearchResult",
+    "DucFileSearchResult",
+    "DucSearchResponse",
+    "DucSearchResult",
+    "search_duc_elements",
+]
+
+_TOKEN_RE = re.compile(r"[\w]+", re.UNICODE)
+_FILE_AGGREGATE_TYPES = {"pdf", "image", "table", "doc"}
+
+
+@dataclass(slots=True)
+class DucElementSearchResult:
+    """One result row for a single element."""
+
+    element_id: str
+    element_type: str
+    matches: list[str]
+    score: float
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "element_id": self.element_id,
+            "element_type": self.element_type,
+            "matches": self.matches,
+            "score": round(self.score, 6),
+        }
+
+
+@dataclass(slots=True)
+class DucFileSearchResult:
+    """One grouped result row for repeated file-backed elements."""
+
+    file_id: str
+    element_type: str
+    matches: list[str]
+    score: float
+    hits: int
+    element_ids: list[str]
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "file_id": self.file_id,
+            "element_type": self.element_type,
+            
"matches": self.matches, + "score": round(self.score, 6), + "hits": self.hits, + "element_ids": self.element_ids, + } + + +DucSearchResult = DucElementSearchResult | DucFileSearchResult + + +@dataclass(slots=True) +class DucSearchResponse: + """Search response and JSON export metadata.""" + + query: str + results: list[DucSearchResult] + total_hits: int + all_element_ids: list[str] + output_path: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert the response to a JSON-friendly dictionary.""" + + payload: dict[str, Any] = { + "query": self.query, + "total_hits": self.total_hits, + "all_element_ids": self.all_element_ids, + "results": [result.to_dict() for result in self.results], + } + return payload + + +@dataclass(slots=True) +class _ElementAggregate: + element_id: str + raw_element_type: str + label: str + description: str | None + match_scores: dict[str, tuple[str, float]] = field(default_factory=dict) + best_score: float = 0.0 + file_id: str | None = None + + def add_match(self, text: str, score: float) -> None: + normalized = _normalize_text(text) + current = self.match_scores.get(normalized) + if current is None or score > current[1]: + self.match_scores[normalized] = (text, score) + if score > self.best_score: + self.best_score = score + + @property + def ordered_matches(self) -> list[str]: + ordered = sorted( + self.match_scores.values(), + key=lambda item: (-item[1], _normalize_text(item[0]), item[0]), + ) + return [text for text, _score in ordered] + +@dataclass(frozen=True, slots=True) +class _SourceQuery: + table_name: str + source_weight: float + sql: str + + +_SOURCE_QUERIES: tuple[_SourceQuery, ...] = ( + _SourceQuery( + table_name="search_elements", + source_weight=1.0, + sql=""" + SELECT + e.id AS element_id, + e.element_type, + e.label, + e.description, + e.label AS candidate_text_1, + e.description AS candidate_text_2, + NULL AS candidate_text_3, + bm25(search_elements, 8.0, 3.0) AS fts_rank, + 'search_elements' AS source_table + FROM search_elements + JOIN elements AS e ON e.rowid = search_elements.rowid + WHERE search_elements MATCH ? + AND e.is_deleted = 0 + ORDER BY bm25(search_elements, 8.0, 3.0) + LIMIT ? + """, + ), + _SourceQuery( + table_name="search_element_text", + source_weight=0.94, + sql=""" + SELECT + e.id AS element_id, + e.element_type, + e.label, + e.description, + et.text AS candidate_text_1, + et.original_text AS candidate_text_2, + NULL AS candidate_text_3, + bm25(search_element_text, 6.0, 2.0) AS fts_rank, + 'search_element_text' AS source_table + FROM search_element_text + JOIN element_text AS et ON et.rowid = search_element_text.rowid + JOIN elements AS e ON e.id = et.element_id + WHERE search_element_text MATCH ? + AND e.is_deleted = 0 + ORDER BY bm25(search_element_text, 6.0, 2.0) + LIMIT ? + """, + ), + _SourceQuery( + table_name="search_element_doc", + source_weight=0.88, + sql=""" + SELECT + e.id AS element_id, + e.element_type, + e.label, + e.description, + ed.text AS candidate_text_1, + NULL AS candidate_text_2, + NULL AS candidate_text_3, + bm25(search_element_doc, 4.0) AS fts_rank, + 'search_element_doc' AS source_table + FROM search_element_doc + JOIN element_doc AS ed ON ed.rowid = search_element_doc.rowid + JOIN elements AS e ON e.id = ed.element_id + WHERE search_element_doc MATCH ? + AND e.is_deleted = 0 + ORDER BY bm25(search_element_doc, 4.0) + LIMIT ? 
+ """, + ), + _SourceQuery( + table_name="search_element_model", + source_weight=0.72, + sql=""" + SELECT + e.id AS element_id, + e.element_type, + e.label, + e.description, + em.code AS candidate_text_1, + NULL AS candidate_text_2, + NULL AS candidate_text_3, + bm25(search_element_model, 2.0) AS fts_rank, + 'search_element_model' AS source_table + FROM search_element_model + JOIN element_model AS em ON em.rowid = search_element_model.rowid + JOIN elements AS e ON e.id = em.element_id + WHERE search_element_model MATCH ? + AND e.is_deleted = 0 + ORDER BY bm25(search_element_model, 2.0) + LIMIT ? + """, + ), +) + + +def _normalize_text(value: str | None) -> str: + if not value: + return "" + normalized = unicodedata.normalize("NFKD", value) + without_marks = "".join(ch for ch in normalized if not unicodedata.combining(ch)) + collapsed = " ".join(without_marks.casefold().split()) + return collapsed + + +def _tokenize(value: str | None) -> list[str]: + return _TOKEN_RE.findall(_normalize_text(value)) + + +def _escape_fts_term(term: str) -> str: + return term.replace('"', '""') + + +def _build_query_variants(query: str) -> list[tuple[str, str, float]]: + tokens = _tokenize(query) + if not tokens: + raise ValueError("The search query must contain at least one searchable token.") + + variants: list[tuple[str, str, float]] = [] + seen: set[str] = set() + + def add_variant(name: str, expression: str, boost: float) -> None: + if expression and expression not in seen: + seen.add(expression) + variants.append((name, expression, boost)) + + if len(tokens) > 1: + phrase = '"' + " ".join(_escape_fts_term(token) for token in tokens) + '"' + add_variant("phrase", phrase, 1.0) + + exact_terms = " AND ".join(f'"{_escape_fts_term(token)}"' for token in tokens) + add_variant("exact_terms", exact_terms, 0.97) + + prefix_terms = " AND ".join( + f'{_escape_fts_term(token)}*' if len(token) >= 2 else f'"{_escape_fts_term(token)}"' + for token in tokens + ) + add_variant("prefix_terms", prefix_terms, 0.9) + + return variants + + +def _token_match_score(query_token: str, candidate_token: str) -> float: + if not query_token or not candidate_token: + return 0.0 + if candidate_token == query_token: + return 1.0 + if candidate_token.startswith(query_token): + return len(query_token) / max(len(candidate_token), 1) + if query_token in candidate_token: + return 0.75 * (len(query_token) / max(len(candidate_token), 1)) + return 0.45 * SequenceMatcher(None, query_token, candidate_token).ratio() + + +def _fts_rank_to_score(fts_rank: float | None) -> float: + if fts_rank is None: + return 0.0 + return 1.0 / (1.0 + abs(float(fts_rank))) + + +def _score_candidate( + *, + text_quality: float, + token_coverage: float, + field_exact: float, + field_prefix: float, + similarity_score: float, + fts_rank: float | None, + source_weight: float, + variant_boost: float, +) -> float: + final_score = ( + 0.28 * text_quality + + 0.20 * token_coverage + + 0.16 * field_exact + + 0.10 * field_prefix + + 0.14 * similarity_score + + 0.07 * _fts_rank_to_score(fts_rank) + + 0.05 * source_weight + ) * variant_boost + return max(0.0, min(final_score, 1.0)) + + +def _evaluate_match_text( + query: str, + raw_text: str | None, + *, + fts_rank: float | None, + source_weight: float, + variant_boost: float, +) -> tuple[float, float]: + if not raw_text: + return 0.0, 0.0 + + query_normalized = _normalize_text(query) + query_tokens = _tokenize(query) + normalized = _normalize_text(raw_text) + if not normalized: + return 0.0, 0.0 + + candidate_tokens = 
_tokenize(raw_text) + if query_tokens and candidate_tokens: + token_scores = [ + max((_token_match_score(query_token, candidate_token) for candidate_token in candidate_tokens), default=0.0) + for query_token in query_tokens + ] + token_coverage = sum(token_scores) / len(token_scores) + else: + token_scores = [] + token_coverage = 0.0 + + field_exact = 1.0 if normalized == query_normalized else 0.0 + field_prefix = ( + len(query_normalized) / len(normalized) + if query_normalized and normalized.startswith(query_normalized) + else 0.0 + ) + similarity_score = SequenceMatcher(None, query_normalized, normalized).ratio() + text_quality = max( + field_exact, + field_prefix, + token_coverage, + 0.7 * similarity_score, + ) + contains_query = bool(query_normalized and query_normalized in normalized) + meaningful_match = ( + field_exact == 1.0 + or field_prefix > 0.0 + or contains_query + or (token_scores and min(token_scores) >= 0.6) + or (similarity_score >= 0.75 and token_coverage >= 0.5) + ) + if not meaningful_match: + return 0.0, similarity_score + + final_score = _score_candidate( + text_quality=text_quality, + token_coverage=token_coverage, + field_exact=field_exact, + field_prefix=field_prefix, + similarity_score=similarity_score, + fts_rank=fts_rank, + source_weight=source_weight, + variant_boost=variant_boost, + ) + return final_score, similarity_score + + +def _collect_candidates( + conn: sqlite3.Connection, + query: str, + *, + limit_per_source: int, +) -> list[_ElementAggregate]: + aggregates: dict[str, _ElementAggregate] = {} + + for _variant_name, expression, variant_boost in _build_query_variants(query): + for source in _SOURCE_QUERIES: + rows = conn.execute(source.sql, (expression, limit_per_source)).fetchall() + for row in rows: + aggregate = aggregates.get(row["element_id"]) + if aggregate is None: + aggregate = _ElementAggregate( + element_id=row["element_id"], + raw_element_type=row["element_type"], + label=row["label"] or "", + description=row["description"], + ) + aggregates[aggregate.element_id] = aggregate + + fts_rank = float(row["fts_rank"]) if row["fts_rank"] is not None else None + for raw_text in (row["candidate_text_1"], row["candidate_text_2"], row["candidate_text_3"]): + score, _similarity = _evaluate_match_text( + query, + raw_text, + fts_rank=fts_rank, + source_weight=source.source_weight, + variant_boost=variant_boost, + ) + if score > 0.0 and raw_text: + aggregate.add_match(raw_text, score) + + results = list(aggregates.values()) + results.sort(key=lambda item: (-item.best_score, item.raw_element_type.casefold(), item.element_id)) + return results + + +def _resolve_file_ids(conn: sqlite3.Connection, element_ids: list[str]) -> dict[str, str]: + if not element_ids: + return {} + + placeholders = ", ".join("?" for _ in element_ids) + bindings: tuple[str, ...] 
= tuple(element_ids) + file_ids: dict[str, str] = {} + + for row in conn.execute( + f"SELECT element_id, file_id FROM document_grid_config WHERE file_id IS NOT NULL AND element_id IN ({placeholders})", + bindings, + ): + file_ids[row["element_id"]] = row["file_id"] + + for row in conn.execute( + f"SELECT element_id, file_id FROM element_image WHERE file_id IS NOT NULL AND element_id IN ({placeholders})", + bindings, + ): + file_ids[row["element_id"]] = row["file_id"] + + for row in conn.execute( + f"SELECT element_id, file_id FROM element_table WHERE file_id IS NOT NULL AND element_id IN ({placeholders})", + bindings, + ): + file_ids[row["element_id"]] = row["file_id"] + + return file_ids + + +def _collect_candidates_from_parsed_duc( + duc_data: dict[str, Any], + query: str, + *, + limit: int, +) -> list[_ElementAggregate]: + elements = duc_data.get("elements", []) or [] + aggregates: dict[str, _ElementAggregate] = {} + field_weights = { + "label": 1.0, + "description": 0.9, + "text": 0.94, + "original_text": 0.88, + "code": 0.72, + } + + for _variant_name, _expression, variant_boost in _build_query_variants(query): + for element in elements: + if element.get("is_deleted"): + continue + + element_id = element.get("id") + element_type = element.get("type") + if not element_id or not element_type: + continue + + aggregate = aggregates.get(element_id) + if aggregate is None: + aggregate = _ElementAggregate( + element_id=element_id, + raw_element_type=element_type, + label=element.get("label") or "", + description=element.get("description"), + ) + aggregates[element_id] = aggregate + + for field_name, source_weight in field_weights.items(): + raw_text = element.get(field_name) + score, _similarity = _evaluate_match_text( + query, + raw_text, + fts_rank=None, + source_weight=source_weight, + variant_boost=variant_boost, + ) + if score > 0.0 and raw_text: + aggregate.add_match(raw_text, score) + + results = [aggregate for aggregate in aggregates.values() if aggregate.best_score > 0.0] + element_lookup = {element.get("id"): element for element in elements} + for aggregate in results: + element = element_lookup.get(aggregate.element_id, {}) + file_id = element.get("file_id") + if file_id is None: + file_ids = element.get("file_ids") or [] + if file_ids: + file_id = file_ids[0] + aggregate.file_id = file_id + + results.sort(key=lambda item: (-item.best_score, item.raw_element_type.casefold(), item.element_id)) + return results[:limit] + + +def _search_non_sqlite_duc( + duc_file: Path, + query: str, + *, + output_path: Path, + limit: int, +) -> DucSearchResponse: + duc_data = parse_duc_lazy(str(duc_file)) + candidates = _collect_candidates_from_parsed_duc(duc_data, query, limit=limit) + all_element_ids, results = _build_result_payloads(candidates) + response = DucSearchResponse( + query=query, + results=results, + total_hits=len(all_element_ids), + all_element_ids=all_element_ids, + output_path=str(output_path), + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + json.dumps(response.to_dict(), indent=2, ensure_ascii=False), + encoding="utf-8", + ) + return response + +def _build_result_payloads(candidates: list[_ElementAggregate]) -> tuple[list[str], list[DucSearchResult]]: + all_element_ids = [candidate.element_id for candidate in candidates] + + file_groups: dict[tuple[str, str], list[_ElementAggregate]] = {} + ungrouped: list[_ElementAggregate] = [] + + for candidate in candidates: + if candidate.raw_element_type in _FILE_AGGREGATE_TYPES and candidate.file_id: 
+ file_groups.setdefault((candidate.raw_element_type, candidate.file_id), []).append(candidate) + else: + ungrouped.append(candidate) + + results: list[DucSearchResult] = [] + + for candidate in ungrouped: + results.append( + DucElementSearchResult( + element_id=candidate.element_id, + element_type=candidate.raw_element_type, + matches=candidate.ordered_matches, + score=candidate.best_score, + ) + ) + + for (raw_type, file_id), group in file_groups.items(): + group.sort(key=lambda item: (-item.best_score, item.element_id)) + if len(group) == 1: + candidate = group[0] + results.append( + DucElementSearchResult( + element_id=candidate.element_id, + element_type=candidate.raw_element_type, + matches=candidate.ordered_matches, + score=candidate.best_score, + ) + ) + continue + + merged_matches: dict[str, tuple[str, float]] = {} + for candidate in group: + for normalized, (text, score) in candidate.match_scores.items(): + current = merged_matches.get(normalized) + if current is None or score > current[1]: + merged_matches[normalized] = (text, score) + + ordered_matches = [ + text + for text, _score in sorted( + merged_matches.values(), + key=lambda item: (-item[1], _normalize_text(item[0]), item[0]), + ) + ] + results.append( + DucFileSearchResult( + file_id=file_id, + element_type=raw_type, + matches=ordered_matches, + score=max(candidate.best_score for candidate in group), + hits=len(group), + element_ids=[candidate.element_id for candidate in group], + ) + ) + + results.sort( + key=lambda item: ( + -item.score, + item.element_type.casefold(), + getattr(item, "element_id", getattr(item, "file_id", "")), + ) + ) + return all_element_ids, results + + +def _default_output_path(duc_path: Path, query: str) -> Path: + slug_tokens = _tokenize(query) + slug = "-".join(slug_tokens[:8]) if slug_tokens else "search" + if not slug: + slug = "search" + return duc_path.with_name(f"{duc_path.stem}.{slug}.search-results.json") + + +def search_duc_elements( + duc_path: str | Path, + query: str, + *, + output_path: str | Path | None = None, + limit: int = 50, +) -> DucSearchResponse: + """Search DUC elements and export ordered results to JSON. + + Args: + duc_path: Path to the ``.duc`` SQLite database. + query: Plain-text search query. + output_path: Optional JSON output path. When omitted, a file is created + next to the ``.duc`` file. + limit: Maximum number of ranked element results to keep. 
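+
+    Example (a minimal sketch; ``drawing.duc`` is a placeholder path)::
+
+        response = search_duc_elements("drawing.duc", "motor", limit=10)
+        print(response.total_hits, response.output_path)
+        for result in response.results:
+            print(result.to_dict())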
+ """ + + duc_file = Path(duc_path) + if not duc_file.exists(): + raise FileNotFoundError(f"DUC file not found: {duc_file}") + if limit <= 0: + raise ValueError("limit must be greater than zero") + + destination = Path(output_path) if output_path else _default_output_path(duc_file, query) + + try: + with DucSQL(duc_file) as db: + candidates = _collect_candidates(db.conn, query, limit_per_source=max(limit * 3, 25))[:limit] + file_id_map = _resolve_file_ids(db.conn, [candidate.element_id for candidate in candidates]) + for candidate in candidates: + candidate.file_id = file_id_map.get(candidate.element_id) + + all_element_ids, results = _build_result_payloads(candidates) + response = DucSearchResponse( + query=query, + results=results, + total_hits=len(all_element_ids), + all_element_ids=all_element_ids, + output_path=str(destination), + ) + destination.parent.mkdir(parents=True, exist_ok=True) + destination.write_text( + json.dumps(response.to_dict(), indent=2, ensure_ascii=False), + encoding="utf-8", + ) + return response + except sqlite3.DatabaseError: + return _search_non_sqlite_duc( + duc_file, + query, + output_path=destination, + limit=limit, + ) \ No newline at end of file diff --git a/packages/ducpy/src/tests/src/test_search_module.py b/packages/ducpy/src/tests/src/test_search_module.py new file mode 100644 index 0000000..e607003 --- /dev/null +++ b/packages/ducpy/src/tests/src/test_search_module.py @@ -0,0 +1,170 @@ +"""Tests for the DUC element search module.""" + +from __future__ import annotations + +import json +import pytest +from collections import Counter +from pathlib import Path + +from ducpy.parse import parse_duc_lazy +from ducpy.search import search_duc_elements + + +def _asset_input_path(filename: str) -> Path: + return Path(__file__).resolve().parents[5] / "assets" / "testing" / "duc-files" / filename + + +def _run_asset_search( + asset_name: str, + query: str, + *, + test_output_dir: str, + test_name: str, + limit: int = 50, +) -> tuple[dict, object, Path, dict]: + asset_path = _asset_input_path(asset_name) + assert asset_path.exists(), f"Missing asset file: {asset_path}" + + json_path = Path(test_output_dir+"/search_results") / f"{test_name}.json" + if json_path.exists(): + json_path.unlink() + + response = search_duc_elements(asset_path, query, output_path=json_path, limit=limit) + payload = json.loads(json_path.read_text(encoding="utf-8")) + parsed_asset = parse_duc_lazy(str(asset_path)) + return payload, response, json_path, parsed_asset + + +def _find_elements(parsed_asset: dict, *, element_type: str | None = None, label_prefix: str | None = None) -> list[dict]: + elements = parsed_asset.get("elements", []) or [] + matches: list[dict] = [] + for element in elements: + if element.get("is_deleted"): + continue + if element_type is not None and element.get("type") != element_type: + continue + label = element.get("label") or "" + if label_prefix is not None and not str(label).startswith(label_prefix): + continue + matches.append(element) + return matches + + +def test_search_exact_rectangle_result_from_blocks_instances_asset(test_output_dir, request): + payload, response, json_path, parsed_asset = _run_asset_search( + "blocks_instances.duc", + "Rectangle 1", + test_output_dir=test_output_dir, + test_name=request.node.name, + ) + exact = next(element for element in parsed_asset["elements"] if element.get("label") == "Rectangle 1") + + assert payload["query"] == "Rectangle 1" + assert payload["total_hits"] >= 1 + assert payload["results"][0]["element_id"] == 
exact["id"] + assert payload["results"][0]["element_type"] == "rectangle" + assert payload["results"][0]["matches"][0] == "Rectangle 1" + assert response.output_path == str(json_path) + + +def test_search_groups_repeated_pdf_file_results(test_output_dir, request): + pdf_elements = [element for element in parse_duc_lazy(str(_asset_input_path("universal.duc")))["elements"] if element.get("type") == "pdf" and element.get("file_id") and not element.get("is_deleted")] + duplicated_file_id = next( + file_id for file_id, count in Counter(element["file_id"] for element in pdf_elements).items() if count > 1 + ) + selected = [element for element in pdf_elements if element.get("file_id") == duplicated_file_id] + assert len(selected) == 2 + + # Use the label of the first selected element as the query + query = selected[0]["label"] + payload, response, _json_path, parsed_asset = _run_asset_search( + "universal.duc", + query, + test_output_dir=test_output_dir, + test_name=request.node.name, + limit=2, + ) + + assert payload["query"] == query + assert payload["total_hits"] == 2 + assert set(payload["all_element_ids"]) == {element["id"] for element in selected} + assert len(payload["results"]) == 1 + + grouped = payload["results"][0] + assert grouped["file_id"] == duplicated_file_id + assert grouped["element_type"] == "pdf" + assert grouped["hits"] == 2 + assert set(grouped["element_ids"]) == {element["id"] for element in selected} + assert grouped["matches"][0] == query + assert set(grouped["matches"]) == {element["label"] for element in selected} + assert grouped["score"] == round(max(result.score for result in response.results), 6) + + +def test_search_prefix_query_rec_returns_multiple_rectangle_matches(test_output_dir, request): + payload, response, _json_path, parsed_asset = _run_asset_search( + "blocks_instances.duc", + "rec", + test_output_dir=test_output_dir, + test_name=request.node.name, + ) + rectangle_count = len(_find_elements(parsed_asset, element_type="rectangle", label_prefix="Rectangle")) + assert rectangle_count >= 4 + + assert payload["query"] == "rec" + assert payload["total_hits"] >= 4 + assert len(payload["results"]) >= 4 + assert response.results[0].score >= response.results[-1].score + assert payload["results"][0]["element_type"] == "rectangle" + assert "rec" in payload["results"][0]["matches"][0].lower() + assert sum(result["element_type"] == "rectangle" for result in payload["results"]) >= 4 + assert all(any("rec" in match.lower() for match in result["matches"]) for result in payload["results"][:4]) + + +def test_search_text_query_linear_finds_text_content(test_output_dir, request): + payload, _response, _json_path, parsed_asset = _run_asset_search( + "universal.duc", + "Linear", + test_output_dir=test_output_dir, + test_name=request.node.name, + ) + expected = next(element for element in parsed_asset["elements"] if element.get("type") == "text" and element.get("text") == "Linear") + + assert payload["query"] == "Linear" + assert payload["total_hits"] >= 1 + assert payload["results"][0]["element_id"] == expected["id"] + assert payload["results"][0]["element_type"] == "text" + assert any("linear" in match.lower() for match in payload["results"][0]["matches"]) + + +def test_search_gibberish_query_returns_empty_results(test_output_dir, request): + payload, _response, json_path, parsed_asset = _run_asset_search( + "universal.duc", + "siauhfbohasbjflasvl", + test_output_dir=test_output_dir, + test_name=request.node.name, + ) + assert len(parsed_asset.get("elements", [])) > 0 + + 
assert payload["query"] == "siauhfbohasbjflasvl" + assert payload["total_hits"] == 0 + assert payload["all_element_ids"] == [] + assert payload["results"] == [] + assert json.loads(json_path.read_text(encoding="utf-8")) == payload +def test_search_empty_query_raises_value_error(test_output_dir): + asset_path = _asset_input_path("universal.duc") + with pytest.raises(ValueError, match="The search query must contain at least one searchable token."): + search_duc_elements(asset_path, "", limit=5) + +def test_search_respects_limit_parameter(test_output_dir, request): + payload, response, _json_path, parsed_asset = _run_asset_search( + "blocks_instances.duc", + "Rectangle", + test_output_dir=test_output_dir, + test_name=request.node.name, + limit=2, + ) + assert payload["query"] == "Rectangle" + assert payload["total_hits"] >= 2 + assert len(payload["results"]) == 2 + diff --git a/packages/ducrs/src/api/version_control.rs b/packages/ducrs/src/api/version_control.rs index 4e7633b..202dee4 100644 --- a/packages/ducrs/src/api/version_control.rs +++ b/packages/ducrs/src/api/version_control.rs @@ -4,7 +4,6 @@ //! - Restoring document state at any version (checkpoint or delta replay) //! - Creating new checkpoints and deltas //! - Listing version history -//! - Pruning old versions //! //! All operations work directly against the embedded SQLite schema //! (`version_control.sql`) and produce/consume the canonical Rust types diff --git a/schema/duc.sql b/schema/duc.sql index f96264e..cf555a9 100644 --- a/schema/duc.sql +++ b/schema/duc.sql @@ -1,7 +1,7 @@ -- "DUC_" in ASCII -- Apply in order: duc.sql → version_control.sql → search.sql PRAGMA application_id = 1146569567; -PRAGMA user_version = 3000002; +PRAGMA user_version = 3000003; PRAGMA journal_mode = WAL; PRAGMA foreign_keys = ON; PRAGMA synchronous = NORMAL; diff --git a/schema/migrations/3000002_to_3000003.sql b/schema/migrations/3000002_to_3000003.sql new file mode 100644 index 0000000..0bf5225 --- /dev/null +++ b/schema/migrations/3000002_to_3000003.sql @@ -0,0 +1,144 @@ +-- Migration: 3000002 → 3000003 +-- Rebuild FTS5 search tables with improved tokenizer and prefix support. +-- This migration is necessary because FTS5 virtual tables cannot be altered +-- to change tokenizer or prefix options - they must be dropped and recreated. + +-- 1. Drop old FTS tables (triggers are automatically dropped) +DROP TABLE IF EXISTS search_elements; +DROP TABLE IF EXISTS search_element_text; +DROP TABLE IF EXISTS search_element_doc; +DROP TABLE IF EXISTS search_element_model; +DROP TABLE IF EXISTS search_blocks; + +-- 2. Recreate FTS tables with new configuration +-- FTS over element labels and descriptions. 
+CREATE VIRTUAL TABLE search_elements USING fts5( + label, + description, + content='elements', + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' +); + +CREATE TRIGGER trg_elements_ai AFTER INSERT ON elements BEGIN + INSERT INTO search_elements(rowid, label, description) + VALUES (NEW.rowid, NEW.label, NEW.description); +END; +CREATE TRIGGER trg_elements_ad AFTER DELETE ON elements BEGIN + INSERT INTO search_elements(search_elements, rowid, label, description) + VALUES ('delete', OLD.rowid, OLD.label, OLD.description); +END; +CREATE TRIGGER trg_elements_au AFTER UPDATE OF label, description ON elements BEGIN + INSERT INTO search_elements(search_elements, rowid, label, description) + VALUES ('delete', OLD.rowid, OLD.label, OLD.description); + INSERT INTO search_elements(rowid, label, description) + VALUES (NEW.rowid, NEW.label, NEW.description); +END; + +-- FTS over text element content. +CREATE VIRTUAL TABLE search_element_text USING fts5( + text, + original_text, + content='element_text', + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' +); + +CREATE TRIGGER trg_element_text_ai AFTER INSERT ON element_text BEGIN + INSERT INTO search_element_text(rowid, text, original_text) + VALUES (NEW.rowid, NEW.text, NEW.original_text); +END; +CREATE TRIGGER trg_element_text_ad AFTER DELETE ON element_text BEGIN + INSERT INTO search_element_text(search_element_text, rowid, text, original_text) + VALUES ('delete', OLD.rowid, OLD.text, OLD.original_text); +END; +CREATE TRIGGER trg_element_text_au AFTER UPDATE OF text, original_text ON element_text BEGIN + INSERT INTO search_element_text(search_element_text, rowid, text, original_text) + VALUES ('delete', OLD.rowid, OLD.text, OLD.original_text); + INSERT INTO search_element_text(rowid, text, original_text) + VALUES (NEW.rowid, NEW.text, NEW.original_text); +END; + +-- FTS over doc element content. +CREATE VIRTUAL TABLE search_element_doc USING fts5( + text, + content='element_doc', + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' +); + +CREATE TRIGGER trg_element_doc_ai AFTER INSERT ON element_doc BEGIN + INSERT INTO search_element_doc(rowid, text) + VALUES (NEW.rowid, NEW.text); +END; +CREATE TRIGGER trg_element_doc_ad AFTER DELETE ON element_doc BEGIN + INSERT INTO search_element_doc(search_element_doc, rowid, text) + VALUES ('delete', OLD.rowid, OLD.text); +END; +CREATE TRIGGER trg_element_doc_au AFTER UPDATE OF text ON element_doc BEGIN + INSERT INTO search_element_doc(search_element_doc, rowid, text) + VALUES ('delete', OLD.rowid, OLD.text); + INSERT INTO search_element_doc(rowid, text) + VALUES (NEW.rowid, NEW.text); +END; + +-- FTS over model element source code. 
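+-- Caveat: unicode61 treats '_' as a separator, so snake_case identifiers in
+-- code are indexed as separate tokens; if whole-identifier matching is ever
+-- needed, extend the tokenize= option below with a tokenchars clause.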
+CREATE VIRTUAL TABLE search_element_model USING fts5( + code, + content='element_model', + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' +); + +CREATE TRIGGER trg_element_model_ai AFTER INSERT ON element_model BEGIN + INSERT INTO search_element_model(rowid, code) + VALUES (NEW.rowid, NEW.code); +END; +CREATE TRIGGER trg_element_model_ad AFTER DELETE ON element_model BEGIN + INSERT INTO search_element_model(search_element_model, rowid, code) + VALUES ('delete', OLD.rowid, OLD.code); +END; +CREATE TRIGGER trg_element_model_au AFTER UPDATE OF code ON element_model BEGIN + INSERT INTO search_element_model(search_element_model, rowid, code) + VALUES ('delete', OLD.rowid, OLD.code); + INSERT INTO search_element_model(rowid, code) + VALUES (NEW.rowid, NEW.code); +END; + +-- FTS over block labels and descriptions. +CREATE VIRTUAL TABLE search_blocks USING fts5( + label, + description, + content='blocks', + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' +); + +CREATE TRIGGER trg_blocks_ai AFTER INSERT ON blocks BEGIN + INSERT INTO search_blocks(rowid, label, description) + VALUES (NEW.rowid, NEW.label, NEW.description); +END; +CREATE TRIGGER trg_blocks_ad AFTER DELETE ON blocks BEGIN + INSERT INTO search_blocks(search_blocks, rowid, label, description) + VALUES ('delete', OLD.rowid, OLD.label, OLD.description); +END; +CREATE TRIGGER trg_blocks_au AFTER UPDATE OF label, description ON blocks BEGIN + INSERT INTO search_blocks(search_blocks, rowid, label, description) + VALUES ('delete', OLD.rowid, OLD.label, OLD.description); + INSERT INTO search_blocks(rowid, label, description) + VALUES (NEW.rowid, NEW.label, NEW.description); +END; + +-- 3. Rebuild FTS indexes from existing data +INSERT INTO search_elements(search_elements) VALUES ('rebuild'); +INSERT INTO search_element_text(search_element_text) VALUES ('rebuild'); +INSERT INTO search_element_doc(search_element_doc) VALUES ('rebuild'); +INSERT INTO search_element_model(search_element_model) VALUES ('rebuild'); +INSERT INTO search_blocks(search_blocks) VALUES ('rebuild'); + +PRAGMA user_version = 3000003; diff --git a/schema/search.sql b/schema/search.sql index e943e91..f2b4adc 100644 --- a/schema/search.sql +++ b/schema/search.sql @@ -10,24 +10,32 @@ -- Query examples: -- SELECT rowid, rank FROM search_elements WHERE search_elements MATCH 'motor'; -- SELECT rowid, rank FROM search_element_text WHERE search_element_text MATCH 'dimension'; +-- +-- Notes: +-- - `unicode61 remove_diacritics 2` improves case/diacritic-insensitive matching. +-- - `prefix=` speeds up prefix lookups such as `for*`. +-- - `rebuild` statements at the end backfill indexes for databases that already +-- contain rows before this schema is applied. -- FTS over element labels and descriptions. 
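+-- Example: prefix queries are served directly by the prefix= indexes below:
+--   SELECT rowid, rank FROM search_elements WHERE search_elements MATCH 'rec*';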
-CREATE VIRTUAL TABLE search_elements USING fts5( +CREATE VIRTUAL TABLE IF NOT EXISTS search_elements USING fts5( label, description, content='elements', - content_rowid='rowid' + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' ); -CREATE TRIGGER trg_elements_ai AFTER INSERT ON elements BEGIN +CREATE TRIGGER IF NOT EXISTS trg_elements_ai AFTER INSERT ON elements BEGIN INSERT INTO search_elements(rowid, label, description) VALUES (NEW.rowid, NEW.label, NEW.description); END; -CREATE TRIGGER trg_elements_ad AFTER DELETE ON elements BEGIN +CREATE TRIGGER IF NOT EXISTS trg_elements_ad AFTER DELETE ON elements BEGIN INSERT INTO search_elements(search_elements, rowid, label, description) VALUES ('delete', OLD.rowid, OLD.label, OLD.description); END; -CREATE TRIGGER trg_elements_au AFTER UPDATE OF label, description ON elements BEGIN +CREATE TRIGGER IF NOT EXISTS trg_elements_au AFTER UPDATE OF label, description ON elements BEGIN INSERT INTO search_elements(search_elements, rowid, label, description) VALUES ('delete', OLD.rowid, OLD.label, OLD.description); INSERT INTO search_elements(rowid, label, description) @@ -35,22 +43,24 @@ CREATE TRIGGER trg_elements_au AFTER UPDATE OF label, description ON elements BE END; -- FTS over text element content. -CREATE VIRTUAL TABLE search_element_text USING fts5( +CREATE VIRTUAL TABLE IF NOT EXISTS search_element_text USING fts5( text, original_text, content='element_text', - content_rowid='rowid' + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' ); -CREATE TRIGGER trg_element_text_ai AFTER INSERT ON element_text BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_text_ai AFTER INSERT ON element_text BEGIN INSERT INTO search_element_text(rowid, text, original_text) VALUES (NEW.rowid, NEW.text, NEW.original_text); END; -CREATE TRIGGER trg_element_text_ad AFTER DELETE ON element_text BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_text_ad AFTER DELETE ON element_text BEGIN INSERT INTO search_element_text(search_element_text, rowid, text, original_text) VALUES ('delete', OLD.rowid, OLD.text, OLD.original_text); END; -CREATE TRIGGER trg_element_text_au AFTER UPDATE OF text, original_text ON element_text BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_text_au AFTER UPDATE OF text, original_text ON element_text BEGIN INSERT INTO search_element_text(search_element_text, rowid, text, original_text) VALUES ('delete', OLD.rowid, OLD.text, OLD.original_text); INSERT INTO search_element_text(rowid, text, original_text) @@ -58,21 +68,23 @@ CREATE TRIGGER trg_element_text_au AFTER UPDATE OF text, original_text ON elemen END; -- FTS over doc element content. 
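+-- Note: external-content tables cannot look up deleted rows themselves, so
+-- the delete/update triggers pass the OLD column values together with the
+-- special 'delete' command, as in the triggers below.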
-CREATE VIRTUAL TABLE search_element_doc USING fts5( +CREATE VIRTUAL TABLE IF NOT EXISTS search_element_doc USING fts5( text, content='element_doc', - content_rowid='rowid' + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' ); -CREATE TRIGGER trg_element_doc_ai AFTER INSERT ON element_doc BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_doc_ai AFTER INSERT ON element_doc BEGIN INSERT INTO search_element_doc(rowid, text) VALUES (NEW.rowid, NEW.text); END; -CREATE TRIGGER trg_element_doc_ad AFTER DELETE ON element_doc BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_doc_ad AFTER DELETE ON element_doc BEGIN INSERT INTO search_element_doc(search_element_doc, rowid, text) VALUES ('delete', OLD.rowid, OLD.text); END; -CREATE TRIGGER trg_element_doc_au AFTER UPDATE OF text ON element_doc BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_doc_au AFTER UPDATE OF text ON element_doc BEGIN INSERT INTO search_element_doc(search_element_doc, rowid, text) VALUES ('delete', OLD.rowid, OLD.text); INSERT INTO search_element_doc(rowid, text) @@ -80,21 +92,23 @@ CREATE TRIGGER trg_element_doc_au AFTER UPDATE OF text ON element_doc BEGIN END; -- FTS over model element source code. -CREATE VIRTUAL TABLE search_element_model USING fts5( +CREATE VIRTUAL TABLE IF NOT EXISTS search_element_model USING fts5( code, content='element_model', - content_rowid='rowid' + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' ); -CREATE TRIGGER trg_element_model_ai AFTER INSERT ON element_model BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_model_ai AFTER INSERT ON element_model BEGIN INSERT INTO search_element_model(rowid, code) VALUES (NEW.rowid, NEW.code); END; -CREATE TRIGGER trg_element_model_ad AFTER DELETE ON element_model BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_model_ad AFTER DELETE ON element_model BEGIN INSERT INTO search_element_model(search_element_model, rowid, code) VALUES ('delete', OLD.rowid, OLD.code); END; -CREATE TRIGGER trg_element_model_au AFTER UPDATE OF code ON element_model BEGIN +CREATE TRIGGER IF NOT EXISTS trg_element_model_au AFTER UPDATE OF code ON element_model BEGIN INSERT INTO search_element_model(search_element_model, rowid, code) VALUES ('delete', OLD.rowid, OLD.code); INSERT INTO search_element_model(rowid, code) @@ -102,24 +116,33 @@ CREATE TRIGGER trg_element_model_au AFTER UPDATE OF code ON element_model BEGIN END; -- FTS over block labels and descriptions. 
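+-- search_blocks serves block-library search; the ducpy search helper in
+-- packages/ducpy/src/ducpy/search currently queries only the element-level
+-- FTS tables above.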
-CREATE VIRTUAL TABLE search_blocks USING fts5( +CREATE VIRTUAL TABLE IF NOT EXISTS search_blocks USING fts5( label, description, content='blocks', - content_rowid='rowid' + content_rowid='rowid', + tokenize='unicode61 remove_diacritics 2', + prefix='2 3 4 5 6 7 8 9 10' ); -CREATE TRIGGER trg_blocks_ai AFTER INSERT ON blocks BEGIN +CREATE TRIGGER IF NOT EXISTS trg_blocks_ai AFTER INSERT ON blocks BEGIN INSERT INTO search_blocks(rowid, label, description) VALUES (NEW.rowid, NEW.label, NEW.description); END; -CREATE TRIGGER trg_blocks_ad AFTER DELETE ON blocks BEGIN +CREATE TRIGGER IF NOT EXISTS trg_blocks_ad AFTER DELETE ON blocks BEGIN INSERT INTO search_blocks(search_blocks, rowid, label, description) VALUES ('delete', OLD.rowid, OLD.label, OLD.description); END; -CREATE TRIGGER trg_blocks_au AFTER UPDATE OF label, description ON blocks BEGIN +CREATE TRIGGER IF NOT EXISTS trg_blocks_au AFTER UPDATE OF label, description ON blocks BEGIN INSERT INTO search_blocks(search_blocks, rowid, label, description) VALUES ('delete', OLD.rowid, OLD.label, OLD.description); INSERT INTO search_blocks(rowid, label, description) VALUES (NEW.rowid, NEW.label, NEW.description); END; + +-- Backfill FTS indexes for databases that already contain data. +INSERT INTO search_elements(search_elements) VALUES ('rebuild'); +INSERT INTO search_element_text(search_element_text) VALUES ('rebuild'); +INSERT INTO search_element_doc(search_element_doc) VALUES ('rebuild'); +INSERT INTO search_element_model(search_element_model) VALUES ('rebuild'); +INSERT INTO search_blocks(search_blocks) VALUES ('rebuild');
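
Example usage of the new search API (a minimal sketch; the `.duc` path and query are placeholders, the API itself is as defined in this diff):

```python
from ducpy.search import search_duc_elements

# Runs FTS5 candidate retrieval plus the Python ranking pass against the
# .duc SQLite database, and also writes the ordered results to a JSON file
# (next to the .duc file by default, or at output_path when given).
response = search_duc_elements(
    "drawing.duc",               # placeholder path to a .duc file
    "Rectangle 1",               # plain-text query
    output_path="results.json",  # optional explicit JSON destination
    limit=10,
)

print(response.total_hits)       # total raw element hits
print(response.all_element_ids)  # ordered ids of every matching element
for result in response.results:
    # Single elements arrive as DucElementSearchResult rows; repeated
    # file-backed elements (pdf/image/table/doc) are grouped into
    # DucFileSearchResult rows carrying file_id, hits, and element_ids.
    print(result.to_dict())
```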