From 2c19a19ae63558d172504b03ae8ba31f3ea7459f Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 22:59:41 +0300 Subject: [PATCH 1/9] fix(tests): green suite on Windows, hermetic auto-update, coverage over the gate - test_cold_run_installs_from_requirements_lock compared the requirements.lock path as a raw string; bash writes '/' joins while pathlib renders '\' on Windows. Extract the -r argument from the pip log and compare as Path. - New tests/test_skill_update.py: update/backup/rollback/auto-update paths (skill_update.py 42% -> 96%). - Cover the multi-intent merge branch in detect_intent and the vec_cache / vec_meta / orphan-pruning family in storage/repo.py (70% -> 92%) using plain stand-in tables so the optional sqlite-vec extension is not required. - CBX_NO_SKILL_AUTO_UPDATE=1 guard in _try_auto_update_skills + conftest: the suite (and any CLI run inside the checkout) used to rewrite the committed .skill_version stamps whenever installed package metadata was stale. Total coverage 78.13% -> 80.64%; the 80% gate passes again. Co-Authored-By: Claude Fable 5 --- src/codebase_index/cli.py | 2 + tests/conftest.py | 6 ++ tests/test_bootstrap.py | 10 ++- tests/test_intent.py | 14 ++++ tests/test_skill_update.py | 136 +++++++++++++++++++++++++++++++++++++ tests/test_storage.py | 87 ++++++++++++++++++++++++ 6 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 tests/test_skill_update.py diff --git a/src/codebase_index/cli.py b/src/codebase_index/cli.py index 4126fcf..d148d28 100644 --- a/src/codebase_index/cli.py +++ b/src/codebase_index/cli.py @@ -179,6 +179,8 @@ def _resolve_init_targets(root: Path, requested: str | None) -> tuple[list[str], def _try_auto_update_skills(root_opt: Optional[Path]) -> None: """Silently update all installed skills when the package version changed.""" + if os.environ.get("CBX_NO_SKILL_AUTO_UPDATE") == "1": + return try: from .config import find_root from . 
import scaffold diff --git a/tests/conftest.py b/tests/conftest.py index 0283719..5b2ef0e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os import sqlite3 from pathlib import Path @@ -7,6 +8,11 @@ from codebase_index.storage.db import Database +# Tests invoke the CLI from inside this checkout; without the guard, the skill +# version-stamp auto-update rewrites the committed copies under .claude/.codex/ +# .opencode whenever the installed package metadata is stale. +os.environ.setdefault("CBX_NO_SKILL_AUTO_UPDATE", "1") + FIXTURE_ROOT = Path(__file__).parent / "fixtures" / "sample_repo" diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py index 8920e1e..efbd4b6 100644 --- a/tests/test_bootstrap.py +++ b/tests/test_bootstrap.py @@ -105,7 +105,15 @@ def test_cold_run_installs_from_requirements_lock(tmp_path): res = _run(root, env) assert res.returncode == 0, res.stderr pip_log = Path(env["FAKE_PIP_LOG"]).read_text(encoding="utf-8") - assert f"-r {root / 'requirements.lock'}" in pip_log + # Compare as paths, not raw strings: bootstrap.sh joins with "/", while + # pathlib on Windows renders "\", so the bytes differ but the path is equal. 
+ lock_args = [ + line.split(" install -r ", 1)[1] + for line in pip_log.splitlines() + if " install -r " in line + ] + assert lock_args, pip_log + assert any(Path(arg) == root / "requirements.lock" for arg in lock_args) @pytest.mark.skipif(not BASH_OK, reason="bash not available or non-functional") diff --git a/tests/test_intent.py b/tests/test_intent.py index 3d7a125..c76a7ff 100644 --- a/tests/test_intent.py +++ b/tests/test_intent.py @@ -44,3 +44,17 @@ def test_semantic_intents_have_vector_weight(): def test_locate_impl_still_favors_symbol_over_vector(): plan = detect_intent("where is refresh_access_token implemented") assert plan.weight("symbol") > plan.weight("vector") + + +def test_multiple_matched_intents_merge(): + # Matches both ARCHITECTURE ("architecture") and HOW_IT_WORKS ("how does"). + plan = detect_intent("how does the architecture work") + # Primary intent comes from the first matched rule (ARCHITECTURE precedes + # HOW_IT_WORKS in the rule list), as do graph_strategy and summaries_first. + assert plan.intent is Intent.ARCHITECTURE + assert plan.graph_strategy == "none" + assert plan.summaries_first is True + # Weights take the max per retriever; budget takes the max across plans. 
+ assert plan.weight("fts") == 1.0 # max(0.6 architecture, 1.0 how_it_works) + assert plan.weight("vector") == 0.8 # max(0.5, 0.8) + assert plan.token_budget == 2500 # max(2500, 2200) diff --git a/tests/test_skill_update.py b/tests/test_skill_update.py new file mode 100644 index 0000000..e7b120c --- /dev/null +++ b/tests/test_skill_update.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_index import scaffold, skill_update + + +def _install(root: Path, target: str = "claude") -> Path: + scaffold.materialize_skill(root, force=False, target=target) + return root / scaffold.skill_rel_for_target(target) + + +def test_fresh_install_needs_no_update(tmp_path): + skill_dir = _install(tmp_path) + assert (skill_dir / skill_update.VERSION_FILE).is_file() + assert skill_update.needs_update(skill_dir) is False + + +def test_needs_update_when_stamp_differs_or_missing(tmp_path): + skill_dir = _install(tmp_path) + (skill_dir / skill_update.VERSION_FILE).write_text("0.0.1\n", encoding="utf-8") + assert skill_update.needs_update(skill_dir) is True + (skill_dir / skill_update.VERSION_FILE).unlink() + assert skill_update.needs_update(skill_dir) is True + + +def test_package_version_unknown_when_metadata_missing(monkeypatch): + def boom(_name): + raise RuntimeError("no metadata") + + monkeypatch.setattr("importlib.metadata.version", boom) + assert skill_update._package_version() == "unknown" + + +def test_update_skill_refreshes_stamps_and_backs_up(tmp_path): + skill_dir = _install(tmp_path) + template_text = (skill_dir / "SKILL.md").read_text(encoding="utf-8") + (skill_dir / "SKILL.md").write_text("locally edited\n", encoding="utf-8") + (skill_dir / skill_update.VERSION_FILE).write_text("0.0.1\n", encoding="utf-8") + + result = skill_update.update_skill(tmp_path, "claude") + + assert result["updated"] is True + assert result["backed_up"] is True + assert result["old_version"] == "0.0.1" + assert result["new_version"] 
== skill_update._package_version() + # Skill content is re-materialized from the template... + assert (skill_dir / "SKILL.md").read_text(encoding="utf-8") == template_text + assert skill_update._installed_version(skill_dir) == result["new_version"] + # ...and the pre-update state is preserved in the cache backup. + bak = skill_update._backup_dir(tmp_path, "claude") + assert (bak / "SKILL.md").read_text(encoding="utf-8") == "locally edited\n" + + +def test_update_skill_without_backup(tmp_path): + _install(tmp_path) + result = skill_update.update_skill(tmp_path, "claude", backup=False) + assert result["backed_up"] is False + assert not skill_update._backup_dir(tmp_path, "claude").exists() + + +def test_update_skill_on_missing_dir_writes_no_backup(tmp_path): + result = skill_update.update_skill(tmp_path, "claude") + assert result["updated"] is True + assert result["backed_up"] is False + assert result["old_version"] == "" + + +def test_rollback_restores_previous_skill(tmp_path): + skill_dir = _install(tmp_path) + (skill_dir / "SKILL.md").write_text("pre-update state\n", encoding="utf-8") + skill_update.update_skill(tmp_path, "claude") + + result = skill_update.rollback_skill(tmp_path, "claude") + + assert result == {"target": "claude", "rolled_back": True} + assert (skill_dir / "SKILL.md").read_text(encoding="utf-8") == "pre-update state\n" + + +def test_rollback_without_backup_reports_reason(tmp_path): + result = skill_update.rollback_skill(tmp_path, "claude") + assert result["rolled_back"] is False + assert result["reason"] == "no backup found" + + +def test_auto_update_skips_missing_skill_dir(tmp_path): + assert skill_update.auto_update_if_needed(tmp_path, "claude") is False + + +def test_auto_update_skips_when_current(tmp_path): + _install(tmp_path) + assert skill_update.auto_update_if_needed(tmp_path, "claude") is False + + +def test_auto_update_applies_when_outdated(tmp_path): + skill_dir = _install(tmp_path) + (skill_dir / 
skill_update.VERSION_FILE).write_text("0.0.1\n", encoding="utf-8") + assert skill_update.auto_update_if_needed(tmp_path, "claude") is True + assert skill_update.needs_update(skill_dir) is False + + +def test_auto_update_swallows_failures(tmp_path, monkeypatch): + skill_dir = _install(tmp_path) + (skill_dir / skill_update.VERSION_FILE).write_text("0.0.1\n", encoding="utf-8") + + def boom(*args, **kwargs): + raise RuntimeError("materialize failed") + + monkeypatch.setattr(scaffold, "materialize_skill", boom) + assert skill_update.auto_update_if_needed(tmp_path, "claude") is False + + +def test_cli_auto_update_respects_disable_env(tmp_path, monkeypatch): + from codebase_index import cli + + skill_dir = _install(tmp_path) + (skill_dir / skill_update.VERSION_FILE).write_text("0.0.1\n", encoding="utf-8") + + monkeypatch.setenv("CBX_NO_SKILL_AUTO_UPDATE", "1") + cli._try_auto_update_skills(tmp_path) + assert skill_update._installed_version(skill_dir) == "0.0.1" # untouched + + monkeypatch.delenv("CBX_NO_SKILL_AUTO_UPDATE") + cli._try_auto_update_skills(tmp_path) + assert skill_update.needs_update(skill_dir) is False + + +@pytest.mark.parametrize("target", ["claude", "codex", "opencode"]) +def test_update_skill_supports_all_targets(tmp_path, target): + result = skill_update.update_skill(tmp_path, target) + assert result["updated"] is True + skill_dir = tmp_path / scaffold.skill_rel_for_target(target) + assert (skill_dir / "SKILL.md").is_file() + assert skill_update.needs_update(skill_dir) is False diff --git a/tests/test_storage.py b/tests/test_storage.py index d69e7fa..92d1a8c 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -398,3 +398,90 @@ def test_fingerprints_returns_mtime_size_and_sha(tmp_path): "src/b.py": (222, 20, "bbb"), } db.close() + + +def _create_vec_tables(conn): + # Plain stand-ins matching the vec0 layout: every function under test runs + # ordinary SQL against these tables, so the optional sqlite-vec extension + # is not needed to 
exercise the cache/meta/orphan logic. + conn.execute("CREATE TABLE vec_chunks (chunk_id INTEGER PRIMARY KEY, embedding BLOB)") + conn.execute("CREATE TABLE vec_meta (model TEXT, dim INTEGER, built_at TEXT)") + conn.execute( + "CREATE TABLE vec_cache (model TEXT NOT NULL, content_sha TEXT NOT NULL, " + "embedding BLOB NOT NULL, PRIMARY KEY (model, content_sha))" + ) + + +def test_vec_meta_roundtrip_replaces_previous_row(tmp_path): + db = _open(tmp_path) + _create_vec_tables(db.conn) + assert repo.get_vec_meta(db.conn) is None + repo.set_vec_meta(db.conn, model="m1", dim=4, built_at="2026-06-09T00:00:00Z") + repo.set_vec_meta(db.conn, model="m2", dim=8, built_at="2026-06-09T01:00:00Z") + row = repo.get_vec_meta(db.conn) + assert (row["model"], row["dim"]) == ("m2", 8) + db.close() + + +def test_embedding_cache_roundtrip_dedup_and_batching(tmp_path): + db = _open(tmp_path) + _create_vec_tables(db.conn) + assert repo.cached_embeddings(db.conn, model="m", shas=[]) == {} + items = [(f"sha{i}", f"blob{i}".encode()) for i in range(600)] + repo.store_cached_embeddings(db.conn, model="m", items=items) + repo.store_cached_embeddings(db.conn, model="m", items=[]) # no-op + # 600 shas exercise the >500 IN-list chunking; duplicates collapse to one. + shas = [sha for sha, _ in items] + ["sha0", "missing"] + out = repo.cached_embeddings(db.conn, model="m", shas=shas) + assert len(out) == 600 + assert out["sha0"] == b"blob0" + # The cache is keyed by model: another model sees nothing. 
+ assert repo.cached_embeddings(db.conn, model="other", shas=["sha0"]) == {} + db.close() + + +def test_vector_blob_upsert_count_and_clear(tmp_path): + db = _open(tmp_path) + _create_vec_tables(db.conn) + repo.upsert_chunk_vector_blob(db.conn, 1, b"v1") + repo.upsert_chunk_vector_blob(db.conn, 1, b"v2") # replaces, no duplicate + repo.upsert_chunk_vector_blob(db.conn, 2, b"v3") + assert repo.count_vectors(db.conn) == 2 + assert repo.embedded_chunk_ids(db.conn) == {1, 2} + repo.clear_vectors(db.conn) + assert repo.count_vectors(db.conn) == 0 + db.close() + + +def test_vector_helpers_tolerate_missing_vec_tables(tmp_path): + db = _open(tmp_path) + assert repo.embedded_chunk_ids(db.conn) == set() + assert repo.prune_orphan_vectors(db.conn) == 0 + db.close() + + +def test_chunks_for_embedding_and_prune_orphans(tmp_path): + db = _open(tmp_path) + _create_vec_tables(db.conn) + fid = repo.upsert_file( + db.conn, path="src/a.py", lang="python", size_bytes=1, sha256="a", + mtime_ns=111, git_status=None, parser="line", indexed_at="t", is_generated=False, + ) + repo.replace_chunks( + db.conn, + fid, + [ + Chunk(line_start=1, line_end=5, content="def a(): ...", token_est=5), + Chunk(line_start=6, line_end=9, content="def b(): ...", token_est=5), + ], + ) + rows = repo.chunks_for_embedding(db.conn) + assert [r["content"] for r in rows] == ["def a(): ...", "def b(): ..."] + + live_ids = [int(r["id"]) for r in rows] + for cid in live_ids: + repo.upsert_chunk_vector_blob(db.conn, cid, b"vec") + repo.upsert_chunk_vector_blob(db.conn, 9999, b"orphan") + assert repo.prune_orphan_vectors(db.conn) == 1 + assert repo.embedded_chunk_ids(db.conn) == set(live_ids) + db.close() From 698e8e22320642a5c3678f0885d5a3bdffd95c57 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 22:59:57 +0300 Subject: [PATCH 2/9] feat(release): single-source skill copies and version with a CI sync gate - scripts/sync_skill_copies.py: src/codebase_index/skill_template/ is the canonical source; the script 
regenerates the committed installed copies (.claude/.codex/.opencode), the plugin skill (skills/), the shared installer files (skill/), the .skill_version stamps, the plugin.json version and the requirements.lock release tag. --check reports drift and exits 1 for CI. Comparison normalizes CRLF/LF so core.autocrlf worktrees don't false-positive. - CI lint job gains a 'Skill copies in sync' step. - pyproject.toml switches to hatch dynamic versioning: the version now lives only in src/codebase_index/__init__.py; test_plugin_manifest.py reads it from there. - .gitattributes: cbx is a POSIX script without an extension, so *.sh never matched it and Windows checkouts got a CRLF shebang. Pin it to eol=lf. - CONTRIBUTING.md documents the bump-and-sync flow. Co-Authored-By: Claude Fable 5 --- .gitattributes | 2 + .github/workflows/ci.yml | 2 + CONTRIBUTING.md | 7 ++ pyproject.toml | 5 +- scripts/sync_skill_copies.py | 175 ++++++++++++++++++++++++++++++++ tests/test_plugin_manifest.py | 8 +- tests/test_sync_skill_copies.py | 89 ++++++++++++++++ 7 files changed, 284 insertions(+), 4 deletions(-) create mode 100644 scripts/sync_skill_copies.py create mode 100644 tests/test_sync_skill_copies.py diff --git a/.gitattributes b/.gitattributes index db057f3..5a02ab9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,3 +3,5 @@ *.sh text eol=lf *.ps1 text eol=lf *.py text eol=lf +# cbx is a POSIX script with no file extension; CRLF in the worktree breaks its shebang when the file is copied +cbx text eol=lf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4cd321b..3e2657b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,6 +19,8 @@ jobs: run: ruff check src tests - name: Mypy run: mypy src/codebase_index + - name: Skill copies in sync + run: python scripts/sync_skill_copies.py --check test: needs: lint diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 44387d4..16837e0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -105,6 +105,13 @@ the project version 
unless a maintainer explicitly requests it. Add user-visible changes to `CHANGELOG.md` under `[Unreleased]`; maintainers choose the release version according to the project's versioning policy. +The version lives in one place: `src/codebase_index/__init__.py` (`__version__`). +`pyproject.toml` reads it via hatch dynamic versioning. After changing the +version or anything under `src/codebase_index/skill_template/`, run +`python scripts/sync_skill_copies.py` to regenerate the committed skill copies +and version stamps; CI rejects the PR if they drift +(`python scripts/sync_skill_copies.py --check`). + ## Test Requirements - All new features must include tests. diff --git a/pyproject.toml b/pyproject.toml index 0620caa..25d501a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "codebase-index" -version = "1.2.2" +dynamic = ["version"] description = "Local-first hybrid codebase index for AI coding agents, exposed as CLI, Skill, and MCP tools." readme = "README.md" requires-python = ">=3.11" @@ -59,6 +59,9 @@ Documentation = "https://github.com/denfry/codebase-index/tree/main/docs" Changelog = "https://github.com/denfry/codebase-index/blob/main/CHANGELOG.md" Issues = "https://github.com/denfry/codebase-index/issues" +[tool.hatch.version] +path = "src/codebase_index/__init__.py" + [tool.hatch.build.targets.wheel] packages = ["src/codebase_index"] diff --git a/scripts/sync_skill_copies.py b/scripts/sync_skill_copies.py new file mode 100644 index 0000000..1cd76aa --- /dev/null +++ b/scripts/sync_skill_copies.py @@ -0,0 +1,175 @@ +# scripts/sync_skill_copies.py +"""Keep every committed copy of the skill package in sync with the canonical +source: src/codebase_index/skill_template/ (the copy shipped in the wheel) and +the package version in src/codebase_index/__init__.py. 
+ +Derived copies maintained: + .claude/skills/codebase-index/ installed copy, committed for this repo + .codex/skills/codebase-index/ installed copy, committed for this repo + .opencode/skills/codebase-index/ installed copy, committed for this repo + skills/codebase-index/SKILL.md plugin skill (Claude Code picks up skills/) + skill/SKILL.md, skill/scripts/cbx, skill/scripts/cbx.ps1 + installer source package (shared files only; + the rest of skill/ is owned by install.sh) + +Version stamps maintained: + <installed copy>/.skill_version == __version__ + .claude-plugin/plugin.json "version" field == __version__ + requirements.lock release tag v<__version__> + +Usage: + python scripts/sync_skill_copies.py # rewrite derived copies + python scripts/sync_skill_copies.py --check # list drift, exit 1 (for CI) +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + +REPO = Path(__file__).resolve().parents[1] + +TEMPLATE_REL = Path("src/codebase_index/skill_template") +INSTALLED_COPIES = ( + Path(".claude/skills/codebase-index"), + Path(".codex/skills/codebase-index"), + Path(".opencode/skills/codebase-index"), +) +PLUGIN_SKILL_REL = Path("skills/codebase-index") +INSTALLER_SHARED = ("SKILL.md", "scripts/cbx", "scripts/cbx.ps1") + +VERSION_RE = re.compile(r'^__version__ = "([^"]+)"$', re.M) +PLUGIN_VERSION_RE = re.compile(r'("version"\s*:\s*)"[^"]+"') +LOCK_TAG_RE = re.compile(r"(refs/tags/v)[0-9][^/]*?(\.tar\.gz)") + + +def package_version(repo: Path) -> str: + text = (repo / "src/codebase_index/__init__.py").read_text(encoding="utf-8") + match = VERSION_RE.search(text) + if not match: + raise SystemExit("could not find __version__ in src/codebase_index/__init__.py") + return match.group(1) + + +def template_files(repo: Path) -> list[Path]: + root = repo / TEMPLATE_REL + return sorted(p.relative_to(root) for p in root.rglob("*") if p.is_file()) + + +def expected_files(repo: Path, version: str) -> dict[Path, bytes]: + 
"""Map every derived file (repo-relative) to the bytes it must contain.""" + template = repo / TEMPLATE_REL + rels = template_files(repo) + expected: dict[Path, bytes] = {} + + for copy in INSTALLED_COPIES: + for rel in rels: + expected[copy / rel] = (template / rel).read_bytes() + expected[copy / ".skill_version"] = f"{version}\n".encode() + + expected[PLUGIN_SKILL_REL / "SKILL.md"] = (template / "SKILL.md").read_bytes() + + for rel in INSTALLER_SHARED: + expected[Path("skill") / rel] = (template / rel).read_bytes() + + return expected + + +def version_stamp_problems(repo: Path, version: str) -> list[str]: + problems: list[str] = [] + + plugin_path = repo / ".claude-plugin/plugin.json" + plugin_ver = json.loads(plugin_path.read_text(encoding="utf-8")).get("version") + if plugin_ver != version: + problems.append(f".claude-plugin/plugin.json: version {plugin_ver!r} != {version!r}") + + lock_path = repo / "requirements.lock" + match = LOCK_TAG_RE.search(lock_path.read_text(encoding="utf-8")) + lock_ver = match.group(0).removeprefix("refs/tags/v").removesuffix(".tar.gz") if match else None + if lock_ver != version: + problems.append(f"requirements.lock: release tag {lock_ver!r} != {version!r}") + + return problems + + +def _norm(data: bytes) -> bytes: + """Normalize line endings before comparing: with core.autocrlf the worktree + may hold CRLF for files that the index stores with LF, and that must not + count as drift.""" + return data.replace(b"\r\n", b"\n") + + +def check(repo: Path, version: str) -> list[str]: + problems: list[str] = [] + for rel, want in expected_files(repo, version).items(): + path = repo / rel + if not path.exists(): + problems.append(f"{rel.as_posix()}: missing") + elif _norm(path.read_bytes()) != _norm(want): + problems.append(f"{rel.as_posix()}: differs from skill_template") + problems.extend(version_stamp_problems(repo, version)) + return problems + + +def sync(repo: Path, version: str) -> list[str]: + written: list[str] = [] + for rel, want 
in expected_files(repo, version).items(): + path = repo / rel + if path.exists() and _norm(path.read_bytes()) == _norm(want): + continue + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(want) + written.append(rel.as_posix()) + + plugin_path = repo / ".claude-plugin/plugin.json" + plugin_text = plugin_path.read_text(encoding="utf-8") + new_text = PLUGIN_VERSION_RE.sub(rf'\g<1>"{version}"', plugin_text, count=1) + if new_text != plugin_text: + plugin_path.write_text(new_text, encoding="utf-8") + written.append(".claude-plugin/plugin.json") + + lock_path = repo / "requirements.lock" + lock_text = lock_path.read_text(encoding="utf-8") + new_lock = LOCK_TAG_RE.sub(rf"\g<1>{version}\g<2>", lock_text) + if new_lock != lock_text: + lock_path.write_text(new_lock, encoding="utf-8") + written.append("requirements.lock") + + return written + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--check", action="store_true", help="report drift without writing") + parser.add_argument("--repo", type=Path, default=REPO, help=argparse.SUPPRESS) + args = parser.parse_args(argv) + + repo = args.repo.resolve() + version = package_version(repo) + + if args.check: + problems = check(repo, version) + if problems: + print(f"skill copies out of sync with skill_template / version {version}:") + for p in problems: + print(f" - {p}") + print("run: python scripts/sync_skill_copies.py") + return 1 + print(f"all skill copies in sync (version {version})") + return 0 + + written = sync(repo, version) + if written: + print(f"updated {len(written)} file(s) to match skill_template / version {version}:") + for w in written: + print(f" - {w}") + else: + print(f"all skill copies already in sync (version {version})") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_plugin_manifest.py b/tests/test_plugin_manifest.py index cd56095..1e5c887 100644 --- 
a/tests/test_plugin_manifest.py +++ b/tests/test_plugin_manifest.py @@ -27,9 +27,11 @@ def test_marketplace_lists_plugin_from_repo_root(): assert entries["codebase-index"]["source"] == "./" -def test_plugin_version_matches_pyproject(): - pyproject = (ROOT / "pyproject.toml").read_text(encoding="utf-8") - ver = re.search(r'^version = "([^"]+)"', pyproject, re.MULTILINE).group(1) +def test_plugin_version_matches_package(): + # pyproject.toml uses hatch dynamic versioning; the single source of truth + # for the version is src/codebase_index/__init__.py. + init_text = (ROOT / "src/codebase_index/__init__.py").read_text(encoding="utf-8") + ver = re.search(r'^__version__ = "([^"]+)"$', init_text, re.MULTILINE).group(1) assert _load(".claude-plugin/plugin.json")["version"] == ver diff --git a/tests/test_sync_skill_copies.py b/tests/test_sync_skill_copies.py new file mode 100644 index 0000000..8b4bfdb --- /dev/null +++ b/tests/test_sync_skill_copies.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import importlib.util +import json +from pathlib import Path + +REPO = Path(__file__).resolve().parents[1] + +_spec = importlib.util.spec_from_file_location( + "sync_skill_copies", REPO / "scripts" / "sync_skill_copies.py" +) +assert _spec is not None and _spec.loader is not None +sync_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(sync_mod) + + +def _mini_repo(tmp_path: Path, version: str = "9.9.9") -> Path: + repo = tmp_path / "repo" + (repo / "src/codebase_index/skill_template/scripts").mkdir(parents=True) + (repo / "src/codebase_index/__init__.py").write_text( + f'__version__ = "{version}"\n', encoding="utf-8" + ) + tpl = repo / "src/codebase_index/skill_template" + (tpl / "SKILL.md").write_text("skill body\n", encoding="utf-8") + (tpl / "scripts" / "cbx").write_text("#!/bin/sh\n", encoding="utf-8") + (tpl / "scripts" / "cbx.ps1").write_text("Write-Output ok\n", encoding="utf-8") + (repo / ".claude-plugin").mkdir() + (repo / 
".claude-plugin/plugin.json").write_text( + '{\n "name": "codebase-index",\n "version": "%s"\n}\n' % version, encoding="utf-8" + ) + (repo / "requirements.lock").write_text( + f"codebase-index @ https://github.com/x/y/archive/refs/tags/v{version}.tar.gz\n", + encoding="utf-8", + ) + sync_mod.sync(repo, version) + return repo + + +def test_real_repo_is_in_sync(): + version = sync_mod.package_version(REPO) + assert sync_mod.check(REPO, version) == [] + + +def test_check_detects_drift_and_sync_repairs(tmp_path): + repo = _mini_repo(tmp_path) + assert sync_mod.check(repo, "9.9.9") == [] + + drifted = repo / ".claude/skills/codebase-index/SKILL.md" + drifted.write_text("locally edited\n", encoding="utf-8") + (repo / "skills/codebase-index/SKILL.md").unlink() + + problems = sync_mod.check(repo, "9.9.9") + assert any("differs" in p for p in problems) + assert any("missing" in p for p in problems) + + sync_mod.sync(repo, "9.9.9") + assert sync_mod.check(repo, "9.9.9") == [] + assert drifted.read_text(encoding="utf-8") == "skill body\n" + + +def test_version_bump_flows_to_all_stamps(tmp_path): + repo = _mini_repo(tmp_path) + (repo / "src/codebase_index/__init__.py").write_text( + '__version__ = "9.9.10"\n', encoding="utf-8" + ) + version = sync_mod.package_version(repo) + assert version == "9.9.10" + + problems = sync_mod.check(repo, version) + assert any(".skill_version" in p for p in problems) + assert any("plugin.json" in p for p in problems) + assert any("requirements.lock" in p for p in problems) + + sync_mod.sync(repo, version) + assert sync_mod.check(repo, version) == [] + stamp = repo / ".claude/skills/codebase-index/.skill_version" + assert stamp.read_text(encoding="utf-8") == "9.9.10\n" + plugin = json.loads((repo / ".claude-plugin/plugin.json").read_text(encoding="utf-8")) + assert plugin == {"name": "codebase-index", "version": "9.9.10"} + assert "v9.9.10.tar.gz" in (repo / "requirements.lock").read_text(encoding="utf-8") + + +def 
test_crlf_worktree_is_not_drift(tmp_path): + # core.autocrlf=true checks text files out with CRLF; that must not be + # reported as drift against the LF template. + repo = _mini_repo(tmp_path) + skill_md = repo / ".claude/skills/codebase-index/SKILL.md" + skill_md.write_bytes(b"skill body\r\n") + assert sync_mod.check(repo, "9.9.9") == [] From 01da326afd4c1d711b86a8296fed6cd0f3d35002 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 22:59:57 +0300 Subject: [PATCH 3/9] chore(lint): drop unused imports in benchmark_real_repo ruff check tests was failing on F401 before unrelated changes. Co-Authored-By: Claude Fable 5 --- tests/benchmark_real_repo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/benchmark_real_repo.py b/tests/benchmark_real_repo.py index 6b09b8a..aff4e3d 100644 --- a/tests/benchmark_real_repo.py +++ b/tests/benchmark_real_repo.py @@ -29,8 +29,7 @@ from codebase_index.config import Config from codebase_index.graph.expand import impact_lookup -from codebase_index.indexer.freshness import compute_freshness -from codebase_index.indexer.pipeline import build_index, update_index +from codebase_index.indexer.pipeline import build_index from codebase_index.retrieval.pipeline import search from codebase_index.storage import repo as repo_store from codebase_index.storage.db import Database From de3082a039cd1e233168e352a3e3d8832dd5fde6 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 23:09:08 +0300 Subject: [PATCH 4/9] perf(index): batch edge resolution, vector writes, and index edges(file_id) - resolve_edges did one indexed lookup per symbol edge and up to ~20 full-table LIKE scans per import edge. Now: one GROUP BY for globally unique symbol names, one pass over file paths expanded into an in-memory '/'-aligned suffix map (case-folded like SQLite LIKE), one executemany. 
Identical resolutions on this repo's index (digest-checked), 7-28x faster at 248 files; the gap grows with file count since LIKE scans disappear. - edges(file_id) was unindexed: replace_edges deletes per file on every incremental update and files deletions cascade into edges - both were full scans (EXPLAIN-verified before/after). - _embed_chunks wrote vector blobs row by row; now a single batched upsert_chunk_vector_blobs (executemany). - recompute_degrees was checked and left alone: its correlated subqueries already use idx_edges_src/idx_edges_dst. - New graph tests pin the resolution semantics: ambiguous symbol names stay unresolved, ambiguous import suffixes need a longer path, import matching stays ASCII case-insensitive. Co-Authored-By: Claude Fable 5 --- src/codebase_index/graph/builder.py | 60 ++++++++++++++++++------- src/codebase_index/indexer/pipeline.py | 7 +-- src/codebase_index/storage/repo.py | 41 +++++++++++++++-- src/codebase_index/storage/schema.sql | 3 ++ tests/test_graph.py | 61 ++++++++++++++++++++++++++ 5 files changed, 151 insertions(+), 21 deletions(-) diff --git a/src/codebase_index/graph/builder.py b/src/codebase_index/graph/builder.py index 80ee6b4..99f6ede 100644 --- a/src/codebase_index/graph/builder.py +++ b/src/codebase_index/graph/builder.py @@ -6,6 +6,11 @@ UNAMBIGUOUS name match — if two definitions share a name, the edge is left unresolved rather than guessed. Import edges resolve their module path to a file by POSIX path-suffix match (e.g. 'auth.token' -> '%/auth/token.py'). + +The pass is batched: one query for globally-unique symbol names, one for file +paths (expanded into an in-memory suffix map), one executemany for the updates. +The per-edge variant did an indexed lookup per symbol edge and up to ~20 +full-table LIKE scans per import edge, which dominated large builds. 
""" from __future__ import annotations @@ -26,30 +31,55 @@ def build_graph(conn: sqlite3.Connection) -> dict[str, int]: def resolve_edges(conn: sqlite3.Connection) -> int: - resolved = 0 - for edge in repo.unresolved_edges(conn): + edges = repo.unresolved_edges(conn) + if not edges: + return 0 + + unique_symbols = repo.unique_symbol_ids_by_name(conn) + suffix_map = _path_suffix_map(repo.all_file_ids_with_paths(conn)) + + resolutions: list[tuple[str, int, int]] = [] + for edge in edges: name = edge["dst_name"] if edge["edge_type"] == "import": - file_id = _module_to_file_id(conn, name) + file_id = _module_to_file_id(suffix_map, name) if file_id is not None: - repo.resolve_edge(conn, edge["id"], "file", file_id) - resolved += 1 + resolutions.append(("file", file_id, edge["id"])) elif edge["edge_type"] in _SYMBOL_EDGE_TYPES: - sym_id = repo.symbol_id_for_unique_name(conn, name) + sym_id = unique_symbols.get(name) if sym_id is not None: - repo.resolve_edge(conn, edge["id"], "symbol", sym_id) - resolved += 1 - return resolved + resolutions.append(("symbol", sym_id, edge["id"])) + + repo.resolve_edges_bulk(conn, resolutions) + return len(resolutions) + + +def _path_suffix_map(rows: list[sqlite3.Row]) -> dict[str, Optional[int]]: + """Map every '/'-aligned path suffix to its file id, or None when ambiguous. + + Mirrors files_with_suffix(path = suffix OR path LIKE '%/suffix') semantics: + a suffix shared by several files resolves to None (like a multi-row result), + and matching is case-insensitive the way SQLite LIKE folds ASCII. 
+ """ + mapping: dict[str, Optional[int]] = {} + for row in rows: + parts = row["path"].lower().split("/") + for i in range(len(parts)): + suffix = "/".join(parts[i:]) + mapping[suffix] = None if suffix in mapping else int(row["id"]) + return mapping -def _module_to_file_id(conn: sqlite3.Connection, module: str) -> Optional[int]: +def _module_to_file_id( + suffix_map: dict[str, Optional[int]], module: str +) -> Optional[int]: """Resolve a module/import path to a unique file id, or None. Handles Python, TypeScript/JavaScript, Java/Kotlin/Scala, Rust (:: separator), Go (last path segment), C#, Ruby, and PHP import conventions. """ - base = module.replace(".", "/").strip("/") - rust_base = module.replace("::", "/").strip("/") + base = module.lower().replace(".", "/").strip("/") + rust_base = module.lower().replace("::", "/").strip("/") if not base: return None # Last segment used for Go package-level resolution @@ -84,7 +114,7 @@ def _module_to_file_id(conn: sqlite3.Connection, module: str) -> Optional[int]: # PHP f"{base}.php", ): - rows = repo.files_with_suffix(conn, suffix) - if len(rows) == 1: - return int(rows[0]["id"]) + file_id = suffix_map.get(suffix) + if file_id is not None: + return file_id return None diff --git a/src/codebase_index/indexer/pipeline.py b/src/codebase_index/indexer/pipeline.py index 6f8237d..6feabca 100644 --- a/src/codebase_index/indexer/pipeline.py +++ b/src/codebase_index/indexer/pipeline.py @@ -227,9 +227,10 @@ def _embed_chunks(cfg, db, conn) -> int: fresh[sha] = sqlite_vec.serialize_float32(vec) repo.store_cached_embeddings(conn, model=backend.name, items=list(fresh.items())) - for row, sha in zip(rows, shas): - blob = cached.get(sha) or fresh[sha] - repo.upsert_chunk_vector_blob(conn, int(row["id"]), blob) + repo.upsert_chunk_vector_blobs( + conn, + [(int(row["id"]), cached.get(sha) or fresh[sha]) for row, sha in zip(rows, shas)], + ) built_at = datetime.now(timezone.utc).isoformat() repo.set_vec_meta(conn, model=backend.name, 
dim=backend.dim, built_at=built_at) diff --git a/src/codebase_index/storage/repo.py b/src/codebase_index/storage/repo.py index 338a12b..5f8dab7 100644 --- a/src/codebase_index/storage/repo.py +++ b/src/codebase_index/storage/repo.py @@ -381,6 +381,30 @@ def resolve_edge(conn: sqlite3.Connection, edge_id: int, dst_kind: str, dst_id: ) +def resolve_edges_bulk( + conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int]] +) -> None: + """Apply (dst_kind, dst_id, edge_id) resolutions in one executemany.""" + conn.executemany( + "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1 WHERE id = ?", + resolutions, + ) + + +def unique_symbol_ids_by_name(conn: sqlite3.Connection) -> dict[str, int]: + """Map symbol name -> id for names defined exactly once in the repo.""" + return { + row["name"]: int(row["sym_id"]) + for row in conn.execute( + "SELECT name, MIN(id) AS sym_id FROM symbols GROUP BY name HAVING COUNT(*) = 1" + ) + } + + +def all_file_ids_with_paths(conn: sqlite3.Connection) -> list[sqlite3.Row]: + return conn.execute("SELECT id, path FROM files").fetchall() + + def symbol_id_for_unique_name(conn: sqlite3.Connection, name: str) -> Optional[int]: rows = conn.execute( "SELECT id FROM symbols WHERE name = ? 
LIMIT 2", (name,) @@ -481,10 +505,21 @@ def upsert_chunk_vector( def upsert_chunk_vector_blob(conn: sqlite3.Connection, chunk_id: int, blob: bytes) -> None: """Write a pre-serialized float32 embedding blob for a chunk (cache-reuse path).""" - conn.execute("DELETE FROM vec_chunks WHERE chunk_id = ?", (int(chunk_id),)) - conn.execute( + upsert_chunk_vector_blobs(conn, [(chunk_id, blob)]) + + +def upsert_chunk_vector_blobs( + conn: sqlite3.Connection, items: Sequence[tuple[int, bytes]] +) -> None: + """Batch-write pre-serialized embedding blobs (one executemany per statement).""" + if not items: + return + conn.executemany( + "DELETE FROM vec_chunks WHERE chunk_id = ?", [(int(cid),) for cid, _ in items] + ) + conn.executemany( "INSERT INTO vec_chunks (chunk_id, embedding) VALUES (?, ?)", - (int(chunk_id), blob), + [(int(cid), blob) for cid, blob in items], ) diff --git a/src/codebase_index/storage/schema.sql b/src/codebase_index/storage/schema.sql index ad4607e..e0d08ed 100644 --- a/src/codebase_index/storage/schema.sql +++ b/src/codebase_index/storage/schema.sql @@ -64,6 +64,9 @@ CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_kind, src_id); CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_kind, dst_id); CREATE INDEX IF NOT EXISTS idx_edges_name ON edges(dst_name); CREATE INDEX IF NOT EXISTS idx_edges_type ON edges(edge_type); +-- replace_edges deletes per file on every incremental update, and files(id) +-- deletions cascade here; without this index both are full edges scans. 
+CREATE INDEX IF NOT EXISTS idx_edges_file ON edges(file_id); CREATE TABLE IF NOT EXISTS modules ( id INTEGER PRIMARY KEY, diff --git a/tests/test_graph.py b/tests/test_graph.py index 74e75cf..f8e5110 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -63,6 +63,67 @@ def test_build_graph_resolves_symbol_and_import_edges(tmp_path): db.close() +def _file(db, path, sha="x"): + return repo.upsert_file( + db.conn, path=path, lang="python", size_bytes=1, sha256=sha, + mtime_ns=1, git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + + +def test_ambiguous_symbol_name_stays_unresolved(tmp_path): + db = _db(tmp_path) + fid_a = _file(db, "src/a.py", "a") + fid_b = _file(db, "src/b.py", "b") + repo.replace_symbols(db.conn, fid_a, [ + Symbol(name="helper", kind="function", line_start=1, line_end=2, qualified="helper"), + ]) + repo.replace_symbols(db.conn, fid_b, [ + Symbol(name="helper", kind="function", line_start=1, line_end=2, qualified="helper"), + ]) + repo.replace_edges(db.conn, fid_a, [ + {"edge_type": "call", "src_kind": "file", "src_id": fid_a, + "dst_kind": None, "dst_id": None, "dst_name": "helper", "line": 3, "resolved": 0}, + ]) + res = build_graph(db.conn) + assert res["resolved"] == 0 and res["unresolved"] == 1 + db.close() + + +def test_ambiguous_import_suffix_needs_longer_path(tmp_path): + db = _db(tmp_path) + fid_a = _file(db, "pkg_a/utils.py", "a") + _file(db, "pkg_b/utils.py", "b") + src = _file(db, "src/main.py", "c") + repo.replace_edges(db.conn, src, [ + {"edge_type": "import", "src_kind": "file", "src_id": src, + "dst_kind": None, "dst_id": None, "dst_name": "utils", "line": 1, "resolved": 0}, + {"edge_type": "import", "src_kind": "file", "src_id": src, + "dst_kind": None, "dst_id": None, "dst_name": "pkg_a.utils", "line": 2, "resolved": 0}, + ]) + res = build_graph(db.conn) + # bare "utils" matches two files -> unresolved; the qualified path is unique. 
+ assert res["resolved"] == 1 and res["unresolved"] == 1 + inc = repo.incoming_edges(db.conn, "file", fid_a) + assert any(r["edge_type"] == "import" for r in inc) + db.close() + + +def test_import_resolution_is_case_insensitive(tmp_path): + # SQLite LIKE folds ASCII case; the in-memory suffix map must keep that. + db = _db(tmp_path) + fid_a = _file(db, "src/Auth/Token.py", "a") + src = _file(db, "src/main.py", "b") + repo.replace_edges(db.conn, src, [ + {"edge_type": "import", "src_kind": "file", "src_id": src, + "dst_kind": None, "dst_id": None, "dst_name": "auth.token", "line": 1, "resolved": 0}, + ]) + res = build_graph(db.conn) + assert res["resolved"] == 1 + inc = repo.incoming_edges(db.conn, "file", fid_a) + assert any(r["edge_type"] == "import" for r in inc) + db.close() + + def _indexed(sample_repo, tmp_path): cfg = Config() cfg.root = str(sample_repo) From ab9e19847c61b849416635026f10c0c5b30ab5b3 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 23:27:36 +0300 Subject: [PATCH 5/9] refactor(service): shared CLI/MCP service layer; close surface drift - New src/codebase_index/service.py owns what the two surfaces duplicated: cache/db path formula (was hand-built in 5+ places), db+config resolution, the explain query rewrite, vector-aware search sessions, and the stats payload. - MCP search_code/explain_code now resolve the embedding backend like the CLI does (warnings to stderr - stdout carries JSON-RPC). Before, hybrid search over MCP silently ran without the vector channel even with embeddings on. - MCP index_stats now reports the per-language graph tier (full/partial) the skill keys on - it was CLI-only. - MCP db resolution unified with the CLI: config loads from CBX_ROOT/cwd; CBX_DB_PATH overrides only the index location (it used to also hijack the config root). stats/watch now honor CBX_DB_PATH like the search family. - cli.py module docstring no longer claims commands are M0 stubs. 
Co-Authored-By: Claude Fable 5 --- src/codebase_index/cli.py | 115 ++++++++-------------------- src/codebase_index/mcp/server.py | 83 +++++++-------------- src/codebase_index/service.py | 124 +++++++++++++++++++++++++++++++ tests/test_service.py | 39 ++++++++++ 4 files changed, 219 insertions(+), 142 deletions(-) create mode 100644 src/codebase_index/service.py create mode 100644 tests/test_service.py diff --git a/src/codebase_index/cli.py b/src/codebase_index/cli.py index d148d28..2e2a6ef 100644 --- a/src/codebase_index/cli.py +++ b/src/codebase_index/cli.py @@ -1,8 +1,9 @@ """Typer CLI app — the single entry point for both humans and the Claude Code skill. -Commands map 1:1 to docs/ARCHITECTURE.md §5 (CLI contract). At M0 these are stubs that parse the -documented flags and emit `not implemented`; later milestones fill in the bodies by delegating to -the `indexer`, `retrieval`, and `storage` layers. +Commands map 1:1 to docs/ARCHITECTURE.md §5 (CLI contract) and delegate to the +`indexer`, `retrieval`, and `storage` layers through `service.py` — the same +layer the MCP server uses, so the two surfaces cannot drift. Only `clean` is +still a stub. 
Conventions: * every command accepts global options via the Typer context: --root, --json, --quiet @@ -40,25 +41,13 @@ def _todo(name: str) -> None: raise typer.Exit(code=0) -def _resolve_db_path(ctx: "typer.Context") -> Path: - from .config import load - - override = os.environ.get("CBX_DB_PATH") - if override: - return Path(override) - root_opt = ctx.obj.get("root") if ctx.obj else None - cfg = load(root_opt) - return Path(cfg.root) / ".claude" / "cache" / "codebase-index" / "index.sqlite" - - def _ensure_index(ctx: "typer.Context") -> tuple[Path, Any]: - from .config import load from .indexer.pipeline import build_index + from .service import resolve_db from .storage.db import Database root_opt = ctx.obj.get("root") if ctx.obj else None - cfg = load(root_opt) - db_path = _resolve_db_path(ctx) + db_path, cfg = resolve_db(root_opt) if db_path.exists(): return db_path, cfg @@ -86,17 +75,12 @@ def _open_in_browser(path: Path) -> None: def _resolve_backend_for_search(ctx: "typer.Context"): - """Resolve an embedding backend from config for query-time vector search. - - Returns a NoopBackend (enabled=False) when embeddings are off, so callers can - branch on `backend.enabled`. Network/external gating is enforced by - resolve_backend (SECURITY.md §4). 
- """ + """Embedding backend for query-time vector search (see service.search_backend).""" from .config import load - from .embeddings.backend import resolve_backend + from .service import search_backend cfg = load(ctx.obj.get("root") if ctx.obj else None) - return resolve_backend(cfg, warn=lambda m: typer.echo(m, err=True)) + return search_backend(cfg, warn=lambda m: typer.echo(m, err=True)) def _interactive_target_choice(detected_cli: list[str], detected_mcp: list[str]) -> str: @@ -388,8 +372,7 @@ def search( """Hybrid ranked search; returns compact results + recommended_reads.""" from .output import json as json_renderer from .output import markdown as md_renderer - from .retrieval.pipeline import search as run_search - from .storage.db import Database + from .service import search_payload if offset < 0: typer.echo("[codebase-index] --offset must be >= 0.") @@ -406,15 +389,10 @@ def search( raise typer.Exit(code=2) db_path, cfg = _ensure_index(ctx) - - with Database(db_path) as db: - if backend is not None and getattr(backend, "enabled", False): - db.enable_vectors() - payload = run_search( - db.conn, query, mode=mode, limit=limit, offset=offset, - token_budget=token_budget, no_fallback=no_fallback, backend=backend, - root=Path(cfg.root), config=cfg, - ) + payload = search_payload( + db_path, cfg, query, mode=mode, limit=limit, offset=offset, + token_budget=token_budget, no_fallback=no_fallback, backend=backend, + ) want_json = json_out or (ctx.obj and ctx.obj.get("json")) typer.echo(json_renderer.render(payload) if want_json else md_renderer.render(payload)) @@ -495,21 +473,15 @@ def explain( """Intent-aware bundle for 'how does X work' / overview questions.""" from .output import json as json_renderer from .output import markdown as md_renderer - from .retrieval.pipeline import search as run_search - from .storage.db import Database + from .service import normalize_explain_query, search_payload backend = _resolve_backend_for_search(ctx) db_path, cfg = 
_ensure_index(ctx) - q = query if any(w in query.lower() for w in ("how", "architecture", "overview")) else f"how does {query} work" - with Database(db_path) as db: - if getattr(backend, "enabled", False): - db.enable_vectors() - payload = run_search( - db.conn, q, mode="hybrid", limit=10, - token_budget=token_budget, no_fallback=False, backend=backend, - root=Path(cfg.root), config=cfg, - ) + payload = search_payload( + db_path, cfg, normalize_explain_query(query), mode="hybrid", limit=10, + token_budget=token_budget, no_fallback=False, backend=backend, + ) want_json = json_out or (ctx.obj and ctx.obj.get("json")) typer.echo(json_renderer.render(payload) if want_json else md_renderer.render(payload)) @@ -529,11 +501,12 @@ def graph_view( import json as _json from .graph.export import export_graph_html + from .service import cache_dir_for from .storage.db import Database is_json = json_flag or bool(ctx.obj and ctx.obj.get("json")) db_path, cfg = _ensure_index(ctx) - out = output or Path(cfg.root) / ".claude" / "cache" / "codebase-index" / "graph.html" + out = output or cache_dir_for(cfg) / "graph.html" with Database(db_path) as db: stats = export_graph_html( @@ -570,14 +543,11 @@ def stats( """Index size, coverage %, and freshness.""" import json as _json - from .config import load - from .parsers.languages import has_full_graph - from .storage import repo + from .service import resolve_db, stats_payload from .storage.db import Database root_opt = ctx.obj.get("root") if ctx.obj else None - cfg = load(root_opt) - db_path = Path(cfg.root) / ".claude" / "cache" / "codebase-index" / "index.sqlite" + db_path, _cfg = resolve_db(root_opt) is_json = json_flag or bool(ctx.obj and ctx.obj.get("json")) @@ -589,38 +559,16 @@ def stats( raise typer.Exit(code=0) with Database(db_path) as db: - files = repo.count_files(db.conn) - symbols = repo.count_symbols(db.conn) - built_at = repo.get_meta(db.conn, "built_at") - head = repo.get_meta(db.conn, "head_commit") - coverage = [ - { 
- "lang": r["lang"], - "files": r["files"], - "symbols": r["symbols"], - # Tier-A languages get import/inheritance edges; Tier-B is symbols-only, - # so refs/impact are partial for them. - "graph": "full" if has_full_graph(r["lang"]) else "partial", - } - for r in repo.treesitter_coverage(db.conn) - ] + payload = stats_payload(db.conn) if is_json: + typer.echo(_json.dumps(payload)) + else: typer.echo( - _json.dumps( - { - "files": files, - "symbols": symbols, - "built_at": built_at, - "head_commit": head, - "treesitter_coverage": coverage, - "exists": True, - } - ) + f"files={payload['files']} symbols={payload['symbols']} " + f"built_at={payload['built_at']} head={payload['head_commit']}" ) - else: - typer.echo(f"files={files} symbols={symbols} built_at={built_at} head={head}") - for r in coverage: + for r in payload["treesitter_coverage"]: flag = " ⚠ 0 symbols" if (r["symbols"] or 0) == 0 and r["files"] >= 3 else "" tier = " · partial graph (Tier-B)" if r["graph"] == "partial" else "" typer.echo(f" {r['lang']}: {r['files']} files, {r['symbols']} symbols{flag}{tier}") @@ -712,11 +660,10 @@ def watch( debounce: int = typer.Option(500, "--debounce", help="Debounce window in ms."), ) -> None: """Live incremental indexing via filesystem events (requires the 'watch' extra).""" - from .config import load + from .service import resolve_db from .watch.watcher import run_watch - cfg = load(ctx.obj.get("root") if ctx.obj else None) - db_path = Path(cfg.root) / ".claude" / "cache" / "codebase-index" / "index.sqlite" + db_path, cfg = resolve_db(ctx.obj.get("root") if ctx.obj else None) if not db_path.exists(): typer.echo("No index found. 
Run `codebase-index index` before `watch`.") raise typer.Exit(code=1) diff --git a/src/codebase_index/mcp/server.py b/src/codebase_index/mcp/server.py index aeaa9fe..710917a 100644 --- a/src/codebase_index/mcp/server.py +++ b/src/codebase_index/mcp/server.py @@ -19,6 +19,7 @@ import json import os +import sys from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -44,20 +45,19 @@ ) -def _resolve_db() -> tuple[Path, Config]: +def _resolve_db() -> tuple[Path, "Config"]: """Return (db_path, config). Respects CBX_DB_PATH and CBX_ROOT env vars.""" - from ..config import load - - override = os.environ.get("CBX_DB_PATH") - if override: - db_path = Path(override) - cfg: Config = load(Path(db_path.parent)) - return db_path, cfg + from ..service import resolve_db root_env = os.environ.get("CBX_ROOT") - cfg = load(Path(root_env) if root_env else None) - db_path = Path(cfg.root) / ".claude" / "cache" / "codebase-index" / "index.sqlite" - return db_path, cfg + return resolve_db(Path(root_env) if root_env else None) + + +def _search_backend(cfg: "Config"): + # stdout carries the JSON-RPC stream — warnings must go to stderr. 
+ from ..service import search_backend + + return search_backend(cfg, warn=lambda m: print(m, file=sys.stderr)) def _no_index_error() -> str: @@ -114,21 +114,12 @@ def search_code( if not db_path.exists(): return _no_index_error() - from ..retrieval.pipeline import search as run_search - from ..storage.db import Database + from ..service import search_payload - with Database(db_path) as db: - payload = run_search( - db.conn, - query, - mode=mode, - limit=limit, - token_budget=token_budget, - no_fallback=False, - root=Path(cfg.root), - config=cfg, - offset=offset, - ) + payload = search_payload( + db_path, cfg, query, mode=mode, limit=limit, offset=offset, + token_budget=token_budget, no_fallback=False, backend=_search_backend(cfg), + ) return json.dumps(payload) @@ -232,22 +223,13 @@ def explain_code( if not db_path.exists(): return _no_index_error() - from ..retrieval.pipeline import search as run_search - from ..storage.db import Database + from ..service import normalize_explain_query, search_payload - q = query if any(w in query.lower() for w in ("how", "architecture", "overview")) else f"how does {query} work" - with Database(db_path) as db: - payload = run_search( - db.conn, - q, - mode="hybrid", - limit=10, - token_budget=token_budget, - no_fallback=False, - root=Path(cfg.root), - config=cfg, - offset=offset, - ) + payload = search_payload( + db_path, cfg, normalize_explain_query(query), mode="hybrid", limit=10, + offset=offset, token_budget=token_budget, no_fallback=False, + backend=_search_backend(cfg), + ) return json.dumps(payload) @@ -258,27 +240,12 @@ def index_stats() -> str: if not db_path.exists(): return json.dumps({"exists": False, "error": "No index found."}) - from ..storage import repo + from ..service import stats_payload from ..storage.db import Database with Database(db_path) as db: - files = repo.count_files(db.conn) - symbols = repo.count_symbols(db.conn) - built_at = repo.get_meta(db.conn, "built_at") - head = repo.get_meta(db.conn, 
"head_commit") - coverage = [ - {"lang": r["lang"], "files": r["files"], "symbols": r["symbols"]} - for r in repo.treesitter_coverage(db.conn) - ] - - return json.dumps({ - "exists": True, - "files": files, - "symbols": symbols, - "built_at": built_at, - "head_commit": head, - "treesitter_coverage": coverage, - }) + payload = stats_payload(db.conn) + return json.dumps(payload) def run() -> None: diff --git a/src/codebase_index/service.py b/src/codebase_index/service.py new file mode 100644 index 0000000..5e7981e --- /dev/null +++ b/src/codebase_index/service.py @@ -0,0 +1,124 @@ +"""Shared service layer for the CLI and the MCP server. + +Both surfaces drive the same retrieval/storage code; this module owns the +pieces that used to be duplicated and drift apart: the cache-path formula, +db/config resolution, the explain query rewrite, vector-aware search +sessions, and the stats payload (including the per-language graph tier the +skill keys on). +""" + +from __future__ import annotations + +import os +import sqlite3 +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +if TYPE_CHECKING: + from .config import Config + +_EXPLAIN_HINTS = ("how", "architecture", "overview") + + +def cache_dir_for(cfg: "Config") -> Path: + """Per-project cache directory (index DB, graph exports, skill backups).""" + return Path(cfg.root) / ".claude" / "cache" / "codebase-index" + + +def db_path_for(cfg: "Config") -> Path: + """Index location for a resolved config; the CBX_DB_PATH env var overrides.""" + override = os.environ.get("CBX_DB_PATH") + if override: + return Path(override) + return cache_dir_for(cfg) / "index.sqlite" + + +def resolve_db(root: Optional[Union[Path, str]] = None) -> tuple[Path, "Config"]: + """Resolve (db_path, config) the same way on every surface. + + The config loads from *root* (CLI --root, MCP CBX_ROOT, else upward + discovery from cwd); CBX_DB_PATH overrides only the index location. 
+ """ + from .config import load + + cfg = load(Path(root) if root is not None else None) + return db_path_for(cfg), cfg + + +def search_backend(cfg: "Config", warn: Callable[[str], None]) -> Any: + """Embedding backend for query-time vector search. + + Returns a NoopBackend (enabled=False) when embeddings are off, so callers + can branch on `backend.enabled`. Network/external gating is enforced by + resolve_backend (SECURITY.md §4). + """ + from .embeddings.backend import resolve_backend + + return resolve_backend(cfg, warn=warn) + + +def normalize_explain_query(query: str) -> str: + """Rewrite a bare topic into a how-does-X-work question for intent detection.""" + if any(w in query.lower() for w in _EXPLAIN_HINTS): + return query + return f"how does {query} work" + + +def search_payload( + db_path: Path, + cfg: "Config", + query: str, + *, + mode: str = "hybrid", + limit: int = 10, + offset: int = 0, + token_budget: int = 1500, + no_fallback: bool = False, + backend: Any = None, +) -> dict: + """One search session: open the DB (vector-enabled when the backend is + live), run retrieval, return the payload dict both surfaces serialize.""" + from .retrieval.pipeline import search as run_search + from .storage.db import Database + + with Database(db_path) as db: + if backend is not None and getattr(backend, "enabled", False): + db.enable_vectors() + return run_search( + db.conn, + query, + mode=mode, + limit=limit, + offset=offset, + token_budget=token_budget, + no_fallback=no_fallback, + backend=backend, + root=Path(cfg.root), + config=cfg, + ) + + +def stats_payload(conn: sqlite3.Connection) -> dict[str, Any]: + """Index size, freshness, and per-language coverage with the graph tier.""" + from .parsers.languages import has_full_graph + from .storage import repo + + coverage = [ + { + "lang": r["lang"], + "files": r["files"], + "symbols": r["symbols"], + # Tier-A languages get import/inheritance edges; Tier-B is + # symbols-only, so refs/impact are partial for them. 
+ "graph": "full" if has_full_graph(r["lang"]) else "partial", + } + for r in repo.treesitter_coverage(conn) + ] + return { + "files": repo.count_files(conn), + "symbols": repo.count_symbols(conn), + "built_at": repo.get_meta(conn, "built_at"), + "head_commit": repo.get_meta(conn, "head_commit"), + "treesitter_coverage": coverage, + "exists": True, + } diff --git a/tests/test_service.py b/tests/test_service.py new file mode 100644 index 0000000..c9803c4 --- /dev/null +++ b/tests/test_service.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from pathlib import Path + +from codebase_index import service +from codebase_index.config import Config + + +def _cfg(tmp_path: Path) -> Config: + cfg = Config() + cfg.root = str(tmp_path) + return cfg + + +def test_db_path_follows_cache_dir(tmp_path): + cfg = _cfg(tmp_path) + assert service.cache_dir_for(cfg) == tmp_path / ".claude" / "cache" / "codebase-index" + assert service.db_path_for(cfg) == service.cache_dir_for(cfg) / "index.sqlite" + + +def test_db_path_env_override(tmp_path, monkeypatch): + custom = tmp_path / "custom.sqlite" + monkeypatch.setenv("CBX_DB_PATH", str(custom)) + assert service.db_path_for(_cfg(tmp_path)) == custom + + +def test_resolve_db_loads_config_from_root(tmp_path, monkeypatch): + monkeypatch.delenv("CBX_DB_PATH", raising=False) + (tmp_path / ".git").mkdir() + db_path, cfg = service.resolve_db(tmp_path) + assert Path(cfg.root) == tmp_path + assert db_path == tmp_path / ".claude" / "cache" / "codebase-index" / "index.sqlite" + + +def test_normalize_explain_query(): + assert service.normalize_explain_query("auth tokens") == "how does auth tokens work" + # Queries that already carry an explain hint pass through unchanged. 
+ for q in ("how does auth work", "architecture overview", "How is X structured"): + assert service.normalize_explain_query(q) == q From 11820ea3bb0b08ff70d39fa63fa7d35fd530428d Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 23:35:48 +0300 Subject: [PATCH 6/9] fix(errors): make swallowed failures visible, narrow vector except clauses - _parse_all: the silent fallback from ProcessPool to sequential parsing now prints a stderr warning - a degraded build was indistinguishable from a normal one. - skill auto-update (cli callback + auto_update_if_needed): failures are still swallowed so they never break the user's real command, but they now report to stderr with the exception and a recovery hint. - embedded_chunk_ids / prune_orphan_vectors: except Exception narrowed to sqlite3.OperationalError - the only expected case is vec tables not created yet; real errors propagate instead of silently disabling the vector channel. - Deliberately left alone: _package_version 'unknown' fallbacks, walker/ freshness OSError skips (files vanishing mid-walk), _open_in_browser's platform fallback chain - all are legitimate fallbacks, not hidden failures. 
Co-Authored-By: Claude Fable 5 --- src/codebase_index/cli.py | 5 +++-- src/codebase_index/indexer/pipeline.py | 8 +++++++- src/codebase_index/skill_update.py | 8 +++++++- src/codebase_index/storage/repo.py | 8 ++++---- tests/test_pipeline.py | 24 ++++++++++++++++++++++++ tests/test_skill_update.py | 4 +++- 6 files changed, 48 insertions(+), 9 deletions(-) diff --git a/src/codebase_index/cli.py b/src/codebase_index/cli.py index 2e2a6ef..9bf6657 100644 --- a/src/codebase_index/cli.py +++ b/src/codebase_index/cli.py @@ -173,8 +173,9 @@ def _try_auto_update_skills(root_opt: Optional[Path]) -> None: root = Path(root_opt).resolve() if root_opt else find_root() for target in scaffold.CLI_TARGETS: auto_update_if_needed(root, target) - except Exception: - pass # never let an auto-update failure crash the real command + except Exception as exc: + # Never let an auto-update failure crash the real command — but say so. + typer.echo(f"[codebase-index] skill auto-update skipped: {exc}", err=True) @app.callback() diff --git a/src/codebase_index/indexer/pipeline.py b/src/codebase_index/indexer/pipeline.py index 6feabca..7bd92c1 100644 --- a/src/codebase_index/indexer/pipeline.py +++ b/src/codebase_index/indexer/pipeline.py @@ -5,6 +5,7 @@ import hashlib import os import subprocess +import sys from concurrent.futures import ProcessPoolExecutor from dataclasses import dataclass from datetime import datetime, timezone @@ -123,7 +124,12 @@ def _parse_all(candidates: list, config: Config) -> list[_ParseResult]: initargs=(config,), ) as pool: return list(pool.map(_parse_one, candidates)) - except Exception: + except Exception as exc: + print( + f"[codebase-index] parallel parse unavailable ({type(exc).__name__}: {exc}); " + f"falling back to sequential parsing for {len(candidates)} files.", + file=sys.stderr, + ) return [_parse_one_inline(c, config) for c in candidates] diff --git a/src/codebase_index/skill_update.py b/src/codebase_index/skill_update.py index 6065be3..bfc23b5 100644 --- 
a/src/codebase_index/skill_update.py +++ b/src/codebase_index/skill_update.py @@ -14,6 +14,7 @@ from __future__ import annotations import shutil +import sys from pathlib import Path VERSION_FILE = ".skill_version" @@ -140,5 +141,10 @@ def auto_update_if_needed(root: Path, target: str) -> bool: update_skill(root, target, backup=True) return True - except Exception: + except Exception as exc: + print( + f"[codebase-index] skill auto-update for '{target}' failed " + f"({type(exc).__name__}: {exc}); run `codebase-index skill-update`.", + file=sys.stderr, + ) return False diff --git a/src/codebase_index/storage/repo.py b/src/codebase_index/storage/repo.py index 5f8dab7..debd4e2 100644 --- a/src/codebase_index/storage/repo.py +++ b/src/codebase_index/storage/repo.py @@ -570,8 +570,8 @@ def embedded_chunk_ids(conn: sqlite3.Connection) -> set[int]: try: rows = conn.execute("SELECT chunk_id FROM vec_chunks").fetchall() return {int(r[0]) for r in rows} - except Exception: - return set() + except sqlite3.OperationalError: + return set() # vec tables not created yet (embeddings never enabled) def prune_orphan_vectors(conn: sqlite3.Connection) -> int: @@ -586,8 +586,8 @@ def prune_orphan_vectors(conn: sqlite3.Connection) -> int: if orphan_ids: conn.executemany("DELETE FROM vec_chunks WHERE chunk_id = ?", orphan_ids) return len(orphan_ids) - except Exception: - return 0 + except sqlite3.OperationalError: + return 0 # vec tables not created yet (embeddings never enabled) def path_mtimes(conn: sqlite3.Connection) -> dict[str, int]: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 89179a2..ea7ea14 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -153,3 +153,27 @@ def test_reindex_graph_idempotent(sample_repo, tmp_path): s2 = build_index(cfg, db, root=sample_repo) assert s1.edges == s2.edges and s1.edges_resolved == s2.edges_resolved db.close() + + +def test_parse_all_falls_back_sequentially_with_warning(tmp_path, monkeypatch, capsys): + from 
codebase_index.discovery.walker import walk + from codebase_index.indexer import pipeline + + (tmp_path / "a.py").write_text("def a(): ...\n", encoding="utf-8") + (tmp_path / "b.py").write_text("def b(): ...\n", encoding="utf-8") + cfg = Config() + cfg.root = str(tmp_path) + candidates = list(walk(tmp_path, cfg)) + assert candidates + + class BrokenPool: + def __init__(self, *args, **kwargs): + raise RuntimeError("no pool for you") + + monkeypatch.setattr(pipeline, "_MIN_PARALLEL_FILES", 1) + monkeypatch.setattr(pipeline, "ProcessPoolExecutor", BrokenPool) + + results = pipeline._parse_all(candidates, cfg) + assert len(results) == len(candidates) + # The degradation must be visible, not silent. + assert "falling back to sequential" in capsys.readouterr().err diff --git a/tests/test_skill_update.py b/tests/test_skill_update.py index e7b120c..7f09274 100644 --- a/tests/test_skill_update.py +++ b/tests/test_skill_update.py @@ -101,7 +101,7 @@ def test_auto_update_applies_when_outdated(tmp_path): assert skill_update.needs_update(skill_dir) is False -def test_auto_update_swallows_failures(tmp_path, monkeypatch): +def test_auto_update_swallows_failures_but_warns(tmp_path, monkeypatch, capsys): skill_dir = _install(tmp_path) (skill_dir / skill_update.VERSION_FILE).write_text("0.0.1\n", encoding="utf-8") @@ -110,6 +110,8 @@ def boom(*args, **kwargs): monkeypatch.setattr(scaffold, "materialize_skill", boom) assert skill_update.auto_update_if_needed(tmp_path, "claude") is False + err = capsys.readouterr().err + assert "skill auto-update" in err and "materialize failed" in err def test_cli_auto_update_respects_disable_env(tmp_path, monkeypatch): From 8ed74162e780360f2f21fc23aec3c9a336ea879f Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 23:37:30 +0300 Subject: [PATCH 7/9] docs(architecture): single accurate repo layout, drop stale module claims The layout section contained two concatenated trees from different eras (one claimed graph/ was a stub and listed 
incremental.py / summarize.py that never shipped). Replace with one tree matching the actual source, including service.py and the generated skill-copy note; describe the real incremental mechanism (fingerprints + freshness.py) and the schema version guard instead of nonexistent migrations. Co-Authored-By: Claude Fable 5 --- docs/ARCHITECTURE.md | 225 ++++++++++--------------------------------- 1 file changed, 49 insertions(+), 176 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index c101c0a..5fe4bda 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -64,181 +64,52 @@ keeps the prompt small and lets the engine evolve without editing the skill. ``` codebase-index/ -├── README.md -├── LICENSE -├── CHANGELOG.md -├── CONTRIBUTING.md -├── CODE_OF_CONDUCT.md -├── SECURITY.md -├── ROADMAP.md -├── pyproject.toml -├── .gitignore -├── .editorconfig -├── .github/ -│ ├── ISSUE_TEMPLATE/ -│ │ ├── bug_report.yml -│ │ ├── feature_request.yml -│ │ └── skill_listing_request.yml -│ ├── PULL_REQUEST_TEMPLATE.md -│ ├── workflows/ -│ │ ├── ci.yml -│ │ └── release.yml -│ └── FUNDING.yml -├── docs/ -│ ├── ARCHITECTURE.md # this file -│ ├── INSTALLATION.md # install guide + troubleshooting -│ ├── QUICKSTART.md # 5-minute setup -│ ├── SKILL_DESIGN.md # skill behavior and extension -│ ├── RETRIEVAL_PIPELINE.md # retrieval + ranking detail -│ ├── DATABASE_SCHEMA.md # SQLite/FTS5 schema -│ ├── SECURITY_MODEL.md # security model + threat model -│ ├── COMPARISON.md # vs Cursor, Aider, Cody, grep -│ ├── FAQ.md # user questions -│ ├── SEO.md # repository SEO plan -│ └── ROADMAP.md # milestones M0-M9 -├── skill/ # canonical source of the skill -│ ├── SKILL.md -│ ├── scripts/ -│ │ ├── install.py # skill installation script -│ │ ├── doctor.py # environment check -│ │ └── smoke_test.py # end-to-end test -│ └── examples/ -│ ├── basic-usage.md -│ ├── claude-md-example.md -│ └── hooks-example.json -├── src/ -│ └── codebase_index/ -│ ├── __init__.py -│ ├── cli.py # 
Typer app: all commands -│ ├── config.py # config load/merge/validate (pydantic) -│ ├── models.py # shared pydantic result models -│ ├── discovery/ # file walking + ignore rules + classification -│ │ ├── __init__.py -│ │ ├── walker.py -│ │ ├── ignore.py # .gitignore/.claudeignore/.codeindexignore -│ │ └── classify.py # language, binary, secret, size gates -│ ├── parsers/ # turn files into chunks + symbols -│ │ ├── __init__.py -│ │ ├── base.py # Parser protocol + data types -│ │ ├── treesitter.py # AST symbol extraction -│ │ ├── line_chunker.py # fallback chunking -│ │ ├── symbol_chunks.py # symbol-aligned chunking -│ │ └── languages.py # grammar registry + node→symbol maps -│ ├── indexer/ # orchestration of a build/update -│ │ ├── __init__.py -│ │ └── pipeline.py # full + incremental build -│ ├── graph/ # import/call/reference/dependency edges -│ │ └── __init__.py # stub — dependency/call graph -│ ├── storage/ # SQLite persistence -│ │ ├── __init__.py -│ │ ├── db.py # connection, pragmas, migrations -│ │ ├── schema.sql # DDL -│ │ └── repo.py # typed read/write accessors -│ ├── retrieval/ # the search engine -│ │ ├── __init__.py -│ │ └── searchers.py # FTS5 searcher + query building -│ ├── embeddings/ # OPTIONAL, opt-in vector backend -│ │ └── __init__.py # stub -│ ├── output/ # rendering results -│ │ ├── __init__.py -│ │ ├── markdown.py # compact Markdown for Claude -│ │ ├── json.py # machine JSON -│ │ └── redact.py # secret redaction -│ └── watch/ # OPTIONAL live indexing -│ └── __init__.py # stub -├── tests/ -│ ├── fixtures/ # sample repos with planted secrets -│ └── test_*.py # test suite -└── examples/ - ├── queries.md # example questions → commands - ├── config.example.json - └── hooks/ - └── settings.json # optional PostToolUse auto-update hook -``` -codebase-index/ -├── README.md -├── pyproject.toml -├── .gitignore -├── docs/ -│ ├── ARCHITECTURE.md # this file -│ ├── RETRIEVAL.md # retrieval pipeline + intent detection -│ ├── SCHEMA.md # SQLite/FTS5 schema 
-│ ├── SECURITY.md # security model -│ ├── INSTALLATION.md # install + configure + hooks -│ └── ROADMAP.md # milestones M0–M9 -├── skill/ # canonical source of the skill (copied on `init`) -│ ├── SKILL.md -│ └── scripts/ -│ ├── cbx # POSIX wrapper -> resolves CLI, passes args -│ └── cbx.ps1 # Windows PowerShell wrapper -├── src/ -│ └── codebase_index/ -│ ├── __init__.py -│ ├── cli.py # Typer app: all commands -│ ├── config.py # config load/merge/validate (pydantic) -│ ├── models.py # shared dataclasses/pydantic result models -│ ├── discovery/ # file walking + ignore rules + file classification -│ │ ├── __init__.py -│ │ ├── walker.py -│ │ ├── ignore.py # .gitignore/.cursorignore/.claudeignore/.codeindexignore -│ │ └── classify.py # language detection, binary/secret/size gates -│ ├── parsers/ # turn files into chunks + symbols -│ │ ├── __init__.py -│ │ ├── base.py # Parser protocol -│ │ ├── treesitter.py # AST symbol extraction -│ │ ├── line_chunker.py # fallback chunking -│ │ └── languages.py # grammar registry + node→symbol maps -│ ├── indexer/ # orchestration of a build/update -│ │ ├── __init__.py -│ │ ├── pipeline.py # full + incremental build -│ │ ├── incremental.py # hash/mtime/git change detection -│ │ └── summarize.py # file/module/package summaries -│ ├── graph/ # import/call/reference/dependency edges -│ │ ├── __init__.py -│ │ ├── builder.py # extract edges from AST + resolve targets -│ │ └── expand.py # graph expansion + impact (blast radius) -│ ├── storage/ # SQLite persistence -│ │ ├── __init__.py -│ │ ├── db.py # connection, pragmas, migrations -│ │ ├── schema.sql # DDL (mirrors docs/SCHEMA.md) -│ │ └── repo.py # typed read/write accessors -│ ├── retrieval/ # the search engine -│ │ ├── __init__.py -│ │ ├── intent.py # query intent classification -│ │ ├── searchers.py # path/symbol/fts/vector searchers -│ │ ├── fusion.py # Reciprocal Rank Fusion -│ │ ├── rerank.py # feature-based reranking -│ │ └── budget.py # token budgeting of results -│ ├── 
embeddings/ # OPTIONAL, opt-in vector backend -│ │ ├── __init__.py -│ │ ├── backend.py # pluggable Backend protocol -│ │ ├── local.py # sentence-transformers / local model -│ │ └── noop.py # default: disabled -│ ├── output/ # rendering results -│ │ ├── __init__.py -│ │ ├── markdown.py # compact Markdown for Claude -│ │ └── json.py # machine JSON -│ ├── watch/ # OPTIONAL live indexing -│ │ ├── __init__.py -│ │ └── watcher.py -│ └── skill_template/ # packaged copy of skill/ shipped in the wheel -│ ├── SKILL.md -│ └── scripts/ -├── tests/ -│ ├── fixtures/ # tiny sample repos -│ ├── test_discovery.py -│ ├── test_ignore.py -│ ├── test_parsers.py -│ ├── test_storage.py -│ ├── test_retrieval.py -│ ├── test_graph.py -│ └── test_cli.py -└── examples/ - ├── hooks/settings.json # optional PostToolUse auto-update hook - ├── config.example.json - └── queries.md # example questions → commands +├── README.md / LICENSE / CHANGELOG.md / CONTRIBUTING.md / SECURITY.md / ROADMAP.md +├── pyproject.toml # hatch dynamic version <- src/codebase_index/__init__.py +├── requirements.lock # pinned install spec for the plugin bootstrap +├── install.sh / install.ps1 # multi-CLI installer (drives adapters/ + lib/) +├── adapters/ # per-CLI install logic (claude/codex/opencode, sh + ps1) +├── lib/ # shared shell helpers for the installer +├── bin/ # plugin wrappers (cbx resolves the provisioned venv) +├── scripts/ # bootstrap.sh/.ps1, release_smoke.py, sync_skill_copies.py +├── hooks/ # plugin hooks.json (SessionStart bootstrap) +├── .claude-plugin/ # plugin manifest + marketplace catalog +├── .github/ # CI (lint, skill-sync gate, OS/Python test matrix), release +├── docs/ # this file + installation/retrieval/schema/security/faq +├── skill/ # installer source package (SKILL.md, scripts, examples) +├── skills/codebase-index/ # plugin skill copy (generated — scripts/sync_skill_copies.py) +├── .claude/ .codex/ .opencode/ # committed installed copies (generated — same script) +├── examples/ # sample 
queries, configs, hooks +├── tests/ # pytest suite + fixtures (sample_repo, multilang) +└── src/codebase_index/ + ├── cli.py # Typer app: all commands (delegates to service.py) + ├── service.py # shared CLI/MCP service layer: paths, search sessions, stats + ├── config.py # config load/merge/validate (pydantic) + ├── models.py # shared pydantic result models + ├── doctor.py # config/security diagnostics + ├── scaffold.py # init: skill + config + gitignore + MCP client configs + ├── skill_update.py # skill auto-update/rollback with version stamps + ├── discovery/ # walker.py, ignore.py, classify.py + ├── parsers/ # treesitter.py, languages.py, line_chunker.py, + │ # symbol_chunks.py, base.py + ├── indexer/ # pipeline.py (full + incremental build), freshness.py, + │ # doc_chunks.py + ├── graph/ # builder.py (edge resolution), expand.py (impact), + │ # export.py (HTML graph) + ├── storage/ # db.py (pragmas, schema, version guard), schema.sql, repo.py + ├── retrieval/ # intent.py, searchers.py, fusion.py, rerank.py, + │ # budget.py, pipeline.py, types.py + ├── embeddings/ # backend.py, noop.py (default), local.py, external.py — opt-in + ├── output/ # markdown.py, json.py, redact.py + ├── watch/ # watcher.py (optional, watchdog-based) + ├── mcp/ # server.py (stdio MCP over the same service layer) + └── skill_template/ # canonical skill source shipped in the wheel ``` +The committed skill copies (`skill/`, `skills/`, `.claude/`, `.codex/`, `.opencode/`) are +generated from `src/codebase_index/skill_template/` by `scripts/sync_skill_copies.py`; +CI fails if they drift (`--check`). + ## 4. Module responsibilities - **discovery** — Walk the repo, apply layered ignore rules, classify each file (language, binary, @@ -249,9 +120,11 @@ codebase-index/ Tree-sitter when a grammar exists; line-based chunker otherwise. - **graph/builder** — From AST, extract `imports`, `calls`, `references`, `extends/implements`, and resolve them to target symbols/files where possible. 
Unresolved edges are kept as soft text refs. -- **indexer/pipeline** — Drives a build: discovery → parse → store chunks/symbols → build graph → - summaries → FTS sync → (optional) embeddings. `incremental.py` decides what to re-process. -- **storage** — Owns the SQLite DB, pragmas (WAL, foreign keys), migrations, and typed accessors. +- **indexer/pipeline** — Drives a build: discovery → parse (process pool on large repos) → store + chunks/symbols → build graph → FTS sync → (optional) embeddings. `update_index` re-processes + only files whose (mtime, size, sha256) fingerprint changed; `freshness.py` reports staleness. +- **storage** — Owns the SQLite DB, pragmas (WAL, foreign keys), the schema version guard + (a future-versioned index asks for a rebuild rather than guessing), and typed accessors. FTS5 virtual tables and (optional) `sqlite-vec` vector tables live here. - **retrieval** — The query path. `intent.py` classifies the query; `searchers.py` runs the relevant retrievers; `fusion.py` merges them with RRF; `rerank.py` reorders; `graph.expand` From 4446a498cdd71357d764cbe0d097176ba8703dfc Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 9 Jun 2026 23:44:54 +0300 Subject: [PATCH 8/9] chore(release): 1.3.0 - Version bump in src/codebase_index/__init__.py (single source; hatch dynamic versioning propagates to package metadata). - scripts/sync_skill_copies.py stamped plugin.json, requirements.lock (v1.3.0 tarball), and the committed .skill_version copies. - CHANGELOG: 1.3.0 section consolidating this cycle (embedding cache, batched graph build, shared CLI/MCP service layer, skill-copy sync tooling, coverage signals, pagination, error visibility, test-suite hermeticity). - README + docs: install tags moved to @v1.3.0, Project Status rewritten, stale current-release references updated. 
Co-Authored-By: Claude Fable 5 --- .claude-plugin/plugin.json | 2 +- .claude/skills/codebase-index/.skill_version | 2 +- .codex/skills/codebase-index/.skill_version | 2 +- .../skills/codebase-index/.skill_version | 2 +- CHANGELOG.md | 29 +++++++++++++++-- README.md | 31 +++++++++++-------- docs/ARCHITECTURE.md | 2 +- docs/FAQ.md | 6 ++-- docs/INSTALLATION.md | 20 ++++++------ docs/MCP.md | 2 +- docs/QUICKSTART.md | 2 +- docs/SEO.md | 6 ++-- docs/installer.md | 4 +-- requirements.lock | 2 +- src/codebase_index/__init__.py | 2 +- 15 files changed, 72 insertions(+), 42 deletions(-) diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 91036f6..c9a2865 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -3,7 +3,7 @@ "name": "codebase-index", "displayName": "Codebase Index", "description": "Local-first hybrid codebase index. Auto-provisions its Python CLI on first session start; the skill searches the index so Claude reads only the most relevant files.", - "version": "1.2.2", + "version": "1.3.0", "author": { "name": "codebase-index contributors" }, diff --git a/.claude/skills/codebase-index/.skill_version b/.claude/skills/codebase-index/.skill_version index 23aa839..f0bb29e 100644 --- a/.claude/skills/codebase-index/.skill_version +++ b/.claude/skills/codebase-index/.skill_version @@ -1 +1 @@ -1.2.2 +1.3.0 diff --git a/.codex/skills/codebase-index/.skill_version b/.codex/skills/codebase-index/.skill_version index 23aa839..f0bb29e 100644 --- a/.codex/skills/codebase-index/.skill_version +++ b/.codex/skills/codebase-index/.skill_version @@ -1 +1 @@ -1.2.2 +1.3.0 diff --git a/.opencode/skills/codebase-index/.skill_version b/.opencode/skills/codebase-index/.skill_version index 23aa839..f0bb29e 100644 --- a/.opencode/skills/codebase-index/.skill_version +++ b/.opencode/skills/codebase-index/.skill_version @@ -1 +1 @@ -1.2.2 +1.3.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index cb9e12c..8cde91a 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -6,13 +6,18 @@ All notable changes to this project are documented here. The format is based on ## [Unreleased] +## [1.3.0] - 2026-06-09 + ### Added - **Content-addressed embedding cache**: a new `vec_cache` table (keyed by `(model, content_sha)`) persists chunk embeddings across rebuilds. Because chunk ids churn on every full rebuild, the embedding pass now hashes chunk content and only calls the (potentially slow or paid) backend for text never embedded under the active model — unchanged content reuses its cached vector for free. - -### Added +- **Shared CLI/MCP service layer** (`codebase_index/service.py`): both surfaces now resolve the + index path, run search sessions, and build stats payloads through the same code, so they cannot + drift. Two real drifts were closed: MCP `search_code`/`explain_code` now blend in vector results + when embeddings are enabled (previously the vector channel was CLI-only), and MCP `index_stats` + now reports the per-language `graph: full|partial` tier the skill keys on. - **Repo-wide graph tier in diagnostics**: `stats` now tags each tree-sitter language with `graph: full|partial`, and `doctor` adds a `graph_coverage` finding listing Tier-B languages present in the index. Surfaces upfront which languages have partial `refs`/`impact` (symbols but @@ -24,8 +29,24 @@ All notable changes to this project are documented here. The format is based on inconclusive rather than authoritative. `coverage.partial` flags this so agents fall back to Grep instead of reading "no references" as proof. Markdown output prints a matching warning; the skill documents the field. +- **Skill-copy sync tooling**: `scripts/sync_skill_copies.py` regenerates every committed copy of + the skill (`.claude/`, `.codex/`, `.opencode/`, `skills/`, shared `skill/` files) plus all + version stamps from the canonical `src/codebase_index/skill_template/`; CI fails when copies + drift (`--check`). 
The package version now lives in one place + (`src/codebase_index/__init__.py`) via hatch dynamic versioning. +- `CBX_NO_SKILL_AUTO_UPDATE=1` disables the silent skill auto-update — used by the test suite, + useful for CI and scripted environments. ### Changed +- **Graph build is batched**: edge resolution now runs one query for globally-unique symbol names + and one pass over file paths (in-memory suffix map) instead of per-edge lookups and up to ~20 + full-table `LIKE` scans per import edge — 7–28× faster on a small repo with identical results, + and the gap grows with repository size. Vector blobs are written with a single batched + `executemany`; a new `edges(file_id)` index removes full-table scans from incremental updates + and file-deletion cascades. +- Silent failure paths now report to stderr: the ProcessPool→sequential parsing fallback and skill + auto-update failures were previously invisible; vector helpers only swallow + `sqlite3.OperationalError` (missing vec tables) instead of every exception. - The embedding pass reports cache **misses** (vectors actually computed) as its "embedded" count. - `prune_orphan_vectors` now deletes stale `vec_chunks` rows in a single batched `executemany`. - **Skill**: documented the `--mode vector` semantic-search path, the `intent`/`mode`/`pagination` @@ -48,6 +69,10 @@ All notable changes to this project are documented here. The format is based on matching `search --mode hybrid`. - The `cbx` wrapper whitelist (skill + plugin `bin/`) now includes `doctor`, which the skill's fallback diagnostics already invoke; previously `cbx doctor` was refused. +- The test suite is green on Windows again (`bootstrap` path comparison) and no longer rewrites + the committed `.skill_version` stamps as a side effect of running the CLI inside the checkout. +- `docs/ARCHITECTURE.md` no longer shows two contradictory repository layouts or claims `graph/` + is a stub. 
## [1.2.2] - 2026-06-05 diff --git a/README.md b/README.md index 5a7e17d..998502c 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ If you are opening this repository for the first time, follow this order: If you only need the shortest path, run: ```bash -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" cd your-project codebase-index init # prompts for Claude Code / Codex CLI / OpenCode codebase-index index @@ -50,16 +50,21 @@ codebase-index search "where is authentication implemented?" ## Project Status -**`1.2.1` is released.** The current release includes repository discovery, +**`1.3.0` is released.** The current release includes repository discovery, SQLite FTS5 storage, Tree-sitter symbols and references, hybrid ranking, graph impact analysis, token-budgeted retrieval packets, optional local embeddings, hooks/watch support, multi-CLI installation, MCP server support, and a tested GitHub-only `pipx` install path. -The `1.2.1` release adds skill auto-update/rollback commands and version stamps -so installed skills stay in sync with the package automatically. -The `1.2.0` release added HTML graph export, auto-indexing search commands, and -updated skill resources. +The `1.3.0` release adds a content-addressed embedding cache (rebuilds reuse +vectors for unchanged content), a batched graph build (7–28× faster edge +resolution plus a new `edges(file_id)` index), a shared CLI/MCP service layer +(MCP hybrid search now uses the vector channel; `index_stats` reports the +per-language graph tier), graph-coverage signals in `stats`/`refs`/`impact`, +CLI pagination via `search --offset`, and single-source versioning with a CI +gate that keeps every committed skill copy in sync. +The `1.2.1` release added skill auto-update/rollback commands and version +stamps so installed skills stay in sync with the package automatically. 
See [CHANGELOG.md](CHANGELOG.md) and [docs/ROADMAP.md](docs/ROADMAP.md). @@ -82,7 +87,7 @@ For most users, install the package from the tagged GitHub release and run `init` inside the repository you want to index: ```bash -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" cd your-project codebase-index init # choose Claude Code, Codex CLI, OpenCode, or all codebase-index index @@ -120,7 +125,7 @@ fetch the package; later sessions are offline. The skill builds its index on your first codebase question, so there is no manual `index` step. **Distribution note:** the plugin bootstrap installs the pinned requirement from -`requirements.lock`. In `1.2.1`, that lock points at the tagged GitHub release +`requirements.lock`. In `1.3.0`, that lock points at the tagged GitHub release instead of PyPI. You can override it with `CBX_INSTALL_SPEC` when testing a local checkout or a different Git ref. @@ -202,7 +207,7 @@ irm https://raw.githubusercontent.com/denfry/codebase-index/main/install.ps1 | i ```bash cd your-project -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" codebase-index init codebase-index index ``` @@ -222,13 +227,13 @@ the `pipx` environment was likely created with an older Python version. 
Reinstal ```powershell pipx uninstall codebase-index py -0p -pipx install --python "\python.exe" "git+https://github.com/denfry/codebase-index.git@v1.2.1" +pipx install --python "\python.exe" "git+https://github.com/denfry/codebase-index.git@v1.3.0" ``` For example: ```powershell -pipx install --python "C:\Users\you\AppData\Local\Programs\Python\Python312\python.exe" "git+https://github.com/denfry/codebase-index.git@v1.2.1" +pipx install --python "C:\Users\you\AppData\Local\Programs\Python\Python312\python.exe" "git+https://github.com/denfry/codebase-index.git@v1.3.0" ``` Then run initialization again: @@ -242,7 +247,7 @@ codebase-index index ### Option 2: Install with pipx from GitHub ```bash -pipx install "git+https://github.com/denfry/codebase-index.git@v1.2.1" +pipx install "git+https://github.com/denfry/codebase-index.git@v1.3.0" cd your-project codebase-index init --target auto codebase-index index @@ -260,7 +265,7 @@ pip install -e ".[dev]" PyPI, `uvx`, Homebrew, signed release checksums, and SBOMs are important for a tool that reads entire repositories, but they are not all verified as shipped in -`1.2.1`. Target install story: +`1.3.0`. Target install story: ```bash uvx codebase-index init diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 5fe4bda..1555eae 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -2,7 +2,7 @@ ## 1. Overview -`codebase-index` is a **local-first** code intelligence layer for AI coding agents. In `1.2.0` +`codebase-index` is a **local-first** code intelligence layer for AI coding agents. In `1.3.0` it has two shipped faces: 1. 
**A Claude Code Skill** (`.claude/skills/codebase-index/SKILL.md`) that Claude auto-invokes for diff --git a/docs/FAQ.md b/docs/FAQ.md index 0418b21..3a14b1e 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -10,7 +10,7 @@ This page answers the most common questions about installing, running, and trust with `pipx` (isolated) or `pip`, pinned to a release tag for reproducibility: ```bash -pipx install "git+https://github.com/denfry/codebase-index.git@v1.2.1" +pipx install "git+https://github.com/denfry/codebase-index.git@v1.3.0" ``` Then run `codebase-index init` inside your project and `codebase-index index` to build @@ -152,9 +152,9 @@ Yes. Use any of these methods: ## Is it production-ready? -Yes — `codebase-index` is released as **v1.2.1**. Indexing, hybrid search, Tree-sitter +Yes — `codebase-index` is released as **v1.3.0**, with indexing, hybrid search, and Tree-sitter symbols. The core indexing and search functionality is implemented and tested. The -current `1.2.1` package includes: +current `1.3.0` package includes: - Hybrid FTS/path/symbol/vector retrieval - Import/call/reference graph expansion and `impact` diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index 6a440df..5509a73 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -3,7 +3,7 @@ This page explains how to install `codebase-index` and make it available in Claude Code, Codex CLI, or OpenCode. > **Distribution:** `codebase-index` is **not on PyPI**. It is installed directly -> from GitHub via `git+https://...@`. Pin to a release tag (e.g. `@v1.2.1`) +> from GitHub via `git+https://...@`. Pin to a release tag (e.g. `@v1.3.0`) > for reproducible installs; use `@main` to track the latest. 
## Choose Your Path @@ -28,7 +28,7 @@ Install the tagged GitHub release and scaffold the skill into your project: ```bash cd your-project -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" codebase-index init codebase-index index ``` @@ -62,13 +62,13 @@ ln -s ~/codebase-index/skill ~/.claude/skills/codebase-index ```bash # Using pip from the tagged GitHub release -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" # Using pipx from GitHub (isolated environment) -pipx install "git+https://github.com/denfry/codebase-index.git@v1.2.1" +pipx install "git+https://github.com/denfry/codebase-index.git@v1.3.0" # Using uv from GitHub -uv tool install "git+https://github.com/denfry/codebase-index.git@v1.2.1" +uv tool install "git+https://github.com/denfry/codebase-index.git@v1.3.0" # From source (editable mode) git clone https://github.com/denfry/codebase-index.git @@ -91,7 +91,7 @@ pip install -e ".[embeddings-local,watch,dev]" ### PyPI / uvx / Homebrew status -As of `1.2.1`, this documentation treats GitHub tag installs as the verified +As of `1.3.0`, this documentation treats GitHub tag installs as the verified path. PyPI, `uvx codebase-index init`, Homebrew tap installation, signed checksums, and SBOMs are distribution targets for a more complete release story. 
@@ -108,7 +108,7 @@ brew install denfry/tap/codebase-index On a machine with only Python + pipx: ```bash -pipx install "git+https://github.com/denfry/codebase-index.git@v1.2.1" +pipx install "git+https://github.com/denfry/codebase-index.git@v1.3.0" cd /path/to/your/repo codebase-index init # writes .claude/skills/codebase-index/ + .gitignore rules codebase-index index # builds .claude/cache/codebase-index/index.sqlite @@ -131,7 +131,7 @@ Expected output: === codebase-index Doctor === [OK] Python 3.12 (requires 3.11+) -[OK] codebase-index package installed (v1.2.1) +[OK] codebase-index package installed (v1.3.0) [OK] tree-sitter is available [INFO] Cache directory not yet created: ... [INFO] Skill not installed in .claude/skills/ @@ -188,7 +188,7 @@ Use `codebase-index doctor` to verify which hooks are enabled. For heavy editing For heavy editing sessions, `watch` mode keeps the index fresh via a debounced filesystem observer. Requires the `[watch]` extra: ```bash -pip install "codebase-index[watch] @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index[watch] @ git+https://github.com/denfry/codebase-index.git@v1.3.0" codebase-index watch --debounce 500 ``` @@ -278,7 +278,7 @@ Set `allow_external` to `false` to disable external API calls. 
## Recommended Flow for First-Time Users ```bash -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" cd your-project codebase-index init codebase-index index diff --git a/docs/MCP.md b/docs/MCP.md index dfb8ac8..4a64a0b 100644 --- a/docs/MCP.md +++ b/docs/MCP.md @@ -3,7 +3,7 @@ `codebase-index` ships a stdio MCP server powered by the optional `mcp` extra: ```bash -pip install "codebase-index[mcp] @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index[mcp] @ git+https://github.com/denfry/codebase-index.git@v1.3.0" codebase-index mcp --root /path/to/repo ``` diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 77c20eb..76eade6 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -11,7 +11,7 @@ Use this guide if you are new to `codebase-index` and want the fastest path to y ## Step 1: Install ```bash -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" ``` Or from source: diff --git a/docs/SEO.md b/docs/SEO.md index c962c6c..7f74fa4 100644 --- a/docs/SEO.md +++ b/docs/SEO.md @@ -146,7 +146,7 @@ Submit to these lists for backlinks and discoverability: ## Release Announcement Template ``` -codebase-index v1.2.1 +codebase-index v1.3.0 A local-first codebase index for AI coding agents. 
@@ -162,7 +162,7 @@ Features: - Secret redaction - Respects .gitignore -Install: pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +Install: pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" GitHub: https://github.com/denfry/codebase-index ``` @@ -177,5 +177,5 @@ codebase-index builds a local hybrid index so Claude finds the right files witho - No network by default - Token-efficient output -pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.2.1" +pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.3.0" ``` diff --git a/docs/installer.md b/docs/installer.md index ef75648..db77dc3 100644 --- a/docs/installer.md +++ b/docs/installer.md @@ -108,7 +108,7 @@ pwsh ./install.ps1 -Target claude -InstallDir "D:\skills\codebase-index" **Pinning по ветке/тегу** (воспроизводимость и безопасность): ```sh -sh install.sh --branch v1.2.1 +sh install.sh --branch v1.3.0 ``` --- @@ -151,7 +151,7 @@ sh install.sh --branch v1.2.1 ```json { "skill_name": "codebase-index", - "version": "1.2.1", + "version": "1.3.0", "installed_at": "2026-05-29T12:00:00Z", "target": "claude", "os": "linux", diff --git a/requirements.lock b/requirements.lock index f87af0a..4c95b71 100644 --- a/requirements.lock +++ b/requirements.lock @@ -1,3 +1,3 @@ -codebase-index @ https://github.com/denfry/codebase-index/archive/refs/tags/v1.2.2.tar.gz +codebase-index @ https://github.com/denfry/codebase-index/archive/refs/tags/v1.3.0.tar.gz tree-sitter==0.25.2 tree-sitter-language-pack==1.8.1 diff --git a/src/codebase_index/__init__.py b/src/codebase_index/__init__.py index 95e877b..b61488b 100644 --- a/src/codebase_index/__init__.py +++ b/src/codebase_index/__init__.py @@ -4,4 +4,4 @@ See docs/ARCHITECTURE.md for the module map. 
""" -__version__ = "1.2.2" +__version__ = "1.3.0" From 6309ac0d2448dacdddbdfd3d7071387d8a35c7bc Mon Sep 17 00:00:00 2001 From: denfry Date: Wed, 10 Jun 2026 00:15:09 +0300 Subject: [PATCH 9/9] fix(tests): make bootstrap and golden tests pass on bare CI runners Both failures predate this branch (CI on main has been red since Jun 5): - test_missing_python_reports_clearly subtracted python's directories from PATH, but on CI runners the system python3 lives in /usr/bin next to mkdir, so bootstrap.sh found it and attempted a real install. PATH now contains only thin wrappers exec'ing the needed coreutils by absolute path. - The golden fixture's 'git commit' silently failed on Windows runners (no git identity, auto-detection fails there), leaving head_commit null. Commit with an explicit -c user.name/user.email and assert it succeeds. Co-Authored-By: Claude Fable 5 --- tests/test_bootstrap.py | 35 +++++++++++++++++------------------ tests/test_cli_golden.py | 9 ++++++++- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py index efbd4b6..6d2b384 100644 --- a/tests/test_bootstrap.py +++ b/tests/test_bootstrap.py @@ -140,24 +140,23 @@ def test_lock_change_triggers_reinstall(tmp_path): @pytest.mark.skipif(not BASH_OK, reason="bash not available or non-functional") def test_missing_python_reports_clearly(tmp_path): root, data, env = _stage(tmp_path) - # Remove python from PATH while KEEPING coreutils (mkdir/diff/cp/rm) — on - # Git Bash for Windows those live in a different dir than python, so the - # script can run far enough to hit its "Python not found" branch instead of - # dying at `mkdir`. (Wiping PATH entirely would also remove mkdir itself.) 
- empty = tmp_path / "empty" - empty.mkdir() - util_dirs = { - str(Path(p).parent) - for p in (shutil.which("mkdir"), shutil.which("diff"), shutil.which("cp"), shutil.which("rm")) - if p - } - python_dirs = { - str(Path(p).parent) - for p in (shutil.which("python"), shutil.which("python3")) - if p - } - safe_dirs = [d for d in util_dirs if d not in python_dirs] - env["PATH"] = os.pathsep.join([str(empty), *safe_dirs]) # no python on PATH + # PATH contains ONLY thin wrappers for the coreutils bootstrap.sh needs, + # each exec'ing its real binary by absolute path — and no python at all. + # Subtracting python's directories from PATH is not enough: on CI runners + # the system python3 lives in /usr/bin right next to mkdir, so the script + # found it and attempted a real install instead of hitting the + # "Python not found" branch. + safebin = tmp_path / "safebin" + safebin.mkdir() + for name in ("mkdir", "diff", "cp", "rm"): + real = shutil.which(name) + assert real, f"{name} missing from PATH" + wrapper = safebin / name + wrapper.write_text( + f'#!/bin/sh\nexec "{Path(real).as_posix()}" "$@"\n', encoding="utf-8" + ) + wrapper.chmod(0o755) + env["PATH"] = str(safebin) # no python anywhere on PATH res = _run(root, env) assert res.returncode == 0 # SessionStart must not hard-fail the session assert "Python 3.11+" in res.stderr diff --git a/tests/test_cli_golden.py b/tests/test_cli_golden.py index ab6b8af..608875d 100644 --- a/tests/test_cli_golden.py +++ b/tests/test_cli_golden.py @@ -44,9 +44,16 @@ def indexed_repo(tmp_path_factory): dest = tmp_path_factory.mktemp("indexed") / "repo" shutil.copytree(FIXTURE_ROOT, dest) + # Explicit identity: CI runners have no global git config, and on Windows + # git's identity auto-detection fails, so the commit silently never happens + # and head_commit becomes null instead of "". 
+ identity = ["-c", "user.name=golden", "-c", "user.email=golden@test"] subprocess.run(["git", "init"], cwd=dest, capture_output=True) subprocess.run(["git", "add", "."], cwd=dest, capture_output=True) - subprocess.run(["git", "commit", "-m", "initial"], cwd=dest, capture_output=True) + commit = subprocess.run( + ["git", *identity, "commit", "-m", "initial"], cwd=dest, capture_output=True, text=True + ) + assert commit.returncode == 0, commit.stderr assert runner.invoke(app, ["--root", str(dest), "index"]).exit_code == 0 return dest