From e939fe81487163dcd02cdd48ca78bdee9fe77e3f Mon Sep 17 00:00:00 2001 From: Adam Herring Date: Fri, 5 Jun 2026 15:46:23 -0700 Subject: [PATCH] fix(merge): recognize batch-existing.json so incremental updates don't drop ~75% of nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During an incremental `/understand` update the skill writes the pruned existing-graph payload as `batch-existing.json` alongside the freshly analyzed `batch-.json` files. The merge step bucketed it by the `batch-(\d+)` filename match, so `batch-existing.json` (no digits) fell through to "unrecognized" and was silently dropped at load — only a stderr warning, exit 0. Net effect: incremental merges lost every carried-over node. In a 305-node baseline + 10 changed files, the incremental output had 92 nodes; the 213 unchanged-file nodes were gone. Fix: recognize `existing` as a valid logical batch (sorted/bucketed as index -1 so it loads before the numbered batches). Adds a regression test (TestIncrementalExistingBatch) that runs the script end-to-end and asserts batch-existing.json nodes survive the merge. Fixes #292 Co-Authored-By: Claude Opus 4.8 --- .../understand/test_merge_batch_graphs.py | 84 +++++++++++++++++++ .../skills/understand/merge-batch-graphs.py | 28 +++++-- 2 files changed, 103 insertions(+), 9 deletions(-) diff --git a/tests/skill/understand/test_merge_batch_graphs.py b/tests/skill/understand/test_merge_batch_graphs.py index 7239bfe6..31fa33a2 100644 --- a/tests/skill/understand/test_merge_batch_graphs.py +++ b/tests/skill/understand/test_merge_batch_graphs.py @@ -1183,5 +1183,89 @@ def test_range_filename_also_unrecognized(self) -> None: self.assertNotIn("file:src/y.ts", node_ids) +# ── Incremental update: batch-existing.json (#292) ──────────────────────── + + +class TestIncrementalExistingBatch(unittest.TestCase): + """Regression test for #292. + + During an incremental update the skill writes the pruned existing-graph + payload as `batch-existing.json` alongside freshly-analyzed `batch-.json` + files. Before the fix, `batch-existing.json` failed the `\\d+`-only filename + match -> was bucketed as "unrecognized" -> silently dropped at load, losing + ~75% of nodes on every incremental run. It must now be recognized, loaded, + and merged like any other batch. + """ + + def setUp(self) -> None: + import tempfile + self.tmp = Path(tempfile.mkdtemp(prefix="ua-mbg-existing-")) + self.intermediate = self.tmp / ".understand-anything" / "intermediate" + self.intermediate.mkdir(parents=True, exist_ok=True) + + def tearDown(self) -> None: + import shutil + shutil.rmtree(self.tmp, ignore_errors=True) + + def _write_batch(self, name: str, nodes: list, edges: list) -> None: + import json as _j + (self.intermediate / name).write_text( + _j.dumps({"nodes": nodes, "edges": edges}), + encoding="utf-8", + ) + + def _run_merge(self) -> tuple[int, str, dict]: + import subprocess + import json as _j + result = subprocess.run( + ["python3", str(_MODULE_PATH), str(self.tmp)], + capture_output=True, text=True, + ) + out_path = self.intermediate / "assembled-graph.json" + assembled = _j.loads(out_path.read_text()) if out_path.exists() else {} + return result.returncode, result.stderr, assembled + + def test_batch_existing_is_merged_not_dropped(self) -> None: + # batch-existing.json = the unchanged-file nodes carried over from the + # previous full scan; batch-0.json = the freshly-analyzed changed files. + self._write_batch( + "batch-existing.json", + [_file_node("src/unchanged-a.ts"), _file_node("src/unchanged-b.ts")], + [], + ) + self._write_batch( + "batch-0.json", + [_file_node("src/changed-c.ts")], + [], + ) + rc, stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + # The whole point of #292: existing nodes survive the merge. + self.assertEqual( + node_ids, + { + "file:src/unchanged-a.ts", + "file:src/unchanged-b.ts", + "file:src/changed-c.ts", + }, + ) + # And batch-existing.json must NOT be reported as an unrecognized drop. + self.assertNotIn("unrecognized filenames", stderr) + + def test_batch_existing_alone_still_merges(self) -> None: + # Defensive: batch-existing.json on its own must still produce a valid + # graph rather than erroring with "no valid batch files loaded". + self._write_batch( + "batch-existing.json", + [_file_node("src/only.ts")], + [], + ) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, {"file:src/only.ts"}) + + if __name__ == "__main__": unittest.main() diff --git a/understand-anything-plugin/skills/understand/merge-batch-graphs.py b/understand-anything-plugin/skills/understand/merge-batch-graphs.py index 2021f9ae..9614d097 100644 --- a/understand-anything-plugin/skills/understand/merge-batch-graphs.py +++ b/understand-anything-plugin/skills/understand/merge-batch-graphs.py @@ -1012,13 +1012,17 @@ def main() -> None: print(f"Error: {intermediate_dir} does not exist", file=sys.stderr) sys.exit(1) - # Discover batch files, sorted by numeric index (not lexicographic) - batch_files = sorted( - intermediate_dir.glob("batch-*.json"), - key=lambda p: int(re.search(r"batch-(\d+)", p.stem).group(1)) - if re.search(r"batch-(\d+)", p.stem) - else 0, - ) + # Discover batch files, sorted by numeric index (not lexicographic). + # `batch-existing.json` (the pruned existing-graph payload written during an + # incremental update) sorts first so it loads before the freshly-analyzed + # numbered batches. + def _batch_sort_key(p: Path) -> int: + if p.stem == "batch-existing": + return -1 + m = re.search(r"batch-(\d+)", p.stem) + return int(m.group(1)) if m else 0 + + batch_files = sorted(intermediate_dir.glob("batch-*.json"), key=_batch_sort_key) if not batch_files: print("Error: no batch-*.json files found in intermediate/", file=sys.stderr) sys.exit(1) @@ -1033,9 +1037,15 @@ def main() -> None: by_batch = _dd(list) unrecognized_batch_files: list[str] = [] for f in batch_files: - m = re.match(r"batch-(\d+)(?:-part-(\d+))?\.json", f.name) + m = re.match(r"batch-(\d+|existing)(?:-part-(\d+))?\.json", f.name) if m: - by_batch[int(m.group(1))].append((f.name, int(m.group(2)) if m.group(2) else None)) + # `existing` = the pruned existing-graph payload written during an + # incremental update; bucket it as logical batch -1 so it loads + # before the freshly-analyzed numbered batches. Before this fix it + # failed the `\d+`-only match → landed in `unrecognized` → was + # silently dropped, losing ~75% of nodes on incremental runs (#292). + idx = -1 if m.group(1) == "existing" else int(m.group(1)) + by_batch[idx].append((f.name, int(m.group(2)) if m.group(2) else None)) else: unrecognized_batch_files.append(f.name)