Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions tests/skill/understand/test_merge_batch_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1183,5 +1183,89 @@ def test_range_filename_also_unrecognized(self) -> None:
self.assertNotIn("file:src/y.ts", node_ids)


# ── Incremental update: batch-existing.json (#292) ────────────────────────


class TestIncrementalExistingBatch(unittest.TestCase):
    """Regression test for #292.

    During an incremental update the skill writes the pruned existing-graph
    payload as `batch-existing.json` alongside freshly-analyzed `batch-<N>.json`
    files. Before the fix, `batch-existing.json` failed the `\\d+`-only filename
    match -> was bucketed as "unrecognized" -> silently dropped at load, losing
    ~75% of nodes on every incremental run. It must now be recognized, loaded,
    and merged like any other batch.
    """

    def setUp(self) -> None:
        """Create a scratch project root with an empty intermediate directory."""
        import tempfile
        self.tmp = Path(tempfile.mkdtemp(prefix="ua-mbg-existing-"))
        self.intermediate = self.tmp / ".understand-anything" / "intermediate"
        self.intermediate.mkdir(parents=True, exist_ok=True)

    def tearDown(self) -> None:
        """Remove the scratch project root; cleanup errors are ignored."""
        import shutil
        shutil.rmtree(self.tmp, ignore_errors=True)

    def _write_batch(self, name: str, nodes: list, edges: list) -> None:
        """Write a `{"nodes": ..., "edges": ...}` JSON payload named *name*.

        The file is created inside the intermediate directory as UTF-8 text.
        """
        import json as _j
        (self.intermediate / name).write_text(
            _j.dumps({"nodes": nodes, "edges": edges}),
            encoding="utf-8",
        )

    def _run_merge(self) -> tuple[int, str, dict]:
        """Run the merge script against the scratch project root.

        Returns:
            A `(returncode, stderr, assembled)` tuple, where `assembled` is
            the parsed `assembled-graph.json`, or `{}` if the script did not
            produce that file.
        """
        import json as _j
        import subprocess
        import sys
        # Use the interpreter that is running the tests rather than whatever
        # `python3` resolves to on PATH (absent on Windows, and possibly a
        # different interpreter inside a virtualenv).
        result = subprocess.run(
            [sys.executable, str(_MODULE_PATH), str(self.tmp)],
            capture_output=True, text=True,
        )
        out_path = self.intermediate / "assembled-graph.json"
        # Batch files are written as UTF-8 above, so read the output the same
        # way instead of relying on the platform default encoding.
        assembled = (
            _j.loads(out_path.read_text(encoding="utf-8"))
            if out_path.exists()
            else {}
        )
        return result.returncode, result.stderr, assembled

    def test_batch_existing_is_merged_not_dropped(self) -> None:
        # batch-existing.json = the unchanged-file nodes carried over from the
        # previous full scan; batch-0.json = the freshly-analyzed changed files.
        self._write_batch(
            "batch-existing.json",
            [_file_node("src/unchanged-a.ts"), _file_node("src/unchanged-b.ts")],
            [],
        )
        self._write_batch(
            "batch-0.json",
            [_file_node("src/changed-c.ts")],
            [],
        )
        rc, stderr, assembled = self._run_merge()
        # Surface the script's stderr when it exits non-zero.
        self.assertEqual(rc, 0, stderr)
        node_ids = {n["id"] for n in assembled.get("nodes", [])}
        # The whole point of #292: existing nodes survive the merge.
        self.assertEqual(
            node_ids,
            {
                "file:src/unchanged-a.ts",
                "file:src/unchanged-b.ts",
                "file:src/changed-c.ts",
            },
        )
        # And batch-existing.json must NOT be reported as an unrecognized drop.
        self.assertNotIn("unrecognized filenames", stderr)

    def test_batch_existing_alone_still_merges(self) -> None:
        # Defensive: batch-existing.json on its own must still produce a valid
        # graph rather than erroring with "no valid batch files loaded".
        self._write_batch(
            "batch-existing.json",
            [_file_node("src/only.ts")],
            [],
        )
        rc, stderr, assembled = self._run_merge()
        # Surface the script's stderr when it exits non-zero.
        self.assertEqual(rc, 0, stderr)
        node_ids = {n["id"] for n in assembled.get("nodes", [])}
        self.assertEqual(node_ids, {"file:src/only.ts"})


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
28 changes: 19 additions & 9 deletions understand-anything-plugin/skills/understand/merge-batch-graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,13 +1012,17 @@ def main() -> None:
print(f"Error: {intermediate_dir} does not exist", file=sys.stderr)
sys.exit(1)

# Discover batch files, sorted by numeric index (not lexicographic)
batch_files = sorted(
intermediate_dir.glob("batch-*.json"),
key=lambda p: int(re.search(r"batch-(\d+)", p.stem).group(1))
if re.search(r"batch-(\d+)", p.stem)
else 0,
)
# Discover batch files, sorted by numeric index (not lexicographic).
# `batch-existing.json` (the pruned existing-graph payload written during an
# incremental update) sorts first so it loads before the freshly-analyzed
# numbered batches.
def _batch_sort_key(p: Path) -> int:
if p.stem == "batch-existing":
return -1
m = re.search(r"batch-(\d+)", p.stem)
return int(m.group(1)) if m else 0

batch_files = sorted(intermediate_dir.glob("batch-*.json"), key=_batch_sort_key)
if not batch_files:
print("Error: no batch-*.json files found in intermediate/", file=sys.stderr)
sys.exit(1)
Expand All @@ -1033,9 +1037,15 @@ def main() -> None:
by_batch = _dd(list)
unrecognized_batch_files: list[str] = []
for f in batch_files:
m = re.match(r"batch-(\d+)(?:-part-(\d+))?\.json", f.name)
m = re.match(r"batch-(\d+|existing)(?:-part-(\d+))?\.json", f.name)
if m:
by_batch[int(m.group(1))].append((f.name, int(m.group(2)) if m.group(2) else None))
# `existing` = the pruned existing-graph payload written during an
# incremental update; bucket it as logical batch -1 so it loads
# before the freshly-analyzed numbered batches. Before this fix it
# failed the `\d+`-only match → landed in `unrecognized` → was
# silently dropped, losing ~75% of nodes on incremental runs (#292).
idx = -1 if m.group(1) == "existing" else int(m.group(1))
by_batch[idx].append((f.name, int(m.group(2)) if m.group(2) else None))
else:
unrecognized_batch_files.append(f.name)

Expand Down
Loading