diff --git a/tests/skill/understand/test_merge_batch_graphs.py b/tests/skill/understand/test_merge_batch_graphs.py index 7239bfe6..54795e29 100644 --- a/tests/skill/understand/test_merge_batch_graphs.py +++ b/tests/skill/understand/test_merge_batch_graphs.py @@ -877,6 +877,276 @@ def test_linker_runs_during_merge(self) -> None: self.assertIn("tested", prod_node["tags"]) +class PruneExistingGraphTests(unittest.TestCase): + """Incremental prune of the existing graph, source-only. + + Regression for issue #366 Bug 2 — the previous `source OR target` rule + silently dropped inbound edges from unchanged files (`U imports F`, + `U calls F`, `U tests F`) every time F was re-analyzed, because the + unchanged U never regenerates them. + """ + + def test_inbound_edges_from_unchanged_sources_survive(self) -> None: + # Fixture: F1 imports F2, F3 imports F2. F2 is the only changed file. + # Both inbound edges (F1→F2 and F3→F2) must survive the prune even + # though their target (F2) was removed. + existing = { + "nodes": [ + _file_node("src/F1.ts"), + _file_node("src/F2.ts"), + _file_node("src/F3.ts"), + ], + "edges": [ + { + "source": "file:src/F1.ts", + "target": "file:src/F2.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + { + "source": "file:src/F3.ts", + "target": "file:src/F2.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + ], + } + + pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"}) + + # F2's node was removed, F1 + F3 survive + surviving_ids = {n["id"] for n in pruned["nodes"]} + self.assertEqual(surviving_ids, {"file:src/F1.ts", "file:src/F3.ts"}) + + # Both inbound edges into F2 were retained (target removed, but + # source survived → keep). The merge's dangling-edge sweep will + # absorb them once fresh F2 batch lands. + edge_pairs = {(e["source"], e["target"]) for e in pruned["edges"]} + self.assertIn(("file:src/F1.ts", "file:src/F2.ts"), edge_pairs) + self.assertIn(("file:src/F3.ts", "file:src/F2.ts"), edge_pairs) + self.assertEqual(len(pruned["edges"]), 2) + + def test_outbound_edges_from_removed_nodes_are_dropped(self) -> None: + # Edges sourced by a removed node have no use in the merged graph — + # the fresh batch for that file will regenerate them (or not). + existing = { + "nodes": [ + _file_node("src/F1.ts"), + _file_node("src/F2.ts"), + ], + "edges": [ + # Outbound from changed F2 — must be dropped (fresh batch + # will re-emit if still valid). + { + "source": "file:src/F2.ts", + "target": "file:src/F1.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + ], + } + + pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"}) + + # F2 dropped from nodes, F2's outbound edge dropped too + self.assertEqual({n["id"] for n in pruned["nodes"]}, {"file:src/F1.ts"}) + self.assertEqual(pruned["edges"], []) + + def test_function_level_nodes_inside_changed_file_are_removed(self) -> None: + # Function/class nodes share their host file's `filePath` — they + # must be dropped when that file changes (their fresh batch will + # re-emit them under canonical IDs). + existing = { + "nodes": [ + _file_node("src/F1.ts"), + _file_node("src/F2.ts"), + { + "id": "function:src/F2.ts:helper", + "type": "function", + "name": "helper", + "filePath": "src/F2.ts", + "summary": "", + "tags": [], + "complexity": "simple", + }, + ], + "edges": [ + # Inbound function-level edge: F1's caller → F2's helper. + # Retained at this step; merge's dangling sweep decides + # later (drops it if `function:src/F2.ts:helper` does not + # reappear in the fresh batch). + { + "source": "file:src/F1.ts", + "target": "function:src/F2.ts:helper", + "type": "calls", + "direction": "forward", + "weight": 0.5, + }, + ], + } + + pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"}) + + surviving_ids = {n["id"] for n in pruned["nodes"]} + self.assertEqual(surviving_ids, {"file:src/F1.ts"}) + # Inbound edge retained — source F1 survived + self.assertEqual(len(pruned["edges"]), 1) + self.assertEqual(pruned["edges"][0]["target"], "function:src/F2.ts:helper") + + def test_no_changed_files_is_noop(self) -> None: + existing = { + "nodes": [_file_node("src/a.ts"), _file_node("src/b.ts")], + "edges": [ + { + "source": "file:src/a.ts", + "target": "file:src/b.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + ], + } + pruned = mbg.prune_existing_graph(existing, set()) + self.assertEqual(len(pruned["nodes"]), 2) + self.assertEqual(len(pruned["edges"]), 1) + + def test_unrelated_top_level_fields_preserved(self) -> None: + # Real graphs carry `projectName`, `frameworks`, `layers`, etc. + # alongside `nodes` and `edges`. Prune must not strip them. + existing = { + "projectName": "demo", + "frameworks": ["next"], + "layers": [{"id": "ui", "nodeIds": []}], + "nodes": [_file_node("src/F1.ts"), _file_node("src/F2.ts")], + "edges": [], + } + pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"}) + self.assertEqual(pruned["projectName"], "demo") + self.assertEqual(pruned["frameworks"], ["next"]) + self.assertEqual(pruned["layers"], [{"id": "ui", "nodeIds": []}]) + + def test_does_not_mutate_input(self) -> None: + existing = { + "nodes": [_file_node("src/F1.ts"), _file_node("src/F2.ts")], + "edges": [ + { + "source": "file:src/F2.ts", + "target": "file:src/F1.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + ], + } + original_node_count = len(existing["nodes"]) + original_edge_count = len(existing["edges"]) + + mbg.prune_existing_graph(existing, {"src/F2.ts"}) + + # Input graph untouched + self.assertEqual(len(existing["nodes"]), original_node_count) + self.assertEqual(len(existing["edges"]), original_edge_count) + + def test_merge_dangling_sweep_drops_truly_removed_function_target(self) -> None: + # End-to-end safety check: an inbound edge into a deleted function + # survives the source-only prune (good), then the merge's dangling + # sweep drops it because the function does NOT reappear in the + # fresh batch (also good — no zombie edges in the final graph). + existing = { + "nodes": [ + _file_node("src/F1.ts"), + _file_node("src/F2.ts"), + { + "id": "function:src/F2.ts:removedFn", + "type": "function", + "name": "removedFn", + "filePath": "src/F2.ts", + "summary": "", + "tags": [], + "complexity": "simple", + }, + ], + "edges": [ + # Inbound function-level edge from unchanged F1 + { + "source": "file:src/F1.ts", + "target": "function:src/F2.ts:removedFn", + "type": "calls", + "direction": "forward", + "weight": 0.5, + }, + ], + } + + pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"}) + + # Fresh batch for F2: file-level only, removedFn is gone + fresh_batch = { + "nodes": [_file_node("src/F2.ts")], + "edges": [], + } + + # Stitch into the merge pipeline by treating pruned as a batch + existing_batch = {"nodes": pruned["nodes"], "edges": pruned["edges"]} + assembled, _report = mbg.merge_and_normalize([existing_batch, fresh_batch]) + + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertIn("file:src/F1.ts", node_ids) + self.assertIn("file:src/F2.ts", node_ids) + # removedFn never reappeared + self.assertNotIn("function:src/F2.ts:removedFn", node_ids) + + # The inbound calls edge gets dropped by the dangling-edge sweep + # in merge_and_normalize Step 6 — its target node is missing. + edge_pairs = {(e["source"], e["target"]) for e in assembled["edges"]} + self.assertNotIn( + ("file:src/F1.ts", "function:src/F2.ts:removedFn"), + edge_pairs, + ) + + def test_merge_preserves_inbound_file_edge_after_prune(self) -> None: + # Full end-to-end: F1 imports F2, F3 imports F2. F2 is reanalyzed + # and the fresh batch re-emits file:F2. The inbound edges (file-level + # targets) survive prune + merge. + existing = { + "nodes": [ + _file_node("src/F1.ts"), + _file_node("src/F2.ts"), + _file_node("src/F3.ts"), + ], + "edges": [ + { + "source": "file:src/F1.ts", + "target": "file:src/F2.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + { + "source": "file:src/F3.ts", + "target": "file:src/F2.ts", + "type": "imports", + "direction": "forward", + "weight": 0.7, + }, + ], + } + + pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"}) + fresh_batch = {"nodes": [_file_node("src/F2.ts")], "edges": []} + + existing_batch = {"nodes": pruned["nodes"], "edges": pruned["edges"]} + assembled, _report = mbg.merge_and_normalize([existing_batch, fresh_batch]) + + edge_pairs = {(e["source"], e["target"], e["type"]) for e in assembled["edges"]} + # Both inbound imports edges into F2 survived end-to-end + self.assertIn(("file:src/F1.ts", "file:src/F2.ts", "imports"), edge_pairs) + self.assertIn(("file:src/F3.ts", "file:src/F2.ts", "imports"), edge_pairs) + + class NormalizeDirectionTests(unittest.TestCase): """`direction` canonicalization mirrors the dashboard schema validator.""" diff --git a/understand-anything-plugin/hooks/auto-update-prompt.md b/understand-anything-plugin/hooks/auto-update-prompt.md index b20aa18d..2c83d364 100644 --- a/understand-anything-plugin/hooks/auto-update-prompt.md +++ b/understand-anything-plugin/hooks/auto-update-prompt.md @@ -187,10 +187,10 @@ Only re-analyze files with structural changes. This is the **only** phase that c 5. **Merge with existing graph:** - Remove old nodes whose `filePath` matches any file in `filesToReanalyze` or in the deleted files list - - Remove old edges whose `source` or `target` references a removed node + - **Prune by source only:** Keep each existing edge iff its `source` node survives. Edges whose `target` was removed are intentionally retained — this preserves inbound edges from unchanged files (`U imports F`, `U calls F`, `U tests F`) that the unchanged U will never regenerate. A naive `source OR target` prune silently erodes inbound edges on every incremental run. - Add new nodes and edges from the fresh analysis - Deduplicate nodes by ID (keep latest), edges by `source + target + type` - - Remove any edge with dangling `source` or `target` references + - Remove any edge with dangling `source` or `target` references — this is the safety net that drops inbound function-level edges into since-deleted functions inside changed files --- diff --git a/understand-anything-plugin/skills/understand/SKILL.md b/understand-anything-plugin/skills/understand/SKILL.md index 35c0b112..03fb6a3d 100644 --- a/understand-anything-plugin/skills/understand/SKILL.md +++ b/understand-anything-plugin/skills/understand/SKILL.md @@ -379,9 +379,10 @@ Then dispatch file-analyzer subagents per the same template as the full path. After batches complete: 1. Remove old nodes whose `filePath` matches any changed file from the existing graph -2. Remove old edges whose `source` or `target` references a removed node +2. **Prune by source only:** Keep each existing edge iff its `source` node survives the prune in step 1. Edges whose `target` was removed are intentionally retained. This preserves inbound edges from unchanged files — when file F is changed, `U imports F`, `U calls F`, and `U tests F` edges (where U is unchanged) would otherwise be silently dropped because U is never re-analyzed and therefore never regenerates them. A naive `source OR target` prune erodes inbound edges on every incremental run. + - Trade-off: an inbound function-level edge into a since-deleted function inside F (e.g. `function:U.ts:caller → function:F.ts:removedFn`) may briefly persist past this step. The merge script's existing dangling-edge sweep (Step 6) drops it once `function:F.ts:removedFn` does not reappear in the fresh batch. Over-keeping is the right trade-off: the dangling sweep cleans up safely, but a deleted inbound edge from an unchanged source can never be recovered until the next full re-analysis. 3. Write the pruned existing nodes/edges as `batch-existing.json` in the intermediate directory -4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files: +4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files. The merge's edge dedup `(source, target, type, direction)` absorbs any overlap with regenerated outbound edges, and its dangling-edge drop removes any edges whose target node never reappears: ```bash python /merge-batch-graphs.py $PROJECT_ROOT ``` diff --git a/understand-anything-plugin/skills/understand/merge-batch-graphs.py b/understand-anything-plugin/skills/understand/merge-batch-graphs.py index 2021f9ae..8caaac97 100644 --- a/understand-anything-plugin/skills/understand/merge-batch-graphs.py +++ b/understand-anything-plugin/skills/understand/merge-batch-graphs.py @@ -719,6 +719,87 @@ def link_tests( return added, dropped, tagged, swapped +# ── Incremental prune (source-only) ─────────────────────────────────────── +# +# When Phase 2 runs in `--changed-files` mode, only batches containing changed +# files are re-analyzed. The existing graph must be pruned of stale nodes +# belonging to changed files before being merged with the fresh batches. +# +# The prune rule is **source-only**: we keep an existing edge iff its +# `source` node survives the node prune. Edges whose `target` was removed +# are intentionally retained. +# +# Why source-only (not source-OR-target): +# When file F is modified, F is re-analyzed and regenerates F's OUTBOUND +# edges. But edges INTO F from an unchanged file U (e.g. `U imports F`, +# `U calls F`, `U tests F`) are NEVER regenerated because U is not in the +# re-analyzed batch set. A naive `source OR target` prune would drop those +# inbound edges every incremental run — they would silently erode over the +# life of the project until the next full rebuild. +# +# Safety net: +# Inbound function-level edges into since-deleted functions inside F (e.g. +# `function:U.ts:caller → function:F.ts:removedFn`) briefly survive this +# prune, but the merge's Step 6 dangling-edge drop removes them once +# `function:F.ts:removedFn` does not reappear in the fresh batch. Over- +# keeping is the right trade-off: dangling sweep cleans up safely, but a +# deleted inbound edge from an unchanged source can never be recovered. + +def prune_existing_graph( + existing: dict[str, Any], + changed_files: set[str], +) -> dict[str, Any]: + """Prune the existing graph for an incremental update. + + Removes nodes whose `filePath` is in `changed_files`, then prunes edges + by source-only: keep an edge iff its `source` node survives. Returns a + new graph dict (does not mutate `existing`). + + Caller wires the returned graph back into the merge by writing it as + `batch-existing.json` in the intermediate directory, where + `merge_and_normalize` will combine it with the fresh `batch-*.json` + files and handle dedup + dangling-target sweep. + + Edges whose target was a removed node are retained here on purpose — + if the target reappears in a fresh batch (file-level node `file:F` or a + function that still exists in F), the dedup / dangling-drop in + `merge_and_normalize` will resolve it. If it does not reappear (the + function was deleted), the dangling sweep drops the edge. + """ + existing_nodes = existing.get("nodes", []) or [] + existing_edges = existing.get("edges", []) or [] + + # Step 1: drop nodes whose filePath matches a changed file. We compare on + # `filePath` (not `id`) because function/class nodes share their host + # file's filePath but have IDs like `function:src/foo.ts:bar` — we want + # to drop those too when src/foo.ts changes. + surviving_nodes: list[dict[str, Any]] = [] + for node in existing_nodes: + fp = node.get("filePath") + if isinstance(fp, str) and fp in changed_files: + continue + surviving_nodes.append(node) + + surviving_node_ids: set[str] = { + node["id"] for node in surviving_nodes if isinstance(node.get("id"), str) + } + + # Step 2: prune edges by source only. Target may reference a removed + # node — `merge_and_normalize`'s Step 6 dangling-drop is the safety net. + surviving_edges: list[dict[str, Any]] = [] + for edge in existing_edges: + src = edge.get("source") + if not isinstance(src, str) or src not in surviving_node_ids: + continue + surviving_edges.append(edge) + + return { + **{k: v for k, v in existing.items() if k not in ("nodes", "edges")}, + "nodes": surviving_nodes, + "edges": surviving_edges, + } + + # ── Main merge + normalize ──────────────────────────────────────────────── def merge_and_normalize(batches: list[dict[str, Any]]) -> tuple[dict[str, Any], list[str]]: