Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 270 additions & 0 deletions tests/skill/understand/test_merge_batch_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,276 @@ def test_linker_runs_during_merge(self) -> None:
self.assertIn("tested", prod_node["tags"])


class PruneExistingGraphTests(unittest.TestCase):
"""Incremental prune of the existing graph, source-only.

Regression for issue #366 Bug 2 — the previous `source OR target` rule
silently dropped inbound edges from unchanged files (`U imports F`,
`U calls F`, `U tests F`) every time F was re-analyzed, because the
unchanged U never regenerates them.
"""

def test_inbound_edges_from_unchanged_sources_survive(self) -> None:
# Fixture: F1 imports F2, F3 imports F2. F2 is the only changed file.
# Both inbound edges (F1→F2 and F3→F2) must survive the prune even
# though their target (F2) was removed.
existing = {
"nodes": [
_file_node("src/F1.ts"),
_file_node("src/F2.ts"),
_file_node("src/F3.ts"),
],
"edges": [
{
"source": "file:src/F1.ts",
"target": "file:src/F2.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
{
"source": "file:src/F3.ts",
"target": "file:src/F2.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
],
}

pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"})

# F2's node was removed, F1 + F3 survive
surviving_ids = {n["id"] for n in pruned["nodes"]}
self.assertEqual(surviving_ids, {"file:src/F1.ts", "file:src/F3.ts"})

# Both inbound edges into F2 were retained (target removed, but
# source survived → keep). The merge's dangling-edge sweep will
# absorb them once fresh F2 batch lands.
edge_pairs = {(e["source"], e["target"]) for e in pruned["edges"]}
self.assertIn(("file:src/F1.ts", "file:src/F2.ts"), edge_pairs)
self.assertIn(("file:src/F3.ts", "file:src/F2.ts"), edge_pairs)
self.assertEqual(len(pruned["edges"]), 2)

def test_outbound_edges_from_removed_nodes_are_dropped(self) -> None:
# Edges sourced by a removed node have no use in the merged graph —
# the fresh batch for that file will regenerate them (or not).
existing = {
"nodes": [
_file_node("src/F1.ts"),
_file_node("src/F2.ts"),
],
"edges": [
# Outbound from changed F2 — must be dropped (fresh batch
# will re-emit if still valid).
{
"source": "file:src/F2.ts",
"target": "file:src/F1.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
],
}

pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"})

# F2 dropped from nodes, F2's outbound edge dropped too
self.assertEqual({n["id"] for n in pruned["nodes"]}, {"file:src/F1.ts"})
self.assertEqual(pruned["edges"], [])

def test_function_level_nodes_inside_changed_file_are_removed(self) -> None:
# Function/class nodes share their host file's `filePath` — they
# must be dropped when that file changes (their fresh batch will
# re-emit them under canonical IDs).
existing = {
"nodes": [
_file_node("src/F1.ts"),
_file_node("src/F2.ts"),
{
"id": "function:src/F2.ts:helper",
"type": "function",
"name": "helper",
"filePath": "src/F2.ts",
"summary": "",
"tags": [],
"complexity": "simple",
},
],
"edges": [
# Inbound function-level edge: F1's caller → F2's helper.
# Retained at this step; merge's dangling sweep decides
# later (drops it if `function:src/F2.ts:helper` does not
# reappear in the fresh batch).
{
"source": "file:src/F1.ts",
"target": "function:src/F2.ts:helper",
"type": "calls",
"direction": "forward",
"weight": 0.5,
},
],
}

pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"})

surviving_ids = {n["id"] for n in pruned["nodes"]}
self.assertEqual(surviving_ids, {"file:src/F1.ts"})
# Inbound edge retained — source F1 survived
self.assertEqual(len(pruned["edges"]), 1)
self.assertEqual(pruned["edges"][0]["target"], "function:src/F2.ts:helper")

def test_no_changed_files_is_noop(self) -> None:
existing = {
"nodes": [_file_node("src/a.ts"), _file_node("src/b.ts")],
"edges": [
{
"source": "file:src/a.ts",
"target": "file:src/b.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
],
}
pruned = mbg.prune_existing_graph(existing, set())
self.assertEqual(len(pruned["nodes"]), 2)
self.assertEqual(len(pruned["edges"]), 1)

def test_unrelated_top_level_fields_preserved(self) -> None:
# Real graphs carry `projectName`, `frameworks`, `layers`, etc.
# alongside `nodes` and `edges`. Prune must not strip them.
existing = {
"projectName": "demo",
"frameworks": ["next"],
"layers": [{"id": "ui", "nodeIds": []}],
"nodes": [_file_node("src/F1.ts"), _file_node("src/F2.ts")],
"edges": [],
}
pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"})
self.assertEqual(pruned["projectName"], "demo")
self.assertEqual(pruned["frameworks"], ["next"])
self.assertEqual(pruned["layers"], [{"id": "ui", "nodeIds": []}])

def test_does_not_mutate_input(self) -> None:
existing = {
"nodes": [_file_node("src/F1.ts"), _file_node("src/F2.ts")],
"edges": [
{
"source": "file:src/F2.ts",
"target": "file:src/F1.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
],
}
original_node_count = len(existing["nodes"])
original_edge_count = len(existing["edges"])

mbg.prune_existing_graph(existing, {"src/F2.ts"})

# Input graph untouched
self.assertEqual(len(existing["nodes"]), original_node_count)
self.assertEqual(len(existing["edges"]), original_edge_count)

def test_merge_dangling_sweep_drops_truly_removed_function_target(self) -> None:
# End-to-end safety check: an inbound edge into a deleted function
# survives the source-only prune (good), then the merge's dangling
# sweep drops it because the function does NOT reappear in the
# fresh batch (also good — no zombie edges in the final graph).
existing = {
"nodes": [
_file_node("src/F1.ts"),
_file_node("src/F2.ts"),
{
"id": "function:src/F2.ts:removedFn",
"type": "function",
"name": "removedFn",
"filePath": "src/F2.ts",
"summary": "",
"tags": [],
"complexity": "simple",
},
],
"edges": [
# Inbound function-level edge from unchanged F1
{
"source": "file:src/F1.ts",
"target": "function:src/F2.ts:removedFn",
"type": "calls",
"direction": "forward",
"weight": 0.5,
},
],
}

pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"})

# Fresh batch for F2: file-level only, removedFn is gone
fresh_batch = {
"nodes": [_file_node("src/F2.ts")],
"edges": [],
}

# Stitch into the merge pipeline by treating pruned as a batch
existing_batch = {"nodes": pruned["nodes"], "edges": pruned["edges"]}
assembled, _report = mbg.merge_and_normalize([existing_batch, fresh_batch])

node_ids = {n["id"] for n in assembled["nodes"]}
self.assertIn("file:src/F1.ts", node_ids)
self.assertIn("file:src/F2.ts", node_ids)
# removedFn never reappeared
self.assertNotIn("function:src/F2.ts:removedFn", node_ids)

# The inbound calls edge gets dropped by the dangling-edge sweep
# in merge_and_normalize Step 6 — its target node is missing.
edge_pairs = {(e["source"], e["target"]) for e in assembled["edges"]}
self.assertNotIn(
("file:src/F1.ts", "function:src/F2.ts:removedFn"),
edge_pairs,
)

def test_merge_preserves_inbound_file_edge_after_prune(self) -> None:
# Full end-to-end: F1 imports F2, F3 imports F2. F2 is reanalyzed
# and the fresh batch re-emits file:F2. The inbound edges (file-level
# targets) survive prune + merge.
existing = {
"nodes": [
_file_node("src/F1.ts"),
_file_node("src/F2.ts"),
_file_node("src/F3.ts"),
],
"edges": [
{
"source": "file:src/F1.ts",
"target": "file:src/F2.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
{
"source": "file:src/F3.ts",
"target": "file:src/F2.ts",
"type": "imports",
"direction": "forward",
"weight": 0.7,
},
],
}

pruned = mbg.prune_existing_graph(existing, {"src/F2.ts"})
fresh_batch = {"nodes": [_file_node("src/F2.ts")], "edges": []}

existing_batch = {"nodes": pruned["nodes"], "edges": pruned["edges"]}
assembled, _report = mbg.merge_and_normalize([existing_batch, fresh_batch])

edge_pairs = {(e["source"], e["target"], e["type"]) for e in assembled["edges"]}
# Both inbound imports edges into F2 survived end-to-end
self.assertIn(("file:src/F1.ts", "file:src/F2.ts", "imports"), edge_pairs)
self.assertIn(("file:src/F3.ts", "file:src/F2.ts", "imports"), edge_pairs)


class NormalizeDirectionTests(unittest.TestCase):
"""`direction` canonicalization mirrors the dashboard schema validator."""

Expand Down
4 changes: 2 additions & 2 deletions understand-anything-plugin/hooks/auto-update-prompt.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,10 @@ Only re-analyze files with structural changes. This is the **only** phase that c

5. **Merge with existing graph:**
- Remove old nodes whose `filePath` matches any file in `filesToReanalyze` or in the deleted files list
- Remove old edges whose `source` or `target` references a removed node
- **Prune by source only:** Keep each existing edge iff its `source` node survives. Edges whose `target` was removed are intentionally retained — this preserves inbound edges from unchanged files (`U imports F`, `U calls F`, `U tests F`) that the unchanged U will never regenerate. A naive `source OR target` prune silently erodes inbound edges on every incremental run.
- Add new nodes and edges from the fresh analysis
- Deduplicate nodes by ID (keep latest), edges by `source + target + type`
- Remove any edge with dangling `source` or `target` references
- Remove any edge with dangling `source` or `target` references — this is the safety net that drops inbound function-level edges into since-deleted functions inside changed files

---

Expand Down
5 changes: 3 additions & 2 deletions understand-anything-plugin/skills/understand/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -379,9 +379,10 @@ Then dispatch file-analyzer subagents per the same template as the full path.

After batches complete:
1. Remove old nodes whose `filePath` matches any changed file from the existing graph
2. Remove old edges whose `source` or `target` references a removed node
2. **Prune by source only:** Keep each existing edge iff its `source` node survives the prune in step 1. Edges whose `target` was removed are intentionally retained. This preserves inbound edges from unchanged files — when file F is changed, `U imports F`, `U calls F`, and `U tests F` edges (where U is unchanged) would otherwise be silently dropped because U is never re-analyzed and therefore never regenerates them. A naive `source OR target` prune erodes inbound edges on every incremental run.
- Trade-off: an inbound function-level edge into a since-deleted function inside F (e.g. `function:U.ts:caller → function:F.ts:removedFn`) may briefly persist past this step. The merge script's existing dangling-edge sweep (Step 6) drops it once `function:F.ts:removedFn` does not reappear in the fresh batch. Over-keeping is the right trade-off: the dangling sweep cleans up safely, but a deleted inbound edge from an unchanged source can never be recovered until the next full re-analysis.
3. Write the pruned existing nodes/edges as `batch-existing.json` in the intermediate directory
4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files:
4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files. The merge's edge dedup `(source, target, type, direction)` absorbs any overlap with regenerated outbound edges, and its dangling-edge drop removes any edges whose target node never reappears:
```bash
python <SKILL_DIR>/merge-batch-graphs.py $PROJECT_ROOT
```
Expand Down
81 changes: 81 additions & 0 deletions understand-anything-plugin/skills/understand/merge-batch-graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,87 @@ def link_tests(
return added, dropped, tagged, swapped


# ── Incremental prune (source-only) ───────────────────────────────────────
#
# When Phase 2 runs in `--changed-files` mode, only batches containing changed
# files are re-analyzed. The existing graph must be pruned of stale nodes
# belonging to changed files before being merged with the fresh batches.
#
# The prune rule is **source-only**: we keep an existing edge iff its
# `source` node survives the node prune. Edges whose `target` was removed
# are intentionally retained.
#
# Why source-only (not source-OR-target):
# When file F is modified, F is re-analyzed and regenerates F's OUTBOUND
# edges. But edges INTO F from an unchanged file U (e.g. `U imports F`,
# `U calls F`, `U tests F`) are NEVER regenerated because U is not in the
# re-analyzed batch set. A naive `source OR target` prune would drop those
# inbound edges every incremental run — they would silently erode over the
# life of the project until the next full rebuild.
#
# Safety net:
# Inbound function-level edges into since-deleted functions inside F (e.g.
# `function:U.ts:caller → function:F.ts:removedFn`) briefly survive this
# prune, but the merge's Step 6 dangling-edge drop removes them once
# `function:F.ts:removedFn` does not reappear in the fresh batch. Over-
# keeping is the right trade-off: dangling sweep cleans up safely, but a
# deleted inbound edge from an unchanged source can never be recovered.

def prune_existing_graph(
existing: dict[str, Any],
changed_files: set[str],
) -> dict[str, Any]:
"""Prune the existing graph for an incremental update.

Removes nodes whose `filePath` is in `changed_files`, then prunes edges
by source-only: keep an edge iff its `source` node survives. Returns a
new graph dict (does not mutate `existing`).

Caller wires the returned graph back into the merge by writing it as
`batch-existing.json` in the intermediate directory, where
`merge_and_normalize` will combine it with the fresh `batch-*.json`
files and handle dedup + dangling-target sweep.

Edges whose target was a removed node are retained here on purpose —
if the target reappears in a fresh batch (file-level node `file:F` or a
function that still exists in F), the dedup / dangling-drop in
`merge_and_normalize` will resolve it. If it does not reappear (the
function was deleted), the dangling sweep drops the edge.
"""
existing_nodes = existing.get("nodes", []) or []
existing_edges = existing.get("edges", []) or []

# Step 1: drop nodes whose filePath matches a changed file. We compare on
# `filePath` (not `id`) because function/class nodes share their host
# file's filePath but have IDs like `function:src/foo.ts:bar` — we want
# to drop those too when src/foo.ts changes.
surviving_nodes: list[dict[str, Any]] = []
for node in existing_nodes:
fp = node.get("filePath")
if isinstance(fp, str) and fp in changed_files:
continue
surviving_nodes.append(node)

surviving_node_ids: set[str] = {
node["id"] for node in surviving_nodes if isinstance(node.get("id"), str)
}

# Step 2: prune edges by source only. Target may reference a removed
# node — `merge_and_normalize`'s Step 6 dangling-drop is the safety net.
surviving_edges: list[dict[str, Any]] = []
for edge in existing_edges:
src = edge.get("source")
if not isinstance(src, str) or src not in surviving_node_ids:
continue
surviving_edges.append(edge)

return {
**{k: v for k, v in existing.items() if k not in ("nodes", "edges")},
"nodes": surviving_nodes,
"edges": surviving_edges,
}


# ── Main merge + normalize ────────────────────────────────────────────────

def merge_and_normalize(batches: list[dict[str, Any]]) -> tuple[dict[str, Any], list[str]]:
Expand Down
Loading