From 6362200a7733e1788d49a94c60a59d3d82f28830 Mon Sep 17 00:00:00 2001 From: tpateeq Date: Sat, 4 Jul 2026 23:19:53 +0530 Subject: [PATCH 1/2] feat(build): warn on hub-node degree collapse during incremental merge (#1652b) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build_merge REPLACES each re-extracted file's prior nodes/edges. If a re-extraction emits a DIFFERENT id for an entity that already exists as a hub, the old hub and its in-file edges are dropped and its cross-file edges are orphaned onto a bare node — the hub silently collapses from many edges to ~0 while the node count may not shrink, so the count-based shrink guard never fires (it is also gated out of the normal dedup=True --update path anyway). Snapshot the pre-merge graph's hub degrees (nodes with degree >= HUB_DEGREE_MIN) before the replace filter, then after the merge WARN when a former hub vanishes or loses more than DEGREE_DROP_FRAC of its degree. This is a warning, not an error, because a genuine large refactor can legitimately shed a hub's edges. Active on every path, including dedup=True. Refs #1652 (sub-proposal b only); guards the #1651 collapse vector. Co-Authored-By: Claude Opus 4.8 (1M context) --- graphify/build.py | 95 +++++++++++++++ tests/test_build_merge_degree_drop.py | 165 ++++++++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 tests/test_build_merge_degree_drop.py diff --git a/graphify/build.py b/graphify/build.py index 279eccbf4..55f97ce7b 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -732,6 +732,89 @@ def deduplicate_by_label(nodes: list[dict], edges: list[dict]) -> tuple[list[dic return deduped_nodes, deduped_edges +# Incremental --update (build_merge, below) REPLACES each re-extracted file's +# prior nodes/edges. If a re-extraction emits a DIFFERENT id for an entity that +# already exists as a well-connected hub, the old hub and its cross-file edges +# are dropped and a fresh node is created with only this file's edges — the hub +# silently collapses (e.g. 174 edges -> ~0) while total node count may even rise, +# so the count-based shrink guard never catches it (#1651/#1652b). We snapshot +# pre-merge hub degrees and WARN (not raise — a real refactor can legitimately +# shed edges) when a former hub vanishes or loses most of its degree. +HUB_DEGREE_MIN = 20 +DEGREE_DROP_FRAC = 0.5 +_HUB_DROP_REPORT_LIMIT = 10 + + +def _hub_degrees( + nodes: list[dict], edges: list[dict], threshold: int = HUB_DEGREE_MIN +) -> dict[str, tuple[str, int]]: + """Map id -> (label, degree) for every node whose degree >= threshold. + + Degree is computed with plain undirected nx.Graph semantics (parallel edges + collapse, self-loops count twice) so it lines up with G.degree() on the + post-merge graph. Direction is irrelevant to degree, so the direct-JSON read + build_merge does to preserve edge direction is not needed here. + """ + g = nx.Graph() + for n in nodes: + nid = n.get("id") + if nid is not None: + g.add_node(nid, label=n.get("label", nid)) + for e in edges: + s, t = e.get("source"), e.get("target") + if s is not None and t is not None: + g.add_edge(s, t) + return { + nid: (g.nodes[nid].get("label", nid), deg) + for nid, deg in g.degree() + if deg >= threshold + } + + +def _hub_degree_drops( + pre_hubs: dict[str, tuple[str, int]], + G: nx.Graph, + drop_frac: float = DEGREE_DROP_FRAC, +) -> list[tuple[str, int, int]]: + """Pre-merge hubs that vanished or lost more than drop_frac of their degree. + + Returns [(label, before, after)] sorted worst-drop first. + """ + post_deg = dict(G.degree()) + drops: list[tuple[str, int, int]] = [] + for nid, (label, before) in pre_hubs.items(): + if before <= 0: + continue + after = post_deg.get(nid, 0) + if (before - after) / before > drop_frac: + drops.append((label, before, after)) + drops.sort(key=lambda d: d[1] - d[2], reverse=True) + return drops + + +def _warn_hub_degree_drops(pre_hubs: dict[str, tuple[str, int]], G: nx.Graph) -> None: + """Emit a stderr WARNING for any hub node that collapsed during a merge.""" + drops = _hub_degree_drops(pre_hubs, G) + if not drops: + return + print( + f"[graphify] WARNING: {len(drops)} hub node(s) lost >" + f"{int(DEGREE_DROP_FRAC * 100)}% of their edges after this merge — " + "possible silent data loss from a re-extraction changing an entity's id " + "(#1651).", + file=sys.stderr, + ) + for label, before, after in drops[:_HUB_DROP_REPORT_LIMIT]: + suffix = " (node dropped entirely)" if after == 0 else "" + print( + f"[graphify] - '{label}': {before} -> {after} edges{suffix}", + file=sys.stderr, + ) + extra = len(drops) - _HUB_DROP_REPORT_LIMIT + if extra > 0: + print(f"[graphify] ... and {extra} more hub node(s).", file=sys.stderr) + + def build_merge( new_chunks: list[dict], graph_path: str | Path | None = None, @@ -780,6 +863,11 @@ def build_merge( existing_hyperedges = [] had_graph = False + # Snapshot the pre-merge graph's hub-node degrees so we can warn if this merge + # collapses one (#1651/#1652b). Captured from the graph AS LOADED, before the + # replace-per-source filter below drops the re-extracted files' contribution. + pre_merge_hubs = _hub_degrees(existing_nodes, existing_edges) if had_graph else {} + # Effective root for relativizing absolute source_file / prune paths back to the # stored relative source_file keys. When the caller passes root we use it; # otherwise fall back to the graph's recorded scan root, so absolute @@ -906,6 +994,13 @@ def _kept(item: dict) -> bool: f"Pass prune_sources explicitly if you intend to remove nodes." ) + # Degree-drop alert (#1652b): warn when a former hub collapsed. Unlike the + # count-based shrink guard above, this runs on every path — including the + # normal dedup=True --update — and warns rather than raises, since a genuine + # large refactor can legitimately shed a hub's edges. + if pre_merge_hubs: + _warn_hub_degree_drops(pre_merge_hubs, G) + return G diff --git a/tests/test_build_merge_degree_drop.py b/tests/test_build_merge_degree_drop.py new file mode 100644 index 000000000..e11e93667 --- /dev/null +++ b/tests/test_build_merge_degree_drop.py @@ -0,0 +1,165 @@ +"""Degree-drop alert for build_merge (#1652b, guards the #1651 collapse vector). + +Incremental --update REPLACES each re-extracted file's prior nodes/edges. If a +re-extraction emits a DIFFERENT id for an entity that already exists as a hub, +the old hub and its in-file edges are dropped and its cross-file edges are +orphaned onto a bare node — the hub silently collapses from many edges to ~0 +while the node count may not shrink, so the count-based shrink guard never fires. +build_merge now snapshots pre-merge hub degrees and WARNS (not raises) when a +former hub vanishes or loses more than DEGREE_DROP_FRAC of its degree. The alert +is active on the normal dedup=True path (the shrink guard is not). +""" +from __future__ import annotations + +import json + +from graphify.build import ( + DEGREE_DROP_FRAC, + HUB_DEGREE_MIN, + _hub_degree_drops, + _hub_degrees, + build_merge, +) + + +def _write_graph(graph_path, nodes, edges) -> None: + graph_path.write_text( + json.dumps({"nodes": nodes, "edges": edges, "hyperedges": []}), + encoding="utf-8", + ) + + +def _seed_hub_graph(graph_path, n_leaves: int = 25): + """A hub node in hub.py wired to n_leaves in-file leaves plus one cross-file + caller in other.py. Hub degree = n_leaves + 1 (>= HUB_DEGREE_MIN).""" + nodes = [{"id": "hub", "label": "Hub", "file_type": "code", "source_file": "hub.py"}] + edges = [] + for i in range(n_leaves): + nodes.append( + {"id": f"leaf{i}", "label": f"leaf{i}", "file_type": "code", "source_file": "hub.py"} + ) + edges.append( + {"source": "hub", "target": f"leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "hub.py", "weight": 1.0} + ) + nodes.append({"id": "caller", "label": "caller", "file_type": "code", "source_file": "other.py"}) + edges.append( + {"source": "caller", "target": "hub", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "other.py", "weight": 1.0} + ) + _write_graph(graph_path, nodes, edges) + return n_leaves + 1 # hub's pre-merge degree + + +def test_collapsing_hub_triggers_warning_on_dedup_path(tmp_path, capsys): + """Re-extracting hub.py with a NEW id for the hub drops its in-file edges and + orphans the cross-file one — the hub collapses. The warning must fire even on + the default dedup=True path (which the shrink guard is gated out of).""" + graph_path = tmp_path / "graph.json" + before = _seed_hub_graph(graph_path) + assert before >= HUB_DEGREE_MIN + + # hub.py re-extracted; the entity now carries a different id (the #1651 vector). + new_chunk = { + "nodes": [{"id": "hub_renamed", "label": "Hub", "file_type": "code", "source_file": "hub.py"}], + "edges": [], + } + G = build_merge([new_chunk], graph_path, dedup=True) + + # Old hub id survives only via the orphaned cross-file edge (degree ~1), or not + # at all — either way it lost far more than DEGREE_DROP_FRAC of its degree. + post_hub_degree = G.degree("hub") if "hub" in G else 0 + assert post_hub_degree <= 1 + + err = capsys.readouterr().err + assert "hub node(s) lost" in err + assert "'Hub'" in err + assert f"{before} ->" in err # before -> after, with the true pre-merge degree + + +def test_benign_reextraction_does_not_warn(tmp_path, capsys): + """Re-extracting hub.py while preserving the hub's id and all its edges keeps + the hub intact — no degree-drop warning.""" + graph_path = tmp_path / "graph.json" + n_leaves = 25 + _seed_hub_graph(graph_path, n_leaves) + + nodes = [{"id": "hub", "label": "Hub", "file_type": "code", "source_file": "hub.py"}] + edges = [] + for i in range(n_leaves): + nodes.append( + {"id": f"leaf{i}", "label": f"leaf{i}", "file_type": "code", "source_file": "hub.py"} + ) + edges.append( + {"source": "hub", "target": f"leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "hub.py", "weight": 1.0} + ) + G = build_merge([{"nodes": nodes, "edges": edges}], graph_path, dedup=True) + + assert G.degree("hub") == n_leaves + 1 # cross-file caller edge preserved too + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + + +def test_small_degree_change_below_threshold_does_not_warn(tmp_path, capsys): + """A hub that sheds only a couple of edges (< DEGREE_DROP_FRAC) is a benign + edit, not a collapse — no warning.""" + graph_path = tmp_path / "graph.json" + n_leaves = 25 + _seed_hub_graph(graph_path, n_leaves) + + # Re-extract hub.py keeping the hub id but dropping just 2 of its 25 leaves. + nodes = [{"id": "hub", "label": "Hub", "file_type": "code", "source_file": "hub.py"}] + edges = [] + for i in range(n_leaves - 2): + nodes.append( + {"id": f"leaf{i}", "label": f"leaf{i}", "file_type": "code", "source_file": "hub.py"} + ) + edges.append( + {"source": "hub", "target": f"leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "hub.py", "weight": 1.0} + ) + build_merge([{"nodes": nodes, "edges": edges}], graph_path, dedup=True) + + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + + +def test_no_warning_when_no_prior_hub(tmp_path, capsys): + """A graph whose busiest node is below HUB_DEGREE_MIN cannot collapse a hub.""" + graph_path = tmp_path / "graph.json" + nodes = [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": "b", "label": "B", "file_type": "code", "source_file": "a.py"}, + ] + edges = [{"source": "a", "target": "b", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "a.py", "weight": 1.0}] + _write_graph(graph_path, nodes, edges) + + build_merge([{"nodes": [{"id": "a", "label": "A", "file_type": "code", "source_file": "a.py"}], + "edges": []}], graph_path, dedup=True) + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + + +# ── helper-level unit tests ─────────────────────────────────────────────────── + +def test_hub_degrees_picks_only_above_threshold(): + nodes = [{"id": "h", "label": "Hub"}] + [{"id": f"l{i}", "label": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + edges = [{"source": "h", "target": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + hubs = _hub_degrees(nodes, edges) + assert hubs["h"] == ("Hub", HUB_DEGREE_MIN) + assert all(nid == "h" for nid in hubs) # leaves (degree 1) excluded + + +def test_hub_degree_drops_flags_vanished_and_collapsed(): + import networkx as nx + + pre = {"h": ("Hub", 40), "keep": ("Keep", 30)} + G = nx.Graph() + G.add_node("keep") + for i in range(30): + G.add_edge("keep", f"n{i}") # keep still has degree 30 — unchanged + # "h" is absent from G entirely -> vanished + drops = _hub_degree_drops(pre, G, drop_frac=DEGREE_DROP_FRAC) + assert drops == [("Hub", 40, 0)] From 4789c08268eac00aee5fe95f1c3c966e650ebc65 Mon Sep 17 00:00:00 2001 From: tpateeq Date: Sat, 4 Jul 2026 23:46:00 +0530 Subject: [PATCH 2/2] fix(build): exempt intentional prunes from the hub degree-drop alert (#1652b) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-up to 9c32b4c. Five fixes: - [primary] The degree-drop alert fired the misleading "silent data loss from a re-extraction changing an entity's id (#1651)" message on intentional prune_sources deletions — the normal way --update removes a deleted file — causing alarm fatigue on the exact signal meant to catch real corruption. build_merge now snapshots each hub's source_file and exempts any hub whose file is in prune_set (the same rule the node prune uses, mirroring the shrink guard's `not prune_sources` exemption). A hub that collapsed WITHOUT its file being pruned still warns; the genuine id-drift case still fires. - _hub_degrees threads `directed` and builds a DiGraph/Graph to match G.degree() on build_merge(directed=True), where a bidirectional pair is 2. - _hub_degrees counts an edge only when BOTH endpoints are in the node set, mirroring build_from_json (which drops dangling edges) instead of inflating the pre-merge degree via add_edge's implicit node creation. - Fix the "orphaned onto a bare node" wording (build_from_json DROPS the edge). - Tests: vanished-hub `-> 0 (node dropped entirely)` suffix; >10-hub truncation; exactly-50%-loss boundary (pins strict `>`); pure-prune silence; combined prune-exempt-but-id-drift-warns; plus directed and dangling-edge unit tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 1 + graphify/build.py | 64 +++++++++-- tests/test_build_merge_degree_drop.py | 152 ++++++++++++++++++++++++-- 3 files changed, 194 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 626fac385..a72f133ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu ## Unreleased +- Feat: `graphify update`/`watch` now warns when an incremental merge collapses a god-node (#1652, thanks @Ns2384-star). An incremental `--update` REPLACES each re-extracted file's prior nodes/edges; if a re-extraction emits a *different* id for an entity that already exists as a well-connected hub, the old hub and all its edges are dropped (in-file edges go with the replaced contribution, cross-file edges are discarded by build since their endpoint id no longer has a node) — the hub silently collapses from many edges to ~0 while the node count may even rise, so the count-based shrink guard never fires (#1651). `build_merge` now snapshots pre-merge hub degrees (nodes with degree ≥ 20, counted with the same directedness as the merge and only across edges whose endpoints both exist) and prints a stderr WARNING — not a hard error, since a genuine large refactor can legitimately shed a hub's edges — listing each hub that vanished or lost more than half its degree. The alert is suppressed for a hub whose `source_file` was intentionally pruned this run (a requested deletion, not corruption), mirroring the shrink guard's prune exemption, so `--update` deletions don't cause alarm fatigue on the exact signal meant to catch a real id-drift collapse. - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place. - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). `import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import. - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance. diff --git a/graphify/build.py b/graphify/build.py index 55f97ce7b..f00ed4aaf 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -746,23 +746,32 @@ def deduplicate_by_label(nodes: list[dict], edges: list[dict]) -> tuple[list[dic def _hub_degrees( - nodes: list[dict], edges: list[dict], threshold: int = HUB_DEGREE_MIN + nodes: list[dict], edges: list[dict], threshold: int = HUB_DEGREE_MIN, + *, directed: bool = False, ) -> dict[str, tuple[str, int]]: """Map id -> (label, degree) for every node whose degree >= threshold. - Degree is computed with plain undirected nx.Graph semantics (parallel edges - collapse, self-loops count twice) so it lines up with G.degree() on the - post-merge graph. Direction is irrelevant to degree, so the direct-JSON read - build_merge does to preserve edge direction is not needed here. + Mirrors G.degree() on the post-merge graph so the before/after comparison is + like-for-like. Two subtleties: + + - Directedness matches the merge (`directed`): a DiGraph counts a bidirectional + pair as degree 2 where an undirected Graph collapses it to 1, so snapshotting + with the wrong type would skew the ratio for build_merge(directed=True). + - An edge is counted only when BOTH endpoints are in the node set. build_from_json + DROPS an edge whose endpoint id has no node rather than orphaning it onto an + auto-created bare node, so counting dangling stored edges here (as add_edge's + implicit node creation would) overstates the pre-merge degree. """ - g = nx.Graph() + g: nx.Graph = nx.DiGraph() if directed else nx.Graph() + node_ids: set = set() for n in nodes: nid = n.get("id") if nid is not None: g.add_node(nid, label=n.get("label", nid)) + node_ids.add(nid) for e in edges: s, t = e.get("source"), e.get("target") - if s is not None and t is not None: + if s in node_ids and t in node_ids: g.add_edge(s, t) return { nid: (g.nodes[nid].get("label", nid), deg) @@ -792,8 +801,21 @@ def _hub_degree_drops( return drops -def _warn_hub_degree_drops(pre_hubs: dict[str, tuple[str, int]], G: nx.Graph) -> None: - """Emit a stderr WARNING for any hub node that collapsed during a merge.""" +def _warn_hub_degree_drops( + pre_hubs: dict[str, tuple[str, int]], + G: nx.Graph, + exclude: set[str] | None = None, +) -> None: + """Emit a stderr WARNING for any hub node that collapsed during a merge. + + Hubs in `exclude` are skipped: their source_file was intentionally pruned this + run, so their collapse is an operator-requested deletion, not the #1651 id-drift + corruption the alert exists to catch. Warning on them would be alarm fatigue on + the exact signal that matters (mirrors the shrink guard's `not prune_sources` + exemption). A hub that collapsed WITHOUT its file being pruned still warns. + """ + if exclude: + pre_hubs = {nid: v for nid, v in pre_hubs.items() if nid not in exclude} drops = _hub_degree_drops(pre_hubs, G) if not drops: return @@ -866,7 +888,19 @@ def build_merge( # Snapshot the pre-merge graph's hub-node degrees so we can warn if this merge # collapses one (#1651/#1652b). Captured from the graph AS LOADED, before the # replace-per-source filter below drops the re-extracted files' contribution. - pre_merge_hubs = _hub_degrees(existing_nodes, existing_edges) if had_graph else {} + pre_merge_hubs = ( + _hub_degrees(existing_nodes, existing_edges, directed=directed) + if had_graph else {} + ) + # Remember each hub's source_file from the graph AS LOADED, so the degree-drop + # alert below can exempt a hub whose collapse is explained by an intentional + # prune — using the same source_file-in-prune_set rule the node prune uses. + pre_merge_hub_sources: dict[str, str | None] = {} + if pre_merge_hubs: + for _n in existing_nodes: + _nid = _n.get("id") + if _nid in pre_merge_hubs and _nid not in pre_merge_hub_sources: + pre_merge_hub_sources[_nid] = _n.get("source_file") # Effective root for relativizing absolute source_file / prune paths back to the # stored relative source_file keys. When the caller passes root we use it; @@ -999,7 +1033,15 @@ def _kept(item: dict) -> bool: # normal dedup=True --update — and warns rather than raises, since a genuine # large refactor can legitimately shed a hub's edges. if pre_merge_hubs: - _warn_hub_degree_drops(pre_merge_hubs, G) + # Exempt hubs whose source_file was pruned this run: their collapse is a + # deletion the operator asked for (the normal way --update drops a removed + # file), not the #1651 id-drift corruption the alert exists to catch. A hub + # that collapsed WITHOUT its file being pruned still warns. + pruned_hub_ids = { + nid for nid, sf in pre_merge_hub_sources.items() + if sf in prune_set or _norm_source_file(sf, _eff_root) in prune_set + } + _warn_hub_degree_drops(pre_merge_hubs, G, exclude=pruned_hub_ids) return G diff --git a/tests/test_build_merge_degree_drop.py b/tests/test_build_merge_degree_drop.py index e11e93667..35ad6fad3 100644 --- a/tests/test_build_merge_degree_drop.py +++ b/tests/test_build_merge_degree_drop.py @@ -2,12 +2,15 @@ Incremental --update REPLACES each re-extracted file's prior nodes/edges. If a re-extraction emits a DIFFERENT id for an entity that already exists as a hub, -the old hub and its in-file edges are dropped and its cross-file edges are -orphaned onto a bare node — the hub silently collapses from many edges to ~0 -while the node count may not shrink, so the count-based shrink guard never fires. -build_merge now snapshots pre-merge hub degrees and WARNS (not raises) when a -former hub vanishes or loses more than DEGREE_DROP_FRAC of its degree. The alert -is active on the normal dedup=True path (the shrink guard is not). +the old hub and all its edges are dropped: its in-file edges go with the replaced +contribution, and its cross-file edges are DROPPED by build_from_json — which +discards an edge whose endpoint id has no node rather than orphaning it onto a +bare node. The hub silently collapses from many edges to ~0 while the node count +may not shrink, so the count-based shrink guard never fires. build_merge now +snapshots pre-merge hub degrees and WARNS (not raises) when a former hub vanishes +or loses more than DEGREE_DROP_FRAC of its degree. The alert is active on the +normal dedup=True path (the shrink guard is not), and is suppressed for a hub whose +source_file was intentionally pruned (that collapse is a requested deletion). """ from __future__ import annotations @@ -16,8 +19,10 @@ from graphify.build import ( DEGREE_DROP_FRAC, HUB_DEGREE_MIN, + _HUB_DROP_REPORT_LIMIT, _hub_degree_drops, _hub_degrees, + _warn_hub_degree_drops, build_merge, ) @@ -53,8 +58,9 @@ def _seed_hub_graph(graph_path, n_leaves: int = 25): def test_collapsing_hub_triggers_warning_on_dedup_path(tmp_path, capsys): """Re-extracting hub.py with a NEW id for the hub drops its in-file edges and - orphans the cross-file one — the hub collapses. The warning must fire even on - the default dedup=True path (which the shrink guard is gated out of).""" + drops the cross-file one (its old endpoint no longer has a node) — the hub + collapses. The warning must fire even on the default dedup=True path (which the + shrink guard is gated out of).""" graph_path = tmp_path / "graph.json" before = _seed_hub_graph(graph_path) assert before >= HUB_DEGREE_MIN @@ -66,15 +72,17 @@ def test_collapsing_hub_triggers_warning_on_dedup_path(tmp_path, capsys): } G = build_merge([new_chunk], graph_path, dedup=True) - # Old hub id survives only via the orphaned cross-file edge (degree ~1), or not - # at all — either way it lost far more than DEGREE_DROP_FRAC of its degree. + # The old hub id vanishes entirely: build_from_json DROPS the cross-file edge + # (its old endpoint has no node), so "hub" is absent from the post-merge graph — + # a total collapse, far past DEGREE_DROP_FRAC. post_hub_degree = G.degree("hub") if "hub" in G else 0 - assert post_hub_degree <= 1 + assert post_hub_degree == 0 err = capsys.readouterr().err assert "hub node(s) lost" in err assert "'Hub'" in err - assert f"{before} ->" in err # before -> after, with the true pre-merge degree + assert f"{before} -> 0" in err # before -> after, with the true pre-merge degree + assert "(node dropped entirely)" in err # vanished-hub suffix def test_benign_reextraction_does_not_warn(tmp_path, capsys): @@ -142,6 +150,101 @@ def test_no_warning_when_no_prior_hub(tmp_path, capsys): assert "hub node(s) lost" not in err +# ── prune interaction (#1652b: don't cry corruption on intentional deletes) ──── + +def test_pure_prune_of_hub_file_is_silent(tmp_path, capsys): + """Deleting a hub's file via prune_sources (no re-extraction) collapses the hub + to nothing, but that is the operator-requested deletion path — the degree-drop + alert stays silent and never blames the #1651 id-drift cause.""" + graph_path = tmp_path / "graph.json" + before = _seed_hub_graph(graph_path) # hub in hub.py + assert before >= HUB_DEGREE_MIN + + G = build_merge([], graph_path, prune_sources=["hub.py"], dedup=True) + + assert "hub" not in G # pruned away entirely + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + assert "#1651" not in err + + +def test_pruned_hub_silent_but_id_drift_hub_still_warns(tmp_path, capsys): + """Same run, two hubs: a.py is pruned (requested deletion) and b.py is + re-extracted with a NEW id for its hub (the #1651 vector). The alert must NOT + blame the pruned hub, but MUST still fire on the id-drift collapse — the prune + exemption cannot mute the corruption signal it was carved out of.""" + graph_path = tmp_path / "graph.json" + n_leaves = 25 + nodes = [ + {"id": "hubA", "label": "HubA", "file_type": "code", "source_file": "a.py"}, + {"id": "hubB", "label": "HubB", "file_type": "code", "source_file": "b.py"}, + ] + edges = [] + for i in range(n_leaves): + nodes.append({"id": f"a_leaf{i}", "label": f"a_leaf{i}", "file_type": "code", "source_file": "a.py"}) + edges.append({"source": "hubA", "target": f"a_leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "a.py", "weight": 1.0}) + nodes.append({"id": f"b_leaf{i}", "label": f"b_leaf{i}", "file_type": "code", "source_file": "b.py"}) + edges.append({"source": "hubB", "target": f"b_leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "b.py", "weight": 1.0}) + _write_graph(graph_path, nodes, edges) + + # b.py re-extracted with a NEW id for hubB (#1651); a.py deleted (pruned). + new_chunk = { + "nodes": [{"id": "hubB_renamed", "label": "HubB", "file_type": "code", "source_file": "b.py"}], + "edges": [], + } + G = build_merge([new_chunk], graph_path, prune_sources=["a.py"], dedup=True) + + assert "hubA" not in G # pruned + assert "hubB" not in G # collapsed via id-drift + + err = capsys.readouterr().err + assert "hub node(s) lost" in err # the alert DID fire + assert "1 hub node(s) lost" in err # exactly one — hubA is exempted + assert "'HubB'" in err # id-drift hub reported + assert "'HubA'" not in err # pruned hub NOT reported + + +# ── report shape: truncation and the strict-> boundary ──────────────────────── + +def test_many_collapsing_hubs_are_truncated_in_report(capsys): + """More than _HUB_DROP_REPORT_LIMIT collapsing hubs: the per-hub list is capped + at the limit and a '... and N more' line summarizes the rest.""" + import networkx as nx + + n_hubs = _HUB_DROP_REPORT_LIMIT + 2 + pre = {f"h{i}": (f"Hub{i}", 30) for i in range(n_hubs)} + G = nx.Graph() # every hub vanished + _warn_hub_degree_drops(pre, G) + + err = capsys.readouterr().err + assert f"{n_hubs} hub node(s) lost" in err + # only the first _HUB_DROP_REPORT_LIMIT per-hub lines are printed + assert err.count("edges (node dropped entirely)") == _HUB_DROP_REPORT_LIMIT + assert f"... and {n_hubs - _HUB_DROP_REPORT_LIMIT} more hub node(s)." in err + + +def test_exactly_half_degree_loss_is_below_threshold(): + """A hub losing EXACTLY DEGREE_DROP_FRAC of its degree is benign — the guard + uses strict `>`, so 50% does not fire and 50%+one-edge does. Pins `>` (not + `>=`) at the drop check.""" + import networkx as nx + + pre = {"h": ("Hub", 40)} + # after = 20 -> exactly 50% lost -> NOT flagged (strict >) + g_half = nx.Graph() + for i in range(20): + g_half.add_edge("h", f"n{i}") + assert _hub_degree_drops(pre, g_half, drop_frac=DEGREE_DROP_FRAC) == [] + + # after = 19 -> 52.5% lost -> flagged + g_over = nx.Graph() + for i in range(19): + g_over.add_edge("h", f"n{i}") + assert _hub_degree_drops(pre, g_over, drop_frac=DEGREE_DROP_FRAC) == [("Hub", 40, 19)] + + # ── helper-level unit tests ─────────────────────────────────────────────────── def test_hub_degrees_picks_only_above_threshold(): @@ -152,6 +255,31 @@ def test_hub_degrees_picks_only_above_threshold(): assert all(nid == "h" for nid in hubs) # leaves (degree 1) excluded +def test_hub_degrees_directed_matches_digraph_degree(): + """directed=True snapshots a DiGraph, so a bidirectional pair counts as degree 2 + — matching G.degree() on build_merge(directed=True)'s DiGraph. Undirected + collapses the pair to 1, which would skew the before/after ratio.""" + nodes = [{"id": "h", "label": "Hub"}] + [{"id": f"l{i}", "label": f"l{i}"} for i in range(15)] + edges = [{"source": "h", "target": f"l{i}"} for i in range(15)] + edges += [{"source": f"l{i}", "target": "h"} for i in range(15)] # reverse of each pair + # undirected: 15 unique pairs -> degree 15 (< 20) -> h is not a hub + assert _hub_degrees(nodes, edges) == {} + # directed: in + out -> degree 30 -> h is a hub + assert _hub_degrees(nodes, edges, directed=True)["h"] == ("Hub", 30) + + +def test_hub_degrees_ignores_dangling_edges(): + """An edge whose endpoint has no node is DROPPED (mirroring build_from_json), + not counted onto an auto-created bare node — so it can't inflate a hub's + pre-merge degree.""" + nodes = [{"id": "h", "label": "Hub"}] + [{"id": f"l{i}", "label": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + edges = [{"source": "h", "target": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + edges += [{"source": "h", "target": f"ghost{i}"} for i in range(5)] # endpoints absent + hubs = _hub_degrees(nodes, edges) + assert hubs["h"] == ("Hub", HUB_DEGREE_MIN) # dangling edges excluded, not +5 + assert "ghost0" not in hubs # a ghost endpoint never became a node + + def test_hub_degree_drops_flags_vanished_and_collapsed(): import networkx as nx