diff --git a/CHANGELOG.md b/CHANGELOG.md index 626fac385..a72f133ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu ## Unreleased +- Feat: `graphify update`/`watch` now warns when an incremental merge collapses a god-node (#1652, thanks @Ns2384-star). An incremental `--update` REPLACES each re-extracted file's prior nodes/edges; if a re-extraction emits a *different* id for an entity that already exists as a well-connected hub, the old hub and all its edges are dropped (in-file edges go with the replaced contribution, cross-file edges are discarded by build since their endpoint id no longer has a node) — the hub silently collapses from many edges to ~0 while the node count may even rise, so the count-based shrink guard never fires (#1651). `build_merge` now snapshots pre-merge hub degrees (nodes with degree ≥ 20, counted with the same directedness as the merge and only across edges whose endpoints both exist) and prints a stderr WARNING — not a hard error, since a genuine large refactor can legitimately shed a hub's edges — listing each hub that vanished or lost more than half its degree. The alert is suppressed for a hub whose `source_file` was intentionally pruned this run (a requested deletion, not corruption), mirroring the shrink guard's prune exemption, so `--update` deletions don't cause alarm fatigue on the exact signal meant to catch a real id-drift collapse. - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place. - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). `import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import. - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance. diff --git a/graphify/build.py b/graphify/build.py index 279eccbf4..f00ed4aaf 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -732,6 +732,111 @@ def deduplicate_by_label(nodes: list[dict], edges: list[dict]) -> tuple[list[dic return deduped_nodes, deduped_edges +# Incremental --update (build_merge, below) REPLACES each re-extracted file's +# prior nodes/edges. If a re-extraction emits a DIFFERENT id for an entity that +# already exists as a well-connected hub, the old hub and its cross-file edges +# are dropped and a fresh node is created with only this file's edges — the hub +# silently collapses (e.g. 174 edges -> ~0) while total node count may even rise, +# so the count-based shrink guard never catches it (#1651/#1652b). We snapshot +# pre-merge hub degrees and WARN (not raise — a real refactor can legitimately +# shed edges) when a former hub vanishes or loses most of its degree. +HUB_DEGREE_MIN = 20 +DEGREE_DROP_FRAC = 0.5 +_HUB_DROP_REPORT_LIMIT = 10 + + +def _hub_degrees( + nodes: list[dict], edges: list[dict], threshold: int = HUB_DEGREE_MIN, + *, directed: bool = False, +) -> dict[str, tuple[str, int]]: + """Map id -> (label, degree) for every node whose degree >= threshold. + + Mirrors G.degree() on the post-merge graph so the before/after comparison is + like-for-like. Two subtleties: + + - Directedness matches the merge (`directed`): a DiGraph counts a bidirectional + pair as degree 2 where an undirected Graph collapses it to 1, so snapshotting + with the wrong type would skew the ratio for build_merge(directed=True). + - An edge is counted only when BOTH endpoints are in the node set. build_from_json + DROPS an edge whose endpoint id has no node rather than orphaning it onto an + auto-created bare node, so counting dangling stored edges here (as add_edge's + implicit node creation would) overstates the pre-merge degree. + """ + g: nx.Graph = nx.DiGraph() if directed else nx.Graph() + node_ids: set = set() + for n in nodes: + nid = n.get("id") + if nid is not None: + g.add_node(nid, label=n.get("label", nid)) + node_ids.add(nid) + for e in edges: + s, t = e.get("source"), e.get("target") + if s in node_ids and t in node_ids: + g.add_edge(s, t) + return { + nid: (g.nodes[nid].get("label", nid), deg) + for nid, deg in g.degree() + if deg >= threshold + } + + +def _hub_degree_drops( + pre_hubs: dict[str, tuple[str, int]], + G: nx.Graph, + drop_frac: float = DEGREE_DROP_FRAC, +) -> list[tuple[str, int, int]]: + """Pre-merge hubs that vanished or lost more than drop_frac of their degree. + + Returns [(label, before, after)] sorted worst-drop first. + """ + post_deg = dict(G.degree()) + drops: list[tuple[str, int, int]] = [] + for nid, (label, before) in pre_hubs.items(): + if before <= 0: + continue + after = post_deg.get(nid, 0) + if (before - after) / before > drop_frac: + drops.append((label, before, after)) + drops.sort(key=lambda d: d[1] - d[2], reverse=True) + return drops + + +def _warn_hub_degree_drops( + pre_hubs: dict[str, tuple[str, int]], + G: nx.Graph, + exclude: set[str] | None = None, +) -> None: + """Emit a stderr WARNING for any hub node that collapsed during a merge. + + Hubs in `exclude` are skipped: their source_file was intentionally pruned this + run, so their collapse is an operator-requested deletion, not the #1651 id-drift + corruption the alert exists to catch. Warning on them would be alarm fatigue on + the exact signal that matters (mirrors the shrink guard's `not prune_sources` + exemption). A hub that collapsed WITHOUT its file being pruned still warns. + """ + if exclude: + pre_hubs = {nid: v for nid, v in pre_hubs.items() if nid not in exclude} + drops = _hub_degree_drops(pre_hubs, G) + if not drops: + return + print( + f"[graphify] WARNING: {len(drops)} hub node(s) lost >" + f"{int(DEGREE_DROP_FRAC * 100)}% of their edges after this merge — " + "possible silent data loss from a re-extraction changing an entity's id " + "(#1651).", + file=sys.stderr, + ) + for label, before, after in drops[:_HUB_DROP_REPORT_LIMIT]: + suffix = " (node dropped entirely)" if after == 0 else "" + print( + f"[graphify] - '{label}': {before} -> {after} edges{suffix}", + file=sys.stderr, + ) + extra = len(drops) - _HUB_DROP_REPORT_LIMIT + if extra > 0: + print(f"[graphify] ... and {extra} more hub node(s).", file=sys.stderr) + + def build_merge( new_chunks: list[dict], graph_path: str | Path | None = None, @@ -780,6 +885,23 @@ def build_merge( existing_hyperedges = [] had_graph = False + # Snapshot the pre-merge graph's hub-node degrees so we can warn if this merge + # collapses one (#1651/#1652b). Captured from the graph AS LOADED, before the + # replace-per-source filter below drops the re-extracted files' contribution. + pre_merge_hubs = ( + _hub_degrees(existing_nodes, existing_edges, directed=directed) + if had_graph else {} + ) + # Remember each hub's source_file from the graph AS LOADED, so the degree-drop + # alert below can exempt a hub whose collapse is explained by an intentional + # prune — using the same source_file-in-prune_set rule the node prune uses. + pre_merge_hub_sources: dict[str, str | None] = {} + if pre_merge_hubs: + for _n in existing_nodes: + _nid = _n.get("id") + if _nid in pre_merge_hubs and _nid not in pre_merge_hub_sources: + pre_merge_hub_sources[_nid] = _n.get("source_file") + # Effective root for relativizing absolute source_file / prune paths back to the # stored relative source_file keys. When the caller passes root we use it; # otherwise fall back to the graph's recorded scan root, so absolute @@ -906,6 +1028,21 @@ def _kept(item: dict) -> bool: f"Pass prune_sources explicitly if you intend to remove nodes." ) + # Degree-drop alert (#1652b): warn when a former hub collapsed. Unlike the + # count-based shrink guard above, this runs on every path — including the + # normal dedup=True --update — and warns rather than raises, since a genuine + # large refactor can legitimately shed a hub's edges. + if pre_merge_hubs: + # Exempt hubs whose source_file was pruned this run: their collapse is a + # deletion the operator asked for (the normal way --update drops a removed + # file), not the #1651 id-drift corruption the alert exists to catch. A hub + # that collapsed WITHOUT its file being pruned still warns. + pruned_hub_ids = { + nid for nid, sf in pre_merge_hub_sources.items() + if sf in prune_set or _norm_source_file(sf, _eff_root) in prune_set + } + _warn_hub_degree_drops(pre_merge_hubs, G, exclude=pruned_hub_ids) + return G diff --git a/tests/test_build_merge_degree_drop.py b/tests/test_build_merge_degree_drop.py new file mode 100644 index 000000000..35ad6fad3 --- /dev/null +++ b/tests/test_build_merge_degree_drop.py @@ -0,0 +1,293 @@ +"""Degree-drop alert for build_merge (#1652b, guards the #1651 collapse vector). + +Incremental --update REPLACES each re-extracted file's prior nodes/edges. If a +re-extraction emits a DIFFERENT id for an entity that already exists as a hub, +the old hub and all its edges are dropped: its in-file edges go with the replaced +contribution, and its cross-file edges are DROPPED by build_from_json — which +discards an edge whose endpoint id has no node rather than orphaning it onto a +bare node. The hub silently collapses from many edges to ~0 while the node count +may not shrink, so the count-based shrink guard never fires. build_merge now +snapshots pre-merge hub degrees and WARNS (not raises) when a former hub vanishes +or loses more than DEGREE_DROP_FRAC of its degree. The alert is active on the +normal dedup=True path (the shrink guard is not), and is suppressed for a hub whose +source_file was intentionally pruned (that collapse is a requested deletion). +""" +from __future__ import annotations + +import json + +from graphify.build import ( + DEGREE_DROP_FRAC, + HUB_DEGREE_MIN, + _HUB_DROP_REPORT_LIMIT, + _hub_degree_drops, + _hub_degrees, + _warn_hub_degree_drops, + build_merge, +) + + +def _write_graph(graph_path, nodes, edges) -> None: + graph_path.write_text( + json.dumps({"nodes": nodes, "edges": edges, "hyperedges": []}), + encoding="utf-8", + ) + + +def _seed_hub_graph(graph_path, n_leaves: int = 25): + """A hub node in hub.py wired to n_leaves in-file leaves plus one cross-file + caller in other.py. Hub degree = n_leaves + 1 (>= HUB_DEGREE_MIN).""" + nodes = [{"id": "hub", "label": "Hub", "file_type": "code", "source_file": "hub.py"}] + edges = [] + for i in range(n_leaves): + nodes.append( + {"id": f"leaf{i}", "label": f"leaf{i}", "file_type": "code", "source_file": "hub.py"} + ) + edges.append( + {"source": "hub", "target": f"leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "hub.py", "weight": 1.0} + ) + nodes.append({"id": "caller", "label": "caller", "file_type": "code", "source_file": "other.py"}) + edges.append( + {"source": "caller", "target": "hub", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "other.py", "weight": 1.0} + ) + _write_graph(graph_path, nodes, edges) + return n_leaves + 1 # hub's pre-merge degree + + +def test_collapsing_hub_triggers_warning_on_dedup_path(tmp_path, capsys): + """Re-extracting hub.py with a NEW id for the hub drops its in-file edges and + drops the cross-file one (its old endpoint no longer has a node) — the hub + collapses. The warning must fire even on the default dedup=True path (which the + shrink guard is gated out of).""" + graph_path = tmp_path / "graph.json" + before = _seed_hub_graph(graph_path) + assert before >= HUB_DEGREE_MIN + + # hub.py re-extracted; the entity now carries a different id (the #1651 vector). + new_chunk = { + "nodes": [{"id": "hub_renamed", "label": "Hub", "file_type": "code", "source_file": "hub.py"}], + "edges": [], + } + G = build_merge([new_chunk], graph_path, dedup=True) + + # The old hub id vanishes entirely: build_from_json DROPS the cross-file edge + # (its old endpoint has no node), so "hub" is absent from the post-merge graph — + # a total collapse, far past DEGREE_DROP_FRAC. + post_hub_degree = G.degree("hub") if "hub" in G else 0 + assert post_hub_degree == 0 + + err = capsys.readouterr().err + assert "hub node(s) lost" in err + assert "'Hub'" in err + assert f"{before} -> 0" in err # before -> after, with the true pre-merge degree + assert "(node dropped entirely)" in err # vanished-hub suffix + + +def test_benign_reextraction_does_not_warn(tmp_path, capsys): + """Re-extracting hub.py while preserving the hub's id and all its edges keeps + the hub intact — no degree-drop warning.""" + graph_path = tmp_path / "graph.json" + n_leaves = 25 + _seed_hub_graph(graph_path, n_leaves) + + nodes = [{"id": "hub", "label": "Hub", "file_type": "code", "source_file": "hub.py"}] + edges = [] + for i in range(n_leaves): + nodes.append( + {"id": f"leaf{i}", "label": f"leaf{i}", "file_type": "code", "source_file": "hub.py"} + ) + edges.append( + {"source": "hub", "target": f"leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "hub.py", "weight": 1.0} + ) + G = build_merge([{"nodes": nodes, "edges": edges}], graph_path, dedup=True) + + assert G.degree("hub") == n_leaves + 1 # cross-file caller edge preserved too + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + + +def test_small_degree_change_below_threshold_does_not_warn(tmp_path, capsys): + """A hub that sheds only a couple of edges (< DEGREE_DROP_FRAC) is a benign + edit, not a collapse — no warning.""" + graph_path = tmp_path / "graph.json" + n_leaves = 25 + _seed_hub_graph(graph_path, n_leaves) + + # Re-extract hub.py keeping the hub id but dropping just 2 of its 25 leaves. + nodes = [{"id": "hub", "label": "Hub", "file_type": "code", "source_file": "hub.py"}] + edges = [] + for i in range(n_leaves - 2): + nodes.append( + {"id": f"leaf{i}", "label": f"leaf{i}", "file_type": "code", "source_file": "hub.py"} + ) + edges.append( + {"source": "hub", "target": f"leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "hub.py", "weight": 1.0} + ) + build_merge([{"nodes": nodes, "edges": edges}], graph_path, dedup=True) + + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + + +def test_no_warning_when_no_prior_hub(tmp_path, capsys): + """A graph whose busiest node is below HUB_DEGREE_MIN cannot collapse a hub.""" + graph_path = tmp_path / "graph.json" + nodes = [ + {"id": "a", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": "b", "label": "B", "file_type": "code", "source_file": "a.py"}, + ] + edges = [{"source": "a", "target": "b", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "a.py", "weight": 1.0}] + _write_graph(graph_path, nodes, edges) + + build_merge([{"nodes": [{"id": "a", "label": "A", "file_type": "code", "source_file": "a.py"}], + "edges": []}], graph_path, dedup=True) + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + + +# ── prune interaction (#1652b: don't cry corruption on intentional deletes) ──── + +def test_pure_prune_of_hub_file_is_silent(tmp_path, capsys): + """Deleting a hub's file via prune_sources (no re-extraction) collapses the hub + to nothing, but that is the operator-requested deletion path — the degree-drop + alert stays silent and never blames the #1651 id-drift cause.""" + graph_path = tmp_path / "graph.json" + before = _seed_hub_graph(graph_path) # hub in hub.py + assert before >= HUB_DEGREE_MIN + + G = build_merge([], graph_path, prune_sources=["hub.py"], dedup=True) + + assert "hub" not in G # pruned away entirely + err = capsys.readouterr().err + assert "hub node(s) lost" not in err + assert "#1651" not in err + + +def test_pruned_hub_silent_but_id_drift_hub_still_warns(tmp_path, capsys): + """Same run, two hubs: a.py is pruned (requested deletion) and b.py is + re-extracted with a NEW id for its hub (the #1651 vector). The alert must NOT + blame the pruned hub, but MUST still fire on the id-drift collapse — the prune + exemption cannot mute the corruption signal it was carved out of.""" + graph_path = tmp_path / "graph.json" + n_leaves = 25 + nodes = [ + {"id": "hubA", "label": "HubA", "file_type": "code", "source_file": "a.py"}, + {"id": "hubB", "label": "HubB", "file_type": "code", "source_file": "b.py"}, + ] + edges = [] + for i in range(n_leaves): + nodes.append({"id": f"a_leaf{i}", "label": f"a_leaf{i}", "file_type": "code", "source_file": "a.py"}) + edges.append({"source": "hubA", "target": f"a_leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "a.py", "weight": 1.0}) + nodes.append({"id": f"b_leaf{i}", "label": f"b_leaf{i}", "file_type": "code", "source_file": "b.py"}) + edges.append({"source": "hubB", "target": f"b_leaf{i}", "relation": "calls", + "confidence": "EXTRACTED", "source_file": "b.py", "weight": 1.0}) + _write_graph(graph_path, nodes, edges) + + # b.py re-extracted with a NEW id for hubB (#1651); a.py deleted (pruned). + new_chunk = { + "nodes": [{"id": "hubB_renamed", "label": "HubB", "file_type": "code", "source_file": "b.py"}], + "edges": [], + } + G = build_merge([new_chunk], graph_path, prune_sources=["a.py"], dedup=True) + + assert "hubA" not in G # pruned + assert "hubB" not in G # collapsed via id-drift + + err = capsys.readouterr().err + assert "hub node(s) lost" in err # the alert DID fire + assert "1 hub node(s) lost" in err # exactly one — hubA is exempted + assert "'HubB'" in err # id-drift hub reported + assert "'HubA'" not in err # pruned hub NOT reported + + +# ── report shape: truncation and the strict-> boundary ──────────────────────── + +def test_many_collapsing_hubs_are_truncated_in_report(capsys): + """More than _HUB_DROP_REPORT_LIMIT collapsing hubs: the per-hub list is capped + at the limit and a '... and N more' line summarizes the rest.""" + import networkx as nx + + n_hubs = _HUB_DROP_REPORT_LIMIT + 2 + pre = {f"h{i}": (f"Hub{i}", 30) for i in range(n_hubs)} + G = nx.Graph() # every hub vanished + _warn_hub_degree_drops(pre, G) + + err = capsys.readouterr().err + assert f"{n_hubs} hub node(s) lost" in err + # only the first _HUB_DROP_REPORT_LIMIT per-hub lines are printed + assert err.count("edges (node dropped entirely)") == _HUB_DROP_REPORT_LIMIT + assert f"... and {n_hubs - _HUB_DROP_REPORT_LIMIT} more hub node(s)." in err + + +def test_exactly_half_degree_loss_is_below_threshold(): + """A hub losing EXACTLY DEGREE_DROP_FRAC of its degree is benign — the guard + uses strict `>`, so 50% does not fire and 50%+one-edge does. Pins `>` (not + `>=`) at the drop check.""" + import networkx as nx + + pre = {"h": ("Hub", 40)} + # after = 20 -> exactly 50% lost -> NOT flagged (strict >) + g_half = nx.Graph() + for i in range(20): + g_half.add_edge("h", f"n{i}") + assert _hub_degree_drops(pre, g_half, drop_frac=DEGREE_DROP_FRAC) == [] + + # after = 19 -> 52.5% lost -> flagged + g_over = nx.Graph() + for i in range(19): + g_over.add_edge("h", f"n{i}") + assert _hub_degree_drops(pre, g_over, drop_frac=DEGREE_DROP_FRAC) == [("Hub", 40, 19)] + + +# ── helper-level unit tests ─────────────────────────────────────────────────── + +def test_hub_degrees_picks_only_above_threshold(): + nodes = [{"id": "h", "label": "Hub"}] + [{"id": f"l{i}", "label": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + edges = [{"source": "h", "target": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + hubs = _hub_degrees(nodes, edges) + assert hubs["h"] == ("Hub", HUB_DEGREE_MIN) + assert all(nid == "h" for nid in hubs) # leaves (degree 1) excluded + + +def test_hub_degrees_directed_matches_digraph_degree(): + """directed=True snapshots a DiGraph, so a bidirectional pair counts as degree 2 + — matching G.degree() on build_merge(directed=True)'s DiGraph. Undirected + collapses the pair to 1, which would skew the before/after ratio.""" + nodes = [{"id": "h", "label": "Hub"}] + [{"id": f"l{i}", "label": f"l{i}"} for i in range(15)] + edges = [{"source": "h", "target": f"l{i}"} for i in range(15)] + edges += [{"source": f"l{i}", "target": "h"} for i in range(15)] # reverse of each pair + # undirected: 15 unique pairs -> degree 15 (< 20) -> h is not a hub + assert _hub_degrees(nodes, edges) == {} + # directed: in + out -> degree 30 -> h is a hub + assert _hub_degrees(nodes, edges, directed=True)["h"] == ("Hub", 30) + + +def test_hub_degrees_ignores_dangling_edges(): + """An edge whose endpoint has no node is DROPPED (mirroring build_from_json), + not counted onto an auto-created bare node — so it can't inflate a hub's + pre-merge degree.""" + nodes = [{"id": "h", "label": "Hub"}] + [{"id": f"l{i}", "label": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + edges = [{"source": "h", "target": f"l{i}"} for i in range(HUB_DEGREE_MIN)] + edges += [{"source": "h", "target": f"ghost{i}"} for i in range(5)] # endpoints absent + hubs = _hub_degrees(nodes, edges) + assert hubs["h"] == ("Hub", HUB_DEGREE_MIN) # dangling edges excluded, not +5 + assert "ghost0" not in hubs # a ghost endpoint never became a node + + +def test_hub_degree_drops_flags_vanished_and_collapsed(): + import networkx as nx + + pre = {"h": ("Hub", 40), "keep": ("Keep", 30)} + G = nx.Graph() + G.add_node("keep") + for i in range(30): + G.add_edge("keep", f"n{i}") # keep still has degree 30 — unchanged + # "h" is absent from G entirely -> vanished + drops = _hub_degree_drops(pre, G, drop_frac=DEGREE_DROP_FRAC) + assert drops == [("Hub", 40, 0)]