diff --git a/CHANGELOG.md b/CHANGELOG.md index 626fac385..c0df37dd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu ## Unreleased +- Feat: a re-clustered community that keeps most of its members now retains its saved LLM label instead of resetting to a structural hub name (#1653, thanks @Ns2384-star). `graphify cluster-only` previously dropped a community's saved label on any exact-membership-signature change, so a community that merely gained or lost a member was renamed to its highest-degree hub (e.g. an `auth` community reverting to a bare `log_action`). Label carry-over is now gated on member overlap: a community that still shares at least `LABEL_CARRYOVER_MIN_JACCARD` (0.75) Jaccard overlap with the previous community of the same id keeps its name, while a mostly-replaced community still falls back to the hub label. The "run `graphify label`" nag now fires only for communities that genuinely lost their label. - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place. - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). 
`import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import. - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance. diff --git a/graphify/__main__.py b/graphify/__main__.py index 59dcd70a5..83df689f5 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -3559,7 +3559,12 @@ def main() -> None: # is told to `graphify label` for fresh LLM names. Unchanged communities keep # their saved label. 
When no signature sidecar exists (labels predate this), # fall back to hub-filling only the communities missing a label. - from graphify.cluster import community_member_sigs, label_communities_by_hub + from graphify.cluster import ( + LABEL_CARRYOVER_MIN_JACCARD, + community_member_sigs, + community_overlap_ratios, + label_communities_by_hub, + ) sig_path = labels_path.parent / (labels_path.name + ".sig") saved_sigs: dict[int, str] = {} if sig_path.exists(): @@ -3572,21 +3577,31 @@ def main() -> None: except Exception: saved_sigs = {} cur_sigs = community_member_sigs(communities) + # Overlap of each (already remapped) community against the previous + # community that shared its id. An exact-signature mismatch used to + # discard the saved LLM label outright, so a community that merely + # gained/lost a member was renamed to its hub (#1653). Carry the + # label over when the two are still substantially the same set. + overlap_ratios = community_overlap_ratios(communities, previous_node_community) count_mismatch = len(existing_labels) != len(communities) labels = {} hub_labels: dict[int, str] | None = None changed = 0 for cid in communities: have_label = cid in existing_labels + # Same community, give or take a member: keep its saved label + # even though the exact signature changed, gated on a conservative + # Jaccard so a genuinely different community can't inherit it. + carried = have_label and overlap_ratios.get(cid, 0.0) >= LABEL_CARRYOVER_MIN_JACCARD if saved_sigs: # Precise: the membership signature tells us if this exact # community changed since it was labeled. - fresh = have_label and saved_sigs.get(cid) == cur_sigs.get(cid) + fresh = (have_label and saved_sigs.get(cid) == cur_sigs.get(cid)) or carried else: # No signature sidecar (labels predate it). A differing community # COUNT means the labels describe a different clustering, so a cid's # old label can't be trusted; equal count is the best "same" signal. 
- fresh = have_label and not count_mismatch + fresh = (have_label and not count_mismatch) or carried if fresh: labels[cid] = existing_labels[cid] else: diff --git a/graphify/cluster.py b/graphify/cluster.py index 682210700..375a9ca85 100644 --- a/graphify/cluster.py +++ b/graphify/cluster.py @@ -318,3 +318,52 @@ def remap_communities_to_previous( for new_cid, nodes in communities.items(): remapped[new_to_final[new_cid]] = sorted(nodes) return dict(sorted(remapped.items(), key=lambda kv: kv[0])) + + +# Minimum Jaccard overlap between a re-clustered community and the previous +# community that shared its id for the saved LLM label to be carried over +# instead of reset to a structural hub name (#1653). Conservative on purpose: +# the exact-membership signature check (community_member_sigs) was added to stop +# stale labels surviving a re-scope, so carry-over only kicks in when the two +# communities are "the same, give or take a member". At 0.75 a five-member +# community may gain one member (Jaccard 5/6 ≈ 0.83) or lose one (4/5 = 0.80) and keep its name, +# but a community that swapped out a quarter of its members (Jaccard 0.6) drops to the hub. +LABEL_CARRYOVER_MIN_JACCARD = 0.75 + + +def community_overlap_ratios( + communities: dict[int, list[str]], + previous_node_community: dict[str, int], +) -> dict[int, float]: + """Jaccard overlap of each community against the previous community with the + same id: ``{cid: |new ∩ old| / |new ∪ old|}``. + + Call *after* :func:`remap_communities_to_previous` has aligned ids to the + prior assignment, so cid ``X``'s members are the natural successor of the + community whose saved label/signature are also keyed on ``X``. A ratio near + 1.0 means "the same community, give or take a member" — enough to carry a + saved LLM label across a re-cluster (see ``LABEL_CARRYOVER_MIN_JACCARD``) + rather than resetting it to a hub name (#1653). A cid with no previous + community of the same id (a genuinely new community) scores 0.0.
+ + ``previous_node_community`` is read from the surviving nodes' saved + ``community`` tags in the current ``graph.json``, so the Jaccard is computed + over SURVIVING nodes only: a member deleted from the graph is absent from + both sets and neither shrinks nor inflates the overlap. This deletion- + insensitivity is intentional — a community losing nodes to deletion is still + "the same community", so it keeps its label. + """ + old_sets: dict[int, set[str]] = {} + for node, old_cid in previous_node_community.items(): + old_sets.setdefault(old_cid, set()).add(str(node)) + + ratios: dict[int, float] = {} + for cid, nodes in communities.items(): + new_set = {str(n) for n in nodes} + old_set = old_sets.get(cid) + if not new_set or not old_set: + ratios[cid] = 0.0 + continue + union = len(new_set | old_set) + ratios[cid] = (len(new_set & old_set) / union) if union else 0.0 + return ratios diff --git a/tests/test_community_hub_labels.py b/tests/test_community_hub_labels.py index 281a853d9..52610ca27 100644 --- a/tests/test_community_hub_labels.py +++ b/tests/test_community_hub_labels.py @@ -81,3 +81,87 @@ def test_community_member_sigs_change_when_membership_changes(): before = community_member_sigs({0: ["x", "y", "z"]}) after = community_member_sigs({0: ["x", "y"]}) # a node left the community assert before[0] != after[0], "signature must change when a community's members change" + + +# ── label carry-over via member overlap (cluster-only re-cluster, #1653) ─────── + +def test_overlap_ratio_identical_community_is_one(): + from graphify.cluster import community_overlap_ratios + prev = {"a": 0, "b": 0, "c": 0} + ratios = community_overlap_ratios({0: ["a", "b", "c"]}, prev) + assert ratios[0] == 1.0 + + +def test_overlap_ratio_gained_one_member_stays_above_threshold(): + # Old community was 5 members; new run gained one -> Jaccard 5/6 ≈ 0.83. 
+ from graphify.cluster import community_overlap_ratios, LABEL_CARRYOVER_MIN_JACCARD + prev = {f"n{i}": 0 for i in range(5)} + new_members = [f"n{i}" for i in range(5)] + ["n5"] + ratios = community_overlap_ratios({0: new_members}, prev) + assert ratios[0] == 5 / 6 + assert ratios[0] >= LABEL_CARRYOVER_MIN_JACCARD, "a one-member drift must clear the carry-over gate" + + +def test_overlap_ratio_swapped_most_members_drops_below_threshold(): + # Only one of six old members survives, joined by five new ones: intersection 1, union 11 -> Jaccard 1/11 ≈ 0.09, well below the gate. + from graphify.cluster import community_overlap_ratios, LABEL_CARRYOVER_MIN_JACCARD + prev = {f"old{i}": 0 for i in range(6)} + new_members = ["old0"] + [f"new{i}" for i in range(5)] + ratios = community_overlap_ratios({0: new_members}, prev) + assert ratios[0] == 1 / 11 + assert ratios[0] < LABEL_CARRYOVER_MIN_JACCARD, "a mostly-new community must NOT carry the stale label" + + +def test_overlap_ratio_new_community_scores_zero(): + # cid 1 has no previous community of the same id -> genuinely new -> 0.0. + from graphify.cluster import community_overlap_ratios + prev = {"a": 0, "b": 0} + ratios = community_overlap_ratios({0: ["a", "b"], 1: ["x", "y"]}, prev) + assert ratios[1] == 0.0 + + +def test_overlap_ratio_empty_previous_is_all_zero(): + from graphify.cluster import community_overlap_ratios + ratios = community_overlap_ratios({0: ["a"], 1: ["b"]}, {}) + assert ratios == {0: 0.0, 1: 0.0} + + +def test_carryover_threshold_is_conservative(): + # A stale-label guard: the default must be a strong majority overlap so a + # re-scoped community can't silently inherit an old LLM name (#1653). + from graphify.cluster import LABEL_CARRYOVER_MIN_JACCARD + assert 0.5 < LABEL_CARRYOVER_MIN_JACCARD <= 1.0 + + +def test_overlap_ratio_exactly_at_threshold_keeps_inclusive(): + # old {1,2,3} -> new {1,2,3,4}: intersection 3, union 4 -> Jaccard exactly 0.75. + # The gate is inclusive (>=), so a community sitting right on the boundary + # KEEPS its label.
Pins the `>=` against an accidental flip to `>`. + from graphify.cluster import community_overlap_ratios, LABEL_CARRYOVER_MIN_JACCARD + prev = {"1": 0, "2": 0, "3": 0} + ratios = community_overlap_ratios({0: ["1", "2", "3", "4"]}, prev) + assert ratios[0] == 0.75 + assert ratios[0] >= LABEL_CARRYOVER_MIN_JACCARD, "an exact 0.75 overlap must clear the inclusive gate" + + +def test_overlap_ratio_single_member_swap_drops_below_threshold(): + # A single add+remove on a 4-member community: old {1,2,3,4} -> new {1,2,3,5}. + # intersection 3, union 5 -> Jaccard 0.6, just under the gate -> DROP the label. + # Unlike the extreme 1/11 swap, this is the tightest failing case. + from graphify.cluster import community_overlap_ratios, LABEL_CARRYOVER_MIN_JACCARD + prev = {"1": 0, "2": 0, "3": 0, "4": 0} + ratios = community_overlap_ratios({0: ["1", "2", "3", "5"]}, prev) + assert ratios[0] == 0.6 + assert ratios[0] < LABEL_CARRYOVER_MIN_JACCARD, "a single-member swap on a small community must drop the label" + + +def test_overlap_ratio_reused_cid_partial_overlap(): + # A reused cid whose community only partially overlaps its predecessor: + # old {p,q,r,s} -> new {p,q,x,y}. intersection 2, union 6 -> Jaccard 1/3. + # A partial reuse is neither a fresh community (0.0) nor an identity (1.0). + from graphify.cluster import community_overlap_ratios, LABEL_CARRYOVER_MIN_JACCARD + prev = {"p": 0, "q": 0, "r": 0, "s": 0} + ratios = community_overlap_ratios({0: ["p", "q", "x", "y"]}, prev) + assert ratios[0] == 1 / 3 + assert 0.0 < ratios[0] < 1.0 + assert ratios[0] < LABEL_CARRYOVER_MIN_JACCARD, "a partial reuse below the gate must not carry the label" diff --git a/tests/test_labeling.py b/tests/test_labeling.py index d3a68067c..81e6fb1d9 100644 --- a/tests/test_labeling.py +++ b/tests/test_labeling.py @@ -4,7 +4,10 @@ malformed replies, and the no-backend fallback. 
""" import json +import os +import subprocess import sys +from pathlib import Path import networkx as nx import pytest @@ -410,3 +413,133 @@ def test_label_communities_forces_serial_for_ollama(monkeypatch): monkeypatch.delenv("GRAPHIFY_OLLAMA_PARALLEL", raising=False) label_communities(G, communities, backend="ollama", batch_size=1, max_concurrency=8) assert state["peak"] == 1, "ollama must be forced serial" + + +# --- #1653: carry LLM labels over a re-cluster via member overlap ------------- +# When `graphify cluster-only` re-clusters, a community that only gained or lost +# a member should KEEP its saved LLM name (its exact membership signature +# changed, but it is still the same conceptual community). A community that was +# largely replaced should DROP the stale name and fall back to a hub label. + +def _cluster_only(cwd: Path) -> subprocess.CompletedProcess: + env = dict(os.environ) + env.pop("GRAPHIFY_OUT", None) # pin the default graphify-out/ layout + return subprocess.run( + [sys.executable, "-m", "graphify", "cluster-only", ".", "--no-viz"], + cwd=cwd, capture_output=True, text=True, env=env, + ) + + +def _clique_links(nodes): + return [{"source": a, "target": b} + for i, a in enumerate(nodes) for b in nodes[i + 1:]] + + +def _seed_carryover_fixture(tmp_path: Path, new_nodes, prev_community, links, write_sig=True): + """Write graph.json + labels + sig emulating a prior labeled clustering. + + new_nodes: {node_id: label} for the CURRENT graph. + prev_community: {node_id: cid} the previous per-node community attribute + (nodes absent from this map are treated as new, unlabeled communities). + links: edge list for the current graph. + write_sig: when False, omit the `.graphify_labels.json.sig` sidecar to + emulate labels that predate the signature file (exercises the no-sig + reuse branch, which falls back to a community-COUNT check + overlap gate). 
+ Community 100 was labeled "Auth Service", 200 "Billing Service"; the sig + sidecar records each community's EXACT prior membership. + """ + from graphify.cluster import community_member_sigs + + out = tmp_path / "graphify-out" + out.mkdir() + nodes = [ + {"id": nid, "label": lbl, "community": prev_community.get(nid)} + for nid, lbl in new_nodes.items() + ] + graph = {"directed": False, "multigraph": False, "nodes": nodes, "links": links} + (out / "graph.json").write_text(json.dumps(graph), encoding="utf-8") + + labels = {"100": "Auth Service", "200": "Billing Service"} + (out / ".graphify_labels.json").write_text(json.dumps(labels), encoding="utf-8") + + if write_sig: + # Prior EXACT membership of each community (what was hashed at labeling time). + prior_members: dict[int, list[str]] = {} + for nid, cid in prev_community.items(): + prior_members.setdefault(cid, []).append(nid) + sigs = community_member_sigs(prior_members) + (out / ".graphify_labels.json.sig").write_text( + json.dumps({str(k): v for k, v in sigs.items()}), encoding="utf-8") + return out + + +def test_cluster_only_keeps_label_when_community_gains_one_member(tmp_path): + # Community 100 was {a0..a4}; the current graph adds a5 to the same clique. + # Jaccard 5/6 ≈ 0.83 ≥ threshold -> the saved LLM label survives. 
+ a_nodes = [f"a{i}" for i in range(6)] # a0..a5 (a5 is new) + b_nodes = [f"b{i}" for i in range(5)] # unchanged community 200 + new_nodes = {n: n for n in a_nodes + b_nodes} + prev_community = {n: 100 for n in a_nodes[:5]} | {n: 200 for n in b_nodes} + links = _clique_links(a_nodes) + _clique_links(b_nodes) + + out = _seed_carryover_fixture(tmp_path, new_nodes, prev_community, links) + r = _cluster_only(tmp_path) + assert r.returncode == 0, r.stderr + + labels = json.loads((out / ".graphify_labels.json").read_text(encoding="utf-8")) + assert labels["100"] == "Auth Service", ( + f"a one-member gain must keep the saved LLM label, got {labels}") + assert labels["200"] == "Billing Service" + # Nothing genuinely lost its label -> no nag to re-run `graphify label`. + assert "community set changed since labeling" not in r.stderr + + +def test_cluster_only_drops_label_when_community_is_mostly_replaced(tmp_path): + # Community 100's surviving member is a0; the current clique around it is 5 + # otherwise-fresh nodes. Jaccard 1/6 ≈ 0.17 < threshold, and the exact + # signature differs -> the stale LLM name is dropped for a hub label. 
+ a_clique = ["a0", "c0", "c1", "c2", "c3", "c4"] # a0 is the lexical hub + b_nodes = [f"b{i}" for i in range(5)] + new_nodes = {n: (f"{n}_fn") for n in a_clique + b_nodes} + prev_community = {"a0": 100} | {n: 200 for n in b_nodes} + links = _clique_links(a_clique) + _clique_links(b_nodes) + + out = _seed_carryover_fixture(tmp_path, new_nodes, prev_community, links) + r = _cluster_only(tmp_path) + assert r.returncode == 0, r.stderr + + labels = json.loads((out / ".graphify_labels.json").read_text(encoding="utf-8")) + assert labels["100"] != "Auth Service", ( + f"a mostly-replaced community must NOT keep the stale label, got {labels}") + assert labels["100"] == "a0_fn", f"dropped community should take its hub name, got {labels}" + assert labels["200"] == "Billing Service", "the unchanged community keeps its label" + # The user is nagged only about the community that actually lost its label. + assert "community set changed since labeling" in r.stderr + assert "renamed 1 community" in r.stderr + + +def test_cluster_only_carries_label_without_sig_under_count_mismatch(tmp_path): + # Labels predate the signature sidecar, so NO `.graphify_labels.json.sig` is + # written -> the reuse path takes its no-sig branch. The current graph has 3 + # communities vs 2 saved labels (a brand-new c-clique), so the community COUNT + # differs, which on its own distrusts every saved label. Community 100 still + # overlaps its prior membership by 5/6 ≈ 0.83, so the overlap gate (`or carried` + # on the no-sig branch) carries the label across the count mismatch anyway. 
+ a_nodes = [f"a{i}" for i in range(6)] # a0..a4 were community 100; a5 is new + b_nodes = [f"b{i}" for i in range(5)] # unchanged community 200 + c_nodes = [f"c{i}" for i in range(5)] # a brand-new community -> count mismatch + new_nodes = {n: n for n in a_nodes + b_nodes + c_nodes} + prev_community = {n: 100 for n in a_nodes[:5]} | {n: 200 for n in b_nodes} + links = _clique_links(a_nodes) + _clique_links(b_nodes) + _clique_links(c_nodes) + + out = _seed_carryover_fixture(tmp_path, new_nodes, prev_community, links, write_sig=False) + assert not (out / ".graphify_labels.json.sig").exists() + r = _cluster_only(tmp_path) + assert r.returncode == 0, r.stderr + + labels = json.loads((out / ".graphify_labels.json").read_text(encoding="utf-8")) + assert labels["100"] == "Auth Service", ( + f"overlap must carry the label with no .sig despite a count mismatch, got {labels}") + assert labels["200"] == "Billing Service", "the unchanged community keeps its label" + # Both labeled communities were carried, so nothing was renamed by its hub. + assert "community set changed since labeling" not in r.stderr