diff --git a/.gitattributes b/.gitattributes
index faa0f4b26..340d331cb 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,6 @@
+# Normalise line endings: LF in repo, platform-native in working tree
+* text=auto
+
# Tell GitHub Linguist to ignore generated/example HTML files when calculating
# the repo's primary language. Without this, large graph.html artifacts in
# worked/ dominate the byte count and the repo shows as HTML instead of Python.
diff --git a/graphify/cli.py b/graphify/cli.py
new file mode 100644
index 000000000..3fabd0cbd
--- /dev/null
+++ b/graphify/cli.py
@@ -0,0 +1,24 @@
+"""CLI entry point for graphify-live - enhanced animated HTML visualization.
+
+Usage:
+ graphify-live export html /path/to/project
+
+All subcommands work identically to `graphify` except that `export html`
+uses the enhanced animated D3.js HTML exporter instead of the default
+vis-network exporter.
+"""
+
+
def main():
    """Run the regular graphify CLI with the live HTML exporter installed.

    ``graphify.export.to_html`` is rebound to ``graphify.export_live.to_html``
    and only afterwards is ``graphify.__main__`` imported and its ``main``
    invoked, so the rebinding is already in place when the CLI module loads.
    """
    import graphify.export as stock_export
    import graphify.export_live as live_export

    # Swap the exporter on the module object itself (process-wide effect).
    stock_export.to_html = live_export.to_html

    from graphify.__main__ import main as run_stock_cli

    run_stock_cli()


if __name__ == "__main__":
    main()
diff --git a/graphify/export.py b/graphify/export.py
index 176b17909..0e39e9e34 100644
--- a/graphify/export.py
+++ b/graphify/export.py
@@ -1181,6 +1181,8 @@ def _community_name(cid) -> str:
)
fname = community_filename[cid] + ".md"
+ (out / fname).write_text("\n".join(lines), encoding="utf-8") # nosec
+ community_notes_written += 1
if _owned_write(fname, "\n".join(lines)):
community_notes_written += 1
diff --git a/graphify/export_live.py b/graphify/export_live.py
new file mode 100644
index 000000000..635680151
--- /dev/null
+++ b/graphify/export_live.py
@@ -0,0 +1,2377 @@
+# write graph to HTML, JSON, SVG, GraphML, Obsidian vault, and Neo4j Cypher
+from __future__ import annotations
+import hashlib
+import html as _html
+import json
+import math
+import os
+import re
+import shutil
+from collections import Counter
+from datetime import date
+from pathlib import Path
+import networkx as nx
+from networkx.readwrite import json_graph
+from graphify.security import sanitize_label
+from graphify.analyze import _node_community_map
+from graphify.build import edge_data
+
+
# Artifacts worth preserving across rebuilds (non-regenerable without LLM or curation).
_BACKUP_ARTIFACTS = [
    "graph.json",
    "GRAPH_REPORT.md",
    ".graphify_labels.json",
    ".graphify_analysis.json",
    "manifest.json",
    ".graphify_semantic_marker",
    "cost.json",
]


def backup_if_protected(out_dir: Path) -> "Path | None":
    """Snapshot graph artifacts to a dated subfolder before an overwrite.

    Triggers when graph.json exists AND either:
      - .graphify_semantic_marker is present (graph cost real LLM tokens), or
      - .graphify_labels.json contains at least one non-default community label
        (graph has been curated by a human or skill).

    Args:
        out_dir: Directory holding graph.json and its sibling artifacts.

    Returns:
        The backup folder path (``out_dir/<YYYY-MM-DD>``), or None if no
        backup was taken (disabled, nothing to protect, or the backup failed).

    Backup failure prints a warning to stderr but never blocks the write;
    this includes failures while comparing against an existing backup.
    Set GRAPHIFY_NO_BACKUP to any non-empty value (e.g. 1) to disable.
    """
    if os.environ.get("GRAPHIFY_NO_BACKUP"):
        return None
    out = Path(out_dir)
    graph_src = out / "graph.json"
    if not graph_src.exists():
        return None

    is_semantic = (out / ".graphify_semantic_marker").exists()
    is_curated = False
    labels_file = out / ".graphify_labels.json"
    if labels_file.exists():
        try:
            labels = json.loads(labels_file.read_text(encoding="utf-8"))
            is_curated = any(v != f"Community {k}" for k, v in labels.items())
        except Exception:
            # Deliberate best-effort: an unreadable or malformed labels file
            # simply counts as "not curated".
            pass

    if not is_semantic and not is_curated:
        return None

    reason = "+".join(filter(None, ["semantic" if is_semantic else "", "curated" if is_curated else ""]))
    today = date.today().isoformat()
    backup_dir = out / today

    try:
        # Skip re-copying if today's backup already has identical graph.json content.
        # If content differs (graph changed since the last backup today), overwrite
        # the backup in place — one folder per day, always the latest pre-overwrite state.
        backup_graph = backup_dir / "graph.json"
        if backup_graph.exists():
            try:
                src_hash = hashlib.sha256(graph_src.read_bytes()).hexdigest()
                bak_hash = hashlib.sha256(backup_graph.read_bytes()).hexdigest()
                identical = src_hash == bak_hash
            except OSError:
                # Cannot read one side: treat as "differs" and attempt a fresh
                # copy instead of letting the error escape to the caller.
                identical = False
            if identical:
                return backup_dir  # identical content, nothing to do

        backup_dir.mkdir(parents=True, exist_ok=True)
        copied = 0
        for name in _BACKUP_ARTIFACTS:
            src = out / name
            if src.exists():
                try:
                    shutil.copy2(src, backup_dir / name)
                    copied += 1
                except Exception:
                    # Per-file best effort: one failed artifact must not stop
                    # the remaining ones from being copied.
                    pass
        if copied:
            print(f"[graphify] backed up {reason} graph ({copied} files) -> {backup_dir.name}/")
        return backup_dir
    except Exception as exc:
        import sys
        print(f"[graphify] warning: backup failed ({exc}) - continuing with overwrite", file=sys.stderr)
        return None
+
+def _obsidian_tag(name: str) -> str:
+ """Sanitize a community name for use as an Obsidian tag.
+
+ Obsidian tags only allow alphanumerics, hyphens, underscores, and slashes.
+ Spaces become underscores; everything else is stripped.
+ """
+ return re.sub(r"[^a-zA-Z0-9_\-/]", "", name.replace(" ", "_"))
+
+
+def _strip_diacritics(text: str | None) -> str:
+ import unicodedata
+ if not isinstance(text, str):
+ text = "" if text is None else str(text)
+ nfkd = unicodedata.normalize("NFKD", text)
+ return "".join(c for c in nfkd if not unicodedata.combining(c))
+
+
+def _yaml_str(s: str) -> str:
+ """Escape a value for safe embedding in a YAML double-quoted scalar (F-009).
+
+ See `graphify.ingest._yaml_str` for the full rationale; duplicated here to
+ avoid pulling the URL-fetching `ingest` module into export's dependency
+ graph. Handles backslash, double-quote, all line breaks (\\n, \\r,
+ U+2028, U+2029), tab, NUL, and other C0/DEL control characters that
+ would otherwise let a hostile `source_file` / `community` / etc. break
+ out of the YAML scalar and inject sibling keys.
+ """
+ if s is None:
+ return ""
+ out: list[str] = []
+ for ch in str(s):
+ cp = ord(ch)
+ if ch == "\\":
+ out.append("\\\\")
+ elif ch == '"':
+ out.append('\\"')
+ elif ch == "\n":
+ out.append("\\n")
+ elif ch == "\r":
+ out.append("\\r")
+ elif ch == "\t":
+ out.append("\\t")
+ elif ch == "\0":
+ out.append("\\0")
+ elif cp == 0x2028:
+ out.append("\\L")
+ elif cp == 0x2029:
+ out.append("\\P")
+ elif cp < 0x20 or cp == 0x7F:
+ out.append(f"\\x{cp:02x}")
+ else:
+ out.append(ch)
+ return "".join(out)
+
+
# Colour cycle for communities; consumers index it modulo its length, so any
# number of communities maps onto these 20 entries.
COMMUNITY_COLORS = [
    "#4E79A7",
    "#F28E2B",
    "#E15759",
    "#76B7B2",
    "#59A14F",
    "#EDC948",
    "#B07AA1",
    "#FF9DA7",
    "#9C755F",
    "#BAB0AC",
    "#6366f1",
    "#ec4899",
    "#14b8a6",
    "#f97316",
    "#84cc16",
    "#06b6d4",
    "#d946ef",
    "#22c55e",
    "#eab308",
    "#64748b",
]
+
+MAX_NODES_FOR_VIZ = 5_000
+
+
+def _viz_node_limit() -> int:
+ """Return the effective viz node limit, honoring GRAPHIFY_VIZ_NODE_LIMIT env var.
+
+ Falls back to MAX_NODES_FOR_VIZ when the env var is unset, empty, or non-integer.
+ Set to 0 to disable HTML viz unconditionally (useful for CI runners).
+ """
+ import os
+ raw = os.environ.get("GRAPHIFY_VIZ_NODE_LIMIT")
+ if raw is None or not raw.strip():
+ return MAX_NODES_FOR_VIZ
+ try:
+ return int(raw)
+ except ValueError:
+ return MAX_NODES_FOR_VIZ
+
+
def _html_styles() -> str:
    """Return the stylesheet text for the generated page.

    NOTE(review): the string literal below is empty in this view, so the
    function currently returns ``""``. Presumably it is meant to hold the
    page's CSS (inferred from the name) — confirm the content was not lost.
    """
    return """"""
+
+
+def _hyperedge_script(hyperedges_json: str) -> str:
+ if hyperedges_json.strip() in ("[]", "null", ""):
+ return ""
+ return f"""
+const RAW_HYPEREDGES = {hyperedges_json};
+"""
+
+
+def _d3_source() -> str:
+ """Read the embedded D3.js library from the package static directory."""
+ pkg_dir = Path(__file__).resolve().parent
+ d3_path = pkg_dir / "static" / "d3.min.js"
+ if d3_path.exists():
+ return d3_path.read_text(encoding="utf-8")
+ # fallback: try relative to assets
+ fallback = pkg_dir.parent / "static" / "d3.min.js"
+ if fallback.exists():
+ return fallback.read_text(encoding="utf-8")
+ raise FileNotFoundError(
+ f"D3.js library not found at {d3_path}. "
+ "Reinstall the package or run: "
+ "curl -sL https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js -o graphify/static/d3.min.js"
+ )
+
+
def _html_script(nodes_json: str, edges_json: str, legend_json: str) -> str:
    """Fill the page-script template with the D3 source and the graph payloads.

    The placeholders ``__D3_SOURCE__``, ``__NODES_JSON__``, ``__EDGES_JSON__``
    and ``__LEGEND_JSON__`` are substituted one after another, in that order.
    Because each ``replace`` runs on the result of the previous one, placeholder
    text occurring inside an earlier substituted value is itself replaced by
    the later calls. Propagates ``FileNotFoundError`` from ``_d3_source()``.

    NOTE(review): the template literal is just a newline in this view, so no
    placeholder is present and the result is ``"\\n"`` — confirm the template
    body was not lost.
    """
    return """
""".replace('__D3_SOURCE__', _d3_source()).replace('__NODES_JSON__', nodes_json).replace('__EDGES_JSON__', edges_json).replace('__LEGEND_JSON__', legend_json)
+
+
+_CONFIDENCE_SCORE_DEFAULTS = {"EXTRACTED": 1.0, "INFERRED": 0.5, "AMBIGUOUS": 0.2}
+
+
def attach_hyperedges(G: nx.Graph, hyperedges: list) -> None:
    """Store hyperedges in the graph's metadata dict, de-duplicated by id.

    Args:
        G: Graph whose ``graph`` attribute dict receives the ``"hyperedges"``
            list. An existing list is extended in place.
        hyperedges: Hyperedge dicts to add. Entries without a truthy ``"id"``
            and entries whose id is already stored are skipped.

    Already-stored hyperedges that lack an ``"id"`` key are tolerated (they
    simply never match an incoming id) instead of raising ``KeyError``.
    """
    existing = G.graph.get("hyperedges", [])
    if existing is None:
        # A stored None is treated like a missing entry.
        existing = []
    seen_ids = {h.get("id") for h in existing}
    for h in hyperedges:
        hid = h.get("id")
        if hid and hid not in seen_ids:
            existing.append(h)
            seen_ids.add(hid)
    G.graph["hyperedges"] = existing
+
+
+def _git_head() -> str | None:
+ """Return the current git HEAD commit hash, or None if not in a git repo."""
+ import subprocess as _sp
+ try:
+ r = _sp.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, timeout=3)
+ return r.stdout.strip() if r.returncode == 0 else None
+ except Exception:
+ return None
+
+
def to_json(
    G: nx.Graph,
    communities: dict[int, list[str]],
    output_path: str,
    *,
    force: bool = False,
    built_at_commit: str | None = None,
    community_labels: dict[int, str] | None = None,
) -> bool:
    """Write the graph as node-link JSON to ``output_path``.

    Unless ``force`` is set, an existing file with MORE nodes than ``G`` is
    left untouched: a warning goes to stderr and ``False`` is returned. If the
    existing file cannot be checked or parsed, the write proceeds.

    Each node gains ``community``, ``norm_label`` and, when labels were
    supplied and the node has a community, ``community_name``. Each link gains
    a ``confidence_score`` if it has none, and its endpoints are restored from
    ``_src``/``_tgt`` when both are present. ``hyperedges`` and, if known,
    ``built_at_commit`` are added at the top level.

    Returns:
        ``True`` after the file was written, ``False`` if the write was refused.
    """
    target = Path(output_path)

    # Safety check: refuse to silently shrink an existing graph (#479)
    if not force and target.exists():
        try:
            from graphify.security import check_graph_file_size_cap

            check_graph_file_size_cap(target)
            on_disk = json.loads(target.read_text(encoding="utf-8"))
            existing_n = len(on_disk.get("nodes", []))
            new_n = G.number_of_nodes()
            if new_n < existing_n:
                import sys as _sys

                warning = (
                    f"[graphify] WARNING: new graph has {new_n} nodes but existing "
                    f"graph.json has {existing_n} (net -{existing_n - new_n}). "
                    f"Refusing to overwrite. Possible causes: missing chunk files from "
                    f"a previous session, or fuzzy dedup collapsed same-named symbols "
                    f"across files during an --update on an already-current graph. "
                    f"Run a full rebuild (/graphify .) to be safe, or pass force=True "
                    f"only if you have verified the reduction is legitimate."
                )
                print(warning, file=_sys.stderr)
                return False
        except Exception:
            pass  # unreadable existing file — proceed with write

    community_of = _node_community_map(communities)
    label_for: dict[int, str] = {int(k): v for k, v in (community_labels or {}).items()}

    # Some networkx versions reject the ``edges`` keyword; retry without it.
    try:
        payload = json_graph.node_link_data(G, edges="links")
    except TypeError:
        payload = json_graph.node_link_data(G)

    for node in payload["nodes"]:
        cid = community_of.get(node["id"])
        node["community"] = cid
        if cid is not None and label_for:
            node["community_name"] = label_for.get(cid, f"Community {cid}")
        node["norm_label"] = _strip_diacritics(node.get("label", "")).lower()

    for link in payload["links"]:
        if "confidence_score" not in link:
            category = link.get("confidence", "EXTRACTED")
            link["confidence_score"] = _CONFIDENCE_SCORE_DEFAULTS.get(category, 1.0)
        # Restore original edge direction. Undirected NetworkX storage may
        # canonicalize endpoint order, flipping `calls` and other directional
        # edges in graph.json. The build path stashes the true endpoints in
        # _src/_tgt for exactly this purpose (#563).
        stashed_src = link.pop("_src", None)
        stashed_tgt = link.pop("_tgt", None)
        if stashed_src is not None and stashed_tgt is not None:
            link["source"] = stashed_src
            link["target"] = stashed_tgt

    payload["hyperedges"] = getattr(G, "graph", {}).get("hyperedges", [])

    # An explicit commit wins; otherwise ask git (which may yield None).
    commit = _git_head() if built_at_commit is None else built_at_commit
    if commit:
        payload["built_at_commit"] = commit

    with open(output_path, "w", encoding="utf-8") as f:  # nosec
        json.dump(payload, f, indent=2)
    return True
+
+
def prune_dangling_edges(graph_data: dict) -> tuple[dict, int]:
    """Drop edges that reference a node id missing from ``graph_data["nodes"]``.

    The edge list is read from ``"links"`` when that key exists, otherwise
    from ``"edges"``. ``graph_data`` is modified in place.

    Returns:
        The same ``graph_data`` dict and the number of edges removed.
    """
    known_ids = {node["id"] for node in graph_data["nodes"]}
    edge_key = "links" if "links" in graph_data else "edges"
    all_edges = graph_data[edge_key]

    surviving = []
    for edge in all_edges:
        # An edge survives only if BOTH endpoints are known nodes.
        if edge["source"] in known_ids and edge["target"] in known_ids:
            surviving.append(edge)

    graph_data[edge_key] = surviving
    return graph_data, len(all_edges) - len(surviving)
+
+
+def _cypher_escape(s: str) -> str:
+ """Escape a string for safe embedding in a Cypher single-quoted literal.
+
+ Handles all characters that could prematurely terminate the literal or
+ inject control sequences:
+ - `\\` and `'` (literal terminators)
+ - newlines/CRs (would break the per-line statement framing)
+ - NUL/control bytes (defensive — Neo4j errors on raw NULs)
+
+ Also strips any leading/trailing whitespace that would let an attacker
+ break the `;`-terminated statement boundary used by `cypher-shell`.
+ Closing `}` and `)` are NOT special inside a single-quoted Cypher string,
+ so escaping the quote and backslash correctly is sufficient (a `}` inside
+ a properly-closed `'...'` literal is just a character) — but we previously
+ missed `\\n` / `\\r` which DO let a payload break out of the statement
+ line and inject a fresh MATCH/DELETE on the following line. See F-008.
+ """
+ # First normalise: drop NUL and other C0 control chars except tab.
+ s = "".join(ch for ch in s if ch >= " " or ch == "\t")
+ return (
+ s.replace("\\", "\\\\")
+ .replace("'", "\\'")
+ .replace("\n", "\\n")
+ .replace("\r", "\\r")
+ )
+
+
+# Restrict identifier-position values (labels and relationship types are NOT
+# quoted in Cypher and so cannot be safely escaped — they must be allowlisted).
+_CYPHER_IDENT_RE = re.compile(r"[^A-Za-z0-9_]")
+
+
+def _cypher_label(raw: str, fallback: str) -> str:
+ """Sanitise a value used in identifier position (node label / rel type).
+
+ Cypher does not provide a way to escape `:Foo` label syntax, so we must
+ strip everything except `[A-Za-z0-9_]` and require the result to start
+ with a letter; otherwise we fall back to a safe constant.
+ """
+ cleaned = _CYPHER_IDENT_RE.sub("", raw or "")
+ if not cleaned or not cleaned[0].isalpha():
+ return fallback
+ return cleaned
+
+
def to_cypher(G: nx.Graph, output_path: str) -> None:
    """Write the graph as a Neo4j Cypher import script to ``output_path``.

    Emits a header comment, one ``MERGE`` statement per node, a blank line,
    then one ``MATCH ... MERGE`` statement per edge. String values go through
    ``_cypher_escape``; node labels and relationship types go through
    ``_cypher_label`` because identifiers cannot be quoted.
    """
    statements = ["// Neo4j Cypher import - generated by /graphify", ""]

    for node_id, attrs in G.nodes(data=True):
        label_lit = _cypher_escape(attrs.get("label", node_id))
        id_lit = _cypher_escape(node_id)
        # A missing or falsy file_type is treated as "unknown".
        raw_type = attrs.get("file_type", "unknown") or "unknown"
        node_label = _cypher_label(raw_type.capitalize(), "Entity")
        statements.append(
            f"MERGE (n:{node_label} {{id: '{id_lit}', label: '{label_lit}'}});"
        )

    statements.append("")

    for src, dst, attrs in G.edges(data=True):
        # A missing or falsy relation is treated as "RELATES_TO".
        raw_rel = attrs.get("relation", "RELATES_TO") or "RELATES_TO"
        rel_type = _cypher_label(raw_rel.upper(), "RELATES_TO")
        conf_lit = _cypher_escape(attrs.get("confidence", "EXTRACTED"))
        src_lit = _cypher_escape(src)
        dst_lit = _cypher_escape(dst)
        match_part = f"MATCH (a {{id: '{src_lit}'}}), (b {{id: '{dst_lit}'}}) "
        merge_part = f"MERGE (a)-[:{rel_type} {{confidence: '{conf_lit}'}}]->(b);"
        statements.append(match_part + merge_part)

    with open(output_path, "w", encoding="utf-8") as f:  # nosec
        f.write("\n".join(statements))
+
+
+def to_html(
+ G: nx.Graph,
+ communities: dict[int, list[str]],
+ output_path: str,
+ community_labels: dict[int, str] | None = None,
+ member_counts: dict[int, int] | None = None,
+ node_limit: int | None = None,
+) -> None:
+ """Generate an interactive vis.js HTML visualization of the graph.
+
+ Features: node size by degree, click-to-inspect panel, search box,
+ community filter, physics clustering by community, confidence-styled edges.
+ Raises ValueError if graph exceeds MAX_NODES_FOR_VIZ.
+
+ If member_counts is provided (aggregated community view), node sizes are
+ based on community member counts rather than graph degree.
+
+ If node_limit is set and the graph exceeds it, automatically builds an
+ aggregated community-level meta-graph instead of raising ValueError.
+ """
+ limit = node_limit if node_limit is not None else _viz_node_limit()
+ if G.number_of_nodes() > limit:
+ if node_limit is not None:
+ # Build aggregated community meta-graph
+ from collections import Counter as _Counter
+ import networkx as _nx
+ print(f"Graph has {G.number_of_nodes()} nodes (above {limit} limit). Building aggregated community view...")
+ node_to_community = {nid: cid for cid, members in communities.items() for nid in members}
+ meta = _nx.Graph()
+ for cid, members in communities.items():
+ meta.add_node(str(cid), label=(community_labels or {}).get(cid, f"Community {cid}"))
+ edge_counts = _Counter()
+ for u, v in G.edges():
+ cu, cv = node_to_community.get(u), node_to_community.get(v)
+ if cu is not None and cv is not None and cu != cv:
+ edge_counts[(min(cu, cv), max(cu, cv))] += 1
+ for (cu, cv), w in edge_counts.items():
+ meta.add_edge(str(cu), str(cv), weight=w,
+ relation=f"{w} cross-community edges", confidence="AGGREGATED")
+ if meta.number_of_nodes() <= 1:
+ print("Single community - aggregated view not useful. Skipping graph.html.")
+ return
+ meta_communities = {cid: [str(cid)] for cid in communities}
+ mc = {cid: len(members) for cid, members in communities.items()}
+ # Remap hyperedges from semantic node IDs to community IDs
+ raw_hyperedges = G.graph.get("hyperedges", [])
+ if raw_hyperedges:
+ remapped = []
+ for he in raw_hyperedges:
+ he_members = he.get("nodes") or he.get("members") or []
+ comm_ids, seen = [], set()
+ for nid in he_members:
+ c = node_to_community.get(nid)
+ if c is None:
+ continue
+ s = str(c)
+ if s in seen:
+ continue
+ seen.add(s)
+ comm_ids.append(s)
+ if len(comm_ids) < 2:
+ continue
+ remapped.append({
+ "id": he.get("id", ""),
+ "label": he.get("label") or he.get("relation", "").replace("_", " "),
+ "nodes": comm_ids,
+ })
+ meta.graph["hyperedges"] = remapped
+ to_html(meta, meta_communities, output_path,
+ community_labels=community_labels, member_counts=mc)
+ print(f"graph.html written (aggregated: {meta.number_of_nodes()} community nodes, {meta.number_of_edges()} cross-community edges)")
+ print("Tip: run with --obsidian for full node-level detail.")
+ return
+ raise ValueError(
+ f"Graph has {G.number_of_nodes()} nodes - too large for HTML viz "
+ f"(limit: {limit}). Use --no-viz, raise GRAPHIFY_VIZ_NODE_LIMIT, "
+ f"or reduce input size."
+ )
+
+ node_community = _node_community_map(communities)
+ degree = dict(G.degree())
+ max_deg = max(degree.values(), default=1) or 1
+ max_mc = (max(member_counts.values(), default=1) or 1) if member_counts else 1
+
+ # Build nodes list for vis.js
+ vis_nodes = []
+ for node_id, data in G.nodes(data=True):
+ cid = node_community.get(node_id, 0)
+ color = COMMUNITY_COLORS[cid % len(COMMUNITY_COLORS)]
+ label = sanitize_label(data.get("label", node_id))
+ deg = degree.get(node_id, 1)
+ if member_counts:
+ mc = member_counts.get(cid, 1)
+ size = 10 + 30 * (mc / max_mc)
+ font_size = 12
+ else:
+ size = 10 + 30 * (deg / max_deg)
+ # Only show label for high-degree nodes by default; others show on hover
+ font_size = 12 if deg >= max_deg * 0.15 else 0
+ vis_nodes.append({
+ "id": node_id,
+ "label": label,
+ "community": cid,
+ "community_name": sanitize_label((community_labels or {}).get(cid, f"Community {cid}")),
+ "source_file": sanitize_label(str(data.get("source_file") or "")),
+ "source_location": sanitize_label(str(data.get("source_location", "") or "")),
+ "file_type": data.get("file_type", ""),
+ "degree": deg,
+ })
+
+ # Build edges list. Restore original edge direction from _src/_tgt
+ # (stashed by build.py for exactly this reason): undirected NetworkX
+ # canonicalizes endpoint order, which would otherwise flip the arrow
+ # for `calls` and `rationale_for` in the rendered graph (#563).
+ vis_edges = []
+ for u, v, data in G.edges(data=True):
+ confidence = data.get("confidence", "EXTRACTED")
+ relation = data.get("relation", "")
+ true_src = data.get("_src", u)
+ true_tgt = data.get("_tgt", v)
+ vis_edges.append({
+ "from": true_src,
+ "to": true_tgt,
+ "relation": relation,
+ "confidence": confidence,
+ "weight": data.get("weight", 1.0),
+ })
+
+ # Build community legend data
+ legend_data = []
+ all_cids = set()
+ for n in vis_nodes:
+ if n["community"] is not None:
+ all_cids.add(n["community"])
+ for cid in sorted(all_cids):
+ color = COMMUNITY_COLORS[cid % len(COMMUNITY_COLORS)]
+ lbl = _html.escape(sanitize_label((community_labels or {}).get(cid, f"Community {cid}")))
+ n = member_counts.get(cid, len(communities.get(cid, []))) if member_counts else len(communities.get(cid, []))
+ legend_data.append({"cid": cid, "color": color, "label": lbl, "count": n})
+
+ # Escape sequences so embedded JSON cannot break out of the script tag
+ def _js_safe(obj) -> str:
+ return json.dumps(obj).replace("", "<\\/")
+
+ nodes_json = _js_safe(vis_nodes)
+ edges_json = _js_safe(vis_edges)
+ legend_json = _js_safe(legend_data)
+ hyperedges_json = _js_safe(getattr(G, "graph", {}).get("hyperedges", []))
+ title = _html.escape(sanitize_label(str(output_path)))
+ stats = f"{G.number_of_nodes()} nodes · {G.number_of_edges()} edges · {len(communities)} communities"
+
+ file_type_counts = Counter(n.get("file_type", "unknown") for n in vis_nodes)
+ stats_data = {
+ "nodes": G.number_of_nodes(),
+ "edges": G.number_of_edges(),
+ "communities": len(communities),
+ "files": len({n.get("source_file", "") for n in vis_nodes if n.get("source_file")}),
+ "types": len(file_type_counts),
+ }
+
+ html = f"""
+
+