Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,8 @@ def main() -> None:
print(" watch <path> watch a folder and rebuild the graph on code changes")
print(" update <path> re-extract code files and update the graph (no LLM needed)")
print(" cluster-only <path> rerun clustering on an existing graph.json and regenerate report")
print(" --hierarchical run multi-resolution Leiden (3 levels) + community summaries")
print(" --summary-backend B summary backend: extractive (default), ollama, claude")
print(" query \"<question>\" BFS traversal of graph.json for a question")
print(" --dfs use depth-first instead of breadth-first")
print(" --budget N cap output at N tokens (default 2000)")
Expand Down Expand Up @@ -1199,6 +1201,11 @@ def main() -> None:
if not graph_json.exists():
print(f"error: no graph found at {graph_json} — run /graphify first", file=sys.stderr)
sys.exit(1)
use_hierarchical = "--hierarchical" in sys.argv
summary_backend = "extractive"
for i, a in enumerate(sys.argv):
if a == "--summary-backend" and i + 1 < len(sys.argv):
summary_backend = sys.argv[i + 1]
from networkx.readwrite import json_graph as _jg
from graphify.build import build_from_json
from graphify.cluster import cluster, score_all
Expand All @@ -1221,7 +1228,19 @@ def main() -> None:
{}, tokens, str(watch_path), suggested_questions=questions)
out = watch_path / "graphify-out"
(out / "GRAPH_REPORT.md").write_text(report, encoding="utf-8")
to_json(G, communities, str(out / "graph.json"))
# Hierarchical clustering + summaries (opt-in)
hierarchy = None
summaries = None
if use_hierarchical:
from graphify.cluster import hierarchical_cluster
from graphify.summarize import summarize_all_communities
print(f"Running hierarchical clustering (3 levels)...")
hierarchy = hierarchical_cluster(G)
print(f"Generating community summaries (backend={summary_backend})...")
summaries = summarize_all_communities(G, communities, backend=summary_backend)
print(f" {len(hierarchy)} hierarchy levels, {len(summaries)} summaries generated")
to_json(G, communities, str(out / "graph.json"),
community_hierarchy=hierarchy, community_summaries=summaries)
print(f"Done — {len(communities)} communities. GRAPH_REPORT.md and graph.json updated.")

elif cmd == "update":
Expand Down
81 changes: 81 additions & 0 deletions graphify/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,84 @@ def cohesion_score(G: nx.Graph, community_nodes: list[str]) -> float:

def score_all(G: nx.Graph, communities: dict[int, list[str]]) -> dict[int, float]:
    """Compute a cohesion score for every community.

    Returns a mapping of community id -> cohesion score (one entry per
    community in *communities*), delegating to :func:`cohesion_score`.
    """
    scores: dict[int, float] = {}
    for community_id, member_nodes in communities.items():
        scores[community_id] = cohesion_score(G, member_nodes)
    return scores


def _partition_at_resolution(G: nx.Graph, resolution: float) -> dict[str, int]:
    """Partition *G* into communities at one resolution parameter.

    Higher resolution yields more, smaller communities; lower resolution
    yields fewer, larger ones.  Prefers Leiden (graspologic) when that
    package is importable, otherwise falls back to networkx Louvain.

    Returns a {node_id: community_id} mapping.
    """
    try:
        from graspologic.partition import leiden

        # graspologic can be noisy; silence stderr for the duration of the
        # call and restore it no matter what.
        saved_stderr = sys.stderr
        try:
            sys.stderr = io.StringIO()
            with _suppress_output():
                return leiden(G, resolution=resolution)
        finally:
            sys.stderr = saved_stderr
    except ImportError:
        pass

    # Fallback: networkx Louvain.  max_level only exists on newer networkx
    # versions, so probe the signature before passing it.
    louvain_kwargs: dict = {"seed": 42, "threshold": 1e-4, "resolution": resolution}
    louvain_params = inspect.signature(nx.community.louvain_communities).parameters
    if "max_level" in louvain_params:
        louvain_kwargs["max_level"] = 10
    node_sets = nx.community.louvain_communities(G, **louvain_kwargs)
    return {node: cid for cid, members in enumerate(node_sets) for node in members}


def hierarchical_cluster(
    G: nx.Graph,
    resolutions: list[float] | None = None,
) -> dict[int, dict[int, list[str]]]:
    """Run Leiden/Louvain community detection at multiple resolution levels.

    Parameters
    ----------
    G:
        Input graph.  Directed graphs are converted to undirected
        internally, since Louvain/Leiden require undirected input.
    resolutions:
        One resolution parameter per hierarchy level.  Defaults to
        [0.5, 1.0, 2.0] — coarse, medium, fine.  Levels are numbered in
        ascending resolution order, so level 0 is always the coarsest
        (fewest communities) and higher levels are progressively finer.

    Returns
    -------
    {level: {community_id: [node_ids]}} with community ids re-indexed by
    descending community size for deterministic ordering.
    """
    if resolutions is None:
        resolutions = [0.5, 1.0, 2.0]

    if G.number_of_nodes() == 0:
        return {level: {} for level in range(len(resolutions))}
    if G.is_directed():
        G = G.to_undirected()
    if G.number_of_edges() == 0:
        # No edges: every node is a singleton community at every level.
        # Build fresh lists per level — a shallow dict copy would alias the
        # same node lists across levels, so a caller mutating one level's
        # lists would silently corrupt the others.
        ordered = sorted(G.nodes)
        return {
            level: {i: [n] for i, n in enumerate(ordered)}
            for level in range(len(resolutions))
        }

    # Handle isolates separately (same as cluster()): they would otherwise
    # distort the partitioning of the connected portion.
    isolates = [n for n in G.nodes() if G.degree(n) == 0]
    connected_nodes = [n for n in G.nodes() if G.degree(n) > 0]
    connected = G.subgraph(connected_nodes)

    hierarchy: dict[int, dict[int, list[str]]] = {}

    for level, res in enumerate(sorted(resolutions)):
        raw: dict[int, list[str]] = {}
        if connected.number_of_nodes() > 0:
            partition = _partition_at_resolution(connected, res)
            for node, cid in partition.items():
                raw.setdefault(cid, []).append(node)

        # Each isolate becomes its own single-node community.
        next_cid = max(raw.keys(), default=-1) + 1
        for node in isolates:
            raw[next_cid] = [node]
            next_cid += 1

        # Re-index by size descending for deterministic ordering.
        communities_list = sorted(raw.values(), key=len, reverse=True)
        hierarchy[level] = {i: sorted(nodes) for i, nodes in enumerate(communities_list)}

    return hierarchy
14 changes: 13 additions & 1 deletion graphify/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,14 @@ def attach_hyperedges(G: nx.Graph, hyperedges: list) -> None:
G.graph["hyperedges"] = existing


def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str) -> None:
def to_json(
G: nx.Graph,
communities: dict[int, list[str]],
output_path: str,
*,
community_hierarchy: dict[int, dict[int, list[str]]] | None = None,
community_summaries: dict[int, str] | None = None,
) -> None:
node_community = _node_community_map(communities)
try:
data = json_graph.node_link_data(G, edges="links")
Expand All @@ -302,6 +309,11 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str) ->
conf = link.get("confidence", "EXTRACTED")
link["confidence_score"] = _CONFIDENCE_SCORE_DEFAULTS.get(conf, 1.0)
data["hyperedges"] = getattr(G, "graph", {}).get("hyperedges", [])
# Hierarchical community data (opt-in via --hierarchical flag)
if community_hierarchy is not None:
data["community_hierarchy"] = {str(k): v for k, v in community_hierarchy.items()}
if community_summaries is not None:
data["community_summaries"] = {str(k): v for k, v in community_summaries.items()}
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)

Expand Down
120 changes: 120 additions & 0 deletions graphify/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,39 @@ def _strip_diacritics(text: str) -> str:
return "".join(c for c in nfkd if not unicodedata.combining(c))


def _community_summaries_from_data(graph_path: str) -> dict[int, str]:
"""Load community_summaries from graph.json if present."""
try:
data = json.loads(Path(graph_path).resolve().read_text(encoding="utf-8"))
raw = data.get("community_summaries", {})
return {int(k): v for k, v in raw.items()}
except Exception:
return {}


def _community_hierarchy_from_data(graph_path: str) -> dict[int, dict[int, list[str]]]:
"""Load community_hierarchy from graph.json if present."""
try:
data = json.loads(Path(graph_path).resolve().read_text(encoding="utf-8"))
raw = data.get("community_hierarchy", {})
return {int(level): {int(cid): nodes for cid, nodes in comms.items()} for level, comms in raw.items()}
except Exception:
return {}


def _community_relevance_score(summary: str, terms: list[str]) -> float:
"""Score a community summary against query terms using simple term overlap.

Returns a float in [0, 1] based on the fraction of query terms found in
the summary text (case-insensitive).
"""
if not terms or not summary:
return 0.0
summary_lower = summary.lower()
matches = sum(1 for t in terms if t in summary_lower)
return matches / len(terms)


def _score_nodes(G: nx.Graph, terms: list[str]) -> list[tuple[float, str]]:
scored = []
norm_terms = [_strip_diacritics(t).lower() for t in terms]
Expand Down Expand Up @@ -158,6 +191,8 @@ def serve(graph_path: str = "graphify-out/graph.json") -> None:

G = _load_graph(graph_path)
communities = _communities_from_graph(G)
community_summaries = _community_summaries_from_data(graph_path)
community_hierarchy = _community_hierarchy_from_data(graph_path)

server = Server("graphify")

Expand Down Expand Up @@ -232,6 +267,19 @@ async def list_tools() -> list[types.Tool]:
"required": ["source", "target"],
},
),
types.Tool(
name="list_communities",
description="List all communities with their summaries. Use to browse the graph structure before querying.",
inputSchema={
"type": "object",
"properties": {
"level": {
"type": "integer",
"description": "Hierarchy level (0=coarse, higher=finer). Omit for flat communities.",
},
},
},
),
]

def _tool_query_graph(arguments: dict) -> str:
Expand All @@ -240,6 +288,53 @@ def _tool_query_graph(arguments: dict) -> str:
depth = min(int(arguments.get("depth", 3)), 6)
budget = int(arguments.get("token_budget", 2000))
terms = [t.lower() for t in question.split() if len(t) > 2]

# Community-pruned query: if summaries exist, score communities first
# and restrict traversal to nodes in top-K relevant communities
if community_summaries and terms:
scored_communities = [
(_community_relevance_score(summary, terms), cid)
for cid, summary in community_summaries.items()
]
scored_communities.sort(reverse=True)
# Take communities with score > 0, up to top 3
relevant_cids = [
cid for score, cid in scored_communities
if score > 0
][:3]

if relevant_cids:
# Restrict to nodes in relevant communities
relevant_nodes = set()
for cid in relevant_cids:
relevant_nodes.update(communities.get(cid, []))
# Score only within relevant community nodes
subgraph = G.subgraph(relevant_nodes)
scored = _score_nodes(subgraph, terms)
start_nodes = [nid for _, nid in scored[:3]]
if not start_nodes:
# Fall back to community hub nodes (highest degree in each community)
for cid in relevant_cids:
c_nodes = communities.get(cid, [])
if c_nodes:
hub = max(c_nodes, key=lambda n: G.degree(n))
start_nodes.append(hub)
if len(start_nodes) >= 3:
break
if start_nodes:
nodes, edges = _dfs(G, start_nodes, depth) if mode == "dfs" else _bfs(G, start_nodes, depth)
# Constrain output to relevant community scope
nodes = nodes & relevant_nodes
matched_summaries = [f" Community {cid}: {community_summaries[cid]}" for cid in relevant_cids if cid in community_summaries]
header = (
f"Traversal: {mode.upper()} depth={depth} | "
f"Pruned to {len(relevant_cids)} communities | "
f"{len(nodes)} nodes found\n"
f"Matched communities:\n" + "\n".join(matched_summaries) + "\n\n"
)
return header + _subgraph_to_text(G, nodes, edges, budget)

# Fallback: original behavior (no community summaries or no matches)
scored = _score_nodes(G, terms)
start_nodes = [nid for _, nid in scored[:3]]
if not start_nodes:
Expand Down Expand Up @@ -338,6 +433,30 @@ def _tool_shortest_path(arguments: dict) -> str:
segments.append(f"--{rel}{conf_str}--> {G.nodes[v].get('label', v)}")
return f"Shortest path ({hops} hops):\n " + " ".join(segments)

def _tool_list_communities(arguments: dict) -> str:
    """MCP tool: render the graph's community structure as plain text.

    With an integer "level" argument (and hierarchy data present), lists
    the communities at that hierarchy level; otherwise lists the flat
    communities with their summaries.  Reads G, communities,
    community_summaries and community_hierarchy from the enclosing
    serve() scope.
    """
    level = arguments.get("level")
    if level is not None and community_hierarchy:
        # level may arrive as a string over the wire; normalize to int.
        level = int(level)
        level_data = community_hierarchy.get(level)
        if level_data is None:
            # Unknown level: tell the caller which levels actually exist.
            available = sorted(community_hierarchy.keys())
            return f"Hierarchy level {level} not found. Available levels: {available}"
        lines = [f"Communities at hierarchy level {level} ({len(level_data)} communities):"]
        # Largest communities first; preview up to three member labels each
        # (skipping ids no longer present in G).
        for cid, nodes in sorted(level_data.items(), key=lambda x: -len(x[1])):
            top_labels = [G.nodes[n].get("label", n) for n in nodes[:3] if n in G]
            lines.append(f" Community {cid} ({len(nodes)} nodes): {', '.join(top_labels)}...")
        return "\n".join(lines)

    # Flat communities with summaries
    lines = [f"Communities ({len(communities)} total):"]
    for cid, nodes in sorted(communities.items(), key=lambda x: -len(x[1])):
        # Summaries exist only when the graph was built with --hierarchical
        # plus a summary backend; omit the suffix otherwise.
        summary = community_summaries.get(cid, "")
        summary_str = f" — {summary}" if summary else ""
        lines.append(f" Community {cid} ({len(nodes)} nodes){summary_str}")
    if community_hierarchy:
        lines.append(f"\nHierarchy available ({len(community_hierarchy)} levels). Use level=0..{len(community_hierarchy)-1} to browse.")
    return "\n".join(lines)

_handlers = {
"query_graph": _tool_query_graph,
"get_node": _tool_get_node,
Expand All @@ -346,6 +465,7 @@ def _tool_shortest_path(arguments: dict) -> str:
"god_nodes": _tool_god_nodes,
"graph_stats": _tool_graph_stats,
"shortest_path": _tool_shortest_path,
"list_communities": _tool_list_communities,
}

@server.call_tool()
Expand Down
Loading