Skip to content

Commit 24fdad8

Browse files
committed
feat: .ksh extension + shebang-based language detection for extension-less scripts (#235, #237)
Two parser improvements that expand code-review-graph's file coverage to extension-less Unix scripts and Korn shell files. Feature 1: .ksh extension → bash parser (#235) ----------------------------------------------- Register .ksh (Korn shell) with tree-sitter-bash alongside the existing .sh / .bash / .zsh entries shipped in v2.3.0. Korn shell is close enough to bash syntactically that tree-sitter-bash handles the structural features the graph captures correctly. Context: in the closing comment on PR #230, @tirth8205 explicitly flagged this as worth adding: "The .ksh extension in particular looks worth adding — I didn't include it in #227." Tests: test_detects_language extended with .ksh assertion; test_ksh_extension_parses_as_bash — end-to-end regression test that copies sample.sh to a temp .ksh file, parses it, and asserts identical function set and edge counts. Feature 2: shebang-based language detection (#237) -------------------------------------------------- detect_language() was extension-only — any file with no extension returned None and was silently skipped. This misses a huge category of production files: git hooks, CI scripts, bin/ entry points, installers. New SHEBANG_INTERPRETER_TO_LANGUAGE table maps common interpreter basenames to languages already registered: bash/sh/zsh/ksh/dash/ash -> bash; python/python2/python3/pypy/pypy3 -> python; node/nodejs -> javascript; ruby, perl, lua, Rscript, and php map to their like-named languages (Rscript -> r). New _detect_language_from_shebang(path) static method reads the first 256 bytes, handles direct form (#!/bin/bash), env indirection (#!/usr/bin/env bash), env -S flags, trailing flags (#!/bin/bash -e), CRLF, binary content, and strict UTF-8 decoding. detect_language() now falls back to the shebang probe for files with no extension (suffix == ""). Files with a known extension are never re-read — extension-based detection stays authoritative.
Tests (16 new in test_parser.py): every interpreter mapping, env -S flag, trailing flags, missing shebang, empty file, binary content, unknown interpreter, extension-does-not-get-overridden, and end-to-end parse_file producing function nodes from an extension-less bash script. Files changed ------------- - code_review_graph/parser.py — .ksh mapping + SHEBANG_INTERPRETER_TO_LANGUAGE table + _detect_language_from_shebang() + detect_language() fallback - tests/test_multilang.py — .ksh detection + end-to-end ksh parsing test - tests/test_parser.py — 16 shebang detection tests
1 parent 80d22bf commit 24fdad8

5 files changed

Lines changed: 310 additions & 20 deletions

File tree

code_review_graph/analysis.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55

66
import logging
77
from collections import Counter, defaultdict
8+
from typing import Any
89

910
from .graph import GraphStore, _sanitize_name
1011

1112
logger = logging.getLogger(__name__)
1213

1314

14-
def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict]:
15+
def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict[str, Any]]:
1516
"""Find the most connected nodes (highest in+out degree), excluding File nodes.
1617
1718
Returns list of dicts with: name, qualified_name, kind, file,
@@ -29,7 +30,7 @@ def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict]:
2930
nodes = store.get_all_nodes(exclude_files=True)
3031
community_map = store.get_all_community_ids()
3132

32-
scored = []
33+
scored: list[dict[str, Any]] = []
3334
for n in nodes:
3435
qn = n.qualified_name
3536
ind = in_degree.get(qn, 0)
@@ -54,7 +55,7 @@ def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict]:
5455

5556
def find_bridge_nodes(
5657
store: GraphStore, top_n: int = 10
57-
) -> list[dict]:
58+
) -> list[dict[str, Any]]:
5859
"""Find nodes with highest betweenness centrality.
5960
6061
These are architectural chokepoints that sit on shortest paths
@@ -142,7 +143,7 @@ def find_knowledge_gaps(store: GraphStore) -> dict[str, list[dict]]:
142143
})
143144

144145
# 2. Thin communities (< 3 members)
145-
communities = store.get_communities_list()
146+
communities = [dict(r) for r in store.get_communities_list()]
146147
thin = []
147148
for c in communities:
148149
if c.get("size", 0) < 3:
@@ -153,7 +154,7 @@ def find_knowledge_gaps(store: GraphStore) -> dict[str, list[dict]]:
153154
})
154155

155156
# 3. Untested hotspots (degree >= 5, no TESTED_BY)
156-
untested_hotspots = []
157+
untested_hotspots: list[dict[str, Any]] = []
157158
for n in nodes:
158159
d = degree.get(n.qualified_name, 0)
159160
if (d >= 5
@@ -199,7 +200,7 @@ def find_knowledge_gaps(store: GraphStore) -> dict[str, list[dict]]:
199200

200201
def find_surprising_connections(
201202
store: GraphStore, top_n: int = 15
202-
) -> list[dict]:
203+
) -> list[dict[str, Any]]:
203204
"""Find edges with high surprise scores.
204205
205206
Detects unexpected architectural coupling based on:
@@ -228,7 +229,7 @@ def find_surprising_connections(
228229
median_deg = sorted(degrees)[len(degrees) // 2]
229230
high_deg_threshold = max(median_deg * 3, 10)
230231

231-
scored_edges = []
232+
scored_edges: list[dict[str, Any]] = []
232233
for e in edges:
233234
src = node_map.get(e.source_qualified)
234235
tgt = node_map.get(e.target_qualified)
@@ -302,7 +303,7 @@ def find_surprising_connections(
302303

303304
def generate_suggested_questions(
304305
store: GraphStore,
305-
) -> list[dict]:
306+
) -> list[dict[str, Any]]:
306307
"""Auto-generate review questions from graph analysis.
307308
308309
Categories:

code_review_graph/parser.py

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ class EdgeInfo:
108108
".sh": "bash",
109109
".bash": "bash",
110110
".zsh": "bash",
111+
".ksh": "bash", # Korn shell — close enough to bash for tree-sitter-bash (#235)
111112
".ex": "elixir",
112113
".exs": "elixir",
113114
".ipynb": "notebook",
@@ -119,6 +120,41 @@ class EdgeInfo:
119120
".jl": "julia",
120121
}
121122

123+
# Shebang interpreter → language mapping for extension-less Unix scripts.
124+
# Each key is the **basename** of the interpreter path as it appears after
125+
# ``#!`` (or after ``#!/usr/bin/env``). Only languages already registered
126+
# above are listed — this file strictly routes extension-less scripts, it
127+
# does NOT introduce new languages on its own. See issue #237.
128+
SHEBANG_INTERPRETER_TO_LANGUAGE: dict[str, str] = {
129+
# POSIX / bash-compatible shells — all routed through tree-sitter-bash
130+
"bash": "bash",
131+
"sh": "bash",
132+
"zsh": "bash",
133+
"ksh": "bash",
134+
"dash": "bash",
135+
"ash": "bash",
136+
# Python (every common variant)
137+
"python": "python",
138+
"python2": "python",
139+
"python3": "python",
140+
"pypy": "python",
141+
"pypy3": "python",
142+
# JavaScript via Node
143+
"node": "javascript",
144+
"nodejs": "javascript",
145+
# Ruby / Perl / Lua / R / PHP
146+
"ruby": "ruby",
147+
"perl": "perl",
148+
"lua": "lua",
149+
"Rscript": "r",
150+
"php": "php",
151+
}
152+
153+
# Maximum bytes to read from the head of a file when probing for a shebang.
154+
# 256 is enough for any reasonable shebang line (``#!/usr/bin/env python3 -u\n``
155+
# is ~30 chars) while keeping the worst-case read tiny even on fat binaries.
156+
_SHEBANG_PROBE_BYTES = 256
157+
122158
# Tree-sitter node type mappings per language
123159
# Maps (language) -> dict of semantic role -> list of TS node types
124160
_CLASS_TYPES: dict[str, list[str]] = {
@@ -383,7 +419,88 @@ def _get_parser(self, language: str): # type: ignore[arg-type]
383419
return self._parsers[language]
384420

385421
def detect_language(self, path: Path) -> Optional[str]:
386-
return EXTENSION_TO_LANGUAGE.get(path.suffix.lower())
422+
"""Map a file path to its language name.
423+
424+
Extension-based lookup is tried first. For extension-less files
425+
(typical for Unix scripts like ``bin/myapp`` or ``.git/hooks/pre-commit``)
426+
we fall back to reading the first line for a shebang. Files that
427+
already have a known extension are never re-read — shebang probing
428+
only runs when the extension lookup returns ``None`` **and** the path
429+
has no suffix at all. See issue #237.
430+
"""
431+
suffix = path.suffix.lower()
432+
lang = EXTENSION_TO_LANGUAGE.get(suffix)
433+
if lang is not None:
434+
return lang
435+
# Only probe shebang for files without any extension — "README", "LICENSE",
436+
# and other extension-less text files also fall here, but the probe is a
437+
# cheap 256-byte read that returns None when no shebang is found.
438+
if suffix == "":
439+
return self._detect_language_from_shebang(path)
440+
return None
441+
442+
@staticmethod
443+
def _detect_language_from_shebang(path: Path) -> Optional[str]:
444+
"""Inspect the first line of ``path`` for a shebang interpreter.
445+
446+
Returns the mapped language name or ``None`` if the file has no
447+
shebang, is unreadable, or names an interpreter we don't map.
448+
449+
Accepted shapes::
450+
451+
#!/bin/bash
452+
#!/usr/bin/env python3
453+
#!/usr/bin/env -S node --experimental-vm-modules
454+
#!/usr/bin/bash -e
455+
456+
Only the basename of the interpreter is consulted. Trailing flags
457+
after the interpreter are ignored. Windows-style ``\r\n`` line
458+
endings are handled. Binary files read as garbage bytes simply
459+
fail the ``#!`` prefix check and return ``None``.
460+
"""
461+
try:
462+
with path.open("rb") as fh:
463+
head = fh.read(_SHEBANG_PROBE_BYTES)
464+
except (OSError, PermissionError):
465+
return None
466+
if not head.startswith(b"#!"):
467+
return None
468+
469+
# Take just the first line, stripped of leading "#!" and any
470+
# surrounding whitespace. Split on NUL to defend against accidental
471+
# binary content following a ``#!`` prefix.
472+
first_line = head.split(b"\n", 1)[0].split(b"\0", 1)[0]
473+
try:
474+
line = first_line[2:].decode("utf-8", errors="strict").strip()
475+
except UnicodeDecodeError:
476+
return None
477+
if not line:
478+
return None
479+
480+
tokens = line.split()
481+
if not tokens:
482+
return None
483+
484+
first = tokens[0]
485+
# `/usr/bin/env` indirection: the interpreter is the next token.
486+
# `/usr/bin/env -S node --flag` is also valid — skip any leading
487+
# ``-`` options after env.
488+
if first.endswith("/env") or first == "env":
489+
interpreter_token: Optional[str] = None
490+
for tok in tokens[1:]:
491+
if tok.startswith("-"):
492+
# ``-S`` takes no argument in most envs; skip and continue.
493+
continue
494+
interpreter_token = tok
495+
break
496+
if interpreter_token is None:
497+
return None
498+
interpreter = interpreter_token.rsplit("/", 1)[-1]
499+
else:
500+
# Direct form: ``#!/bin/bash`` or ``#!/usr/local/bin/python3``.
501+
interpreter = first.rsplit("/", 1)[-1]
502+
503+
return SHEBANG_INTERPRETER_TO_LANGUAGE.get(interpreter)
387504

388505
def parse_file(self, path: Path) -> tuple[list[NodeInfo], list[EdgeInfo]]:
389506
"""Parse a single file and return extracted nodes and edges."""

code_review_graph/tools/analysis_tools.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
find_surprising_connections,
1212
generate_suggested_questions,
1313
)
14+
from pathlib import Path
15+
1416
from ._common import _get_store, _validate_repo_root
1517

1618

@@ -28,8 +30,8 @@ def get_hub_nodes_func(
2830
repo_root: Repository root (auto-detected if empty).
2931
top_n: Number of top hubs to return (default 10).
3032
"""
31-
root = _validate_repo_root(repo_root)
32-
store = _get_store(str(root))
33+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
34+
store, _ = _get_store(str(root))
3335
hubs = find_hub_nodes(store, top_n=top_n)
3436
return {
3537
"hub_nodes": hubs,
@@ -56,8 +58,8 @@ def get_bridge_nodes_func(
5658
repo_root: Repository root (auto-detected if empty).
5759
top_n: Number of top bridges to return (default 10).
5860
"""
59-
root = _validate_repo_root(repo_root)
60-
store = _get_store(str(root))
61+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
62+
store, _ = _get_store(str(root))
6163
bridges = find_bridge_nodes(store, top_n=top_n)
6264
return {
6365
"bridge_nodes": bridges,
@@ -82,8 +84,8 @@ def get_knowledge_gaps_func(
8284
Args:
8385
repo_root: Repository root (auto-detected if empty).
8486
"""
85-
root = _validate_repo_root(repo_root)
86-
store = _get_store(str(root))
87+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
88+
store, _ = _get_store(str(root))
8789
gaps = find_knowledge_gaps(store)
8890
total = sum(len(v) for v in gaps.values())
8991
return {
@@ -118,8 +120,8 @@ def get_surprising_connections_func(
118120
repo_root: Repository root (auto-detected if empty).
119121
top_n: Number of top surprises to return (default 15).
120122
"""
121-
root = _validate_repo_root(repo_root)
122-
store = _get_store(str(root))
123+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
124+
store, _ = _get_store(str(root))
123125
surprises = find_surprising_connections(store, top_n=top_n)
124126
return {
125127
"surprising_connections": surprises,
@@ -144,10 +146,10 @@ def get_suggested_questions_func(
144146
Args:
145147
repo_root: Repository root (auto-detected if empty).
146148
"""
147-
root = _validate_repo_root(repo_root)
148-
store = _get_store(str(root))
149+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
150+
store, _ = _get_store(str(root))
149151
questions = generate_suggested_questions(store)
150-
by_priority = {"high": [], "medium": [], "low": []}
152+
by_priority: dict[str, list[Any]] = {"high": [], "medium": [], "low": []}
151153
for q in questions:
152154
by_priority.get(q["priority"], []).append(q)
153155
return {

tests/test_multilang.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,39 @@ def test_detects_language(self):
10871087
assert self.parser.detect_language(Path("build.sh")) == "bash"
10881088
assert self.parser.detect_language(Path("build.bash")) == "bash"
10891089
assert self.parser.detect_language(Path("run.zsh")) == "bash"
1090+
# Regression for #235 — Korn shell (.ksh) should parse as bash.
1091+
assert self.parser.detect_language(Path("legacy.ksh")) == "bash"
1092+
1093+
def test_ksh_extension_parses_as_bash(self, tmp_path):
1094+
"""Regression for #235: a real .ksh file is parsed through the bash
1095+
grammar end-to-end and produces the same structural nodes/edges
1096+
as an equivalent .sh file."""
1097+
fixture_source = (FIXTURES / "sample.sh").read_text(encoding="utf-8")
1098+
ksh_copy = tmp_path / "legacy.ksh"
1099+
ksh_copy.write_text(fixture_source, encoding="utf-8")
1100+
1101+
ksh_nodes, ksh_edges = self.parser.parse_file(ksh_copy)
1102+
1103+
# Language tagging: every node must be "bash".
1104+
assert ksh_nodes, "parser produced zero nodes for .ksh file"
1105+
for n in ksh_nodes:
1106+
assert n.language == "bash"
1107+
1108+
# Same function set as the .sh fixture.
1109+
ksh_funcs = {n.name for n in ksh_nodes if n.kind == "Function"}
1110+
sh_funcs = {n.name for n in self.nodes if n.kind == "Function"}
1111+
assert ksh_funcs == sh_funcs, (
1112+
f".ksh and .sh produced different function sets: "
1113+
f"sh-only={sh_funcs - ksh_funcs}, ksh-only={ksh_funcs - sh_funcs}"
1114+
)
1115+
1116+
# Same structural-edge totals by kind.
1117+
def by_kind(edges):
1118+
counts: dict[str, int] = {}
1119+
for e in edges:
1120+
counts[e.kind] = counts.get(e.kind, 0) + 1
1121+
return counts
1122+
assert by_kind(ksh_edges) == by_kind(self.edges)
10901123

10911124
def test_nodes_have_bash_language(self):
10921125
for n in self.nodes:

0 commit comments

Comments
 (0)