Skip to content

Commit 24fdad8

Browse files
committed
feat: .ksh extension + shebang-based language detection for extension-less scripts (#235, #237)
Two parser improvements that expand code-review-graph's file coverage to extension-less Unix scripts and Korn shell files. Feature 1: .ksh extension → bash parser (#235) ----------------------------------------------- Register .ksh (Korn shell) with tree-sitter-bash alongside the existing .sh / .bash / .zsh entries shipped in v2.3.0. Korn shell is close enough to bash syntactically that tree-sitter-bash handles the structural features the graph captures correctly. Context: in the closing comment on PR #230, @tirth8205 explicitly flagged this as worth adding: "The .ksh extension in particular looks worth adding — I didn't include it in #227." Tests: test_detects_language extended with .ksh assertion; test_ksh_extension_parses_as_bash — end-to-end regression test that copies sample.sh to a temp .ksh file, parses it, and asserts identical function set and edge counts. Feature 2: shebang-based language detection (#237) -------------------------------------------------- detect_language() was extension-only — any file with no extension returned None and was silently skipped. This misses a huge category of production files: git hooks, CI scripts, bin/ entry points, installers. New SHEBANG_INTERPRETER_TO_LANGUAGE table maps common interpreter basenames to languages already registered: bash/sh/zsh/ksh/dash/ash -> bash; python/python2/python3/pypy/pypy3 -> python; node/nodejs -> javascript; ruby, perl, lua, Rscript, and php map to their like-named languages (Rscript -> r). New _detect_language_from_shebang(path) static method reads the first 256 bytes, handles direct form (#!/bin/bash), env indirection (#!/usr/bin/env bash), env -S flags, trailing flags (#!/bin/bash -e), CRLF, binary content, and strict UTF-8 decoding. detect_language() now falls back to the shebang probe for files with no extension (suffix == ""). Files with a known extension are never re-read — extension-based detection stays authoritative.
Tests (16 new in test_parser.py): every interpreter mapping, env -S flag, trailing flags, missing shebang, empty file, binary content, unknown interpreter, extension-does-not-get-overridden, and end-to-end parse_file producing function nodes from an extension-less bash script. Files changed ------------- - code_review_graph/parser.py — .ksh mapping + SHEBANG_INTERPRETER_TO_LANGUAGE table + _detect_language_from_shebang() + detect_language() fallback - tests/test_multilang.py — .ksh detection + end-to-end ksh parsing test - tests/test_parser.py — 16 shebang detection tests
1 parent 80d22bf commit 24fdad8

5 files changed

Lines changed: 310 additions & 20 deletions

File tree

code_review_graph/analysis.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55

66
import logging
77
from collections import Counter, defaultdict
8+
from typing import Any
89

910
from .graph import GraphStore, _sanitize_name
1011

1112
logger = logging.getLogger(__name__)
1213

1314

14-
def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict]:
15+
def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict[str, Any]]:
1516
"""Find the most connected nodes (highest in+out degree), excluding File nodes.
1617
1718
Returns list of dicts with: name, qualified_name, kind, file,
@@ -29,7 +30,7 @@ def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict]:
2930
nodes = store.get_all_nodes(exclude_files=True)
3031
community_map = store.get_all_community_ids()
3132

32-
scored = []
33+
scored: list[dict[str, Any]] = []
3334
for n in nodes:
3435
qn = n.qualified_name
3536
ind = in_degree.get(qn, 0)
@@ -54,7 +55,7 @@ def find_hub_nodes(store: GraphStore, top_n: int = 10) -> list[dict]:
5455

5556
def find_bridge_nodes(
5657
store: GraphStore, top_n: int = 10
57-
) -> list[dict]:
58+
) -> list[dict[str, Any]]:
5859
"""Find nodes with highest betweenness centrality.
5960
6061
These are architectural chokepoints that sit on shortest paths
@@ -142,7 +143,7 @@ def find_knowledge_gaps(store: GraphStore) -> dict[str, list[dict]]:
142143
})
143144

144145
# 2. Thin communities (< 3 members)
145-
communities = store.get_communities_list()
146+
communities = [dict(r) for r in store.get_communities_list()]
146147
thin = []
147148
for c in communities:
148149
if c.get("size", 0) < 3:
@@ -153,7 +154,7 @@ def find_knowledge_gaps(store: GraphStore) -> dict[str, list[dict]]:
153154
})
154155

155156
# 3. Untested hotspots (degree >= 5, no TESTED_BY)
156-
untested_hotspots = []
157+
untested_hotspots: list[dict[str, Any]] = []
157158
for n in nodes:
158159
d = degree.get(n.qualified_name, 0)
159160
if (d >= 5
@@ -199,7 +200,7 @@ def find_knowledge_gaps(store: GraphStore) -> dict[str, list[dict]]:
199200

200201
def find_surprising_connections(
201202
store: GraphStore, top_n: int = 15
202-
) -> list[dict]:
203+
) -> list[dict[str, Any]]:
203204
"""Find edges with high surprise scores.
204205
205206
Detects unexpected architectural coupling based on:
@@ -228,7 +229,7 @@ def find_surprising_connections(
228229
median_deg = sorted(degrees)[len(degrees) // 2]
229230
high_deg_threshold = max(median_deg * 3, 10)
230231

231-
scored_edges = []
232+
scored_edges: list[dict[str, Any]] = []
232233
for e in edges:
233234
src = node_map.get(e.source_qualified)
234235
tgt = node_map.get(e.target_qualified)
@@ -302,7 +303,7 @@ def find_surprising_connections(
302303

303304
def generate_suggested_questions(
304305
store: GraphStore,
305-
) -> list[dict]:
306+
) -> list[dict[str, Any]]:
306307
"""Auto-generate review questions from graph analysis.
307308
308309
Categories:

code_review_graph/parser.py

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ class EdgeInfo:
108108
".sh": "bash",
109109
".bash": "bash",
110110
".zsh": "bash",
111+
".ksh": "bash", # Korn shell — close enough to bash for tree-sitter-bash (#235)
111112
".ex": "elixir",
112113
".exs": "elixir",
113114
".ipynb": "notebook",
@@ -119,6 +120,41 @@ class EdgeInfo:
119120
".jl": "julia",
120121
}
121122

123+
# Shebang interpreter → language mapping for extension-less Unix scripts.
124+
# Each key is the **basename** of the interpreter path as it appears after
125+
# ``#!`` (or after ``#!/usr/bin/env``). Only languages already registered
126+
# above are listed — this file strictly routes extension-less scripts, it
127+
# does NOT introduce new languages on its own. See issue #237.
128+
SHEBANG_INTERPRETER_TO_LANGUAGE: dict[str, str] = {
129+
# POSIX / bash-compatible shells — all routed through tree-sitter-bash
130+
"bash": "bash",
131+
"sh": "bash",
132+
"zsh": "bash",
133+
"ksh": "bash",
134+
"dash": "bash",
135+
"ash": "bash",
136+
# Python (every common variant)
137+
"python": "python",
138+
"python2": "python",
139+
"python3": "python",
140+
"pypy": "python",
141+
"pypy3": "python",
142+
# JavaScript via Node
143+
"node": "javascript",
144+
"nodejs": "javascript",
145+
# Ruby / Perl / Lua / R / PHP
146+
"ruby": "ruby",
147+
"perl": "perl",
148+
"lua": "lua",
149+
"Rscript": "r",
150+
"php": "php",
151+
}
152+
153+
# Maximum bytes to read from the head of a file when probing for a shebang.
154+
# 256 is enough for any reasonable shebang line (``#!/usr/bin/env python3 -u\n``
155+
# is ~30 chars) while keeping the worst-case read tiny even on fat binaries.
156+
_SHEBANG_PROBE_BYTES = 256
157+
122158
# Tree-sitter node type mappings per language
123159
# Maps (language) -> dict of semantic role -> list of TS node types
124160
_CLASS_TYPES: dict[str, list[str]] = {
@@ -383,7 +419,88 @@ def _get_parser(self, language: str): # type: ignore[arg-type]
383419
return self._parsers[language]
384420

385421
def detect_language(self, path: Path) -> Optional[str]:
386-
return EXTENSION_TO_LANGUAGE.get(path.suffix.lower())
422+
"""Map a file path to its language name.
423+
424+
Extension-based lookup is tried first. For extension-less files
425+
(typical for Unix scripts like ``bin/myapp`` or ``.git/hooks/pre-commit``)
426+
we fall back to reading the first line for a shebang. Files that
427+
already have a known extension are never re-read — shebang probing
428+
only runs when the extension lookup returns ``None`` **and** the path
429+
has no suffix at all. See issue #237.
430+
"""
431+
suffix = path.suffix.lower()
432+
lang = EXTENSION_TO_LANGUAGE.get(suffix)
433+
if lang is not None:
434+
return lang
435+
# Only probe shebang for files without any extension — "README", "LICENSE",
436+
# and other extension-less text files also fall here, but the probe is a
437+
# cheap 256-byte read that returns None when no shebang is found.
438+
if suffix == "":
439+
return self._detect_language_from_shebang(path)
440+
return None
441+
442+
@staticmethod
443+
def _detect_language_from_shebang(path: Path) -> Optional[str]:
444+
"""Inspect the first line of ``path`` for a shebang interpreter.
445+
446+
Returns the mapped language name or ``None`` if the file has no
447+
shebang, is unreadable, or names an interpreter we don't map.
448+
449+
Accepted shapes::
450+
451+
#!/bin/bash
452+
#!/usr/bin/env python3
453+
#!/usr/bin/env -S node --experimental-vm-modules
454+
#!/usr/bin/bash -e
455+
456+
Only the basename of the interpreter is consulted. Trailing flags
457+
after the interpreter are ignored. Windows-style ``\r\n`` line
458+
endings are handled. Binary files read as garbage bytes simply
459+
fail the ``#!`` prefix check and return ``None``.
460+
"""
461+
try:
462+
with path.open("rb") as fh:
463+
head = fh.read(_SHEBANG_PROBE_BYTES)
464+
except (OSError, PermissionError):
465+
return None
466+
if not head.startswith(b"#!"):
467+
return None
468+
469+
# Take just the first line, stripped of leading "#!" and any
470+
# surrounding whitespace. Split on NUL to defend against accidental
471+
# binary content following a ``#!`` prefix.
472+
first_line = head.split(b"\n", 1)[0].split(b"\0", 1)[0]
473+
try:
474+
line = first_line[2:].decode("utf-8", errors="strict").strip()
475+
except UnicodeDecodeError:
476+
return None
477+
if not line:
478+
return None
479+
480+
tokens = line.split()
481+
if not tokens:
482+
return None
483+
484+
first = tokens[0]
485+
# `/usr/bin/env` indirection: the interpreter is the next token.
486+
# `/usr/bin/env -S node --flag` is also valid — skip any leading
487+
# ``-`` options after env.
488+
if first.endswith("/env") or first == "env":
489+
interpreter_token: Optional[str] = None
490+
for tok in tokens[1:]:
491+
if tok.startswith("-"):
492+
# ``-S`` takes no argument in most envs; skip and continue.
493+
continue
494+
interpreter_token = tok
495+
break
496+
if interpreter_token is None:
497+
return None
498+
interpreter = interpreter_token.rsplit("/", 1)[-1]
499+
else:
500+
# Direct form: ``#!/bin/bash`` or ``#!/usr/local/bin/python3``.
501+
interpreter = first.rsplit("/", 1)[-1]
502+
503+
return SHEBANG_INTERPRETER_TO_LANGUAGE.get(interpreter)
387504

388505
def parse_file(self, path: Path) -> tuple[list[NodeInfo], list[EdgeInfo]]:
389506
"""Parse a single file and return extracted nodes and edges."""

code_review_graph/tools/analysis_tools.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
find_surprising_connections,
1212
generate_suggested_questions,
1313
)
14+
from pathlib import Path
15+
1416
from ._common import _get_store, _validate_repo_root
1517

1618

@@ -28,8 +30,8 @@ def get_hub_nodes_func(
2830
repo_root: Repository root (auto-detected if empty).
2931
top_n: Number of top hubs to return (default 10).
3032
"""
31-
root = _validate_repo_root(repo_root)
32-
store = _get_store(str(root))
33+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
34+
store, _ = _get_store(str(root))
3335
hubs = find_hub_nodes(store, top_n=top_n)
3436
return {
3537
"hub_nodes": hubs,
@@ -56,8 +58,8 @@ def get_bridge_nodes_func(
5658
repo_root: Repository root (auto-detected if empty).
5759
top_n: Number of top bridges to return (default 10).
5860
"""
59-
root = _validate_repo_root(repo_root)
60-
store = _get_store(str(root))
61+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
62+
store, _ = _get_store(str(root))
6163
bridges = find_bridge_nodes(store, top_n=top_n)
6264
return {
6365
"bridge_nodes": bridges,
@@ -82,8 +84,8 @@ def get_knowledge_gaps_func(
8284
Args:
8385
repo_root: Repository root (auto-detected if empty).
8486
"""
85-
root = _validate_repo_root(repo_root)
86-
store = _get_store(str(root))
87+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
88+
store, _ = _get_store(str(root))
8789
gaps = find_knowledge_gaps(store)
8890
total = sum(len(v) for v in gaps.values())
8991
return {
@@ -118,8 +120,8 @@ def get_surprising_connections_func(
118120
repo_root: Repository root (auto-detected if empty).
119121
top_n: Number of top surprises to return (default 15).
120122
"""
121-
root = _validate_repo_root(repo_root)
122-
store = _get_store(str(root))
123+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
124+
store, _ = _get_store(str(root))
123125
surprises = find_surprising_connections(store, top_n=top_n)
124126
return {
125127
"surprising_connections": surprises,
@@ -144,10 +146,10 @@ def get_suggested_questions_func(
144146
Args:
145147
repo_root: Repository root (auto-detected if empty).
146148
"""
147-
root = _validate_repo_root(repo_root)
148-
store = _get_store(str(root))
149+
root = _validate_repo_root(Path(repo_root)) if repo_root else Path.cwd()
150+
store, _ = _get_store(str(root))
149151
questions = generate_suggested_questions(store)
150-
by_priority = {"high": [], "medium": [], "low": []}
152+
by_priority: dict[str, list[Any]] = {"high": [], "medium": [], "low": []}
151153
for q in questions:
152154
by_priority.get(q["priority"], []).append(q)
153155
return {

tests/test_multilang.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,39 @@ def test_detects_language(self):
10871087
assert self.parser.detect_language(Path("build.sh")) == "bash"
10881088
assert self.parser.detect_language(Path("build.bash")) == "bash"
10891089
assert self.parser.detect_language(Path("run.zsh")) == "bash"
1090+
# Regression for #235 — Korn shell (.ksh) should parse as bash.
1091+
assert self.parser.detect_language(Path("legacy.ksh")) == "bash"
1092+
1093+
def test_ksh_extension_parses_as_bash(self, tmp_path):
1094+
"""Regression for #235: a real .ksh file is parsed through the bash
1095+
grammar end-to-end and produces the same structural nodes/edges
1096+
as an equivalent .sh file."""
1097+
fixture_source = (FIXTURES / "sample.sh").read_text(encoding="utf-8")
1098+
ksh_copy = tmp_path / "legacy.ksh"
1099+
ksh_copy.write_text(fixture_source, encoding="utf-8")
1100+
1101+
ksh_nodes, ksh_edges = self.parser.parse_file(ksh_copy)
1102+
1103+
# Language tagging: every node must be "bash".
1104+
assert ksh_nodes, "parser produced zero nodes for .ksh file"
1105+
for n in ksh_nodes:
1106+
assert n.language == "bash"
1107+
1108+
# Same function set as the .sh fixture.
1109+
ksh_funcs = {n.name for n in ksh_nodes if n.kind == "Function"}
1110+
sh_funcs = {n.name for n in self.nodes if n.kind == "Function"}
1111+
assert ksh_funcs == sh_funcs, (
1112+
f".ksh and .sh produced different function sets: "
1113+
f"sh-only={sh_funcs - ksh_funcs}, ksh-only={ksh_funcs - sh_funcs}"
1114+
)
1115+
1116+
# Same structural-edge totals by kind.
1117+
def by_kind(edges):
1118+
counts: dict[str, int] = {}
1119+
for e in edges:
1120+
counts[e.kind] = counts.get(e.kind, 0) + 1
1121+
return counts
1122+
assert by_kind(ksh_edges) == by_kind(self.edges)
10901123

10911124
def test_nodes_have_bash_language(self):
10921125
for n in self.nodes:

0 commit comments

Comments
 (0)