Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 108 additions & 1 deletion graphify/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5304,7 +5304,114 @@ def extract_js(path: Path) -> dict:
config = _TS_CONFIG
else:
config = _JS_CONFIG
return _extract_generic(path, config)
result = _extract_generic(path, config)
if "error" not in result:
_extract_js_rationale(path, result)
return result


# ── JS/TS rationale + doc-reference extraction ────────────────────────────────
#
# Parity with _extract_python_rationale: Python files get rationale nodes from
# docstrings and `# NOTE:`-style comments, but JS/TS comments were discarded
# entirely. That silently drops two high-value signals in mixed corpora:
# 1. rationale comments (`// NOTE:`, `// WHY:`, ...) — same as Python;
# 2. architecture-decision references (`ADR-0011`, `RFC 793`) that teams
# conventionally cite in file/function headers. These are the natural
# join points between code and design docs in the same graph — without
# them, code<->ADR edges never form even when the code cites the ADR.

# Comment openers that mark a line as rationale. The first group covers
# `//` line comments, the second covers continuation lines of `/* ... */`
# block comments (which conventionally start with `*`).
_JS_RATIONALE_PREFIXES = (
    "// NOTE:",
    "// IMPORTANT:",
    "// HACK:",
    "// WHY:",
    "// RATIONALE:",
    "// TODO:",
    "// FIXME:",
) + (
    "* NOTE:",
    "* IMPORTANT:",
    "* HACK:",
    "* WHY:",
    "* RATIONALE:",
    "* TODO:",
    "* FIXME:",
)

# Design-document citations promoted to graph nodes. Kept intentionally
# narrow: `ADR` or `RFC`, an optional hyphen or space, then 1-5 digits,
# matched case-insensitively on word boundaries.
_JS_DOC_REF_RE = re.compile(
    r"\b(ADR[- ]?\d{1,5}|RFC[- ]?\d{1,5})\b",
    flags=re.I,
)

# A line counts as a comment line when, after optional leading whitespace,
# it opens with `//`, `/*` or `*`. Doc references are only searched for on
# such lines, so string literals and ordinary code are ignored.
_JS_COMMENT_LINE_RE = re.compile(r"^\s*(//|/\*|\*)")


def _extract_js_rationale(path: Path, result: dict) -> None:
    """Post-pass: extract rationale comments and doc references from JS/TS source.

    The file is scanned line by line:

    * a line whose stripped text starts with one of ``_JS_RATIONALE_PREFIXES``
      becomes a ``rationale`` node with a ``rationale_for`` edge to the file;
    * on comment lines (per ``_JS_COMMENT_LINE_RE``) every ``_JS_DOC_REF_RE``
      match becomes a ``doc_ref`` node with a ``cites`` edge from the file.

    Mutates ``result`` in place by appending to ``result['nodes']`` and
    ``result['edges']``.

    Args:
        path: JS/TS source file to scan.
        result: Extraction result; must already contain ``nodes`` and
            ``edges`` lists.

    Returns:
        None. If the file cannot be read, ``result`` is left untouched.
    """
    try:
        source_text = path.read_text(encoding="utf-8", errors="replace")
    except (OSError, ValueError):
        # Best-effort pass: an unreadable file just contributes nothing.
        # errors="replace" already rules out decode failures.
        return

    stem = _file_stem(path)
    str_path = str(path)
    nodes = result["nodes"]
    edges = result["edges"]
    seen_ids = {n["id"] for n in nodes}
    file_nid = _make_id(str_path)
    # Canonical doc-ref labels already cited by this file (one edge per label).
    seen_doc_refs: set[str] = set()

    def _add_rationale(text: str, line: int) -> None:
        """Add a rationale node for ``text`` at ``line`` plus its edge to the file."""
        # Labels are capped at 80 characters and flattened to a single line.
        label = text[:80].replace("\r\n", " ").replace("\r", " ").replace("\n", " ").strip()
        rid = _make_id(stem, "rationale", str(line))
        if rid not in seen_ids:
            seen_ids.add(rid)
            nodes.append({
                "id": rid,
                "label": label,
                "file_type": "rationale",
                "source_file": str_path,
                "source_location": f"L{line}",
            })
        edges.append({
            "source": rid,
            "target": file_nid,
            "relation": "rationale_for",
            "confidence": "EXTRACTED",
            "source_file": str_path,
            "source_location": f"L{line}",
            "weight": 1.0,
        })

    def _add_doc_ref(token: str, line: int) -> None:
        """Add a doc_ref node for ``token`` (first citation only) and a cites edge."""
        parsed = re.match(r"([A-Za-z]+)[- ]?(\d+)", token)
        if parsed is None:
            # Defensive: the token did not have the <letters><digits> shape.
            return
        kind = parsed.group(1).upper()
        # Drop leading zeros first so that every spelling of the same number
        # ("RFC 0793" / "RFC-793", "ADR-00011" / "adr 11") collapses to one
        # canonical label; ADR numbers are then re-padded to four digits.
        number = parsed.group(2).lstrip("0") or "0"
        label = f"{kind}-{number.zfill(4)}" if kind == "ADR" else f"{kind}-{number}"
        if label in seen_doc_refs:
            return
        seen_doc_refs.add(label)
        rid = _make_id("docref", label)
        if rid not in seen_ids:
            seen_ids.add(rid)
            nodes.append({
                "id": rid,
                "label": label,
                "file_type": "doc_ref",
                "source_file": str_path,
                "source_location": f"L{line}",
            })
        edges.append({
            "source": file_nid,
            "target": rid,
            "relation": "cites",
            "confidence": "EXTRACTED",
            "source_file": str_path,
            "source_location": f"L{line}",
            "weight": 1.0,
        })

    for lineno, line_text in enumerate(source_text.splitlines(), start=1):
        stripped = line_text.strip()
        # str.startswith accepts the whole prefix tuple in one call.
        if stripped.startswith(_JS_RATIONALE_PREFIXES):
            # Remove the comment opener ("//", "*", spaces) before labelling.
            _add_rationale(stripped.lstrip("/* "), lineno)
        # Doc references only count on comment lines, never in code or strings.
        if _JS_COMMENT_LINE_RE.match(line_text):
            for m in _JS_DOC_REF_RE.finditer(stripped):
                _add_doc_ref(m.group(1), lineno)


def extract_svelte(path: Path) -> dict:
Expand Down
69 changes: 69 additions & 0 deletions tests/test_rationale.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,72 @@ def normal(self) -> int:
f"rationale node {r_id} for ``.{decorated_name}()`` is orphaned "
f"(degree 0) after build_from_json"
)


# ── JS/TS rationale + doc-reference extraction ────────────────────────────────


def _write_ts(tmp_path: Path, code: str) -> Path:
    """Write dedented ``code`` to ``tmp_path / "sample.ts"`` and return that path.

    The file is written as UTF-8 explicitly, so fixtures containing non-ASCII
    text do not depend on the platform's locale encoding.
    """
    p = tmp_path / "sample.ts"
    p.write_text(textwrap.dedent(code), encoding="utf-8")
    return p


def test_js_rationale_comment_extracted(tmp_path):
    """A `// NOTE:` line comment in a TS file produces a rationale node."""
    from graphify.extract import extract_js

    ts_file = _write_ts(tmp_path, '''
        // NOTE: must run before compile() or the linker will fail
        export function build(): void {}
    ''')
    extracted = extract_js(ts_file)
    rationale_labels = [
        node["label"]
        for node in extracted["nodes"]
        if node.get("file_type") == "rationale"
    ]
    assert any("NOTE" in label for label in rationale_labels)


def test_js_block_comment_rationale_extracted(tmp_path):
    """A `* WHY:` line inside a block comment produces a rationale node."""
    from graphify.extract import extract_js

    ts_file = _write_ts(tmp_path, '''
        /**
         * WHY: retries are capped because the upstream rate-limits at 10 rps.
         */
        export function fetchData(): void {}
    ''')
    extracted = extract_js(ts_file)
    rationale_labels = [
        node["label"]
        for node in extracted["nodes"]
        if node.get("file_type") == "rationale"
    ]
    assert any("rate-limits" in label for label in rationale_labels)


def test_js_adr_reference_extracted(tmp_path):
    """Two ADR citations on one comment line give two doc_ref nodes and two cites edges."""
    from graphify.extract import extract_js

    ts_file = _write_ts(tmp_path, '''
        // Gateway pattern per ADR-0002; provider selection per ADR-0015.
        export function route(): void {}
    ''')
    extracted = extract_js(ts_file)

    doc_ref_labels = {
        node["label"]
        for node in extracted["nodes"]
        if node.get("file_type") == "doc_ref"
    }
    assert "ADR-0002" in doc_ref_labels and "ADR-0015" in doc_ref_labels

    cite_edges = [
        edge for edge in extracted["edges"] if edge.get("relation") == "cites"
    ]
    assert len(cite_edges) == 2


def test_js_adr_reference_normalized_and_deduped(tmp_path):
    """`ADR-11` and `ADR 0011` collapse into a single `ADR-0011` doc_ref node."""
    from graphify.extract import extract_js

    ts_file = _write_ts(tmp_path, '''
        // See ADR-11 for the trust boundary.
        // ADR 0011 also governs the injection containment below.
        export function guard(): void {}
    ''')
    extracted = extract_js(ts_file)
    doc_ref_labels = [
        node["label"]
        for node in extracted["nodes"]
        if node.get("file_type") == "doc_ref"
    ]
    assert doc_ref_labels == ["ADR-0011"]


def test_js_adr_in_string_literal_not_extracted(tmp_path):
    """An ADR token that only appears inside a string literal is not a doc reference."""
    from graphify.extract import extract_js

    ts_file = _write_ts(tmp_path, '''
        export const banner = "compliant with ADR-0099";
    ''')
    extracted = extract_js(ts_file)
    doc_ref_nodes = [
        node
        for node in extracted["nodes"]
        if node.get("file_type") == "doc_ref"
    ]
    assert doc_ref_nodes == []