From a255e6bf4da684e68af199f4b424dfc6b2b66429 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Fri, 5 Jun 2026 17:06:54 +0100 Subject: [PATCH] fix(parse-knowledge-base): extract CommonMark [](page.md) links in Karpathy code path (#361) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deterministic parser only extracted links via `[[wikilink]]` syntax. A Karpathy-pattern wiki (has index.md + multiple cross-linked .md files + schema) that uses CommonMark `[label](page.md)` links — common on GitHub/GitLab where `[[wikilinks]]` aren't rendered — was detected as karpathy but produced zero deterministic edges, leaving the graph to be inferred entirely from prose by the LLM phase. Inside the existing Karpathy code path, also extract `[label](page.md)` links and resolve them by normalised relative path. Both `parse_index` and the per-article extraction loop now scan both link styles, so category membership and inter-article edges are recovered for mixed and pure CommonMark Karpathy wikis. Pure-wikilink wikis produce the same nodes and edges as before (no regression); the only manifest difference for them is the new `stats["mdLinks"]` counter, which is reported as 0. Resolution handles `pages/x.md`, `./pages/x.md`, and `/pages/x.md` identically; query/fragment suffixes are stripped; image links, external URLs, and fenced code blocks are filtered. Distinct from #342 (still wikilink-only) and #312 (separate doctrine format gated on `index.md` being absent). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_parse_knowledge_base.py | 494 ++++++++++++++++++ .../parse-knowledge-base.py | 291 ++++++++++- 2 files changed, 769 insertions(+), 16 deletions(-) create mode 100644 tests/skill/understand-knowledge/test_parse_knowledge_base.py diff --git a/tests/skill/understand-knowledge/test_parse_knowledge_base.py b/tests/skill/understand-knowledge/test_parse_knowledge_base.py new file mode 100644 index 00000000..6cc443ce --- /dev/null +++ b/tests/skill/understand-knowledge/test_parse_knowledge_base.py @@ -0,0 +1,494 @@ +#!/usr/bin/env python3 +""" +test_parse_knowledge_base.py — Tests for the Karpathy-pattern wiki parser. + +Focus: regression coverage for issue #361 — Karpathy wikis using CommonMark +`[label](page.md)` links yield 0 deterministic edges. + +The fix extracts CommonMark `[](page.md)` links inside the Karpathy code path +alongside the existing `[[wikilink]]` handling. The tests below cover: + + - pure CommonMark wikis (no `[[ ]]` anywhere) — must produce real edges. + - mixed `[[ ]]` + `[](page.md)` wikis — both styles must contribute edges. + - pure-wikilink wikis — regression: must remain byte-for-byte equivalent. + - md-link helpers — filter external URLs, anchors, image links, fenced + code blocks; resolve relative, `./relative`, and `/absolute` targets. + +Run from the repo root: + python3 -m unittest tests.skill.understand-knowledge.test_parse_knowledge_base -v + +Or directly: + python3 tests/skill/understand-knowledge/test_parse_knowledge_base.py +""" + +from __future__ import annotations + +import importlib.util +import shutil +import sys +import tempfile +import unittest +from pathlib import Path +from typing import Any + + +# ── Module loader ───────────────────────────────────────────────────────── +# `parse-knowledge-base.py` has a hyphen in its name, so we cannot `import` +# it directly. Load it via importlib so we can call its module-level helpers. 
+ +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parent.parent.parent +_MODULE_PATH = ( + _REPO_ROOT + / "understand-anything-plugin" + / "skills" + / "understand-knowledge" + / "parse-knowledge-base.py" +) + + +def _load_module() -> Any: + spec = importlib.util.spec_from_file_location( + "parse_knowledge_base", _MODULE_PATH + ) + if spec is None or spec.loader is None: + raise RuntimeError(f"Could not load module from {_MODULE_PATH}") + module = importlib.util.module_from_spec(spec) + sys.modules["parse_knowledge_base"] = module + spec.loader.exec_module(module) + return module + + +pkb = _load_module() + + +# ── Fixture builder ─────────────────────────────────────────────────────── + + +class _WikiFixture: + """Build a temp Karpathy-pattern wiki on disk for parse_wiki().""" + + def __init__(self) -> None: + self.tmp = Path(tempfile.mkdtemp(prefix="ua-pkb-")) + + def write(self, rel_path: str, content: str) -> Path: + p = self.tmp / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content, encoding="utf-8") + return p + + def cleanup(self) -> None: + shutil.rmtree(self.tmp, ignore_errors=True) + + +def _edge_pairs(manifest: dict, edge_type: str | None = None) -> set[tuple[str, str]]: + """Return {(source, target)} for edges in manifest, optionally filtered by type.""" + return { + (e["source"], e["target"]) + for e in manifest["edges"] + if edge_type is None or e["type"] == edge_type + } + + +# ── is_internal_md_target ───────────────────────────────────────────────── + + +class IsInternalMdTargetTests(unittest.TestCase): + """Filter logic for raw markdown-link targets.""" + + def test_accepts_relative_md_paths(self) -> None: + for href in ["page.md", "pages/alpha.md", "./pages/alpha.md", "/pages/alpha.md"]: + with self.subTest(href=href): + self.assertTrue(pkb.is_internal_md_target(href)) + + def test_rejects_external_urls(self) -> None: + for href in [ + "https://example.com/page.md", + "http://example.com", + 
"mailto:foo@example.com", + "ftp://example.com/file.md", + ]: + with self.subTest(href=href): + self.assertFalse(pkb.is_internal_md_target(href)) + + def test_rejects_bare_anchors(self) -> None: + self.assertFalse(pkb.is_internal_md_target("#section")) + self.assertFalse(pkb.is_internal_md_target("#")) + + def test_rejects_non_md_assets(self) -> None: + for href in ["image.png", "data.json", "script.js", "page", "pages/"]: + with self.subTest(href=href): + self.assertFalse(pkb.is_internal_md_target(href)) + + def test_accepts_md_with_anchor_or_query(self) -> None: + # Path-part ends in .md once query/fragment are stripped. + self.assertTrue(pkb.is_internal_md_target("page.md#section")) + self.assertTrue(pkb.is_internal_md_target("page.md?v=1")) + + def test_rejects_empty(self) -> None: + self.assertFalse(pkb.is_internal_md_target("")) + self.assertFalse(pkb.is_internal_md_target(" ")) + + +# ── extract_md_links ────────────────────────────────────────────────────── + + +class ExtractMdLinksTests(unittest.TestCase): + """`[label](page.md)` extraction with image / code-block / URL filters.""" + + def test_extracts_basic_md_link(self) -> None: + links = pkb.extract_md_links("See [Alpha](pages/alpha.md) for details.") + self.assertEqual(len(links), 1) + self.assertEqual(links[0]["target"], "pages/alpha.md") + self.assertEqual(links[0]["display"], "Alpha") + + def test_skips_image_links(self) -> None: + # `![alt](src)` is an image embed, not a page link — never an edge. 
+ text = "![diagram](pages/diagram.md)\n[Alpha](pages/alpha.md)" + links = pkb.extract_md_links(text) + self.assertEqual([l["target"] for l in links], ["pages/alpha.md"]) + + def test_skips_external_urls(self) -> None: + text = "[GitHub](https://github.com/foo/bar) and [Alpha](pages/alpha.md)" + links = pkb.extract_md_links(text) + self.assertEqual([l["target"] for l in links], ["pages/alpha.md"]) + + def test_skips_links_in_fenced_code_blocks(self) -> None: + text = ( + "Live link: [Alpha](pages/alpha.md)\n" + "\n" + "```markdown\n" + "Example: [NotARealEdge](pages/example.md)\n" + "```\n" + ) + links = pkb.extract_md_links(text) + self.assertEqual([l["target"] for l in links], ["pages/alpha.md"]) + + def test_skips_anchors_and_non_md(self) -> None: + text = "[anchor](#section) and [json](data.json) and [Alpha](alpha.md)" + links = pkb.extract_md_links(text) + self.assertEqual([l["target"] for l in links], ["alpha.md"]) + + def test_returns_empty_for_text_without_links(self) -> None: + self.assertEqual(pkb.extract_md_links("plain text, no links"), []) + self.assertEqual(pkb.extract_md_links(""), []) + + def test_preserves_wikilinks_untouched_in_extract_wikilinks(self) -> None: + # Backward-compat sanity: extract_wikilinks is unchanged. + text = "See [[Alpha]] and [Alpha](pages/alpha.md)." + wls = pkb.extract_wikilinks(text) + self.assertEqual([w["target"] for w in wls], ["Alpha"]) + + +# ── _normalise_md_target ────────────────────────────────────────────────── + + +class NormaliseMdTargetTests(unittest.TestCase): + """Path normalisation for md-link resolution.""" + + def test_bare_relative_resolves_against_base_dir(self) -> None: + # File at `pages/alpha.md` links to `beta.md` → resolves to + # `pages/beta.md` relative to wiki_root. 
+ norm = pkb._normalise_md_target( + "beta.md", Path("pages"), Path("/wiki") + ) + self.assertEqual(norm, "pages/beta.md") + + def test_dot_slash_prefix_normalised(self) -> None: + norm = pkb._normalise_md_target( + "./beta.md", Path("pages"), Path("/wiki") + ) + self.assertEqual(norm, "pages/beta.md") + + def test_absolute_path_treated_as_wiki_root_relative(self) -> None: + norm = pkb._normalise_md_target( + "/pages/alpha.md", Path("anywhere"), Path("/wiki") + ) + self.assertEqual(norm, "pages/alpha.md") + + def test_parent_dir_traversal(self) -> None: + # `pages/sub/file.md` links to `../alpha.md` → `pages/alpha.md`. + norm = pkb._normalise_md_target( + "../alpha.md", Path("pages/sub"), Path("/wiki") + ) + self.assertEqual(norm, "pages/alpha.md") + + def test_escape_above_wiki_root_returns_none(self) -> None: + # `pages/alpha.md` links to `../../escape.md` (would escape wiki_root). + norm = pkb._normalise_md_target( + "../../escape.md", Path("pages"), Path("/wiki") + ) + self.assertIsNone(norm) + + def test_query_and_fragment_stripped(self) -> None: + norm = pkb._normalise_md_target( + "pages/alpha.md#section", Path("."), Path("/wiki") + ) + self.assertEqual(norm, "pages/alpha.md") + norm2 = pkb._normalise_md_target( + "pages/alpha.md?v=1", Path("."), Path("/wiki") + ) + self.assertEqual(norm2, "pages/alpha.md") + + def test_normalised_lowercase(self) -> None: + # `path_map` uses lower-cased keys for case-insensitive resolution. + norm = pkb._normalise_md_target( + "Pages/Alpha.MD", Path("."), Path("/wiki") + ) + self.assertEqual(norm, "pages/alpha.md") + + +# ── parse_wiki end-to-end ───────────────────────────────────────────────── + + +class ParseWikiCommonMarkOnlyTests(unittest.TestCase): + """Regression for issue #361: a Karpathy-detected wiki using only + CommonMark `[](page.md)` links must produce deterministic edges. + + Pre-fix behaviour: 0 edges, 0 category memberships → silent degradation. 
+ """ + + def setUp(self) -> None: + self.fix = _WikiFixture() + self.addCleanup(self.fix.cleanup) + # A minimal Karpathy-shaped wiki (has index.md, multiple .md files, + # ≥3 markdown files) but using only CommonMark links. + self.fix.write( + "index.md", + "# Wiki Index\n\n" + "## Topic\n\n" + "- [Alpha](pages/alpha.md)\n" + "- [Beta](pages/beta.md)\n", + ) + self.fix.write( + "pages/alpha.md", + "# Alpha\n\nAlpha relates to [Beta](beta.md) and back to " + "[the index](../index.md).\n", + ) + self.fix.write( + "pages/beta.md", + "# Beta\n\nBeta references [Alpha](alpha.md).\n", + ) + + def test_detected_as_karpathy(self) -> None: + det = pkb.detect_format(self.fix.tmp) + self.assertTrue(det["detected"]) + self.assertEqual(det["format"], "karpathy") + + def test_md_link_edges_resolved(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + related_pairs = _edge_pairs(manifest, "related") + # Alpha → Beta and Beta → Alpha (both via [](beta.md) and [](alpha.md)) + self.assertIn( + ("article:pages/alpha", "article:pages/beta"), related_pairs, + f"Expected alpha→beta edge; got: {related_pairs}", + ) + self.assertIn( + ("article:pages/beta", "article:pages/alpha"), related_pairs, + ) + + def test_categorized_under_edges_from_md_links_in_index(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + cat_pairs = _edge_pairs(manifest, "categorized_under") + # Both alpha and beta should be categorised under "Topic". 
+ self.assertIn(("article:pages/alpha", "topic:topic"), cat_pairs) + self.assertIn(("article:pages/beta", "topic:topic"), cat_pairs) + + def test_category_present_on_article_nodes(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + article_nodes = { + n["id"]: n for n in manifest["nodes"] if n["type"] == "article" + } + self.assertEqual( + article_nodes["article:pages/alpha"]["knowledgeMeta"]["category"], + "Topic", + ) + self.assertEqual( + article_nodes["article:pages/beta"]["knowledgeMeta"]["category"], + "Topic", + ) + + def test_topic_count_includes_md_links(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + topic_node = next(n for n in manifest["nodes"] if n["type"] == "topic") + self.assertEqual(topic_node["name"], "Topic") + # Summary mentions "(2 articles)" — both md-link entries counted. + self.assertIn("(2 articles)", topic_node["summary"]) + + def test_stats_reports_md_links(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + stats = manifest["stats"] + # 3 md-links in body (alpha→beta + alpha→index + beta→alpha). The + # alpha→index link resolves to an infra page (not an article) and is + # counted as unresolved. We assert the floor (>= 2 successful) rather + # than the exact total to keep the test resilient to additions. + self.assertGreaterEqual(stats["mdLinks"], 2) + # The deterministic parser produced edges — the core regression check. + self.assertGreaterEqual(len(manifest["edges"]), 2) + + +class ParseWikiMixedSyntaxTests(unittest.TestCase): + """Mixed Karpathy wiki: some pages use `[[ ]]`, others use `[](page.md)`. + Both styles must contribute edges; neither path may regress.""" + + def setUp(self) -> None: + self.fix = _WikiFixture() + self.addCleanup(self.fix.cleanup) + # Index uses both syntaxes side-by-side under a single category. 
+ self.fix.write( + "index.md", + "# Wiki Index\n\n" + "## Topic\n\n" + "- [[alpha]]\n" + "- [Beta](pages/beta.md)\n", + ) + self.fix.write( + "alpha.md", + "# Alpha\n\nAlpha links via wikilink to [[beta]] and via " + "md-link to [Gamma](pages/gamma.md).\n", + ) + self.fix.write( + "pages/beta.md", + "# Beta\n\nBeta references [Alpha](../alpha.md).\n", + ) + self.fix.write( + "pages/gamma.md", + "# Gamma\n\nGamma links back via [[alpha]].\n", + ) + + def test_wikilink_edges_preserved(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + related = _edge_pairs(manifest, "related") + # alpha → beta via [[beta]]; gamma → alpha via [[alpha]] + self.assertIn(("article:alpha", "article:pages/beta"), related) + self.assertIn(("article:pages/gamma", "article:alpha"), related) + + def test_md_link_edges_added(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + related = _edge_pairs(manifest, "related") + # alpha → gamma via [Gamma](pages/gamma.md) + self.assertIn(("article:alpha", "article:pages/gamma"), related) + # beta → alpha via [Alpha](../alpha.md) + self.assertIn(("article:pages/beta", "article:alpha"), related) + + def test_mixed_category_lookups(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + article_nodes = { + n["id"]: n for n in manifest["nodes"] if n["type"] == "article" + } + # alpha categorised via [[alpha]] wikilink in index. + self.assertEqual( + article_nodes["article:alpha"]["knowledgeMeta"]["category"], + "Topic", + ) + # beta categorised via [Beta](pages/beta.md) md-link in index. 
+ self.assertEqual( + article_nodes["article:pages/beta"]["knowledgeMeta"]["category"], + "Topic", + ) + + def test_categorized_under_mix(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + cat_pairs = _edge_pairs(manifest, "categorized_under") + self.assertIn(("article:alpha", "topic:topic"), cat_pairs) + self.assertIn(("article:pages/beta", "topic:topic"), cat_pairs) + + +class ParseWikiPureWikilinkRegressionTests(unittest.TestCase): + """Existing pure-wikilink Karpathy wikis must produce the same edges as + before — no regression from the md-link extraction additions.""" + + def setUp(self) -> None: + self.fix = _WikiFixture() + self.addCleanup(self.fix.cleanup) + self.fix.write( + "index.md", + "# Wiki Index\n\n## Topic\n\n- [[alpha]]\n- [[beta]]\n", + ) + self.fix.write( + "alpha.md", + "# Alpha\n\nAlpha relates to [[beta]].\n", + ) + self.fix.write( + "beta.md", + "# Beta\n\nBeta relates to [[alpha]].\n", + ) + + def test_no_md_link_stats_when_pure_wikilink(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + # mdLinks key exists but is 0 — no regression in counter behaviour. + self.assertEqual(manifest["stats"]["mdLinks"], 0) + # alpha.md and beta.md each carry one wikilink in their bodies + # (`[[beta]]` and `[[alpha]]` respectively). Wikilinks inside + # index.md are tallied by `parse_index`, not by `stats["wikilinks"]`, + # which only counts links inside article bodies. 
+ self.assertEqual(manifest["stats"]["wikilinks"], 2) + + def test_wikilink_edges_match_expected(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + related = _edge_pairs(manifest, "related") + self.assertEqual( + related, + {("article:alpha", "article:beta"), ("article:beta", "article:alpha")}, + ) + + def test_categorized_under_unchanged(self) -> None: + manifest = pkb.parse_wiki(self.fix.tmp) + cat_pairs = _edge_pairs(manifest, "categorized_under") + self.assertEqual( + cat_pairs, + {("article:alpha", "topic:topic"), ("article:beta", "topic:topic")}, + ) + + def test_no_mdlinks_key_in_knowledge_meta(self) -> None: + # Articles without md-links shouldn't carry an empty `mdLinks` key — + # keeps the manifest output identical to pre-fix for pure wikilink + # wikis. + manifest = pkb.parse_wiki(self.fix.tmp) + for node in manifest["nodes"]: + if node["type"] == "article": + self.assertNotIn( + "mdLinks", node.get("knowledgeMeta", {}), + f"node {node['id']} unexpectedly has mdLinks key", + ) + + +# ── resolve_md_link ─────────────────────────────────────────────────────── + + +class ResolveMdLinkTests(unittest.TestCase): + """`resolve_md_link` direct-call tests against a synthetic path_map.""" + + def test_resolves_relative(self) -> None: + path_map = {"pages/alpha.md": "pages/alpha"} + article_ids = {"article:pages/alpha"} + resolved = pkb.resolve_md_link( + "alpha.md", Path("pages"), Path("/wiki"), path_map, article_ids, + ) + self.assertEqual(resolved, "article:pages/alpha") + + def test_resolves_absolute(self) -> None: + path_map = {"pages/alpha.md": "pages/alpha"} + article_ids = {"article:pages/alpha"} + resolved = pkb.resolve_md_link( + "/pages/alpha.md", Path("other"), Path("/wiki"), path_map, article_ids, + ) + self.assertEqual(resolved, "article:pages/alpha") + + def test_returns_none_for_unresolved(self) -> None: + resolved = pkb.resolve_md_link( + "missing.md", Path("."), Path("/wiki"), {}, set(), + ) + self.assertIsNone(resolved) + + def 
test_returns_none_when_not_in_node_set(self) -> None: + path_map = {"pages/alpha.md": "pages/alpha"} + # node_ids deliberately empty — article is in path_map but not nodes. + resolved = pkb.resolve_md_link( + "alpha.md", Path("pages"), Path("/wiki"), path_map, set(), + ) + self.assertIsNone(resolved) + + +if __name__ == "__main__": + unittest.main() diff --git a/understand-anything-plugin/skills/understand-knowledge/parse-knowledge-base.py b/understand-anything-plugin/skills/understand-knowledge/parse-knowledge-base.py index d6070512..df68dd1d 100644 --- a/understand-anything-plugin/skills/understand-knowledge/parse-knowledge-base.py +++ b/understand-anything-plugin/skills/understand-knowledge/parse-knowledge-base.py @@ -23,10 +23,21 @@ # Regex patterns # --------------------------------------------------------------------------- WIKILINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]") +# CommonMark inline link: [label](target). +# - `(? list[dict]: return links +def is_internal_md_target(target: str) -> bool: + """Return True if a markdown-link target points at an internal .md page. + + Filters out external URLs (http://, mailto:, etc.), bare anchors + (`#section`), and explicit non-markdown asset paths. Targets without a + `.md` extension are rejected — this parser only links between pages. + """ + if not target: + return False + t = target.strip() + if not t: + return False + # Pure anchor inside the current document — not a page link. + if t.startswith("#"): + return False + # External / scheme-prefixed URLs (http://, https://, mailto:, ftp:, …). + if _URL_SCHEME_RE.match(t): + return False + # Strip query / fragment for extension check. + path_part = t.split("#", 1)[0].split("?", 1)[0] + if not path_part: + return False + # Only resolve targets that point at a markdown file. + return path_part.lower().endswith(".md") + + +def extract_md_links(text: str) -> list[dict]: + """Extract CommonMark `[label](page.md)` links pointing at internal .md + pages. 
+ + Skips image links (`![]()`), external URLs, anchors, and non-markdown + assets. Returned targets are raw (path-relative as written) — call + `resolve_md_link` to map them to article IDs. + + Fenced code blocks are stripped before scanning so that a syntax-coloured + example link inside ```` ```md ```` does not get treated as a real edge. + """ + if not text: + return [] + # Strip fenced code blocks before scanning. We can't reliably tell which + # links inside a code fence are intentional, so we exclude them all — + # mirrors how renderers display them as inert text. + stripped = re.sub(r"```[\s\S]*?```", "", text) + links = [] + for m in MD_LINK_RE.finditer(stripped): + target = m.group(2).strip() + if not is_internal_md_target(target): + continue + links.append({ + "target": target, + "display": m.group(1).strip() or None, + }) + return links + + def extract_headings(text: str) -> list[dict]: """Extract all markdown headings with level and text.""" return [ @@ -168,7 +234,20 @@ def extract_h1(text: str) -> str: # --------------------------------------------------------------------------- def parse_index(index_path: Path) -> list[dict]: - """Parse index.md to extract categories from ## headings and their wikilinks.""" + """Parse index.md to extract categories from ## headings and their links. + + Recognises both `[[wikilink]]` and CommonMark `[label](page.md)` styles + under each `## Section` heading. Returns categories with two parallel + target lists: + + - `articles` — raw wikilink targets (stems or filenames), kept as + strings for backward compatibility with existing call sites. + - `md_links` — raw CommonMark link targets (relative paths) that need + path-based resolution. + + The two lists are populated independently so a wiki that uses only one + syntax (or both) keeps working. 
+ """ if not index_path.is_file(): return [] text = index_path.read_text(encoding="utf-8", errors="replace") @@ -182,6 +261,7 @@ def parse_index(index_path: Path) -> list[dict]: current_category = { "name": sec_match.group(1).strip(), "articles": [], + "md_links": [], } categories.append(current_category) continue @@ -190,6 +270,15 @@ def parse_index(index_path: Path) -> list[dict]: if current_category: for wl in WIKILINK_RE.finditer(line): current_category["articles"].append(wl.group(1).strip()) + # Also collect CommonMark `[label](page.md)` links so a Karpathy + # wiki rendered on GitHub/GitLab (which doesn't render `[[ ]]`) + # still produces deterministic category membership. Each link is + # filtered through `is_internal_md_target` so external URLs and + # image links are ignored. + for ml in MD_LINK_RE.finditer(line): + target = ml.group(2).strip() + if is_internal_md_target(target): + current_category["md_links"].append(target) return categories @@ -275,6 +364,94 @@ def resolve_wikilink(target: str, name_map: dict[str, str], node_ids: set[str] | return None +def build_path_to_stem_map(wiki_root: Path) -> dict[str, str]: + """Build a case-insensitive map from `posix-style-relative-path.md` to + article stem (relative to wiki_root, no extension). + + Used by `resolve_md_link` so CommonMark `[label](page.md)` targets resolve + by relative path even when the basename collides with another file (where + `name_map` deliberately drops the ambiguous bare-basename entry). + """ + path_map: dict[str, str] = {} + for md_file in wiki_root.rglob("*.md"): + rel = md_file.relative_to(wiki_root) + stem = rel.with_suffix("").as_posix() + path_map[rel.as_posix().lower()] = stem + return path_map + + +def _normalise_md_target(target: str, base_dir: Path, wiki_root: Path) -> str | None: + """Normalise a CommonMark link `target` to a posix path relative to + `wiki_root`. + + `target` is the raw href as written in the markdown source. 
`base_dir` is + the directory of the file containing the link (relative to `wiki_root` — + use `Path('.')` for files at the wiki root). Behaviour: + + - strips a trailing `#anchor` and `?query`; + - resolves `./`, `../`, and bare relative paths against `base_dir`; + - treats absolute paths (`/pages/x.md`) as relative to `wiki_root`; + - rejects paths that escape `wiki_root` (returns None). + + Returns the lower-cased posix relative path (e.g. `"pages/alpha.md"`) or + None if the target is unresolvable. + """ + if not target: + return None + # Strip query/fragment. + href = target.split("#", 1)[0].split("?", 1)[0].strip() + if not href: + return None + # Absolute paths in the wiki are treated as relative to the wiki root — + # mirrors how GitHub renders `/pages/x.md` in repo-rooted markdown. + if href.startswith("/"): + candidate = Path(href.lstrip("/")) + else: + candidate = base_dir / href + # Manual normalisation of `.` and `..` segments without touching the + # filesystem (Path.resolve would follow symlinks and require existence). + parts: list[str] = [] + for part in candidate.as_posix().split("/"): + if part in ("", "."): + continue + if part == "..": + if not parts: + # Escapes wiki_root — unresolvable. + return None + parts.pop() + else: + parts.append(part) + if not parts: + return None + return "/".join(parts).lower() + + +def resolve_md_link( + target: str, + base_dir: Path, + wiki_root: Path, + path_map: dict[str, str], + node_ids: set[str] | None = None, +) -> str | None: + """Resolve a CommonMark `[label](path.md)` target to an article node ID. + + Resolution is by normalised relative path (`pages/alpha.md`, + `./pages/alpha.md`, and `/pages/alpha.md` all map to the same key). + Returns None when the target cannot be matched against `path_map` or when + `node_ids` is provided and the resolved candidate is not in it. 
+ """ + norm = _normalise_md_target(target, base_dir, wiki_root) + if not norm: + return None + stem = path_map.get(norm) + if not stem: + return None + candidate = f"article:{stem}" + if node_ids is not None and candidate not in node_ids: + return None + return candidate + + def parse_wiki(root: Path) -> dict: """Parse a Karpathy-pattern wiki and produce the scan manifest.""" detection = detect_format(root) @@ -286,8 +463,10 @@ def parse_wiki(root: Path) -> dict: wiki_root = Path(detection["wiki_root"]) raw_root = root / "raw" - # Build name resolution map + # Build name resolution map (wikilinks: by stem/basename) name_map = build_name_to_stem_map(wiki_root) + # Build path resolution map (md-links: by full relative path) + path_map = build_path_to_stem_map(wiki_root) # Find index.md and log.md index_path = wiki_root / "index.md" @@ -301,11 +480,33 @@ def parse_wiki(root: Path) -> dict: categories = parse_index(index_path) log_entries = parse_log(log_path) - # Build category lookup: wikilink target → category name + # Resolve the index file's directory relative to wiki_root. This is the + # base against which md-link targets inside index.md are resolved. When + # the index lives outside wiki_root (e.g. repo-root index.md while + # wiki_root is root/wiki), `_normalise_md_target` will reject targets + # that escape via `..` — those won't have matching article IDs anyway. + try: + index_base = index_path.parent.relative_to(wiki_root) + except ValueError: + index_base = Path(".") + + # Build category lookups: + # - by wikilink target (lower-cased stem/basename) — existing behaviour + # - by md-link relative-stem (resolved against the index file's directory) + # The md_category_lookup is keyed by the resolved `article:` ID so + # the per-article lookup below is a single dict access. 
category_lookup: dict[str, str] = {} + md_category_lookup: dict[str, str] = {} for cat in categories: for article_target in cat["articles"]: category_lookup[article_target.lower()] = cat["name"] + for md_target in cat.get("md_links", []): + norm = _normalise_md_target(md_target, index_base, wiki_root) + if not norm: + continue + stem = path_map.get(norm) + if stem: + md_category_lookup[f"article:{stem}"] = cat["name"] # --- Pre-compute article IDs (for edge resolution validation) --- # Only skip infra files at the wiki root level, not in subdirectories @@ -323,7 +524,14 @@ def parse_wiki(root: Path) -> dict: nodes = [] edges = [] warnings = [] - stats = {"articles": 0, "sources": 0, "topics": 0, "wikilinks": 0, "unresolved": 0} + stats = { + "articles": 0, + "sources": 0, + "topics": 0, + "wikilinks": 0, + "mdLinks": 0, + "unresolved": 0, + } for md_file in sorted(wiki_root.rglob("*.md")): rel = md_file.relative_to(wiki_root) @@ -338,17 +546,22 @@ def parse_wiki(root: Path) -> dict: h1 = extract_h1(text) frontmatter = extract_frontmatter(text) wikilinks = extract_wikilinks(text) + md_links = extract_md_links(text) headings = extract_headings(text) code_langs = extract_code_blocks(text) summary = extract_first_paragraph(text) line_count = text.count("\n") + 1 word_count = len(text.split()) - # Derive category from index.md lookup + node_id = f"article:{stem}" + + # Derive category from index.md lookup. + # Order: wikilink basename → wikilink stem → md-link by article ID. 
category = category_lookup.get(basename.lower(), "") if not category: - # Try stem match category = category_lookup.get(stem.lower(), "") + if not category: + category = md_category_lookup.get(node_id, "") # Derive tags (deduplicated) tag_set: set[str] = set() @@ -361,16 +574,15 @@ def parse_wiki(root: Path) -> dict: tag_set.update(t.strip() for t in fm_tags.split(",") if t.strip()) tags = sorted(tag_set) - # Complexity from wikilink density - wl_count = len(wikilinks) - if wl_count > 15: + # Complexity from total link density (wikilinks + md-links). + link_count = len(wikilinks) + len(md_links) + if link_count > 15: complexity = "complex" - elif wl_count > 5: + elif link_count > 5: complexity = "moderate" else: complexity = "simple" - node_id = f"article:{stem}" nodes.append({ "id": node_id, "type": "article", @@ -381,12 +593,14 @@ def parse_wiki(root: Path) -> dict: "complexity": complexity, "knowledgeMeta": { "wikilinks": [wl["target"] for wl in wikilinks], + **({"mdLinks": [ml["target"] for ml in md_links]} if md_links else {}), **({"category": category} if category else {}), "content": text[:3000], # First 3000 chars for LLM analysis }, }) stats["articles"] += 1 - stats["wikilinks"] += wl_count + stats["wikilinks"] += len(wikilinks) + stats["mdLinks"] += len(md_links) # Build edges from wikilinks (resolve against known article IDs) for wl in wikilinks: @@ -403,20 +617,43 @@ def parse_wiki(root: Path) -> dict: warnings.append(f"Unresolved wikilink: [[{wl['target']}]] in {rel}") stats["unresolved"] += 1 + # Build edges from CommonMark md-links (resolved relative to this + # file's directory). Same edge shape as wikilinks so downstream + # consumers stay unchanged. 
+ for ml in md_links: + target_id = resolve_md_link( + ml["target"], rel.parent, wiki_root, path_map, article_ids + ) + if target_id and target_id != node_id: + edges.append({ + "source": node_id, + "target": target_id, + "type": "related", + "direction": "forward", + "weight": 0.7, + }) + elif not target_id: + warnings.append(f"Unresolved md-link: [{ml['display']}]({ml['target']}) in {rel}") + stats["unresolved"] += 1 + # --- Build topic nodes from index.md categories --- for cat in categories: topic_id = f"topic:{cat['name'].lower().replace(' ', '-')}" + md_link_count = len(cat.get("md_links", [])) + article_count = len(cat["articles"]) + md_link_count nodes.append({ "id": topic_id, "type": "topic", "name": cat["name"], - "summary": f"Category from index: {cat['name']} ({len(cat['articles'])} articles)", + "summary": f"Category from index: {cat['name']} ({article_count} articles)", "tags": ["category"], "complexity": "simple", }) stats["topics"] += 1 - # categorized_under edges (only resolve to known article nodes) + # categorized_under edges (only resolve to known article nodes). + # Wikilink targets resolve via name_map; CommonMark md-link targets + # resolve by relative path via path_map. 
for article_target in cat["articles"]: article_id = resolve_wikilink(article_target, name_map, article_ids) if article_id: @@ -427,6 +664,18 @@ def parse_wiki(root: Path) -> dict: "direction": "forward", "weight": 0.6, }) + for md_target in cat.get("md_links", []): + article_id = resolve_md_link( + md_target, index_base, wiki_root, path_map, article_ids + ) + if article_id: + edges.append({ + "source": article_id, + "target": topic_id, + "type": "categorized_under", + "direction": "forward", + "weight": 0.6, + }) # --- Build source nodes from raw/ --- if raw_root.is_dir(): @@ -471,7 +720,13 @@ def parse_wiki(root: Path) -> dict: return { "format": "karpathy", "stats": stats, - "categories": [{"name": c["name"], "count": len(c["articles"])} for c in categories], + "categories": [ + { + "name": c["name"], + "count": len(c["articles"]) + len(c.get("md_links", [])), + } + for c in categories + ], "logEntries": len(log_entries), "nodes": nodes, "edges": deduped_edges, @@ -499,8 +754,12 @@ def main(): # Report to stderr s = manifest["stats"] + md_links = s.get("mdLinks", 0) + link_summary = f"{s['wikilinks']} wikilinks" + if md_links: + link_summary += f", {md_links} md-links" print(f"[parse] Karpathy wiki: {s['articles']} articles, {s['sources']} sources, " - f"{s['topics']} topics, {s['wikilinks']} wikilinks " + f"{s['topics']} topics, {link_summary} " f"({s['unresolved']} unresolved)", file=sys.stderr) print(f"[parse] Output: {out_path}", file=sys.stderr)