From 6796a9aa4b29d021beb6a8425e5b04f2e04e5b7a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 11 May 2026 20:42:30 +0000 Subject: [PATCH] issue #34: add broken-image pre-commit hook for internal refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scans staged HTML/Markdown for image references (src=, image=, and Markdown ![]()), and fails the commit if a root-relative path doesn't exist in the repo. Catches typos and the dibs-web01-style failure mode where a referenced image goes missing. Scope is internal refs only — external URLs and Liquid-templated paths are skipped. Lychee in site-health CI already covers external links, and pre-commit shouldn't make commits depend on network. CI gets the new hook automatically via the existing pre-commit workflow. Stdlib-only Python (no new dependencies). --- .pre-commit-config.yaml | 6 +++ README.md | 3 ++ scripts/check_image_refs.py | 89 +++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100755 scripts/check_image_refs.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4b54c8..bcebe6c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,6 +29,12 @@ repos: language: system types_or: [image, svg] + - id: image-refs + name: image-refs (verify internal image paths resolve) + entry: python3 scripts/check_image_refs.py + language: system + types_or: [html, markdown] + - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: diff --git a/README.md b/README.md index ba61ca5..13d269d 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ After that, every `git commit` will automatically: - Minify staged SVGs with `svgo` - Block any image still over 1 MB after compression, and warn on images between 500 KB and 1 MB (see "Image size policy" below) +- Verify internal image references in HTML/Markdown actually resolve + to a file in the repo (catches typos and orphaned references; external + URLs are 
covered by lychee in the site-health CI workflow) - Strip trailing whitespace, fix mixed line endings, ensure files end with a final newline - Validate YAML and JSON files (`_data/`, `_config.yml`, etc.) diff --git a/scripts/check_image_refs.py b/scripts/check_image_refs.py new file mode 100755 index 0000000..4b72144 --- /dev/null +++ b/scripts/check_image_refs.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Verify image references in staged HTML and Markdown resolve to local files. + +External URLs (http://, https://, //) and Liquid-templated paths are skipped — +lychee covers those in CI. This hook focuses on internal refs, which is where +the value-add is: it catches typos, deleted images that someone forgot to +update, and the "broken /images/foo.jpg" failure mode that just hit us with +the dibs-web01 migration. +""" + +from __future__ import annotations + +import re +import sys +from pathlib import Path + +# Catches src="...", image="...", and Markdown ![alt](url). +# Single-line only; multi-line attribute values are exotic and not worth the +# regex complexity. 
URL_PATTERNS = [
    # Quoted attribute values: src="..." / image="..." (either quote style).
    re.compile(r'(?:src|image)=["\']([^"\']+)["\']'),
    # Markdown images: ![alt](url ...) -- the URL ends at whitespace or ")".
    re.compile(r'!\[[^\]]*\]\(([^)\s]+)'),
]

IMG_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".bmp"}
EXTERNAL_PREFIXES = ("http://", "https://", "//", "data:", "mailto:")
# Matches the opening of a Liquid tag ("{%") or output expression ("{{").
LIQUID_PAT = re.compile(r"\{[%{]")


def _strip_query_and_fragment(url: str) -> str:
    """Return *url* without its ``?query`` and ``#fragment`` parts."""
    return url.split("?")[0].split("#")[0]


def is_skippable(url: str) -> bool:
    """Return True if *url* is out of scope for this check.

    A URL is skipped when it starts with one of ``EXTERNAL_PREFIXES`` or
    contains Liquid templating (``{{`` or ``{%``), since neither can be
    resolved to a file on disk without rendering or network access.
    """
    return url.startswith(EXTERNAL_PREFIXES) or LIQUID_PAT.search(url) is not None


def is_image_path(url: str) -> bool:
    """Return True if *url*'s path ends in a known image extension.

    The query string and fragment are ignored and the comparison is
    case-insensitive.
    """
    return Path(_strip_query_and_fragment(url)).suffix.lower() in IMG_EXTS


def resolve(url: str, repo_root: Path) -> Path:
    """Resolve a root-relative URL to a filesystem path under *repo_root*.

    The query string and fragment are dropped, percent-escapes are decoded
    (so ``/img/my%20photo.jpg`` maps to ``img/my photo.jpg``), and leading
    slashes are removed before joining onto *repo_root*.
    """
    # Function-scope import: keeps this block self-contained without
    # touching the module-level import list.
    from urllib.parse import unquote

    # Decode *before* stripping slashes so an encoded leading "%2F" cannot
    # turn the joined path into an absolute one that escapes repo_root.
    clean = unquote(_strip_query_and_fragment(url)).lstrip("/")
    return repo_root / clean


def line_of(text: str, pos: int) -> int:
    """Return the 1-based line number of character offset *pos* in *text*."""
    return text.count("\n", 0, pos) + 1


def check_file(path: Path, repo_root: Path) -> list[tuple[int, str, Path]]:
    """Find root-relative image references in *path* that do not resolve.

    Args:
        path: HTML or Markdown file to scan.
        repo_root: Directory that root-relative URLs are resolved against.

    Returns:
        ``(line_number, url, resolved_path)`` tuples, one per broken
        reference, ordered by line number.
    """
    # Pin the encoding so results do not depend on the machine's locale;
    # undecodable bytes are replaced rather than aborting the hook.
    text = path.read_text(encoding="utf-8", errors="replace")
    failures: list[tuple[int, str, Path]] = []
    for pattern in URL_PATTERNS:
        for match in pattern.finditer(text):
            url = match.group(1)
            if is_skippable(url) or not is_image_path(url):
                continue
            if not url.startswith("/"):
                # Non-leading-slash relative paths are ambiguous under Jekyll
                # (depends on the served URL of the page). Skip them rather
                # than risk false positives.
                continue
            target = resolve(url, repo_root)
            # is_file() rather than exists(): a directory at the target path
            # is still a broken image reference.
            if not target.is_file():
                failures.append((line_of(text, match.start()), url, target))
    # Patterns are scanned one after another, so restore document order.
    failures.sort(key=lambda failure: failure[0])
    return failures


def main(argv: list[str]) -> int:
    """Check every file named in *argv* and report broken image references.

    Each broken reference is printed to stderr as
    ``file:line: broken image ref ...``.

    Returns:
        1 if at least one broken reference was found, otherwise 0.
    """
    repo_root = Path.cwd()
    total = 0
    for arg in argv:
        path = Path(arg)
        for line, url, target in check_file(path, repo_root):
            rel = target.relative_to(repo_root) if target.is_absolute() else target
            print(
                f"{path}:{line}: broken image ref {url!r} "
                f"(looked for {rel})",
                file=sys.stderr,
            )
            total += 1
    return 1 if total else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))