Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ repos:
language: system
types_or: [image, svg]

- id: image-refs
name: image-refs (verify internal image paths resolve)
entry: python3 scripts/check_image_refs.py
language: system
types_or: [html, markdown]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ After that, every `git commit` will automatically:
- Minify staged SVGs with `svgo`
- Block any image still over 1 MB after compression, and warn on images
between 500 KB and 1 MB (see "Image size policy" below)
- Verify internal image references in HTML/Markdown actually resolve
to a file in the repo (catches typos and orphaned references; external
URLs are covered by lychee in the site-health CI workflow)
- Strip trailing whitespace, fix mixed line endings, ensure files end
with a final newline
- Validate YAML and JSON files (`_data/`, `_config.yml`, etc.)
Expand Down
89 changes: 89 additions & 0 deletions scripts/check_image_refs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Verify image references in staged HTML and Markdown resolve to local files.

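External URLs (http://, https://, //), data: and mailto: URIs, and
Liquid-templated paths are skipped — lychee covers external links in CI.
Relative paths without a leading slash are skipped as well, because what they
resolve to depends on the served URL of the page. This hook focuses on
root-relative internal refs, which is where the value-add is: it catches
typos, deleted images that someone forgot to update, and the
"broken /images/foo.jpg" failure mode that just hit us with the dibs-web01
migration.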
"""

from __future__ import annotations

import re
import sys
from pathlib import Path

# Catches src="...", image="...", and Markdown ![alt](url).
# Single-line only; multi-line attribute values are exotic and not worth the
# regex complexity.
# NOTE: the first pattern is not anchored on a word boundary, so attribute
# names that merely end in "src" or "image" (e.g. data-src=) match as well,
# and the opening and closing quote characters are not required to be equal.
# The Markdown pattern captures the URL up to the first ")" or whitespace, so
# an optional link title after the URL is not part of the capture.
URL_PATTERNS = [
    re.compile(r'(?:src|image)=["\']([^"\']+)["\']'),
    re.compile(r'!\[[^\]]*\]\(([^)\s]+)'),
]

# File suffixes (lowercase, leading dot) that is_image_path() treats as images.
IMG_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".bmp"}
# URL prefixes that is_skippable() treats as "not a local file". Kept as a
# tuple because str.startswith() accepts a tuple of candidates.
EXTERNAL_PREFIXES = ("http://", "https://", "//", "data:", "mailto:")
# Matches the opening of a Liquid tag ("{%") or output expression ("{{").
LIQUID_PAT = re.compile(r"\{[%{]")


def is_skippable(url: str) -> bool:
    """Return True when *url* should not be checked against the filesystem.

    A reference is skipped when it starts with one of ``EXTERNAL_PREFIXES``
    or when it contains a Liquid opener (``{%`` or ``{{``), since a templated
    path cannot be resolved without rendering the template.
    """
    looks_external = url.startswith(EXTERNAL_PREFIXES)
    return looks_external or LIQUID_PAT.search(url) is not None


def is_image_path(url: str) -> bool:
    """Return True when the path part of *url* ends in a known image suffix.

    Anything from the first ``?`` or ``#`` onward is ignored, and the suffix
    comparison against ``IMG_EXTS`` is case-insensitive.
    """
    before_query, _, _ = url.partition("?")
    bare_path, _, _ = before_query.partition("#")
    extension = Path(bare_path).suffix
    return extension.lower() in IMG_EXTS


def resolve(url: str, repo_root: Path) -> Path:
    """Resolve a root-relative URL to a filesystem path under repo root.

    The query string and fragment are dropped, percent-escapes are decoded
    (so ``/images/my%20photo.jpg`` maps to the file ``my photo.jpg``), and
    leading slashes are removed before joining onto *repo_root*.

    Args:
        url: Root-relative reference as written in the HTML/Markdown source.
        repo_root: Directory that ``/`` in *url* corresponds to.

    Returns:
        The path under *repo_root* that the reference points at. The path is
        not checked for existence here.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not need to change.
    from urllib.parse import unquote

    # Cut the query/fragment BEFORE decoding so an escaped "%3F" or "%23"
    # inside a file name is not mistaken for a delimiter.
    raw_path = url.split("?")[0].split("#")[0]
    # Strip slashes AFTER decoding: a decoded leading "/" would otherwise make
    # the right-hand operand absolute and discard repo_root in the join.
    return repo_root / unquote(raw_path).lstrip("/")


def line_of(text: str, pos: int) -> int:
    """Return the 1-based line number of character offset *pos* in *text*."""
    preceding = text[:pos]
    return 1 + preceding.count("\n")


def check_file(path: Path, repo_root: Path) -> list[tuple[int, str, Path]]:
    """Find root-relative image references in *path* that do not exist on disk.

    Args:
        path: HTML or Markdown file to scan.
        repo_root: Directory that a leading ``/`` in a reference maps to.

    Returns:
        One ``(line_number, url, resolved_target)`` tuple per broken
        reference, ordered by line number.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # encoding of the machine running the hook; undecodable bytes are replaced
    # rather than aborting the whole check.
    text = path.read_text(encoding="utf-8", errors="replace")
    failures: list[tuple[int, str, Path]] = []
    for pattern in URL_PATTERNS:
        for match in pattern.finditer(text):
            url = match.group(1)
            if is_skippable(url) or not is_image_path(url):
                continue
            if not url.startswith("/"):
                # Non-leading-slash relative paths are ambiguous under Jekyll
                # (depends on the served URL of the page). Skip them rather
                # than risk false positives — lychee will catch them in CI
                # via the built _site/.
                continue
            target = resolve(url, repo_root)
            if not target.exists():
                failures.append((line_of(text, match.start()), url, target))
    # Matches are collected pattern by pattern; sort so the report reads top
    # to bottom. The sort is stable, so same-line hits keep discovery order.
    failures.sort(key=lambda failure: failure[0])
    return failures


def main(argv: list[str]) -> int:
    """Check every file named in *argv* and report broken image references.

    The current working directory is used as the repo root. Each broken
    reference is written to stderr as ``file:line: broken image ref ...``.

    Returns:
        1 if at least one broken reference was found, otherwise 0.
    """
    root = Path.cwd()
    broken = 0
    for name in argv:
        source = Path(name)
        for lineno, ref, resolved in check_file(source, root):
            # Show the target relative to the repo root when possible.
            if resolved.is_absolute():
                shown = resolved.relative_to(root)
            else:
                shown = resolved
            message = (
                f"{source}:{lineno}: broken image ref {ref!r} "
                f"(looked for {shown})"
            )
            print(message, file=sys.stderr)
            broken += 1
    if broken:
        return 1
    return 0


# Script entry point: the arguments after the program name are the files to
# check, and main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
Loading