diff --git a/.github/actions/setup-dev-image/action.yml b/.github/actions/setup-dev-image/action.yml index e4c126e..f4a6fb6 100644 --- a/.github/actions/setup-dev-image/action.yml +++ b/.github/actions/setup-dev-image/action.yml @@ -21,7 +21,7 @@ description: Pull (or build) the afm dev image, install `just`, and enable sccac runs: using: composite steps: - - uses: taiki-e/install-action@v2 + - uses: taiki-e/install-action@7a79fe8c3a13344501c80d99cae481c1c9085912 # v2 with: tool: just @@ -30,7 +30,7 @@ runs: # inside the dev image. Setting the three env vars below routes # container-side `sccache` calls through the GHA cache via the # runner's ACTIONS_CACHE_URL + ACTIONS_RUNTIME_TOKEN. - - uses: mozilla-actions/sccache-action@v0.0.9 + - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9 # Forward the GHA cache env into the compose services. Without these # the container sccache silently falls back to the disk volume at @@ -68,7 +68,7 @@ runs: # fallback below. Same-repo runs always log in. 
- name: Log in to GHCR (same-repo runs only) if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }} - uses: docker/login-action@v3 + uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1e307dc..d26b67d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,7 +57,7 @@ jobs: uses: dtolnay/rust-toolchain@stable with: toolchain: 1.95.0 - - uses: Swatinem/rust-cache@v2 + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 with: key: msrv - run: cargo check --workspace --all-targets @@ -159,7 +159,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: taiki-e/install-action@v2 + - uses: taiki-e/install-action@7a79fe8c3a13344501c80d99cae481c1c9085912 # v2.81.10 with: tool: just - run: just verify-version-pins @@ -207,15 +207,23 @@ jobs: # Phase-1 partial (deny, audit gate cheaply on Cargo.lock alone), but # `udeps` is nightly-bound and needs the dep graph compiled — so the # job gates on check to skip wasted work when typecheck has already - # failed. `fail-fast: false` keeps `deny` failures from masking an - # `audit` failure and vice-versa. + # failed. `fail-fast: false` keeps one leg's failure from masking the + # others. + # + # `audit-comrak` (C1/F4) is its own matrix leg so a RUSTSEC advisory + # against the vendored path-dep comrak (which `cargo audit`/`cargo deny` + # can't see — it's absent from the registry dep graph) surfaces as a + # distinctly-named failing check. It also runs transitively as a + # dependency of the `audit` leg, so local `just ci` is covered too; the + # standalone leg just sharpens the CI signal. No scheduled workflow is + # used (user preference: no cron) — this runs per-PR. 
audit: runs-on: ubuntu-latest needs: check strategy: fail-fast: false matrix: - target: [deny, audit, udeps] + target: [deny, audit, audit-comrak, udeps] steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: ./.github/actions/setup-dev-image diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 071b444..2e7fefb 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -6,8 +6,9 @@ # *changes* a PR introduces and comments the diff directly in the # review UI. # -# Action pinning: @v tags for now; Dependabot will upgrade to -# commit SHA pins on its first sweep. +# Action pinning: the actions below are pinned to full commit SHAs +# (the `# vX.Y.Z` trailer is the resolved tag); Dependabot's +# github-actions ecosystem keeps the SHAs current. name: dependency-review @@ -24,9 +25,9 @@ jobs: name: dependency review runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.2 - - uses: actions/dependency-review-action@v5 + - uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v5.0.0 with: # Match the cargo-deny policy: reject any newly introduced # advisory of moderate severity or worse. diff --git a/.github/workflows/dev-image.yml b/.github/workflows/dev-image.yml index 5de6f96..dddd750 100644 --- a/.github/workflows/dev-image.yml +++ b/.github/workflows/dev-image.yml @@ -49,15 +49,15 @@ jobs: name: build & push (ghcr.io/p4suta/afm-dev) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.2 - - uses: docker/setup-buildx-action@v4 + - uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0 - name: Log in to GHCR # PR builds from forks must not have GHCR credentials — we still # build to verify but skip the login + push there. 
if: github.event_name != 'pull_request' - uses: docker/login-action@v4 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: registry: ghcr.io username: ${{ github.actor }} @@ -65,7 +65,7 @@ jobs: - name: Extract image metadata id: meta - uses: docker/metadata-action@v6 + uses: docker/metadata-action@80c7e94dd9b9319bd5eb7a0e0fe9291e23a2a2e9 # v6.1.0 with: images: ghcr.io/p4suta/afm-dev tags: | @@ -73,7 +73,7 @@ jobs: type=sha,format=long - name: Build and push (ci stage) - uses: docker/build-push-action@v7 + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 with: context: . file: Dockerfile diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 6f1e371..4f3096a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,8 +17,12 @@ # WASM_PACK_VERSION arg so dev and CI agree on the wasm-bindgen-cli # that gets auto-fetched. # -# Action pinning: @v tags are intentional pre-first-run; -# Dependabot will upgrade them to commit SHAs on its first weekly sweep. +# Action pinning: every third-party action below is pinned to a full +# commit SHA (the `# vX.Y.Z` trailer is the tag it resolved to). +# Dependabot's github-actions ecosystem keeps those SHAs current. +# `dtolnay/rust-toolchain@stable` is the deliberate exception — its ref +# is the moving `stable` branch (we want the newest stable rustc), and +# dependabot.yml ignores it. 
name: docs @@ -43,15 +47,15 @@ jobs: name: build site runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.2 - uses: dtolnay/rust-toolchain@stable - - uses: Swatinem/rust-cache@v2 + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 with: key: docs - - uses: peaceiris/actions-mdbook@v2 + - uses: peaceiris/actions-mdbook@ee69d230fe19748b7abf22df32acaa93833fad08 # v2.0.0 with: mdbook-version: latest @@ -68,13 +72,13 @@ jobs: run: mdbook build crates/afm-book - name: Install bun - uses: oven-sh/setup-bun@v2 + uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2.2.0 with: # Pinned alongside the Dockerfile BUN_VERSION arg. bun-version: '1.3.14' - name: Install wasm-pack - uses: jetli/wasm-pack-action@v0.4.0 + uses: jetli/wasm-pack-action@0d096b08b4e5a7de8c28de67e11e945404e9eefa # v0.4.0 with: # Pinned alongside the Dockerfile WASM_PACK_VERSION arg. version: 'v0.15.0' @@ -100,7 +104,7 @@ jobs: # GitHub Pages otherwise runs Jekyll and hides paths like `_sources`. touch site/.nojekyll - - uses: actions/upload-pages-artifact@v5 + - uses: actions/upload-pages-artifact@fc324d3547104276b827a68afc52ff2a11cc49c9 # v5.0.0 with: path: site @@ -112,5 +116,5 @@ jobs: name: github-pages url: ${{ steps.deployment.outputs.page_url }} steps: - - uses: actions/deploy-pages@v5 + - uses: actions/deploy-pages@cd2ce8fcbc39b97be8ca5fce6e763baed58fa128 # v5.0.0 id: deployment diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index de0b13a..2ad2ac1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -11,11 +11,13 @@ # toolchain here. See CONTRIBUTING.md "How to release" for the full # rationale. # -# Action pinning: we use `@v` tags below rather than commit SHAs -# because Dependabot (`.github/dependabot.yml`) only discovers and -# SHA-pins new workflow files on its weekly sweep. 
Once that sweep has -# run, the pins will be upgraded automatically and this file will align -# with `ci.yml`'s commit-SHA convention. +# Action pinning: every third-party action below is pinned to a full +# commit SHA (the `# vX.Y.Z` trailer is the human-readable tag it +# resolved to) so a re-pointed tag can't silently change what runs in a +# release. Dependabot's github-actions ecosystem bumps these SHAs weekly. +# The sole exception is `dtolnay/rust-toolchain@stable`: its ref is the +# moving `stable` branch by design (we want the latest stable rustc for +# release artefacts), and dependabot.yml ignores it for that reason. name: release @@ -50,7 +52,7 @@ jobs: runner: windows-latest archive: zip steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.2 - uses: dtolnay/rust-toolchain@stable @@ -59,7 +61,7 @@ jobs: # add` is idempotent and unambiguous on every cache state. - run: rustup target add ${{ matrix.target }} - - uses: Swatinem/rust-cache@v2 + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 with: key: release-${{ matrix.target }} @@ -116,7 +118,7 @@ jobs: echo "name=${name}" >> "$GITHUB_OUTPUT" - name: Upload build artefacts - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: ${{ steps.archive.outputs.name }} path: | @@ -134,9 +136,9 @@ jobs: id-token: write attestations: write steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.2 - - uses: actions/download-artifact@v8 + - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: pattern: afm-* merge-multiple: true @@ -153,14 +155,14 @@ jobs: cat SHA256SUMS - name: Attest build provenance - uses: actions/attest-build-provenance@v2 + uses: actions/attest-build-provenance@e8998f949152b193b063cb0ec769d69d929409be # v2.4.0 with: subject-path: | 
artefacts/afm-*.tar.gz artefacts/afm-*.zip - name: Generate release notes with git-cliff - uses: orhun/git-cliff-action@v4 + uses: orhun/git-cliff-action@f50e11560dce63f7c33227798f90b924471a88b5 # v4.8.0 with: config: cliff.toml args: --latest --strip all @@ -171,7 +173,7 @@ jobs: - name: Show release notes run: cat CHANGES.md - - uses: softprops/action-gh-release@v3 + - uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3.0.0 with: files: | artefacts/afm-*.tar.gz diff --git a/Dockerfile b/Dockerfile index 1253b09..928834f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,14 +3,26 @@ # Every developer and CI job runs inside this image. Host toolchain is never invoked. # # Layered so upstream-sync / dependency bumps rebuild minimal surface. - -ARG RUST_VERSION=1.95.0 +# +# External base images (rust, playwright) are pinned by immutable +# manifest-list digest (supply-chain hardening, C2/F9): a floating tag +# like `rust:1.95.0-bookworm` can be re-pushed, so we pin the sha256 and +# keep the human-readable tag inline. Dependabot's `docker` ecosystem +# (.github/dependabot.yml) bumps the tag AND the digest together on its +# weekly sweep, so the pin stays current without a human resolving the +# sha by hand. Resolve a fresh digest with +# `docker buildx imagetools inspect rust:1.95.0-bookworm`. +# +# NODE_VERSION stays an ARG: it parameterises an apt source URL in the +# node-base stage (deb.nodesource.com/setup_<NODE_VERSION>.x), not a FROM line, so +# there is no base-image digest to pin for it. 
ARG NODE_VERSION=22 ######################################################################## # Stage: toolchain — Rust stable + system deps for builds and CJK work ######################################################################## -FROM rust:${RUST_VERSION}-bookworm AS toolchain +# rust:1.95.0-bookworm (digest pinned; tag kept for humans / Dependabot) +FROM rust:1.95.0-bookworm@sha256:6258907abe69656e41cd992e0b705cdcfabcbbe3db374f92ed2d47121282d4a1 AS toolchain RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ @@ -270,8 +282,13 @@ CMD ["mdbook", "serve", "--hostname", "0.0.0.0", "--port", "3000"] ######################################################################## # Stage: browser — Playwright with Chromium + WebKit for M3 onward +# +# Pinned by digest (see the toolchain-stage note at the top); Dependabot +# bumps the tag + sha together. Refresh via +# `docker buildx imagetools inspect mcr.microsoft.com/playwright:v1.60.0-jammy`. ######################################################################## -FROM mcr.microsoft.com/playwright:v1.60.0-jammy AS browser +# mcr.microsoft.com/playwright:v1.60.0-jammy (digest pinned; tag kept for humans / Dependabot) +FROM mcr.microsoft.com/playwright:v1.60.0-jammy@sha256:e1529a04087193966ea15d4a1617345bdaa0791690a24ab2c42b65f9ce5b2cdc AS browser WORKDIR /workspace CMD ["bash"] diff --git a/Justfile b/Justfile index 1a46fdd..a819d62 100644 --- a/Justfile +++ b/Justfile @@ -552,10 +552,66 @@ verify-version-pins: deny: {{_dev}} cargo deny check -# RustSec advisory scan -audit: +# RustSec advisory scan. 
+# +# Depends on `audit-comrak` so the vendored-comrak advisory gate runs in +# the same invocation: `cargo audit` alone keys advisories off the +# *registry* dependency graph in `Cargo.lock`, and comrak is a PATH dep +# (`upstream/comrak/`, ADR-0001) — it never appears there, so a future +# RUSTSEC advisory against comrak 0.52.0 would be invisible to the plain +# scan. `audit-comrak` closes that blind spot. Both are wired into +# `just ci` via this recipe (and the `audit` matrix leg in ci.yml). +audit: audit-comrak {{_dev}} cargo audit +# Vendored-comrak RUSTSEC gate (C1/F4). +# +# comrak is vendored as a path dependency at `upstream/comrak/` (version +# in `upstream/comrak/Cargo.toml`, sha in `upstream/comrak/COMRAK_SHA`), +# pinned bit-for-bit to upstream v0.52.0 (ADR-0001, 0-line diff). Because +# it is a path dep it is absent from the registry dependency graph +# `cargo audit` / `cargo deny` walk, so neither tool would ever flag a +# RUSTSEC advisory filed against the `comrak` crate at our pinned +# version. This recipe re-introduces that coverage WITHOUT a scheduled +# workflow (user preference: no cron): it runs per-PR as part of +# `just ci`. +# +# Mechanism: synthesise a one-crate `Cargo.lock` that declares `comrak` +# at exactly the vendored version as a crates.io registry package, then +# point the real `cargo audit` at it (`--file`). This delegates the +# advisory version-range matching to the authoritative RustSec engine +# (no hand-rolled semver parsing) and uses the same advisory-db +# `cargo audit` already fetches. `--deny warnings` makes ANY advisory +# match — vulnerability, unmaintained, or yanked notice keyed to comrak +# — fail the gate. The version is read from `upstream/comrak/Cargo.toml` +# so there is no second source of truth to drift. +# +# Exit semantics: clean (no advisory affects the pinned version) → 0; +# an advisory matches → non-zero, halting `just ci`. 
On a hit, read the +# RUSTSEC id it prints, then either bump the vendored tree +# (`just upstream-sync <tag>`) past the patched version or, if the +# advisory does not apply to how afm drives comrak, record a documented +# `ignore` in the gate (see afm/SECURITY.md "Vendored comrak"). +audit-comrak: + {{_dev}} bash -c '\ + set -euo pipefail; \ + ver=$(grep -m1 -E "^version[[:space:]]*=" upstream/comrak/Cargo.toml | sed -E "s/.*\"([^\"]+)\".*/\\1/"); \ + if [ -z "$ver" ]; then echo "audit-comrak: could not read comrak version from upstream/comrak/Cargo.toml" >&2; exit 1; fi; \ + echo "audit-comrak: checking vendored comrak $ver against RUSTSEC advisories"; \ + lock=$(mktemp -d)/Cargo.lock; \ + printf "%s\n" \ + "# Synthetic lockfile — generated by just audit-comrak (C1/F4)." \ + "# Pins the vendored comrak version as a registry crate so" \ + "# cargo audit can match RUSTSEC advisories the path dep hides." \ + "version = 3" \ + "" \ + "[[package]]" \ + "name = \"comrak\"" \ + "version = \"$ver\"" \ + "source = \"registry+https://github.com/rust-lang/crates.io-index\"" \ + > "$lock"; \ + cargo audit --file "$lock" --deny warnings' + # Unused dependency scan (requires nightly) udeps: {{_fuzz}} cargo +nightly udeps --workspace --all-targets diff --git a/SECURITY.md b/SECURITY.md index b25d49e..d624633 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -52,6 +52,54 @@ Out of scope: - Issues in dependencies with no afm-specific exploitation path — cargo-deny's advisory check catches these at CI time. +## Vendored comrak advisory tracking + +comrak is **vendored as a path dependency** at `upstream/comrak/` +(pinned bit-for-bit to upstream v0.52.0; version in +`upstream/comrak/Cargo.toml`, commit in `upstream/comrak/COMRAK_SHA`; +ADR-0001 keeps the diff at 0 lines). 
A path dependency does **not** +appear in the registry dependency graph that `cargo audit` and +`cargo deny` walk, so neither tool would flag a [RustSec][rustsec] +advisory filed against the `comrak` crate at our pinned version. That +is a real supply-chain blind spot for a vendored fork. + +We close it with a dedicated per-PR gate, `just audit-comrak` (wired +into `just audit` → `just ci`, and run as its own leg in the `audit` +matrix of `.github/workflows/ci.yml`). The gate synthesises a one-crate +`Cargo.lock` pinning `comrak` at the vendored version as a crates.io +package and runs the authoritative `cargo audit` engine against it, so +RustSec advisory version-range matching applies to the vendored tree +exactly as it would to a registry dependency. There is **no scheduled +/ cron workflow** (maintainer preference); the check rides every pull +request instead. + +When the gate fails it prints the matching `RUSTSEC-…` id. The fix is +normally to advance the vendored tree past the patched version with +`just upstream-sync <tag>`. If — and only if — an advisory provably +does not apply to how afm drives comrak (afm composes vanilla comrak as +a black box and never enables its raw-HTML passthrough by default), the +advisory id may be recorded as a documented `ignore` in the +`audit-comrak` recipe with a one-line justification, mirroring the +`[advisories] ignore` convention in `deny.toml`. + +[rustsec]: https://rustsec.org/ + +## Release profile: `panic = "abort"` + +The release profile builds with `panic = "abort"`. A panic that is +nevertheless reached at runtime therefore **aborts the entire host +process** (`SIGABRT`) — it does not unwind and cannot be caught with +`std::panic::catch_unwind`. afm targets a panic-free rendering path on +untrusted input (enforced by the fuzz harnesses and the no-bare-`[#` +Tier-A invariant), but an embedder must treat any residual panic as a +hard crash of its own process. 
**Pre-validate untrusted input** (cap +length — the security scope above is bounded at 10 MiB — reject inputs +you will not render) before calling into afm in a process whose +liveness matters, and isolate rendering of attacker-controlled content +in a worker/subprocess if a single render must not be able to take the +host down. Report any panic reachable from a well-formed call as a +vulnerability per the policy above. + ## Supported versions afm is pre-1.0. Only the `main` branch is supported; security fixes diff --git a/crates/afm-markdown/src/ast_splice.rs b/crates/afm-markdown/src/ast_splice.rs index 3951b38..73a049e 100644 --- a/crates/afm-markdown/src/ast_splice.rs +++ b/crates/afm-markdown/src/ast_splice.rs @@ -114,40 +114,70 @@ struct AstSplicer<'a, 'src> { } impl<'a, 'src> AstSplicer<'a, 'src> { - /// Depth-first traversal. Children are snapshotted before - /// iterating because we may detach / reorder them during dispatch. - fn walk(&mut self, node: &'a AstNode<'a>) { - let children: Vec<&'a AstNode<'a>> = node.children().collect(); - for child in children { - self.dispatch(child); - } - } - - fn dispatch(&mut self, node: &'a AstNode<'a>) { - let (action, is_heading) = { - let data = node.data.borrow(); - ( - classify(&data.value), - matches!(&data.value, NodeValue::Heading(_)), - ) - }; - match action { - DispatchAction::Paragraph => self.dispatch_paragraph(node), - DispatchAction::TextWith(text) => self.split_text_node(node, &text), - DispatchAction::Recurse => { - if is_heading { - self.in_heading_depth += 1; - self.walk(node); + /// Depth-first traversal over an explicit work stack rather than + /// recursion. + /// + /// comrak can build an arbitrarily deep AST from a small input + /// (deeply nested blockquotes `> > > …`, nested list items, or + /// nested inline emphasis), and `handle_blockquote` carries no + /// nesting cap. 
A recursive descent would exhaust the call stack — + /// under the release profile's `panic = "abort"` that is a hard + /// process abort, i.e. a crash on untrusted input, which both repos' + /// `SECURITY.md` scope IN as a vulnerability. The explicit stack + /// moves the unbounded growth to the heap, where it is bounded by + /// the input size rather than the OS stack. comrak's own + /// `format_html` / AST post-processing are iterative for the same + /// reason; this brings the splice walk in line with them. + /// + /// Children are pushed in reverse so the `Vec`-as-stack pops them in + /// document order, and a `Heading`'s subtree is bracketed by a + /// [`Work::ExitHeading`] marker so `in_heading_depth` is incremented + /// for exactly the heading's descendants — preserving, node for + /// node, the recursive `in_heading_depth += 1; walk; -= 1` behaviour + /// the Tier-A / Tier-C splice contract depends on. Each leaf + /// dispatch (`split_text_node` / `handle_block_sentinel` / + /// `handle_heading_hint`) only ever inserts fresh siblings or + /// detaches the current node, never the already-stacked siblings, so + /// the snapshot-on-push discipline stays sound. 
+ fn walk(&mut self, root: &'a AstNode<'a>) { + let mut stack: Vec<Work<'a>> = Vec::new(); + push_children_rev(&mut stack, root); + while let Some(work) = stack.pop() { + let node = match work { + Work::ExitHeading => { self.in_heading_depth -= 1; - } else { - self.walk(node); + continue; + } + Work::Visit(node) => node, + }; + let (action, is_heading) = { + let data = node.data.borrow(); + ( + classify(&data.value), + matches!(&data.value, NodeValue::Heading(_)), + ) + }; + match action { + DispatchAction::Skip => {} + DispatchAction::TextWith(text) => self.split_text_node(node, &text), + DispatchAction::Paragraph => self.dispatch_paragraph(node, &mut stack), + DispatchAction::Recurse => { + if is_heading { + self.in_heading_depth += 1; + stack.push(Work::ExitHeading); + } + push_children_rev(&mut stack, node); } } - DispatchAction::Skip => {} } } - fn dispatch_paragraph(&mut self, paragraph: &'a AstNode<'a>) { + /// Dispatch a paragraph (Cases 1/2/3 in module-doc order). The + /// ordinary-paragraph case descends by pushing the paragraph's + /// children onto the shared work `stack` (a paragraph is never a + /// `Heading`, so no depth marker is needed); the block-sentinel and + /// heading-hint cases mutate in place and do not descend. + fn dispatch_paragraph(&mut self, paragraph: &'a AstNode<'a>, stack: &mut Vec<Work<'a>>) { if let Some(kind) = paragraph_sole_block_sentinel(paragraph) { self.handle_block_sentinel(paragraph, kind); return; @@ -159,7 +189,7 @@ impl<'a, 'src> AstSplicer<'a, 'src> { } // Case 3: ordinary paragraph — descend to children for inline // sentinel splitting. - self.walk(paragraph); + push_children_rev(stack, paragraph); } fn handle_block_sentinel(&mut self, paragraph: &'a AstNode<'a>, kind: BlockSentinelKind) { @@ -342,6 +372,27 @@ impl<'a, 'src> AstSplicer<'a, 'src> { } } +/// One entry on [`AstSplicer::walk`]'s explicit traversal stack. +enum Work<'a> { + /// Classify and dispatch this node. 
+ Visit(&'a AstNode<'a>), + /// Sentinel popped after a `Heading`'s entire subtree has been + /// processed, to restore `in_heading_depth` — the iterative + /// analogue of the recursive `in_heading_depth -= 1` on unwind. + ExitHeading, +} + +/// Push `parent`'s children onto `stack` as [`Work::Visit`] items in +/// reverse document order, so the `Vec`-as-stack pops them +/// left-to-right. Children are snapshotted here (by being moved onto +/// the stack) before any dispatch mutates the tree, mirroring the +/// previous recursive walk's `children().collect()`. +fn push_children_rev<'a>(stack: &mut Vec<Work<'a>>, parent: &'a AstNode<'a>) { + let start = stack.len(); + stack.extend(parent.children().map(Work::Visit)); + stack[start..].reverse(); +} + /// Per-node dispatch verdict. Snapshotted from a borrowed `NodeValue` /// so the borrow is released before the splicer mutates the tree. #[derive(Debug)] diff --git a/crates/afm-markdown/src/ir/mod.rs b/crates/afm-markdown/src/ir/mod.rs index b3f5362..786b4ac 100644 --- a/crates/afm-markdown/src/ir/mod.rs +++ b/crates/afm-markdown/src/ir/mod.rs @@ -176,6 +176,10 @@ struct IrWalker<'src> { /// Stack of currently-open paired containers. Each frame owns the /// blocks gathered between its open and (eventual) close marker. open: Vec, + /// Current block/inline nesting depth, bounded by [`MAX_AST_DEPTH`] + /// so pathologically deep input cannot overflow the recursive + /// `collect_blocks` / `collect_inlines` descent. + depth: usize, } struct OpenContainer { @@ -184,12 +188,28 @@ struct OpenContainer { children: Vec, } +/// Maximum IR block/inline nesting depth. +/// +/// comrak can emit arbitrarily deep trees from a small input (nested +/// blockquotes — `handle_blockquote` carries no cap — nested list items, +/// nested inline emphasis), and the IR builder's `collect_blocks` / +/// `collect_inlines` recurse over them. 
Without a bound a crafted input +/// would overflow the call stack and abort the process under the release +/// profile's `panic = "abort"` — a crash on untrusted input that +/// `SECURITY.md` scopes IN as a vulnerability. 256 is far beyond any real +/// document (comrak itself caps list nesting at 100) while leaving the OS +/// stack comfortable; beyond it the IR truncates the over-deep subtree. +/// The HTML splice path is iterative ([`crate::ast_splice`]) and stays +/// complete regardless. +const MAX_AST_DEPTH: usize = 256; + impl<'src> IrWalker<'src> { fn new(cursor: SentinelCursor<'src>) -> Self { Self { cursor, top: Vec::new(), open: Vec::new(), + depth: 0, } } @@ -408,12 +428,21 @@ impl<'src> IrWalker<'src> { } fn collect_blocks<'a>(&mut self, node: &'a AstNode<'a>) -> Vec { + // Depth-bound the block recursion (`collect_blocks` → `walk_block` + // → `collect_blocks` for nested blockquotes / list items). Past + // the bound the over-deep subtree is dropped from the IR rather + // than overflowing the stack; see [`MAX_AST_DEPTH`]. + if self.depth >= MAX_AST_DEPTH { + return Vec::new(); + } + self.depth += 1; let mut out = Vec::new(); for child in node.children() { if let Some(block) = self.walk_block(child, false) { out.push(block); } } + self.depth -= 1; out } @@ -447,10 +476,19 @@ impl<'src> IrWalker<'src> { } fn collect_inlines<'a>(&mut self, node: &'a AstNode<'a>) -> Vec { + // Depth-bound the inline recursion (`collect_inlines` → + // `emit_inline` → `collect_inlines` for nested emphasis / links / + // images). Past the bound the over-deep inline subtree is dropped + // rather than overflowing the stack; see [`MAX_AST_DEPTH`]. 
+ if self.depth >= MAX_AST_DEPTH { + return Vec::new(); + } + self.depth += 1; let mut out = Vec::new(); for child in node.children() { self.emit_inline(child, &mut out); } + self.depth -= 1; out } diff --git a/crates/afm-markdown/src/lib.rs b/crates/afm-markdown/src/lib.rs index cd8cef6..b9aca34 100644 --- a/crates/afm-markdown/src/lib.rs +++ b/crates/afm-markdown/src/lib.rs @@ -119,9 +119,27 @@ impl Options { } } - /// Plain CommonMark (no GFM, no Aozora, raw HTML enabled). Used by - /// the CommonMark 0.31.2 spec-conformance test to verify the - /// wrapper does not perturb comrak's CommonMark behaviour. + /// Plain CommonMark (no GFM, no Aozora) with comrak's raw-HTML + /// passthrough **enabled** (`render.unsafe = true`). Spec-conformance + /// scaffolding only — it exists so the CommonMark 0.31.2 runner can + /// verify the wrapper does not perturb comrak's CommonMark behaviour + /// against a spec whose expected output includes raw HTML. + /// + /// Hidden from the published API surface (`#[doc(hidden)]`): this is + /// not a production configuration. Use [`Options::afm_default`] (which + /// keeps `render.unsafe = false`) or a hand-built [`Options`] for any + /// real workload. + /// + /// # Security + /// + /// **Raw-HTML passthrough — never use on untrusted input.** This adds + /// no Rust `unsafe`, but it is a security footgun: it turns on + /// comrak's raw-HTML passthrough (`render.unsafe = true`), so comrak + /// emits raw HTML verbatim and passes through unsanitized URLs + /// (`javascript:` schemes included). Feeding attacker-controlled + /// source through these `Options` is an XSS sink. Reach for + /// [`Options::afm_default`] instead, which leaves raw HTML escaped. + #[doc(hidden)] #[must_use] pub fn commonmark_only() -> Self { let mut comrak = comrak::Options::default(); @@ -133,8 +151,25 @@ impl Options { } } - /// Pure-GFM extension set (no Aozora, raw HTML enabled). Used by - /// the GFM 0.29 spec-conformance test. 
+ /// Pure-GFM extension set (no Aozora) with comrak's raw-HTML + /// passthrough **enabled** (`render.unsafe = true`). Spec-conformance + /// scaffolding only — it backs the GFM 0.29 conformance runner. + /// + /// Hidden from the published API surface (`#[doc(hidden)]`): this is + /// not a production configuration. Use [`Options::afm_default`] (which + /// keeps `render.unsafe = false`) or a hand-built [`Options`] for any + /// real workload. + /// + /// # Security + /// + /// **Raw-HTML passthrough — never use on untrusted input.** This adds + /// no Rust `unsafe`, but it is a security footgun: it turns on + /// comrak's raw-HTML passthrough (`render.unsafe = true`), so comrak + /// emits raw HTML verbatim and passes through unsanitized URLs + /// (`javascript:` schemes included). Feeding attacker-controlled + /// source through these `Options` is an XSS sink. Reach for + /// [`Options::afm_default`] instead, which leaves raw HTML escaped. + #[doc(hidden)] #[must_use] pub fn gfm_only() -> Self { let mut comrak = comrak::Options::default(); @@ -189,6 +224,35 @@ pub struct RenderedIr { pub diagnostics: Vec, } +/// Largest source afm will hand to the aozora lexer. +/// +/// The core lexer keys every span on a `u32` byte offset and asserts +/// `source.len() <= u32::MAX` at entry (`aozora_pipeline`'s Phase 0 / +/// `tokenize_in`). Under this workspace's `panic = "abort"` release +/// profile that assert is a hard process abort, not a catchable panic — +/// an in-scope crash per `SECURITY.md` for a >4 GiB hostile input. afm's +/// public entry points guard on this boundary *before* reaching the core +/// so an oversized input degrades to a graceful empty render instead of +/// aborting the host process. Mirrors `aozora-py`'s `PyValueError` guard +/// (`source exceeds 4 GiB (u32::MAX) span limit`). +const MAX_SOURCE_BYTES: usize = u32::MAX as usize; + +/// `true` when a source of `len` bytes is within the lexer's +/// addressable `u32` span budget. 
+/// +/// Split out from [`source_within_span_budget`] so the boundary +/// arithmetic is unit-testable at `u32::MAX` / `u32::MAX + 1` without +/// allocating a multi-gigabyte `String`. +const fn len_within_span_budget(len: usize) -> bool { + len <= MAX_SOURCE_BYTES +} + +/// `true` when `input` is within the lexer's addressable `u32` span +/// budget. `false` inputs must not be handed to the core. +const fn source_within_span_budget(input: &str) -> bool { + len_within_span_budget(input.len()) +} + /// Render afm source text to HTML. /// /// One-stop entry point for the typical caller (afm CLI, afm-epub). @@ -204,6 +268,14 @@ pub struct RenderedIr { /// substituting each sentinel with the matching /// `aozora::render::render_node` output. /// +/// # Oversized input +/// +/// If `input` exceeds `MAX_SOURCE_BYTES` (4 GiB − 1, the lexer's `u32` +/// span budget) this returns an empty [`Rendered`] (`html: ""`, no +/// diagnostics) **without** invoking the core lexer — the core would +/// otherwise `assert!` and abort the process under `panic = "abort"`. +/// See `MAX_SOURCE_BYTES` for the rationale. +/// /// # Panics /// /// Panics if `comrak::format_html` fails to write into the internal @@ -211,6 +283,12 @@ pub struct RenderedIr { /// branch is unreachable in normal use. #[must_use] pub fn render_to_string(input: &str, options: &Options) -> Rendered { + if !source_within_span_budget(input) { + return Rendered { + html: String::new(), + diagnostics: Vec::new(), + }; + } let (html, diagnostics, ()) = drive_pipeline(input, options, |_root, _lex_out| ()); Rendered { html, diagnostics } } @@ -229,6 +307,12 @@ pub fn render_to_string(input: &str, options: &Options) -> Rendered { /// their host paragraph to `IrBlock::Heading` so the IR shape /// matches the rendered HTML one-for-one. 
/// +/// # Oversized input +/// +/// If `input` exceeds `MAX_SOURCE_BYTES` this returns an empty +/// [`RenderedIr`] (empty IR document, `html: ""`, no diagnostics) +/// without invoking the core lexer. See `MAX_SOURCE_BYTES`. +/// /// # Panics /// /// Panics if `comrak::format_html` fails to write into the internal @@ -236,6 +320,13 @@ pub fn render_to_string(input: &str, options: &Options) -> Rendered { /// branch is unreachable in normal use. #[must_use] pub fn render_to_ir(input: &str, options: &Options) -> RenderedIr { + if !source_within_span_budget(input) { + return RenderedIr { + ir: ir::IrDocument::default(), + html: String::new(), + diagnostics: Vec::new(), + }; + } let (html, diagnostics, ir) = drive_pipeline(input, options, ir::build_ir); RenderedIr { ir, @@ -356,11 +447,20 @@ pub struct RenderedBlock { /// are emitted as separate blocks; the consumer is responsible for /// re-assembling them. The whole-document `render_to_ir` path /// preserves cross-block structure if you need it. +/// +/// # Oversized input +/// +/// If `input` exceeds `MAX_SOURCE_BYTES` this returns +/// `(Vec::new(), Vec::new())` — no blocks, no diagnostics — without +/// invoking the core lexer. See `MAX_SOURCE_BYTES`. #[must_use] pub fn render_blocks_to_ir( input: &str, options: &Options, ) -> (Vec, Vec) { + if !source_within_span_budget(input) { + return (Vec::new(), Vec::new()); + } if !options.aozora_enabled { let comrak_arena = comrak::Arena::new(); let root = comrak::parse_document(&comrak_arena, input, &options.comrak); @@ -434,8 +534,20 @@ fn collect_rendered_blocks<'a>( /// borrowed-AST inverse of `lex_into_arena`. Plain CommonMark portions /// of the input pass through verbatim because the lexer leaves them /// untouched. +/// +/// # Oversized input +/// +/// If `input` exceeds `MAX_SOURCE_BYTES` this returns an empty +/// `String` without invoking the core lexer (which would otherwise +/// `assert!` and abort under `panic = "abort"`). See +/// `MAX_SOURCE_BYTES`. 
The round-trip is therefore *not* identity on +/// inputs larger than 4 GiB — but such input cannot be lexed at all, so +/// an empty serialization is the only graceful option. #[must_use] pub fn serialize(input: &str) -> String { + if !source_within_span_budget(input) { + return String::new(); + } let arena = Arena::new(); let lex_out = aozora::lex_into_arena(input, &arena); aozora_serialize::serialize(&lex_out) @@ -536,6 +648,82 @@ mod tests { assert!(!contains_bare_bracket("no marker at all")); } + // ------------------------------------------------------------------- + // (a) Spec-conformance constructors are #[doc(hidden)] but still + // wire raw-HTML passthrough on for the spec runners. These tests pin + // that the hidden constructors keep their unsafe spec config so a + // future refactor that breaks the spec wiring is caught here. + // ------------------------------------------------------------------- + + #[test] + fn commonmark_only_enables_raw_html_and_disables_aozora() { + let opts = Options::commonmark_only(); + assert!( + opts.comrak.render.r#unsafe, + "commonmark_only must enable raw-HTML passthrough for the spec runner" + ); + assert!( + !opts.aozora_enabled, + "commonmark_only must skip the aozora pass" + ); + } + + #[test] + fn afm_default_does_not_enable_raw_html() { + // The production constructor must NOT inherit the spec runners' + // raw-HTML passthrough — that is the XSS-safety contract that + // motivated hiding commonmark_only / gfm_only. + let opts = Options::afm_default(); + assert!( + !opts.comrak.render.r#unsafe, + "afm_default must leave raw HTML escaped (no render.unsafe)" + ); + assert!(opts.aozora_enabled, "afm_default must run the aozora pass"); + } + + // ------------------------------------------------------------------- + // (b) Oversized-input boundary guard. The lexer asserts + // `source.len() <= u32::MAX` and aborts under panic=abort; the afm + // entry points must degrade to an empty render instead. 
We cannot + // allocate a >4 GiB string in a test, so the threshold arithmetic is + // pinned on the pure `len_within_span_budget` helper, and the entry + // points are exercised on realistic (in-budget) input. + // ------------------------------------------------------------------- + + #[test] + fn len_budget_boundary_is_exactly_u32_max() { + assert!(len_within_span_budget(0)); + assert!(len_within_span_budget(1024)); + assert!( + len_within_span_budget(MAX_SOURCE_BYTES), + "exactly u32::MAX bytes is still addressable" + ); + // `checked_add` keeps the test sound on a hypothetical 32-bit + // target where `MAX_SOURCE_BYTES == usize::MAX` and `+ 1` would + // overflow; there, "one past the budget" is unrepresentable, so + // the over-budget assertion is vacuously satisfied. On the + // workspace's 64-bit targets `over` is `u32::MAX + 1`, the exact + // value the core lexer's assert rejects. + if let Some(over) = MAX_SOURCE_BYTES.checked_add(1) { + assert!( + !len_within_span_budget(over), + "one byte past u32::MAX must be rejected" + ); + } + } + + #[test] + fn in_budget_input_still_renders_normally() { + // Guard must be transparent for ordinary input. + let r = render_to_string("# hi\n\nbody", &Options::afm_default()); + assert!(r.html.contains("

hi

"), "html: {}", r.html); + let ir = render_to_ir("para", &Options::afm_default()); + assert!(!ir.ir.blocks.is_empty()); + let (blocks, _) = render_blocks_to_ir("a\n\nb", &Options::afm_default()); + assert_eq!(blocks.len(), 2); + assert_eq!(serialize("plain"), "plain"); + } + /// Tier-A canary: every occurrence of `[#` must be inside an /// `afm-annotation` wrapper — never in raw body text. fn contains_bare_bracket(html: &str) -> bool { diff --git a/crates/afm-markdown/src/sentinel_stream.rs b/crates/afm-markdown/src/sentinel_stream.rs index 4575314..1fe092f 100644 --- a/crates/afm-markdown/src/sentinel_stream.rs +++ b/crates/afm-markdown/src/sentinel_stream.rs @@ -117,49 +117,65 @@ pub(crate) fn visit_text_leaves<'a, F>( where F: FnMut(&str) -> ControlFlow<()>, { - fn recurse<'a, F>(node: &'a AstNode<'a>, mode: InlineDescend, visit: &mut F) -> Result<(), ()> - where - F: FnMut(&str) -> ControlFlow<()>, - { - for child in node.children() { - let data = child.data.borrow(); - match &data.value { - NodeValue::Text(s) => { - // Hold the `child.data` borrow across `visit` rather - // than cloning the string out. The visitor only ever - // sees `&str` — it cannot reach `child.data` — and - // every visitor on this path is read-only (the - // splice's tree mutation runs in a separate, later - // walk), so the immutable borrow is sound and the - // per-leaf `Cow::clone` — an owned-string deep copy - // on consolidated comrak text — is pure waste. - let flow = visit(s); - drop(data); - if flow == ControlFlow::Break(()) { - return Err(()); - } - // A `Text` node can in principle have children - // under non-pathological comrak inputs (emphasis - // splits etc.). Recurse through them too. - if child.first_child().is_some() { - recurse(child, mode, visit)?; - } + // Iterative depth-first traversal over an explicit stack rather than + // recursion. comrak can build arbitrarily deep *inline* nesting from + // a small input (e.g. 
deeply nested emphasis / links), and a + // recursive descent would exhaust the call stack — under the release + // profile's `panic = "abort"` that is a hard process abort, which + // both repos' SECURITY.md scope IN as a vulnerability (a crash on + // untrusted input). The explicit stack moves the unbounded growth to + // the heap, where it is bounded by the input size, not the OS stack. + // + // Ordering: `extend_children_rev` pushes a node's children in reverse + // so they pop left-to-right, and a `Text` leaf is visited *before* its + // own descendants are pushed. That reproduces the previous recursion's + // exact left-to-right pre-order (visit a leaf, then its subtree, then + // its siblings), which `paragraph_sole_block_sentinel` and `ParaScan` + // both depend on for their sentinel-count / first-hit semantics. + let mut stack: Vec<&'a AstNode<'a>> = Vec::new(); + extend_children_rev(&mut stack, node); + while let Some(child) = stack.pop() { + let data = child.data.borrow(); + match &data.value { + NodeValue::Text(s) => { + // Hold the `child.data` borrow across `visit` rather than + // cloning the string out. The visitor only ever sees + // `&str` — it cannot reach `child.data` — and every + // visitor on this path is read-only (the splice's tree + // mutation runs in a separate, later walk), so the + // immutable borrow is sound and the per-leaf `Cow::clone` + // — an owned-string deep copy on consolidated comrak text + // — is pure waste. + let flow = visit(s); + drop(data); + if flow == ControlFlow::Break(()) { + return Err(()); } - _ => match mode { - InlineDescend::StopAtNonText => return Err(()), - InlineDescend::DescendThrough => { - let has_descendants = child.first_child().is_some(); - drop(data); - if has_descendants { - recurse(child, mode, visit)?; - } - } - }, + // A `Text` node can in principle have children under + // non-pathological comrak inputs (emphasis splits etc.). 
+ // Visit them after the leaf itself (pre-order), before any + // of the leaf's siblings. + extend_children_rev(&mut stack, child); } + _ => match mode { + InlineDescend::StopAtNonText => return Err(()), + InlineDescend::DescendThrough => { + drop(data); + extend_children_rev(&mut stack, child); + } + }, } - Ok(()) } - recurse(node, mode, &mut visit) + Ok(()) +} + +/// Push `parent`'s children onto `stack` in reverse document order, so a +/// `Vec`-as-stack pops them left-to-right. Shared by the iterative +/// [`visit_text_leaves`] traversal. +fn extend_children_rev<'a>(stack: &mut Vec<&'a AstNode<'a>>, parent: &'a AstNode<'a>) { + let start = stack.len(); + stack.extend(parent.children()); + stack[start..].reverse(); } /// Walk a comrak paragraph node and return `Some(kind)` iff its diff --git a/crates/afm-markdown/tests/deep_nesting.rs b/crates/afm-markdown/tests/deep_nesting.rs new file mode 100644 index 0000000..e2aeb14 --- /dev/null +++ b/crates/afm-markdown/tests/deep_nesting.rs @@ -0,0 +1,92 @@ +//! Regression tests for deeply-nested input (stack-overflow / `DoS`). +//! +//! comrak builds an arbitrarily deep AST from a *small* input — nested +//! blockquotes carry no depth cap (`handle_blockquote` in the vendored +//! tree), unlike list nesting which it caps at 100 — and afm walks that +//! AST to splice Aozora spans and to project the IR. Before the splice +//! walk (`ast_splice`), the inline descent (`sentinel_stream`), and the +//! IR builder were made iterative / depth-bounded, a deeply nested +//! document overflowed the call stack. Under the release profile's +//! `panic = "abort"` that is a hard process abort — a crash on untrusted +//! input, which both repos' `SECURITY.md` scope IN as a vulnerability +//! (and which is fatal for any server-side library embedder). +//! +//! These tests pin that the public entry points *return* on pathological +//! nesting instead of crashing. Reaching the assertions at all (no abort) +//! 
is the core guarantee; the assertions additionally pin that the +//! innermost content still renders. + +use afm_markdown::{Options, render_blocks_to_ir, render_to_ir, render_to_string, serialize}; + +/// ~100k nested blockquotes on a single line, wrapping a leaf paragraph. +/// Pre-fix this overflowed `ast_splice::walk`'s recursion; the input is +/// only ~200 KB so parsing and the now-iterative walk stay fast. +fn deeply_nested_blockquotes() -> String { + format!("{}deep", "> ".repeat(100_000)) +} + +#[test] +fn render_to_string_survives_deep_blockquote_nesting() { + let out = render_to_string(&deeply_nested_blockquotes(), &Options::afm_default()); + assert!( + out.html.contains("deep"), + "innermost content should still render" + ); +} + +#[test] +fn render_to_ir_survives_deep_blockquote_nesting() { + // Exercises the IR builder's depth guard (the `collect_blocks` + // recursion over nested blockquotes). + let rendered = render_to_ir(&deeply_nested_blockquotes(), &Options::afm_default()); + assert!(rendered.html.contains("deep")); +} + +#[test] +fn render_blocks_to_ir_survives_deep_blockquote_nesting() { + let (blocks, _diagnostics) = + render_blocks_to_ir(&deeply_nested_blockquotes(), &Options::afm_default()); + assert!(!blocks.is_empty(), "the document should yield blocks"); +} + +#[test] +fn serialize_survives_deep_nesting() { + // `serialize` runs the aozora linear serializer (not comrak), but + // pin it too so the whole public surface is covered. + let serialized = serialize(&deeply_nested_blockquotes()); + assert!(serialized.contains("deep")); +} + +#[test] +fn deep_nesting_with_aozora_annotation_holds_tier_a() { + // A page-break annotation buried under deep nesting must still not + // leak a bare [# into the output (Tier-A), and no PUA sentinel + // (U+E001..E004) may survive into the HTML. 
+ let input = format!("{}[#改ページ]", "> ".repeat(50_000)); + let out = render_to_string(&input, &Options::afm_default()); + assert!( + !out.html.contains('\u{E001}') + && !out.html.contains('\u{E002}') + && !out.html.contains('\u{E003}') + && !out.html.contains('\u{E004}'), + "a PUA sentinel leaked into HTML under deep nesting" + ); +} + +#[test] +fn deeply_nested_lists_survive() { + // List nesting is capped at 100 inside comrak, so this never reaches + // the old overflow, but it exercises the `collect_list_items` -> + // `collect_blocks` recursion path under the depth guard. + let mut input = String::new(); + for i in 0..2_000_usize { + for _ in 0..i.min(120) { + input.push_str(" "); + } + input.push_str("- item\n"); + } + let rendered = render_to_string(&input, &Options::afm_default()); + let ir = render_to_ir(&input, &Options::afm_default()); + assert!(rendered.html.contains("item"), "list items should render"); + assert!(!ir.html.is_empty(), "IR render should produce HTML"); +} diff --git a/crates/afm-wasm/src/lib.rs b/crates/afm-wasm/src/lib.rs index 9259b0d..aa8a380 100644 --- a/crates/afm-wasm/src/lib.rs +++ b/crates/afm-wasm/src/lib.rs @@ -105,6 +105,44 @@ fn build_options(opts: &RenderOptions) -> Options { base } +/// Largest input the aozora parser core accepts, in bytes. Its span +/// offsets are `u32`, so a longer source trips a `u32::MAX` assert +/// inside the lexer (`afm` feeds the source through +/// `aozora::lex_into_arena`). Under `panic = "abort"` that assert would +/// abort the whole Wasm instance. +const MAX_SOURCE_BYTES: usize = u32::MAX as usize; + +/// `Ok(())` iff a source of `byte_len` UTF-8 bytes is within the parser +/// core's `u32` span-offset limit. Pure (takes the length, not the +/// string) so the boundary is unit-testable without allocating a 4 GiB +/// buffer. +/// +/// # Errors +/// +/// `Err(&'static str)` when `byte_len > u32::MAX`. 
+const fn source_len_within_span_limit(byte_len: usize) -> Result<(), &'static str> { + if byte_len > MAX_SOURCE_BYTES { + return Err("source exceeds 4 GiB (u32::MAX) span limit"); + } + Ok(()) +} + +/// Reject sources larger than the parser core's `u32` span limit before +/// any parsing starts, returning a catchable `Err(JsValue)`. +/// +/// `afm` masks code-block triggers before lexing, but masking is a 1:1 +/// character substitution (`|`/`《`/… → U+E000, both 3-byte UTF-8), so +/// the masked source is byte-for-byte the same length as `source` — +/// checking `source.len()` here is exact. +/// +/// # Errors +/// +/// `Err(JsValue)` when `source.len()` (UTF-8 bytes) exceeds +/// [`u32::MAX`]. +fn guard_source_len(source: &str) -> Result<(), JsValue> { + source_len_within_span_limit(source.len()).map_err(JsValue::from_str) +} + /// Render afm source to IR + HTML + diagnostics. /// /// `options` is decoded as `{ aozoraEnabled?: boolean, @@ -113,11 +151,13 @@ fn build_options(opts: &RenderOptions) -> Options { /// /// # Errors /// -/// Returns `Err(JsValue::String)` when `options` cannot be deserialized -/// from JS or when the resulting `RenderResult` cannot be serialized -/// back to JS. +/// Returns `Err(JsValue::String)` when `source` exceeds the parser +/// core's `u32` span limit (~4 GiB), when `options` cannot be +/// deserialized from JS, or when the resulting `RenderResult` cannot be +/// serialized back to JS. #[wasm_bindgen(js_name = renderAfm)] pub fn render_afm(source: &str, options: JsValue) -> Result { + guard_source_len(source)?; let opts: RenderOptions = if options.is_undefined() || options.is_null() { RenderOptions::default() } else { @@ -148,8 +188,9 @@ pub fn render_afm(source: &str, options: JsValue) -> Result { /// /// # Errors /// -/// Returns `Err(JsValue::String)` when the resulting `RenderResult` -/// cannot be serialized back to JS. 
+/// Returns `Err(JsValue::String)` when `text` exceeds the parser core's +/// `u32` span limit (~4 GiB; delegated to [`render_afm`]) or when the +/// resulting `RenderResult` cannot be serialized back to JS. #[wasm_bindgen(js_name = renderAozoraOnly)] pub fn render_aozora_only(text: &str) -> Result { render_afm(text, JsValue::UNDEFINED) @@ -189,11 +230,13 @@ struct BlocksResult { /// /// # Errors /// -/// Returns `Err(JsValue::String)` when `options` cannot be deserialized -/// from JS or when the resulting `BlocksResult` cannot be serialized -/// back to JS. +/// Returns `Err(JsValue::String)` when `source` exceeds the parser +/// core's `u32` span limit (~4 GiB), when `options` cannot be +/// deserialized from JS, or when the resulting `BlocksResult` cannot be +/// serialized back to JS. #[wasm_bindgen(js_name = renderBlocks)] pub fn render_blocks(source: &str, options: JsValue) -> Result { + guard_source_len(source)?; let opts: RenderOptions = if options.is_undefined() || options.is_null() { RenderOptions::default() } else { @@ -540,3 +583,37 @@ fn build_resolution_value(source: &str, start: usize, end: usize) -> Option + diff --git a/playground/vite.config.ts b/playground/vite.config.ts index 5dd277a..ee9daf1 100644 --- a/playground/vite.config.ts +++ b/playground/vite.config.ts @@ -1,7 +1,65 @@ -import { defineConfig } from 'vite'; +import { defineConfig, type Plugin } from 'vite'; import solid from 'vite-plugin-solid'; import wasm from 'vite-plugin-wasm'; +// Strict Content-Security-Policy for the production bundle. Defense-in-depth +// layered *on top of* the renderer's escaping: the preview is mounted via +// `innerHTML` into `.afm-root` (components/PreviewPane.tsx), but the afm +// renderer (comrak + aozora-render) entity-escapes all text and emits no +// active markup, so the CSP is a second wall — not the primary XSS guard. +// +// Directive rationale (kept as tight as the app allows): +// default-src 'self' — same-origin baseline for everything. 
+// script-src 'self' — our bundle/chunks only… +// 'wasm-unsafe-eval' — …plus WebAssembly.instantiate for the +// afm-wasm module (no JS eval/unsafe-eval). +// style-src 'self' — hashed CSS assets, incl. the dynamically +// swapped #afm-theme … +// 'unsafe-inline' — …plus the runtime