diff --git a/.gitignore b/.gitignore
index 233688f9..f2f0e36f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,9 @@ temp/**
 scripts/*
 pixi.lock
 Train-FIRE/GM12878-fire-v0.1-filtered.cram.crai
+
+# Benchmark dataset fetched by benches/fetch-data.sh (not vendored in-repo)
+benches/data/
+
+# Criterion baseline / measurement storage (redundant with target/ but explicit)
+target/criterion/
diff --git a/Cargo.lock b/Cargo.lock
index 12771c0b..73810894 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -73,6 +73,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
 [[package]]
 name = "anstream"
 version = "0.6.19"
@@ -1069,6 +1075,12 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
 
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
 [[package]]
 name = "castaway"
 version = "0.2.4"
@@ -1120,6 +1132,33 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
 
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
 [[package]]
 name = "cipher"
 version = "0.4.4"
@@ -1361,6 +1400,42 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools 0.10.5",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools 0.10.5",
+]
+
 [[package]]
 name = "critical-section"
 version = "1.2.0"
@@ -2391,6 +2466,7 @@ dependencies = [
  "clap_mangen",
  "colored 2.2.0",
  "console 0.15.11",
+ "criterion",
  "derive_builder 0.12.0",
  "duct",
  "env_logger",
@@ -3562,6 +3638,17 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi 0.5.2",
+ "libc",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.1"
@@ -4596,6 +4683,12 @@ dependencies = [
  "strum 0.27.1",
 ]
 
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
 [[package]]
 name = "openssl-src"
 version = "300.5.1+3.5.1"
@@ -4779,6 +4872,34 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
 [[package]]
 name = "png"
 version = "0.17.16"
@@ -6372,6 +6493,16 @@ dependencies = [
  "zerovec",
 ]
 
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "tinyvec"
 version = "1.9.0"
diff --git a/Cargo.toml b/Cargo.toml
index ec2b245a..95bff780 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -84,6 +84,7 @@ vergen-git2 = { version = "1.0.2", features = [
 
 [dev-dependencies]
 assert_cmd = "2.0.11"
+criterion = { version = "0.5", features = ["html_reports"] }
 insta = "1.47.2"
 predicates = "3.0.3"
 tempfile = "3.3.0"
@@ -95,6 +96,10 @@ tch = ["dep:tch", "burn/tch"]
 burn = ["dep:burn"]
 
 
+[[bench]]
+name = "subcommands"
+harness = false
+
 # generated by 'cargo dist init'
 [profile.dist]
 inherits = "release"
diff --git a/benches/README.md b/benches/README.md
new file mode 100644
index 00000000..fad00944
--- /dev/null
+++ b/benches/README.md
@@ -0,0 +1,109 @@
+# `fibertools-rs` end-to-end benchmarks
+
+Criterion-driven wall-clock benchmarks for the four read-heavy fibertools
+subcommands. Each iterates or emits per-record annotations (m6a / cpg /
+msp / nuc / fire), so wall-clock is dominated by annotation traversal
+cost rather than ML inference or external I/O:
+
+- `pileup` (with `--m6a --cpg` to exercise every annotation type)
+- `fire` (the only bench that writes annotations back to an output BAM)
+- `track-decorators` (read-only traversal building a BED12 decorator track)
+- `extract --all` (per-record TSV with every annotation column)
+
+Every bench uses `sample_size(10)` and otherwise leaves criterion's timing
+defaults alone. Slower benches just take longer to gather their 10 samples;
+criterion may print an "unable to complete 10 samples" note, which is
+harmless. A full `cargo bench --bench subcommands` run takes ~15–20 minutes
+on a warm machine.
+
+## 1. Fetch the dataset (once)
+
+```sh
+bash benches/fetch-data.sh
+```
+
+Pulls ~133 MB of public FIRE test-data into `benches/data/fire-test-data/`
+(gitignored). Idempotent — re-running skips files already present. No
+AWS CLI required; the bucket is anonymous-public over plain HTTPS.
+
+## 2. Save a `main` baseline
+
+On the commit you want to compare against (typically `main` HEAD):
+
+```sh
+cargo bench --bench subcommands -- --save-baseline main
+```
+
+Persists per-bench measurements to `target/criterion/<group>/default/main/`.
+`target/criterion/` is gitignored, so the baseline survives any
+`git switch` / `git rebase` (but not a `cargo clean`).
+
+## 3. Compare a feature branch
+
+After switching to the branch you want to measure:
+
+```sh
+cargo bench --bench subcommands -- --baseline main
+```
+
+Criterion prints a per-bench table:
+
+```
+pileup/default          time:   [12.341 s 12.567 s 12.812 s]
+                        change: [+3.2% +5.1% +7.0%] (p = 0.001 < 0.05)
+                        Performance has regressed.
+```
+
+HTML reports land in `target/criterion/<group>/report/`.
+
+## Running individual benches
+
+```sh
+cargo bench --bench subcommands -- pileup        # only the pileup group
+cargo bench --bench subcommands -- --quick fire  # shortened sampling, fast smoke
+cargo bench --bench subcommands -- extract       # extract group only
+```
+
+## Reducing measurement noise
+
+Wall-clock benches are sensitive to background load. Recommended:
+
+- **Linux:** set the CPU governor to `performance` for the duration of the run
+  (`sudo cpupower frequency-set -g performance`).
+- **macOS:** plug into AC power, close Chrome and other heavyweights,
+  consider excluding `benches/data/` from Spotlight (System Settings →
+  Siri & Spotlight → Spotlight Privacy).
+- Run with `nice -n -5 cargo bench ...` if you have the permission to
+  raise priority, and avoid concurrent `cargo build` in another shell.
+
+Variance >5% between back-to-back runs of the same commit means the
+environment is too noisy to trust deltas under ~10%; address the noise
+before reading the comparison.
+
+## Refreshing the baseline
+
+If `main` moves and you want a new comparison reference:
+
+```sh
+git switch main && git pull
+cargo bench --bench subcommands -- --save-baseline main
+git switch -                              # back to your branch
+cargo bench --bench subcommands -- --baseline main
+```
+
+Criterion overwrites the `main` baseline in place; no cleanup needed.
+
+## Why these four subcommands?
+
+They each do substantial per-record annotation work and none of them
+are dominated by orthogonal cost centers (ML inference, pangenome-graph
+ops, network I/O) that would swamp the annotation-traversal signal:
+
+- `pileup` — builds per-position coverage from m6a/cpg/msp/nuc/fire annotations
+- `fire` — reads, scores, and writes annotations on every fiber
+- `track-decorators` — emits a BED12 plus a decorator track from fire/m6a/cpg/nuc positions
+- `extract --all` — dumps every per-record annotation column to TSV
+
+`predict_m6a`, `add_nucleosomes`, and `pg_*` are intentionally not
+benched here — their wall-clock is dominated by ML inference or
+pangenome-graph operations, which would dilute the signal we care about.
diff --git a/benches/fetch-data.sh b/benches/fetch-data.sh
new file mode 100755
index 00000000..a0998611
--- /dev/null
+++ b/benches/fetch-data.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Idempotent fetch of the FIRE test-data into benches/data/fire-test-data/.
+# The bucket is public over anonymous HTTPS, so no AWS CLI is required.
+# Source: https://github.com/fiberseq/FIRE pixi.toml `test-data` task.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEST="$SCRIPT_DIR/data/fire-test-data"
+BASE="https://s3.kopah.orci.washington.edu/stergachis/public/FIRE/test-data"
+
+mkdir -p "$DEST"
+
+FILES=(
+  test.cram
+  test.cram.crai
+  test.fa.gz
+  test.fa.gz.fai
+  test.fa.gz.gzi
+  test.tbl
+  test.yaml
+)
+
+for f in "${FILES[@]}"; do
+  if [ -s "$DEST/$f" ]; then
+    echo "skip  $f (already present)"
+  else
+    echo "fetch $f"
+    # Download to a .part file and rename only after curl succeeds, so an
+    # interrupted transfer is never mistaken for a complete file on re-run.
+    curl --fail --location --silent --show-error --output "$DEST/$f.part" "$BASE/$f"
+    mv -- "$DEST/$f.part" "$DEST/$f"
+  fi
+done
+
+echo
+echo "Dataset ready at: $DEST"
+du -sh "$DEST"
diff --git a/benches/subcommands.rs b/benches/subcommands.rs
new file mode 100644
index 00000000..d2292bfd
--- /dev/null
+++ b/benches/subcommands.rs
@@ -0,0 +1,138 @@
+//! End-to-end criterion benches for the read-heavy fibertools subcommands
+//! (`pileup`, `fire`, `track-decorators`, `extract`). Each emits per-record
+//! annotations (m6a / cpg / msp / nuc / fire), so wall-clock is dominated by
+//! annotation traversal cost.
+//!
+//! Each bench shells out to the freshly-built `ft` binary so we measure what
+//! the user actually runs. Inputs come from the public FIRE test-data CRAM;
+//! fetch via `bash benches/fetch-data.sh` before running.
+
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+use tempfile::TempDir;
+
+/// Path to the `ft` executable that Cargo builds for this bench target.
+const FT_BIN: &str = env!("CARGO_BIN_EXE_ft");
+
+/// Directory that `benches/fetch-data.sh` downloads the FIRE test data into.
+fn data_root() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR")).join("benches/data/fire-test-data")
+}
+
+/// Path to the benchmark CRAM inside [`data_root`].
+///
+/// # Panics
+///
+/// Panics, pointing at `benches/fetch-data.sh`, if the file does not exist.
+fn cram() -> PathBuf {
+    let p = data_root().join("test.cram");
+    assert!(
+        p.exists(),
+        "bench data missing at {} \u{2014} run `bash benches/fetch-data.sh` first",
+        p.display()
+    );
+    p
+}
+
+/// Runs `ft` with `args`, using `cwd` as the working directory.
+///
+/// stdout is discarded. stderr is captured rather than discarded so that a
+/// failing run reports why it failed instead of only an exit code.
+///
+/// # Panics
+///
+/// Panics if `ft` cannot be spawned or exits unsuccessfully.
+fn run_ft(args: &[&str], cwd: &Path) {
+    let output = Command::new(FT_BIN)
+        // NOTE(review): this points REF_PATH at the FASTA file itself; confirm
+        // that is what the CRAM decoder expects for reference lookup.
+        .env("REF_PATH", data_root().join("test.fa.gz"))
+        .args(args)
+        .current_dir(cwd)
+        .stdout(Stdio::null())
+        .stderr(Stdio::piped())
+        .output()
+        .expect("failed to spawn ft");
+    assert!(
+        output.status.success(),
+        "ft exited non-zero ({:?}) with args: {:?}\n--- ft stderr ---\n{}",
+        output.status.code(),
+        args,
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+/// Registers a one-function criterion group `name` that runs `ft <args>` once
+/// per iteration in a fresh tempdir (so output files don't accumulate). Output
+/// paths in `args` are relative — they land in the tempdir.
+fn bench_cmd(c: &mut Criterion, name: &str, args: &[&str]) {
+    let mut g = c.benchmark_group(name);
+    g.sample_size(10);
+    g.bench_function("default", |b| {
+        b.iter_batched(
+            || TempDir::new().expect("tempdir creation failed"),
+            // Return the tempdir instead of dropping it here: criterion drops
+            // routine outputs after the timer stops, so deleting ft's output
+            // files is not counted as part of the run.
+            |tmp| {
+                run_ft(args, tmp.path());
+                tmp
+            },
+            BatchSize::PerIteration,
+        );
+    });
+    g.finish();
+}
+
+/// Benchmarks `ft pileup --m6a --cpg` writing `pileup.bed.gz`.
+fn bench_pileup(c: &mut Criterion) {
+    let cram = cram();
+    let cram = cram.to_str().expect("cram path is not utf-8");
+    bench_cmd(
+        c,
+        "pileup",
+        &["pileup", "--m6a", "--cpg", "-o", "pileup.bed.gz", cram],
+    );
+}
+
+/// Benchmarks `ft fire` writing its output to `fire.bam`.
+fn bench_fire(c: &mut Criterion) {
+    let cram = cram();
+    let cram = cram.to_str().expect("cram path is not utf-8");
+    bench_cmd(c, "fire", &["fire", cram, "fire.bam"]);
+}
+
+/// Benchmarks `ft track-decorators` with both `--bed12` and `--decorator` outputs.
+fn bench_decorator(c: &mut Criterion) {
+    let cram = cram();
+    let cram = cram.to_str().expect("cram path is not utf-8");
+    bench_cmd(
+        c,
+        "decorator",
+        &[
+            "track-decorators",
+            "--bed12",
+            "decorator.bed12",
+            "--decorator",
+            "decorator.dec.bed",
+            cram,
+        ],
+    );
+}
+
+/// Benchmarks `ft extract --all` writing `extract.tsv.gz`.
+fn bench_extract(c: &mut Criterion) {
+    let cram = cram();
+    let cram = cram.to_str().expect("cram path is not utf-8");
+    bench_cmd(c, "extract", &["extract", "--all", "extract.tsv.gz", cram]);
+}
+
+criterion_group!(
+    benches,
+    bench_pileup,
+    bench_fire,
+    bench_decorator,
+    bench_extract
+);
+criterion_main!(benches);