diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c76e54b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,9 @@ +# Keep command prompt files and the token-baseline snapshot LF-normalized so the +# size census (tests/token-baseline.sh, issue #87) is byte-for-byte reproducible +# across Windows/WSL (CRLF risk) and Linux CI (LF). +commands/*.md text eol=lf +tests/fixtures/token-baseline.txt text eol=lf + +# Shell scripts must be LF so they execute on Linux CI regardless of a +# contributor's core.autocrlf setting (a CRLF shebang/line breaks `bash`). +*.sh text eol=lf diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3d6402a..898946b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -63,3 +63,9 @@ jobs: - name: Verify exit_reason and hitl_decision enums stay in sync across config.md / ship.md / review.md run: bash tests/enum-sync-check.sh + + - name: Self-test the command-file token-baseline census tool + run: bash tests/token-baseline-test.sh + + - name: Command-file token-baseline census (informational — prints table + drift) + run: bash tests/token-baseline.sh --check diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c3f8e24..60b6035 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,6 +38,11 @@ test "$(jq -r .version .claude-plugin/plugin.json)" \ # Frontmatter parses (matches CI) python3 .github/workflows/check-frontmatter.py + +# Command-file size census — informational table + drift vs snapshot (never fails) +bash tests/token-baseline.sh --check +# Self-test for the census tool itself +bash tests/token-baseline-test.sh ``` To try the plugin in a sandbox without publishing: @@ -98,6 +103,20 @@ The fixture itself is markdown-only, so the measurement is not affected by netwo Only the four token-efficiency flags (and their config equivalents) carry this measurement obligation. 
PRs that don't change cascade gating or skill invocation paths can skip the table — `rtk gain` is for verifying claims about token impact, not a universal PR requirement. +## Static command-file size baseline (`tests/token-baseline.sh`) + +`rtk gain` above measures *runtime* token consumption. This is the complementary *static* measure: a deterministic census of the command prompt files themselves (`commands/*.md`), used to prove the per-command token reductions in the v0.14.0 optimization milestone (#87) and to catch accidental bloat. + +```bash +bash tests/token-baseline.sh --check # print the per-file table + drift vs snapshot (always exits 0) +bash tests/token-baseline.sh --update # refresh the committed snapshot after an intended change +``` + +- `~tokens` is an **approximation** (`bytes / 4`) — no tokenizer is involved; a byte census is enough to track reductions. +- The committed snapshot lives at `tests/fixtures/token-baseline.txt`. A compression PR is expected to change it — run `--update` and commit the refreshed snapshot as part of the PR so the diff shows the reduction. +- Byte counts are reproducible across Windows/WSL and Linux CI: `commands/*.md` are pinned to LF via `.gitattributes`, and the script strips `CR` before counting. +- `--check` is **informational only** (it never hard-fails on growth or shrinkage). A bloat hard-fail guard is intentionally deferred until after the compression milestone. 
+
 ## Design principles
 
 A few load-bearing principles that shape what gets accepted into `commands/`:
diff --git a/tests/fixtures/token-baseline.txt b/tests/fixtures/token-baseline.txt
new file mode 100644
index 0000000..d96fcfc
--- /dev/null
+++ b/tests/fixtures/token-baseline.txt
@@ -0,0 +1,13 @@
+# token-baseline snapshot — commands/*.md size census (issue #87)
+# ~tokens is APPROXIMATE (bytes/4); byte counts assume LF (CR stripped, see .gitattributes)
+# columns: filelinesbytes~tokens (regenerate: bash tests/token-baseline.sh --update)
+config.md	379	30846	7711
+doctor.md	471	27637	6909
+goal.md	241	32295	8073
+propose.md	552	24730	6182
+review.md	543	33737	8434
+ship.md	663	48858	12214
+start.md	1107	85109	21277
+status.md	192	11918	2979
+tag.md	425	18580	4645
+TOTAL	4573	313710	78424
diff --git a/tests/token-baseline-test.sh b/tests/token-baseline-test.sh
new file mode 100755
index 0000000..1fe92e9
--- /dev/null
+++ b/tests/token-baseline-test.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# tests/token-baseline-test.sh
+#
+# Self-test for tests/token-baseline.sh (issue #87, v0.14.0 milestone).
+# Verifies the size-census TOOL behaves to contract — it does NOT assert the
+# snapshot is in sync (the snapshot churns intentionally as commands/*.md are
+# compressed in #89-#92). What it guards:
+#   - the tool exists and is executable
+#   - --check is informational (exits 0) and prints a per-file table + TOTAL
+#   - ~tokens is the documented bytes/4 approximation
+#   - --update is deterministic (idempotent), so snapshot diffs stay to the point
+#   - right after --update, --check reports an exact match
+#   - on snapshot drift, --check still exits 0 and warns
+#
+# SIDE EFFECTS: none on exit. The --update and drift steps rewrite the committed
+# snapshot while the test runs, so the original file is backed up first and put
+# back by an EXIT trap on every exit path (pass or fail). Without that, a later
+# `token-baseline.sh --check` (the next CI step) would always report a match.
+#
+# Run from anywhere: bash tests/token-baseline-test.sh
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
+TOOL="$REPO_ROOT/tests/token-baseline.sh"
+CMD_DIR="$REPO_ROOT/commands"
+SNAPSHOT="$REPO_ROOT/tests/fixtures/token-baseline.txt"
+
+fail() { echo "FAIL: $1"; exit 1; }
+
+[ -x "$TOOL" ] || fail "tests/token-baseline.sh missing or not executable"
+
+# Back up the committed snapshot and restore it (or remove a snapshot this test
+# created) when the script exits, whatever the exit path.
+WORK=$(mktemp -d)
+HAD_SNAPSHOT=0
+if [ -f "$SNAPSHOT" ]; then
+  cp -- "$SNAPSHOT" "$WORK/snapshot.orig"
+  HAD_SNAPSHOT=1
+fi
+restore_snapshot() {
+  if [ "$HAD_SNAPSHOT" -eq 1 ]; then
+    cp -- "$WORK/snapshot.orig" "$SNAPSHOT"
+  else
+    rm -f -- "$SNAPSHOT"
+  fi
+  rm -rf -- "$WORK"
+}
+trap restore_snapshot EXIT
+
+# --check must be informational: exit 0 even if the snapshot drifts.
+OUT=$(bash "$TOOL" --check) || fail "--check exited non-zero (must be informational/exit 0)"
+
+# Column header present.
+printf '%s\n' "$OUT" | grep -q 'file' || fail "no column header line containing 'file'"
+
+# One data row per command file, plus a TOTAL row. Count files with a glob (same
+# pattern the tool uses) rather than parsing ls. Use awk with a TAB field
+# separator so we never embed literal tabs in this test.
+NFILES=0
+for f in "$CMD_DIR"/*.md; do
+  [ -e "$f" ] || continue
+  NFILES=$((NFILES + 1))
+done
+NROWS=$(printf '%s\n' "$OUT" | awk -F'\t' 'NF==4 && $1 ~ /\.md$/ {c++} END{print c+0}')
+[ "$NROWS" -eq "$NFILES" ] || fail "expected $NFILES file rows, got $NROWS"
+printf '%s\n' "$OUT" | awk -F'\t' '$1=="TOTAL" && NF==4 {found=1} END{exit found?0:1}' \
+  || fail "no TOTAL row with 4 tab-separated columns"
+
+# ~tokens is bytes/4 (integer) for start.md.
+B=$(printf '%s\n' "$OUT" | awk -F'\t' '$1=="start.md"{print $3}')
+T=$(printf '%s\n' "$OUT" | awk -F'\t' '$1=="start.md"{print $4}')
+[ -n "$B" ] || fail "no start.md row"
+[ "$T" -eq "$((B / 4))" ] || fail "~tokens != bytes/4 for start.md (bytes=$B tokens=$T)"
+
+# --update is deterministic: two consecutive runs must write identical bytes.
+# Compared with cmp (POSIX) so the test does not depend on sha1sum being installed.
+bash "$TOOL" --update >/dev/null
+cp -- "$SNAPSHOT" "$WORK/snapshot.first"
+bash "$TOOL" --update >/dev/null
+cmp -s "$WORK/snapshot.first" "$SNAPSHOT" \
+  || fail "--update is not deterministic (snapshot bytes changed between runs)"
+
+# Right after --update, --check must report an exact match.
+bash "$TOOL" --check | grep -q 'OK: matches snapshot' \
+  || fail "--check should report 'OK: matches snapshot' immediately after --update"
+
+# Drift case — the contract the tool exists to surface (AC #3): when the snapshot
+# drifts, --check must STILL exit 0 (informational, never hard-fails) AND warn.
+printf 'drift-sentinel\t1\t1\t0\n' >> "$SNAPSHOT"
+DRIFT_OUT=$(bash "$TOOL" --check 2>&1) || fail "--check must exit 0 even on snapshot drift (AC #3)"
+printf '%s\n' "$DRIFT_OUT" | grep -q 'WARN: size drift' \
+  || fail "--check should warn on drift (WARN goes to stderr)"
+
+# No explicit cleanup here: the EXIT trap puts the original snapshot back.
+echo "PASS: token-baseline.sh ($NFILES command files measured)"
diff --git a/tests/token-baseline.sh b/tests/token-baseline.sh
new file mode 100755
index 0000000..4459fc6
--- /dev/null
+++ b/tests/token-baseline.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# tests/token-baseline.sh
+#
+# Deterministic size census of commands/*.md (issue #87, v0.14.0 milestone).
+# Measures per-command lines / bytes / approximate tokens, prints a table, and
+# compares against the committed snapshot tests/fixtures/token-baseline.txt.
+#
+# WHY: before compressing the command prompt files (#89-#92) we need a
+# deterministic baseline to prove the reduction and catch accidental bloat.
+# Token count is a census, not a sample — no LLM/tokenizer needed.
+#
+# ~tokens is an APPROXIMATION (bytes / 4). There is no real tokenizer here; a
+# byte census is enough to track the compression milestone's reductions.
+#
+# REPRODUCIBILITY: byte counts depend on line endings. This plugin is developed
+# on Windows/WSL (CRLF risk) and CI runs on Linux (LF). Two defenses keep the
+# numbers identical on both: (1) commands/*.md are pinned to LF via
+# .gitattributes (text eol=lf); (2) this script strips CR before measuring, so
+# a CRLF working-tree checkout still yields the same bytes as LF.
+#
+# MODES:
+#   (default) / --check : measure, print the table, diff vs the snapshot, print
+#                         any drift as a WARNING. ALWAYS exits 0 — informational
+#                         only, never hard-fails on reduction or growth. (A
+#                         bloat hard-fail guard is intentionally deferred to a
+#                         follow-up after the compression milestone; see #87.)
+#   --update            : (re)generate the snapshot file in place.
+#
+# Run from anywhere: bash tests/token-baseline.sh [--check|--update]
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
+SNAPSHOT="$REPO_ROOT/tests/fixtures/token-baseline.txt"
+CMD_DIR="$REPO_ROOT/commands"
+
+MODE="check"
+case "${1:-}" in
+  --update) MODE="update" ;;
+  --check | "") MODE="check" ;;
+  *) echo "usage: token-baseline.sh [--check|--update]" >&2; exit 2 ;;
+esac
+
+# Emit the snapshot text: header comments, per-file rows sorted by name, TOTAL.
+# Deterministic: CR is stripped before counting; rows sort bytewise (LC_ALL=C).
+generate() {
+  local rows="" total_lines=0 total_bytes=0 total_tok=0
+  local f name lines bytes tok
+  for f in "$CMD_DIR"/*.md; do
+    [ -e "$f" ] || continue
+    name=$(basename "$f")
+    lines=$(tr -d '\r' < "$f" | wc -l | tr -d ' ')
+    bytes=$(tr -d '\r' < "$f" | wc -c | tr -d ' ')
+    tok=$(( bytes / 4 ))
+    total_lines=$(( total_lines + lines ))
+    total_bytes=$(( total_bytes + bytes ))
+    total_tok=$(( total_tok + tok ))
+    rows+=$(printf '%s\t%s\t%s\t%s' "$name" "$lines" "$bytes" "$tok")$'\n'
+  done
+  printf '# token-baseline snapshot — commands/*.md size census (issue #87)\n'
+  printf '# ~tokens is APPROXIMATE (bytes/4); byte counts assume LF (CR stripped, see .gitattributes)\n'
+  printf '# columns: filelinesbytes~tokens (regenerate: bash tests/token-baseline.sh --update)\n'
+  printf '%s' "$rows" | LC_ALL=C sort
+  printf 'TOTAL\t%s\t%s\t%s\n' "$total_lines" "$total_bytes" "$total_tok"
+}
+
+if [ "$MODE" = "update" ]; then
+  mkdir -p "$(dirname "$SNAPSHOT")"
+  generate > "$SNAPSHOT"
+  echo "updated snapshot: tests/fixtures/token-baseline.txt"
+  exit 0
+fi
+
+# --check mode: print the table, then diff against the snapshot (informational).
+TMP=$(mktemp)
+trap 'rm -f "$TMP"' EXIT
+generate > "$TMP"
+cat "$TMP"
+
+if [ ! -f "$SNAPSHOT" ]; then
+  echo "WARN: no snapshot yet — create it with: bash tests/token-baseline.sh --update" >&2
+  exit 0
+fi
+
+if diff -u "$SNAPSHOT" "$TMP" >/dev/null 2>&1; then
+  echo "OK: matches snapshot"
+else
+  echo "WARN: size drift vs snapshot (informational — not a failure):" >&2
+  diff -u "$SNAPSHOT" "$TMP" >&2 || true
+  echo " if this drift is intended (e.g. a compression PR), refresh with: bash tests/token-baseline.sh --update" >&2
+fi
+exit 0