diff --git a/AGENTS.md b/AGENTS.md
index 8ddbe5c..1096b1b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -57,7 +57,8 @@ README.md            public overview and development notes
 .github/workflows/   shared CI and PR enforcement, committed
 .agents/skills/      shared skills, committed
 .claude/skills       symlink to .agents/skills for claude compatibility
-bin/                 helper scripts, committed, including fm-fleet-sync.sh for clean default-branch refreshes and gone-branch pruning; read each script's header before first use
+systemd/             firstmate.service unit for crash/reboot autostart (section 5); install via bin/fm-install-autostart.sh
+bin/                 helper scripts, committed, including fm-fleet-sync.sh for clean default-branch refreshes and gone-branch pruning, fm-resume.sh (autostart watchdog) and fm-install-autostart.sh; read each script's header before first use
 config/crew-harness  crewmate harness override; LOCAL, gitignored; absent or "default" = same as firstmate
 data/                personal fleet records; LOCAL, gitignored as a whole
   backlog.md         task queue, dependencies, history
@@ -202,6 +203,8 @@ Reconcile reality with your records before doing anything else:
 A firstmate restart must be a non-event.
 All truth lives in tmux, state files, data/backlog.md, and treehouse; your conversation memory is a cache.
 
+**Autostart (crash/reboot resilience).** A WSL VM teardown (host sleep, idle timeout, or a Windows Update reboot) kills tmux and every crewmate at once, and previously nothing relaunched firstmate after the VM came back. `systemd/firstmate.service` closes that gap: a watchdog (`bin/fm-resume.sh --watch`) recreates the persistent `firstmate` tmux session on boot and self-heals if it dies, so the captain re-attaches (`tmux attach -t firstmate`) to a live, state-intact firstmate instead of a cold start. Install/remove with `bin/fm-install-autostart.sh [install|status|uninstall]`; the unit's `KillMode=process` means stopping the service never kills a running firstmate. Pair it with `~/.wslconfig` `vmIdleTimeout=-1` (prevents the idle teardown in the first place; needs `wsl --shutdown` to apply). The watchdog does NOT relaunch crewmates - resuming in-flight work is recovery's job once the captain is back.
+
 ## 6. Project management
 
 All projects live flat under `projects/`.
diff --git a/bin/fm-install-autostart.sh b/bin/fm-install-autostart.sh
new file mode 100644
index 0000000..ed071c5
--- /dev/null
+++ b/bin/fm-install-autostart.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Install (or remove) the systemd unit that makes firstmate survive a WSL VM
+# teardown - the failure that took the fleet out overnight (AGENTS.md section 5).
+#
+# systemd runs as PID 1 in this distro ([boot] systemd=true in /etc/wsl.conf),
+# so a system unit started at multi-user.target is the native, reboot-proof way
+# to auto-resurrect firstmate. The unit runs bin/fm-resume.sh as a watchdog.
+#
+# Usage:
+#   fm-install-autostart.sh            install, enable, and start the unit
+#   fm-install-autostart.sh status     show unit + session state
+#   fm-install-autostart.sh uninstall  stop, disable, and remove the unit
+#
+# Reversible: `uninstall` leaves no trace and never touches a running firstmate
+# session (KillMode=process in the unit), so removing autostart cannot discard work.
+set -eu
+
+FM_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+UNIT_SRC="$FM_ROOT/systemd/firstmate.service"
+UNIT_NAME="firstmate.service"
+UNIT_DST="/etc/systemd/system/$UNIT_NAME"
+
+require_systemd() {
+  if ! command -v systemctl >/dev/null 2>&1; then
+    echo "error: systemctl not found; this distro is not running systemd" >&2
+    exit 1
+  fi
+  if [ "$(ps -p 1 -o comm= 2>/dev/null)" != systemd ]; then
+    echo "error: PID 1 is not systemd; enable '[boot] systemd=true' in /etc/wsl.conf, then 'wsl --shutdown'" >&2
+    exit 1
+  fi
+}
+
+cmd_install() {
+  require_systemd
+  [ -f "$UNIT_SRC" ] || { echo "error: unit not found at $UNIT_SRC" >&2; exit 1; }
+  # Install a copy (not a symlink): systemd does not follow symlinks under
+  # /mnt/c reliably, and the repo path is not guaranteed mounted at early boot.
+  install -m 0644 "$UNIT_SRC" "$UNIT_DST"
+  systemctl daemon-reload
+  systemctl enable "$UNIT_NAME"
+  systemctl restart "$UNIT_NAME"
+  echo "installed and enabled $UNIT_NAME"
+  echo "firstmate will now auto-resurrect on every boot and self-heal if it dies."
+  cmd_status
+}
+
+cmd_uninstall() {
+  require_systemd
+  systemctl disable "$UNIT_NAME" 2>/dev/null || true
+  systemctl stop "$UNIT_NAME" 2>/dev/null || true
+  rm -f "$UNIT_DST"
+  systemctl daemon-reload
+  echo "removed $UNIT_NAME (any running firstmate session was left untouched)"
+}
+
+cmd_status() {
+  require_systemd
+  echo "--- unit ---"
+  systemctl is-enabled "$UNIT_NAME" 2>/dev/null | sed 's/^/enabled: /' || echo "enabled: no"
+  systemctl is-active "$UNIT_NAME" 2>/dev/null | sed 's/^/active: /' || echo "active: no"
+  echo "--- session ---"
+  if tmux has-session -t "${FM_SESSION:-firstmate}" 2>/dev/null; then
+    echo "firstmate tmux session: LIVE (attach with: tmux attach -t ${FM_SESSION:-firstmate})"
+  else
+    echo "firstmate tmux session: not present"
+  fi
+}
+
+case "${1:-install}" in
+  install)   cmd_install ;;
+  uninstall) cmd_uninstall ;;
+  status)    cmd_status ;;
+  *) echo "usage: $(basename "$0") [install|status|uninstall]" >&2; exit 2 ;;
+esac
diff --git a/bin/fm-resume.sh b/bin/fm-resume.sh
new file mode 100644
index 0000000..568e7d2
--- /dev/null
+++ b/bin/fm-resume.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# Idempotently ensure the persistent firstmate tmux session exists.
+#
+# This is the recovery half of crash resilience (AGENTS.md section 5): a WSL VM
+# teardown (host sleep, idle timeout, or a Windows Update reboot) kills the tmux
+# server and every process in it, including firstmate and any crewmates. systemd
+# survives the reboot but nothing relaunched firstmate - so it stayed dead until
+# the captain reconnected. This script, run at boot and on a self-heal timer by
+# firstmate.service (see systemd/), brings the session back automatically. The
+# captain then attaches to a live, state-intact firstmate instead of a cold start.
+#
+# It is deliberately idempotent and safe to run repeatedly: if the session is
+# already alive it is a no-op. It does NOT relaunch crewmate workers - their
+# autonomous processes died with the VM and re-spawning them is firstmate's job
+# via its recovery protocol once the captain is back. Worker state on disk
+# (data/, state/, backlog) is untouched and survives regardless.
+#
+# Usage:
+#   fm-resume.sh            ensure the session once, then exit
+#   fm-resume.sh --watch    ensure forever, re-checking every FM_RESUME_INTERVAL
+#                           seconds (the long-running ExecStart for the service)
+set -eu
+
+FM_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# How firstmate itself is launched. These mirror the captain's live session
+# exactly (verified from the running process): the REAL claude binary, not the
+# round-robin crew shim on PATH; the orchestrator account; sandbox + skip-perms
+# because firstmate runs as root inside an isolated WSL VM.
+SESSION="${FM_SESSION:-firstmate}"
+FM_CLAUDE_BIN="${FM_CLAUDE_BIN:-$HOME/.local/bin/claude}"
+FM_CONFIG_DIR="${FM_CONFIG_DIR:-/mnt/c/Users/Owenz/.claude-orchestrator}"
+FM_RESUME_INTERVAL="${FM_RESUME_INTERVAL:-60}"
+
+ensure_session() {
+  if tmux has-session -t "$SESSION" 2>/dev/null; then
+    return 0
+  fi
+
+  # Build the firstmate launch command. Single-quoted values are expanded in the
+  # tmux pane's shell, not here. We exec the real binary by absolute path so a
+  # PATH that prefers the crew shim cannot misroute firstmate onto a worker account.
+  local launch
+  printf -v launch \
+    'cd %q && CLAUDE_CONFIG_DIR=%q IS_SANDBOX=1 exec %q --dangerously-skip-permissions' \
+    "$FM_ROOT" "$FM_CONFIG_DIR" "$FM_CLAUDE_BIN"
+
+  # Target the session (active window) rather than a hardcoded ":0": tmux configs
+  # commonly set base-index 1, so the first window is not always index 0.
+  tmux new-session -d -s "$SESSION" -c "$FM_ROOT"
+  tmux send-keys -t "$SESSION" "$launch" Enter
+  echo "fm-resume: created session '$SESSION' running firstmate"
+  return 0
+}
+
+if [ "${1:-}" = "--watch" ]; then
+  while :; do
+    ensure_session || true
+    sleep "$FM_RESUME_INTERVAL"
+  done
+else
+  if tmux has-session -t "$SESSION" 2>/dev/null; then
+    echo "fm-resume: session '$SESSION' already live"
+  else
+    ensure_session
+  fi
+fi
diff --git a/bin/fm-spawn.sh b/bin/fm-spawn.sh
index d0b1dc4..e221c0d 100755
--- a/bin/fm-spawn.sh
+++ b/bin/fm-spawn.sh
@@ -121,10 +121,12 @@ tmux send-keys -t "$T" 'treehouse get' Enter
 WT=""
 for _ in $(seq 1 60); do
   p=$(tmux display-message -p -t "$T" '#{pane_current_path}' 2>/dev/null || true)
-  if [ -n "$p" ] && [ "$p" != "$PROJ_ABS" ]; then
-    WT="$p"
-    break
-  fi
+  # Wait specifically for a treehouse worktree (under {root}/.treehouse/), not just any
+  # cwd change: a freshly-created window can transiently report the session's default cwd
+  # before `treehouse get` lands, which would otherwise be misrecorded as the worktree.
+  case "$p" in
+    */.treehouse/*) WT="$p"; break ;;
+  esac
   sleep 1
 done
 if [ -z "$WT" ]; then
diff --git a/bin/fm-teardown.sh b/bin/fm-teardown.sh
index 4fd2355..e717525 100755
--- a/bin/fm-teardown.sh
+++ b/bin/fm-teardown.sh
@@ -59,7 +59,7 @@ if [ -d "$WT" ] && [ "$FORCE" != "--force" ]; then
     # The work is safe once it is merged into the local default branch (firstmate
     # does that merge on the captain's approval). Refuse until then.
     DEFAULT=$(default_branch) || { echo "REFUSED: cannot determine default branch for $PROJ; expected origin/HEAD, main, or master." >&2; exit 1; }
-    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^\?\? \.claude/' | head -1 || true)
+    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '\.claude/settings\.local\.json|^\?\? \.claude/' | head -1 || true)
     unmerged=$(git -C "$WT" log --oneline HEAD --not "$DEFAULT" -- 2>/dev/null | head -5 || true)
     if [ -n "$dirty" ] || [ -n "$unmerged" ]; then
       echo "REFUSED: local-only worktree $WT has work not yet merged into $DEFAULT." >&2
@@ -70,7 +70,7 @@ if [ -d "$WT" ] && [ "$FORCE" != "--force" ]; then
     fi
   else
     # The fm-spawn hook file is ours, never work product; ignore it in the dirty check.
-    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^\?\? \.claude/' | head -1 || true)
+    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '\.claude/settings\.local\.json|^\?\? \.claude/' | head -1 || true)
     unpushed=$(git -C "$WT" log --oneline HEAD --not --remotes -- 2>/dev/null | head -5 || true)
     if [ -n "$dirty" ] || [ -n "$unpushed" ]; then
       echo "REFUSED: worktree $WT has work not on any remote." >&2
diff --git a/bin/fm-wake-lib.sh b/bin/fm-wake-lib.sh
index bfb7ba9..d66eb18 100755
--- a/bin/fm-wake-lib.sh
+++ b/bin/fm-wake-lib.sh
@@ -7,7 +7,13 @@ FM_ROOT="${FM_ROOT_OVERRIDE:-${FM_ROOT:-$FM_WAKE_DEFAULT_ROOT}}"
 STATE="${FM_STATE_OVERRIDE:-${STATE:-$FM_ROOT/state}}"
 FM_WAKE_QUEUE="${FM_WAKE_QUEUE:-$STATE/.wake-queue}"
 FM_WAKE_QUEUE_LOCK="${FM_WAKE_QUEUE_LOCK:-$STATE/.wake-queue.lock}"
-FM_LOCK_STALE_AFTER="${FM_LOCK_STALE_AFTER:-2}"
+# Grace before an empty lock file (holder died in the tiny window between
+# creating the lock and writing its pid) is treated as stale and reclaimed. A
+# normal holder death is reclaimed instantly via pid-liveness (kill -0), so this
+# only bounds recovery from that rare window. Kept generous (10s) so that under
+# heavy scheduling delay a live holder mid-write is never mistaken for stale and
+# robbed of its lock - the double-grant that made the concurrency tests flake.
+FM_LOCK_STALE_AFTER="${FM_LOCK_STALE_AFTER:-10}"
 mkdir -p "$STATE"
 
 fm_current_pid() {
@@ -36,78 +42,100 @@ fm_path_age() {
   echo $(( $(date +%s) - m ))
 }
 
-fm_lock_remove_stale() {
-  local lockdir=$1 expected_pid=$2 current_pid
-  current_pid=$(cat "$lockdir/pid" 2>/dev/null || true)
-  [ "$current_pid" = "$expected_pid" ] || return 1
-  if fm_pid_alive "$current_pid"; then
-    return 1
-  fi
-  case "$current_pid" in
-    ''|*[!0-9]*)
-      [ "$(fm_path_age "$lockdir")" -ge "$FM_LOCK_STALE_AFTER" ] || return 1
-      ;;
-  esac
-  rm -f "$lockdir/pid" 2>/dev/null || return 1
-  rmdir "$lockdir" 2>/dev/null
-}
+# The lock is a single FILE created with O_EXCL (bash `set -C` noclobber). This
+# replaced an mkdir-based dir lock: plain mkdir is NOT atomic on every target
+# filesystem - on WSL2's filesystem several concurrent mkdir calls were observed
+# to all "succeed" on one path (verified: 4 simultaneous successes in a 20-way
+# barrier race), which silently double-granted the old lock and made the watcher
+# singleton and wake-queue draining race. O_EXCL create IS atomic everywhere we
+# run (Linux, WSL2, macOS) and writes the holder pid through the SAME redirection,
+# so the only ownerless window is create-then-write (see FM_LOCK_STALE_AFTER).
+#
+# The O_EXCL create is the ONE and ONLY grant. Reclaiming a dead holder's lock
+# never grants directly: it only frees the lock and lets the next O_EXCL create
+# (one atomic winner) take it. A live holder's lock can never be stolen, because
+# reclaim is gated on the holder pid being dead, and the one path that moves the
+# file (dead-holder reclaim) re-checks what it actually took and restores it via
+# an atomic hardlink if a live holder had reappeared in the gap.
 
 fm_lock_try_acquire() {
-  local lockdir=$1 pid
+  local lockfile=$1 pid me steal spid
   FM_LOCK_HELD_PID=
-  if mkdir "$lockdir" 2>/dev/null; then
-    if { fm_current_pid > "$lockdir/pid"; } 2>/dev/null; then
-      return 0
+  # Compute the pid in THIS shell, not inside the O_EXCL subshell below (where
+  # BASHPID would be the subshell's). Expanded before the subshell forks, so the
+  # holder pid is written - and matches what fm_lock_release compares against.
+  me=${BASHPID:-$$}
+
+  # If a reclaimable lock is present (dead holder, long-empty file, or a legacy
+  # pre-O_EXCL directory), free it first. A LIVE holder's lock is never freed.
+  if [ -d "$lockfile" ]; then
+    pid=$(cat "$lockfile/pid" 2>/dev/null || true)
+    if [ -n "$pid" ] && fm_pid_alive "$pid"; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
     fi
-    rm -f "$lockdir/pid" 2>/dev/null || true
-    rmdir "$lockdir" 2>/dev/null || true
-    return 1
-  fi
-
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
-  if fm_pid_alive "$pid"; then
-    FM_LOCK_HELD_PID=$pid
-    return 1
-  fi
-  case "$pid" in
-    ''|*[!0-9]*)
-      if [ "$(fm_path_age "$lockdir")" -lt "$FM_LOCK_STALE_AFTER" ]; then
-        FM_LOCK_HELD_PID=$pid
+    rm -rf "$lockfile" 2>/dev/null || true
+  elif [ -e "$lockfile" ]; then
+    pid=$(cat "$lockfile" 2>/dev/null || true)
+    if fm_pid_alive "$pid"; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
+    fi
+    # Empty-but-fresh file: tolerate a brief writer gap rather than reclaim.
+    if [ -z "$pid" ] && [ "$(fm_path_age "$lockfile")" -lt "$FM_LOCK_STALE_AFTER" ]; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
+    fi
+    # Dead (or long-empty) holder: move the lock aside and re-check the exact
+    # bytes we moved. If a live holder had replaced it in the gap, restore it
+    # with an atomic hardlink (ln fails if a fresh holder already exists, so we
+    # never clobber one) and back off. Otherwise it is freed.
+    steal="$lockfile.stale.$me"
+    rm -f "$steal" 2>/dev/null || true
+    if mv "$lockfile" "$steal" 2>/dev/null; then
+      spid=$(cat "$steal" 2>/dev/null || true)
+      if [ -n "$spid" ] && fm_pid_alive "$spid"; then
+        ln "$steal" "$lockfile" 2>/dev/null || true
+        rm -f "$steal" 2>/dev/null || true
+        FM_LOCK_HELD_PID=$spid
         return 1
       fi
-      ;;
-  esac
-
-  fm_lock_remove_stale "$lockdir" "$pid" || true
-  if mkdir "$lockdir" 2>/dev/null; then
-    if { fm_current_pid > "$lockdir/pid"; } 2>/dev/null; then
-      return 0
+      rm -f "$steal" 2>/dev/null || true
     fi
-    rm -f "$lockdir/pid" 2>/dev/null || true
-    rmdir "$lockdir" 2>/dev/null || true
-    return 1
   fi
 
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
+  # The one and only grant: an atomic O_EXCL create. Exactly one racer wins;
+  # losers (someone created it first) fall through to report the holder.
+  if ( set -C; printf '%s\n' "$me" > "$lockfile" ) 2>/dev/null; then
+    return 0
+  fi
+  pid=$(cat "$lockfile" 2>/dev/null || true)
   # shellcheck disable=SC2034 # Read by callers after fm_lock_try_acquire returns.
   FM_LOCK_HELD_PID=$pid
   return 1
 }
 
 fm_lock_acquire_wait() {
-  local lockdir=$1
-  while ! fm_lock_try_acquire "$lockdir"; do
+  local lockfile=$1
+  while ! fm_lock_try_acquire "$lockfile"; do
     sleep 0.1
   done
 }
 
 fm_lock_release() {
-  local lockdir=$1 pid current
+  local lockfile=$1 pid current
   current=${BASHPID:-$$}
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
+  # Remove only our own lock. A directory is the legacy format; treat its pid
+  # file the same way so an in-flight upgrade releases cleanly.
+  if [ -d "$lockfile" ]; then
+    pid=$(cat "$lockfile/pid" 2>/dev/null || true)
+    [ "$pid" = "$current" ] || return 0
+    rm -rf "$lockfile" 2>/dev/null || true
+    return 0
+  fi
+  pid=$(cat "$lockfile" 2>/dev/null || true)
   [ "$pid" = "$current" ] || return 0
-  rm -f "$lockdir/pid" 2>/dev/null || true
-  rmdir "$lockdir" 2>/dev/null || true
+  rm -f "$lockfile" 2>/dev/null || true
 }
 
 fm_wake_clean_field() {
diff --git a/systemd/firstmate.service b/systemd/firstmate.service
new file mode 100644
index 0000000..3488fef
--- /dev/null
+++ b/systemd/firstmate.service
@@ -0,0 +1,27 @@
+[Unit]
+Description=Firstmate persistent supervisor session (crash/reboot resilience)
+Documentation=https://github.com/kunchenguid/firstmate
+# Start after local filesystems are mounted; the orchestrator account config
+# lives under /mnt/c. Ordering only (not Requires=) - the resume script retries.
+After=local-fs.target
+
+[Service]
+Type=simple
+User=root
+WorkingDirectory=/root/firstmate
+Environment=HOME=/root
+# The watchdog: ensure the firstmate tmux session exists, re-checking forever.
+# On VM boot this recreates the session; if the loop itself ever dies, Restart
+# brings it back. The loop re-execs nothing destructive - it no-ops when the
+# session is already live.
+ExecStart=/root/firstmate/bin/fm-resume.sh --watch
+Restart=always
+RestartSec=10
+# Kill ONLY the watchdog loop on stop/restart, never the tmux server it guards.
+# This makes firstmate survive `systemctl restart firstmate` and decouples the
+# supervisor session's lifetime from this unit: the service is a guardian, not
+# the owner. To fully stop firstmate, kill its tmux session directly.
+KillMode=process
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tests/fm-lock-exclusivity.test.sh b/tests/fm-lock-exclusivity.test.sh
new file mode 100644
index 0000000..a538175
--- /dev/null
+++ b/tests/fm-lock-exclusivity.test.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Regression test for the wake-queue lock's core guarantee: mutual exclusion.
+#
+# The lock once used `mkdir` as its atomic primitive. mkdir is NOT atomic on
+# every filesystem - on WSL2's filesystem several concurrent mkdir calls were
+# observed to all succeed on one path, which silently double-granted the lock and
+# made the watcher singleton and wake-queue draining race. The lock now uses an
+# O_EXCL create (atomic everywhere we run). This test fails loudly if the lock
+# ever stops being mutually exclusive again.
+set -u
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+fail() { printf 'not ok - %s\n' "$1" >&2; exit 1; }
+pass() { printf 'ok - %s\n' "$1"; }
+
+TMP=$(mktemp -d "${TMPDIR:-/tmp}/fm-lock-test.XXXXXX")
+trap 'rm -rf "$TMP"' EXIT
+export FM_STATE_OVERRIDE="$TMP"
+# shellcheck source=bin/fm-wake-lib.sh
+. "$ROOT/bin/fm-wake-lib.sh"
+
+LK="$FM_WAKE_QUEUE_LOCK"
+CUR="$TMP/current"; BAD="$TMP/bad"; : > "$BAD"; echo init > "$CUR"
+
+# Canonical mutex test: while holding the lock, write our pid to a shared file
+# and read it back. With true mutual exclusion it is always our own pid; a
+# non-empty foreign pid means another holder ran concurrently (double-grant).
+# (An empty read is a transient fork artifact under load, not a double-grant.)
+worker() {
+  local _ who
+  for _ in $(seq 1 30); do
+    fm_lock_acquire_wait "$LK"
+    echo "$BASHPID" > "$CUR"
+    who=$(cat "$CUR" 2>/dev/null || true)
+    if [ -n "$who" ] && [ "$who" != "$BASHPID" ]; then echo "$who" >> "$BAD"; fi
+    fm_lock_release "$LK"
+  done
+}
+worker & worker & worker & worker & wait
+
+doubles=$(grep -c . "$BAD" 2>/dev/null || true)
+[ "$doubles" -eq 0 ] || fail "lock double-granted $doubles time(s): a second holder overwrote shared state while the lock was held"
+pass "wake-queue lock is mutually exclusive under contention (4 workers)"
+
+# A dead holder's lock must be reclaimable in a single try_acquire (fm-watch.sh
+# relies on this to take over after a crashed watcher); a live holder's must not.
+rm -f "$LK"
+dead=999999; while kill -0 "$dead" 2>/dev/null; do dead=$((dead + 1)); done
+printf '%s\n' "$dead" > "$LK"
+fm_lock_try_acquire "$LK" || fail "could not reclaim a dead holder's lock in one call"
+fm_lock_release "$LK"
+
+sleep 30 & holder=$!
+printf '%s\n' "$holder" > "$LK"
+if fm_lock_try_acquire "$LK"; then
+  kill "$holder" 2>/dev/null || true
+  fail "stole a live holder's lock"
+fi
+kill "$holder" 2>/dev/null || true
+rm -f "$LK"
+pass "lock reclaims a dead holder but never a live one"
diff --git a/tests/fm-resume.test.sh b/tests/fm-resume.test.sh
new file mode 100644
index 0000000..77a941c
--- /dev/null
+++ b/tests/fm-resume.test.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Behavior tests for bin/fm-resume.sh - the crash/reboot resurrection script.
+# Runs against a private tmux server (-L socket) so it never touches the live
+# firstmate session, and sets base-index 1 on that server to guard the bug where
+# send-keys targeted a hardcoded window ":0" that does not exist under base-index 1.
+set -u
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+RESUME="$ROOT/bin/fm-resume.sh"
+
+fail() { printf 'not ok - %s\n' "$1" >&2; exit 1; }
+pass() { printf 'ok - %s\n' "$1"; }
+
+if ! command -v tmux >/dev/null 2>&1; then
+  printf 'ok - SKIP fm-resume (tmux not installed)\n'
+  exit 0
+fi
+
+REAL_TMUX=$(command -v tmux)
+SOCK="fmresume$$"
+TMP=$(mktemp -d "${TMPDIR:-/tmp}/fm-resume-test.XXXXXX")
+
+cleanup() {
+  "$REAL_TMUX" -L "$SOCK" kill-server 2>/dev/null || true
+  [ -n "${TMP:-}" ] && rm -rf "$TMP"
+}
+trap cleanup EXIT
+
+# Private server with base-index 1; a dummy session keeps the server alive and
+# proves new sessions inherit the non-zero base index.
+"$REAL_TMUX" -L "$SOCK" new-session -d -s seed -x 80 -y 24
+"$REAL_TMUX" -L "$SOCK" set -g base-index 1
+
+# Wrapper so fm-resume's bare `tmux` calls hit the private server.
+mkdir -p "$TMP/bin"
+cat > "$TMP/bin/tmux" <<EOF
+#!/usr/bin/env bash
+exec "$REAL_TMUX" -L "$SOCK" "\$@"
+EOF
+chmod +x "$TMP/bin/tmux"
+
+# Fake firstmate binary: records its launch context, then idles.
+FAKE="$TMP/fake-claude"
+cat > "$FAKE" <<EOF
+#!/usr/bin/env bash
+printf 'cwd=%s cfg=%s sandbox=%s\n' "\$PWD" "\${CLAUDE_CONFIG_DIR:-}" "\${IS_SANDBOX:-}" > "$TMP/launch.txt"
+sleep 300
+EOF
+chmod +x "$FAKE"
+
+run_resume() {
+  PATH="$TMP/bin:$PATH" FM_SESSION=fmtest FM_CLAUDE_BIN="$FAKE" FM_CONFIG_DIR=/cfg/orch "$RESUME"
+}
+
+# 1) create path: session made, firstmate launched with the right context.
+out=$(run_resume) || fail "resume create returned nonzero"
+printf '%s\n' "$out" | grep -q "created session 'fmtest'" || fail "create not reported: $out"
+for _ in $(seq 1 40); do [ -f "$TMP/launch.txt" ] && break; sleep 0.25; done
+[ -f "$TMP/launch.txt" ] || fail "firstmate never launched (base-index regression?)"
+grep -qF "cwd=$ROOT " "$TMP/launch.txt" || fail "wrong cwd: $(cat "$TMP/launch.txt")"
+grep -qF "cfg=/cfg/orch " "$TMP/launch.txt" || fail "config dir not applied: $(cat "$TMP/launch.txt")"
+grep -qF "sandbox=1" "$TMP/launch.txt" || fail "IS_SANDBOX not set: $(cat "$TMP/launch.txt")"
+pass "fm-resume creates the session and launches firstmate with the right context"
+
+# 2) idempotent: a second run no-ops and never makes a duplicate session.
+out2=$(run_resume) || fail "resume no-op returned nonzero"
+printf '%s\n' "$out2" | grep -q "already live" || fail "second run should no-op: $out2"
+count=$("$REAL_TMUX" -L "$SOCK" list-sessions 2>/dev/null | grep -c '^fmtest:')
+[ "$count" -eq 1 ] || fail "expected exactly one fmtest session, got $count"
+pass "fm-resume is idempotent (no duplicate session)"
diff --git a/tests/fm-wake-queue.test.sh b/tests/fm-wake-queue.test.sh
index c7df55a..12283b5 100755
--- a/tests/fm-wake-queue.test.sh
+++ b/tests/fm-wake-queue.test.sh
@@ -274,10 +274,20 @@ test_singleton_start() {
   pid1=$!
   PATH="$fakebin:$PATH" FM_STATE_OVERRIDE="$state" FM_POLL=5 FM_SIGNAL_GRACE=1 FM_CHECK_INTERVAL=999999 FM_HEARTBEAT=999999 "$WATCH" > "$out2" &
   pid2=$!
-  sleep 0.5
-  live=0
-  is_live_non_zombie "$pid1" && live=$((live + 1))
-  is_live_non_zombie "$pid2" && live=$((live + 1))
+  # Poll for the actual invariant rather than a fixed delay: the losing watcher
+  # must exit (leaving exactly one live) AND report the existing singleton. Under
+  # load the loser may not have reached its exit within a guessed sleep, which is
+  # what made this assertion flake. Break as soon as it settles (usually <0.5s).
+  live=2
+  for _ in $(seq 1 40); do
+    live=0
+    is_live_non_zombie "$pid1" && live=$((live + 1))
+    is_live_non_zombie "$pid2" && live=$((live + 1))
+    if [ "$live" -eq 1 ] && grep -hq 'watcher: already running pid ' "$out1" "$out2"; then
+      break
+    fi
+    sleep 0.25
+  done
   [ "$live" -eq 1 ] || fail "expected exactly one live watcher, got $live"
   grep -h 'watcher: already running pid ' "$out1" "$out2" >/dev/null || fail "second watcher did not report existing singleton"
   kill "$pid1" "$pid2" 2>/dev/null || true