kunchenguid · owenzhang26-sys · Jun 23, 2026 · Jun 23, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -57,7 +57,8 @@ README.md            public overview and development notes
 .github/workflows/   shared CI and PR enforcement, committed
 .agents/skills/      shared skills, committed
 .claude/skills       symlink to .agents/skills for claude compatibility
-bin/                 helper scripts, committed, including fm-fleet-sync.sh for clean default-branch refreshes and gone-branch pruning; read each script's header before first use
+systemd/             firstmate.service unit for crash/reboot autostart (section 5); install via bin/fm-install-autostart.sh
+bin/                 helper scripts, committed, including fm-fleet-sync.sh for clean default-branch refreshes and gone-branch pruning, fm-resume.sh (autostart watchdog) and fm-install-autostart.sh; read each script's header before first use
 config/crew-harness  crewmate harness override; LOCAL, gitignored; absent or "default" = same as firstmate
 data/                personal fleet records; LOCAL, gitignored as a whole
   backlog.md         task queue, dependencies, history
@@ -202,6 +203,8 @@ Reconcile reality with your records before doing anything else:
 A firstmate restart must be a non-event.
 All truth lives in tmux, state files, data/backlog.md, and treehouse; your conversation memory is a cache.
 
+**Autostart (crash/reboot resilience).** A WSL VM teardown (host sleep, idle timeout, or a Windows Update reboot) kills tmux and every crewmate at once, and nothing relaunched firstmate after the VM came back. `systemd/firstmate.service` closes that gap: a watchdog (`bin/fm-resume.sh --watch`) recreates the persistent `firstmate` tmux session on boot and self-heals if it dies, so the captain re-attaches (`tmux attach -t firstmate`) to a live, state-intact firstmate instead of a cold start. Install/remove with `bin/fm-install-autostart.sh [install|status|uninstall]`; the unit's `KillMode=process` means stopping the service never kills a running firstmate. Pair it with `~/.wslconfig` `vmIdleTimeout=-1` (prevents the idle teardown in the first place; needs `wsl --shutdown` to apply). The watchdog does NOT relaunch crewmates - resuming in-flight work is recovery's job once the captain is back.
+
 ## 6. Project management
 
 All projects live flat under `projects/`.

diff --git a/bin/fm-install-autostart.sh b/bin/fm-install-autostart.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Install (or remove) the systemd unit that makes firstmate survive a WSL VM
+# teardown - the failure that took the fleet out overnight (AGENTS.md section 5).
+#
+# systemd runs as PID 1 in this distro ([boot] systemd=true in /etc/wsl.conf),
+# so a system unit started at multi-user.target is the native, reboot-proof way
+# to auto-resurrect firstmate. The unit runs bin/fm-resume.sh as a watchdog.
+#
+# Usage:
+#   fm-install-autostart.sh            install, enable, and start the unit
+#   fm-install-autostart.sh status     show unit + session state
+#   fm-install-autostart.sh uninstall  stop, disable, and remove the unit
+#
+# Reversible: `uninstall` leaves no trace and never touches a running firstmate
+# session (KillMode=process in the unit), so removing autostart cannot discard work.
+set -eu
+
+FM_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+UNIT_SRC="$FM_ROOT/systemd/firstmate.service"
+UNIT_NAME="firstmate.service"
+UNIT_DST="/etc/systemd/system/$UNIT_NAME"
+
+require_systemd() {
+  if ! command -v systemctl >/dev/null 2>&1; then
+    echo "error: systemctl not found; this distro is not running systemd" >&2
+    exit 1
+  fi
+  if [ "$(ps -p 1 -o comm= 2>/dev/null)" != systemd ]; then
+    echo "error: PID 1 is not systemd; enable '[boot] systemd=true' in /etc/wsl.conf, then 'wsl --shutdown'" >&2
+    exit 1
+  fi
+}
+
+cmd_install() {
+  require_systemd
+  [ -f "$UNIT_SRC" ] || { echo "error: unit not found at $UNIT_SRC" >&2; exit 1; }
+  # Install a copy (not a symlink): systemd does not follow symlinks under
+  # /mnt/c reliably, and the repo path is not guaranteed mounted at early boot.
+  install -m 0644 "$UNIT_SRC" "$UNIT_DST"
+  systemctl daemon-reload
+  systemctl enable "$UNIT_NAME"
+  systemctl restart "$UNIT_NAME"
+  echo "installed and enabled $UNIT_NAME"
+  echo "firstmate will now auto-resurrect on every boot and self-heal if it dies."
+  cmd_status
+}
+
+cmd_uninstall() {
+  require_systemd
+  systemctl disable "$UNIT_NAME" 2>/dev/null || true
+  systemctl stop "$UNIT_NAME" 2>/dev/null || true
+  rm -f "$UNIT_DST"
+  systemctl daemon-reload
+  echo "removed $UNIT_NAME (any running firstmate session was left untouched)"
+}
+
+cmd_status() {
+  require_systemd
+  echo "--- unit ---"
+  systemctl is-enabled "$UNIT_NAME" 2>/dev/null | sed 's/^/enabled: /' || echo "enabled: no"
+  systemctl is-active "$UNIT_NAME" 2>/dev/null | sed 's/^/active: /' || echo "active: no"
+  echo "--- session ---"
+  if tmux has-session -t "${FM_SESSION:-firstmate}" 2>/dev/null; then
+    echo "firstmate tmux session: LIVE (attach with: tmux attach -t ${FM_SESSION:-firstmate})"
+  else
+    echo "firstmate tmux session: not present"
+  fi
+}
+
+case "${1:-install}" in
+  install)   cmd_install ;;
+  uninstall) cmd_uninstall ;;
+  status)    cmd_status ;;
+  *) echo "usage: $(basename "$0") [install|status|uninstall]" >&2; exit 2 ;;
+esac
diff --git a/bin/fm-resume.sh b/bin/fm-resume.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# Idempotently ensure the persistent firstmate tmux session exists.
+#
+# This is the recovery half of crash resilience (AGENTS.md section 5): a WSL VM
+# teardown (host sleep, idle timeout, or a Windows Update reboot) kills the tmux
+# server and every process in it, including firstmate and any crewmates. systemd
+# survives the reboot but nothing relaunched firstmate - so it stayed dead until
+# the captain reconnected. This script, run at boot and on a self-heal timer by
+# firstmate.service (see systemd/), brings the session back automatically. The
+# captain then attaches to a live, state-intact firstmate instead of a cold start.
+#
+# It is deliberately idempotent and safe to run repeatedly: if the session is
+# already alive it is a no-op. It does NOT relaunch crewmate workers - their
+# autonomous processes died with the VM and re-spawning them is firstmate's job
+# via its recovery protocol once the captain is back. Worker state on disk
+# (data/, state/, backlog) is untouched and survives regardless.
+#
+# Usage:
+#   fm-resume.sh            ensure the session once, then exit
+#   fm-resume.sh --watch    ensure forever, re-checking every FM_RESUME_INTERVAL
+#                           seconds (the long-running ExecStart for the service)
+set -eu
+
+FM_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# How firstmate itself is launched. These mirror the captain's live session
+# exactly (verified from the running process): the REAL claude binary, not the
+# round-robin crew shim on PATH; the orchestrator account; sandbox + skip-perms
+# because firstmate runs as root inside an isolated WSL VM.
+SESSION="${FM_SESSION:-firstmate}"
+FM_CLAUDE_BIN="${FM_CLAUDE_BIN:-$HOME/.local/bin/claude}"
+FM_CONFIG_DIR="${FM_CONFIG_DIR:-/mnt/c/Users/Owenz/.claude-orchestrator}"
+FM_RESUME_INTERVAL="${FM_RESUME_INTERVAL:-60}"
+
+ensure_session() {
+  if tmux has-session -t "$SESSION" 2>/dev/null; then
+    return 0
+  fi
+
+  # Build the firstmate launch command. Single-quoted values are expanded in the
+  # tmux pane's shell, not here. We exec the real binary by absolute path so a
+  # PATH that prefers the crew shim cannot misroute firstmate onto a worker account.
+  local launch
+  printf -v launch \
+    'cd %q && CLAUDE_CONFIG_DIR=%q IS_SANDBOX=1 exec %q --dangerously-skip-permissions' \
+    "$FM_ROOT" "$FM_CONFIG_DIR" "$FM_CLAUDE_BIN"
+
+  # Target the session (active window) rather than a hardcoded ":0": tmux configs
+  # commonly set base-index 1, so the first window is not always index 0.
+  tmux new-session -d -s "$SESSION" -c "$FM_ROOT"
+  tmux send-keys -t "$SESSION" "$launch" Enter
+  echo "fm-resume: created session '$SESSION' running firstmate"
+  return 0
+}
+
+if [ "${1:-}" = "--watch" ]; then
+  while :; do
+    ensure_session || true
+    sleep "$FM_RESUME_INTERVAL"
+  done
+else
+  if tmux has-session -t "$SESSION" 2>/dev/null; then
+    echo "fm-resume: session '$SESSION' already live"
+  else
+    ensure_session
+  fi
+fi
diff --git a/bin/fm-spawn.sh b/bin/fm-spawn.sh
@@ -121,10 +121,12 @@ tmux send-keys -t "$T" 'treehouse get' Enter
 WT=""
 for _ in $(seq 1 60); do
   p=$(tmux display-message -p -t "$T" '#{pane_current_path}' 2>/dev/null || true)
-  if [ -n "$p" ] && [ "$p" != "$PROJ_ABS" ]; then
-    WT="$p"
-    break
-  fi
+  # Wait specifically for a treehouse worktree (under {root}/.treehouse/), not just any
+  # cwd change: a freshly-created window can transiently report the session's default cwd
+  # before `treehouse get` lands, which would otherwise be misrecorded as the worktree.
+  case "$p" in
+    */.treehouse/*) WT="$p"; break ;;
+  esac
   sleep 1
 done
 if [ -z "$WT" ]; then

diff --git a/bin/fm-teardown.sh b/bin/fm-teardown.sh
@@ -59,7 +59,7 @@ if [ -d "$WT" ] && [ "$FORCE" != "--force" ]; then
     # The work is safe once it is merged into the local default branch (firstmate
     # does that merge on the captain's approval). Refuse until then.
     DEFAULT=$(default_branch) || { echo "REFUSED: cannot determine default branch for $PROJ; expected origin/HEAD, main, or master." >&2; exit 1; }
-    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^\?\? \.claude/' | head -1 || true)
+    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '\.claude/settings\.local\.json|^\?\? \.claude/' | head -1 || true)
     unmerged=$(git -C "$WT" log --oneline HEAD --not "$DEFAULT" -- 2>/dev/null | head -5 || true)
     if [ -n "$dirty" ] || [ -n "$unmerged" ]; then
       echo "REFUSED: local-only worktree $WT has work not yet merged into $DEFAULT." >&2
@@ -70,7 +70,7 @@ if [ -d "$WT" ] && [ "$FORCE" != "--force" ]; then
     fi
   else
     # The fm-spawn hook file is ours, never work product; ignore it in the dirty check.
-    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^\?\? \.claude/' | head -1 || true)
+    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '\.claude/settings\.local\.json|^\?\? \.claude/' | head -1 || true)
     unpushed=$(git -C "$WT" log --oneline HEAD --not --remotes -- 2>/dev/null | head -5 || true)
     if [ -n "$dirty" ] || [ -n "$unpushed" ]; then
       echo "REFUSED: worktree $WT has work not on any remote." >&2

diff --git a/bin/fm-wake-lib.sh b/bin/fm-wake-lib.sh
@@ -7,7 +7,13 @@ FM_ROOT="${FM_ROOT_OVERRIDE:-${FM_ROOT:-$FM_WAKE_DEFAULT_ROOT}}"
 STATE="${FM_STATE_OVERRIDE:-${STATE:-$FM_ROOT/state}}"
 FM_WAKE_QUEUE="${FM_WAKE_QUEUE:-$STATE/.wake-queue}"
 FM_WAKE_QUEUE_LOCK="${FM_WAKE_QUEUE_LOCK:-$STATE/.wake-queue.lock}"
-FM_LOCK_STALE_AFTER="${FM_LOCK_STALE_AFTER:-2}"
+# Grace before an empty-pid lock dir (holder died in the microsecond window
+# between mkdir and writing its pid) is treated as stale and reclaimed. A normal
+# holder death is reclaimed instantly via pid-liveness (kill -0), so this only
+# bounds recovery from that rare window. Kept generous (10s) so that under heavy
+# scheduling delay a live holder mid-write is never mistaken for stale and have
+# its lock stolen - the double-grant that made the concurrency tests flake.
+FM_LOCK_STALE_AFTER="${FM_LOCK_STALE_AFTER:-10}"
 mkdir -p "$STATE"
 
 fm_current_pid() {
@@ -36,78 +42,100 @@ fm_path_age() {
   echo $(( $(date +%s) - m ))
 }
 
-fm_lock_remove_stale() {
-  local lockdir=$1 expected_pid=$2 current_pid
-  current_pid=$(cat "$lockdir/pid" 2>/dev/null || true)
-  [ "$current_pid" = "$expected_pid" ] || return 1
-  if fm_pid_alive "$current_pid"; then
-    return 1
-  fi
-  case "$current_pid" in
-    ''|*[!0-9]*)
-      [ "$(fm_path_age "$lockdir")" -ge "$FM_LOCK_STALE_AFTER" ] || return 1
-      ;;
-  esac
-  rm -f "$lockdir/pid" 2>/dev/null || return 1
-  rmdir "$lockdir" 2>/dev/null
-}
+# The lock is a single FILE created with O_EXCL (bash `set -C` noclobber). This
+# replaced an mkdir-based dir lock: plain mkdir is NOT atomic on every target
+# filesystem - on WSL2's filesystem several concurrent mkdir calls were observed
+# to all "succeed" on one path (verified: 4 simultaneous successes in a 20-way
+# barrier race), which silently double-granted the old lock and made the watcher
+# singleton and wake-queue draining race. O_EXCL create IS atomic everywhere we
+# run (Linux, WSL2, macOS) and writes the holder pid in the SAME redirection, so
+# there is never a window where the lock exists with an unknown owner.
+#
+# The O_EXCL create is the ONE and ONLY grant. Reclaiming a dead holder's lock
+# never grants directly: it only frees the lock and lets the next O_EXCL create
+# (one atomic winner) take it. A live holder's lock can never be stolen, because
+# reclaim is gated on the holder pid being dead, and the one path that moves the
+# file (dead-holder reclaim) re-checks what it actually took and restores it via
+# an atomic hardlink if a live holder had reappeared in the gap.
 
 fm_lock_try_acquire() {
-  local lockdir=$1 pid
+  local lockfile=$1 pid me steal spid
   FM_LOCK_HELD_PID=
-  if mkdir "$lockdir" 2>/dev/null; then
-    if { fm_current_pid > "$lockdir/pid"; } 2>/dev/null; then
-      return 0
+  # Compute the pid in THIS shell, not inside the O_EXCL subshell below (where
+  # BASHPID would be the subshell's). Expanded before the subshell forks, so the
+  # holder pid is written - and matches what fm_lock_release compares against.
+  me=${BASHPID:-$$}
+
+  # If a reclaimable lock is present (dead holder, long-empty file, or a legacy
+  # pre-O_EXCL directory), free it first. A LIVE holder's lock is never freed.
+  if [ -d "$lockfile" ]; then
+    pid=$(cat "$lockfile/pid" 2>/dev/null || true)
+    if [ -n "$pid" ] && fm_pid_alive "$pid"; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
     fi
-    rm -f "$lockdir/pid" 2>/dev/null || true
-    rmdir "$lockdir" 2>/dev/null || true
-    return 1
-  fi
-
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
-  if fm_pid_alive "$pid"; then
-    FM_LOCK_HELD_PID=$pid
-    return 1
-  fi
-  case "$pid" in
-    ''|*[!0-9]*)
-      if [ "$(fm_path_age "$lockdir")" -lt "$FM_LOCK_STALE_AFTER" ]; then
-        FM_LOCK_HELD_PID=$pid
+    rm -rf "$lockfile" 2>/dev/null || true
+  elif [ -e "$lockfile" ]; then
+    pid=$(cat "$lockfile" 2>/dev/null || true)
+    if fm_pid_alive "$pid"; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
+    fi
+    # Empty-but-fresh file: tolerate a brief writer gap rather than reclaim.
+    if [ -z "$pid" ] && [ "$(fm_path_age "$lockfile")" -lt "$FM_LOCK_STALE_AFTER" ]; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
+    fi
+    # Dead (or long-empty) holder: move the lock aside and re-check the exact
+    # bytes we moved. If a live holder had replaced it in the gap, restore it
+    # with an atomic hardlink (ln fails if a fresh holder already exists, so we
+    # never clobber one) and back off. Otherwise it is freed.
+    steal="$lockfile.stale.$me"
+    rm -f "$steal" 2>/dev/null || true
+    if mv "$lockfile" "$steal" 2>/dev/null; then
+      spid=$(cat "$steal" 2>/dev/null || true)
+      if [ -n "$spid" ] && fm_pid_alive "$spid"; then
+        ln "$steal" "$lockfile" 2>/dev/null || true
+        rm -f "$steal" 2>/dev/null || true
+        FM_LOCK_HELD_PID=$spid
         return 1
       fi
-      ;;
-  esac
-
-  fm_lock_remove_stale "$lockdir" "$pid" || true
-  if mkdir "$lockdir" 2>/dev/null; then
-    if { fm_current_pid > "$lockdir/pid"; } 2>/dev/null; then
-      return 0
+      rm -f "$steal" 2>/dev/null || true
     fi
-    rm -f "$lockdir/pid" 2>/dev/null || true
-    rmdir "$lockdir" 2>/dev/null || true
-    return 1
   fi
 
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
+  # The one and only grant: an atomic O_EXCL create. Exactly one racer wins;
+  # losers (someone created it first) fall through to report the holder.
+  if ( set -C; printf '%s\n' "$me" > "$lockfile" ) 2>/dev/null; then
+    return 0
+  fi
+  pid=$(cat "$lockfile" 2>/dev/null || true)
   # shellcheck disable=SC2034 # Read by callers after fm_lock_try_acquire returns.
   FM_LOCK_HELD_PID=$pid
   return 1
 }
 
 fm_lock_acquire_wait() {
-  local lockdir=$1
-  while ! fm_lock_try_acquire "$lockdir"; do
+  local lockfile=$1
+  while ! fm_lock_try_acquire "$lockfile"; do
     sleep 0.1
   done
 }
 
 fm_lock_release() {
-  local lockdir=$1 pid current
+  local lockfile=$1 pid current
   current=${BASHPID:-$$}
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
+  # Remove only our own lock. A directory is the legacy format; treat its pid
+  # file the same way so an in-flight upgrade releases cleanly.
+  if [ -d "$lockfile" ]; then
+    pid=$(cat "$lockfile/pid" 2>/dev/null || true)
+    [ "$pid" = "$current" ] || return 0
+    rm -rf "$lockfile" 2>/dev/null || true
+    return 0
+  fi
+  pid=$(cat "$lockfile" 2>/dev/null || true)
   [ "$pid" = "$current" ] || return 0
-  rm -f "$lockdir/pid" 2>/dev/null || true
-  rmdir "$lockdir" 2>/dev/null || true
+  rm -f "$lockfile" 2>/dev/null || true
 }
 
 fm_wake_clean_field() {

diff --git a/systemd/firstmate.service b/systemd/firstmate.service
@@ -0,0 +1,27 @@
+[Unit]
+Description=Firstmate persistent supervisor session (crash/reboot resilience)
+Documentation=https://github.com/kunchenguid/firstmate
+# Wait for the Windows drive mounts; the orchestrator account config lives under
+# /mnt/c. Non-fatal if the mount unit name differs - the resume script retries.
+After=local-fs.target
+
+[Service]
+Type=simple
+User=root
+WorkingDirectory=/root/firstmate
+Environment=HOME=/root
+# The watchdog: ensure the firstmate tmux session exists, re-checking forever.
+# On VM boot this recreates the session; if the loop itself ever dies, Restart
+# brings it back. The loop re-execs nothing destructive - it no-ops when the
+# session is already live.
+ExecStart=/root/firstmate/bin/fm-resume.sh --watch
+Restart=always
+RestartSec=10
+# Kill ONLY the watchdog loop on stop/restart, never the tmux server it guards.
+# This makes firstmate survive `systemctl restart firstmate` and decouples the
+# supervisor session's lifetime from this unit: the service is a guardian, not
+# the owner. To fully stop firstmate, kill its tmux session directly.
+KillMode=process
+
+[Install]
+WantedBy=multi-user.target