diff --git a/AGENTS.md b/AGENTS.md index 8ddbe5c..1096b1b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -57,7 +57,8 @@ README.md public overview and development notes .github/workflows/ shared CI and PR enforcement, committed .agents/skills/ shared skills, committed .claude/skills symlink to .agents/skills for claude compatibility -bin/ helper scripts, committed, including fm-fleet-sync.sh for clean default-branch refreshes and gone-branch pruning; read each script's header before first use +systemd/ firstmate.service unit for crash/reboot autostart (section 5); install via bin/fm-install-autostart.sh +bin/ helper scripts, committed, including fm-fleet-sync.sh for clean default-branch refreshes and gone-branch pruning, fm-resume.sh (autostart watchdog) and fm-install-autostart.sh; read each script's header before first use config/crew-harness crewmate harness override; LOCAL, gitignored; absent or "default" = same as firstmate data/ personal fleet records; LOCAL, gitignored as a whole backlog.md task queue, dependencies, history @@ -202,6 +203,8 @@ Reconcile reality with your records before doing anything else: A firstmate restart must be a non-event. All truth lives in tmux, state files, data/backlog.md, and treehouse; your conversation memory is a cache. +**Autostart (crash/reboot resilience).** A WSL VM teardown (host sleep, idle timeout, or a Windows Update reboot) kills tmux and every crewmate at once, and nothing relaunched firstmate after the VM came back. `systemd/firstmate.service` closes that gap: a watchdog (`bin/fm-resume.sh --watch`) recreates the persistent `firstmate` tmux session on boot and self-heals if it dies, so the captain re-attaches (`tmux attach -t firstmate`) to a live, state-intact firstmate instead of a cold start. Install/remove with `bin/fm-install-autostart.sh [install|status|uninstall]`; the unit's `KillMode=process` means stopping the service never kills a running firstmate. 
Pair it with `~/.wslconfig` `vmIdleTimeout=-1` (prevents the idle teardown in the first place; needs `wsl --shutdown` to apply). The watchdog does NOT relaunch crewmates - resuming in-flight work is recovery's job once the captain is back.
+
 ## 6. Project management
 
 All projects live flat under `projects/`.
diff --git a/bin/fm-install-autostart.sh b/bin/fm-install-autostart.sh
new file mode 100644
index 0000000..ed071c5
--- /dev/null
+++ b/bin/fm-install-autostart.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Install (or remove) the systemd unit that makes firstmate survive a WSL VM
+# teardown - the failure that took the fleet out overnight (AGENTS.md section 5).
+#
+# systemd runs as PID 1 in this distro ([boot] systemd=true in /etc/wsl.conf),
+# so a system unit started at multi-user.target is the native, reboot-proof way
+# to auto-resurrect firstmate. The unit runs bin/fm-resume.sh as a watchdog.
+#
+# Usage:
+#   fm-install-autostart.sh            install, enable, and start the unit
+#   fm-install-autostart.sh status     show unit + session state
+#   fm-install-autostart.sh uninstall  stop, disable, and remove the unit
+#
+# Reversible: `uninstall` leaves no trace and never touches a running firstmate
+# session (KillMode=process in the unit), so removing autostart cannot discard work.
+set -eu
+
+FM_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+UNIT_SRC="$FM_ROOT/systemd/firstmate.service"
+UNIT_NAME="firstmate.service"
+UNIT_DST="/etc/systemd/system/$UNIT_NAME"
+
+# Exit 1 with a message unless systemctl exists and PID 1 is systemd.
+require_systemd() {
+  if ! command -v systemctl >/dev/null 2>&1; then
+    echo "error: systemctl not found; this distro is not running systemd" >&2
+    exit 1
+  fi
+  if [ "$(ps -p 1 -o comm= 2>/dev/null)" != systemd ]; then
+    echo "error: PID 1 is not systemd; enable '[boot] systemd=true' in /etc/wsl.conf, then 'wsl --shutdown'" >&2
+    exit 1
+  fi
+}
+
+# Copy the unit to $UNIT_DST, reload systemd, enable + restart it, print status.
+cmd_install() {
+  require_systemd
+  [ -f "$UNIT_SRC" ] || { echo "error: unit not found at $UNIT_SRC" >&2; exit 1; }
+  # Install a copy (not a symlink): systemd does not follow symlinks under
+  # /mnt/c reliably, and the repo path is not guaranteed mounted at early boot.
+  install -m 0644 "$UNIT_SRC" "$UNIT_DST"
+  systemctl daemon-reload
+  systemctl enable "$UNIT_NAME"
+  systemctl restart "$UNIT_NAME"
+  echo "installed and enabled $UNIT_NAME"
+  echo "firstmate will now auto-resurrect on every boot and self-heal if it dies."
+  cmd_status
+}
+
+# Disable and stop the unit (errors ignored), delete $UNIT_DST, reload systemd.
+cmd_uninstall() {
+  require_systemd
+  systemctl disable "$UNIT_NAME" 2>/dev/null || true
+  systemctl stop "$UNIT_NAME" 2>/dev/null || true
+  rm -f "$UNIT_DST"
+  systemctl daemon-reload
+  echo "removed $UNIT_NAME (any running firstmate session was left untouched)"
+}
+
+# Print "<label>: <state>" for one `systemctl is-*` query ($1 label, $2 verb).
+# The state is captured first: piping systemctl straight into sed made the
+# `|| echo "...: no"` fallback dead code, because a pipeline's exit status is
+# that of its last command (sed, which succeeds even on empty input).
+unit_state() {
+  local label=$1 verb=$2 state
+  state=$(systemctl "$verb" "$UNIT_NAME" 2>/dev/null) || true
+  echo "$label: ${state:-no}"
+}
+
+# Report whether the unit is enabled/active and whether the tmux session exists.
+# "=NAME" makes tmux match the session name exactly instead of by prefix.
+cmd_status() {
+  require_systemd
+  echo "--- unit ---"
+  unit_state enabled is-enabled
+  unit_state active is-active
+  echo "--- session ---"
+  if tmux has-session -t "=${FM_SESSION:-firstmate}" 2>/dev/null; then
+    echo "firstmate tmux session: LIVE (attach with: tmux attach -t ${FM_SESSION:-firstmate})"
+  else
+    echo "firstmate tmux session: not present"
+  fi
+}
+
+case "${1:-install}" in
+  install) cmd_install ;;
+  uninstall) cmd_uninstall ;;
+  status) cmd_status ;;
+  *) echo "usage: $(basename "$0") [install|status|uninstall]" >&2; exit 2 ;;
+esac
diff --git a/bin/fm-resume.sh b/bin/fm-resume.sh
new file mode 100644
index 0000000..568e7d2
--- /dev/null
+++ b/bin/fm-resume.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+#
Idempotently ensure the persistent firstmate tmux session exists.
+#
+# This is the recovery half of crash resilience (AGENTS.md section 5): a WSL VM
+# teardown (host sleep, idle timeout, or a Windows Update reboot) kills the tmux
+# server and every process in it, including firstmate and any crewmates. systemd
+# survives the reboot but nothing relaunched firstmate - so it stayed dead until
+# the captain reconnected. This script, started at boot in --watch mode by
+# firstmate.service (see systemd/), brings the session back automatically. The
+# captain then attaches to a live, state-intact firstmate instead of a cold start.
+#
+# It is deliberately idempotent and safe to run repeatedly: if the session is
+# already alive it is a no-op. It does NOT relaunch crewmate workers - their
+# autonomous processes died with the VM and re-spawning them is firstmate's job
+# via its recovery protocol once the captain is back. Worker state on disk
+# (data/, state/, backlog) is untouched and survives regardless.
+#
+# Usage:
+#   fm-resume.sh           ensure the session once, then exit
+#   fm-resume.sh --watch   ensure forever, re-checking every FM_RESUME_INTERVAL
+#                          seconds (the long-running ExecStart for the service)
+set -eu
+
+FM_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# How firstmate itself is launched. These mirror the captain's live session
+# exactly (verified from the running process): the REAL claude binary, not the
+# round-robin crew shim on PATH; the orchestrator account; sandbox + skip-perms
+# because firstmate runs as root inside an isolated WSL VM.
+SESSION="${FM_SESSION:-firstmate}"
+FM_CLAUDE_BIN="${FM_CLAUDE_BIN:-$HOME/.local/bin/claude}"
+FM_CONFIG_DIR="${FM_CONFIG_DIR:-/mnt/c/Users/Owenz/.claude-orchestrator}"
+FM_RESUME_INTERVAL="${FM_RESUME_INTERVAL:-60}"
+
+ensure_session() {
+  if tmux has-session -t "=$SESSION" 2>/dev/null; then
+    return 0
+  fi
+
+  # Build the firstmate launch command. %q shell-quotes each value so the tmux
+  # pane's shell re-parses it intact. We exec the real binary by absolute path so a
+  # PATH that prefers the crew shim cannot misroute firstmate onto a worker account.
+  local launch
+  printf -v launch \
+    'cd %q && CLAUDE_CONFIG_DIR=%q IS_SANDBOX=1 exec %q --dangerously-skip-permissions' \
+    "$FM_ROOT" "$FM_CONFIG_DIR" "$FM_CLAUDE_BIN"
+
+  # "=NAME" forces an exact session match (a bare name also prefix-matches, e.g. NAME2).
+  # The trailing ":" targets its active window, never a hardcoded ":0" (base-index may be 1).
+  tmux new-session -d -s "$SESSION" -c "$FM_ROOT"
+  tmux send-keys -t "=$SESSION:" "$launch" Enter
+  echo "fm-resume: created session '$SESSION' running firstmate"
+  return 0
+}
+
+if [ "${1:-}" = "--watch" ]; then
+  while :; do
+    ensure_session || true
+    sleep "$FM_RESUME_INTERVAL"
+  done
+else
+  if tmux has-session -t "=$SESSION" 2>/dev/null; then
+    echo "fm-resume: session '$SESSION' already live"
+  else
+    ensure_session
+  fi
+fi
diff --git a/bin/fm-spawn.sh b/bin/fm-spawn.sh
index d0b1dc4..e221c0d 100755
--- a/bin/fm-spawn.sh
+++ b/bin/fm-spawn.sh
@@ -121,10 +121,12 @@ tmux send-keys -t "$T" 'treehouse get' Enter
 WT=""
 for _ in $(seq 1 60); do
   p=$(tmux display-message -p -t "$T" '#{pane_current_path}' 2>/dev/null || true)
-  if [ -n "$p" ] && [ "$p" != "$PROJ_ABS" ]; then
-    WT="$p"
-    break
-  fi
+  # Wait specifically for a treehouse worktree (under {root}/.treehouse/), not just any
+  # cwd change: a freshly-created window can transiently report the session's default cwd
+  # before `treehouse get` lands, which would otherwise be misrecorded as the worktree.
+  case "$p" in
+    */.treehouse/*) WT="$p"; break ;;
+  esac
   sleep 1
 done
 if [ -z "$WT" ]; then
diff --git a/bin/fm-teardown.sh b/bin/fm-teardown.sh
index 4fd2355..e717525 100755
--- a/bin/fm-teardown.sh
+++ b/bin/fm-teardown.sh
@@ -59,7 +59,9 @@ if [ -d "$WT" ] && [ "$FORCE" != "--force" ]; then
     # The work is safe once it is merged into the local default branch (firstmate
     # does that merge on the captain's approval). Refuse until then.
     DEFAULT=$(default_branch) || { echo "REFUSED: cannot determine default branch for $PROJ; expected origin/HEAD, main, or master." >&2; exit 1; }
-    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^\?\? \.claude/' | head -1 || true)
+    # Exempt only the hook file itself: anchor on the porcelain "XY path" layout so
+    # the exemption cannot hide some other changed path that merely contains its name.
+    dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^.. \.claude/settings\.local\.json$|^\?\? \.claude/' | head -1 || true)
     unmerged=$(git -C "$WT" log --oneline HEAD --not "$DEFAULT" -- 2>/dev/null | head -5 || true)
     if [ -n "$dirty" ] || [ -n "$unmerged" ]; then
       echo "REFUSED: local-only worktree $WT has work not yet merged into $DEFAULT." >&2
@@ -70,7 +72,7 @@ if [ -d "$WT" ] && [ "$FORCE" != "--force" ]; then
   fi
 else
   # The fm-spawn hook file is ours, never work product; ignore it in the dirty check.
-  dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^\?\? \.claude/' | head -1 || true)
+  dirty=$(git -C "$WT" status --porcelain 2>/dev/null | grep -vE '^.. \.claude/settings\.local\.json$|^\?\? \.claude/' | head -1 || true)
   unpushed=$(git -C "$WT" log --oneline HEAD --not --remotes -- 2>/dev/null | head -5 || true)
   if [ -n "$dirty" ] || [ -n "$unpushed" ]; then
     echo "REFUSED: worktree $WT has work not on any remote." >&2
diff --git a/bin/fm-wake-lib.sh b/bin/fm-wake-lib.sh
index bfb7ba9..d66eb18 100755
--- a/bin/fm-wake-lib.sh
+++ b/bin/fm-wake-lib.sh
@@ -7,7 +7,13 @@ FM_ROOT="${FM_ROOT_OVERRIDE:-${FM_ROOT:-$FM_WAKE_DEFAULT_ROOT}}"
 STATE="${FM_STATE_OVERRIDE:-${STATE:-$FM_ROOT/state}}"
 FM_WAKE_QUEUE="${FM_WAKE_QUEUE:-$STATE/.wake-queue}"
 FM_WAKE_QUEUE_LOCK="${FM_WAKE_QUEUE_LOCK:-$STATE/.wake-queue.lock}"
-FM_LOCK_STALE_AFTER="${FM_LOCK_STALE_AFTER:-2}"
+# Grace before an empty-pid lock (holder died in the microsecond window
+# between creating it and writing its pid) is treated as stale and reclaimed. A normal
+# holder death is reclaimed instantly via pid-liveness (kill -0), so this only
+# bounds recovery from that rare window. Kept generous (10s) so that under heavy
+# scheduling delay a live holder mid-write is never mistaken for stale and has
+# its lock stolen - the double-grant that made the concurrency tests flake.
+FM_LOCK_STALE_AFTER="${FM_LOCK_STALE_AFTER:-10}"
 mkdir -p "$STATE"
 
 fm_current_pid() {
@@ -36,78 +42,100 @@ fm_path_age() {
   echo $(( $(date +%s) - m ))
 }
 
-fm_lock_remove_stale() {
-  local lockdir=$1 expected_pid=$2 current_pid
-  current_pid=$(cat "$lockdir/pid" 2>/dev/null || true)
-  [ "$current_pid" = "$expected_pid" ] || return 1
-  if fm_pid_alive "$current_pid"; then
-    return 1
-  fi
-  case "$current_pid" in
-    ''|*[!0-9]*)
-      [ "$(fm_path_age "$lockdir")" -ge "$FM_LOCK_STALE_AFTER" ] || return 1
-      ;;
-  esac
-  rm -f "$lockdir/pid" 2>/dev/null || return 1
-  rmdir "$lockdir" 2>/dev/null
-}
+# The lock is a single FILE created with O_EXCL (bash `set -C` noclobber). This
+# replaced an mkdir-based dir lock: plain mkdir is NOT atomic on every target
+# filesystem - on WSL2's filesystem several concurrent mkdir calls were observed
+# to all "succeed" on one path (verified: 4 simultaneous successes in a 20-way
+# barrier race), which silently double-granted the old lock and made the watcher
+# singleton and wake-queue draining race.
O_EXCL create IS atomic everywhere we
+# run (Linux, WSL2, macOS) and writes the holder pid in the SAME redirection, so
+# the owner is unknown only for the brief create-to-write gap tolerated below.
+#
+# The O_EXCL create is the ONE and ONLY grant. Reclaiming a dead holder's lock
+# never grants directly: it only frees the lock and lets the next O_EXCL create
+# (one atomic winner) take it. Reclaim is gated on the holder pid being dead, and
+# the mv-aside path re-checks what it took and hardlinks a live holder's file back.
+# NOTE(review): between that mv and the ln the path is briefly absent, so a third
+# racer's O_EXCL create can win there (ln then fails, ignored) - confirm acceptable.
 
 fm_lock_try_acquire() {
-  local lockdir=$1 pid
+  local lockfile=$1 pid me steal spid
   FM_LOCK_HELD_PID=
 
-  if mkdir "$lockdir" 2>/dev/null; then
-    if { fm_current_pid > "$lockdir/pid"; } 2>/dev/null; then
-      return 0
+  # Compute the pid in THIS shell, not inside the O_EXCL subshell below (where
+  # BASHPID would be the subshell's). Expanded before the subshell forks, so the
+  # holder pid is written - and matches what fm_lock_release compares against.
+  me=${BASHPID:-$$}
+
+  # If a reclaimable lock is present (dead holder, long-empty file, or a legacy
+  # pre-O_EXCL directory), free it first. A LIVE holder's lock is never freed.
+  if [ -d "$lockfile" ]; then
+    pid=$(cat "$lockfile/pid" 2>/dev/null || true)
+    if [ -n "$pid" ] && fm_pid_alive "$pid"; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
     fi
-    rm -f "$lockdir/pid" 2>/dev/null || true
-    rmdir "$lockdir" 2>/dev/null || true
-    return 1
-  fi
-
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
-  if fm_pid_alive "$pid"; then
-    FM_LOCK_HELD_PID=$pid
-    return 1
-  fi
-  case "$pid" in
-    ''|*[!0-9]*)
-      if [ "$(fm_path_age "$lockdir")" -lt "$FM_LOCK_STALE_AFTER" ]; then
-        FM_LOCK_HELD_PID=$pid
+    rm -rf "$lockfile" 2>/dev/null || true
+  elif [ -e "$lockfile" ]; then
+    pid=$(cat "$lockfile" 2>/dev/null || true)
+    if fm_pid_alive "$pid"; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
+    fi
+    # Empty-but-fresh file: tolerate a brief writer gap rather than reclaim.
+    if [ -z "$pid" ] && [ "$(fm_path_age "$lockfile")" -lt "$FM_LOCK_STALE_AFTER" ]; then
+      FM_LOCK_HELD_PID=$pid
+      return 1
+    fi
+    # Dead (or long-empty) holder: move the lock aside and re-check the exact
+    # bytes we moved. If a live holder had replaced it in the gap, restore it
+    # with an atomic hardlink (ln fails if a fresh holder already exists, so we
+    # never clobber one) and back off. Otherwise it is freed.
+    steal="$lockfile.stale.$me"
+    rm -f "$steal" 2>/dev/null || true
+    if mv "$lockfile" "$steal" 2>/dev/null; then
+      spid=$(cat "$steal" 2>/dev/null || true)
+      if [ -n "$spid" ] && fm_pid_alive "$spid"; then
+        ln "$steal" "$lockfile" 2>/dev/null || true
+        rm -f "$steal" 2>/dev/null || true
+        FM_LOCK_HELD_PID=$spid
         return 1
       fi
-      ;;
-  esac
-
-  fm_lock_remove_stale "$lockdir" "$pid" || true
-  if mkdir "$lockdir" 2>/dev/null; then
-    if { fm_current_pid > "$lockdir/pid"; } 2>/dev/null; then
-      return 0
+      rm -f "$steal" 2>/dev/null || true
     fi
-    rm -f "$lockdir/pid" 2>/dev/null || true
-    rmdir "$lockdir" 2>/dev/null || true
-    return 1
   fi
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
+  # The one and only grant: an atomic O_EXCL create. Exactly one racer wins;
+  # losers (someone created it first) fall through to report the holder.
+  if ( set -C; printf '%s\n' "$me" > "$lockfile" ) 2>/dev/null; then
+    return 0
+  fi
+  pid=$(cat "$lockfile" 2>/dev/null || true)
   # shellcheck disable=SC2034 # Read by callers after fm_lock_try_acquire returns.
   FM_LOCK_HELD_PID=$pid
   return 1
 }
 
 fm_lock_acquire_wait() {
-  local lockdir=$1
-  while ! fm_lock_try_acquire "$lockdir"; do
+  local lockfile=$1
+  while ! fm_lock_try_acquire "$lockfile"; do
     sleep 0.1
   done
 }
 
 fm_lock_release() {
-  local lockdir=$1 pid current
+  local lockfile=$1 pid current
   current=${BASHPID:-$$}
-  pid=$(cat "$lockdir/pid" 2>/dev/null || true)
+  # Remove only our own lock. A directory is the legacy format; treat its pid
+  # file the same way so an in-flight upgrade releases cleanly.
+  if [ -d "$lockfile" ]; then
+    pid=$(cat "$lockfile/pid" 2>/dev/null || true)
+    [ "$pid" = "$current" ] || return 0
+    rm -rf "$lockfile" 2>/dev/null || true
+    return 0
+  fi
+  pid=$(cat "$lockfile" 2>/dev/null || true)
   [ "$pid" = "$current" ] || return 0
-  rm -f "$lockdir/pid" 2>/dev/null || true
-  rmdir "$lockdir" 2>/dev/null || true
+  rm -f "$lockfile" 2>/dev/null || true
 }
 
 fm_wake_clean_field() {
diff --git a/systemd/firstmate.service b/systemd/firstmate.service
new file mode 100644
index 0000000..3488fef
--- /dev/null
+++ b/systemd/firstmate.service
@@ -0,0 +1,27 @@
+[Unit]
+Description=Firstmate persistent supervisor session (crash/reboot resilience)
+Documentation=https://github.com/kunchenguid/firstmate
+# Wait for the Windows drive mounts; the orchestrator account config lives under
+# /mnt/c. Non-fatal if the mount unit name differs - the resume script retries.
+After=local-fs.target
+
+[Service]
+Type=simple
+User=root
+WorkingDirectory=/root/firstmate
+Environment=HOME=/root
+# The watchdog: ensure the firstmate tmux session exists, re-checking forever.
+# On VM boot this recreates the session; if the loop itself ever dies, Restart
+# brings it back.
The loop re-execs nothing destructive - it no-ops when the
+# session is already live.
+ExecStart=/root/firstmate/bin/fm-resume.sh --watch
+Restart=always
+RestartSec=10
+# Kill ONLY the watchdog loop on stop/restart, never the tmux server it guards.
+# This makes firstmate survive `systemctl restart firstmate` and decouples the
+# supervisor session's lifetime from this unit: the service is a guardian, not
+# the owner. To fully stop firstmate, kill its tmux session directly.
+KillMode=process
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tests/fm-lock-exclusivity.test.sh b/tests/fm-lock-exclusivity.test.sh
new file mode 100644
index 0000000..a538175
--- /dev/null
+++ b/tests/fm-lock-exclusivity.test.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Regression test for the wake-queue lock's core guarantee: mutual exclusion.
+#
+# The lock once used `mkdir` as its atomic primitive. mkdir is NOT atomic on
+# every filesystem - on WSL2's filesystem several concurrent mkdir calls were
+# observed to all succeed on one path, which silently double-granted the lock and
+# made the watcher singleton and wake-queue draining race. The lock now uses an
+# O_EXCL create (atomic everywhere we run). This test fails loudly if the lock
+# ever stops being mutually exclusive again.
+set -u
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+fail() { printf 'not ok - %s\n' "$1" >&2; exit 1; }
+pass() { printf 'ok - %s\n' "$1"; }
+
+TMP=$(mktemp -d "${TMPDIR:-/tmp}/fm-lock-test.XXXXXX")
+trap 'rm -rf "$TMP"' EXIT
+export FM_STATE_OVERRIDE="$TMP"
+# shellcheck source=bin/fm-wake-lib.sh
+. "$ROOT/bin/fm-wake-lib.sh"
+
+LK="$FM_WAKE_QUEUE_LOCK"
+CUR="$TMP/current"; BAD="$TMP/bad"; : > "$BAD"; echo init > "$CUR"
+
+# Canonical mutex test: while holding the lock, write our pid to a shared file
+# and read it back. With true mutual exclusion it is always our own pid; a
+# non-empty foreign pid means another holder ran concurrently (double-grant).
+# (Empty reads are ignored as fork artifacts. NOTE(review): a rival holder's truncating write would also read empty - confirm.)
+worker() {
+  local _ who
+  for _ in $(seq 1 30); do
+    fm_lock_acquire_wait "$LK"
+    echo "$BASHPID" > "$CUR"
+    who=$(cat "$CUR" 2>/dev/null || true)
+    if [ -n "$who" ] && [ "$who" != "$BASHPID" ]; then echo "$who" >> "$BAD"; fi
+    fm_lock_release "$LK"
+  done
+}
+worker & worker & worker & worker & wait
+
+doubles=$(grep -c . "$BAD" 2>/dev/null || true)
+[ "$doubles" -eq 0 ] || fail "lock double-granted $doubles time(s): a second holder overwrote shared state while the lock was held"
+pass "wake-queue lock is mutually exclusive under contention (4 workers)"
+
+# A dead holder's lock must be reclaimable in a single try_acquire (fm-watch.sh
+# relies on this to take over after a crashed watcher); a live holder's must not.
+rm -f "$LK"
+dead=999999; while kill -0 "$dead" 2>/dev/null; do dead=$((dead + 1)); done
+printf '%s\n' "$dead" > "$LK"
+fm_lock_try_acquire "$LK" || fail "could not reclaim a dead holder's lock in one call"
+fm_lock_release "$LK"
+
+sleep 30 & holder=$!
+printf '%s\n' "$holder" > "$LK"
+if fm_lock_try_acquire "$LK"; then
+  kill "$holder" 2>/dev/null || true
+  fail "stole a live holder's lock"
+fi
+kill "$holder" 2>/dev/null || true
+rm -f "$LK"
+pass "lock reclaims a dead holder but never a live one"
diff --git a/tests/fm-resume.test.sh b/tests/fm-resume.test.sh
new file mode 100644
index 0000000..77a941c
--- /dev/null
+++ b/tests/fm-resume.test.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Behavior tests for bin/fm-resume.sh - the crash/reboot resurrection script.
+# Runs against a private tmux server (-L socket) so it never touches the live
+# firstmate session, and sets base-index 1 on that server to guard the bug where
+# send-keys targeted a hardcoded window ":0" that does not exist under base-index 1.
+set -u
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.."
&& pwd)"
+RESUME="$ROOT/bin/fm-resume.sh"
+
+fail() { printf 'not ok - %s\n' "$1" >&2; exit 1; }
+pass() { printf 'ok - %s\n' "$1"; }
+
+if ! command -v tmux >/dev/null 2>&1; then
+  printf 'ok - SKIP fm-resume (tmux not installed)\n'
+  exit 0
+fi
+
+REAL_TMUX=$(command -v tmux)
+SOCK="fmresume$$"
+TMP=$(mktemp -d "${TMPDIR:-/tmp}/fm-resume-test.XXXXXX")
+
+cleanup() {
+  "$REAL_TMUX" -L "$SOCK" kill-server 2>/dev/null || true
+  [ -n "${TMP:-}" ] && rm -rf "$TMP"
+}
+trap cleanup EXIT
+
+# Private server with base-index 1; a dummy session keeps the server alive and
+# proves new sessions inherit the non-zero base index.
+"$REAL_TMUX" -L "$SOCK" new-session -d -s seed -x 80 -y 24
+"$REAL_TMUX" -L "$SOCK" set -g base-index 1
+
+# Wrapper so fm-resume's bare `tmux` calls hit the private server.
+mkdir -p "$TMP/bin"
+cat > "$TMP/bin/tmux" <<EOF
+#!/usr/bin/env bash
+exec "$REAL_TMUX" -L "$SOCK" "\$@"
+EOF
+chmod +x "$TMP/bin/tmux"
+
+# Fake claude: records cwd/config/sandbox, then idles. NOTE(review): both heredoc bodies and FAKE's path were lost in extraction and are reconstructed from the assertions below - confirm.
+FAKE="$TMP/claude"
+cat > "$FAKE" <<EOF
+#!/usr/bin/env bash
+printf 'cwd=%s cfg=%s sandbox=%s\n' "\$PWD" "\${CLAUDE_CONFIG_DIR:-}" "\${IS_SANDBOX:-}" > "$TMP/launch.txt"
+sleep 300
+EOF
+chmod +x "$FAKE"
+
+run_resume() {
+  PATH="$TMP/bin:$PATH" FM_SESSION=fmtest FM_CLAUDE_BIN="$FAKE" FM_CONFIG_DIR=/cfg/orch "$RESUME"
+}
+
+# 1) create path: session made, firstmate launched with the right context.
+out=$(run_resume) || fail "resume create returned nonzero"
+printf '%s\n' "$out" | grep -q "created session 'fmtest'" || fail "create not reported: $out"
+for _ in $(seq 1 40); do [ -f "$TMP/launch.txt" ] && break; sleep 0.25; done
+[ -f "$TMP/launch.txt" ] || fail "firstmate never launched (base-index regression?)"
+grep -qF "cwd=$ROOT " "$TMP/launch.txt" || fail "wrong cwd: $(cat "$TMP/launch.txt")"
+grep -qF "cfg=/cfg/orch " "$TMP/launch.txt" || fail "config dir not applied: $(cat "$TMP/launch.txt")"
+grep -qF "sandbox=1" "$TMP/launch.txt" || fail "IS_SANDBOX not set: $(cat "$TMP/launch.txt")"
+pass "fm-resume creates the session and launches firstmate with the right context"
+
+# 2) idempotent: a second run no-ops and never makes a duplicate session.
+out2=$(run_resume) || fail "resume no-op returned nonzero"
+printf '%s\n' "$out2" | grep -q "already live" || fail "second run should no-op: $out2"
+count=$("$REAL_TMUX" -L "$SOCK" list-sessions 2>/dev/null | grep -c '^fmtest:')
+[ "$count" -eq 1 ] || fail "expected exactly one fmtest session, got $count"
+pass "fm-resume is idempotent (no duplicate session)"
diff --git a/tests/fm-wake-queue.test.sh b/tests/fm-wake-queue.test.sh
index c7df55a..12283b5 100755
--- a/tests/fm-wake-queue.test.sh
+++ b/tests/fm-wake-queue.test.sh
@@ -274,10 +274,20 @@ test_singleton_start() {
   pid1=$!
   PATH="$fakebin:$PATH" FM_STATE_OVERRIDE="$state" FM_POLL=5 FM_SIGNAL_GRACE=1 FM_CHECK_INTERVAL=999999 FM_HEARTBEAT=999999 "$WATCH" > "$out2" &
   pid2=$!
-  sleep 0.5
-  live=0
-  is_live_non_zombie "$pid1" && live=$((live + 1))
-  is_live_non_zombie "$pid2" && live=$((live + 1))
+  # Poll for the actual invariant rather than a fixed delay: the losing watcher
+  # must exit (leaving exactly one live) AND report the existing singleton. Under
+  # load the loser may not have reached its exit within a guessed sleep, which is
+  # what made this assertion flake. Break as soon as it settles (usually <0.5s).
+  live=2
+  for _ in $(seq 1 40); do
+    live=0
+    is_live_non_zombie "$pid1" && live=$((live + 1))
+    is_live_non_zombie "$pid2" && live=$((live + 1))
+    if [ "$live" -eq 1 ] && grep -hq 'watcher: already running pid ' "$out1" "$out2"; then
+      break
+    fi
+    sleep 0.25
+  done
   [ "$live" -eq 1 ] || fail "expected exactly one live watcher, got $live"
   grep -h 'watcher: already running pid ' "$out1" "$out2" >/dev/null || fail "second watcher did not report existing singleton"
   kill "$pid1" "$pid2" 2>/dev/null || true