Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/components/app-sidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -116,20 +116,33 @@ export function SidebarBody({ onNavigate, collapsed = false, onToggleCollapsed }

useEffect(() => {
let cancelled = false;
// Monotonic sequence so a slow earlier request can't overwrite the
// result of a later one. Multiple triggers (mount, SSE event,
// visibility change, 2s poll) can fire concurrently — without
// this, the slowest reply lands last and the sidebar shows stale
// data even after a fresher fetch already returned. PR #77 audit
// followup.
let reqSeq = 0;
let latestSeq = 0;

const fetchChats = async () => {
const mySeq = ++reqSeq;
try {
const list = await listChats({ limit: 12 });
if (cancelled) return;
if (mySeq < latestSeq) return; // a newer request already won
latestSeq = mySeq;
setChats(list);
setChatsState("ready");
} catch {
if (cancelled) return;
if (mySeq < latestSeq) return;
// Same display for known daemon-down vs unexpected errors —
// the dead `instanceof DaemonError` ternary that used to live
// here had both branches equal to "error". If we ever want to
// differentiate (e.g. "Daemon offline" vs "Something broke")
// add a new `chatsState` variant first.
latestSeq = mySeq;
setChatsState("error");
}
};
Expand Down
12 changes: 11 additions & 1 deletion src/components/live-run-real/enrich-rounds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,17 @@ export function enrichRounds(
if (slot.role === "doer") return p.role === "doer" && p.lineage === slot.lineage;
if (p.role !== "reviewer") return false;
if (p.lineage !== slot.lineage) return false;
const idxFromName = parseInt(p.participant.match(/-(\d+)$/)?.[1] ?? "0", 10);
// Non-conforming participant strings (no `-<digit>$` suffix)
// previously defaulted to 0 and silently matched slot 0 — a
// future MCP-created participant with a custom name could
// hijack the first reviewer card. Return a sentinel that no
// real slot index can equal so we treat non-conforming names
// as unmatched (correct: they go through the leftover-loop
// below, where they're identified by participant string, not
// by index).
const match = p.participant.match(/-(\d+)$/);
if (!match) return false;
const idxFromName = Number.parseInt(match[1], 10);
return idxFromName === slot.reviewerIdx;
});
if (real) {
Expand Down
79 changes: 64 additions & 15 deletions src/daemon/headless.ts
Original file line number Diff line number Diff line change
Expand Up @@ -195,18 +195,40 @@ export function reapOrphanProcesses(): { reaped: number; cleared: number } {
const rec = JSON.parse(fs.readFileSync(recordPath, 'utf-8')) as PidRecord;
// Check if the process is still alive.
if (processIsAlive(rec.pid)) {
// Still alive — orphan. Send SIGTERM.
// Still alive — orphan. Send SIGTERM to the whole process group
// on Unix (we spawn children detached → each one is its own
// group leader, so -pid targets the group). On Windows or if
// negative-PID isn't supported, fall back to the single PID.
const isWindowsRec = process.platform === 'win32';
try {
process.kill(rec.pid, 'SIGTERM');
if (!isWindowsRec) {
process.kill(-rec.pid, 'SIGTERM');
} else {
process.kill(rec.pid, 'SIGTERM');
}
} catch {
/* ignore */
try {
// -pid can fail (e.g. on Windows or if the group has been
// dissolved); fall back to the head pid before giving up.
process.kill(rec.pid, 'SIGTERM');
} catch {
/* already gone */
}
}
// Schedule SIGKILL after grace; use unref so it doesn't block exit.
const t = setTimeout(() => {
try {
process.kill(rec.pid, 'SIGKILL');
if (!isWindowsRec) {
process.kill(-rec.pid, 'SIGKILL');
} else {
process.kill(rec.pid, 'SIGKILL');
}
} catch {
// already gone
try {
process.kill(rec.pid, 'SIGKILL');
} catch {
// already gone
}
}
}, KILL_GRACE_MS);
t.unref();
Expand Down Expand Up @@ -336,6 +358,15 @@ export function spawnHeadless(opts: SpawnHeadlessOptions): HeadlessRun {
env: spawnEnv(opts.env),
stdio: ['pipe', 'pipe', 'pipe'],
shell: isWindows,
// Unix-only: spawn the child as its own process group leader so we
// can send SIGTERM/SIGKILL to the WHOLE group via `process.kill(-pid)`.
// Without this, killing only the head process (`child.kill()`) leaves
// any subprocesses it forked (e.g. codex's helper python procs,
// opencode's node workers) orphaned until they finish on their own.
// Windows has no process groups in this sense; child.kill calls
// TerminateProcess and the existing `shell: true` path already
// handles its own tree. (PR #74 codex audit follow-up.)
detached: !isWindows,
});

if (child.pid !== undefined) {
Expand Down Expand Up @@ -517,20 +548,38 @@ export function spawnHeadless(opts: SpawnHeadlessOptions): HeadlessRun {
// it DOES still hold the closure (and the file-scope `child` ref).
let killGraceTimer: NodeJS.Timeout | null = null;

const sigtermThenKill = (reason: string): void => {
if (killReason) return; // already killing
killReason = reason;
// Send `sig` to the child's process group on Unix (detached: true
// above made the child its own group leader, so process.kill(-pid, sig)
// signals every descendant). Falls back to child.kill on Windows or
// when pid is undefined. Catches ESRCH ("process already gone") which
// is the common case when the child exited between signal-emit time
// and the SIGKILL timer firing.
//
// Known limitation: if a grandchild calls setsid() / spawns with its
// own `detached: true`, it creates a new process group and escapes
// `process.kill(-pid)`. Mainstream CLI helpers don't do this, but a
// future shim integration that wraps codex / opencode with a
// setsid'd worker would orphan that worker on cancel. Mitigation
// would be a shim-specific killer; flag for the integration if it
// arises. (Caught on PR #83 self-audit by opencode-cli-4 finding.)
const killTree = (sig: 'SIGTERM' | 'SIGKILL'): void => {
try {
child.kill('SIGTERM');
if (!isWindows && typeof child.pid === 'number') {
process.kill(-child.pid, sig);
} else {
child.kill(sig);
}
} catch {
/* ignore */
/* ESRCH or already-dead — fine */
}
};

const sigtermThenKill = (reason: string): void => {
if (killReason) return; // already killing
killReason = reason;
killTree('SIGTERM');
killGraceTimer = setTimeout(() => {
try {
child.kill('SIGKILL');
} catch {
/* ignore */
}
killTree('SIGKILL');
killGraceTimer = null;
}, KILL_GRACE_MS);
killGraceTimer.unref();
Expand Down
8 changes: 7 additions & 1 deletion src/daemon/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { isReviewOnlyPhase, type StandardPhase, type Template } from '../lib/tem
import { admitChat } from './chat-gate.js';
import type { ErrorDetector } from './error-detector.js';
import { runDoer } from './runner/doer-driver.js';
import { resetRound as resetFallbackRound } from './runner/fallback-registry.js';
import { readPriorRoundFeedback } from './runner/prior-round.js';
import { runReviewers } from './runner/reviewer-driver.js';
import { runReviewOnlyPhase } from './runner/review-only-phase.js';
Expand Down Expand Up @@ -407,7 +408,12 @@ export async function runChat(opts: PhaseRunnerOptions): Promise<void> {
});
}
} else {
// No reviewers: doer succeeds immediately
// No reviewers: doer succeeds immediately. Drop any fallback
// claim the doer took on this round so the registry doesn't
// leak across long-running daemons. (For phases WITH
// reviewers, the reset already fires from runReviewers'
// finally block.)
resetFallbackRound(chatId, round);
doerSucceeded = true;
break;
}
Expand Down
141 changes: 101 additions & 40 deletions src/daemon/runner/doer-driver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ import { waitForAnswer } from '../output-watcher.js';
import * as participantAborts from '../participant-aborts.js';
import type { TmuxManager } from '../tmux-types.js';
import { runDoerHeadless } from './doer.js';
import {
release as releaseFallbackClaim,
resetRound as resetFallbackRound,
tryClaim as tryClaimFallbackTarget,
} from './fallback-registry.js';
import { buildAsk } from './prompt-builder.js';
import { runWithChainFallback, runWithModelFallback } from './run-with-fallback.js';
import { sanitizeName } from './sanitize-name.js';
Expand Down Expand Up @@ -198,42 +203,28 @@ export async function runDoer(
return await runWithChainFallback(
chain,
async (entry) => {
// Cross-lineage swap: when the entry's lineage differs from the
// doer's primary, re-resolve the shim. Slot identity (agentName,
// doerDir) stays bound to the primary lineage; cli_warning below
// surfaces the swap to the cockpit.
const entryShim = entry.lineage === phase.doer.lineage
? shim
: pickShimForVoice(entry.lineage as Lineage, entry.model);
// Single-retry on transient kinds before advancing the chain.
// See reviewer-driver for full rationale and the kind taxonomy.
const MAX_ATTEMPTS = 2;
const RETRY_BACKOFF_MS = 1000;
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
const lastError: { kind?: string; message?: string } = {};
const result = await runDoerHeadless({
shim: entryShim,
chatId,
phase,
round,
agentName,
askContent: ask,
answerFile,
doerCwd,
abortSignal: handle.signal,
onEvent,
modelOverride: entry.model,
lastError,
});
if (result !== null) return result;
if (attempt === MAX_ATTEMPTS) return null;
if (!isRetryableErrorKind(lastError.kind)) return null;
if (handle.signal.aborted) return null;
// Fallback-collision guard, mirroring reviewer-driver. The
// doer normally runs alone, but its fallback chain can end at
// the same shared template fallback target as the reviewer
// slots' chains (e.g. anthropic/claude-sonnet-4-6). Without
// this claim, the doer can run claude-sonnet-4-6 then a later
// reviewer slot also resolves to claude-sonnet-4-6 and runs
// it again — duplicate cost, lineage diversity broken. Sticky
// semantics: claim is held through the round on success and
// null; throw releases. See fallback-registry.ts for the full
// rationale (PR #79 audit, opencode-cli-5 finding).
const claimed = tryClaimFallbackTarget(
chatId,
round,
entry.lineage,
entry.model,
);
if (!claimed) {
console.warn(
`[doer] retrying transient failure chat=${chatId} round=${round} ` +
`[doer] fallback collision chat=${chatId} round=${round} ` +
`agent=${agentName} ` +
`target=${entry.lineage}/${entry.model ?? '(default)'} ` +
`kind=${lastError.kind} attempt=${attempt + 1}/${MAX_ATTEMPTS}`,
`— another slot is already running it; advancing chain`,
);
onEvent({
chatId,
Expand All @@ -243,15 +234,81 @@ export async function runDoer(
round,
role: 'doer',
agent: agentName,
reason: 'transient_retry',
message: `Transient ${lastError.kind ?? 'failure'} on ${entry.lineage}/${entry.model ?? '(default)'} — retrying once before advancing fallback.`,
reason: 'fallback_collision',
fromLineage: entry.lineage,
toLineage: entry.lineage,
fromModel: entry.model ?? '(default)',
toModel: entry.model ?? '(default)',
message: `Skipping ${entry.lineage}/${entry.model ?? '(default)'} — another slot is already running it. Advancing to next fallback to preserve lineage diversity.`,
},
ts: Date.now(),
});
await abortableSleep(RETRY_BACKOFF_MS, handle.signal);
if (handle.signal.aborted) return null;
return null;
}
let threw = false;
try {
// Cross-lineage swap: when the entry's lineage differs from the
// doer's primary, re-resolve the shim. Slot identity (agentName,
// doerDir) stays bound to the primary lineage; cli_warning below
// surfaces the swap to the cockpit.
const entryShim = entry.lineage === phase.doer.lineage
? shim
: pickShimForVoice(entry.lineage as Lineage, entry.model);
// Single-retry on transient kinds before advancing the chain.
// See reviewer-driver for full rationale and the kind taxonomy.
const MAX_ATTEMPTS = 2;
const RETRY_BACKOFF_MS = 1000;
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
const lastError: { kind?: string; message?: string } = {};
const result = await runDoerHeadless({
shim: entryShim,
chatId,
phase,
round,
agentName,
askContent: ask,
answerFile,
doerCwd,
abortSignal: handle.signal,
onEvent,
modelOverride: entry.model,
lastError,
});
if (result !== null) return result;
if (attempt === MAX_ATTEMPTS) return null;
if (!isRetryableErrorKind(lastError.kind)) return null;
if (handle.signal.aborted) return null;
console.warn(
`[doer] retrying transient failure chat=${chatId} round=${round} ` +
`agent=${agentName} ` +
`target=${entry.lineage}/${entry.model ?? '(default)'} ` +
`kind=${lastError.kind} attempt=${attempt + 1}/${MAX_ATTEMPTS}`,
);
onEvent({
chatId,
type: 'cli_warning',
payload: {
phaseId: phase.id,
round,
role: 'doer',
agent: agentName,
reason: 'transient_retry',
message: `Transient ${lastError.kind ?? 'failure'} on ${entry.lineage}/${entry.model ?? '(default)'} — retrying once before advancing fallback.`,
},
ts: Date.now(),
});
await abortableSleep(RETRY_BACKOFF_MS, handle.signal);
if (handle.signal.aborted) return null;
}
return null;
} catch (err) {
threw = true;
throw err;
} finally {
if (threw) {
releaseFallbackClaim(chatId, round, entry.lineage, entry.model);
}
}
return null;
},
(from, to, fromIdx) => {
const sameLineage = from.lineage === to.lineage;
Expand Down Expand Up @@ -339,12 +396,16 @@ export async function runDoer(
// Codex's slow cold-start (it auths + paints panels); shorter and the
// Enter we send below races against the input box being ready and gets
// eaten. Raise if a slower box still misses the prompt.
await new Promise((r) => setTimeout(r, 6000));
// abortableSleep so a cancelled chat doesn't wait the full 6s before
// teardown — PR #77 audit.
await abortableSleep(6000, abortSignal);
if (abortSignal.aborted) return null;

tmuxMgr.pasteBuffer(session.name, prompt);
// Small gap between paste and Enter so the TUI registers the paste before
// we submit.
await new Promise((r) => setTimeout(r, 500));
await abortableSleep(500, abortSignal);
if (abortSignal.aborted) return null;
tmuxMgr.sendKeys(session.name, ['Enter']);

// Poll capture-pane every 2s to surface known CLI failure modes while we
Expand Down
Loading
Loading