diff --git a/src/daemon/agents/kimi.ts b/src/daemon/agents/kimi.ts index 4f8dd97..eaa6d33 100644 --- a/src/daemon/agents/kimi.ts +++ b/src/daemon/agents/kimi.ts @@ -243,4 +243,17 @@ export const kimiShim: AgentShim = { // Kimi CLI uses Moonshot subscription on the user's plan, not metered API. return 0; }, + + // Same opencode-go gateway flake risk as the opencode shim — when the + // requested model carries the opencode-go/ prefix the kimi shim + // delegates to opencode internally (see runHeadless above). Inherit + // the same retry posture so moonshot lineages don't silently fall + // back without an attempt at the cheap recovery. Flagged by codex on + // the PR #87 self-audit ("isOpenCodeFamily" idiom appears throughout + // error-detector.ts; the original lineage dispatch only checked + // 'opencode' and left this gap open). + retryPolicy: { + onNullKind: true, + onNoOutput: true, + }, }; diff --git a/src/daemon/agents/opencode.ts b/src/daemon/agents/opencode.ts index 042b021..1ac843f 100644 --- a/src/daemon/agents/opencode.ts +++ b/src/daemon/agents/opencode.ts @@ -218,4 +218,23 @@ export const opencodeShim: AgentShim = { // OpenCode Go subscription plan (Kimi/DeepSeek), not per-call API return 0; }, + + // Opencode-go's gateway has two well-documented transient failure + // modes that look terminal at the runner layer: + // - empty-stdout exit: subprocess exits 0 with no output -> the + // runner synthesises kind:'no_output' (universally terminal), + // but for opencode-go a second attempt typically succeeds with + // the same prompt (gateway boot race). + // - events-but-no-content: subprocess emits progress/tool events + // then closes without a message_done and without any error + // event -> kind stays undefined, content is empty. + // Both should be retried once. The original PR #87 attempted to + // address the second case with a lineage dispatch in the central + // classifier; the no_output case slipped through because no_output + // is in the universal terminal list. Moved here as part of the + // shim-owned retry refactor (PR #87 self-audit follow-up). + retryPolicy: { + onNullKind: true, + onNoOutput: true, + }, }; diff --git a/src/daemon/agents/types.ts b/src/daemon/agents/types.ts index ac9ec81..bc44cd8 100644 --- a/src/daemon/agents/types.ts +++ b/src/daemon/agents/types.ts @@ -152,6 +152,39 @@ export interface AgentShim { * lineages return 0; API-keyed lineages use the rate card. Best-effort. */ estimateCostUsd(inputTokens: number, outputTokens: number, model?: string): number; + /** + * Shim-owned retry policy. The runner consults this BEFORE consulting the + * universal `isRetryableErrorKind` taxonomy — letting each shim declare its + * own transport-specific transient failure modes without leaking + * lineage-dispatch into the central classifier (PR #87 audit feedback). + * + * Three orthogonal signals: + * - `extraKinds`: shim-specific error kinds beyond the universal transient + * list (e.g. an opencode-go-specific `provider_returned_error` could go + * here if the shim ever emits it). + * - `onNullKind`: retry when the run returned without setting any error + * kind (`lastError.kind === undefined`). Opencode-go's gateway has a + * transport flake where the subprocess emits some events but no usable + * content with no error event — this catches that. + * - `onNoOutput`: retry when the run exited with zero events (the finally + * block synthesises `kind: 'no_output'`). For most lineages this is a + * deterministic transport bug (e.g. TTY-only output), but for opencode- + * go's empty-stdout-exit flake it's exactly the case we want to retry. + * Closes the bug the original PR #87 missed: my opencode-null retry + * was unreachable because empty-exit always synthesised `no_output` + * terminal kind before the null-kind branch could see it (caught by + * antigravity + gemini on the PR #87 self-audit). + * + * Omit entirely (or set all false) for shims that should keep the + * conservative universal-default behaviour — codex, claude, gemini all + * fall through to the universal list and retry only on documented + * transient kinds. + */ + readonly retryPolicy?: { + readonly extraKinds?: readonly string[]; + readonly onNullKind?: boolean; + readonly onNoOutput?: boolean; + }; } /** diff --git a/src/daemon/error-detector.ts b/src/daemon/error-detector.ts index bcfa1f1..8d55c57 100644 --- a/src/daemon/error-detector.ts +++ b/src/daemon/error-detector.ts @@ -27,70 +27,72 @@ export interface CliError { /** * Decide whether a CLI failure is worth one immediate retry before - * advancing the fallback chain. Most CLI calls succeed; the failure - * cases that DO recur are deterministic — auth/quota/db-corrupt. The - * cases that DON'T recur are transient — network blip mid-stream, - * slow cold start, opencode DB lock that clears on the next attempt. - * One retry catches the cheap save without doubling spend on real - * outages (which fail twice and then advance as before). + * advancing the fallback chain. The decision is composed from TWO layers: * - * Terminal (no retry — same call will fail the same way): + * Universal layer (this function's switch + openrouter prefix check): + * Kinds that any CLI/shim might produce, where retry is universally + * safe — network blips, slow cold starts, generic stream failures. + * + * Shim-owned layer (consulted via `retryPolicy` on AgentShim): + * Each shim declares its own transport-specific transient signals. + * Avoids leaking lineage-dispatch into this central classifier + * (slippery-slope concern from PR #87 audit). To opt in for a new + * lineage, add a `retryPolicy` block to its shim file — not a new + * branch here. + * + * Universal terminal (no retry — same call will fail the same way): * - quota_exhausted quota window is server-scheduled * - token_refresh_lost human must re-authenticate * - opencode_db_corrupt local DB corruption persists * - permission_prompt needs user interaction or recoverKeys * - * Transient (retry once): + * Universal transient (retry once): * - cold_start_timeout CLI was slow; second cold start may hit warm cache * - tmux_dead session crashed; respawn may succeed - * - stream_failure catch-all from reviewer.ts when the stream - * ends with no kind match — usually a brief - * upstream blip + * - stream_failure catch-all when the stream ends with no kind match * - unknown same as stream_failure - * - mcp_handshake_failed codex's bundled MCP server boots racily — - * slow first start, handshake-timeout under - * load, server crash on first start. The - * error LOOKS auth-shaped (was originally - * classified terminal for that reason) but - * real auth failures surface as - * token_refresh_lost. Retry catches the boot - * race; a rare actual-misconfig fails twice - * and advances as before. (Caught when codex - * hit this on the PR #87 audit chat and went - * straight to claude fallback without any - * recovery attempt.) - * - openrouter_fetch_failed pre-HTTP network error from the OpenRouter - * shim — DNS blip, ECONNRESET mid-handshake, - * ETIMEDOUT. Exactly the case retry is for. - * (Flagged by codex in chorus self-audit on PR #79.) - * - openrouter_no_body 2xx response with empty body — anomalous - * edge state at the OpenRouter edge; second - * request normally succeeds. + * - mcp_handshake_failed codex's bundled MCP server boots racily — real + * auth failures surface as token_refresh_lost. + * (Reclassified in PR #89.) + * - openrouter_fetch_failed pre-HTTP network error (DNS, ECONNRESET, ETIMEDOUT) + * - openrouter_no_body 2xx response with empty body * - * Also retryable: HTTP-dispatched shim 5xx codes from OpenRouter (the - * caller passes the `openrouter_` kind verbatim — we accept any - * 5xx as transient). 4xx codes (401/402/403/429) are NOT retried; - * they're either auth/quota or already rate-limited. Retrying 429 - * immediately would just compound the rate-limit. + * Also retryable: OpenRouter 5xx HTTP codes (transient upstream outage). + * 4xx codes (401/402/403/429) stay terminal — auth/quota/rate-limit cases + * where retry just compounds the problem. * - * Lineage-specific extension (PR #85): when `kind` is undefined AND - * `lineage === 'opencode'`, treat as retryable. Opencode-go's gateway - * has known transport flakes where the subprocess exits 0 with empty - * output (no classified errorKind, no message) but a second attempt - * succeeds. Other lineages keep the conservative "no kind = not - * retryable" default — codex/claude/gemini's null-with-no-kind cases - * usually mean the model genuinely produced nothing, where retry would - * just produce nothing again. Victor caught the gap on the PR #83 - * audit (qwen3.6-plus on opencode produced null, no retry, straight to - * claude fallback — wasted the cheap save). + * Shim-specific signals (via `retryPolicy` on the shim): + * - extraKinds additional kinds beyond the universal list + * - onNullKind retry when `lastError.kind === undefined` + * (opencode-go: events emitted but no usable content, + * no error event — transport flake) + * - onNoOutput retry when the run synthesised `kind: 'no_output'` + * because `eventCount === 0` (opencode-go: subprocess + * exited 0 with empty stdout — the common case the + * original PR #87 missed because no_output was in + * the universal terminal list) + * + * The `shim` parameter is optional for callers that don't have one in scope + * (e.g. classifier-only tests). When omitted, only the universal layer + * applies — no shim-specific retries fire. */ +import type { AgentShim } from './agents/types.js'; + export function isRetryableErrorKind( kind: string | undefined, - lineage?: string, + shim?: { retryPolicy?: AgentShim['retryPolicy'] }, ): boolean { + const policy = shim?.retryPolicy; + // Shim-owned null-kind retry (was the PR #87 lineage-dispatch case). if (!kind) { - return lineage === 'opencode'; + return policy?.onNullKind === true; } + // Shim-owned no_output retry — the COMMON opencode-go flake that the + // original PR #87 silently missed. Synthesised by the runner's finally + // block when eventCount === 0, and would otherwise be filtered as + // terminal below. + if (kind === 'no_output' && policy?.onNoOutput === true) return true; + // Universal transient switch. switch (kind) { case 'cold_start_timeout': case 'tmux_dead': @@ -101,10 +103,14 @@ export function isRetryableErrorKind( case 'openrouter_no_body': return true; } + // OpenRouter HTTP-code escape: 5xx transient, 4xx terminal. if (kind.startsWith('openrouter_')) { const code = Number(kind.slice('openrouter_'.length)); - return Number.isFinite(code) && code >= 500 && code < 600; + if (Number.isFinite(code) && code >= 500 && code < 600) return true; } + // Shim-owned extra kinds (after universal — a shim CAN add a kind the + // universal list considers terminal, but only via explicit opt-in). + if (policy?.extraKinds?.includes(kind)) return true; return false; } diff --git a/src/daemon/runner/doer-driver.ts b/src/daemon/runner/doer-driver.ts index a605d95..af06c21 100644 --- a/src/daemon/runner/doer-driver.ts +++ b/src/daemon/runner/doer-driver.ts @@ -276,7 +276,7 @@ export async function runDoer( }); if (result !== null) return result; if (attempt === MAX_ATTEMPTS) return null; - if (!isRetryableErrorKind(lastError.kind, entry.lineage)) return null; + if (!isRetryableErrorKind(lastError.kind, entryShim)) return null; if (handle.signal.aborted) return null; console.warn( `[doer] retrying transient failure chat=${chatId} round=${round} ` + diff --git a/src/daemon/runner/reviewer-driver.ts b/src/daemon/runner/reviewer-driver.ts index fd33e4c..d1ce491 100644 --- a/src/daemon/runner/reviewer-driver.ts +++ b/src/daemon/runner/reviewer-driver.ts @@ -594,7 +594,7 @@ async function runReviewer( }); if (result !== null) return result; if (attempt === MAX_ATTEMPTS) return null; - if (!isRetryableErrorKind(lastError.kind, entry.lineage)) return null; + if (!isRetryableErrorKind(lastError.kind, entryShim)) return null; if (handle.signal.aborted) return null; console.warn( `[reviewer] retrying transient failure chat=${chatId} round=${round} ` + diff --git a/src/daemon/runner/reviewer.ts b/src/daemon/runner/reviewer.ts index b6c0f3d..0b9e2e9 100644 --- a/src/daemon/runner/reviewer.ts +++ b/src/daemon/runner/reviewer.ts @@ -533,5 +533,30 @@ export async function runReviewerHeadless(args: { ); } - return verdictFromReviewerText(content); + const verdict = verdictFromReviewerText(content); + // PR #91 audit catch (multiple reviewers, MEDIUM-HIGH): when the + // reviewer produces non-empty content but verdictFromReviewerText + // can't extract a positive/negative keyword, it returns null. The + // caller (reviewer-driver retry loop) sees `result === null` with + // `lastError.kind === undefined` — and the new opencode-shim + // retryPolicy.onNullKind=true causes a retry. But this case ISN'T + // a transport flake — the model successfully wrote prose that just + // doesn't match the verdict heuristic (apology, refusal, off-topic). + // Retrying it produces the same shape on the next attempt — wasted + // tokens. Setting an explicit `verdict_ambiguous` kind classifies + // this as terminal (not in the universal transient list, not opted + // in by any shim) so the retry skips and the chain advances. + // + // Guarded on !lastError.kind so the classifier doesn't stomp a real + // classified kind set by the finally block (e.g. when an errored + // stream wrote a `## REVIEWER FAILED` block to answer.md — content + // is non-empty so verdictFromReviewerText returns null, but the + // failure was classified earlier as `stream_failure` / + // `quota_exhausted` / etc. and that signal must survive). + if (verdict === null && lastError && !lastError.kind) { + lastError.kind = 'verdict_ambiguous'; + lastError.message = + 'Reviewer wrote non-empty content but no approve/reject verdict was detected.'; + } + return verdict; } diff --git a/tests/error-detector-retryable.test.ts b/tests/error-detector-retryable.test.ts index af2649d..92ba93e 100644 --- a/tests/error-detector-retryable.test.ts +++ b/tests/error-detector-retryable.test.ts @@ -20,31 +20,78 @@ describe('isRetryableErrorKind', () => { expect(isRetryableErrorKind('')).toBe(false); }); - describe('opencode-null special case (PR #85)', () => { - it('returns TRUE for undefined kind when lineage is opencode', () => { - // Opencode-go's gateway has known transport flakes where the - // subprocess exits 0 with empty output (no errorKind, no message) - // but a second attempt succeeds. Without this the qwen-style - // null-with-no-kind failure goes straight to fallback chain - // advance, wasting the cheap save. - expect(isRetryableErrorKind(undefined, 'opencode')).toBe(true); - }); - - it('keeps the conservative default for other lineages on undefined kind', () => { - // codex/claude/gemini null-with-no-kind usually means the model - // genuinely produced nothing — retry would produce nothing again. - expect(isRetryableErrorKind(undefined, 'openai')).toBe(false); - expect(isRetryableErrorKind(undefined, 'anthropic')).toBe(false); - expect(isRetryableErrorKind(undefined, 'google')).toBe(false); - expect(isRetryableErrorKind(undefined, 'antigravity')).toBe(false); - }); - - it('the lineage hint does NOT override an explicit non-retryable kind', () => { - // Even on opencode, an auth/quota/db-corrupt kind is still - // terminal — retry would just produce the same error. - expect(isRetryableErrorKind('quota_exhausted', 'opencode')).toBe(false); - expect(isRetryableErrorKind('opencode_db_corrupt', 'opencode')).toBe(false); - expect(isRetryableErrorKind('no_output', 'opencode')).toBe(false); + describe('shim-owned retry policy (PR #87 follow-up)', () => { + // Shape matches `Pick`. Building tiny + // shim stubs keeps the classifier tests independent of the full + // AgentShim contract (formatPrompt, runHeadless, etc.). + const opencodeShim = { + retryPolicy: { onNullKind: true, onNoOutput: true }, + }; + const codexShim = { retryPolicy: undefined }; + const claudeShim = {}; + const customShim = { + retryPolicy: { extraKinds: ['shim_specific_blip' as string] }, + }; + + it('retries on null kind when shim opts in via onNullKind', () => { + // Opencode-go gateway flake: events emitted but no usable content + // and no error event. + expect(isRetryableErrorKind(undefined, opencodeShim)).toBe(true); + }); + + it('does NOT retry on null kind when shim has no retryPolicy', () => { + // Codex/claude/gemini null-with-no-kind = model genuinely produced + // nothing. Retry would produce the same nothing. + expect(isRetryableErrorKind(undefined, codexShim)).toBe(false); + expect(isRetryableErrorKind(undefined, claudeShim)).toBe(false); + expect(isRetryableErrorKind(undefined)).toBe(false); + }); + + it('retries on no_output when shim opts in via onNoOutput', () => { + // THE bug the original PR #87 missed: empty-stdout exit + // synthesises no_output (universally terminal), so the opencode + // null-kind path was unreachable for the most common opencode + // failure mode. onNoOutput closes that bypass. + expect(isRetryableErrorKind('no_output', opencodeShim)).toBe(true); + }); + + it('keeps no_output terminal for shims that did NOT opt in', () => { + expect(isRetryableErrorKind('no_output', codexShim)).toBe(false); + expect(isRetryableErrorKind('no_output', claudeShim)).toBe(false); + expect(isRetryableErrorKind('no_output')).toBe(false); + }); + + it('shim extraKinds list adds shim-specific retryable kinds', () => { + expect(isRetryableErrorKind('shim_specific_blip', customShim)).toBe(true); + // Same kind, no shim: stays terminal. + expect(isRetryableErrorKind('shim_specific_blip')).toBe(false); + }); + + it('shim policy does NOT override universal-terminal kinds', () => { + // Even an opencode shim with both flags on does NOT retry + // quota_exhausted / token_refresh_lost / opencode_db_corrupt — + // these are deterministic, retry would produce the same error. + expect(isRetryableErrorKind('quota_exhausted', opencodeShim)).toBe(false); + expect(isRetryableErrorKind('token_refresh_lost', opencodeShim)).toBe(false); + expect(isRetryableErrorKind('opencode_db_corrupt', opencodeShim)).toBe(false); + }); + + it('verdict_ambiguous is terminal even on opencode shim (PR #91 audit catch)', () => { + // A reviewer that wrote non-empty prose but no approve/reject + // keyword is NOT a transport flake. The runner now sets + // lastError.kind='verdict_ambiguous' so the opencode shim's + // onNullKind=true policy can't accidentally retry it. + expect(isRetryableErrorKind('verdict_ambiguous', opencodeShim)).toBe(false); + expect(isRetryableErrorKind('verdict_ambiguous')).toBe(false); + }); + + it('universal transient kinds retry regardless of shim presence', () => { + // The opt-in is for ADDITIONAL retries, not a gate on the + // universal set. A shim with no retryPolicy still gets the full + // universal transient behaviour. + expect(isRetryableErrorKind('stream_failure')).toBe(true); + expect(isRetryableErrorKind('stream_failure', codexShim)).toBe(true); + expect(isRetryableErrorKind('openrouter_503', codexShim)).toBe(true); }); }); diff --git a/tests/runner-reviewer.test.ts b/tests/runner-reviewer.test.ts index 6e89f47..a90f08b 100644 --- a/tests/runner-reviewer.test.ts +++ b/tests/runner-reviewer.test.ts @@ -156,6 +156,36 @@ describe('runReviewerHeadless', () => { expect(verdict).toBeNull(); }); + it('classifies ambiguous-verdict as verdict_ambiguous so opencode retry does NOT fire on apology-shaped responses', async () => { + // PR #91 audit catch: a reviewer that writes non-empty prose but + // no approve/reject keyword returns null. Pre-fix, lastError.kind + // stayed undefined → the new opencode shim's onNullKind=true + // policy fired a retry that wasted tokens on the same shape. + // Post-fix, lastError.kind is set to 'verdict_ambiguous' which is + // not in any retry list, so the driver advances the chain cleanly. + const text = `${PADDING} ${PADDING} the code seems consistent with the rest of the codebase.\n## DONE`; + const handle = makeFakeShim({ events: happyPathEvents(text) }); + const lastError: { kind?: string; message?: string } = {}; + const verdict = await runReviewerHeadless({ + shim: handle.shim, + chatId: 'test-chat', + phase: fixturePhase, + round: 1, + reviewerIdx: 0, + candidateLineage: 'openai', + candidateModel: 'gpt-5.5', + agentName: 'codex-cli', + askContent: 'review the doer output', + answerFile, + reviewerDir, + abortSignal: new AbortController().signal, + onEvent: (e) => events.push(e), + lastError, + }); + expect(verdict).toBeNull(); + expect(lastError.kind).toBe('verdict_ambiguous'); + }); + it('returns null when stream errors with no content', async () => { const handle = makeFakeShim({ events: [{ type: 'error', kind: 'quota_exhausted', message: 'limit hit' }],