Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/daemon/agents/kimi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -243,4 +243,17 @@ export const kimiShim: AgentShim = {
// Kimi CLI uses Moonshot subscription on the user's plan, not metered API.
return 0;
},

// Same opencode-go gateway flake risk as the opencode shim — when the
// requested model carries the opencode-go/ prefix the kimi shim
// delegates to opencode internally (see runHeadless above). Inherit
// the same retry posture so moonshot lineages don't silently fall
// back without an attempt at the cheap recovery. Flagged by codex on
// the PR #87 self-audit ("isOpenCodeFamily" idiom appears throughout
// error-detector.ts; the original lineage dispatch only checked
// 'opencode' and left this gap open).
retryPolicy: {
onNullKind: true,
onNoOutput: true,
},
};
19 changes: 19 additions & 0 deletions src/daemon/agents/opencode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,4 +218,23 @@ export const opencodeShim: AgentShim = {
// OpenCode Go subscription plan (Kimi/DeepSeek), not per-call API
return 0;
},

// Opencode-go's gateway has two well-documented transient failure
// modes that look terminal at the runner layer:
// - empty-stdout exit: subprocess exits 0 with no output -> the
// runner synthesises kind:'no_output' (universally terminal),
// but for opencode-go a second attempt typically succeeds with
// the same prompt (gateway boot race).
// - events-but-no-content: subprocess emits progress/tool events
// then closes without a message_done and without any error
// event -> kind stays undefined, content is empty.
// Both should be retried once. The original PR #87 attempted to
// address the second case with a lineage dispatch in the central
// classifier; the no_output case slipped through because no_output
// is in the universal terminal list. Moved here as part of the
// shim-owned retry refactor (PR #87 self-audit follow-up).
retryPolicy: {
onNullKind: true,
onNoOutput: true,
},
};
33 changes: 33 additions & 0 deletions src/daemon/agents/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,39 @@ export interface AgentShim {
* lineages return 0; API-keyed lineages use the rate card. Best-effort.
*/
estimateCostUsd(inputTokens: number, outputTokens: number, model?: string): number;
/**
* Shim-owned retry policy. The runner consults this BEFORE consulting the
* universal `isRetryableErrorKind` taxonomy — letting each shim declare its
* own transport-specific transient failure modes without leaking
* lineage-dispatch into the central classifier (PR #87 audit feedback).
*
* Three orthogonal signals:
* - `extraKinds`: shim-specific error kinds beyond the universal transient
* list (e.g. an opencode-go-specific `provider_returned_error` could go
* here if the shim ever emits it).
* - `onNullKind`: retry when the run returned without setting any error
* kind (`lastError.kind === undefined`). Opencode-go's gateway has a
* transport flake where the subprocess emits some events but no usable
* content with no error event — this catches that.
* - `onNoOutput`: retry when the run exited with zero events (the finally
* block synthesises `kind: 'no_output'`). For most lineages this is a
* deterministic transport bug (e.g. TTY-only output), but for opencode-
* go's empty-stdout-exit flake it's exactly the case we want to retry.
* Closes the bug the original PR #87 missed: my opencode-null retry
* was unreachable because empty-exit always synthesised `no_output`
* terminal kind before the null-kind branch could see it (caught by
* antigravity + gemini on the PR #87 self-audit).
*
* Omit entirely (or set all false) for shims that should keep the
* conservative universal-default behaviour — codex, claude, gemini all
* fall through to the universal list and retry only on documented
* transient kinds.
*/
readonly retryPolicy?: {
readonly extraKinds?: readonly string[];
readonly onNullKind?: boolean;
readonly onNoOutput?: boolean;
};
}

/**
Expand Down
102 changes: 54 additions & 48 deletions src/daemon/error-detector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,70 +27,72 @@ export interface CliError {

/**
* Decide whether a CLI failure is worth one immediate retry before
* advancing the fallback chain. Most CLI calls succeed; the failure
* cases that DO recur are deterministic — auth/quota/db-corrupt. The
* cases that DON'T recur are transient — network blip mid-stream,
* slow cold start, opencode DB lock that clears on the next attempt.
* One retry catches the cheap save without doubling spend on real
* outages (which fail twice and then advance as before).
* advancing the fallback chain. The decision is composed from TWO layers:
*
* Terminal (no retry — same call will fail the same way):
* Universal layer (this function's switch + openrouter prefix check):
* Kinds that any CLI/shim might produce, where retry is universally
* safe — network blips, slow cold starts, generic stream failures.
*
* Shim-owned layer (consulted via `retryPolicy` on AgentShim):
* Each shim declares its own transport-specific transient signals.
* Avoids leaking lineage-dispatch into this central classifier
* (slippery-slope concern from PR #87 audit). To opt in for a new
* lineage, add a `retryPolicy` block to its shim file — not a new
* branch here.
*
* Universal terminal (no retry — same call will fail the same way):
* - quota_exhausted quota window is server-scheduled
* - token_refresh_lost human must re-authenticate
* - opencode_db_corrupt local DB corruption persists
* - permission_prompt needs user interaction or recoverKeys
*
* Transient (retry once):
* Universal transient (retry once):
* - cold_start_timeout CLI was slow; second cold start may hit warm cache
* - tmux_dead session crashed; respawn may succeed
* - stream_failure catch-all from reviewer.ts when the stream
* ends with no kind match — usually a brief
* upstream blip
* - stream_failure catch-all when the stream ends with no kind match
* - unknown same as stream_failure
* - mcp_handshake_failed codex's bundled MCP server boots racily —
* slow first start, handshake-timeout under
* load, server crash on first start. The
* error LOOKS auth-shaped (was originally
* classified terminal for that reason) but
* real auth failures surface as
* token_refresh_lost. Retry catches the boot
* race; a rare actual-misconfig fails twice
* and advances as before. (Caught when codex
* hit this on the PR #87 audit chat and went
* straight to claude fallback without any
* recovery attempt.)
* - openrouter_fetch_failed pre-HTTP network error from the OpenRouter
* shim — DNS blip, ECONNRESET mid-handshake,
* ETIMEDOUT. Exactly the case retry is for.
* (Flagged by codex in chorus self-audit on PR #79.)
* - openrouter_no_body 2xx response with empty body — anomalous
* edge state at the OpenRouter edge; second
* request normally succeeds.
* - mcp_handshake_failed codex's bundled MCP server boots racily — real
* auth failures surface as token_refresh_lost.
* (Reclassified in PR #89.)
* - openrouter_fetch_failed pre-HTTP network error (DNS, ECONNRESET, ETIMEDOUT)
* - openrouter_no_body 2xx response with empty body
*
* Also retryable: HTTP-dispatched shim 5xx codes from OpenRouter (the
* caller passes the `openrouter_<code>` kind verbatim — we accept any
* 5xx as transient). 4xx codes (401/402/403/429) are NOT retried;
* they're either auth/quota or already rate-limited. Retrying 429
* immediately would just compound the rate-limit.
* Also retryable: OpenRouter 5xx HTTP codes (transient upstream outage).
* 4xx codes (401/402/403/429) stay terminal — auth/quota/rate-limit cases
* where retry just compounds the problem.
*
* Lineage-specific extension (PR #85): when `kind` is undefined AND
* `lineage === 'opencode'`, treat as retryable. Opencode-go's gateway
* has known transport flakes where the subprocess exits 0 with empty
* output (no classified errorKind, no message) but a second attempt
* succeeds. Other lineages keep the conservative "no kind = not
* retryable" default — codex/claude/gemini's null-with-no-kind cases
* usually mean the model genuinely produced nothing, where retry would
* just produce nothing again. Victor caught the gap on the PR #83
* audit (qwen3.6-plus on opencode produced null, no retry, straight to
* claude fallback — wasted the cheap save).
* Shim-specific signals (via `retryPolicy` on the shim):
* - extraKinds additional kinds beyond the universal list
* - onNullKind retry when `lastError.kind === undefined`
* (opencode-go: events emitted but no usable content,
* no error event — transport flake)
* - onNoOutput retry when the run synthesised `kind: 'no_output'`
* because `eventCount === 0` (opencode-go: subprocess
* exited 0 with empty stdout — the common case the
* original PR #87 missed because no_output was in
* the universal terminal list)
*
* The `shim` parameter is optional for callers that don't have one in scope
* (e.g. classifier-only tests). When omitted, only the universal layer
* applies — no shim-specific retries fire.
*/
import type { AgentShim } from './agents/types.js';

export function isRetryableErrorKind(
kind: string | undefined,
lineage?: string,
shim?: { retryPolicy?: AgentShim['retryPolicy'] },
): boolean {
const policy = shim?.retryPolicy;
// Shim-owned null-kind retry (was the PR #87 lineage-dispatch case).
if (!kind) {
return lineage === 'opencode';
return policy?.onNullKind === true;
}
// Shim-owned no_output retry — the COMMON opencode-go flake that the
// original PR #87 silently missed. Synthesised by the runner's finally
// block when eventCount === 0, and would otherwise be filtered as
// terminal below.
if (kind === 'no_output' && policy?.onNoOutput === true) return true;
// Universal transient switch.
switch (kind) {
case 'cold_start_timeout':
case 'tmux_dead':
Expand All @@ -101,10 +103,14 @@ export function isRetryableErrorKind(
case 'openrouter_no_body':
return true;
}
// OpenRouter HTTP-code escape: 5xx transient, 4xx terminal.
if (kind.startsWith('openrouter_')) {
const code = Number(kind.slice('openrouter_'.length));
return Number.isFinite(code) && code >= 500 && code < 600;
if (Number.isFinite(code) && code >= 500 && code < 600) return true;
}
// Shim-owned extra kinds (after universal — a shim CAN add a kind the
// universal list considers terminal, but only via explicit opt-in).
if (policy?.extraKinds?.includes(kind)) return true;
return false;
}

Expand Down
2 changes: 1 addition & 1 deletion src/daemon/runner/doer-driver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ export async function runDoer(
});
if (result !== null) return result;
if (attempt === MAX_ATTEMPTS) return null;
if (!isRetryableErrorKind(lastError.kind, entry.lineage)) return null;
if (!isRetryableErrorKind(lastError.kind, entryShim)) return null;
if (handle.signal.aborted) return null;
console.warn(
`[doer] retrying transient failure chat=${chatId} round=${round} ` +
Expand Down
2 changes: 1 addition & 1 deletion src/daemon/runner/reviewer-driver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ async function runReviewer(
});
if (result !== null) return result;
if (attempt === MAX_ATTEMPTS) return null;
if (!isRetryableErrorKind(lastError.kind, entry.lineage)) return null;
if (!isRetryableErrorKind(lastError.kind, entryShim)) return null;
if (handle.signal.aborted) return null;
console.warn(
`[reviewer] retrying transient failure chat=${chatId} round=${round} ` +
Expand Down
27 changes: 26 additions & 1 deletion src/daemon/runner/reviewer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -533,5 +533,30 @@ export async function runReviewerHeadless(args: {
);
}

return verdictFromReviewerText(content);
const verdict = verdictFromReviewerText(content);
// PR #91 audit catch (multiple reviewers, MEDIUM-HIGH): when the
// reviewer produces non-empty content but verdictFromReviewerText
// can't extract a positive/negative keyword, it returns null. The
// caller (reviewer-driver retry loop) sees `result === null` with
// `lastError.kind === undefined` — and the new opencode-shim
// retryPolicy.onNullKind=true causes a retry. But this case ISN'T
// a transport flake — the model successfully wrote prose that just
// doesn't match the verdict heuristic (apology, refusal, off-topic).
// Retrying it produces the same shape on the next attempt — wasted
// tokens. Setting an explicit `verdict_ambiguous` kind classifies
// this as terminal (not in the universal transient list, not opted
// in by any shim) so the retry skips and the chain advances.
//
// Guarded on !lastError.kind so the classifier doesn't stomp a real
// classified kind set by the finally block (e.g. when an errored
// stream wrote a `## REVIEWER FAILED` block to answer.md — content
// is non-empty so verdictFromReviewerText returns null, but the
// failure was classified earlier as `stream_failure` /
// `quota_exhausted` / etc. and that signal must survive).
if (verdict === null && lastError && !lastError.kind) {
lastError.kind = 'verdict_ambiguous';
lastError.message =
'Reviewer wrote non-empty content but no approve/reject verdict was detected.';
}
return verdict;
}
97 changes: 72 additions & 25 deletions tests/error-detector-retryable.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,31 +20,78 @@ describe('isRetryableErrorKind', () => {
expect(isRetryableErrorKind('')).toBe(false);
});

describe('opencode-null special case (PR #85)', () => {
it('returns TRUE for undefined kind when lineage is opencode', () => {
// Opencode-go's gateway has known transport flakes where the
// subprocess exits 0 with empty output (no errorKind, no message)
// but a second attempt succeeds. Without this the qwen-style
// null-with-no-kind failure goes straight to fallback chain
// advance, wasting the cheap save.
expect(isRetryableErrorKind(undefined, 'opencode')).toBe(true);
});

it('keeps the conservative default for other lineages on undefined kind', () => {
// codex/claude/gemini null-with-no-kind usually means the model
// genuinely produced nothing — retry would produce nothing again.
expect(isRetryableErrorKind(undefined, 'openai')).toBe(false);
expect(isRetryableErrorKind(undefined, 'anthropic')).toBe(false);
expect(isRetryableErrorKind(undefined, 'google')).toBe(false);
expect(isRetryableErrorKind(undefined, 'antigravity')).toBe(false);
});

it('the lineage hint does NOT override an explicit non-retryable kind', () => {
// Even on opencode, an auth/quota/db-corrupt kind is still
// terminal — retry would just produce the same error.
expect(isRetryableErrorKind('quota_exhausted', 'opencode')).toBe(false);
expect(isRetryableErrorKind('opencode_db_corrupt', 'opencode')).toBe(false);
expect(isRetryableErrorKind('no_output', 'opencode')).toBe(false);
describe('shim-owned retry policy (PR #87 follow-up)', () => {
// Shape matches `Pick<AgentShim, 'retryPolicy'>`. Building tiny
// shim stubs keeps the classifier tests independent of the full
// AgentShim contract (formatPrompt, runHeadless, etc.).
const opencodeShim = {
retryPolicy: { onNullKind: true, onNoOutput: true },
};
const codexShim = { retryPolicy: undefined };
const claudeShim = {};
const customShim = {
retryPolicy: { extraKinds: ['shim_specific_blip' as string] },
};

it('retries on null kind when shim opts in via onNullKind', () => {
// Opencode-go gateway flake: events emitted but no usable content
// and no error event.
expect(isRetryableErrorKind(undefined, opencodeShim)).toBe(true);
});

it('does NOT retry on null kind when shim has no retryPolicy', () => {
// Codex/claude/gemini null-with-no-kind = model genuinely produced
// nothing. Retry would produce the same nothing.
expect(isRetryableErrorKind(undefined, codexShim)).toBe(false);
expect(isRetryableErrorKind(undefined, claudeShim)).toBe(false);
expect(isRetryableErrorKind(undefined)).toBe(false);
});

it('retries on no_output when shim opts in via onNoOutput', () => {
// THE bug the original PR #87 missed: empty-stdout exit
// synthesises no_output (universally terminal), so the opencode
// null-kind path was unreachable for the most common opencode
// failure mode. onNoOutput closes that bypass.
expect(isRetryableErrorKind('no_output', opencodeShim)).toBe(true);
});

it('keeps no_output terminal for shims that did NOT opt in', () => {
expect(isRetryableErrorKind('no_output', codexShim)).toBe(false);
expect(isRetryableErrorKind('no_output', claudeShim)).toBe(false);
expect(isRetryableErrorKind('no_output')).toBe(false);
});

it('shim extraKinds list adds shim-specific retryable kinds', () => {
expect(isRetryableErrorKind('shim_specific_blip', customShim)).toBe(true);
// Same kind, no shim: stays terminal.
expect(isRetryableErrorKind('shim_specific_blip')).toBe(false);
});

it('shim policy does NOT override universal-terminal kinds', () => {
// Even an opencode shim with both flags on does NOT retry
// quota_exhausted / token_refresh_lost / opencode_db_corrupt —
// these are deterministic, retry would produce the same error.
expect(isRetryableErrorKind('quota_exhausted', opencodeShim)).toBe(false);
expect(isRetryableErrorKind('token_refresh_lost', opencodeShim)).toBe(false);
expect(isRetryableErrorKind('opencode_db_corrupt', opencodeShim)).toBe(false);
});

it('verdict_ambiguous is terminal even on opencode shim (PR #91 audit catch)', () => {
// A reviewer that wrote non-empty prose but no approve/reject
// keyword is NOT a transport flake. The runner now sets
// lastError.kind='verdict_ambiguous' so the opencode shim's
// onNullKind=true policy can't accidentally retry it.
expect(isRetryableErrorKind('verdict_ambiguous', opencodeShim)).toBe(false);
expect(isRetryableErrorKind('verdict_ambiguous')).toBe(false);
});

it('universal transient kinds retry regardless of shim presence', () => {
// The opt-in is for ADDITIONAL retries, not a gate on the
// universal set. A shim with no retryPolicy still gets the full
// universal transient behaviour.
expect(isRetryableErrorKind('stream_failure')).toBe(true);
expect(isRetryableErrorKind('stream_failure', codexShim)).toBe(true);
expect(isRetryableErrorKind('openrouter_503', codexShim)).toBe(true);
});
});

Expand Down
Loading
Loading