diff --git a/src/daemon/error-detector.ts b/src/daemon/error-detector.ts index 6059861..bcfa1f1 100644 --- a/src/daemon/error-detector.ts +++ b/src/daemon/error-detector.ts @@ -37,7 +37,6 @@ export interface CliError { * Terminal (no retry — same call will fail the same way): * - quota_exhausted quota window is server-scheduled * - token_refresh_lost human must re-authenticate - * - mcp_handshake_failed auth issue, same remediation * - opencode_db_corrupt local DB corruption persists * - permission_prompt needs user interaction or recoverKeys * @@ -48,6 +47,18 @@ export interface CliError { * ends with no kind match — usually a brief * upstream blip * - unknown same as stream_failure + * - mcp_handshake_failed codex's bundled MCP server boots racily — + * slow first start, handshake-timeout under + * load, server crash on first start. The + * error LOOKS auth-shaped (was originally + * classified terminal for that reason) but + * real auth failures surface as + * token_refresh_lost. Retry catches the boot + * race; a rare actual-misconfig fails twice + * and advances as before. (Caught when codex + * hit this on the PR #87 audit chat and went + * straight to claude fallback without any + * recovery attempt.) * - openrouter_fetch_failed pre-HTTP network error from the OpenRouter * shim — DNS blip, ECONNRESET mid-handshake, * ETIMEDOUT. Exactly the case retry is for. @@ -85,6 +96,7 @@ export function isRetryableErrorKind( case 'tmux_dead': case 'stream_failure': case 'unknown': + case 'mcp_handshake_failed': case 'openrouter_fetch_failed': case 'openrouter_no_body': return true; diff --git a/tests/error-detector-retryable.test.ts b/tests/error-detector-retryable.test.ts index 2c273cf..af2649d 100644 --- a/tests/error-detector-retryable.test.ts +++ b/tests/error-detector-retryable.test.ts @@ -61,10 +61,6 @@ describe('isRetryableErrorKind', () => { expect(isRetryableErrorKind('token_refresh_lost')).toBe(false); }); - it('mcp_handshake_failed is terminal', () => { - expect(isRetryableErrorKind('mcp_handshake_failed')).toBe(false); - }); - it('opencode_db_corrupt is terminal', () => { // Local DB corruption persists across retries. expect(isRetryableErrorKind('opencode_db_corrupt')).toBe(false); @@ -104,6 +100,16 @@ describe('isRetryableErrorKind', () => { // Same treatment as stream_failure. expect(isRetryableErrorKind('unknown')).toBe(true); }); + + it('mcp_handshake_failed is retryable (codex MCP boot race)', () => { + // Was originally terminal (lumped with auth) but real auth + // surfaces as token_refresh_lost. mcp_handshake_failed is + // almost always codex's bundled MCP server booting racily — + // catches the cheap save without compounding cost on genuine + // misconfig. Caught when codex hit this on the PR #87 audit + // chat and went straight to claude fallback with no recovery. + expect(isRetryableErrorKind('mcp_handshake_failed')).toBe(true); + }); }); describe('OpenRouter shim error kinds', () => {