From 8305306bd874c285e8841a0642467649eec7b87c Mon Sep 17 00:00:00 2001 From: chorus-codes <280607145+chorus-codes@users.noreply.github.com> Date: Sun, 24 May 2026 12:23:06 +1000 Subject: [PATCH] fix(runner): mcp_handshake_failed is transient, not auth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex's bundled MCP server boots racily — slow first start, handshake timeout under load, server crash on first start. The error LOOKS auth-shaped (was originally classified terminal under "auth issue, same remediation as token_refresh_lost") but real codex auth failures always surface as token_refresh_lost. mcp_handshake_failed is almost always a transient boot race. Caught when codex hit this on the PR #87 audit chat — the run shows "MCP_HANDSHAKE_FAILED — failed: handshaking with MCP server failed" and went straight to claude-sonnet-4-6 fallback with no retry. Victor confirmed codex was working fine in the previous chat just minutes earlier — definitionally transient. Fix: move mcp_handshake_failed from terminal to transient in isRetryableErrorKind. Worst case: wastes one extra codex call on a genuine misconfig (fails twice and advances to fallback as before). Best case: catches the boot race and saves the fallback swap. Note: kindToStatus still maps mcp_handshake_failed -> auth_invalid for cli-health tracking. That's a separate concern (it triggers the 10-min precheck cooldown + "Auth broken" badge); flagging as follow-up but PR #81's auto-heal on cred-file mtime change covers the common recovery path. Worth revisiting in a dedicated PR if the badge proves sticky in practice. Tests: 978 passing. One terminal test removed, one transient test added. --- src/daemon/error-detector.ts | 14 +++++++++++++- tests/error-detector-retryable.test.ts | 14 ++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/daemon/error-detector.ts b/src/daemon/error-detector.ts index 6059861..bcfa1f1 100644 --- a/src/daemon/error-detector.ts +++ b/src/daemon/error-detector.ts @@ -37,7 +37,6 @@ export interface CliError { * Terminal (no retry — same call will fail the same way): * - quota_exhausted quota window is server-scheduled * - token_refresh_lost human must re-authenticate - * - mcp_handshake_failed auth issue, same remediation * - opencode_db_corrupt local DB corruption persists * - permission_prompt needs user interaction or recoverKeys * @@ -48,6 +47,18 @@ export interface CliError { * ends with no kind match — usually a brief * upstream blip * - unknown same as stream_failure + * - mcp_handshake_failed codex's bundled MCP server boots racily — + * slow first start, handshake-timeout under + * load, server crash on first start. The + * error LOOKS auth-shaped (was originally + * classified terminal for that reason) but + * real auth failures surface as + * token_refresh_lost. Retry catches the boot + * race; a rare actual-misconfig fails twice + * and advances as before. (Caught when codex + * hit this on the PR #87 audit chat and went + * straight to claude fallback without any + * recovery attempt.) * - openrouter_fetch_failed pre-HTTP network error from the OpenRouter * shim — DNS blip, ECONNRESET mid-handshake, * ETIMEDOUT. Exactly the case retry is for. @@ -85,6 +96,7 @@ export function isRetryableErrorKind( case 'tmux_dead': case 'stream_failure': case 'unknown': + case 'mcp_handshake_failed': case 'openrouter_fetch_failed': case 'openrouter_no_body': return true; diff --git a/tests/error-detector-retryable.test.ts b/tests/error-detector-retryable.test.ts index 2c273cf..af2649d 100644 --- a/tests/error-detector-retryable.test.ts +++ b/tests/error-detector-retryable.test.ts @@ -61,10 +61,6 @@ describe('isRetryableErrorKind', () => { expect(isRetryableErrorKind('token_refresh_lost')).toBe(false); }); - it('mcp_handshake_failed is terminal', () => { - expect(isRetryableErrorKind('mcp_handshake_failed')).toBe(false); - }); - it('opencode_db_corrupt is terminal', () => { // Local DB corruption persists across retries. expect(isRetryableErrorKind('opencode_db_corrupt')).toBe(false); @@ -104,6 +100,16 @@ describe('isRetryableErrorKind', () => { // Same treatment as stream_failure. expect(isRetryableErrorKind('unknown')).toBe(true); }); + + it('mcp_handshake_failed is retryable (codex MCP boot race)', () => { + // Was originally terminal (lumped with auth) but real auth + // surfaces as token_refresh_lost. mcp_handshake_failed is + // almost always codex's bundled MCP server booting racily — + // catches the cheap save without compounding cost on genuine + // misconfig. Caught when codex hit this on the PR #87 audit + // chat and went straight to claude fallback with no recovery. + expect(isRetryableErrorKind('mcp_handshake_failed')).toBe(true); + }); }); describe('OpenRouter shim error kinds', () => {