Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .changeset/fix-silent-rate-limit-uses-agent-provider.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
"@stoneforge/smithy": patch
---

fix(smithy): silent rate-limit detection attributes the limit to the worker's actual provider, not hardcoded `'claude'`

When the dispatch daemon detects a rate limit from a session that exited
without producing any output (the "silent rate limit" / rapid-exit
branch), or from the orphan-recovery loop noticing a rate-limit pattern
in a worker's session history, and **no `fallbackChain` is configured**,
it previously called `handleRateLimitDetected('claude', resetTime)` —
attributing the limit to `'claude'` regardless of the worker's actual
provider.

For codex (or any non-claude) workers, this surfaced in the dashboard
banner as `Dispatch paused — claude hit its rate limit.` and caused
`getRateLimitStatus()` to report `'claude'` as the limited executable
even when no claude session had ever run. Dispatch was correctly paused
(any tracked limit pauses dispatch in no-fallback-chain mode), but the
attribution was wrong, masking the real issue from operators.

The fix introduces a private `resolveDefaultExecutableForAgent(agent)`
helper that mirrors `resolveExecutableWithFallback`'s resolution
priority — agent's `executablePath` override → workspace
`defaultExecutablePaths[provider]` → bare provider name — without doing
any rate-limit lookups. Both rapid-exit sites now attribute the rate
limit to the failing worker's executable. `'claude-code'` (the canonical
provider name) maps to `'claude'` (the binary name) so existing tracker
entries and banner display for default Claude workers are unchanged.
275 changes: 275 additions & 0 deletions packages/smithy/src/services/dispatch-daemon.bun.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4916,6 +4916,232 @@ describe('recoverOrphanedTask - rapid-exit detection', () => {
});
});

// ============================================================================
// Rapid-exit detection — agent-aware executable (no fallback chain)
// ============================================================================

describe('recoverOrphanedTask - rapid-exit silent rate limit (no fallback chain)', () => {
let api: QuarryAPI;
let inboxService: InboxService;
let agentRegistry: AgentRegistry;
let taskAssignment: TaskAssignmentService;
let dispatchService: DispatchService;
let sessionManager: SessionManager;
let worktreeManager: WorktreeManager;
let stewardScheduler: StewardScheduler;
let settingsService: SettingsService;
let daemon: DispatchDaemon;
let testDbPath: string;
let systemEntity: EntityId;

/**
* Same shape as the helper in the parent rapid-exit suite — a session
* manager whose startSession returns a real EventEmitter so tests can
* synthesize 'event' / 'exit' messages and exercise the rapid-exit
* detector attached by recoverOrphanedTask.
*/
function createMockSessionManagerWithEvents(): SessionManager & { _lastEvents: EventEmitter | null } {
const sessions = new Map<EntityId, SessionRecord>();
let lastEvents: EventEmitter | null = null;

const mgr = {
_lastEvents: null as EventEmitter | null,
startSession: mock(async (agentId: EntityId, options?: StartSessionOptions) => {
const events = new EventEmitter();
lastEvents = events;
mgr._lastEvents = events;
const session: SessionRecord = {
id: `session-${Date.now()}`,
agentId,
agentRole: 'worker',
workerMode: 'ephemeral',
status: 'running',
workingDirectory: options?.workingDirectory,
worktree: options?.worktree,
createdAt: createTimestamp(),
startedAt: createTimestamp(),
lastActivityAt: createTimestamp(),
};
sessions.set(agentId, session);
return { session, events };
}),
getActiveSession: mock((agentId: EntityId) => {
return sessions.get(agentId) ?? null;
}),
stopSession: mock(async () => {}),
suspendSession: mock(async () => {}),
resumeSession: mock(async (agentId: EntityId) => {
const events = new EventEmitter();
lastEvents = events;
mgr._lastEvents = events;
const session: SessionRecord = {
id: `session-resume-${Date.now()}`,
agentId,
agentRole: 'worker',
workerMode: 'ephemeral',
status: 'running',
createdAt: createTimestamp(),
startedAt: createTimestamp(),
lastActivityAt: createTimestamp(),
};
sessions.set(agentId, session);
return { session, events };
}),
getSession: mock(() => undefined),
listSessions: mock(() => []),
messageSession: mock(async () => ({ success: true })),
getSessionHistory: mock(() => []),
pruneInactiveSessions: mock(() => 0),
reconcileOnStartup: mock(async () => ({ reconciled: 0, errors: [] })),
on: mock(() => {}),
off: mock(() => {}),
emit: mock(() => {}),
} as unknown as SessionManager & { _lastEvents: EventEmitter | null };

return mgr;
}

beforeEach(async () => {
testDbPath = `/tmp/dispatch-daemon-rapid-exit-no-chain-${Date.now()}-${Math.random().toString(36).slice(2)}.db`;
const storage = createStorage({ path: testDbPath, create: true });
initializeSchema(storage);

api = createQuarryAPI(storage);
inboxService = createInboxService(storage);
agentRegistry = createAgentRegistry(api);
taskAssignment = createTaskAssignmentService(api);
dispatchService = createDispatchService(api, taskAssignment, agentRegistry);
sessionManager = createMockSessionManagerWithEvents();
worktreeManager = createMockWorktreeManager();
stewardScheduler = createMockStewardScheduler();
// Empty fallbackChain — exercises the branch that previously hardcoded 'claude'.
settingsService = createMockSettingsService();

const { createEntity, EntityTypeValue } = await import('@stoneforge/core');
const entity = await createEntity({
name: 'test-system-rapid-exit-no-chain',
entityType: EntityTypeValue.SYSTEM,
createdBy: 'system:test' as EntityId,
});
const saved = await api.create(entity as unknown as Record<string, unknown> & { createdBy: EntityId });
systemEntity = saved.id as unknown as EntityId;

daemon = new DispatchDaemonImpl(
api,
agentRegistry,
sessionManager,
dispatchService,
worktreeManager,
taskAssignment,
stewardScheduler,
inboxService,
{
pollIntervalMs: 100,
orphanRecoveryEnabled: true,
maxResumeAttemptsBeforeRecovery: 5,
},
undefined,
settingsService
);
});

afterEach(async () => {
await daemon.stop();
if (fs.existsSync(testDbPath)) {
fs.unlinkSync(testDbPath);
}
});

async function createAssignedTask(
title: string,
workerId: EntityId,
meta?: { sessionId?: string; worktree?: string; branch?: string; resumeCount?: number }
): Promise<Task> {
const task = await createTask({
title,
createdBy: systemEntity,
status: TaskStatus.IN_PROGRESS,
assignee: workerId,
});
const saved = await api.create(task as unknown as Record<string, unknown> & { createdBy: EntityId }) as Task;

await api.update(saved.id, {
metadata: updateOrchestratorTaskMeta(undefined, {
assignedAgent: workerId,
branch: meta?.branch ?? 'agent/test/task-branch',
worktree: meta?.worktree ?? '/worktrees/test/task',
sessionId: meta?.sessionId,
resumeCount: meta?.resumeCount,
}),
});

return (await api.get<Task>(saved.id))!;
}

test('records silent rate limit against codex executable when worker provider is codex', async () => {
// Regression: in the no-fallback-chain branch, the silent rate limit was
// recorded against 'claude' regardless of the worker's actual provider.
// For codex workers this surfaced in the dashboard banner as
// "claude hit its rate limit" and paused dispatch on the wrong executable.
const worker = await agentRegistry.registerWorker({
name: 'rapid-exit-codex-worker',
workerMode: 'ephemeral',
createdBy: systemEntity,
maxConcurrentTasks: 1,
provider: 'codex',
});
const workerId = worker.id as unknown as EntityId;
await createAssignedTask('Codex task with rapid exit', workerId, {
worktree: '/worktrees/rapid-exit-codex-worker/task',
resumeCount: 0,
});

expect(daemon.getRateLimitStatus().limits).toHaveLength(0);

await daemon.recoverOrphanedAssignments();

// Simulate rapid exit without any output — the silent rate limit branch
const events = (sessionManager as ReturnType<typeof createMockSessionManagerWithEvents>)._lastEvents!;
events.emit('exit', 1, null);

await new Promise(resolve => setTimeout(resolve, 50));

const status = daemon.getRateLimitStatus();
expect(status.isPaused).toBe(true);
expect(status.limits).toHaveLength(1);
expect(status.limits[0].executable).toBe('codex');
});

test('records silent rate limit against claude executable when worker provider is claude-code (default)', async () => {
// Backward-compat: workers created without an explicit provider default
// to 'claude-code' on the registry side; the rate-limit tracker keeps
// the historical 'claude' executable name (the binary name) for those.
const worker = await agentRegistry.registerWorker({
name: 'rapid-exit-default-worker',
workerMode: 'ephemeral',
createdBy: systemEntity,
maxConcurrentTasks: 1,
});
const workerId = worker.id as unknown as EntityId;
await createAssignedTask('Default-provider task with rapid exit', workerId, {
worktree: '/worktrees/rapid-exit-default-worker/task',
resumeCount: 0,
});

await daemon.recoverOrphanedAssignments();

const events = (sessionManager as ReturnType<typeof createMockSessionManagerWithEvents>)._lastEvents!;
events.emit('exit', 1, null);

await new Promise(resolve => setTimeout(resolve, 50));

const status = daemon.getRateLimitStatus();
expect(status.isPaused).toBe(true);
expect(status.limits).toHaveLength(1);
expect(status.limits[0].executable).toBe('claude');
});
});

// ============================================================================
// Triage Session Rate Limit Guard Tests
// ============================================================================
Expand Down Expand Up @@ -5445,6 +5671,55 @@ describe('spawnRecoveryStewardForTask - rate limit session history guard', () =>
expect(Math.abs(resetTime.getTime() - expectedResetTime)).toBeLessThan(10_000);
});

test('records rate limit against codex executable when worker provider is codex', async () => {
// Regression: previously the rate limit was hardcoded to 'claude' regardless
// of the worker's actual provider, so codex workers showed up in the
// dispatch banner as "claude hit its rate limit" and the dispatch pause
// was attributed to the wrong executable.
const worker = await agentRegistry.registerWorker({
name: 'rl-record-codex-worker',
workerMode: 'ephemeral',
createdBy: systemEntity,
maxConcurrentTasks: 1,
provider: 'codex',
});
const workerId = worker.id as unknown as EntityId;
await createTestRecoverySteward('recovery-steward-rl-codex');

await createTaskWithRateLimitPattern('Codex rate-limited task', workerId);

const result = await daemon.recoverOrphanedAssignments();
expect(result.processed).toBe(0);
expect(sessionManager.startSession).not.toHaveBeenCalled();

const status = daemon.getRateLimitStatus();
expect(status.isPaused).toBe(true);
expect(status.limits.length).toBe(1);
expect(status.limits[0].executable).toBe('codex');
});

test('records rate limit against worker.executablePath when explicit executable override is set', async () => {
const worker = await agentRegistry.registerWorker({
name: 'rl-record-custom-bin-worker',
workerMode: 'ephemeral',
createdBy: systemEntity,
maxConcurrentTasks: 1,
provider: 'claude-code',
executablePath: '/usr/local/bin/claude-canary',
});
const workerId = worker.id as unknown as EntityId;
await createTestRecoverySteward('recovery-steward-rl-custom-bin');

await createTaskWithRateLimitPattern('Custom-bin rate-limited task', workerId);

await daemon.recoverOrphanedAssignments();

const status = daemon.getRateLimitStatus();
expect(status.isPaused).toBe(true);
expect(status.limits.length).toBe(1);
expect(status.limits[0].executable).toBe('/usr/local/bin/claude-canary');
});

test('rate limit pattern detection prevents repeated log spam on subsequent cycles', async () => {
const worker = await createTestWorker('rl-spam-worker');
const workerId = worker.id as unknown as EntityId;
Expand Down
51 changes: 48 additions & 3 deletions packages/smithy/src/services/dispatch-daemon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2191,6 +2191,43 @@ export class DispatchDaemonImpl implements DispatchDaemon {
return undefined;
}

/**
* Resolves the executable name to use when recording a rate limit for an
* agent's session, in the absence of a configured fallback chain.
*
* Mirrors the resolution priority of {@link resolveExecutableWithFallback}
* (per-agent override → workspace-wide default for the provider → bare
* provider name) but performs no rate-limit tracker lookups — its sole
* purpose is to produce a stable executable identifier to attribute a
* detected rate limit to.
*
* The returned name is what surfaces in the dashboard rate-limit banner
* and in `getRateLimitStatus().limits[].executable`. Using the agent's
* provider rather than a hardcoded value is what makes the banner
* accurate when, e.g., a codex worker rapid-exits.
*
* `'claude-code'` (the canonical Claude provider name) is mapped to
* `'claude'` (the binary name) for backward compatibility with the
* pre-existing tracker entries and dashboard display for default
* Claude workers.
*/
private resolveDefaultExecutableForAgent(agent: AgentEntity): string {
const meta = getAgentMetadata(agent);
const agentExecutablePath = (meta as { executablePath?: string } | undefined)?.executablePath;
if (agentExecutablePath) return agentExecutablePath;

const providerName = (meta as { provider?: string } | undefined)?.provider ?? 'claude-code';

if (this.settingsService) {
const defaultPath = this.settingsService.getAgentDefaults().defaultExecutablePaths[providerName];
if (defaultPath) return defaultPath;
}

// 'claude-code' is the canonical provider name; 'claude' is the binary
// and what the rate-limit tracker / banner have always used for Claude.
return providerName === 'claude-code' ? 'claude' : providerName;
}

/**
* Terminates sessions that have exceeded the configured max duration.
* Prevents stuck workers from blocking their slot indefinitely.
Expand Down Expand Up @@ -2539,8 +2576,12 @@ export class DispatchDaemonImpl implements DispatchDaemon {
this.handleRateLimitDetected(executable, resetTime);
}
} else {
// No fallback chain — apply to the default provider
this.handleRateLimitDetected('claude', resetTime);
// No fallback chain — attribute the rate limit to the worker's own
// provider/executable rather than a hardcoded default, so the
// dashboard banner and dispatch pause reflect the actual limited
// executable (e.g. 'codex' for codex workers, not 'claude').
const workerExecutable = this.resolveDefaultExecutableForAgent(worker);
this.handleRateLimitDetected(workerExecutable, resetTime);
}

logger.info(
Expand Down Expand Up @@ -3419,7 +3460,11 @@ export class DispatchDaemonImpl implements DispatchDaemon {
this.handleRateLimitDetected(executable, resetTime);
}
} else {
this.handleRateLimitDetected('claude', resetTime);
// No fallback chain — attribute the rate limit to the failing
// worker's own provider/executable so the banner and dispatch
// pause reflect the actual limited executable.
const workerExecutable = this.resolveDefaultExecutableForAgent(worker);
this.handleRateLimitDetected(workerExecutable, resetTime);
}

return false;
Expand Down
Loading