diff --git a/WORKBOOK_v6.md b/WORKBOOK_v6.md index 52a6869..c3eb505 100644 --- a/WORKBOOK_v6.md +++ b/WORKBOOK_v6.md @@ -18,7 +18,7 @@ schema_version: 6 product: ordinary-user-loop-os version_target: loop-os-v1 current_phase: V6-P6 # V6-P0..P6,见 §3;P4/P5 代码随本 PR 落地 -current_substep: v6p3_hold_real_proof_credentials # P3 真证明仍待操作员(真 Draft PR + 真判词) +current_substep: overnight_p1_p3_p4_p5_p6_done_p2_hold_planner_auth # P3 真证明仍待操作员(真 Draft PR + 真判词) last_session_id: s_v6_0003 open_holds: 0 blocked_on: operator_real_proof diff --git a/apps/dashboard/src/pages/Cockpit.test.tsx b/apps/dashboard/src/pages/Cockpit.test.tsx index c51ba22..da11ebe 100644 --- a/apps/dashboard/src/pages/Cockpit.test.tsx +++ b/apps/dashboard/src/pages/Cockpit.test.tsx @@ -173,6 +173,38 @@ describe('Draft PR gate card · GEMINI_NOT_CONFIGURED is humanized', () => { }) }) +// overnight-p3 — the loop card is an operator console: the daemon's +// primaryAction renders ON the card (cockpit-card-action) through the SAME +// handler plumbing as the guidance buttons, and the agent strip shows who is +// working — no logs needed. 
+describe('CockpitPage · loop card carries the primary action + agent strip (overnight-p3)', () => { + it('renders primaryAction on the card and clicking it calls the existing handler', async () => { + const session = { id: 's1', missionId: 'm1', title: 'M', prompt: 'p', status: 'approved', createdAt: '2026-01-01', updatedAt: '2026-01-01' } + apiMock.getLatestOperatorSession.mockResolvedValue({ session, messages: [] }) + const overview = makeOverview() + overview.operatorView!.primaryAction = { id: 'start-execution', label: 'Start Execution · 启动执行', kind: 'primary' } + overview.operatorView!.card = { + type: 'progress', + title: '可以开始执行 · Ready to start', + next_step: '点击启动执行后才会动手 · Work starts when you click start.', + machine: { user_state: 'ready_to_execute', stage: 'approved', hold_code: null, pr_gate_code: null }, + current_phase: '可以开始执行 · Ready to start', + current_action: '等待启动 · Waiting for your start.', + evidence_links: [], + tests_run: [], + } + apiMock.getMissionOverview.mockResolvedValue(overview) + apiMock.startOperatorSession.mockResolvedValue({ session, overview }) + render() + const btn = await screen.findByTestId('cockpit-card-action') + expect(btn.textContent).toContain('Start Execution') + expect(btn.getAttribute('data-action-id')).toBe('start-execution') + expect(screen.getByTestId('cockpit-card-agents')).toBeTruthy() + fireEvent.click(btn) + await waitFor(() => expect(apiMock.startOperatorSession).toHaveBeenCalledWith('s1')) + }) +}) + describe('buildGuidance · inline decision flow (#2: no Approve/Start after Generate)', () => { const base = { overview: null, diff --git a/apps/dashboard/src/pages/Cockpit.tsx b/apps/dashboard/src/pages/Cockpit.tsx index a79b516..d8efb95 100644 --- a/apps/dashboard/src/pages/Cockpit.tsx +++ b/apps/dashboard/src/pages/Cockpit.tsx @@ -47,6 +47,36 @@ const ACTION_TEST_IDS: Record = { 'start-over': 'cockpit-start-over', } +/** The single handler set behind every primary-action surface (guidance + loop card). 
*/ +interface PrimaryActionHandlers { + onBrainstorm: () => void + onRoadmap: () => void + onApprove: () => void + onStart: () => void + onPause: () => void + onResume: () => void + onDraftPr: () => void + onReset: () => void +} + +/** + * overnight-p3 — ONE id→handler map shared by the guidance buttons and the + * on-card action button (LoopCard `onAction`), so the card never duplicates + * the action plumbing. + */ +export function resolvePrimaryActionHandler(id: string, h: PrimaryActionHandlers): (() => void) | undefined { + return ({ + 'generate-plan': h.onRoadmap, + 'approve-roadmap': h.onApprove, + 'start-execution': h.onStart, + 'check-draft-pr-gate': h.onDraftPr, + resume: h.onResume, + pause: h.onPause, + 'start-over': h.onReset, + 'start-brainstorm': h.onBrainstorm, + } as Record void>)[id] +} + export function CockpitPage({ onNavigate }: { onNavigate?: (tab: string) => void } = {}) { const sse = useSSE('/api/events/stream') const [pendingApprovals, setPendingApprovals] = useState(0) @@ -200,14 +230,9 @@ export function CockpitPage({ onNavigate }: { onNavigate?: (tab: string) => void } } - const guidance = buildGuidance({ - session, - overview, - busy, - messages, - latestHold, - sseConnected: sse.connected, - operatorView: overview?.operatorView, + // overnight-p3 — the ONE handler set behind every action surface: the + // guidance buttons and the loop card's on-card action share this plumbing. 
+ const actionHandlers = { onBrainstorm: () => action('create', () => api.createOperatorSession({ repoId, title, prompt }), (x) => { setSession(x.session); setMessages(x.messages); setOverview(null); setDraftPrStatus(null) }), onRoadmap: () => session && action('roadmap', () => api.generateRoadmap(session.id), (x) => { setSession(x.session); setMessages(x.messages); if (x.mission) void api.getMissionOverview(x.mission.id).then(setOverview) }), onApprove: () => session && action('approve', () => api.approveRoadmap(session.id), (x) => { setSession(x.session); setOverview(x.overview) }), @@ -217,6 +242,16 @@ export function CockpitPage({ onNavigate }: { onNavigate?: (tab: string) => void onDraftPr: () => session && action('draft-pr', () => api.createDraftPr(session.id), (x) => { setOverview(x.overview); setDraftPrStatus({ status: x.status, code: x.code, reason: x.reason, url: x.pr?.url, number: x.pr?.number }) }), onReset: resetMission, onStop: () => session?.missionId && action('stop', () => api.stopOperatorSession(session.id), (x) => { setSession(x.session); setOverview(x.overview) }), + } + const guidance = buildGuidance({ + session, + overview, + busy, + messages, + latestHold, + sseConnected: sse.connected, + operatorView: overview?.operatorView, + ...actionHandlers, }) const hasSession = Boolean(session) @@ -351,7 +386,16 @@ export function CockpitPage({ onNavigate }: { onNavigate?: (tab: string) => void )} {notice &&
{notice}
} - {loopCard && } + {loopCard && ( + resolvePrimaryActionHandler(a.id, actionHandlers)?.()} + busy={Boolean(busy)} + prGate={operatorView?.safetySummary.prGate} + lastActivityPhase={operatorView?.lastActivity?.phase} + /> + )} void>)[action.id] + const click = resolvePrimaryActionHandler(action.id, opts) return { kicker: opts.operatorView.stageLabel, title: opts.operatorView.summary, diff --git a/apps/dashboard/src/pages/cockpit/LoopCard.test.tsx b/apps/dashboard/src/pages/cockpit/LoopCard.test.tsx index b979b9d..588e55e 100644 --- a/apps/dashboard/src/pages/cockpit/LoopCard.test.tsx +++ b/apps/dashboard/src/pages/cockpit/LoopCard.test.tsx @@ -10,8 +10,8 @@ * - the `machine` sub-object is NEVER rendered as visible text — raw codes * (HOLD-*, gate codes, stage tokens) live only in data-* attributes. */ -import { describe, it, expect, afterEach } from 'vitest' -import { render, cleanup } from '@testing-library/react' +import { describe, it, expect, afterEach, vi } from 'vitest' +import { render, cleanup, fireEvent } from '@testing-library/react' import { LoopCard } from './LoopCard.js' import type { ApiBlockerLoopCard, @@ -174,6 +174,182 @@ describe('LoopCard — per-type content', () => { }) }) +// ---- overnight-p3 — operator console upgrades ------------------------------- +// The five cards must tell a new user what the system is doing and what to do +// next WITHOUT logs: operator vocabulary titles, an agent strip, the primary +// action ON the card, evidence entries, and PR-gate transparency. + +function progressAt(stage: string): ApiProgressLoopCard { + const card = progress() + card.machine = { ...card.machine, stage } + return card +} + +function gateBlocker(code: string): ApiBlockerLoopCard { + const card = blocker() + card.machine = { user_state: 'blocked', stage: 'pr_blocked', hold_code: null, pr_gate_code: code } + card.human_explanation = '为了安全,PR 暂时不能开 · For safety the draft PR cannot be opened yet.' 
+ return card +} + +describe('LoopCard — operator vocabulary titles (理解/计划/构建/验证/合并)', () => { + it('maps card types to the operator stage vocabulary', () => { + expect(renderCard(understanding()).textContent).toContain('理解 · Understand') + cleanup() + expect(renderCard(plan()).textContent).toContain('计划 · Plan') + cleanup() + expect(renderCard(prReady(false)).textContent).toContain('合并 · PR·Merge') + }) + + it('progress splits VISUALLY into Build (running) vs Verify (evidence/validating); data-card-type stays progress', () => { + const build = renderCard(progressAt('running')) + expect(build.textContent).toContain('构建 · Build') + expect(build.textContent).not.toContain('验证 · Verify') + expect(build.getAttribute('data-card-type')).toBe('progress') + cleanup() + for (const stage of ['evidence_ready', 'validating', 'validators_missing', 'validators_ready']) { + const verify = renderCard(progressAt(stage)) + expect(verify.textContent).toContain('验证 · Verify') + expect(verify.textContent).not.toContain('构建 · Build') + expect(verify.getAttribute('data-card-type')).toBe('progress') + cleanup() + } + }) +}) + +describe('LoopCard — agent strip: who is working (cockpit-card-agents)', () => { + function strip(root: HTMLElement): HTMLElement { + const el = root.querySelector('[data-testid="cockpit-card-agents"]') as HTMLElement + expect(el).toBeTruthy() + return el + } + + it('every card lists the four-agent team (Claude/Codex/Gemini/GitHub)', () => { + const cards: ApiLoopCard[] = [understanding(), plan(), progress(), blocker(), prReady(false)] + for (const card of cards) { + const el = strip(renderCard(card)) + for (const name of ['Claude', 'Codex', 'Gemini', 'GitHub']) expect(el.textContent).toContain(name) + cleanup() + } + }) + + it('highlights the active agent per card type and machine stage', () => { + expect(strip(renderCard(understanding())).getAttribute('data-active-agent')).toBe('claude') + cleanup() + 
expect(strip(renderCard(plan())).getAttribute('data-active-agent')).toBe('claude') + cleanup() + expect(strip(renderCard(progressAt('running'))).getAttribute('data-active-agent')).toBe('codex') + cleanup() + expect(strip(renderCard(progressAt('validating'))).getAttribute('data-active-agent')).toBe('gemini') + cleanup() + expect(strip(renderCard(prReady(true))).getAttribute('data-active-agent')).toBe('github') + }) + + it('blocker: falls back to the last activity phase for the active agent', () => { + const { container } = render() + const el = container.querySelector('[data-testid="cockpit-card-agents"]') as HTMLElement + expect(el.getAttribute('data-active-agent')).toBe('codex') + }) +}) + +describe('LoopCard — next-step action button ON the card (cockpit-card-action)', () => { + const action = { id: 'approve-roadmap', label: 'Approve Roadmap · 批准路线' } + + it('renders the primary action as the card action button and forwards clicks', () => { + const onAction = vi.fn() + const { container } = render() + const btn = container.querySelector('[data-testid="cockpit-card-action"]') as HTMLButtonElement + expect(btn).toBeTruthy() + expect(btn.textContent).toContain('Approve Roadmap') + expect(btn.getAttribute('data-action-id')).toBe('approve-roadmap') + fireEvent.click(btn) + expect(onAction).toHaveBeenCalledWith(action) + }) + + it('renders no card action button without a primary action', () => { + const { container } = render() + expect(container.querySelector('[data-testid="cockpit-card-action"]')).toBeNull() + }) + + it('disables the card action while busy', () => { + const { container } = render( undefined} busy />) + expect((container.querySelector('[data-testid="cockpit-card-action"]') as HTMLButtonElement).disabled).toBe(true) + }) +}) + +describe('LoopCard — blocker recovery actions: list with the recommended one emphasized', () => { + it('marks the recommended recovery action', () => { + const root = renderCard(blocker()) + const list = 
root.querySelector('[data-testid="cockpit-card-recovery"]') as HTMLElement + expect(list).toBeTruthy() + const items = Array.from(list.querySelectorAll('li')) + expect(items.length).toBe(2) + expect(items[0]?.getAttribute('data-recommended')).toBe('true') + expect(items[0]?.querySelector('strong')).toBeTruthy() + expect(items[1]?.getAttribute('data-recommended')).toBe('false') + expect(items[1]?.querySelector('strong')).toBeNull() + }) +}) + +describe('LoopCard — evidence entries (cockpit-card-evidence)', () => { + it('progress: evidence links render as clickable-looking entries', () => { + const root = renderCard(progress()) + const entries = root.querySelectorAll('[data-testid="cockpit-card-evidence"]') + expect(entries.length).toBe(1) + expect(entries[0]?.textContent).toContain('evidence/run-1/gate.json') + }) + + it('pr_ready: changed files render as evidence entries', () => { + const root = renderCard(prReady(false)) + const entries = root.querySelectorAll('[data-testid="cockpit-card-evidence"]') + expect(entries.length).toBe(1) + expect(entries[0]?.textContent).toContain('src/a.ts') + }) +}) + +describe('LoopCard — PR gate transparency: why / who / next, never raw codes', () => { + it('blocker via the Gemini gate shows the three lines and credits Gemini', () => { + const root = renderCard(gateBlocker('GEMINI_NOT_PASS')) + const gate = root.querySelector('[data-testid="cockpit-card-pr-gate"]') as HTMLElement + expect(gate).toBeTruthy() + expect(gate.textContent).toContain('为什么') + expect(gate.textContent).toContain('谁说的') + expect(gate.textContent).toContain('下一步') + expect(gate.textContent).toContain('Gemini') + expect(gate.textContent).toContain('For safety the draft PR cannot be opened yet') + expect(root.textContent).not.toContain('GEMINI_NOT_PASS') + }) + + it('blocker via the policy gate credits the safety gate (安全门), never the raw code', () => { + const root = renderCard(gateBlocker('REMOTE_WRITES_DISABLED')) + const gate = 
root.querySelector('[data-testid="cockpit-card-pr-gate"]') as HTMLElement + expect(gate.textContent).toContain('安全门') + expect(root.textContent).not.toContain('REMOTE_WRITES_DISABLED') + }) + + it('a HOLD blocker does not pretend to be a PR-gate decision', () => { + const root = renderCard(blocker()) // hold_code present → the hold, not the gate, blocks + expect(root.querySelector('[data-testid="cockpit-card-pr-gate"]')).toBeNull() + }) + + it('pr_ready with a checked gate explains why it can open, who said so and what is next', () => { + const { container } = render( + , + ) + const gate = container.querySelector('[data-testid="cockpit-card-pr-gate"]') as HTMLElement + expect(gate).toBeTruthy() + expect(gate.textContent).toContain('为什么') + expect(gate.textContent).toContain('谁说的') + expect(gate.textContent).toContain('下一步') + expect(gate.textContent).toContain('Draft PR URL is recorded') + }) + + it('pr_ready without any gate info renders no gate section', () => { + const root = renderCard(prReady(false)) + expect(root.querySelector('[data-testid="cockpit-card-pr-gate"]')).toBeNull() + }) +}) + describe('LoopCard — blocker card: human explanation, never raw codes', () => { it('shows human_explanation, why_it_matters and recovery actions', () => { const root = renderCard(blocker()) diff --git a/apps/dashboard/src/pages/cockpit/LoopCard.tsx b/apps/dashboard/src/pages/cockpit/LoopCard.tsx index de26dc5..b3b04e3 100644 --- a/apps/dashboard/src/pages/cockpit/LoopCard.tsx +++ b/apps/dashboard/src/pages/cockpit/LoopCard.tsx @@ -1,5 +1,7 @@ /** * v6-P2 — the five-card loop cockpit surface (WORKBOOK_v6 GR#11). + * overnight-p3 — upgraded from "state display" to operator console: a new user + * must know what the system is doing and what to do next WITHOUT logs. * * Renders exactly ONE of the five ordinary-user cards derived by the daemon * (`overview.operatorView.card`, packages/daemon/src/loop-cards.ts). 
Three @@ -8,22 +10,92 @@ * exposed ONLY through data-* attributes, never as visible text. * 2. Visible text is calm and bilingual. * 3. Every card answers "what happens next" via a prominent `next_step`. + * + * Operator-console additions (all derived from existing view fields — no + * schema change; the internal five types / data-card-type stay unchanged): + * - card titles use the operator vocabulary 理解/计划/构建/验证/合并 + * (Understand / Plan / Build / Verify / PR·Merge); `progress` splits + * VISUALLY into Build vs Verify via machine.stage (title only); + * - an agent strip (cockpit-card-agents) shows who is working; + * - the daemon's primaryAction renders ON the card (cockpit-card-action), + * wired through the SAME handler plumbing as the guidance buttons; + * - evidence/files render as clickable-looking entries (cockpit-card-evidence); + * - PR-gate decisions show 为什么/谁说的/下一步 (cockpit-card-pr-gate). */ import type { ReactNode } from 'react' -import type { ApiLoopCard } from '../../api.js' +import type { ApiBlockerLoopCard, ApiLoopCard, ApiPrReadyLoopCard } from '../../api.js' + +/** Structural mirror of the daemon's OperatorActionView — id + label is enough. */ +export interface LoopCardAction { + id: string + label: string + disabled?: boolean +} + +/** Gate summary passed from the operator view (safetySummary.prGate). */ +export interface LoopCardPrGate { + status: string + reason?: string +} +// Operator vocabulary (理解/计划/构建/验证/合并). `progress` is resolved by +// stage in operatorStageLabel below; the value here is its Build default. 
const TYPE_LABELS: Record = { - understanding: '理解 · Understanding', - plan: '方案 · Plan', - progress: '进展 · Progress', + understanding: '理解 · Understand', + plan: '计划 · Plan', + progress: '构建 · Build', blocker: '需要你 · Needs you', - pr_ready: '收尾 · Ready for you', + pr_ready: '合并 · PR·Merge', } -const RISK_LABELS: Record<'low' | 'medium' | 'high', string> = { - low: '低 · low', - medium: '中 · medium', - high: '高 · high', +/** Verify-phase stages: evidence is being checked/reviewed, not built. */ +function isVerifyStage(stage: string): boolean { + return stage === 'evidence_ready' || stage === 'validating' || stage.startsWith('validators') +} + +/** Visible card title in the operator vocabulary — title only, no schema change. */ +export function operatorStageLabel(card: Pick): string { + if (card.type === 'progress' && isVerifyStage(card.machine.stage)) return '验证 · Verify' + return TYPE_LABELS[card.type] +} + +export type LoopCardAgent = 'claude' | 'codex' | 'gemini' | 'github' | 'none' + +const AGENTS: Array<{ id: Exclude; label: string }> = [ + { id: 'claude', label: 'Claude · 澄清/规划/审查' }, + { id: 'codex', label: 'Codex · 编码' }, + { id: 'gemini', label: 'Gemini · 终审' }, + { id: 'github', label: 'GitHub · PR' }, +] + +/** Who is working right now — derived from card type + machine.stage (+ last activity phase). */ +export function activeAgentForCard(card: Pick, lastActivityPhase?: string): LoopCardAgent { + switch (card.type) { + case 'understanding': + case 'plan': + return 'claude' + case 'progress': + return isVerifyStage(card.machine.stage) ? 'gemini' : 'codex' + case 'pr_ready': + return 'github' + case 'blocker': + if (lastActivityPhase === 'planning') return 'claude' + if (lastActivityPhase === 'executing') return 'codex' + return 'none' + } +} + +function AgentStrip({ card, lastActivityPhase }: { card: ApiLoopCard; lastActivityPhase?: string | undefined }) { + const active = activeAgentForCard(card, lastActivityPhase) + return ( +
+ {AGENTS.map((a) => ( + + {a.label} + + ))} +
+ ) } function Row({ k, children }: { k: string; children: ReactNode }) { @@ -43,6 +115,47 @@ function List({ items }: { items: string[] }) { ) } +/** Evidence/files as clickable-looking, read-only entries (cockpit-card-evidence). */ +function EvidenceList({ items }: { items: string[] }) { + return ( +
    + {items.map((path, i) => ( +
  • + + {path} + +
  • + ))} +
+ ) +} + +/** + * PR-gate transparency (为什么 · 谁说的 · 下一步) for cards whose state was + * decided by the PR gate. A HOLD blocker is NOT a gate decision — the hold + * explains itself. Raw codes never render; `who` is derived calm wording. + */ +function PrGateSection({ card, prGate }: { card: ApiBlockerLoopCard | ApiPrReadyLoopCard; prGate?: LoopCardPrGate | undefined }) { + const code = card.machine.pr_gate_code + const gateDecided = card.type === 'blocker' + ? card.machine.hold_code === null && code !== null + : code !== null || Boolean(prGate) + if (!gateDecided) return null + const who = code && code.startsWith('GEMINI') + ? 'Gemini(证据终审) · Gemini, the evidence-only reviewer' + : '安全门(系统守护) · The safety gate' + const why = card.type === 'blocker' + ? card.human_explanation + : prGate?.reason?.trim() || card.summary + return ( +
+ {why} + {who} + {card.next_step} +
+ ) +} + function UnderstandingBody({ card }: { card: Extract }) { return (
@@ -79,6 +192,12 @@ function PlanBody({ card }: { card: Extract }) { ) } +const RISK_LABELS: Record<'low' | 'medium' | 'high', string> = { + low: '低 · low', + medium: '中 · medium', + high: '高 · high', +} + function ProgressBody({ card }: { card: Extract }) { return (
@@ -89,27 +208,43 @@ function ProgressBody({ card }: { card: Extract {card.evidence_links.length > 0 - ? + ? : '证据生成后会列在这里 · Evidence appears here once produced.'}
) } -function BlockerBody({ card }: { card: Extract }) { +function BlockerBody({ card, prGate }: { card: Extract; prGate?: LoopCardPrGate | undefined }) { return ( <>

{card.human_explanation}

{card.why_it_matters} - - {card.recommended_action} + +
    + {card.recovery_actions.map((item, i) => { + const recommended = item === card.recommended_action + return ( +
  • + {recommended ? {item} : item} + {recommended && 推荐 · recommended} +
  • + ) + })} +
+
+ ) } -function PrReadyBody({ card, onRework }: { card: Extract; onRework?: (() => void) | undefined }) { +function PrReadyBody({ card, onRework, prGate }: { + card: Extract + onRework?: (() => void) | undefined + prGate?: LoopCardPrGate | undefined +}) { return ( <>
@@ -120,7 +255,7 @@ function PrReadyBody({ card, onRework }: { card: Extract - {card.files_changed.length > 0 ? : '没有记录到文件改动 · No file changes recorded.'} + {card.files_changed.length > 0 ? : '没有记录到文件改动 · No file changes recorded.'} {card.tests.length > 0 ? : '还没有检查记录 · No checks recorded yet.'} @@ -129,6 +264,7 @@ function PrReadyBody({ card, onRework }: { card: Extract{RISK_LABELS[card.risk]} {card.merge_policy}
+
+
+ )} {card.type === 'understanding' && } {card.type === 'plan' && } {card.type === 'progress' && } - {card.type === 'blocker' && } - {card.type === 'pr_ready' && } + {card.type === 'blocker' && } + {card.type === 'pr_ready' && } ) } diff --git a/apps/dashboard/src/pages/cockpit/cockpit.css b/apps/dashboard/src/pages/cockpit/cockpit.css index ddf7415..1756743 100644 --- a/apps/dashboard/src/pages/cockpit/cockpit.css +++ b/apps/dashboard/src/pages/cockpit/cockpit.css @@ -301,6 +301,18 @@ button.ck-stat:hover { background: var(--bg-sub); } .ck-loop-actions { display: flex; gap: 8px; } .ck-loop-rework { font-size: 12px; } +/* --- overnight-p3 operator-console additions: agent strip, on-card action, evidence entries, PR-gate transparency --- */ +.ck-loop-agents { display: flex; flex-wrap: wrap; gap: 6px; } +.ck-loop-agent { border: 1px solid #e2e8f0; border-radius: 999px; padding: 1px 8px; font-size: 10.5px; color: var(--fg-muted); background: var(--bg-sub); } +.ck-loop-agent.active { border-color: #93c5fd; background: var(--accent-soft); color: #1d4ed8; font-weight: 700; } +/* Primary look without the .primary class — the quality smoke pins exactly + one .ck-btn.primary per stage (the guidance row); the card mirrors it. 
*/ +.ck-loop-action { font-size: 12.5px; background: var(--accent); color: #fff; } +.ck-loop-recovery li.recommended { color: var(--fg); } +.ck-loop-recommended-tag { margin-left: 6px; font-size: 10px; color: #1d4ed8; background: var(--accent-soft); border-radius: 999px; padding: 0 6px; white-space: nowrap; } +.ck-evidence-link { color: #1d4ed8; text-decoration: underline; text-underline-offset: 2px; cursor: pointer; overflow-wrap: anywhere; } +.ck-loop-prgate { border-top: 1px dashed #dbe4f5; padding-top: 6px; } + .ck-review-block { align-self: stretch; max-width: 92%; border: var(--hair); border-radius: var(--r-lg); background: var(--bg); box-shadow: var(--sh-1); padding: 12px; display: grid; gap: 10px; } .ck-review-head { display: flex; align-items: center; justify-content: space-between; gap: 8px; font-size: 13px; } .ck-review-head span { color: var(--fg-muted); font-size: 11px; text-transform: uppercase; letter-spacing: 0.04em; } diff --git a/docs/SESSION_LOG_v3.md b/docs/SESSION_LOG_v3.md index fa4f30e..18b33a3 100644 --- a/docs/SESSION_LOG_v3.md +++ b/docs/SESSION_LOG_v3.md @@ -1,5 +1,11 @@ # SESSION LOG v3 +## s_v6_0005 · 2026-06-11 · Overnight harness loop — P1/P3/P4/P5/P6 done · P2 honest HOLD + +- P1: HOLD-PLANNER-AUTH detection + opt-in AEDEV_PLANNER_FALLBACK=codex (events record codex-cli (fallback), never impersonation) (+18). P3: operator-vocabulary cards + agent strip + on-card actions + PR-gate transparency, user-E2E 7/7 (+17). P4: merge-policy pure function, 864-combination sweep proves GR#10 (auto-merge off) (+14). P5: run-summary.md audit artifact on all four mission exits, absent-means-absent (+12). P6: full uninterrupted 30-min soak 5/5 PASS. +- P2 honest conclusion: REAL Draft PR exists (hermus-agent#4, operator-produced — remote-write gate truly proven); full cockpit chain + real Gemini verdict still HOLD-PLANNER-AUTH (operator claude login 401); recovery incl. the new fallback documented in evidence/v6/real-proof/. 
+- Suite: 950 passed / 0 failed. + ## s_v6_0004 · 2026-06-11 · V6-P3 cycle-4 attempt → honest HOLD-REAL-PROOF-CREDENTIALS - Planner chain (real, committed): env false-red REFUSE (cycle-2) → dirty-tree honest REFUSE (cycle-3) → clean PROPOSE cycle-4 = v6-p3-real-proof-closeout. diff --git a/docs/product/MERGE_POLICY.md b/docs/product/MERGE_POLICY.md new file mode 100644 index 0000000..ddd84bf --- /dev/null +++ b/docs/product/MERGE_POLICY.md @@ -0,0 +1,75 @@ +# Merge Policy — v6 mature-product action matrix + +> **Status (this cycle):** policy shipped as a **PURE DECISION FUNCTION + +> tests only**. No merge automation is wired. **Auto-merge is DISABLED per +> [WORKBOOK_v6 GR#10](../../WORKBOOK_v6.md) (human merge only):** the system +> never merges; a draft PR is the terminal machine exit, and the merge button +> belongs to a human. Enabling auto-merge in any future cycle requires a +> written operator-approved change to WORKBOOK_v6 §2. + +## Where this lives + +- **This document** is the forward-looking, mature-product action matrix. +- **Implementation:** `decideMergeAction()` in + [`packages/daemon/src/merge-policy-v6.ts`](../../packages/daemon/src/merge-policy-v6.ts) + — pure, exported only, **not imported by any merge execution path**. +- **Tests:** `packages/daemon/src/merge-policy-v6.test.ts` — full matrix, + including an exhaustive proof that `autoMergeEnabled=false` (the GR#10 + default, and the only legal value this cycle) can never produce + `auto_merge_eligible`, even for a perfect docs-only change. + +## Relationship to docs/AUTO_MERGE_POLICY.md (reconciliation, not duplication) + +[`docs/AUTO_MERGE_POLICY.md`](../AUTO_MERGE_POLICY.md) describes the legacy +v2.x **risk-score** policy implemented by the `MergePolicy` class +(`packages/validators/src/merge-policy.ts`), which mission-runner step 7 still +uses to label a run `AUTO_MERGE` / `WAITING` / `BLOCKED`. Two clarifications: + +1. 
Under GR#10 the runtime meaning of that `AUTO_MERGE` label is **"open a + draft PR through the fail-closed `DraftPrGate`"** — never an actual merge. + The gate additionally fail-closes on `allow_remote_writes`, the per-repo + whitelist, `repo.enabled`, and forbidden paths. +2. This document layers the v6 **action vocabulary** + (`auto_merge_eligible` / `draft_pr_only` / `hold` / `no_pr`) on top, keyed + by change kind rather than only by score. The legacy doc remains valid for + risk scoring; where the two disagree on the final action, **GR#10 and this + matrix win** (e.g. the legacy "AUTO_MERGE eligible if all gates pass" row + is capped at a draft PR this cycle). + +## Action matrix + +| Situation | Action | +|---|---| +| security / workflow / dependency / system-config change — any other state | `hold` | +| Gemini validator **FAIL** | `no_pr` (failed work never becomes a PR) | +| tests red | `no_pr` (same rule as a failed validator) | +| Gemini **inconclusive** or **not configured** | `hold` (a human decides; the system never guesses) | +| risk **high** (after the gates above) | `hold` | +| **docs-only** + low risk + tests pass + Gemini PASS + Claude review PASS | eligible for auto-merge **in a future cycle — currently DISABLED per WORKBOOK_v6 GR#10**, so this cycle: `draft_pr_only` | +| **code** change + all gates green + **explicit operator approval** | same as above: eligible only in a future cycle; this cycle `draft_pr_only` | +| **code** change without explicit approval (gates green) | `draft_pr_only` | +| Claude review `rework` / no review verdict (gates green) | `draft_pr_only` | +| docs-only above low risk (gates green) | `draft_pr_only` | + +## Decision order (mirrors the implementation) + +1. `security` / `workflow` / `dependency` / `system_config` → **hold**, + regardless of validators, risk, tests, or approval. +2. Gemini `fail` → **no_pr**. +3. Tests red → **no_pr**. +4. Gemini `inconclusive` / `not_configured` → **hold**. +5. 
Risk `high` → **hold**. +6. Eligibility check: docs-only + low risk + Claude review `approve`, **or** + code + Claude review `approve` + explicit operator approval. + - eligible + `autoMergeEnabled=true` (future cycle only) → **auto_merge_eligible** + - eligible + `autoMergeEnabled=false` (this cycle, always) → **draft_pr_only**, citing GR#10 +7. Everything else that survived the gates → **draft_pr_only**. + +## Scope guarantee for this cycle + +- `decideMergeAction()` is **export-only**: nothing in the daemon, runner, or + GitHub planes calls it to execute a merge. +- All production-relevant call sites (there are none yet) must pass + `autoMergeEnabled: false` until WORKBOOK_v6 GR#10 is formally amended. +- `auto_merge_eligible` is therefore a **label for audit/forecast purposes**, + unreachable this cycle; the tests pin this property exhaustively. diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/01-new.png b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/01-new.png new file mode 100644 index 0000000..1616498 Binary files /dev/null and b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/01-new.png differ diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/02-brainstorm-ready.png b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/02-brainstorm-ready.png new file mode 100644 index 0000000..f082080 Binary files /dev/null and b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/02-brainstorm-ready.png differ diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/03-plan-approval.png b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/03-plan-approval.png new file mode 100644 index 0000000..9792353 Binary files /dev/null and b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/03-plan-approval.png differ diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/04-approved.png b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/04-approved.png 
new file mode 100644 index 0000000..ed08b0c Binary files /dev/null and b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/04-approved.png differ diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/05-evidence-ready.png b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/05-evidence-ready.png new file mode 100644 index 0000000..ebc77c7 Binary files /dev/null and b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/05-evidence-ready.png differ diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/06-pr-blocked.png b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/06-pr-blocked.png new file mode 100644 index 0000000..96c8caa Binary files /dev/null and b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/06-pr-blocked.png differ diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/console-logs.json b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/console-logs.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/console-logs.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/db-state-summary.json b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/db-state-summary.json new file mode 100644 index 0000000..0fbf178 --- /dev/null +++ b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/db-state-summary.json @@ -0,0 +1,286 @@ +{ + "mission": { + "id": "01KTTVCRG5BZGRCVWWAKJY714P", + "status": "paused", + "githubPrUrl": null + }, + "operatorView": { + "stage": "pr_blocked", + "stageLabel": "PR blocked by policy · PR 被安全门拦截", + "confidence": 96, + "progressPercent": 95, + "headlessCallsToday": 0, + "primaryAction": { + "id": "check-draft-pr-gate", + "label": "Re-check Draft PR Gate · 重新检查 PR 安全门", + "kind": "primary" + }, + "secondaryActions": [], + "providerSummary": { + "planner": { + "name": "test-synthetic", + 
"mode": "mock", + "status": "Planner finished", + "tokens": null + }, + "worker": { + "name": "mock", + "mode": "mock", + "status": "done", + "tokens": null + }, + "validators": [ + { + "name": "gemini", + "mode": "not_configured", + "status": "not_configured" + } + ] + }, + "safetySummary": { + "remoteWrites": "disabled", + "prGate": { + "status": "blocked", + "code": "GEMINI_NOT_CONFIGURED", + "reason": "Gemini hard gate has no evidence-only PASS verdict for this mission.", + "remediation": "Remote writes are disabled for safety. Enable repo-scoped allow_remote_writes only when you want the worker to push a branch and open a Draft PR; until then no push, PR, or merge occurs." + }, + "testMode": { + "enabled": true, + "reason": "mock/template mode is active; no external model or remote write is implied." + } + }, + "understanding": { + "roundsCompleted": 0, + "questions": [], + "readyReason": "Planner confidence is at least 95% and no clarification questions are pending." + }, + "projectPulse": { + "progress": [ + { + "id": "understand", + "label": "Understand · 理解需求", + "status": "done" + }, + { + "id": "roadmap", + "label": "Roadmap · 路线图", + "status": "done" + }, + { + "id": "execute", + "label": "Execute · 本地执行", + "status": "done" + }, + { + "id": "validate", + "label": "Validate · 独立验证", + "status": "done", + "detail": "Gemini key is not configured; this is visible and not counted as pass." 
+ }, + { + "id": "pr-gate", + "label": "PR Gate · PR 安全门", + "status": "active" + }, + { + "id": "learn", + "label": "Learn · 沉淀记忆", + "status": "pending" + } + ], + "workingFolder": "/tmp/aedev-cockpit-quality-P9VKvY/operator-evidence/01KTTVCS05E2F630888TDZTZZT", + "touchedFiles": [], + "evidence": [ + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKR", + "title": "ADR draft", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/adr-mission.md", + "type": "adr" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKQ", + "title": "PRD", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/prd.md", + "type": "prd" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKT", + "title": "Workbook summary", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/workbook-summary.md", + "type": "report" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKV", + "title": "Test summary", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/test-summary.md", + "type": "report" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKW", + "title": "Risk report", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/risk-report.md", + "type": "report" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKX", + "title": "Worker diff", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/diff-summary.md", + "type": "report" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKY", + "title": "Done report", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/done-report.md", + "type": "report" + }, + { + "id": "01KTTVCS0F27GJYGMX7PH9VJKS", + "title": "Roadmap", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P/roadmap.md", + "type": "roadmap" + }, + { + "id": "01KTTVCS0E3W7F8P59WMZ7R25R", + "title": "Evidence directory", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P", + "type": "evidence" + }, + { + 
"id": "01KTTVCRGC1C6TJAR8VH1ET705", + "title": "ADR draft in mission design", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/prd/01KTTVCRG5BZGRCVWWAKJY714P.design.json", + "type": "adr" + }, + { + "id": "01KTTVCRGBY9R2AJF7802CWZ0H", + "title": "PRD", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/prd/01KTTVCRG5BZGRCVWWAKJY714P.md", + "type": "prd" + }, + { + "id": "01KTTVCRGC1C6TJAR8VH1ET704", + "title": "Mission design JSON", + "path": "/tmp/aedev-cockpit-quality-P9VKvY/prd/01KTTVCRG5BZGRCVWWAKJY714P.design.json", + "type": "roadmap" + } + ], + "validatorReviews": [ + { + "id": "validators-not-configured", + "validator": "validators", + "verdict": "not_configured", + "summary": "Independent validation did not run because the Gemini key is not configured.", + "checkedEvidence": [ + "ADR draft", + "PRD", + "Workbook summary", + "Test summary", + "Risk report", + "Worker diff", + "Done report", + "Roadmap" + ], + "blockingIssues": [], + "evidenceGaps": [ + "No Gemini validator verdict exists for this mission." + ], + "recommendedNextAction": "Configure validator keys for live verification, or continue reviewing evidence manually." 
+ } + ] + }, + "memorySummary": { + "projectFacts": [ + { + "id": "repo-01KTTVCMA48TBPF50TQAE80RKJ", + "kind": "project", + "text": "Target repo is cockpit-quality at /tmp/aedev-cockpit-quality-P9VKvY.", + "provenance": "repo registry", + "ttlDays": 90, + "superseded": false + }, + { + "id": "repo-forbidden-01KTTVCMA48TBPF50TQAE80RKJ", + "kind": "safety", + "text": "Forbidden paths stay protected: .env*, secrets/**, .github/**, AGENTS.md", + "provenance": "repo policy", + "ttlDays": 365, + "superseded": false + } + ], + "userPreferences": [ + { + "id": "pref-understand-first", + "kind": "user_preference", + "text": "Ask goal-specific questions and confirm understanding before starting worker execution.", + "provenance": "operator product directive", + "ttlDays": 365, + "superseded": false + }, + { + "id": "prompt-01KTTVCP0MB336M6V2C49A6C1N", + "kind": "mission_intent", + "text": "Current mission intent: In the dashboard Cockpit page, verify the existing conversation UI quality smoke keeps the single conversation layout, status strip, and safe Draft PR gate visible without changing product behavior. Acceptance: browser smoke passes and evid", + "provenance": "operator prompt", + "ttlDays": 30, + "superseded": false + } + ], + "recentLessons": [ + { + "id": "lesson-0", + "kind": "run_lesson", + "text": "Draft PR blocked: GEMINI_NOT_CONFIGURED", + "provenance": "event:operator.draft_pr_blocked", + "ttlDays": 30, + "superseded": false + } + ] + }, + "summary": "Worker done, evidence ready, and the Draft PR gate was blocked by policy. No branch push, PR, or merge occurred.", + "nextAction": "Continue reviewing evidence, or explicitly enable repo-scoped remote writes before re-checking the gate.", + "testMode": true, + "userState": { + "state": "blocked", + "label": "Needs your attention", + "labelZh": "需要你处理", + "explanation": "系统在这一步暂停,等你看一眼后再继续 · The system paused here and will continue once you take a look." 
+ }, + "lastActivity": { + "atIso": "2026-06-11T08:04:51.651Z", + "agoMs": 197, + "phase": "blocked" + }, + "loopSummary": { + "whatChanged": [], + "testsRan": [ + "Test summary" + ], + "agents": [ + "planner · test-synthetic", + "worker · mock", + "validator · gemini" + ], + "validatorSaid": null, + "whyStoppedOrContinuing": "系统在这一步暂停,等你看一眼后再继续 · The system paused here and will continue once you take a look." + }, + "card": { + "type": "blocker", + "title": "需要你处理 · Needs your attention", + "human_explanation": "系统在这一步暂停,等你看一眼后再继续 · The system paused here and will continue once you take a look.", + "why_it_matters": "在不确定的时候暂停,比悄悄做错更安全;没有你的确认,任何东西都不会对外发布 · Pausing when unsure is safer than quietly doing the wrong thing; nothing is published without your confirmation.", + "recovery_actions": [ + "查看这张卡的说明,确认是否继续 · Read this card’s explanation and confirm whether to continue.", + "随时可以重新开始或调整目标 · You can restart or adjust the goal at any time." + ], + "recommended_action": "查看这张卡的说明,确认是否继续 · Read this card’s explanation and confirm whether to continue.", + "next_step": "查看这张卡的说明,确认是否继续 · Read this card’s explanation and confirm whether to continue.", + "machine": { + "user_state": "blocked", + "stage": "pr_blocked", + "hold_code": null, + "pr_gate_code": "GEMINI_NOT_CONFIGURED" + } + } + } +} \ No newline at end of file diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/dom-state-summary.json b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/dom-state-summary.json new file mode 100644 index 0000000..6e84ddc --- /dev/null +++ b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/dom-state-summary.json @@ -0,0 +1,6 @@ +{ + "stage": "pr_blocked", + "planner": "mock", + "worker": "mock", + "prGateCode": "GEMINI_NOT_CONFIGURED" +} \ No newline at end of file diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/event-tail.json b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/event-tail.json new file mode 
100644 index 0000000..52a8510 --- /dev/null +++ b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/event-tail.json @@ -0,0 +1,120 @@ +[ + { + "type": "operator.draft_pr_blocked", + "payload": { + "code": "GEMINI_NOT_CONFIGURED", + "reason": "Gemini hard gate has no evidence-only PASS verdict for this mission.", + "validator": "gemini" + }, + "createdAt": "2026-06-11T08:04:51.651Z" + }, + { + "type": "operator.gemini_pr_blocked", + "payload": { + "code": "GEMINI_NOT_CONFIGURED", + "reason": "Gemini hard gate has no evidence-only PASS verdict for this mission.", + "verdict": "not_configured", + "summary": null + }, + "createdAt": "2026-06-11T08:04:51.651Z" + }, + { + "type": "operator.evidence_written", + "payload": { + "sessionId": "01KTTVCP0MB336M6V2C49A6C1N", + "evidenceDir": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P" + }, + "createdAt": "2026-06-11T08:04:49.807Z" + }, + { + "type": "operator.stage_changed", + "payload": { + "stage": "PR/Waiting/Blocked", + "sessionId": "01KTTVCP0MB336M6V2C49A6C1N", + "status": "waiting" + }, + "createdAt": "2026-06-11T08:04:49.807Z" + }, + { + "type": "mission.run_completed", + "payload": { + "taskId": "01KTTVCS05E2F630888TDZTZZT", + "runId": "01KTTVCS06ZE1HWCF2MQVPPY2D", + "exitCode": 0, + "status": "waiting", + "decision": "WAITING", + "riskScore": 0, + "validatorCount": 0, + "releaseDeployUrl": null, + "releaseReverted": false, + "draftPrUrl": null, + "draftPrNumber": null + }, + "createdAt": "2026-06-11T08:04:49.806Z" + }, + { + "type": "operator.worker_log", + "payload": { + "taskId": "01KTTVCS05E2F630888TDZTZZT", + "runId": "01KTTVCS06ZE1HWCF2MQVPPY2D", + "stream": "stdout", + "chunk": "mock worker completed evidence gate" + }, + "createdAt": "2026-06-11T08:04:49.799Z" + }, + { + "type": "operator.worker_started", + "payload": { + "taskId": "01KTTVCS05E2F630888TDZTZZT", + "runId": "01KTTVCS06ZE1HWCF2MQVPPY2D", + "provider": "mock", + "evidenceDir": 
"/tmp/aedev-cockpit-quality-P9VKvY/operator-evidence/01KTTVCS05E2F630888TDZTZZT" + }, + "createdAt": "2026-06-11T08:04:49.798Z" + }, + { + "type": "mission.route_selected", + "payload": { + "role": "coder", + "provider": "mock", + "sessionId": null, + "concurrency": 1, + "holdCode": null, + "reason": "worker router not configured" + }, + "createdAt": "2026-06-11T08:04:49.797Z" + }, + { + "type": "mission.run_started", + "payload": { + "evidenceDir": "/tmp/aedev-cockpit-quality-P9VKvY/evidence/01KTTVCRG5BZGRCVWWAKJY714P" + }, + "createdAt": "2026-06-11T08:04:49.789Z" + }, + { + "type": "operator.validators_not_configured", + "payload": { + "status": "not_configured", + "note": "Gemini validator key is not configured; Draft PR remains blocked until Gemini returns PASS." + }, + "createdAt": "2026-06-11T08:04:49.787Z" + }, + { + "type": "operator.worker_assigned", + "payload": { + "sessionId": "01KTTVCP0MB336M6V2C49A6C1N", + "mode": "mock", + "availableSessions": 0, + "paidApiKeysStripped": true + }, + "createdAt": "2026-06-11T08:04:49.787Z" + }, + { + "type": "operator.stage_changed", + "payload": { + "stage": "Worker", + "sessionId": "01KTTVCP0MB336M6V2C49A6C1N" + }, + "createdAt": "2026-06-11T08:04:49.786Z" + } +] \ No newline at end of file diff --git a/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/quality-smoke.md b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/quality-smoke.md new file mode 100644 index 0000000..ddbed11 --- /dev/null +++ b/evidence/browser-cockpit-quality/2026-06-11T08-04-44-897Z/quality-smoke.md @@ -0,0 +1,16 @@ +# Operator Cockpit WebUI Quality Smoke + +Result: PASS +Mission: 01KTTVCRG5BZGRCVWWAKJY714P +Stage: pr_blocked +PR gate: GEMINI_NOT_CONFIGURED + +Assertions: +- cockpit renders as one conversation column plus the three-part status strip +- legacy Project Pulse, sidebar, inspector, and tabbed panels are absent +- one primary action per stage +- stable testids for core controls +- planner/worker provider badges 
expose mock test mode +- PR URL stayed empty while Gemini hard gate was not configured +- draft PR blocked card reassures no push, PR, or merge occurred +- browser console had no error/warning \ No newline at end of file diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/01-composed-and-started.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/01-composed-and-started.png new file mode 100644 index 0000000..19f1702 Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/01-composed-and-started.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/02-planning-progress.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/02-planning-progress.png new file mode 100644 index 0000000..3e715bf Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/02-planning-progress.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03a-clarify-popup-filled.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03a-clarify-popup-filled.png new file mode 100644 index 0000000..16a2d8b Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03a-clarify-popup-filled.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03b-clarify-answered-gate-guidance.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03b-clarify-answered-gate-guidance.png new file mode 100644 index 0000000..f417e66 Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03b-clarify-answered-gate-guidance.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03c-clarify-unlocked.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03c-clarify-unlocked.png new file mode 100644 index 0000000..3569565 Binary files /dev/null and 
b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03c-clarify-unlocked.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/04-roadmap-ready.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/04-roadmap-ready.png new file mode 100644 index 0000000..a9bd910 Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/04-roadmap-ready.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05a-approved.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05a-approved.png new file mode 100644 index 0000000..cff3c36 Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05a-approved.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05b-execution-evidence-gate.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05b-execution-evidence-gate.png new file mode 100644 index 0000000..8f3e754 Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05b-execution-evidence-gate.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/06-loop-summary.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/06-loop-summary.png new file mode 100644 index 0000000..580f27e Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/06-loop-summary.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/07-pr-gate-blocked-human.png b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/07-pr-gate-blocked-human.png new file mode 100644 index 0000000..ff9f2c1 Binary files /dev/null and b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/07-pr-gate-blocked-human.png differ diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/console-logs.json b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/console-logs.json new file mode 100644 
index 0000000..2936d7c --- /dev/null +++ b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/console-logs.json @@ -0,0 +1,3 @@ +[ + "error: Failed to load resource: the server responded with a status of 409 (Conflict)" +] \ No newline at end of file diff --git a/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/user-e2e-report.md b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/user-e2e-report.md new file mode 100644 index 0000000..445c33d --- /dev/null +++ b/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/user-e2e-report.md @@ -0,0 +1,79 @@ +# Operator Cockpit — User Journey E2E Report + +Result: **PASS** +Timestamp: 2026-06-11T08-05-02-983Z +Evidence dir: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z + +Harness: mock/template planner+worker, remote writes disabled, all external CLIs/APIs disabled, +temp stateDir, in-memory SQLite, vite dashboard, chromium via playwright. + +## Steps + +### step-1-compose-and-start — PASS + +Type a user prompt into the composer and start brainstorm +- composer testid: cockpit-goal-input · prompt: Make the onboarding flow friendlier for new users. I want it… +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/01-composed-and-started.png + +### step-2-visible-progress — PASS + +Planning shows visible progress; the UI never looks frozen +- status strip during planning: STAGE Brainstorm · 共创中 NOW Planner is thinking · Planner 正在分析 PROGRESS 0% — — APPROVALS 0 +- loop card during planning/clarify: type=understanding · active-agent=claude · next_step="回答下方的待确认问题,AI 才能继续生成方案 · Answer the questions below so the plan can continue." 
+- strip refreshed: "STAGE Brainstorm · 共创中 NOW Planner is thinking · Planner 正在分析 PROGRESS 0% — — APPROVALS 0" → "STAGE Decision · 做选择 NOW Review the questions, then generate the plan · 先确认问题,再生成方案 PROGRESS 0% — — APPROVALS 0" +- cockpit-last-activity refresh check is completed as soon as the mission overview exists (see step 4 notes) — the testid only renders once a mission is created. +- cockpit-last-activity refresh verified: "LAST ACTIVITY 0s ago" → "LAST ACTIVITY 1s ago" (1.7s apart) +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/02-planning-progress.png + +### step-3-clarifications — PASS + +Answer the clarification popup through the real UI controls +- clarification questions rendered: 2 +- answered transcript message visible; popup dismissed +- locked Generate Plan produced calm guidance, no raw gate code in visible text +- follow-up round confirmed confidence ≥95; plan unlocked +- loop card after clarification answers: type=understanding · active-agent=claude · next_step="稍等片刻,AI 正在确认理解,随后会给出方案 · Hang on — understanding is being confirmed; a plan comes next." +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03a-clarify-popup-filled.png +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03b-clarify-answered-gate-guidance.png +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/03c-clarify-unlocked.png + +### step-4-generate-roadmap — PASS + +Generate roadmap; PRD/roadmap artifacts exist and stage advances +- mission 01KTTVDD3GRHB3STCEQVD5QBFD created with 3 design artifacts (adr, prd, roadmap…) +- loop card at roadmap_ready: type=plan · active-agent=claude · next_step="审阅这份方案;你批准后才会开始动手 · Review this plan; work starts only after you approve it." 
+- card action on the plan card: approve-roadmap · "Approve Roadmap · 批准路线" +- cockpit-last-activity refresh verified: "LAST ACTIVITY 0s ago" → "LAST ACTIVITY 1s ago" +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/04-roadmap-ready.png + +### step-5-approve-and-execute — PASS + +Approve roadmap, start execution; execution state appears +- card action on the approved card: start-execution · "Start Execution · 启动执行" +- execution state appeared (stage=running) +- loop card during execution: type=progress · active-agent=codex · next_step="Wait for progress, pause, or stop if the run is wrong." +- worker runs recorded: 1; final stage=validators_missing +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05a-approved.png +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/05b-execution-evidence-gate.png + +### step-6-loop-summary — PASS + +cockpit-loop-summary renders with non-empty whyStoppedOrContinuing +- whyStoppedOrContinuing: 结果评审尚未配置 · result review not configured +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/06-loop-summary.png + +### step-7-draft-pr-gate — PASS + +Draft PR gate BLOCKED is calm human text; no raw codes visible +- machine code stays in data-* only: data-pr-gate-code=GEMINI_NOT_CONFIGURED +- calm safety phrasing visible (安全门 / no push, no PR, no merge reassurance) +- loop card at the Draft PR gate: type=blocker · active-agent=none · next_step="查看这张卡的说明,确认是否继续 · Read this card’s explanation and confirm whether to continue." 
+- no PR URL recorded; operator.draft_pr_blocked event present +- screenshot: /home/user/claude-code-247/evidence/browser-cockpit-user-e2e/2026-06-11T08-05-02-983Z/07-pr-gate-blocked-human.png + +## Browser console issues (informational) + +- error: Failed to load resource: the server responded with a status of 409 (Conflict) + +> Note: the deliberate locked Generate Plan probe in step 3 produces one expected 409 network log entry; +> the assertion is that the VISIBLE UI stays human (guidance text, no raw codes). diff --git a/evidence/fleet-soak/2026-06-11T07-35-14-553Z/metrics.json b/evidence/fleet-soak/2026-06-11T07-35-14-553Z/metrics.json new file mode 100644 index 0000000..5914baa --- /dev/null +++ b/evidence/fleet-soak/2026-06-11T07-35-14-553Z/metrics.json @@ -0,0 +1,57 @@ +{ + "overall": "PASS", + "soakMs": 1800000, + "intervalMs": 200, + "durationMs": 1800054, + "tasksSeeded": 1000, + "executions": 1000, + "executionsPerWorker": { + "w-alice-1": 246, + "w-bob-1": 245, + "w-carol-1": 251, + "w-dave-1": 253, + "w-eve-1": 5 + }, + "drill": { + "taskId": "01KTTSQ2M71BX6CQRMYNDH6KHF", + "workerId": "w-eve-1", + "verdict": { + "mismatch": true, + "workerId": "w-eve-1", + "mismatchedGates": [ + "test" + ] + }, + "freezeAtMs": 1781163330412 + }, + "idle": { + "idlePollsOk": true, + "idleWindowMs": 614, + "headlessAtIdleStart": 0, + "headlessAtIdleEnd": 0, + "headlessTotal": 0 + }, + "criteria": [ + { + "id": "provisioning", + "pass": true + }, + { + "id": "no-double-execution", + "pass": true + }, + { + "id": "forged-evidence-drill", + "pass": true + }, + { + "id": "idle-zero-credit", + "pass": true + }, + { + "id": "operator-attribution", + "pass": true + } + ], + "harnessError": null +} \ No newline at end of file diff --git a/evidence/fleet-soak/2026-06-11T07-35-14-553Z/soak-report.md b/evidence/fleet-soak/2026-06-11T07-35-14-553Z/soak-report.md new file mode 100644 index 0000000..3e424f8 --- /dev/null +++ 
b/evidence/fleet-soak/2026-06-11T07-35-14-553Z/soak-report.md @@ -0,0 +1,80 @@ +# Fleet Soak Report — v5-P4 (in-container short soak) + +Result: **PASS** +Timestamp: 2026-06-11T07-35-14-553Z +Duration: 1800s (AEDEV_SOAK_MS=1800000, intervalMs=200) +Evidence dir: /home/user/claude-code-247/evidence/fleet-soak/2026-06-11T07-35-14-553Z + +Harness: real daemon (createServer, :memory: SQLite, temp stateDir), remote writes disabled, +all external CLIs/APIs disabled, 5 real FleetWorkerAgent loops over real HTTP on 127.0.0.1, +simulated executors producing passing evidence, simulated CI landing for each completion. + +## Workers + +| worker | operator | tasks executed | final registry status | +|--------|----------|----------------|------------------------| +| w-alice-1 | alice | 246 | active | +| w-bob-1 | bob | 245 | active | +| w-carol-1 | carol | 251 | active | +| w-dave-1 | dave | 253 | active | +| w-eve-1 | eve | 5 | frozen (drill) | + +Tasks seeded: 1000 · executed: 1000 · drill task: 01KTTSQ2M71BX6CQRMYNDH6KHF + +## Criteria + +### provisioning — PASS + +5 workers (5 operators) registered with real ed25519 keypairs +- registered: 5/5 +- distinct public keys in registry: 5 + +### no-double-execution — PASS + +Claim-ledger uniqueness: every task executed exactly once across 5 workers +- tasks seeded: 1000 · executions: 1000 +- executed twice: 0 · never executed: 0 · with ≠1 claim event: 0 +- queue drained inside the soak: true (t+990s) +- w-alice-1 (alice): 246 tasks executed +- w-bob-1 (bob): 245 tasks executed +- w-carol-1 (carol): 251 tasks executed +- w-dave-1 (dave): 253 tasks executed +- w-eve-1 (eve): 5 tasks executed + +### forged-evidence-drill — PASS + +Drill: self-reported PASS vs simulated-CI FAIL → HOLD + freeze + later claims 403; other 4 keep working +- verdict: mismatch=true worker=w-eve-1 gates=[test] +- HOLD-EVIDENCE-MISMATCH on task 01KTTSQ2M71BX6CQRMYNDH6KHF: open +- fleet.worker_frozen events: 1 · registry status: frozen +- w-eve-1 results after 
freeze: 8647, of which 403 worker_frozen: 8647, completions: 0 +- w-alice-1 completions after the freeze: 242 +- w-bob-1 completions after the freeze: 241 +- w-carol-1 completions after the freeze: 246 +- w-dave-1 completions after the freeze: 248 + +### idle-zero-credit — PASS + +Idle ≥3 loop intervals after drain with ZERO cost.headless_call events +- active workers observed idling: w-alice-1, w-bob-1, w-carol-1, w-dave-1 +- w-alice-1: +3 idle polls in the measured window (614ms) +- w-bob-1: +3 idle polls in the measured window (614ms) +- w-carol-1: +3 idle polls in the measured window (614ms) +- w-dave-1: +3 idle polls in the measured window (614ms) +- cost.headless_call during idle window: 0 · entire soak: 0 + +### operator-attribution — PASS + +Per-operator event attribution: claims/evidence/lifecycle carry registry-bound operatorId + workerId +- executions with fully consistent attribution: 1000/1000 +- operator alice: 246 claim events vs 246 executions +- operator bob: 245 claim events vs 245 executions +- operator carol: 251 claim events vs 251 executions +- operator dave: 253 claim events vs 253 executions +- operator eve: 5 claim events vs 5 executions + +## Honesty note + +in-container short soak with simulated executors — validates the harness + protocol under +concurrency; the ≥1-week real-CLI soak on operator machines remains open (rubric #19 stays +unchecked until then). 
diff --git a/evidence/fleet-soak/soak-pending.json b/evidence/fleet-soak/soak-pending.json new file mode 100644 index 0000000..037e975 --- /dev/null +++ b/evidence/fleet-soak/soak-pending.json @@ -0,0 +1,5 @@ +{ + "started_at": "2026-06-11T07:35:13.342Z", + "expected_end": "2026-06-18T07:35:13.342Z", + "status": "completed" +} diff --git a/evidence/v6/real-proof/2026-06-11T08-23-02Z/draft-pr-url.txt b/evidence/v6/real-proof/2026-06-11T08-23-02Z/draft-pr-url.txt new file mode 100644 index 0000000..b4f173b --- /dev/null +++ b/evidence/v6/real-proof/2026-06-11T08-23-02Z/draft-pr-url.txt @@ -0,0 +1 @@ +https://github.com/CTlanston/hermus-agent/pull/4 (REAL, operator-produced; full cockpit chain still HOLD-PLANNER-AUTH) diff --git a/evidence/v6/real-proof/2026-06-11T08-23-02Z/run-summary.md b/evidence/v6/real-proof/2026-06-11T08-23-02Z/run-summary.md new file mode 100644 index 0000000..ba3ee67 --- /dev/null +++ b/evidence/v6/real-proof/2026-06-11T08-23-02Z/run-summary.md @@ -0,0 +1,17 @@ +# Overnight Phase 2 — V6-P3 honest conclusion (2026-06-11T08-23-02Z) + +## REAL(操作员已产出,里程碑) +- **真实 Draft PR 存在:https://github.com/CTlanston/hermus-agent/pull/4** —— 远程写双闸在真实世界被证明可开真 PR(操作员 Mac,gh+codex 真实凭证)。 +- test:cockpit:real-smoke PASS(操作员 Mac)。 + +## HOLD(完整链仍未闭合) +- HOLD-PLANNER-AUTH:操作员 Mac 的 claude -p 返回 401 → cockpit 全链(clarify→…→Gemini→PR)未能由 cockpit 端到端驱动;真实 Gemini 判词 artifact 仍缺。 +- 本容器无 codex/gh/hermus —— 无法代跑。 + +## 恢复路径(含今晚 Phase 1 的新修复) +1. Mac: git pull 本分支;`claude login` 重登(或检查 Agent SDK credit)。 +2. 若 claude 暂不可用:`export AEDEV_PLANNER_FALLBACK=codex`(诚实降级,事件记 codex-cli (fallback))。 +3. 
重跑 runbook mission → cockpit 端到端 → gemini-verdict.json + mission-events.jsonl 提交本目录。 + +## Classification +Real: hermus PR#4、real-smoke、30min soak 5/5、950 tests。Simulated: soak/E2E 引擎侧。Unproven: cockpit 端到端真链 + 真 Gemini 判词(本 HOLD 标的)。 diff --git a/packages/daemon/src/loop-cards.test.ts b/packages/daemon/src/loop-cards.test.ts index 1c12575..579701f 100644 --- a/packages/daemon/src/loop-cards.test.ts +++ b/packages/daemon/src/loop-cards.test.ts @@ -203,6 +203,21 @@ describe('BlockerCard — human wording only; raw codes live in machine', () => expect(card.human_explanation).toContain('等待你确认') }) + it('HOLD-PLANNER-AUTH blocker: recovery actions carry the exact one-line fixes, never raw 401/codes', () => { + const card = deriveLoopCard(makeInput('brainstorming', { + activeHolds: [{ code: 'HOLD-PLANNER-AUTH', reason: 'claude-cli auth failure (matched 401)' }], + })) + if (card.type !== 'blocker') throw new Error(`expected blocker, got ${card.type}`) + const joined = card.recovery_actions.join('\n') + expect(joined).toContain('claude login') + expect(joined).toContain('/status') + expect(joined).toContain('AEDEV_PLANNER_FALLBACK=codex') + expect(card.recommended_action).toContain('claude login') + expect(visibleText(card)).not.toMatch(VISIBLE_CODE) + expect(visibleText(card)).not.toContain('401') + expect(card.machine.hold_code).toBe('HOLD-PLANNER-AUTH') + }) + it('every blocker variant in the matrix keeps visible text code-free', () => { const variants: DeriveLoopCardInput[] = [ makeInput('failed'), diff --git a/packages/daemon/src/loop-cards.ts b/packages/daemon/src/loop-cards.ts index 1683fdb..e3ed4e2 100644 --- a/packages/daemon/src/loop-cards.ts +++ b/packages/daemon/src/loop-cards.ts @@ -147,6 +147,9 @@ function whyItMatters(code: string | null): string { if (code && code.startsWith('HOLD-REVIEW-LOOP')) { return '反复返工说明这条路走不通,继续自动重试只会浪费额度 · Repeated rework means this path is not converging; more automatic retries would only waste your allowance.' 
} + if (code && code.startsWith('HOLD-PLANNER-AUTH')) { + return '规划引擎的本地登录已失效,继续自动重试只会反复失败;先恢复登录是最快的恢复路径 · The planner’s local login is no longer valid; automatic retries would keep failing, so restoring the login is the fastest way back.' + } if (code && (code.startsWith('HOLD-SESSION-POOL') || code.startsWith('HOLD-TARGET-REPO'))) { return '在环境就绪之前动手只会产生假进度,系统选择诚实地等待 · Acting before the environment is ready would only fake progress; the system honestly waits instead.' } @@ -167,6 +170,13 @@ function recoveryActions(code: string | null): string[] { '把任务拆小或补充更明确的要求后重新开始 · Restart with a smaller task or clearer requirements.', ] } + if (code && code.startsWith('HOLD-PLANNER-AUTH')) { + return [ + '在终端运行 claude login 重新登录本地 Claude · Run `claude login` in a terminal to sign the local Claude CLI back in.', + '在 Claude CLI 里输入 /status 检查订阅额度 · Check subscription credit with /status inside the Claude CLI.', + '可选:设置 AEDEV_PLANNER_FALLBACK=codex 让本地 Codex 暂代规划(永不使用付费 API) · Optional: set AEDEV_PLANNER_FALLBACK=codex to let the local Codex CLI plan instead (never a paid API).', + ] + } if (code && (code.startsWith('HOLD-SESSION-POOL') || code.startsWith('HOLD-TARGET-REPO'))) { return [ '检查本地 AI 引擎和目标仓库是否就绪 · Check that the local AI engine and the target repository are ready.', diff --git a/packages/daemon/src/merge-policy-v6.test.ts b/packages/daemon/src/merge-policy-v6.test.ts new file mode 100644 index 0000000..f68ee40 --- /dev/null +++ b/packages/daemon/src/merge-policy-v6.test.ts @@ -0,0 +1,141 @@ +/** Overnight P4 — full matrix tests for the v6 merge-action policy. + * + * The policy is a PURE DECISION FUNCTION (docs/product/MERGE_POLICY.md). + * It is intentionally NOT wired into any merge execution path this cycle: + * WORKBOOK_v6 GR#10 (human merge only) keeps auto-merge DISABLED, and the + * exhaustive test below pins that `autoMergeEnabled=false` can NEVER yield + * `auto_merge_eligible` — not even for a perfect docs-only change. 
+ */ +import { describe, it, expect } from 'vitest' +import { decideMergeAction, type MergeActionInput } from './merge-policy-v6.js' + +const CHANGE_KINDS = ['docs_only', 'code', 'security', 'workflow', 'dependency', 'system_config'] as const +const SENSITIVE_KINDS = ['security', 'workflow', 'dependency', 'system_config'] as const +const GEMINI_VERDICTS = ['pass', 'fail', 'inconclusive', 'not_configured'] as const +const CLAUDE_REVIEWS = ['approve', 'rework', 'none'] as const +const RISK_LEVELS = ['low', 'medium', 'high'] as const +const BOOLS = [true, false] as const + +/** A docs-only change with every gate green — the strongest possible candidate. */ +function perfectDocsOnly(overrides: Partial = {}): MergeActionInput { + return { + changeKind: 'docs_only', + testsGreen: true, + geminiVerdict: 'pass', + claudeReview: 'approve', + riskLevel: 'low', + explicitApproval: false, + autoMergeEnabled: false, + ...overrides, + } +} + +function* allInputs(fixed: Partial = {}): Generator { + for (const changeKind of CHANGE_KINDS) + for (const testsGreen of BOOLS) + for (const geminiVerdict of GEMINI_VERDICTS) + for (const claudeReview of CLAUDE_REVIEWS) + for (const riskLevel of RISK_LEVELS) + for (const explicitApproval of BOOLS) + for (const autoMergeEnabled of BOOLS) + yield { changeKind, testsGreen, geminiVerdict, claudeReview, riskLevel, explicitApproval, autoMergeEnabled, ...fixed } +} + +describe('decideMergeAction — GR#10 default (auto-merge disabled this cycle)', () => { + it('NEVER returns auto_merge_eligible when autoMergeEnabled=false — exhaustive over the whole matrix', () => { + let combos = 0 + for (const input of allInputs({ autoMergeEnabled: false })) { + combos += 1 + expect(decideMergeAction(input).action).not.toBe('auto_merge_eligible') + } + expect(combos).toBeGreaterThan(0) + }) + + it('a perfect docs-only change still stops at draft_pr_only, citing GR#10', () => { + const decision = decideMergeAction(perfectDocsOnly()) + 
expect(decision.action).toBe('draft_pr_only') + expect(decision.reason).toContain('GR#10') + }) +}) + +describe('decideMergeAction — validator and test gates', () => { + it('gemini fail → no_pr (failed work never becomes a PR)', () => { + for (const changeKind of ['docs_only', 'code'] as const) { + const decision = decideMergeAction(perfectDocsOnly({ changeKind, geminiVerdict: 'fail' })) + expect(decision.action).toBe('no_pr') + } + }) + + it('red tests → no_pr (failing work never becomes a PR)', () => { + for (const changeKind of ['docs_only', 'code'] as const) { + const decision = decideMergeAction(perfectDocsOnly({ changeKind, testsGreen: false })) + expect(decision.action).toBe('no_pr') + } + }) + + it('gemini inconclusive → hold (a human decides, the system never guesses)', () => { + for (const changeKind of ['docs_only', 'code'] as const) { + expect(decideMergeAction(perfectDocsOnly({ changeKind, geminiVerdict: 'inconclusive' })).action).toBe('hold') + } + }) + + it('gemini not_configured → hold (no validator evidence, no machine exit)', () => { + for (const changeKind of ['docs_only', 'code'] as const) { + expect(decideMergeAction(perfectDocsOnly({ changeKind, geminiVerdict: 'not_configured' })).action).toBe('hold') + } + }) +}) + +describe('decideMergeAction — sensitive change kinds hold regardless', () => { + it('security / workflow / dependency / system_config → hold for EVERY other combination', () => { + for (const changeKind of SENSITIVE_KINDS) { + for (const input of allInputs({ changeKind })) { + const decision = decideMergeAction(input) + expect(decision.action).toBe('hold') + expect(decision.reason).toContain(changeKind) + } + } + }) +}) + +describe('decideMergeAction — future-cycle eligibility (autoMergeEnabled=true)', () => { + it('perfect docs-only + enabled → auto_merge_eligible', () => { + expect(decideMergeAction(perfectDocsOnly({ autoMergeEnabled: true })).action).toBe('auto_merge_eligible') + }) + + it('code with all gates green but NO 
explicit approval → draft_pr_only', () => { + const decision = decideMergeAction(perfectDocsOnly({ changeKind: 'code', autoMergeEnabled: true })) + expect(decision.action).toBe('draft_pr_only') + }) + + it('code with explicit approval + all gates green + enabled → auto_merge_eligible', () => { + const decision = decideMergeAction(perfectDocsOnly({ changeKind: 'code', explicitApproval: true, autoMergeEnabled: true })) + expect(decision.action).toBe('auto_merge_eligible') + }) + + it('docs-only above low risk is never auto-merge eligible: medium → draft_pr_only, high → hold', () => { + expect(decideMergeAction(perfectDocsOnly({ riskLevel: 'medium', autoMergeEnabled: true })).action).toBe('draft_pr_only') + expect(decideMergeAction(perfectDocsOnly({ riskLevel: 'high', autoMergeEnabled: true })).action).toBe('hold') + }) + + it('claude review rework or none → draft_pr_only even with everything else green', () => { + for (const claudeReview of ['rework', 'none'] as const) { + expect(decideMergeAction(perfectDocsOnly({ claudeReview, autoMergeEnabled: true })).action).toBe('draft_pr_only') + } + }) +}) + +describe('decideMergeAction — output contract', () => { + it('every decision in the full matrix carries a non-empty reason', () => { + for (const input of allInputs()) { + const decision = decideMergeAction(input) + expect(decision.reason.length).toBeGreaterThan(0) + expect(CLAUDE_REVIEWS.length).toBeGreaterThan(0) // matrix sanity + } + }) + + it('high risk that survives the validator gates → hold (never a silent draft PR)', () => { + const decision = decideMergeAction(perfectDocsOnly({ changeKind: 'code', riskLevel: 'high', explicitApproval: true, autoMergeEnabled: true })) + expect(decision.action).toBe('hold') + }) +}) diff --git a/packages/daemon/src/merge-policy-v6.ts b/packages/daemon/src/merge-policy-v6.ts new file mode 100644 index 0000000..01e07a3 --- /dev/null +++ b/packages/daemon/src/merge-policy-v6.ts @@ -0,0 +1,124 @@ +/** WORKBOOK_v6 overnight P4 — the 
mature-product merge-action policy as a + * PURE DECISION FUNCTION. + * + * Policy doc: docs/product/MERGE_POLICY.md (reconciles docs/AUTO_MERGE_POLICY.md, + * the legacy v2.x risk-score policy behind @aedev/validators' MergePolicy). + * + * HARD SCOPE NOTE (GR#10 — human merge only): auto-merge is DISABLED this + * cycle. This module is exported for tests and future wiring ONLY; it is + * intentionally NOT imported by any merge execution path, and callers in this + * cycle must pass `autoMergeEnabled: false`, under which the function can + * never return `auto_merge_eligible` — the strongest outcome is a draft PR + * that a human merges. + */ + +export type V6ChangeKind = 'docs_only' | 'code' | 'security' | 'workflow' | 'dependency' | 'system_config' + +export interface MergeActionInput { + changeKind: V6ChangeKind + /** True only when the repo's test gate is green for this change. */ + testsGreen: boolean + /** Evidence-only Gemini hard gate (GR#9: Gemini is the final judge). */ + geminiVerdict: 'pass' | 'fail' | 'inconclusive' | 'not_configured' + /** In-process Claude cross-engine review verdict ('none' = no review ran). */ + claudeReview: 'approve' | 'rework' | 'none' + riskLevel: 'low' | 'medium' | 'high' + /** Operator explicitly approved merging this specific change. */ + explicitApproval: boolean + /** GR#10: must be false this cycle — auto-merge is disabled product-wide. */ + autoMergeEnabled: boolean +} + +export interface MergeActionDecision { + action: 'auto_merge_eligible' | 'draft_pr_only' | 'hold' | 'no_pr' + reason: string +} + +const SENSITIVE_KINDS: ReadonlySet = new Set(['security', 'workflow', 'dependency', 'system_config']) + +/** + * Decide the merge action for a completed, validated change. + * + * Decision order (first match wins — mirrors docs/product/MERGE_POLICY.md): + * + * 1. security / workflow / dependency / system_config → `hold`, regardless of + * validators, approval, or risk. These lanes always need a human. + * 2. 
Gemini FAIL → `no_pr`. Failed work never becomes a PR. + * 3. Tests red → `no_pr`. Same rule: broken work never becomes a PR. + * 4. Gemini inconclusive / not configured → `hold`. No verdict means a human + * decides; the system never guesses an exit. + * 5. High risk → `hold`. + * 6. Eligibility: docs-only + low risk + Claude review approve, OR a code + * change with Claude review approve AND explicit operator approval. + * Eligible + `autoMergeEnabled=true` (a FUTURE cycle, after a written + * GR#10 change) → `auto_merge_eligible`; eligible but disabled (this + * cycle's only legal state) → `draft_pr_only` citing GR#10. + * 7. Everything else that survived the gates → `draft_pr_only`. + */ +export function decideMergeAction(input: MergeActionInput): MergeActionDecision { + const { changeKind, testsGreen, geminiVerdict, claudeReview, riskLevel, explicitApproval, autoMergeEnabled } = input + + // 1. Sensitive lanes hold regardless of everything else. + if (SENSITIVE_KINDS.has(changeKind)) { + return { + action: 'hold', + reason: `${changeKind} change always holds for human review, regardless of validators, risk, or approval`, + } + } + + // 2. Failed validator → no PR at all. + if (geminiVerdict === 'fail') { + return { action: 'no_pr', reason: 'Gemini validator failed the evidence — failed work never becomes a PR' } + } + + // 3. Red tests → no PR at all. + if (!testsGreen) { + return { action: 'no_pr', reason: 'tests are red — broken work never becomes a PR' } + } + + // 4. No usable validator verdict → a human decides. + if (geminiVerdict === 'inconclusive' || geminiVerdict === 'not_configured') { + return { + action: 'hold', + reason: `Gemini verdict is ${geminiVerdict} — hold for a human decision instead of guessing a machine exit`, + } + } + + // 5. High risk → a human decides. + if (riskLevel === 'high') { + return { action: 'hold', reason: 'high-risk change — hold for human review' } + } + + // 6. Eligibility for (future-cycle) auto-merge. 
+ const reviewApproved = claudeReview === 'approve' + const docsEligible = changeKind === 'docs_only' && riskLevel === 'low' && reviewApproved + const codeEligible = changeKind === 'code' && reviewApproved && explicitApproval + if (docsEligible || codeEligible) { + if (!autoMergeEnabled) { + return { + action: 'draft_pr_only', + reason: 'eligible on every gate, but auto-merge is DISABLED this cycle (WORKBOOK_v6 GR#10: human merge only) — the machine exit stops at a draft PR', + } + } + return { + action: 'auto_merge_eligible', + reason: docsEligible + ? 'docs-only low-risk change with green tests, Gemini PASS, and Claude review PASS' + : 'code change with green gates and explicit operator approval', + } + } + + // 7. Survived the gates but not auto-merge eligible → draft PR for a human. + if (!reviewApproved) { + return { + action: 'draft_pr_only', + reason: claudeReview === 'rework' + ? 'Claude review demands rework — draft PR only, for human triage' + : 'no Claude review verdict recorded — draft PR only, a human reviews and merges', + } + } + if (changeKind === 'code') { + return { action: 'draft_pr_only', reason: 'code change without explicit operator approval — draft PR only, a human merges' } + } + return { action: 'draft_pr_only', reason: 'docs-only change above low risk — draft PR only, a human merges' } +} diff --git a/packages/daemon/src/mission-runner.test.ts b/packages/daemon/src/mission-runner.test.ts index 780f2ed..fda97a9 100644 --- a/packages/daemon/src/mission-runner.test.ts +++ b/packages/daemon/src/mission-runner.test.ts @@ -6,6 +6,7 @@ import { AedevDb, type Mission, type RunResult, type Task, type ValidatorResult import { IntakeService } from './intake.js' import { MemoryGitClient, MissionRunner, type MissionValidator } from './mission-runner.js' import { DraftPrGateError, type DraftPrRequest, type DraftPrInfo } from './draft-pr-gate.js' +import { RUN_SUMMARY_SECTIONS } from './run-summary.js' import { ReleasePipeline, type DeployFn } from 
'./release-pipeline.js' import type { RolePipeline } from './roles/role-pipeline.js' import type { WorkerSession } from '@aedev/runner' @@ -805,3 +806,83 @@ describe('MissionRunner — P2 cross-engine review loop', () => { expect(db.queryEvents({ type: 'review.requested', entityId: mission.id })).toHaveLength(0) }) }) + +describe('MissionRunner — P5 run-summary.md evidence audit artifact', () => { + function auditTaskEvidence(changedPaths: string[], branch: string): string { + const dir = join(stateDir, `audit-evidence-${Math.random().toString(16).slice(2)}`) + mkdirSync(dir, { recursive: true }) + writeFileSync(join(dir, 'diff-summary.md'), '# Diff Summary\n\nChanged files.\n') + writeFileSync(join(dir, 'test-summary.md'), '# Test Summary\n\nTests passed.\n') + writeFileSync(join(dir, 'done-report.md'), '# Done\n\nDone.\n') + writeFileSync(join(dir, 'changed-paths.json'), JSON.stringify({ changedPaths, forbiddenHits: [] })) + writeFileSync(join(dir, 'local-commit.json'), JSON.stringify({ attempted: true, created: true, sha: 'abc123', branch })) + return dir + } + + it('writes run-summary.md with every required audit section on the happy path', async () => { + const mission = approveMission('Refactor the auth utility (no UI)') + const runner = new MissionRunner(db, { + stateDir, + rolePipeline: fakeRolePipeline(), + runner: fakeRunner({ taskEvidenceDir: auditTaskEvidence(['README.md', 'src/foo.ts'], 'v6/p5-audit') }), + validators: [fakeValidator('gemini', 'pass'), fakeValidator('openai', 'pass')], + requiresUi: false, + }) + + const result = await runner.runMission(mission.id) + expect(result.status).toBe('done') + + const path = join(result.evidenceDir, 'run-summary.md') + expect(existsSync(path)).toBe(true) + const summary = readFileSync(path, 'utf8') + for (const section of RUN_SUMMARY_SECTIONS) expect(summary).toContain(section) + expect(summary).toContain('**Decision:** AUTO_MERGE') + expect(summary).toContain('- gemini: pass') + expect(summary).toContain('- 
openai: pass') + expect(summary).toContain('- README.md') + expect(summary).toContain('- src/foo.ts') + // honesty: the injected fake runner never touched a real worktree (mock + // provider) — the artifact must say simulated even with commit metadata. + expect(summary).toContain('Classification: simulated') + }) + + it('writes run-summary.md on a held path and records the hold honestly', async () => { + const mission = approveMission('Implement a safe hold path') + const runner = new MissionRunner(db, { + stateDir, + rolePipeline: fakeRolePipeline(), + workerSessions: [{ id: 'codex-1', provider: 'codex-cli', family: 'openai', healthy: false, active: 0 }], + }) + + const result = await runner.runMission(mission.id) + expect(result.status).toBe('waiting') + + const path = join(result.evidenceDir, 'run-summary.md') + expect(existsSync(path)).toBe(true) + const summary = readFileSync(path, 'utf8') + for (const section of RUN_SUMMARY_SECTIONS) expect(summary).toContain(section) + expect(summary).toContain('**Status:** waiting') + expect(summary).toContain('- HOLD-SESSION-POOL:') + expect(summary).toContain('(absent — no validator verdict recorded)') + expect(summary).not.toContain('- gemini: pass') + }) + + it('never fabricates a verdict — absent validator/reviewer inputs render as absent', async () => { + const mission = approveMission('Run without validators or reviewer') + const runner = new MissionRunner(db, { + stateDir, + rolePipeline: fakeRolePipeline(), + runner: fakeRunner({ taskEvidenceDir: makeTaskEvidence() }), + requiresUi: false, + }) + + const result = await runner.runMission(mission.id) + expect(result.status).toBe('waiting') + + const summary = readFileSync(join(result.evidenceDir, 'run-summary.md'), 'utf8') + expect(summary).toContain('(absent — no validator verdict recorded)') + expect(summary).toContain('(absent — no reviewer verdict recorded)') + expect(summary).toContain('(absent — runner produced no changed-paths.json)') + 
expect(summary).not.toContain(': pass') + }) +}) diff --git a/packages/daemon/src/mission-runner.ts b/packages/daemon/src/mission-runner.ts index bf40a94..b8673b5 100644 --- a/packages/daemon/src/mission-runner.ts +++ b/packages/daemon/src/mission-runner.ts @@ -21,6 +21,8 @@ import { } from './claude-reviewer.js' import { ReleasePipeline, type GitClient, type DeployFn, type DeployRequest, type ReleaseResult } from './release-pipeline.js' import { DraftPrGateError, type DraftPrRequest, type DraftPrInfo } from './draft-pr-gate.js' +import { HEADLESS_CALL_EVENT } from './headless-budget-guard.js' +import { listEvidenceArtifacts, writeRunSummary, type RunSummaryInput } from './run-summary.js' /** * Mission validator contract. Production wires real Gemini + OpenAI; tests @@ -173,6 +175,12 @@ export class MissionRunner { writeFileSync(join(evidenceDir, 'dag-summary.md'), renderDagSummary(dag, 'failed')) this.db.updateMissionStatus(mission.id, 'failed') this.db.insertEvent('mission.dag_failed', 'mission', mission.id, { taskId: task.id, runId: run.runId, nodeCount: dag.length }) + this.emitRunSummary(mission, evidenceDir, { + status: 'failed', + summary: `DAG execution failed at task ${task.id} (run ${run.runId}, exit ${run.exitCode}) after ${dag.length} planned node(s).`, + mergeDecision: 'BLOCKED', + ...(routeDecision?.provider ? { provider: routeDecision.provider } : {}), + }) return { missionId: mission.id, taskId: task.id, @@ -232,6 +240,7 @@ export class MissionRunner { // (diff + PRD + logs, never the coder conversation). `rework` sends the // coder through capped repair rounds; over the cap or a blocked/ // unstructured review → held mission, never a silent approve (GR#7). 
+ let lastReview: { verdict: ReviewVerdict; cycle: number } | undefined if (this.opts.reviewer) { const maxCycles = maxReviewCyclesFromEnv() let reworks = 0 @@ -250,6 +259,7 @@ export class MissionRunner { } throw e } + lastReview = { verdict, cycle } writeFileSync(join(evidenceDir, `claude-review-${cycle}.json`), JSON.stringify({ cycle, ...verdict }, null, 2)) this.db.insertEvent('review.verdict', 'mission', mission.id, { cycle, taskId: task.id, verdict: verdict.verdict, findings: verdict.findings, confidence: verdict.confidence, @@ -288,6 +298,7 @@ export class MissionRunner { // evidence; we gate on those paths, not on regexing evidence prose. Feeds // both risk scoring and the merge gate's hard BLOCK. const changedPaths = readChangedPaths(evidenceDir) + const changedPathsFilePresent = existsSync(join(evidenceDir, 'changed-paths.json')) const repoForGate = this.db.getRepo(mission.repoId) const forbiddenPatterns = repoForGate?.forbiddenPaths?.length ? repoForGate.forbiddenPaths : DEFAULT_FORBIDDEN_PATHS const forbiddenPathTouched = changedPaths.some((p) => forbiddenPatterns.some((pat) => forbiddenMatch(pat, p))) @@ -396,6 +407,7 @@ export class MissionRunner { // we attempt only when the coder produced a real branch + changed files. const localCommit = readLocalCommit(evidenceDir) let draftPr: DraftPrInfo | undefined + let draftPrGateOutcome: string | undefined if (decision === 'AUTO_MERGE' && this.opts.draftPrExecutor && repoForGate && localCommit?.branch && changedPaths.length > 0) { const worktreeBaseDir = this.opts.runnerConfig?.worktreeBaseDir ?? join(this.opts.stateDir, 'worktrees') const request: DraftPrRequest = { @@ -420,6 +432,7 @@ export class MissionRunner { }) } catch (e) { const code = e instanceof DraftPrGateError ? 
e.code : 'DRAFT_PR_BLOCKED' + draftPrGateOutcome = `${code}: ${(e as Error).message}` this.db.insertEvent('mission.draft_pr_blocked', 'mission', mission.id, { code, reason: (e as Error).message, branch: localCommit.branch, }) @@ -452,6 +465,29 @@ export class MissionRunner { mission, run, risk, validatorResults, decision, browserQAResult, releaseResult, status: summaryStatus, })) + // 9b. P5 evidence audit — run-summary.md, the one-stop audit artifact. + // Built only from data already in scope; absent inputs render as absent. + this.emitRunSummary(mission, evidenceDir, { + status: summaryStatus, + summary: `Run ${run.runId} exited ${run.exitCode}; merge decision ${decision}; risk ${risk.score} (${risk.level}).`, + ...(changedPathsFilePresent ? { changedPaths } : {}), + ...(bundle['diff-summary.md'] !== undefined ? { diffSummary: bundle['diff-summary.md'] } : {}), + validatorResults: validatorResults.map((v) => ({ + validator: v.validator, + verdict: v.verdict, + ...(v.summary ? { summary: v.summary } : {}), + })), + ...(lastReview !== undefined + ? { reviewerVerdict: { verdict: lastReview.verdict.verdict, cycle: lastReview.cycle, findings: lastReview.verdict.findings } } + : {}), + mergeDecision: decision, + ...(draftPr !== undefined + ? { prUrl: draftPr.url } + : { prGateOutcome: draftPrGateOutcome ?? `no draft PR attempted (decision ${decision})` }), + ...(routeDecision.provider ? { provider: routeDecision.provider } : {}), + ...(localCommit?.sha !== undefined ? { localCommitSha: localCommit.sha } : {}), + }) + // 10. State machine — but never clobber an operator cancel (Stop fence). const dbStatus = summaryStatus === 'done' ? 'done' : summaryStatus === 'failed' ? 
'failed' : 'paused' if (!skipFinalWriteIfCancelled(this.db, mission.id, 'mission-runner.complete', dbStatus)) { @@ -493,10 +529,38 @@ export class MissionRunner { } this.db.insertEvent('mission.run_failed', 'mission', mission.id, { error: (e as Error).message }) writeFileSync(join(evidenceDir, 'run-error.txt'), `${(e as Error).message}\n`) + this.emitRunSummary(mission, evidenceDir, { + status: 'failed', + summary: `Mission run threw: ${(e as Error).message}`, + }) throw e } } + /** P5 evidence audit: write run-summary.md from data already in scope. + * Cost/headless tallies come straight from the event store (GR#5) and the + * artifact list from the evidence dir; the audit write itself must never + * take down a mission run. */ + private emitRunSummary( + mission: Mission, + evidenceDir: string, + seed: Partial & Pick, + ): void { + try { + writeRunSummary(evidenceDir, { + missionId: mission.id, + missionTitle: mission.title, + costEvents: this.db.queryEvents({ type: 'model.usage.recorded', entityId: mission.id }).length, + headlessCalls: this.db.queryEvents({ type: HEADLESS_CALL_EVENT, entityId: mission.id }).length, + holds: this.db.listHolds(mission.id).map((h) => ({ code: h.code, reason: h.reason })), + ...seed, + artifacts: listEvidenceArtifacts(evidenceDir), + }) + } catch { + /* never let the audit artifact break the run */ + } + } + private async runDagNodes( mission: Mission, evidenceDir: string, @@ -671,6 +735,13 @@ export class MissionRunner { holdCode: params.holdCode, reason: params.reason, }) + this.emitRunSummary(params.mission, params.evidenceDir, { + status: 'waiting', + summary: `Mission held (${params.holdCode}): ${params.reason}`, + mergeDecision: 'WAITING', + holds: [{ code: params.holdCode, reason: params.reason }], + ...(params.routeDecision?.provider ? 
{ provider: params.routeDecision.provider } : {}), + }) return { missionId: params.mission.id, taskId: params.task.id, diff --git a/packages/daemon/src/operator-planner-fallback.test.ts b/packages/daemon/src/operator-planner-fallback.test.ts new file mode 100644 index 0000000..547ef10 --- /dev/null +++ b/packages/daemon/src/operator-planner-fallback.test.ts @@ -0,0 +1,212 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest' +import { AedevDb } from '@aedev/core' +import type { ClaudeRunResult, CodexRunOptions, CodexRunResult } from '@aedev/runner' +import { runLocalPlannerText, runPlannerMissionDesign, type PlannerAdapterDeps } from './routes/operator.js' +import { HEADLESS_CALL_EVENT } from './headless-budget-guard.js' + +// overnight-p1 — honest planner auth failure + opt-in codex fallback. +// +// Real failure being fixed: `claude -p` → 401 on the operator's Mac, and the +// cockpit raised a confusing generic HOLD-PLANNER-CLI. Contract under test: +// 1. auth-looking failures emit HOLD-PLANNER-AUTH with the matched hint; +// 2. AEDEV_PLANNER_FALLBACK=codex (explicit opt-in ONLY) retries ONCE via the +// local codex CLI in read-only exec mode, recorded honestly as +// `planner_provider: codex-cli (fallback)` — never pretending it was claude; +// 3. fallback unset → codex is NEVER attempted; no template is substituted; +// 4. there is NO paid-API fallback of any kind. +// +// All adapters here are fakes (GR#8): no real CLI is ever spawned. 
+ +const ENV_KEYS = [ + 'AEDEV_PLANNER_FALLBACK', + 'AEDEV_COCKPIT_PLANNER_PROVIDER', + 'AEDEV_COCKPIT_PLANNER_FIXTURE_JSON', + 'AEDEV_COCKPIT_FORCE_TEMPLATE', + 'AEDEV_BUDGET_MAX_HEADLESS_PER_MISSION', + 'AEDEV_BUDGET_MAX_HEADLESS_PER_DAY', +] as const +const saved: Record = {} + +beforeEach(() => { + for (const k of ENV_KEYS) { saved[k] = process.env[k]; delete process.env[k] } +}) + +afterEach(() => { + for (const k of ENV_KEYS) { + if (saved[k] === undefined) delete process.env[k] + else process.env[k] = saved[k] + } +}) + +const CLAUDE_401_STDERR = + 'API Error: 401 {"type":"error","error":{"type":"authentication_error","message":"OAuth token has expired. Please run /login."}}' + +function claudeResult(over: Partial = {}): ClaudeRunResult { + return { + transcript: '', + exitCode: 1, + durationMs: 25, + authMode: 'local_claude_code', + costUsd: null, + inputTokens: 0, + outputTokens: 0, + rawJson: {}, + error: CLAUDE_401_STDERR, + ...over, + } +} + +function codexResult(over: Partial = {}): CodexRunResult { + return { + transcript: 'codex says hi', + exitCode: 0, + durationMs: 30, + authMode: 'local_codex', + costUsd: null, + inputTokens: 5, + outputTokens: 9, + rawJson: {}, + ...over, + } +} + +function fakeClaude(result: ClaudeRunResult): NonNullable { + return { isAvailable: async () => true, run: async () => result } +} + +interface CodexCall { prompt: string; workdir: string; options: CodexRunOptions } + +function fakeCodex(result: CodexRunResult): { adapter: NonNullable; calls: CodexCall[] } { + const calls: CodexCall[] = [] + return { + calls, + adapter: { + isAvailable: async () => true, + run: async (prompt: string, workdir: string, options: CodexRunOptions = {}) => { + calls.push({ prompt, workdir, options }) + return result + }, + }, + } +} + +describe('runLocalPlannerText — distinct HOLD-PLANNER-AUTH on claude auth failure', () => { + it('claude 401 with NO fallback env → HOLD-PLANNER-AUTH (not generic), matched hint in reason, no codex attempt, no 
template', async () => { + const codex = fakeCodex(codexResult()) + const out = await runLocalPlannerText('sys', 'plan it', 'planner', 'add dark mode', undefined, { + claude: fakeClaude(claudeResult()), + codex: codex.adapter, + }) + expect(out.event['holdCode']).toBe('HOLD-PLANNER-AUTH') + const failures = (out.event['failures'] as string[]).join('; ') + expect(failures).toContain("matched '401'") + expect(failures).toContain('401') + // Regression (template never impersonates): real mode substitutes NO synthetic brainstorm. + expect(out.content).not.toContain('Initial brainstorm:') + expect(out.content).toContain('HOLD-PLANNER-AUTH') + expect(out.content).toContain('claude login') + expect(out.content).toContain('AEDEV_PLANNER_FALLBACK=codex') + // Fallback unset → codex must never be attempted. + expect(codex.calls).toHaveLength(0) + }) + + it('a non-auth claude failure stays HOLD-PLANNER-CLI', async () => { + const out = await runLocalPlannerText('sys', 'plan it', 'planner', 'goal', undefined, { + claude: fakeClaude(claudeResult({ exitCode: -1, error: 'spawn failed: ENOENT' })), + }) + expect(out.event['holdCode']).toBe('HOLD-PLANNER-CLI') + }) +}) + +describe('runLocalPlannerText — AEDEV_PLANNER_FALLBACK=codex (explicit opt-in)', () => { + it('claude 401 + fallback env → ONE codex retry in read-only exec mode, recorded as codex-cli (fallback)', async () => { + process.env['AEDEV_PLANNER_FALLBACK'] = 'codex' + const db = new AedevDb(':memory:') + const codex = fakeCodex(codexResult({ transcript: 'Brainstorm via codex\n\n1. option A' })) + const out = await runLocalPlannerText('sys prompt', 'plan it', 'planner', 'goal', { db, sessionId: 's1' }, { + claude: fakeClaude(claudeResult()), + codex: codex.adapter, + }) + // Honest output: content from codex, provider recorded as the fallback — never claude. 
+ expect(out.content).toContain('Brainstorm via codex') + expect(out.event['holdCode']).toBeUndefined() + expect(out.event['provider']).toBe('codex-cli') + expect(out.event['planner_provider']).toBe('codex-cli (fallback)') + expect(out.event['fallbackFrom']).toBe('claude-cli') + expect(out.event['authMode']).toBe('local_codex') + // Exactly one retry, mirroring the probe contract: read-only sandbox, never-approve. + expect(codex.calls).toHaveLength(1) + expect(codex.calls[0]!.options.sandbox).toBe('read-only') + expect(codex.calls[0]!.options.approvalPolicy).toBe('never') + expect(codex.calls[0]!.prompt).toContain('sys prompt') + expect(codex.calls[0]!.prompt).toContain('plan it') + // The metered headless call is recorded with provider codex-cli (role unchanged). + const events = db.queryEvents({ type: HEADLESS_CALL_EVENT }) + const codexCall = events.find((e) => e.payload['provider'] === 'codex-cli') + expect(codexCall).toBeDefined() + expect(codexCall!.payload['role']).toBe('planner') + }) + + it('claude 401 + fallback env but codex also fails → HOLD-PLANNER-AUTH with both failures', async () => { + process.env['AEDEV_PLANNER_FALLBACK'] = 'codex' + const codex = fakeCodex(codexResult({ exitCode: 1, transcript: '', error: 'stream error: unexpected status 401' })) + const out = await runLocalPlannerText('sys', 'plan it', 'planner', 'goal', undefined, { + claude: fakeClaude(claudeResult()), + codex: codex.adapter, + }) + expect(codex.calls).toHaveLength(1) + expect(out.event['holdCode']).toBe('HOLD-PLANNER-AUTH') + const failures = (out.event['failures'] as string[]).join('; ') + expect(failures).toContain('codex-cli (fallback)') + }) + + it('claude healthy → codex is never consulted even with the fallback env set', async () => { + process.env['AEDEV_PLANNER_FALLBACK'] = 'codex' + const codex = fakeCodex(codexResult()) + const out = await runLocalPlannerText('sys', 'plan it', 'planner', 'goal', undefined, { + claude: fakeClaude(claudeResult({ exitCode: 0, 
transcript: 'real claude plan', error: undefined })), + codex: codex.adapter, + }) + expect(out.event['provider']).toBe('claude-cli') + expect(out.event['planner_provider']).toBeUndefined() + expect(codex.calls).toHaveLength(0) + }) +}) + +describe('runPlannerMissionDesign — auth hold + opt-in fallback (real mode)', () => { + beforeEach(() => { + // pnpm test forces the deterministic template; these tests pin the REAL path. + process.env['AEDEV_COCKPIT_FORCE_TEMPLATE'] = '0' + }) + + it('claude 401 with NO fallback env → ok:false with holdCode HOLD-PLANNER-AUTH and the 401 reason', async () => { + const codex = fakeCodex(codexResult()) + const out = await runPlannerMissionDesign('add dark mode', 'Dark mode', 'r1', 'm1', '', undefined, { + claude: fakeClaude(claudeResult()), + codex: codex.adapter, + }) + expect(out.ok).toBe(false) + if (out.ok) throw new Error('expected failure') + expect(out.holdCode).toBe('HOLD-PLANNER-AUTH') + expect(out.reason).toContain('401') + expect(codex.calls).toHaveLength(0) + }) + + it('claude 401 + AEDEV_PLANNER_FALLBACK=codex → design parsed from the codex fenced-JSON contract, provider honest', async () => { + process.env['AEDEV_PLANNER_FALLBACK'] = 'codex' + const codex = fakeCodex(codexResult({ + transcript: 'Here is the design:\n```json\n{"missionId":"m1","title":"Dark mode"}\n```\n', + })) + const out = await runPlannerMissionDesign('add dark mode', 'Dark mode', 'r1', 'm1', '', undefined, { + claude: fakeClaude(claudeResult()), + codex: codex.adapter, + }) + expect(out.ok).toBe(true) + if (!out.ok) throw new Error('expected success') + expect(out.design).toEqual({ missionId: 'm1', title: 'Dark mode' }) + expect(out.provider).toBe('codex-cli') + expect(out.plannerProvider).toBe('codex-cli (fallback)') + expect(codex.calls).toHaveLength(1) + expect(codex.calls[0]!.options.sandbox).toBe('read-only') + }) +}) diff --git a/packages/daemon/src/planner-auth.test.ts b/packages/daemon/src/planner-auth.test.ts new file mode 100644 index 
0000000..b627f0b --- /dev/null +++ b/packages/daemon/src/planner-auth.test.ts @@ -0,0 +1,64 @@ +import { describe, it, expect } from 'vitest' +import { + PLANNER_AUTH_HOLD_CODE, + PLANNER_FALLBACK_ENV, + detectPlannerAuthFailure, + plannerFallbackProvider, +} from './planner-auth.js' + +// overnight-p1 — the operator's real failure: `claude -p` returns 401 +// (subscription/Agent-SDK auth) and the cockpit used to raise a confusing +// generic HOLD-PLANNER-CLI. These pure helpers classify the failure and read +// the explicit opt-in fallback policy. NEVER a paid-API fallback. + +describe('detectPlannerAuthFailure', () => { + it('classifies a 401 stderr as an auth failure with the matched hint', () => { + const hit = detectPlannerAuthFailure({ + exitCode: 1, + error: 'API Error: 401 {"type":"error","error":{"type":"authentication_error","message":"OAuth token has expired."}}', + }) + expect(hit).not.toBeNull() + expect(hit!.matched).toBe('401') + }) + + it('classifies unauthorized / login / credit hints (case-insensitive)', () => { + expect(detectPlannerAuthFailure({ exitCode: 1, error: 'Unauthorized request' })!.matched.toLowerCase()).toBe('unauthorized') + expect(detectPlannerAuthFailure({ exitCode: 2, transcript: 'Please run /login to continue' })!.matched.toLowerCase()).toBe('login') + expect(detectPlannerAuthFailure({ exitCode: 1, error: 'You are out of credit for this billing period' })!.matched.toLowerCase()).toBe('credit') + }) + + it('matches the hint in the transcript when stderr is empty', () => { + const hit = detectPlannerAuthFailure({ exitCode: 1, transcript: 'authentication_error: token invalid' }) + expect(hit).not.toBeNull() + }) + + it('a SUCCESSFUL run is never an auth failure even if the text mentions 401', () => { + expect(detectPlannerAuthFailure({ exitCode: 0, transcript: 'Here is how to handle HTTP 401 in your app' })).toBeNull() + }) + + it('a failure without any auth hint is NOT classified as auth', () => { + 
expect(detectPlannerAuthFailure({ exitCode: -1, error: 'spawn failed: ENOENT' })).toBeNull() + expect(detectPlannerAuthFailure({ exitCode: 124 })).toBeNull() + }) + + it('exports the distinct hold code', () => { + expect(PLANNER_AUTH_HOLD_CODE).toBe('HOLD-PLANNER-AUTH') + }) +}) + +describe('plannerFallbackProvider — explicit opt-in only', () => { + it('default (env unset) → null: no fallback, claude-only planner pin holds', () => { + expect(plannerFallbackProvider({})).toBeNull() + }) + + it('AEDEV_PLANNER_FALLBACK=codex → codex (trimmed, case-insensitive)', () => { + expect(plannerFallbackProvider({ [PLANNER_FALLBACK_ENV]: 'codex' })).toBe('codex') + expect(plannerFallbackProvider({ [PLANNER_FALLBACK_ENV]: ' CODEX ' })).toBe('codex') + }) + + it('any other value → null (no silent API fallback, no other providers)', () => { + for (const v of ['1', 'true', 'claude', 'openai', 'anthropic-api', '']) { + expect(plannerFallbackProvider({ [PLANNER_FALLBACK_ENV]: v })).toBeNull() + } + }) +}) diff --git a/packages/daemon/src/planner-auth.ts b/packages/daemon/src/planner-auth.ts new file mode 100644 index 0000000..04d236a --- /dev/null +++ b/packages/daemon/src/planner-auth.ts @@ -0,0 +1,52 @@ +/** + * overnight-p1 — honest planner auth-failure detection + opt-in codex fallback policy. + * + * The operator's real failure mode: `claude -p` returns 401 (subscription / + * Agent-SDK auth) and the cockpit planner used to raise a confusing generic + * HOLD-PLANNER-CLI. These pure helpers: + * 1. classify a FAILED local-CLI result as an auth failure + * (HOLD-PLANNER-AUTH) when the stderr/transcript carries an auth hint, and + * 2. read the explicit opt-in fallback policy `AEDEV_PLANNER_FALLBACK=codex`. + * + * Policy (operator-approved; intentionally relaxes the v3-P1 claude-only + * planner pin, but ONLY via explicit opt-in): env unset (default) = NO + * fallback; exactly `codex` = retry once via the local Codex CLI in read-only + * exec mode. 
There is NEVER a paid-API fallback (non-negotiable #6 / GR#6), + * and a fallback run is always recorded as `planner_provider: + * codex-cli (fallback)` — the system never pretends it was claude. + */ + +export const PLANNER_AUTH_HOLD_CODE = 'HOLD-PLANNER-AUTH' +export const PLANNER_FALLBACK_ENV = 'AEDEV_PLANNER_FALLBACK' + +/** Hints that a failed CLI run is an auth/subscription problem rather than a + * generic CLI breakage. Mirrors what the claude adapter surfaces: stderr in + * `error` (e.g. `API Error: 401 ... authentication_error ...`) and the JSON + * `result` text in `transcript`. */ +const AUTH_HINT = /401|unauthorized|auth|credit|login/i + +export interface PlannerCliOutcome { + exitCode: number + error?: string | undefined + transcript?: string | undefined +} + +/** Returns the matched auth hint when a FAILED CLI result (exitCode != 0) + * looks like an auth/credit/login problem; null for successes and for + * failures with no auth hint. */ +export function detectPlannerAuthFailure(result: PlannerCliOutcome): { matched: string } | null { + if (result.exitCode === 0) return null + for (const text of [result.error, result.transcript]) { + if (!text) continue + const match = AUTH_HINT.exec(text) + if (match) return { matched: match[0] } + } + return null +} + +/** Explicit opt-in planner fallback. Only the exact value `codex` + * (trimmed, case-insensitive) enables it; unset or anything else → null, + * i.e. the claude-only planner pin stays in force. */ +export function plannerFallbackProvider(env: NodeJS.ProcessEnv = process.env): 'codex' | null { + return (env[PLANNER_FALLBACK_ENV] ?? '').trim().toLowerCase() === 'codex' ? 
'codex' : null +} diff --git a/packages/daemon/src/routes/operator.ts b/packages/daemon/src/routes/operator.ts index a6353c7..cb99cfc 100644 --- a/packages/daemon/src/routes/operator.ts +++ b/packages/daemon/src/routes/operator.ts @@ -38,6 +38,7 @@ import { recordHeadlessCall, } from '../headless-budget-guard.js' import { ClaudeReviewer, type ReviewVerdict } from '../claude-reviewer.js' +import { PLANNER_AUTH_HOLD_CODE, detectPlannerAuthFailure, plannerFallbackProvider } from '../planner-auth.js' import { deriveUserState, type UserStateView } from '../user-state.js' import { deriveLoopCard, type LoopCard } from '../loop-cards.js' @@ -393,17 +394,28 @@ export function registerOperatorRoutes(app: FastifyInstance, db: AedevDb, stateD db.insertEvent('operator.roadmap_generation_started', 'operator_session', session.id, { repoId }) const generated = await generateRoadmapDesign(db, stateDir, intake, repoId, session) if (!generated.ok) { + // HOLD-PLANNER-AUTH propagates so the cockpit shows the re-login fix + // instead of a confusing generic planner hold (overnight-p1). + const holdCode = generated.holdCode ?? 'HOLD-ROADMAP-PLANNER' db.updateOperatorSession(session.id, { status: 'hold' }) db.insertOperatorMessage({ sessionId: session.id, role: 'assistant', content: generated.message }) db.insertEvent('operator.hold_created', 'operator_session', session.id, { - holdCode: 'HOLD-ROADMAP-PLANNER', + holdCode, reason: generated.reason, }) - db.insertHold({ entityType: 'operator_session', entityId: session.id, code: 'HOLD-ROADMAP-PLANNER', reason: generated.reason }) + db.insertHold({ + entityType: 'operator_session', + entityId: session.id, + code: holdCode, + reason: generated.reason, + ...(holdCode === PLANNER_AUTH_HOLD_CODE + ? { nextAction: 'Run `claude login` (or check /status credit), or set AEDEV_PLANNER_FALLBACK=codex for an honest local fallback.' 
} + : {}), + }) return { session: db.getOperatorSession(session.id), messages: db.listOperatorMessages(session.id), - hold: { code: 'HOLD-ROADMAP-PLANNER', reason: generated.reason }, + hold: { code: holdCode, reason: generated.reason }, } } const mission = intake.requestApproval(generated.mission.id) @@ -1042,6 +1054,7 @@ async function completePlannerBrainstorm( ? undefined : recordClarifyRound(db, sessionId, 'planner', parsed, questions) db.updateOperatorSession(sessionId, { status: isHold || missingStructuredQuestions ? 'hold' : gateState?.unlocked ? 'brainstorm_ready' : 'clarifying' }) + if (isHold) recordPlannerAuthHold(db, sessionId, brainstorm.event) if (missingStructuredQuestions) { db.insertEvent('operator.hold_created', 'operator_session', sessionId, { holdCode: 'HOLD-CLARIFY-STRUCTURE', @@ -1082,6 +1095,26 @@ async function completePlannerBrainstorm( } } +/** overnight-p1 — persist a real HOLD row for a planner AUTH failure so the + * user-state / blocker card surface the calm re-login fix instead of a + * generic stuck session. Idempotent enough for the cockpit: one row per + * failed planner round; a later success resolves it via resolveSessionHolds. */ +function recordPlannerAuthHold(db: AedevDb, sessionId: string, event: Record): void { + if (event['holdCode'] !== PLANNER_AUTH_HOLD_CODE) return + const failures = event['failures'] + const reason = Array.isArray(failures) && failures.length + ? failures.map(String).join('; ') + : 'local Claude CLI auth failure' + db.insertEvent('operator.hold_created', 'operator_session', sessionId, { holdCode: PLANNER_AUTH_HOLD_CODE, reason }) + db.insertHold({ + entityType: 'operator_session', + entityId: sessionId, + code: PLANNER_AUTH_HOLD_CODE, + reason, + nextAction: 'Run `claude login` (or check /status credit), or set AEDEV_PLANNER_FALLBACK=codex for an honest local fallback.', + }) +} + /** Resolve any active session HOLDs and announce it so the UI clears stale banners (PRD §D). 
*/ function resolveSessionHolds(db: AedevDb, sessionId: string, code?: string): void { const resolved = db.resolveHold(sessionId, code) @@ -1160,7 +1193,15 @@ async function runPlannerFollowup(requestPrompt: string, title: string, repoId: return runLocalPlannerText(systemPrompt, plannerPrompt, 'planner-followup', requestPrompt, budget) } -async function runLocalPlannerText(systemPrompt: string, plannerPrompt: string, role: string, holdContextPrompt: string, budget?: HeadlessBudgetCtx): Promise<{ content: string; event: Record }> { +/** Injectable planner adapters (overnight-p1). Tests pass fakes; production + * uses the real local CLIs from @aedev/runner (GR#8: the daemon itself never + * forks child processes — the spawning lives in the runner package). */ +export interface PlannerAdapterDeps { + claude?: Pick + codex?: Pick +} + +export async function runLocalPlannerText(systemPrompt: string, plannerPrompt: string, role: string, holdContextPrompt: string, budget?: HeadlessBudgetCtx, adapters?: PlannerAdapterDeps): Promise<{ content: string; event: Record }> { const timeoutMs = Number(process.env['AEDEV_COCKPIT_AI_TIMEOUT_MS'] ?? '300000') const failures: string[] = [] const plannerProvider = process.env['AEDEV_COCKPIT_PLANNER_PROVIDER'] ?? 'claude' @@ -1177,7 +1218,9 @@ async function runLocalPlannerText(systemPrompt: string, plannerPrompt: string, } } - const claude = new ClaudeCodeAdapter() + const claude = adapters?.claude ?? new ClaudeCodeAdapter() + let claudeFailed = false + let authHint: string | undefined if (plannerProvider === 'claude' && await claude.isAvailable()) { const result = await claude.run(plannerPrompt, process.cwd(), { systemPrompt, @@ -1207,16 +1250,72 @@ async function runLocalPlannerText(systemPrompt: string, plannerPrompt: string, }, } } - failures.push(`claude-cli: ${result.error ?? 
`exit ${result.exitCode}`}`) + claudeFailed = true + // Distinct, honest auth classification (overnight-p1): a 401/credit/login + // failure is the operator's problem to fix, not a broken CLI install. + const auth = detectPlannerAuthFailure(result) + if (auth) { + authHint = auth.matched + failures.push(`claude-cli auth failure (matched '${auth.matched}'): ${result.error ?? `exit ${result.exitCode}`}`) + } else { + failures.push(`claude-cli: ${result.error ?? `exit ${result.exitCode}`}`) + } + } else if (plannerProvider === 'claude') { + claudeFailed = true } if (plannerProvider !== 'claude') { failures.push(`unsupported planner provider '${plannerProvider}'; P1 requires claude-cli`) } + // Honest opt-in fallback (overnight-p1, operator policy): when + // AEDEV_PLANNER_FALLBACK=codex is set AND claude failed for ANY reason, + // retry ONCE via the local codex CLI in read-only exec mode (the same + // contract as the worker-session probe). NEVER a paid API. The event always + // records planner_provider 'codex-cli (fallback)' — never pretending claude. + if (plannerProvider === 'claude' && claudeFailed && plannerFallbackProvider() === 'codex') { + const codex = adapters?.codex ?? 
new CodexCliAdapter() + if (await codex.isAvailable()) { + const result = await codex.run([systemPrompt, '', plannerPrompt].join('\n'), process.cwd(), { + timeoutMs, + sandbox: 'read-only', + approvalPolicy: 'never', + }) + if (budget) { + recordHeadlessCall(budget.db, budget.sessionId, { + role, + provider: 'codex-cli', + authMode: result.authMode, + inputTokens: result.inputTokens, + outputTokens: result.outputTokens, + exitCode: result.exitCode, + }) + } + if (result.exitCode === 0 && result.transcript.trim()) { + return { + content: result.transcript.trim(), + event: { + role, + provider: 'codex-cli', + planner_provider: 'codex-cli (fallback)', + fallbackFrom: 'claude-cli', + authMode: result.authMode, + inputTokens: result.inputTokens, + outputTokens: result.outputTokens, + costUsd: result.costUsd, + }, + } + } + failures.push(`codex-cli (fallback): ${result.error ?? `exit ${result.exitCode}`}`) + } else { + failures.push('codex-cli (fallback): codex CLI not found on PATH') + } + } + + const holdCode = authHint ? PLANNER_AUTH_HOLD_CODE : failures.length ? 'HOLD-PLANNER-CLI' : 'HOLD-NO-LOCAL-CLI' return { - content: renderPlannerHold('claude-cli planner', failures.length ? failures.join('; ') : 'No healthy Claude CLI was found on PATH.', holdContextPrompt), - event: { role, provider: null, holdCode: failures.length ? 'HOLD-PLANNER-CLI' : 'HOLD-NO-LOCAL-CLI', failures }, + content: renderPlannerHold('claude-cli planner', failures.length ? failures.join('; ') : 'No healthy Claude CLI was found on PATH.', holdContextPrompt, holdCode), + event: { role, provider: null, holdCode, failures, ...(authHint ? { authHint } : {}) }, } } @@ -1277,6 +1376,7 @@ async function completePlannerFollowup( ? undefined : recordClarifyRound(db, sessionId, 'planner-followup', parsed, questions) db.updateOperatorSession(sessionId, { status: isHold || missingStructuredQuestions ? 'hold' : gateState?.unlocked ? 
'brainstorm_ready' : 'clarifying' }) + if (isHold) recordPlannerAuthHold(db, sessionId, followup.event) if (missingStructuredQuestions) { db.insertEvent('operator.hold_created', 'operator_session', sessionId, { holdCode: 'HOLD-CLARIFY-STRUCTURE', @@ -1317,13 +1417,16 @@ async function completePlannerFollowup( } } -function renderPlannerHold(provider: string, reason: string, prompt: string): string { +function renderPlannerHold(provider: string, reason: string, prompt: string, holdCode = 'HOLD-PLANNER-CLI'): string { + const recovery = holdCode === PLANNER_AUTH_HOLD_CODE + ? 'No synthetic brainstorm was substituted. Local Claude auth failed — run `claude login` in a terminal (or check subscription credit with /status). Optional honest fallback: set AEDEV_PLANNER_FALLBACK=codex to retry once via the local Codex CLI. No paid API is ever used.' + : 'No synthetic brainstorm was substituted. Fix the local CLI/session and click New Brainstorm again.' return [ - `HOLD-PLANNER-CLI: ${provider} could not produce a real brainstorm.`, + `${holdCode}: ${provider} could not produce a real brainstorm.`, '', `Reason: ${reason}`, '', - 'No synthetic brainstorm was substituted. 
Fix the local CLI/session and click New Brainstorm again.', + recovery, '', `Original prompt: ${prompt}`, ].join('\n') @@ -1359,7 +1462,7 @@ async function generateRoadmapDesign( intake: IntakeService, repoId: string, session: { id: string; title: string; prompt: string }, -): Promise<{ ok: true; mission: NonNullable>; design: MissionDesign } | { ok: false; reason: string; message: string }> { +): Promise<{ ok: true; mission: NonNullable>; design: MissionDesign } | { ok: false; reason: string; message: string; holdCode?: string }> { if (isTemplateRoadmapEnabled()) { const mission = intake.createMissionCandidate(repoId, session.prompt, session.title) registerDesignArtifacts(db, stateDir, mission.id) @@ -1375,15 +1478,19 @@ async function generateRoadmapDesign( const output = await runPlannerMissionDesign(session.prompt, session.title, repoId, mission.id, buildClarifications(db, session.id), { db, sessionId: session.id }) if (!output.ok) { + const holdCode = output.holdCode ?? 'HOLD-ROADMAP-PLANNER' return { ok: false, reason: output.reason, + holdCode, message: [ - 'HOLD-ROADMAP-PLANNER: local planner CLI could not produce a valid PRD/ADR/Roadmap design.', + `${holdCode}: local planner CLI could not produce a valid PRD/ADR/Roadmap design.`, '', `Reason: ${output.reason}`, '', - 'No deterministic template was substituted. Fix the planner CLI/session or set AEDEV_COCKPIT_FORCE_TEMPLATE=1 for explicit test fallback.', + holdCode === PLANNER_AUTH_HOLD_CODE + ? 'No deterministic template was substituted. Local Claude auth failed — run `claude login` (or check subscription credit with /status), or set AEDEV_PLANNER_FALLBACK=codex for an honest local fallback. No paid API is ever used.' + : 'No deterministic template was substituted. 
Fix the planner CLI/session or set AEDEV_COCKPIT_FORCE_TEMPLATE=1 for explicit test fallback.', ].join('\n'), } } @@ -1405,6 +1512,8 @@ async function generateRoadmapDesign( db.insertEvent('operator.cost_updated', 'operator_session', session.id, { scope: 'planner', provider: output.provider, + // Honest provenance: 'codex-cli (fallback)' when the opt-in fallback planned. + ...(output.plannerProvider ? { planner_provider: output.plannerProvider } : {}), authMode: output.authMode, inputTokens: output.inputTokens, outputTokens: output.outputTokens, @@ -1441,22 +1550,25 @@ function buildClarifications(db: AedevDb, sessionId: string): string { return lines.join('\n') } -async function runPlannerMissionDesign( +export async function runPlannerMissionDesign( prompt: string, title: string, repoId: string, missionId: string, clarifications = '', budget?: HeadlessBudgetCtx, + adapters?: PlannerAdapterDeps, ): Promise<{ ok: true design: unknown provider: string + /** Honest provenance — 'codex-cli (fallback)' when the opt-in fallback produced the design. */ + plannerProvider?: string authMode?: string inputTokens: number outputTokens: number costUsd: number | null -} | { ok: false; reason: string }> { +} | { ok: false; reason: string; holdCode?: string }> { const fixture = process.env['AEDEV_COCKPIT_PLANNER_FIXTURE_JSON'] if (fixture) { try { @@ -1507,7 +1619,9 @@ async function runPlannerMissionDesign( } } - const claude = new ClaudeCodeAdapter() + const claude = adapters?.claude ?? 
new ClaudeCodeAdapter() + let claudeFailed = false + let authHint: string | undefined if (provider === 'claude' && await claude.isAvailable()) { const result = await claude.run(plannerPrompt, process.cwd(), { timeoutMs, permissionMode: 'bypassPermissions' }) if (budget) { @@ -1531,17 +1645,69 @@ async function runPlannerMissionDesign( outputTokens: result.outputTokens, costUsd: result.costUsd, } + claudeFailed = true failures.push(`claude-cli invalid JSON: ${parsed.reason}`) } else { - failures.push(`claude-cli: ${result.error ?? `exit ${result.exitCode}`}`) + claudeFailed = true + const auth = detectPlannerAuthFailure(result) + if (auth) { + authHint = auth.matched + failures.push(`claude-cli auth failure (matched '${auth.matched}'): ${result.error ?? `exit ${result.exitCode}`}`) + } else { + failures.push(`claude-cli: ${result.error ?? `exit ${result.exitCode}`}`) + } } + } else if (provider === 'claude') { + claudeFailed = true } if (provider !== 'claude') { failures.push(`unsupported planner provider '${provider}'; P1 requires claude-cli`) } - return { ok: false, reason: failures.length ? failures.join('; ') : 'No healthy local Claude planner CLI found.' } + // Honest opt-in fallback (overnight-p1): AEDEV_PLANNER_FALLBACK=codex retries + // ONCE via the local codex CLI (read-only exec, same fenced-JSON contract). + // NEVER a paid API; provenance is recorded as 'codex-cli (fallback)'. + if (provider === 'claude' && claudeFailed && plannerFallbackProvider() === 'codex') { + const codex = adapters?.codex ?? 
new CodexCliAdapter() + if (await codex.isAvailable()) { + const result = await codex.run(plannerPrompt, process.cwd(), { timeoutMs, sandbox: 'read-only', approvalPolicy: 'never' }) + if (budget) { + recordHeadlessCall(budget.db, budget.sessionId, { + role: 'mission-design', + provider: 'codex-cli', + authMode: result.authMode, + inputTokens: result.inputTokens, + outputTokens: result.outputTokens, + exitCode: result.exitCode, + }) + } + if (result.exitCode === 0 && result.transcript.trim()) { + const parsed = extractJsonObject(result.transcript) + if (parsed.ok) return { + ok: true, + design: parsed.value, + provider: 'codex-cli', + plannerProvider: 'codex-cli (fallback)', + authMode: result.authMode, + inputTokens: result.inputTokens, + outputTokens: result.outputTokens, + costUsd: result.costUsd, + } + failures.push(`codex-cli (fallback) invalid JSON: ${parsed.reason}`) + } else { + failures.push(`codex-cli (fallback): ${result.error ?? `exit ${result.exitCode}`}`) + } + } else { + failures.push('codex-cli (fallback): codex CLI not found on PATH') + } + } + + return { + ok: false, + reason: failures.length ? failures.join('; ') : 'No healthy local Claude planner CLI found.', + ...(authHint ? { holdCode: PLANNER_AUTH_HOLD_CODE } : {}), + } } function extractJsonObject(text: string): { ok: true; value: unknown } | { ok: false; reason: string } { diff --git a/packages/daemon/src/run-summary.test.ts b/packages/daemon/src/run-summary.test.ts new file mode 100644 index 0000000..93f7aec --- /dev/null +++ b/packages/daemon/src/run-summary.test.ts @@ -0,0 +1,138 @@ +/** Overnight P5 — evidence audit package: run-summary.md renderer contract. + * + * Honesty rule under test: every required section exists in every render, a + * missing input renders as an explicit "absent" marker, and the renderer can + * NEVER fabricate a PASS or a "real" classification from missing evidence + * (GR: evidence honesty — real / simulated / unproven explicit). 
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest' +import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'fs' +import { join } from 'path' +import { tmpdir } from 'os' +import { + RUN_SUMMARY_SECTIONS, + classifyRun, + listEvidenceArtifacts, + renderRunSummary, + writeRunSummary, + type RunSummaryInput, +} from './run-summary.js' + +function fullInput(): RunSummaryInput { + return { + missionId: 'm-full', + missionTitle: 'Ship the audit artifact', + status: 'done', + summary: 'Run run-1 exited 0; merge decision AUTO_MERGE.', + changedPaths: ['README.md', 'src/foo.ts'], + diffSummary: '# Diff Summary\n\nTwo files changed.\n', + validatorResults: [ + { validator: 'gemini', verdict: 'pass', summary: 'evidence is consistent' }, + { validator: 'openai', verdict: 'pass' }, + ], + reviewerVerdict: { verdict: 'approve', cycle: 2, findings: ['nit: rename helper'] }, + mergeDecision: 'AUTO_MERGE', + prUrl: 'https://github.com/o/r/pull/7', + costEvents: 3, + headlessCalls: 1, + holds: [{ code: 'HOLD-BUDGET', reason: 'daily cap reached' }], + artifacts: ['diff-summary.md', 'plan.md'], + provider: 'claude-cli', + localCommitSha: 'abc123', + } +} + +describe('renderRunSummary — full input', () => { + it('renders every required section with the supplied data', () => { + const md = renderRunSummary(fullInput()) + for (const section of RUN_SUMMARY_SECTIONS) expect(md).toContain(section) + expect(md).toContain('# Run Summary — m-full') + expect(md).toContain('Ship the audit artifact') + expect(md).toContain('- README.md') + expect(md).toContain('- src/foo.ts') + expect(md).toContain('- gemini: pass — evidence is consistent') + expect(md).toContain('- openai: pass') + expect(md).toContain('approve (cycle 2)') + expect(md).toContain('nit: rename helper') + expect(md).toContain('https://github.com/o/r/pull/7') + expect(md).toContain('model usage events: 3') + expect(md).toContain('headless calls (mission scope): 1') + 
expect(md).toContain('- HOLD-BUDGET: daily cap reached') + expect(md).toContain('- plan.md') + expect(md).toContain('Classification: real') + expect(md).toContain('abc123') + }) +}) + +describe('renderRunSummary — missing-file honesty', () => { + it('renders every section as an explicit absent marker on a minimal input', () => { + const md = renderRunSummary({ missionId: 'm-min', status: 'failed' }) + for (const section of RUN_SUMMARY_SECTIONS) expect(md).toContain(section) + expect(md).toContain('(absent — no run summary recorded)') + expect(md).toContain('(absent — runner produced no changed-paths.json)') + expect(md).toContain('(absent — no diff evidence)') + expect(md).toContain('(absent — no validator verdict recorded)') + expect(md).toContain('(absent — no reviewer verdict recorded)') + expect(md).toContain('(absent — no PR attempted and no gate outcome recorded)') + expect(md).toContain('model usage events: absent') + expect(md).toContain('headless calls (mission scope): absent') + expect(md).toContain('(absent — hold state not collected)') + expect(md).toContain('(absent — artifacts not listed)') + // never a fabricated verdict + expect(md).not.toContain(': pass') + expect(md).not.toContain('PASS') + expect(md).toContain('Classification: simulated') + }) + + it('an empty changed-paths list is reported as an empty diff, not as absent', () => { + const md = renderRunSummary({ missionId: 'm-empty', status: 'waiting', changedPaths: [] }) + expect(md).toContain('(none — empty diff reported by the runner)') + expect(md).not.toContain('(absent — runner produced no changed-paths.json)') + }) + + it('an empty validator list still renders as absent — never an implicit pass', () => { + const md = renderRunSummary({ missionId: 'm-noval', status: 'waiting', validatorResults: [] }) + expect(md).toContain('(absent — no validator verdict recorded)') + }) +}) + +describe('classifyRun — real / simulated / unproven', () => { + it('mock or missing provider → simulated', () => 
{ + expect(classifyRun({}).label).toBe('simulated') + expect(classifyRun({ provider: 'mock', localCommitSha: 'abc', changedPaths: ['a'] }).label).toBe('simulated') + }) + + it('real provider + local commit + changed paths → real', () => { + expect(classifyRun({ provider: 'claude-cli', localCommitSha: 'abc', changedPaths: ['a.ts'] }).label).toBe('real') + }) + + it('real provider without commit or diff evidence → unproven, never real', () => { + expect(classifyRun({ provider: 'claude-cli' }).label).toBe('unproven') + expect(classifyRun({ provider: 'codex-cli', localCommitSha: 'abc', changedPaths: [] }).label).toBe('unproven') + expect(classifyRun({ provider: 'codex-cli', changedPaths: ['a.ts'] }).label).toBe('unproven') + }) +}) + +describe('writeRunSummary + listEvidenceArtifacts', () => { + let dir: string + beforeEach(() => { + dir = join(tmpdir(), `aedev-run-summary-test-${Date.now()}-${Math.random().toString(16).slice(2)}`) + mkdirSync(dir, { recursive: true }) + }) + afterEach(() => rmSync(dir, { recursive: true, force: true })) + + it('writes run-summary.md into the evidence dir and returns its path', () => { + const path = writeRunSummary(dir, { missionId: 'm-write', status: 'done' }) + expect(path).toBe(join(dir, 'run-summary.md')) + expect(existsSync(path)).toBe(true) + expect(readFileSync(path, 'utf8')).toContain('# Run Summary — m-write') + }) + + it('lists top-level evidence files sorted, and returns [] for a missing dir', () => { + writeFileSync(join(dir, 'b.md'), 'b') + writeFileSync(join(dir, 'a.md'), 'a') + mkdirSync(join(dir, 'nodes')) + expect(listEvidenceArtifacts(dir)).toEqual(['a.md', 'b.md']) + expect(listEvidenceArtifacts(join(dir, 'does-not-exist'))).toEqual([]) + }) +}) diff --git a/packages/daemon/src/run-summary.ts b/packages/daemon/src/run-summary.ts new file mode 100644 index 0000000..1b6381e --- /dev/null +++ b/packages/daemon/src/run-summary.ts @@ -0,0 +1,198 @@ +/** WORKBOOK_v6 overnight P5 — evidence audit package. 
+ * + * `run-summary.md` is the single audit artifact written into every mission + * evidence dir at the end of `runMission` (done / failed / held paths). It + * complements `workbook-summary.md` (operator-facing narrative) with a + * machine-auditable, honesty-first digest: every required section is always + * present, a missing input renders as an explicit "absent" marker (never a + * fabricated PASS), and the run is explicitly classified + * real / simulated / unproven (GR: evidence honesty). + * + * `renderRunSummary` is pure; `writeRunSummary` is the thin fs writer. + */ +import { existsSync, readdirSync, statSync, writeFileSync } from 'fs' +import { join } from 'path' + +export interface RunSummaryValidatorVerdict { + validator: string + verdict: string + summary?: string +} + +export interface RunSummaryReviewerVerdict { + verdict: string + cycle: number + findings?: string[] +} + +export interface RunSummaryHold { + code: string + reason: string +} + +export interface RunSummaryInput { + missionId: string + missionTitle?: string + status: string + /** One-line outcome of the run. */ + summary?: string + /** Real `git diff` file list from the runner; undefined = file absent. */ + changedPaths?: string[] + /** Raw diff-summary.md content (stats only are rendered, never re-asserted). */ + diffSummary?: string + validatorResults?: RunSummaryValidatorVerdict[] + reviewerVerdict?: RunSummaryReviewerVerdict + mergeDecision?: string + /** Real draft-PR URL when one was opened. */ + prUrl?: string + /** Why no PR exists (gate refusal / decision) when prUrl is absent. */ + prGateOutcome?: string + /** `model.usage.recorded` event count for this mission. */ + costEvents?: number + /** `cost.headless_call` event count attributed to this mission. */ + headlessCalls?: number + holds?: RunSummaryHold[] + artifacts?: string[] + /** Coder provider that produced the change (classification input). 
*/ + provider?: string + /** Local commit sha from local-commit.json (classification input). */ + localCommitSha?: string +} + +/** Every render contains all of these (the trailing entry is the + * real-vs-simulated classification line, not a markdown heading). */ +export const RUN_SUMMARY_SECTIONS = [ + '## Summary', + '## Changed Paths', + '## Diff Stats', + '## Validator Verdicts', + '## Reviewer Verdict', + '## PR / Gate Outcome', + '## Cost & Headless Calls', + '## Holds', + '## Artifacts', + 'Classification:', +] as const + +export type RunClassification = 'real' | 'simulated' | 'unproven' + +/** GR evidence honesty: a run is `real` only when a non-mock provider left + * both a local commit and a non-empty changed-paths list; a non-mock + * provider without that proof is `unproven`, never `real`. */ +export function classifyRun( + input: Pick<RunSummaryInput, 'provider' | 'localCommitSha' | 'changedPaths'>, +): { label: RunClassification; detail: string } { + if (!input.provider || input.provider === 'mock') { + return { label: 'simulated', detail: `coder provider is ${input.provider ?? 'unknown'} — no real worktree run` } + } + const changedCount = input.changedPaths?.length ?? 0 + if (input.localCommitSha && changedCount > 0) { + return { + label: 'real', + detail: `provider ${input.provider}, local commit ${input.localCommitSha}, ${changedCount} changed file(s)`, + } + } + const missing = [ + ...(input.localCommitSha ? [] : ['local commit evidence']), + ...(changedCount > 0 ? [] : ['changed-path evidence']), + ] + return { + label: 'unproven', + detail: `provider ${input.provider} ran, but ${missing.join(' and ')} is missing — not claiming a real change`, + } +} + +export function renderRunSummary(input: RunSummaryInput): string { + const lines: string[] = [ + `# Run Summary — ${input.missionId}`, + '', + ...(input.missionTitle !== undefined ? [`**Mission:** ${input.missionTitle}`] : []), + `**Status:** ${input.status}`, + `**Decision:** ${input.mergeDecision ?? 
'(absent — no merge decision recorded)'}`, + '', + '## Summary', + input.summary ?? '(absent — no run summary recorded)', + '', + '## Changed Paths', + ] + + if (input.changedPaths === undefined) lines.push('(absent — runner produced no changed-paths.json)') + else if (input.changedPaths.length === 0) lines.push('(none — empty diff reported by the runner)') + else for (const p of input.changedPaths) lines.push(`- ${p}`) + + lines.push('', '## Diff Stats') + if (input.changedPaths === undefined && input.diffSummary === undefined) { + lines.push('(absent — no diff evidence)') + } else { + if (input.changedPaths !== undefined) lines.push(`- files changed: ${input.changedPaths.length}`) + lines.push( + input.diffSummary !== undefined + ? `- diff-summary.md: present (${input.diffSummary.split('\n').length} lines)` + : '- diff-summary.md: absent', + ) + } + + lines.push('', '## Validator Verdicts') + if (!input.validatorResults || input.validatorResults.length === 0) { + lines.push('(absent — no validator verdict recorded)') + } else { + for (const v of input.validatorResults) { + lines.push(`- ${v.validator}: ${v.verdict}${v.summary ? ` — ${v.summary}` : ''}`) + } + } + + lines.push('', '## Reviewer Verdict') + if (!input.reviewerVerdict) { + lines.push('(absent — no reviewer verdict recorded)') + } else { + lines.push(`- ${input.reviewerVerdict.verdict} (cycle ${input.reviewerVerdict.cycle})`) + for (const finding of input.reviewerVerdict.findings ?? []) lines.push(` - ${finding}`) + } + + lines.push('', '## PR / Gate Outcome') + if (input.prUrl) lines.push(`- draft PR: ${input.prUrl}`) + else if (input.prGateOutcome) lines.push(`- gate outcome: ${input.prGateOutcome}`) + else lines.push('(absent — no PR attempted and no gate outcome recorded)') + + lines.push( + '', + '## Cost & Headless Calls', + `- model usage events: ${input.costEvents ?? 'absent'}`, + `- headless calls (mission scope): ${input.headlessCalls ?? 
'absent'}`, + ) + + lines.push('', '## Holds') + if (input.holds === undefined) lines.push('(absent — hold state not collected)') + else if (input.holds.length === 0) lines.push('(none)') + else for (const h of input.holds) lines.push(`- ${h.code}: ${h.reason}`) + + lines.push('', '## Artifacts') + if (input.artifacts === undefined) lines.push('(absent — artifacts not listed)') + else if (input.artifacts.length === 0) lines.push('(none)') + else for (const a of input.artifacts) lines.push(`- ${a}`) + + const classification = classifyRun(input) + lines.push('', `Classification: ${classification.label} — ${classification.detail}`, '') + return lines.join('\n') +} + +/** Write run-summary.md into the evidence dir; returns the file path. */ +export function writeRunSummary(evidenceDir: string, input: RunSummaryInput): string { + const path = join(evidenceDir, 'run-summary.md') + writeFileSync(path, renderRunSummary(input)) + return path +} + +/** Sorted top-level evidence files (subdirectories like nodes/ are skipped). 
*/ +export function listEvidenceArtifacts(evidenceDir: string): string[] { + if (!existsSync(evidenceDir)) return [] + return readdirSync(evidenceDir) + .filter((entry) => { + try { + return statSync(join(evidenceDir, entry)).isFile() + } catch { + return false + } + }) + .sort() +} diff --git a/packages/daemon/src/user-state.test.ts b/packages/daemon/src/user-state.test.ts index 03668b4..9d35239 100644 --- a/packages/daemon/src/user-state.test.ts +++ b/packages/daemon/src/user-state.test.ts @@ -88,6 +88,14 @@ describe('deriveUserState — blocked with HUMAN explanations from hold codes', expect(v.explanation).toContain('本地 AI 引擎未就绪') }) + it('HOLD-PLANNER-AUTH → calm re-login wording with the exact one-line fix, never raw 401', () => { + const v = derive('brainstorming', { activeHoldCodes: ['HOLD-PLANNER-AUTH'] }) + expect(v.state).toBe('blocked') + expect(v.explanation).toContain('本地 Claude 登录已过期或额度用尽') + expect(v.explanation).toContain('claude login') + expect(v.explanation).not.toMatch(/401|unauthorized|HOLD-/i) + }) + it('any other active HOLD-* → blocked with a calm generic fallback, never the raw code', () => { const v = derive('running', { activeHoldCodes: ['HOLD-ROADMAP-PLANNER'] }) expect(v.state).toBe('blocked') diff --git a/packages/daemon/src/user-state.ts b/packages/daemon/src/user-state.ts index 02cdc1a..669997d 100644 --- a/packages/daemon/src/user-state.ts +++ b/packages/daemon/src/user-state.ts @@ -61,6 +61,9 @@ export function explainBlockingCode(code: string | undefined): string { if (code && code.startsWith('HOLD-REVIEW-LOOP')) { return '自动修复多次未通过,需要人看一眼 · Automatic repair did not pass after several tries; a person should take a look.' } + if (code && code.startsWith('HOLD-PLANNER-AUTH')) { + return '本地 Claude 登录已过期或额度用尽:在终端运行 claude login 重新登录,或检查订阅额度 · The local Claude session needs re-login or has run out of credit — run "claude login" in a terminal, or check your subscription credit.' 
+ } if (code && code.startsWith('HOLD-SESSION-POOL')) { return '本地 AI 引擎未就绪 · The local AI engine is not ready yet.' } diff --git a/scripts/operator-cockpit-user-e2e.ts b/scripts/operator-cockpit-user-e2e.ts index aacb77a..5c39d1c 100644 --- a/scripts/operator-cockpit-user-e2e.ts +++ b/scripts/operator-cockpit-user-e2e.ts @@ -279,6 +279,8 @@ async function runJourney(page: Page): Promise { await expectStage(page, ['roadmap_ready', 'pending_approval']) // v6-P2: at roadmap_ready the daemon-derived card is the plan card. await expectLoopCard(page, ['plan'], 'at roadmap_ready') + // overnight-p3: the plan card carries its own next-step action button. + await expectCardAction(page, ['generate-plan', 'approve-roadmap'], 'on the plan card') // Deferred half of step 2: the overview now exists, so cockpit-last-activity // must render and keep refreshing (two samples >1.5s apart). @@ -300,6 +302,8 @@ async function runJourney(page: Page): Promise { await step('step-5-approve-and-execute', 'Approve roadmap, start execution; execution state appears', async () => { await page.getByTestId('cockpit-approve-roadmap').click() await page.getByTestId('cockpit-start-execution').waitFor({ timeout: 20_000 }) + // overnight-p3: after approval the card's own action button offers Start. 
+ await expectCardAction(page, ['start-execution'], 'on the approved card') await shot(page, '05a-approved') await page.getByTestId('cockpit-start-execution').click() await waitForRootStage(page, ['running', 'evidence_ready', 'validators_missing', 'validating', 'pr_ready'], 30_000) @@ -427,7 +431,33 @@ async function expectLoopCard(page: Page, expectedTypes: string[], where: string throw new Error(`Machine code "${code}" (${attr}) rendered as visible loop-card text ${where}`) } } - note(`loop card ${where}: type=${lastType} · next_step="${nextStepBody.slice(0, 120)}"`) + // overnight-p3 — every card carries the agent strip: a new user sees WHO is + // working (Claude/Codex/Gemini/GitHub) without reading logs. + const agents = page.getByTestId('cockpit-card-agents') + if (!(await agents.count())) throw new Error(`Agent strip (cockpit-card-agents) missing on the loop card ${where}`) + const activeAgent = (await agents.getAttribute('data-active-agent')) ?? 'absent' + const agentsText = await agents.innerText() + for (const name of ['Claude', 'Codex', 'Gemini', 'GitHub']) { + if (!agentsText.includes(name)) throw new Error(`Agent strip is missing "${name}" ${where}`) + } + note(`loop card ${where}: type=${lastType} · active-agent=${activeAgent} · next_step="${nextStepBody.slice(0, 120)}"`) +} + +/** + * overnight-p3 — the card's next-step action button (cockpit-card-action): + * the daemon primaryAction is rendered ON the card, so the operator can act + * from the card itself without hunting for the guidance row. + */ +async function expectCardAction(page: Page, expectedActionIds: string[], where: string): Promise<void> { + const btn = page.getByTestId('cockpit-card-action') + await btn.waitFor({ timeout: 15_000 }) + const actionId = (await btn.getAttribute('data-action-id')) ?? 
'absent' + if (!expectedActionIds.includes(actionId)) { + throw new Error(`Expected card action in [${expectedActionIds.join(', ')}] ${where}, got: ${actionId}`) + } + const label = oneLine(await btn.innerText()) + if (label.length < 4) throw new Error(`Card action button has no readable label ${where}`) + note(`card action ${where}: ${actionId} · "${label}"`) } async function rootStage(page: Page): Promise {