diff --git a/skills/auto/SKILL.md b/skills/auto/SKILL.md index e8174825d..315780814 100644 --- a/skills/auto/SKILL.md +++ b/skills/auto/SKILL.md @@ -71,7 +71,8 @@ When the user types `ooo auto` with CLI-style flags inside chat, translate to MC | Layer | Code | Surface | Meaning | |---|---|---|---| -| Interview | `interview_max_rounds_exhausted` | `last_error_code`, `result.stop_reason_code` | Auto interview ran `max_interview_rounds` without ledger+backend mutual closure or safe-default fallback. | +| Interview | `interview_max_rounds_exhausted` | `last_error_code`, `result.stop_reason_code` | Auto interview ran `max_interview_rounds` without ledger+backend mutual closure, no section was safely defaultable, and no partial defaults applied — i.e. genuine deadlock with nothing the policy could close. | +| Interview | `interview_unsafe_gaps_remain` | `last_error_code`, `result.stop_reason_code` | Auto interview ran `max_interview_rounds` with at least one section safely defaultable and at least one section remaining unsafe (e.g. CONFLICTING ledger entry, production/credential context). Partial defaults are rolled back so the persisted transcript and ledger stay aligned; resume can address the unsafe gap and re-run. | | Interview | `interview_phase_deadline` | `last_error_code`, `result.stop_reason_code` | Interview phase exceeded its per-phase timeout. | | Ralph | `iteration_timeout` | blocker text + (future) `result.stop_reason_code` | A single Ralph iteration exceeded its per-iteration timeout. | | Ralph | `wall_clock_exhausted` | blocker text + (future) `result.stop_reason_code` | The Ralph wall-clock budget was exhausted before convergence. | @@ -81,4 +82,16 @@ When the user types `ooo auto` with CLI-style flags inside chat, translate to MC Blockers without a canonical code keep using the free-form ``last_error`` text. Ralph-layer codes are surfaced via blocker text today; their result-envelope promotion is tracked as a follow-up. 
+### Interview closure mode taxonomy + +When `result.status == "seed_ready"`, `result.interview_closure_mode` distinguishes how the interview was closed: + +| Value | Meaning | +|---|---| +| `None` | Mutual agreement — both the backend and the ledger declared the seed ready in the same round. The default healthy path. | +| `"ledger_only"` | PR-B1 / #1148: `max_rounds` hit; the ledger was structurally complete but the backend refused to declare closure. The interview closes on ledger-only consensus. Defaulted sections (if any) are tagged in `result.defaulted_sections`. | +| `"safe_default"` | PR-B2: `max_rounds` hit; the safe-default policy successfully filled every remaining required gap with auditable assumptions. Synthesis was pushed back into the persisted transcript so the seed generator sees the same assumptions the ledger records. Defaulted sections are tagged in `result.defaulted_sections`. | + +Genuine-deadlock and partial-unsafe outcomes do **not** set `interview_closure_mode`; they reach a `blocked` terminal with the matching `stop_reason_code` above instead. + The pipeline must not hang indefinitely: all loops are bounded and timeout failures return a resumable `auto_session_id`. Resume with `ooo auto --resume <auto_session_id>`. Use `--skip-run` to stop after the A-grade Seed. Use `--complete-product` to drive the full Interview → Seed → Run → Ralph → Product chain on a single `ooo auto` invocation; the chained Ralph loop honors the same wall-clock deadline as the parent auto session (`--timeout`). The CLI-only `--show-ledger` flag prints assumptions/non-goals; MCP skill responses already include the same ledger summary when available. 
diff --git a/src/ouroboros/auto/interview_driver.py b/src/ouroboros/auto/interview_driver.py index 71ca055c7..b188d6186 100644 --- a/src/ouroboros/auto/interview_driver.py +++ b/src/ouroboros/auto/interview_driver.py @@ -461,6 +461,10 @@ async def run(self, state: AutoPipelineState, ledger: SeedDraftLedger) -> AutoIn state.ledger = ledger.to_dict() state.pending_question = None state.interview_completed = True + # PR-B2 / #821: tag the envelope so callers can distinguish a + # safe-default-applied closure from a backend-confirmed close + # (``mutual_agreement``) and from PR-B1's ``ledger_only`` path. + state.interview_closure_mode = "safe_default" state.mark_progress( "safe-default finalization closed interview gaps: " + ", ".join(finalization.defaulted_sections), @@ -500,6 +504,47 @@ async def run(self, state: AutoPipelineState, ledger: SeedDraftLedger) -> AutoIn return AutoInterviewResult( "seed_ready", state.interview_session_id, ledger, self.max_rounds ) + # PR-B2 / #821: partial safe-default closure — some required gaps were + # safely defaultable, but at least one remained unsafe at max_rounds. + # Distinguish from the generic "nothing was defaultable" path with a + # dedicated structured event and a typed stop_reason_code so callers + # can resume with the unsafe gap context surfaced. Roll back the + # partial defaults because synthesis was never pushed to the backend + # transcript (same invariant as the synthesis-failure rollback above): + # leaving entries in the ledger that the persisted interview does not + # mirror would diverge on resume. 
+ if ( + finalization is not None + and finalization.defaulted_sections + and finalization.unsafe_gaps + ): + log.info( + "auto.interview.safe_default_partial_unsafe_gaps", + auto_session_id=state.auto_session_id, + defaulted_sections=finalization.defaulted_sections, + unsafe_gaps=finalization.unsafe_gaps, + ambiguity_score=turn.ambiguity_score, + interview_session_id=state.interview_session_id, + ) + _revert_safe_default_entries(ledger, finalization.defaulted_sections) + blocker = ( + f"auto interview reached max_rounds={self.max_rounds} with " + f"partial safe-default closure (rolled back): " + f"defaultable={list(finalization.defaulted_sections)}, " + f"unsafe_remaining={list(finalization.unsafe_gaps)}" + ) + state.ledger = ledger.to_dict() + state.mark_blocked( + blocker, + tool_name="interview_driver", + error_code="interview_unsafe_gaps_remain", + ) + record_authoring_backend(state) + self._save(state) + return AutoInterviewResult( + "blocked", state.interview_session_id, ledger, self.max_rounds, blocker + ) + if ledger_done: log.info( "auto.interview.mutual_agreement_deadlock_at_max_rounds", diff --git a/tests/unit/auto/test_interview_pipeline.py b/tests/unit/auto/test_interview_pipeline.py index 48f434b6d..bafdb3dad 100644 --- a/tests/unit/auto/test_interview_pipeline.py +++ b/tests/unit/auto/test_interview_pipeline.py @@ -735,6 +735,10 @@ async def answer( assert result.status == "seed_ready" assert state.interview_completed is True assert state.pending_question is None + # PR-B2 / #821: the safe-default closure path now tags the envelope so + # callers can distinguish it from mutual_agreement (None) and ledger_only. 
+ assert state.interview_closure_mode == "safe_default" + assert state.last_error_code is None assert ledger.open_gaps() == [] assert any( entry.status == LedgerStatus.DEFAULTED @@ -744,6 +748,78 @@ async def answer( assert any("[safe-default-synthesis]" in answer for answer in answers) +@pytest.mark.asyncio +async def test_interview_driver_blocks_with_unsafe_gaps_when_partially_defaultable( + tmp_path, +) -> None: + """PR-B2 / #821: a benign goal where one ledger section is CONFLICTING yields + partial safe-default at ``max_rounds`` — some sections defaultable, others + not. The driver must roll back the partial defaults (synthesis was never + pushed to the backend so leaving them would diverge from the persisted + transcript), record the typed ``interview_unsafe_gaps_remain`` stop code, + and not set ``interview_closure_mode`` (blocked outcomes do not carry it). + """ + + async def start(goal: str, cwd: str) -> InterviewTurn: # noqa: ARG001 + return InterviewTurn("What should we verify?", "interview_partial") + + async def answer( + session_id: str, text: str, *, last_question: str | None = None + ) -> InterviewTurn: # noqa: ARG001 + return InterviewTurn("What else?", session_id, seed_ready=False) + + state = AutoPipelineState(goal="Build a tiny local CLI", cwd=str(tmp_path)) + ledger = SeedDraftLedger.from_goal(state.goal) + # Seed a CONFLICTING entry on one section so it is per-section unsafe + # without triggering the goal-level unsafe-context gate (the goal is + # benign). The other gap sections remain safely defaultable, so + # ``finalize_safe_defaultable_gaps`` produces both ``defaulted_sections`` + # and ``unsafe_gaps`` — the exact partial-safe shape PR-B2 routes. 
+ ledger.add_entry( + "constraints", + LedgerEntry( + key="constraints.contradiction", + value="Two recorded answers disagree on whether to allow new deps.", + source=LedgerSource.USER_PREFERENCE, + confidence=1.0, + status=LedgerStatus.CONFLICTING, + ), + ) + assert "constraints" in ledger.open_gaps() + + driver = AutoInterviewDriver( + FunctionInterviewBackend(start, answer), + store=AutoStore(tmp_path), + max_rounds=1, + timeout_seconds=1, + ) + + result = await driver.run(state, ledger) + + assert result.status == "blocked" + assert state.phase == AutoPhase.BLOCKED + blocker = result.blocker or "" + assert "partial safe-default closure" in blocker + assert "rolled back" in blocker + # The new typed code distinguishes partial-safe from the genuine deadlock + # (``interview_max_rounds_exhausted``) and from the per-phase timeout + # (``interview_phase_deadline``). + assert state.last_error_code == "interview_unsafe_gaps_remain" + # Result envelope must not report a closure_mode on a blocked outcome. + assert state.interview_closure_mode is None + # Rollback invariant: no safe-default entries remain in the ledger. + assert not any( + entry.key.endswith(".safe_default_finalization") + for section in ledger.sections.values() + for entry in section.entries + ), "partial safe-default rollback must remove all defaulted entries" + # The CONFLICTING entry itself is preserved (it is user-recorded data the + # caller may need on resume to address the unsafe gap). 
+ assert any( + entry.key == "constraints.contradiction" for entry in ledger.sections["constraints"].entries + ) + + @pytest.mark.asyncio async def test_interview_driver_rolls_back_defaults_when_synthesis_sync_fails(tmp_path) -> None: async def start(goal: str, cwd: str) -> InterviewTurn: # noqa: ARG001 @@ -844,6 +920,12 @@ async def answer(session_id: str, text: str) -> InterviewTurn: # noqa: ARG001 assert "ledger_done=False" in blocker assert "open_gaps=" in blocker assert not ledger.is_seed_ready() + # PR-B2 regression guard: a goal whose unsafe-context gate marks ALL gaps + # unsafe (no defaultable_sections produced) must continue to use the + # generic ``interview_max_rounds_exhausted`` code, NOT the new partial-safe + # code ``interview_unsafe_gaps_remain``. + assert state.last_error_code == "interview_max_rounds_exhausted" + assert state.interview_closure_mode is None @pytest.mark.asyncio