From f2119bad6fb0a5564e74125d6869fc2992ff29f6 Mon Sep 17 00:00:00 2001 From: heomin86 <126128010+heomin86@users.noreply.github.com> Date: Tue, 26 May 2026 00:45:12 +1000 Subject: [PATCH] Close auto skip-run loops with actionable blockers Make the Codex/App Ouroboros auto loop distinguish recoverable authoring blockers from transport or system failures, and add diagnostics that explain stale App/runtime state without requiring tmux-only interactive OMX surfaces. Constraint: Codex App outside-tmux cannot rely on tmux-only interactive OMX surfaces, so long auto checks need native MCP or pollable job paths. Rejected: Treat seed ambiguity validation as a generic failed session | it requires more interview/user input and should stay resumable through the seed_generator path. Confidence: high Scope-risk: moderate Directive: Keep App transport diagnostics separate from MCP registration checks; handler reachability and active archive drift are different failure classes. Tested: git diff --check; uv run pytest tests/unit/auto/test_interview_pipeline.py tests/unit/cli/test_codex_command.py tests/unit/orchestrator/test_sandbox_class.py tests/unit/orchestrator/test_codex_cli_runtime.py tests/unit/mcp/tools/test_interview_done_streak.py -q (184 passed). Not-tested: Full implementation execution after A-grade Seed; validation intentionally used skip-run/pollable auto smoke. Co-authored-by: OmX --- src/ouroboros/auto/pipeline.py | 15 ++- src/ouroboros/codex_permissions.py | 27 ++++- src/ouroboros/mcp/tools/authoring_handlers.py | 38 +++++- .../orchestrator/codex_cli_runtime.py | 114 ++++++++++++------ src/ouroboros/providers/codex_cli_adapter.py | 1 + tests/unit/auto/test_interview_pipeline.py | 32 +++++ tests/unit/mcp/tools/test_definitions.py | 5 +- .../mcp/tools/test_interview_done_streak.py | 61 ++++++++++ .../orchestrator/test_codex_cli_runtime.py | 60 +++++++++ tests/unit/orchestrator/test_sandbox_class.py | 19 +++ 10 files changed, 328 insertions(+), 44 deletions(-) diff --git a/src/ouroboros/auto/pipeline.py b/src/ouroboros/auto/pipeline.py index 85c43cab8..f8a59f5da 100644 --- a/src/ouroboros/auto/pipeline.py +++ b/src/ouroboros/auto/pipeline.py @@ -686,10 +686,11 @@ async def run(self, state: AutoPipelineState) -> AutoPipelineResult: self._save(state) return self._result(state, ledger, blocker=str(exc) or state.last_error) except Exception as exc: - state.mark_failed( - f"seed generation failed: {exc}", - tool_name="seed_generator", - ) + message = f"seed generation failed: {exc}" + if _is_seed_generation_blocker(exc): + state.mark_blocked(message, tool_name="seed_generator") + else: + state.mark_failed(message, tool_name="seed_generator") record_authoring_backend(state) self._save(state) return self._result(state, ledger, blocker=state.last_error) @@ -3101,6 +3102,12 @@ def _accepts_keyword(func: Callable[..., Any], name: str) -> bool: return False +def _is_seed_generation_blocker(exc: Exception) -> bool: + """Classify recoverable authoring validation as blocked, not failed.""" + message = str(exc) + return "Ambiguity score" in message and "exceeds threshold" in message + + def _recoverable_phase_for_tool(tool_name: str | None) -> AutoPhase | None: if tool_name in { "interview.start", diff --git a/src/ouroboros/codex_permissions.py b/src/ouroboros/codex_permissions.py index 4e067c4a1..7db6f7eca 100644 --- a/src/ouroboros/codex_permissions.py +++ b/src/ouroboros/codex_permissions.py @@ -57,7 +57,14 @@ def resolve_codex_permission_mode( return candidate # type: ignore[return-value] -def build_codex_exec_args_for_sandbox(sandbox: SandboxClass) -> list[str]: +def build_codex_exec_args_for_sandbox( + sandbox: SandboxClass, + *, + source: str | None = None, + permission_mode: str | None = None, + default_mode: CodexPermissionMode | None = None, + resolved_mode: CodexPermissionMode | None = None, +) -> list[str]: """Translate a sandbox class into Codex CLI exec flags. This is the canonical entry point for new call sites. Engine code @@ -73,7 +80,14 @@ def build_codex_exec_args_for_sandbox(sandbox: SandboxClass) -> list[str]: msg = f"No Codex CLI mapping registered for sandbox class {sandbox!r}" raise KeyError(msg) if sandbox is SandboxClass.UNRESTRICTED: - log.warning("permissions.bypass_activated", sandbox=sandbox.value) + log.warning( + "permissions.bypass_activated", + sandbox=sandbox.value, + source=source, + permission_mode=permission_mode, + default_mode=default_mode, + resolved_mode=resolved_mode, + ) return list(args) @@ -81,6 +95,7 @@ def build_codex_exec_permission_args( permission_mode: str | None, *, default_mode: CodexPermissionMode = "default", + source: str | None = None, ) -> list[str]: """Translate a legacy permission-mode string into Codex CLI exec flags. @@ -95,7 +110,13 @@ def build_codex_exec_permission_args( no sandbox """ resolved = resolve_codex_permission_mode(permission_mode, default_mode=default_mode) - return build_codex_exec_args_for_sandbox(_PERMISSION_MODE_TO_SANDBOX[resolved]) + return build_codex_exec_args_for_sandbox( + _PERMISSION_MODE_TO_SANDBOX[resolved], + source=source, + permission_mode=permission_mode, + default_mode=default_mode, + resolved_mode=resolved, + ) __all__ = [ diff --git a/src/ouroboros/mcp/tools/authoring_handlers.py b/src/ouroboros/mcp/tools/authoring_handlers.py index e2694394c..a5df54fe0 100644 --- a/src/ouroboros/mcp/tools/authoring_handlers.py +++ b/src/ouroboros/mcp/tools/authoring_handlers.py @@ -1434,6 +1434,8 @@ async def _complete_interview_response( state: InterviewState, session_id: str, score: AmbiguityScore | None = None, + *, + seed_ready_override: bool | None = None, ) -> Result[MCPToolResult, MCPServerError]: """Complete the interview and return a Seed-ready MCP response.""" complete_result = await engine.complete_interview(state) @@ -1484,7 +1486,13 @@ async def _complete_interview_response( "completed": True, "ambiguity_score": score.overall_score if score is not None else None, "milestone": _milestone_for_score(score), - "seed_ready": score.is_ready_for_seed if score is not None else None, + "seed_ready": ( + seed_ready_override + if seed_ready_override is not None + else score.is_ready_for_seed + if score is not None + else None + ), "required_client_gates": REQUIRED_CLIENT_GATES, **_interview_reasoning_meta( state=state, @@ -2248,6 +2256,34 @@ async def handle( advance_streak=False, reset_on_failure=True, ) + # Safe-default synthesis is emitted only after the + # auto driver has filled every remaining required + # ledger gap with audited conservative defaults. Do + # not require the semantic ambiguity scorer to also + # cross the normal human "done" threshold; that score + # can lag behind the ledger and would leave a trailing + # unanswered question in the persisted transcript. + if is_safe_default_synthesis: + if has_pending_round: + state.rounds.pop() + from ouroboros.bigbang.interview import InterviewRound + + state.rounds.append( + InterviewRound( + round_number=len(state.rounds) + 1, + question=last_question or "[driver safe-default finalization]", + user_response=answer, + ) + ) + state.clear_stored_ambiguity() + state.mark_updated() + return await self._complete_interview_response( + engine, + state, + session_id, + None, + seed_ready_override=True, + ) if exit_score is not None and qualifies_for_seed_completion( exit_score, is_brownfield=state.is_brownfield, diff --git a/src/ouroboros/orchestrator/codex_cli_runtime.py b/src/ouroboros/orchestrator/codex_cli_runtime.py index ffc3edd6e..4348754d8 100644 --- a/src/ouroboros/orchestrator/codex_cli_runtime.py +++ b/src/ouroboros/orchestrator/codex_cli_runtime.py @@ -167,6 +167,7 @@ def _build_permission_args(self) -> list[str]: return build_codex_exec_permission_args( self._permission_mode, default_mode="acceptEdits", + source=f"{self._log_namespace}.agent_runtime", ) def _get_configured_cli_path(self) -> str | None: @@ -539,43 +540,87 @@ def _log_invalid_skill_intercept(self, dispatch_result: InvalidSkill) -> None: error=self._invalid_skill_log_error(dispatch_result), ) + @staticmethod + def _auto_dispatch_error_category( + error_type: str | None, + error_text: str, + ) -> str | None: + """Classify terminal auto-dispatch failures for operator diagnostics.""" + normalized_error = error_text.lower() + transport_markers = ( + "transport closed", + "connection closed", + "stdio closed", + "broken pipe", + ) + unavailable_markers = ( + "unavailable", + "not found", + "not registered", + "unknown tool", + "no such tool", + ) + + if any(marker in normalized_error for marker in transport_markers): + return "mcp_transport_closed" + if error_type == "MCPResourceNotFoundError": + return "mcp_registration_missing" + if error_type == "LookupError" and "no local handler registered" in normalized_error: + return "local_handler_missing" + if error_type in {"MCPClientError", "MCPToolError"} and any( + marker in normalized_error for marker in unavailable_markers + ): + return "mcp_registration_missing" + return None + @staticmethod def _is_auto_recoverable_dispatch_unavailable(recoverable_error: AgentMessage) -> bool: """Return whether a recoverable auto dispatch error means the tool is unavailable.""" - error_type = recoverable_error.data.get("error_type") - error_text = recoverable_error.content.lower() - if error_type == "MCPResourceNotFoundError": - return True - if error_type == "LookupError": - return "no local handler registered" in error_text - - if error_type == "MCPClientError": - return any( - marker in error_text - for marker in ( - "unavailable", - "not found", - "not registered", - "unknown tool", - "no such tool", - ) - ) - if error_type != "MCPToolError": + error_type = str(recoverable_error.data.get("error_type") or "") + error_text = recoverable_error.content + if error_text.lower().startswith("auto pipeline failed:"): return False + return CodexCliRuntime._auto_dispatch_error_category(error_type, error_text) is not None - if error_text.startswith("auto pipeline failed:"): - return False - return any( - marker in error_text - for marker in ( - "unavailable", - "not found", - "not registered", - "unknown tool", - "no such tool", + def _build_auto_dispatch_unavailable_content( + self, + tool_name: str, + category: str | None, + ) -> str: + """Return the operator-facing auto dispatch failure message.""" + if category == "mcp_transport_closed": + return ( + "Cannot run ooo auto: MCP transport closed before " + f"`{tool_name}` completed. Run `ouroboros mcp doctor` to verify " + "the server, then reconnect or restart the Codex App MCP session; " + "this is a transport/session failure, not proof that the tool is " + "unregistered." ) + return ( + "Cannot run ooo auto: required MCP tool " + f"`{tool_name}` is unavailable. " + "Run `ouroboros mcp doctor` / setup to register the MCP server." + ) + + @staticmethod + def _is_mcp_transport_closed_error(message: AgentMessage) -> bool: + """Return True for recoverable-looking MCP client transport closures.""" + error_type = message.data.get("error_type") + if error_type != "MCPClientError": + return False + return ( + CodexCliRuntime._auto_dispatch_error_category(error_type, message.content) + == "mcp_transport_closed" ) + @staticmethod + def _is_recoverable_mcp_error_type(message: AgentMessage) -> bool: + """Return True for MCP transport errors that should not be passed through.""" + error_type = message.data.get("error_type") + if error_type in {"MCPConnectionError", "MCPTimeoutError"}: + return True + return CodexCliRuntime._is_mcp_transport_closed_error(message) + def _build_auto_dispatch_unavailable_message( self, intercept: Resolved, @@ -596,14 +641,13 @@ def _build_auto_dispatch_unavailable_message( data["dispatch_error_type"] = dispatch_error_type if dispatch_error: data["dispatch_error"] = dispatch_error + category = self._auto_dispatch_error_category(dispatch_error_type, dispatch_error or "") + if category: + data["dispatch_error_category"] = category return AgentMessage( type="result", - content=( - "Cannot run ooo auto: required MCP tool " - f"`{intercept.mcp_tool}` is unavailable. " - "Run `ouroboros mcp doctor` / setup to register the MCP server." - ), + content=self._build_auto_dispatch_unavailable_content(intercept.mcp_tool, category), data=data, resume_handle=current_handle, ) @@ -719,7 +763,7 @@ def _extract_recoverable_dispatch_error( if metadata.get("is_retriable") is True or metadata.get("retriable") is True: return final_message - if final_message.data.get("error_type") in {"MCPConnectionError", "MCPTimeoutError"}: + if self._is_recoverable_mcp_error_type(final_message): return final_message return None diff --git a/src/ouroboros/providers/codex_cli_adapter.py b/src/ouroboros/providers/codex_cli_adapter.py index cfb75ae41..c86ffed5f 100644 --- a/src/ouroboros/providers/codex_cli_adapter.py +++ b/src/ouroboros/providers/codex_cli_adapter.py @@ -157,6 +157,7 @@ def _build_permission_args(self) -> list[str]: return build_codex_exec_permission_args( self._permission_mode, default_mode="default", + source=f"{self._log_namespace}.llm_adapter", ) def _get_configured_cli_path(self) -> str | None: diff --git a/tests/unit/auto/test_interview_pipeline.py b/tests/unit/auto/test_interview_pipeline.py index 47641b227..e8d3c09ee 100644 --- a/tests/unit/auto/test_interview_pipeline.py +++ b/tests/unit/auto/test_interview_pipeline.py @@ -1383,6 +1383,38 @@ async def generate_seed(session_id: str) -> Seed: # noqa: ARG001 assert "Outputs are stdout-only by inference" not in result.assumptions +@pytest.mark.asyncio +async def test_pipeline_blocks_on_seed_ambiguity_validation(tmp_path) -> None: + async def start(goal: str, cwd: str) -> InterviewTurn: # noqa: ARG001 + return InterviewTurn("done", "interview_1", seed_ready=True, completed=True) + + async def answer(session_id: str, text: str) -> InterviewTurn: # noqa: ARG001 + raise AssertionError("completed interview should not need another answer") + + async def generate_seed(session_id: str) -> Seed: # noqa: ARG001 + raise RuntimeError( + "ouroboros_generate_seed failed: Validation error: Ambiguity score 0.26 " + "exceeds threshold 0.2. Cannot generate Seed." + ) + + state = AutoPipelineState(goal="Build a CLI", cwd=str(tmp_path)) + ledger = SeedDraftLedger.from_goal(state.goal) + _fill_ready(ledger) + state.ledger = ledger.to_dict() + driver = AutoInterviewDriver( + FunctionInterviewBackend(start, answer), store=AutoStore(tmp_path), max_rounds=1 + ) + pipeline = AutoPipeline(driver, generate_seed, store=AutoStore(tmp_path), skip_run=True) + + result = await pipeline.run(state) + + assert result.status == "blocked" + assert state.phase == AutoPhase.BLOCKED + assert state.interview_completed is True + assert "Ambiguity score 0.26 exceeds threshold 0.2" in (result.blocker or "") + assert state.last_tool_name == "seed_generator" + + @pytest.mark.asyncio async def test_pipeline_uses_explicit_goal_facts_before_completed_interview(tmp_path) -> None: async def start(goal: str, cwd: str) -> InterviewTurn: # noqa: ARG001 diff --git a/tests/unit/mcp/tools/test_definitions.py b/tests/unit/mcp/tools/test_definitions.py index 9daf552ff..13621ba43 100644 --- a/tests/unit/mcp/tools/test_definitions.py +++ b/tests/unit/mcp/tools/test_definitions.py @@ -2376,7 +2376,10 @@ async def complete_interview( assert result.value.meta["completed"] is True assert result.value.meta["seed_ready"] is True assert state.status is InterviewStatus.COMPLETED - assert state.rounds == [] + assert len(state.rounds) == 1 + assert state.rounds[0].question == "[driver safe-default finalization]" + assert state.rounds[0].user_response is not None + assert "[safe-default-synthesis]" in state.rounds[0].user_response mock_engine.complete_interview.assert_awaited_once() diff --git a/tests/unit/mcp/tools/test_interview_done_streak.py b/tests/unit/mcp/tools/test_interview_done_streak.py index 3c3893639..bc2644f5f 100644 --- a/tests/unit/mcp/tools/test_interview_done_streak.py +++ b/tests/unit/mcp/tools/test_interview_done_streak.py @@ -182,6 +182,67 @@ async def complete_state( mock_engine.complete_interview.assert_called_once() mock_engine.ask_next_question.assert_not_called() + async def test_safe_default_synthesis_completes_without_score_threshold(self) -> None: + """Auto safe-default synthesis closes the transcript without the done streak.""" + handler = InterviewHandler() + handler._emit_event = AsyncMock() + state = InterviewState( + interview_id="sess-123", + completion_candidate_streak=0, + rounds=[ + InterviewRound( + round_number=1, + question="What boundary should invalidate the run?", + user_response=None, + ) + ], + ) + nonqualifying_score = create_mock_live_ambiguity_score(0.35, seed_ready=False) + answer = ( + "[from-auto][safe-default-synthesis] Mark the interview complete and " + "hand off for seed generation. Auto ledger-complete synthesis." + ) + + async def complete_state( + current_state: InterviewState, + ) -> Result[InterviewState, Exception]: + current_state.status = InterviewStatus.COMPLETED + return Result.ok(current_state) + + mock_engine = MagicMock() + mock_engine.load_state = AsyncMock(return_value=Result.ok(state)) + mock_engine.complete_interview = AsyncMock(side_effect=complete_state) + mock_engine.save_state = AsyncMock(return_value=MagicMock(is_ok=True, is_err=False)) + mock_engine.ask_next_question = AsyncMock() + + with ( + patch( + "ouroboros.mcp.tools.authoring_handlers.InterviewEngine", + return_value=mock_engine, + ), + patch.object( + handler, + "_score_interview_state", + AsyncMock(return_value=nonqualifying_score), + ), + ): + result = await handler.handle( + { + "session_id": "sess-123", + "answer": answer, + "last_question": "[driver safe-default finalization: max_rounds=2]", + } + ) + + assert result.is_ok + assert state.status == InterviewStatus.COMPLETED + assert state.completion_candidate_streak == 0 + assert state.rounds[-1].question == "[driver safe-default finalization: max_rounds=2]" + assert state.rounds[-1].user_response == answer + assert result.value.meta["completed"] is True + mock_engine.complete_interview.assert_called_once() + mock_engine.ask_next_question.assert_not_called() + async def test_explicit_done_no_infinite_loop(self) -> None: """Two sequential 'done' commands must complete the interview (no infinite loop). diff --git a/tests/unit/orchestrator/test_codex_cli_runtime.py b/tests/unit/orchestrator/test_codex_cli_runtime.py index f87bd621c..b487837d8 100644 --- a/tests/unit/orchestrator/test_codex_cli_runtime.py +++ b/tests/unit/orchestrator/test_codex_cli_runtime.py @@ -1792,6 +1792,7 @@ async def test_execute_task_auto_dispatch_failure_does_not_fall_back( "command_prefix": "ooo auto", "dispatch_error_type": "LookupError", "dispatch_error": "No local handler registered", + "dispatch_error_category": "local_handler_missing", } @pytest.mark.asyncio @@ -1908,6 +1909,65 @@ async def test_execute_task_auto_resource_not_found_dispatch_error_does_not_fall assert messages[0].data["error_type"] == "SkillDispatchUnavailable" assert messages[0].data["dispatch_error_type"] == "MCPResourceNotFoundError" assert messages[0].data["dispatch_error"] == "Tool ouroboros_auto not found" + assert messages[0].data["dispatch_error_category"] == "mcp_registration_missing" + + @pytest.mark.asyncio + async def test_execute_task_auto_transport_closed_reports_transport_not_setup( + self, + tmp_path: Path, + ) -> None: + """Codex App MCP transport closures should not be misreported as setup drift.""" + self._write_skill( + tmp_path, + "auto", + [ + "name: auto", + 'description: "Automatically converge from goal to A-grade Seed and execute it"', + "mcp_tool: ouroboros_auto", + "mcp_args:", + ' goal: "$goal"', + ' cwd: "$CWD"', + ], + ) + dispatcher = AsyncMock( + return_value=( + AgentMessage(type="assistant", content="Calling tool: ouroboros_auto"), + AgentMessage( + type="result", + content="MCPClientError: Transport closed", + data={ + "subtype": "error", + "error_type": "MCPClientError", + }, + ), + ) + ) + runtime = CodexCliRuntime( + cli_path="codex", + cwd="/tmp/project", + skills_dir=tmp_path, + skill_dispatcher=dispatcher, + ) + + with ( + patch("ouroboros.orchestrator.codex_cli_runtime.log.warning") as mock_warning, + patch( + "ouroboros.orchestrator.codex_cli_runtime.asyncio.create_subprocess_exec", + ) as mock_exec, + ): + messages = [message async for message in runtime.execute_task("ooo auto Build a CLI")] + + dispatcher.assert_awaited_once() + assert mock_warning.call_args.kwargs["fallback"] == "terminal_error" + mock_exec.assert_not_called() + assert len(messages) == 1 + assert messages[0].data["error_type"] == "SkillDispatchUnavailable" + assert messages[0].data["dispatch_error_type"] == "MCPClientError" + assert messages[0].data["dispatch_error"] == "MCPClientError: Transport closed" + assert messages[0].data["dispatch_error_category"] == "mcp_transport_closed" + assert "MCP transport closed" in messages[0].content + assert "not proof that the tool is unregistered" in messages[0].content + assert "setup to register" not in messages[0].content @pytest.mark.asyncio async def test_execute_task_auto_recoverable_pipeline_error_preserves_real_cause( diff --git a/tests/unit/orchestrator/test_sandbox_class.py b/tests/unit/orchestrator/test_sandbox_class.py index c1ec8179d..af6ac399d 100644 --- a/tests/unit/orchestrator/test_sandbox_class.py +++ b/tests/unit/orchestrator/test_sandbox_class.py @@ -2,6 +2,8 @@ from __future__ import annotations +from unittest.mock import patch + import pytest from ouroboros.claude_permissions import ( @@ -118,6 +120,23 @@ def test_legacy_wrapper_routes_through_sandbox_enum( enum_args = build_codex_exec_args_for_sandbox(expected_sandbox) assert legacy_args == enum_args + def test_bypass_warning_includes_permission_provenance(self) -> None: + with patch("ouroboros.codex_permissions.log.warning") as mock_warning: + args = build_codex_exec_permission_args( + "bypassPermissions", + source="codex_cli_runtime.agent_runtime", + ) + + assert args == ["--dangerously-bypass-approvals-and-sandbox"] + mock_warning.assert_called_once_with( + "permissions.bypass_activated", + sandbox="unrestricted", + source="codex_cli_runtime.agent_runtime", + permission_mode="bypassPermissions", + default_mode="default", + resolved_mode="bypassPermissions", + ) + class TestClaudeLegacyVocabularyMatchesSandbox: """Claude SDK historically consumed the same ``default``/``acceptEdits``/