diff --git a/tests/canonical/README.md b/tests/canonical/README.md index 902fb06f8..19ab32be8 100644 --- a/tests/canonical/README.md +++ b/tests/canonical/README.md @@ -36,7 +36,7 @@ fixture files in the right shape. It does **not** invoke copyable status line per scenario, for example: ```text -CANONICAL cli-todo: shape_valid domain=cli completion=product_complete probes=headless_run,stdout_golden budget=1800s live=deferred_l0b +CANONICAL cli-todo: shape_valid domain=cli completion=product_complete probes=headless_run,stdout_golden budget=1800s live=available_opt_in ``` ### Full live run (manual, costs LLM tokens) @@ -45,13 +45,9 @@ CANONICAL cli-todo: shape_valid domain=cli completion=product_complete probes=he OUROBOROS_RUN_CANONICAL=1 uv run pytest tests/canonical/ -v ``` -Once the live wiring lands in L0-b, this command will invoke the -`ouroboros_auto` MCP tool against each scenario and assert the -documented terminal state — **use sparingly**, each scenario will -consume real LLM tokens (cli-todo ≈ \$1, kart-racer ≈ \$5 with -Sonnet-class models). **At L0-a (this PR) the opt-in still -`pytest.skip`s with a typed reason** so the harness contract is -observable without burning tokens; the shape-check tests still run. +This command invokes the `ouroboros_auto` MCP tool against each scenario +and asserts the documented terminal state — **use sparingly**, each +scenario will consume real LLM tokens. ### Run a single scenario @@ -63,8 +59,8 @@ with `-k`: uv run pytest tests/canonical/ -v -k cli-todo ``` -Add `OUROBOROS_RUN_CANONICAL=1` once L0-b lands to opt into the live -invocation for that scenario. +Add `OUROBOROS_RUN_CANONICAL=1` to opt into the live invocation for that +scenario. 
## Scenario directory shape @@ -84,7 +80,7 @@ domain_class: cli # one of the L1 TaskClass values completion_mode: product_complete # CODE_COMPLETE | PRODUCT_COMPLETE # optional -runtime_probe_kinds: # placeholder until L3 lands +runtime_probe_kinds: - headless_run - stdout_golden wall_clock_budget_seconds: 600 # default: 7200 @@ -97,10 +93,9 @@ canonicalizing, add a new `<slug>/` directory + populate `expected.yaml`. No infrastructure change required. The runner auto-discovers. -## Adding the live-run path +## Live-run path -The hermetic shape-check is in place from L0-a. The live-run path -(`OUROBOROS_RUN_CANONICAL=1`) is wired but the actual -`ouroboros_auto` invocation lands in L0-b once the maintainer has -confirmed they want it; until then the live-run path skips with a -typed reason so the harness contract stays observable. +The hermetic shape-check is the default. The live-run path +(`OUROBOROS_RUN_CANONICAL=1`) invokes `ouroboros_auto` against each +scenario and treats MCP errors, failed terminals, and unverified +PRODUCT_COMPLETE handoffs as test failures. diff --git a/tests/canonical/cli-todo/expected.yaml b/tests/canonical/cli-todo/expected.yaml index dc474670c..435f058a3 100644 --- a/tests/canonical/cli-todo/expected.yaml +++ b/tests/canonical/cli-todo/expected.yaml @@ -6,10 +6,8 @@ # acceptance gate verifies the artifact (`completion_mode`, # `runtime_probe_kinds`). # -# These values are validated for shape only at L0-a. Cross-validation -# that they round-trip through the L1 `TaskClassProfile` catalog -# (#1173) lands in a follow-up sub-PR; the round-trip test is not yet -# present in `tests/canonical/test_canonical.py`. +# These values are validated against the L1 `TaskClassProfile` catalog +# by `tests/canonical/test_canonical.py`. domain_class: cli completion_mode: product_complete @@ -18,7 +16,5 @@ runtime_probe_kinds: - headless_run - stdout_golden -# 30-minute soft budget.
The future L2 watchdog will cancel beyond this -# if it ever ships; until then the value is informational metadata that -# downstream harnesses can honor or ignore. +# 30-minute soft budget honored by the opt-in live canonical runner. wall_clock_budget_seconds: 1800 diff --git a/tests/canonical/conftest.py b/tests/canonical/conftest.py index 5e53cffdf..f9c612dc1 100644 --- a/tests/canonical/conftest.py +++ b/tests/canonical/conftest.py @@ -82,11 +82,8 @@ def env_dir(self) -> Path | None: def format_canonical_summary_line(scenario: CanonicalScenario) -> str: """Return the copyable one-line status for a canonical scenario. - L0-a has no live ``ouroboros_auto`` invocation yet, so the summary - deliberately reports the current shape-check terminal instead of - pretending PRODUCT_COMPLETE has been exercised. The live terminal - can replace ``shape_valid`` in L0-b without changing the copyable - line contract. + The default run is still the no-cost shape check, but the live + ``ouroboros_auto`` path is available behind ``OUROBOROS_RUN_CANONICAL=1``. """ probe_text = ",".join(scenario.runtime_probe_kinds) or "none" return ( @@ -95,7 +92,7 @@ def format_canonical_summary_line(scenario: CanonicalScenario) -> str: f"completion={scenario.completion_mode} " f"probes={probe_text} " f"budget={scenario.wall_clock_budget_seconds}s " - f"live=deferred_l0b" + f"live=available_opt_in" ) diff --git a/tests/canonical/test_canonical.py b/tests/canonical/test_canonical.py index 4b4006409..d23766fc2 100644 --- a/tests/canonical/test_canonical.py +++ b/tests/canonical/test_canonical.py @@ -41,8 +41,8 @@ def test_scenario_domain_class_is_lowercase_snake(scenario: CanonicalScenario) - ``expected.yaml`` fails here rather than at runtime when the inference hook is wired. - Cross-validation against the actual L1 ``TaskClass`` enum lands in - a follow-up PR after #1173 merges to main. + Cross-validation against the actual L1 ``TaskClass`` enum is pinned + below. 
""" value = scenario.domain_class assert value == value.lower(), f"{scenario.slug}: domain_class {value!r} must be lowercase" @@ -66,9 +66,8 @@ def test_scenario_runtime_probe_kinds_are_strings( scenario: CanonicalScenario, ) -> None: """``runtime_probe_kinds`` is a tuple of plain strings. Cross- - validation against the L1 catalog's per-class probe whitelist - lands in a follow-up PR after #1173 merges; this test pins the - surface shape only.""" + validation against the L1 catalog's per-class probe whitelist is + pinned below; this test pins the surface shape.""" kinds = scenario.runtime_probe_kinds assert isinstance(kinds, tuple) for kind in kinds: @@ -268,3 +267,50 @@ async def test_scenario_live_run_or_skip( f"CANONICAL {scenario.slug}: status={tool_result.meta['status']} " f"phase={tool_result.meta.get('phase')} completion_mode={scenario.completion_mode}" ) + + +@pytest.mark.asyncio +async def test_live_run_opt_in_invokes_auto_handler( + scenario: CanonicalScenario, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + """Hermetically pin that the opt-in path calls the live runner.""" + + calls: list[tuple[str, Path]] = [] + + class _Ok: + def is_ok(self) -> bool: + return True + + def unwrap(self) -> object: + return _ToolResult() + + def unwrap_err(self) -> str: + return "unexpected" + + class _ToolResult: + is_error = False + content: list[object] = [] + meta = { + "status": "complete", + "phase": "done", + "product_status": "verified_complete", + } + + async def fake_invoke(selected: CanonicalScenario, workdir: Path) -> _Ok: + calls.append((selected.slug, workdir)) + return _Ok() + + monkeypatch.setattr( + "tests.canonical.test_canonical._invoke_ouroboros_auto", + fake_invoke, + ) + + await test_scenario_live_run_or_skip( + scenario=scenario, + live_run_enabled=True, + tmp_path=tmp_path, + ) + + assert calls == [(scenario.slug, tmp_path / scenario.slug)] diff --git a/tests/canonical/test_conftest.py b/tests/canonical/test_conftest.py index 
f15950e96..711c80d9a 100644 --- a/tests/canonical/test_conftest.py +++ b/tests/canonical/test_conftest.py @@ -89,7 +89,7 @@ def test_format_canonical_summary_line_is_copyable() -> None: "completion=product_complete " "probes=headless_run,stdout_golden " "budget=1800s " - "live=deferred_l0b" + "live=available_opt_in" ) @@ -124,6 +124,6 @@ def test_pytest_terminal_summary_emits_copyable_lines( "completion=product_complete " "probes=headless_run " "budget=42s " - "live=deferred_l0b", + "live=available_opt_in", ), ]