Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 12 additions & 17 deletions tests/canonical/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ fixture files in the right shape. It does **not** invoke
copyable status line per scenario, for example:

```text
CANONICAL cli-todo: shape_valid domain=cli completion=product_complete probes=headless_run,stdout_golden budget=1800s live=deferred_l0b
CANONICAL cli-todo: shape_valid domain=cli completion=product_complete probes=headless_run,stdout_golden budget=1800s live=available_opt_in
```

### Full live run (manual, costs LLM tokens)
Expand All @@ -45,13 +45,9 @@ CANONICAL cli-todo: shape_valid domain=cli completion=product_complete probes=he
OUROBOROS_RUN_CANONICAL=1 uv run pytest tests/canonical/ -v
```

Once the live wiring lands in L0-b, this command will invoke the
`ouroboros_auto` MCP tool against each scenario and assert the
documented terminal state — **use sparingly**, each scenario will
consume real LLM tokens (cli-todo ≈ \$1, kart-racer ≈ \$5 with
Sonnet-class models). **At L0-a (this PR) the opt-in still
`pytest.skip`s with a typed reason** so the harness contract is
observable without burning tokens; the shape-check tests still run.
This command invokes the `ouroboros_auto` MCP tool against each scenario
and asserts the documented terminal state — **use sparingly**, each
scenario will consume real LLM tokens.

### Run a single scenario

Expand All @@ -63,8 +59,8 @@ with `-k`:
uv run pytest tests/canonical/ -v -k cli-todo
```

Add `OUROBOROS_RUN_CANONICAL=1` once L0-b lands to opt into the live
invocation for that scenario.
Add `OUROBOROS_RUN_CANONICAL=1` to opt into the live invocation for that
scenario.

## Scenario directory shape

Expand All @@ -84,7 +80,7 @@ domain_class: cli # one of the L1 TaskClass values
completion_mode: product_complete # CODE_COMPLETE | PRODUCT_COMPLETE

# optional
runtime_probe_kinds: # placeholder until L3 lands
runtime_probe_kinds:
- headless_run
- stdout_golden
wall_clock_budget_seconds: 600 # default: 7200
Expand All @@ -97,10 +93,9 @@ canonicalizing, add a new `<slug>/` directory + populate
`expected.yaml`. No infrastructure change required. The runner
auto-discovers.

## Adding the live-run path
## Live-run path

The hermetic shape-check is in place from L0-a. The live-run path
(`OUROBOROS_RUN_CANONICAL=1`) is wired but the actual
`ouroboros_auto` invocation lands in L0-b once the maintainer has
confirmed they want it; until then the live-run path skips with a
typed reason so the harness contract stays observable.
The hermetic shape-check is the default. The live-run path
(`OUROBOROS_RUN_CANONICAL=1`) invokes `ouroboros_auto` against each
scenario and treats MCP errors, failed terminals, and unverified
PRODUCT_COMPLETE handoffs as test failures.
10 changes: 3 additions & 7 deletions tests/canonical/cli-todo/expected.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
# acceptance gate verifies the artifact (`completion_mode`,
# `runtime_probe_kinds`).
#
# These values are validated for shape only at L0-a. Cross-validation
# that they round-trip through the L1 `TaskClassProfile` catalog
# (#1173) lands in a follow-up sub-PR; the round-trip test is not yet
# present in `tests/canonical/test_canonical.py`.
# These values are validated against the L1 `TaskClassProfile` catalog
# by `tests/canonical/test_canonical.py`.

domain_class: cli
completion_mode: product_complete
Expand All @@ -18,7 +16,5 @@ runtime_probe_kinds:
- headless_run
- stdout_golden

# 30-minute soft budget. The future L2 watchdog will cancel beyond this
# if it ever ships; until then the value is informational metadata that
# downstream harnesses can honor or ignore.
# 30-minute soft budget honored by the opt-in live canonical runner.
wall_clock_budget_seconds: 1800
9 changes: 3 additions & 6 deletions tests/canonical/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,8 @@ def env_dir(self) -> Path | None:
def format_canonical_summary_line(scenario: CanonicalScenario) -> str:
"""Return the copyable one-line status for a canonical scenario.

L0-a has no live ``ouroboros_auto`` invocation yet, so the summary
deliberately reports the current shape-check terminal instead of
pretending PRODUCT_COMPLETE has been exercised. The live terminal
can replace ``shape_valid`` in L0-b without changing the copyable
line contract.
The default run is still the no-cost shape check, but the live
``ouroboros_auto`` path is available behind ``OUROBOROS_RUN_CANONICAL=1``.
"""
probe_text = ",".join(scenario.runtime_probe_kinds) or "none"
return (
Expand All @@ -95,7 +92,7 @@ def format_canonical_summary_line(scenario: CanonicalScenario) -> str:
f"completion={scenario.completion_mode} "
f"probes={probe_text} "
f"budget={scenario.wall_clock_budget_seconds}s "
f"live=deferred_l0b"
f"live=available_opt_in"
)


Expand Down
56 changes: 51 additions & 5 deletions tests/canonical/test_canonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def test_scenario_domain_class_is_lowercase_snake(scenario: CanonicalScenario) -
``expected.yaml`` fails here rather than at runtime when the
inference hook is wired.

Cross-validation against the actual L1 ``TaskClass`` enum lands in
a follow-up PR after #1173 merges to main.
Cross-validation against the actual L1 ``TaskClass`` enum is pinned
below.
"""
value = scenario.domain_class
assert value == value.lower(), f"{scenario.slug}: domain_class {value!r} must be lowercase"
Expand All @@ -66,9 +66,8 @@ def test_scenario_runtime_probe_kinds_are_strings(
scenario: CanonicalScenario,
) -> None:
"""``runtime_probe_kinds`` is a tuple of plain strings. Cross-
validation against the L1 catalog's per-class probe whitelist
lands in a follow-up PR after #1173 merges; this test pins the
surface shape only."""
validation against the L1 catalog's per-class probe whitelist is
pinned below; this test pins the surface shape."""
kinds = scenario.runtime_probe_kinds
assert isinstance(kinds, tuple)
for kind in kinds:
Expand Down Expand Up @@ -268,3 +267,50 @@ async def test_scenario_live_run_or_skip(
f"CANONICAL {scenario.slug}: status={tool_result.meta['status']} "
f"phase={tool_result.meta.get('phase')} completion_mode={scenario.completion_mode}"
)


@pytest.mark.asyncio
async def test_live_run_opt_in_invokes_auto_handler(
    scenario: CanonicalScenario,
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
) -> None:
    """Check that the opt-in branch reaches the live runner, hermetically.

    ``_invoke_ouroboros_auto`` is replaced with a recording stub that hands
    back a canned successful result, then ``test_scenario_live_run_or_skip``
    is awaited directly with ``live_run_enabled=True``. The stub must have
    been called exactly once, with this scenario's slug and
    ``tmp_path / scenario.slug`` as the working directory.
    """

    # Every (slug, workdir) pair the stubbed runner is asked to handle.
    recorded: list[tuple[str, Path]] = []

    class _ToolResult:
        # Canned payload: no error flag, empty content, "complete" metadata.
        is_error = False
        content: list[object] = []
        meta = {
            "status": "complete",
            "phase": "done",
            "product_status": "verified_complete",
        }

    class _Ok:
        # Stand-in result object that always reports success.
        def is_ok(self) -> bool:
            return True

        def unwrap(self) -> object:
            return _ToolResult()

        def unwrap_err(self) -> str:
            return "unexpected"

    async def _recording_invoke(selected: CanonicalScenario, workdir: Path) -> _Ok:
        # Note the call instead of doing any real work.
        recorded.append((selected.slug, workdir))
        return _Ok()

    monkeypatch.setattr(
        "tests.canonical.test_canonical._invoke_ouroboros_auto",
        _recording_invoke,
    )

    await test_scenario_live_run_or_skip(
        scenario=scenario,
        live_run_enabled=True,
        tmp_path=tmp_path,
    )

    expected_call = (scenario.slug, tmp_path / scenario.slug)
    assert recorded == [expected_call]
4 changes: 2 additions & 2 deletions tests/canonical/test_conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_format_canonical_summary_line_is_copyable() -> None:
"completion=product_complete "
"probes=headless_run,stdout_golden "
"budget=1800s "
"live=deferred_l0b"
"live=available_opt_in"
)


Expand Down Expand Up @@ -124,6 +124,6 @@ def test_pytest_terminal_summary_emits_copyable_lines(
"completion=product_complete "
"probes=headless_run "
"budget=42s "
"live=deferred_l0b",
"live=available_opt_in",
),
]
Loading