diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4ab9cf06..bb728f07d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -317,6 +317,15 @@ jobs: git fetch --quiet origin "$PUSH_BEFORE_SHA" || true bash scripts/check-legal-name-leaks.sh --diff "$PUSH_BEFORE_SHA..HEAD" fi + # Cross-runtime gate-manifest wiring check (reform §6 P0 — one gate-set + # SSOT). Verifies the codex hook adapter, antigrav/vibe/gemini capability + # markers, and CI jobs/run-markers still match hooks/gate-manifest.yaml. + # The live ~/.claude/settings.json check has no file on CI runners, so it + # is skipped here and enforced by the pre-commit hook on the operator + # machine where the live settings exist. + - name: Gate manifest wiring check + if: needs.post_merge_duplicate_filter.outputs.duplicate_merge_group != 'true' && needs.docs_only_filter.outputs.docs_only != 'true' + run: uv run python scripts/gate-manifest-check.py --skip-claude-settings typecheck: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5afc84e27..f35f94730 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -55,3 +55,12 @@ repos: language: system files: ^(config/pipewire/[^/]+\.conf|scripts/check-audio-conf-names\.py)$ pass_filenames: false + - id: gate-manifest-check + name: Cross-runtime gate-manifest wiring check (incl. live claude settings) + # --require-claude-settings checks the live ~/.claude/settings.json (the + # most drift-prone wiring) against hooks/gate-manifest.yaml. Runs on the + # operator machine where the live settings exist; CI checks the rest. 
+ entry: uv run python scripts/gate-manifest-check.py --require-claude-settings + language: system + files: ^(hooks/gate-manifest\.yaml|hooks/scripts/(codex|antigrav)-hook-adapter\.sh|scripts/hapax-(codex|antigrav|vibe|gemini)|config/codex/config\.toml|\.github/workflows/ci\.yml)$ + pass_filenames: false diff --git a/hooks/gate-manifest.yaml b/hooks/gate-manifest.yaml index ff1a57b85..25cabdf17 100644 --- a/hooks/gate-manifest.yaml +++ b/hooks/gate-manifest.yaml @@ -264,3 +264,4 @@ runtimes: - 'uv run --no-project --with pyrefly==0.62.0 pyrefly check' - 'uv run --no-project --with pytest==9.0.2 --with pyyaml pytest' - 'uv run --no-sync python scripts/ci_governance_coverage_gate.py' + - 'uv run python scripts/gate-manifest-check.py --skip-claude-settings' diff --git a/scripts/cc-pr-autoqueue.py b/scripts/cc-pr-autoqueue.py index 806fa26c0..d30ad9e4c 100755 --- a/scripts/cc-pr-autoqueue.py +++ b/scripts/cc-pr-autoqueue.py @@ -45,10 +45,13 @@ FleetThrottlePolicy, ThrottleDecision, active_quarantined_pr_numbers, + bisection_plan_for_failed_runs, decide_fleet_throttle, read_jsonl_records, read_quarantine, recommend_max_entries_to_build, + reconcile_flake_quarantines, + write_quarantine, ) from shared.release_gate import evaluate_avsdlc_release_gate # noqa: E402 from shared.sdlc_lifecycle import ( # noqa: E402 @@ -88,7 +91,12 @@ CI_REPAIR_KINDS = {"cicd-speedup", "ci-repair", "ci-speedup", "merge-queue-repair"} CI_REPAIR_TAGS = {"cicd", "ci", "autoqueue"} INDEPENDENT_QUEUE_ADMISSION = {"independent", "independent_route"} -DEFAULT_STORM_OPEN_PR_THRESHOLD = 8 +# Open-PR COUNT is advisory-only — it raises a "busy" signal but NEVER freezes +# admission (FM-3). The only freeze is failure-RATE based (decide_fleet_throttle). +# The old ``*_STORM_OPEN_PR_THRESHOLD`` naming implied a count freeze that no +# longer exists; the advisory name is canonical, the storm alias is deprecated. 
+DEFAULT_ADVISORY_OPEN_PR_COUNT = 8 +DEFAULT_STORM_OPEN_PR_THRESHOLD = DEFAULT_ADVISORY_OPEN_PR_COUNT # deprecated alias DEFAULT_STORM_FAILED_MERGE_GROUP_THRESHOLD = 1 DEFAULT_STORM_RECENT_RUN_LIMIT = 20 STORM_MAX_ENTRIES_TO_BUILD = 1 @@ -198,6 +206,7 @@ class StormMode: failure_rate: float failure_rate_samples: int rate_frozen: bool + recommended_bisections: tuple[dict[str, Any], ...] = () def as_dict(self, *, repo: str) -> dict[str, Any]: return { @@ -210,6 +219,7 @@ def as_dict(self, *, repo: str) -> dict[str, Any]: "blocked_queued_prs": list(self.blocked_queued_prs), "failed_recent_merge_group_run_count": len(self.failed_recent_merge_group_runs), "failed_recent_merge_group_runs": list(self.failed_recent_merge_group_runs), + "recommended_bisections": list(self.recommended_bisections), "failure_rate": self.failure_rate, "failure_rate_samples": self.failure_rate_samples, "rate_frozen": self.rate_frozen, @@ -1002,6 +1012,9 @@ def _build_storm_mode( failure_rate=throttle_decision.failure_rate, failure_rate_samples=throttle_decision.samples, rate_frozen=throttle_decision.frozen, + recommended_bisections=tuple( + bisection_plan_for_failed_runs(failed_recent_merge_group_runs) + ), ) @@ -1016,8 +1029,9 @@ def run_reconciler( required_checks: tuple[str, ...] 
= DEFAULT_REQUIRED_CHECKS, limit: int = 100, lineage_ledger_path: Path | None = DEFAULT_LEDGER_PATH, + quarantine_path: Path = DEFAULT_QUARANTINE_PATH, storm_mode_enabled: bool = True, - storm_open_pr_threshold: int = DEFAULT_STORM_OPEN_PR_THRESHOLD, + advisory_open_pr_count: int = DEFAULT_ADVISORY_OPEN_PR_COUNT, storm_failed_merge_group_threshold: int = DEFAULT_STORM_FAILED_MERGE_GROUP_THRESHOLD, storm_recent_run_limit: int = DEFAULT_STORM_RECENT_RUN_LIMIT, auto_arm_ledger_path: Path | None = None, @@ -1051,10 +1065,24 @@ def run_reconciler( ] now = datetime.now(UTC) lineage_records = read_jsonl_records(lineage_ledger_path) if lineage_ledger_path else [] - quarantined_prs = active_quarantined_pr_numbers( - read_quarantine(DEFAULT_QUARANTINE_PATH), now=now + throttle_policy = FleetThrottlePolicy(advisory_open_pr_count=advisory_open_pr_count) + # Quarantine WRITE side (FM-3/FM-4 reversible quarantine): open quarantines for + # PRs over the failure threshold, lift expired ones, and persist (apply mode + # only). PRs already quarantined ON ENTRY are excluded from THIS tick's + # failure-rate signal; PRs newly quarantined this tick are persisted now and + # take effect next tick — isolating a flaky PR converges without a one-tick + # regression in fleet protection. 
+ existing_quarantine = read_quarantine(quarantine_path) + quarantined_prs = active_quarantined_pr_numbers(existing_quarantine, now=now) + quarantine_reconciliation = reconcile_flake_quarantines( + existing_quarantine, + lineage_records, + candidate_prs={pr.number for pr in prs}, + policy=throttle_policy, + now=now, ) - throttle_policy = FleetThrottlePolicy(advisory_open_pr_count=storm_open_pr_threshold) + if apply and (quarantine_reconciliation.newly_quarantined or quarantine_reconciliation.lifted): + write_quarantine(quarantine_path, quarantine_reconciliation.records) throttle_decision = decide_fleet_throttle( lineage_records, open_pr_count=len(prs), @@ -1190,6 +1218,18 @@ def run_reconciler( "active_ci_repair_task_ids": list(active_ci_repair_task_ids), "storm_mode_enabled": storm_mode_enabled, "storm_mode": storm_mode.as_dict(repo=repo), + "flake_quarantine": { + "path": str(quarantine_path), + "active": quarantine_reconciliation.active, + "newly_quarantined": quarantine_reconciliation.newly_quarantined, + "lifted": quarantine_reconciliation.lifted, + "written": bool( + apply + and ( + quarantine_reconciliation.newly_quarantined or quarantine_reconciliation.lifted + ) + ), + }, "lineage_ledger_path": str(lineage_ledger_path) if lineage_ledger_path else None, "open_pr_count": len(prs), "queued_prs": sorted(queued_prs), @@ -1258,10 +1298,16 @@ def main(argv: list[str] | None = None) -> int: help="Report storm/admission pressure but do not add storm admission holds.", ) parser.add_argument( - "--storm-open-pr-threshold", + "--advisory-open-pr-count", + "--storm-open-pr-threshold", # deprecated alias type=int, - default=DEFAULT_STORM_OPEN_PR_THRESHOLD, - help="Open PR count at or above which storm admission pressure is active.", + dest="advisory_open_pr_count", + default=DEFAULT_ADVISORY_OPEN_PR_COUNT, + help=( + "Open PR count at or above which the queue reports an advisory 'busy' " + "signal. 
Advisory only — it never freezes admission (the only freeze is " + "failure-rate based). --storm-open-pr-threshold is a deprecated alias." + ), ) parser.add_argument( "--storm-failed-merge-group-threshold", @@ -1302,7 +1348,7 @@ def main(argv: list[str] | None = None) -> int: limit=args.limit, lineage_ledger_path=args.lineage_ledger_path, storm_mode_enabled=not args.disable_storm_mode, - storm_open_pr_threshold=args.storm_open_pr_threshold, + advisory_open_pr_count=args.advisory_open_pr_count, storm_failed_merge_group_threshold=args.storm_failed_merge_group_threshold, storm_recent_run_limit=args.storm_recent_run_limit, ) diff --git a/scripts/executor_contract.py b/scripts/executor_contract.py new file mode 100644 index 000000000..39a71e511 --- /dev/null +++ b/scripts/executor_contract.py @@ -0,0 +1,139 @@ +"""Executor adapter contract — the one capability surface every runtime conforms +to (reform §6 P1). + +Each launcher (Claude, Codex, Gemini, Vibe, Antigrav) speaks a common adapter +CLI; their genuine differences (Gemini read-only, Antigrav's IDE-surface hook +gap, which runtimes have a real headless path) are reported as machine-legible +*capability flags* by :func:`capabilities`, NOT branched in the dispatcher. The +dispatcher consumes :func:`supports_route` to decide launchability instead of a +hard ``(platform, mode)`` if-ladder, and ``hapax-executor-capabilities`` / +``hapax-methodology-dispatch --capabilities`` emit the registry as JSON so the +CLOG cockpit and other clients read the same contract. + +Colocated with the dispatcher and the ``hapax-executor-capabilities`` probe under +``scripts/`` so all three share one definition. +""" + +from __future__ import annotations + +from pydantic import BaseModel + +# The canonical adapter CLI every launcher accepts (quirks live in the flags +# below, not in extra options). Order is informational. +ADAPTER_CLI_CONTRACT: tuple[str, ...] 
class ExecutorCapabilities(BaseModel, frozen=True):
    """Immutable, machine-legible capability record for a single executor runtime.

    Fields:

    * ``platform`` -- name of the executor runtime this record describes.
    * ``modes`` -- dispatch modes the runtime can be launched in.
    * ``profiles`` -- capability profiles the route table exposes.
    * ``mutates`` -- whether the runtime can mutate source under governance.
    * ``claims`` -- whether it participates in the cc-task claim lease.
    * ``hooks_wired`` -- whether the dispatch-launched path enforces
      governance hooks.
    * ``headless`` -- whether a genuine non-interactive (no tmux pane) path
      exists.
    * ``read_only`` -- whether the default posture is read-only
      (defaults to ``False``).
    * ``notes`` -- free-form remarks (defaults to the empty string).
    """

    platform: str
    modes: tuple[str, ...]
    profiles: tuple[str, ...]
    mutates: bool
    claims: bool
    hooks_wired: bool
    headless: bool
    read_only: bool = False
    notes: str = ""

    def supports(self, mode: str) -> bool:
        """Report whether ``mode`` is one of this runtime's launchable modes."""
        launchable = self.modes
        return mode in launchable
def supports_route(platform: str, mode: str) -> bool:
    """Tell whether ``platform`` can be launched in ``mode``.

    Unknown platforms (no registry entry) are never launchable; for known
    ones the answer is delegated to the capability record's ``supports``.
    """
    record = capabilities(platform)
    if record is None:
        return False
    return record.supports(mode)
# Print CLI usage to stderr.
#
# The option loop stops at the first non-option word and takes it as the
# session name; the word after it is the governed initial message. Both are
# required (the script exits 1 without a session and 5 without a message), so
# the synopsis must list them -- the previous text showed options only.
usage() {
  cat >&2 <<'EOF'
usage: hapax-codex-headless [--task TASK_ID] [--no-claim] [--force] SESSION INITIAL_MESSAGE [-- codex exec args...]

SESSION is the Codex lane name (cx-...); INITIAL_MESSAGE is the governed
dispatch prompt. Options must come before SESSION.

Default-deny: enable with `touch ~/.cache/hapax/enable-codex-headless` or
HAPAX_CODEX_HEADLESS_ALLOW=1. The governed initial message is the codex exec
prompt; everything after `--` is passed through to `codex exec`.
EOF
}
hapax_agent_is_codex_name "$SESSION"; then + echo "hapax-codex-headless: invalid session '$SESSION' (expected cx-)" >&2 + exit 2 +fi + +INITIAL_MSG="${1:-}" +[[ $# -gt 0 ]] && shift +# Strip an optional `--` separator, then pass the rest through to `codex exec`. +[[ "${1:-}" == "--" ]] && shift +if [[ $# -gt 0 ]]; then + CODEX_EXTRA+=("$@") +fi + +if [[ -z "$INITIAL_MSG" ]]; then + echo "hapax-codex-headless: governed initial message required" >&2 + exit 5 +fi + +if [[ -e "$DISABLE_FILE" ]]; then + echo "hapax-codex-headless: disabled by $DISABLE_FILE" >&2 + exit 77 +fi +if ! headless_allowed; then + echo "hapax-codex-headless: disabled until governed enable exists at $ENABLE_FILE or HAPAX_CODEX_HEADLESS_ALLOW=1 is set" >&2 + echo " enable with: touch $ENABLE_FILE" >&2 + exit 77 +fi + +# Worktree selection mirrors hapax-codex's codex-native default. +WORKDIR="${HAPAX_CODEX_HEADLESS_WORKDIR:-$HOME/projects/hapax-council--$SESSION}" +if [[ ! -d "$WORKDIR" ]]; then + WORKDIR="$COUNCIL_DIR" +fi +[[ -d "$WORKDIR" ]] || { echo "hapax-codex-headless: worktree not found: $WORKDIR" >&2; exit 3; } +WORKDIR="$(cd "$WORKDIR" && pwd)" + +CODEX_BIN="$(command -v codex || true)" +[[ -n "$CODEX_BIN" ]] || { echo "hapax-codex-headless: codex not found in PATH" >&2; exit 4; } + +HOOK="$COUNCIL_DIR/hooks/scripts/codex-hook-adapter.sh" +[[ -x "$HOOK" ]] || { echo "hapax-codex-headless: hook adapter not executable: $HOOK" >&2; exit 6; } + +# Identity env — same contract as hapax-codex so the gate keys a session-scoped +# claim lease (reform Phase 1, FM-2) and the hooks recover this lane's role. 
+PARENT_INTERFACE="${HAPAX_PARENT_AGENT_INTERFACE:-${HAPAX_AGENT_INTERFACE:-unknown}}" +PARENT_NAME="${HAPAX_PARENT_AGENT_NAME:-${HAPAX_AGENT_NAME:-unknown}}" +SESSION_UUID="${HAPAX_SESSION_ID:-$(cat /proc/sys/kernel/random/uuid 2>/dev/null || uuidgen 2>/dev/null || printf '%s-%s' "$SESSION" "$$")}" +export HAPAX_PARENT_AGENT_INTERFACE="$PARENT_INTERFACE" +export HAPAX_PARENT_AGENT_NAME="$PARENT_NAME" +export HAPAX_AGENT_INTERFACE="codex" +export HAPAX_AGENT_NAME="$SESSION" +export CODEX_THREAD_NAME="$SESSION" +export CODEX_ROLE="$SESSION" +export HAPAX_AGENT_ROLE="$SESSION" +export HAPAX_AGENT_SLOT="${HAPAX_AGENT_SLOT:-alpha}" +export HAPAX_WORKTREE_ROLE="$HAPAX_AGENT_SLOT" +export HAPAX_SESSION_ID="$SESSION_UUID" +export CLAUDE_ROLE="${CLAUDE_ROLE:-$SESSION}" +export HAPAX_IDLE_UPDATE_SECONDS="${HAPAX_IDLE_UPDATE_SECONDS:-270}" +export PASSWORD_STORE_DIR="${PASSWORD_STORE_DIR:-$HOME/.password-store}" +export LOGOS_BASE_URL="${LOGOS_BASE_URL:-http://localhost:8051/api}" +export COCKPIT_BASE_URL="${COCKPIT_BASE_URL:-$LOGOS_BASE_URL}" + +# Governed task gate (mirrors hapax-claude-headless): refuse a mutating launch +# without an explicit task or an existing session claim, and never launch onto a +# claim that belongs to a different task. 
# EXIT-trap handler: tear down only the state THIS process created.
#
# The trap is installed before the double-launch guard, so it also fires when
# the script refuses to start because the lane is already live (exit 11). An
# unconditional teardown there would delete the live lane's PID file and retire
# its relay. Likewise, after a --force relaunch the PID file belongs to the
# newer process and must survive this one's exit.
#
# Reads: PID_FILE, RELAY_RETIRE, SESSION, and CODEX_PID (set only once codex
# has actually been spawned).
cleanup() {
  local recorded_pid=""

  # Never launched (refused or failed before spawn): nothing of ours to undo.
  if [[ -z "${CODEX_PID:-}" ]]; then
    return 0
  fi

  # Remove the PID file only while it still records our own child.
  recorded_pid="$(cat "$PID_FILE" 2>/dev/null || true)"
  if [[ "$recorded_pid" == "$CODEX_PID" ]]; then
    rm -f "$PID_FILE"
  fi

  # Best-effort relay retirement; a missing or failing helper must not change
  # the script's exit status.
  [[ -x "$RELAY_RETIRE" ]] && "$RELAY_RETIRE" "$SESSION" --reason "clean exit (codex headless)" 2>/dev/null || true
}
Use only the AuthorityCase and parent spec named in this dispatch. +5. Do not create, select, or claim other work from the task pool. +6. When the task is complete or blocked, report status and idle. + +DISPATCH: +" +INITIAL_MSG="${SDLC_PREAMBLE}${INITIAL_MSG}" + +# Config mirrors hapax-codex's governed CODEX_ARGS (shared contract — keep in +# sync): full-access sandbox under external governance, the codex-hook-adapter on +# every phase, and the hapax MCP surface. `--json` streams JSONL events; `--cd` +# binds the worktree; trust is scoped to the worktree. +CODEX_ARGS=( + exec + --dangerously-bypass-approvals-and-sandbox + --skip-git-repo-check + --json + --cd "$WORKDIR" + -c 'model="gpt-5.5"' + -c 'model_reasoning_effort="xhigh"' + -c 'approval_policy="never"' + -c 'sandbox_mode="danger-full-access"' + -c "projects.\"$HOME/projects\".trust_level=\"trusted\"" + -c "projects.\"$WORKDIR\".trust_level=\"trusted\"" + -c "hooks.SessionStart=[{command=\"$HOOK\",timeout=20,statusMessage=\"Loading Hapax context\"}]" + -c "hooks.PreToolUse=[{command=\"$HOOK\",timeout=20,include_apply_patch_tool=true,statusMessage=\"Hapax guardrails\"}]" + -c "hooks.PostToolUse=[{command=\"$HOOK\",timeout=20,include_apply_patch_tool=true,statusMessage=\"Hapax audit\"}]" + -c "hooks.Stop=[{command=\"$HOOK\",timeout=20,statusMessage=\"Writing Hapax session summary\"}]" + -c "mcp_servers.hapax.command=\"$HOME/.local/bin/uv\"" + -c "mcp_servers.hapax.args=[\"--directory\",\"$HOME/projects/hapax-mcp\",\"run\",\"hapax-mcp\"]" + -c 'mcp_servers.hapax.env.LOGOS_BASE_URL="http://localhost:8051/api"' + -c "mcp_servers.github.command=\"$COUNCIL_DIR/scripts/hapax-github-mcp\"" + -c "mcp_servers.context7.command=\"$COUNCIL_DIR/scripts/hapax-context7-mcp\"" +) +CODEX_ARGS+=("${CODEX_EXTRA[@]}") +CODEX_ARGS+=("$INITIAL_MSG") + +# Refuse to double-launch a live lane unless forced (the watchdog reads this PID). 
+if [[ -f "$PID_FILE" && "$FORCE" -ne 1 ]]; then + EXISTING_PID="$(cat "$PID_FILE" 2>/dev/null || true)" + if [[ -n "$EXISTING_PID" ]] && kill -0 "$EXISTING_PID" 2>/dev/null; then + echo "hapax-codex-headless: lane '$SESSION' already live (pid=$EXISTING_PID); pass --force to relaunch" >&2 + exit 11 + fi +fi + +echo "hapax-codex-headless: starting $SESSION (task=$CODEX_TASK) in $WORKDIR" +echo " log: $LOG_FILE" +echo " pid: $PID_FILE" + +cd "$WORKDIR" +"$CODEX_BIN" "${CODEX_ARGS[@]}" >>"$LOG_FILE" 2>&1 & +CODEX_PID=$! +echo "$CODEX_PID" >"$PID_FILE" +echo "hapax-codex-headless: pid=$CODEX_PID" + +if wait "$CODEX_PID"; then + EXIT_CODE=0 +else + EXIT_CODE=$? +fi +echo "hapax-codex-headless: $SESSION exited (code=$EXIT_CODE)" +exit "$EXIT_CODE" diff --git a/scripts/hapax-executor-capabilities b/scripts/hapax-executor-capabilities new file mode 100755 index 000000000..7eb22ef2d --- /dev/null +++ b/scripts/hapax-executor-capabilities @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +"""Emit the Executor adapter capability registry as JSON (reform §6 P1). + +The machine-legible ``capabilities`` probe: one JSON object per executor runtime +with its launchable modes, profiles, and governance flags (mutates / claims / +hooks_wired / headless / read_only). The dispatcher consumes the same registry +(``scripts/executor_contract.py``) to route on data instead of a hard if-ladder; +this CLI lets the CLOG cockpit and other clients read it too. 
def main(argv: list[str] | None = None) -> int:
    """Print the requested slice of the executor capability data as JSON.

    ``--contract`` takes precedence and prints the adapter CLI contract plus
    the launch modes. Otherwise a ``platform`` argument narrows the output to
    that runtime's flags, and no argument prints the whole registry.

    Returns 0 on success, or 1 (with a message on stderr) when the named
    platform has no registry entry.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("platform", nargs="?", help="Limit output to one runtime.")
    cli.add_argument(
        "--contract",
        action="store_true",
        help="Print the canonical adapter CLI contract instead of the registry.",
    )
    opts = cli.parse_args(argv)

    # Pick the document first; every successful path prints exactly once.
    if opts.contract:
        document = {
            "adapter_cli_contract": list(ADAPTER_CLI_CONTRACT),
            "launch_modes": list(LAUNCH_MODES),
        }
    elif not opts.platform:
        document = capabilities_payload()
    else:
        record = capabilities(opts.platform)
        if record is None:
            print(f"unknown executor: {opts.platform}", file=sys.stderr)
            return 1
        document = record.model_dump()

    print(json.dumps(document, indent=2))
    return 0
capabilities, + capabilities_payload, + supports_route, +) from shared.coord_dispatch import ( CoordDispatchError, @@ -721,43 +730,6 @@ def launch_claude_interactive(task_id: str, lane: str, validation: Validation) - return subprocess.call(args) -def write_codex_bootstrap(task_id: str, lane: str, prompt: str) -> Path: - spawn_dir = Path( - os.environ.get( - "HAPAX_CODEX_SPAWN_DIR", - str( - Path(os.environ.get("XDG_CACHE_HOME", str(Path.home() / ".cache"))) - / "hapax" - / "codex-spawns" - ), - ) - ) - spawn_dir.mkdir(parents=True, exist_ok=True) - path = ( - spawn_dir - / f"{now_utc().replace(':', '').replace('-', '')}-{slugify(lane)}-{slugify(task_id)}-methodology.md" - ) - path.write_text( - "\n".join( - [ - "# Hapax Codex Methodology Dispatch", - "", - prompt, - "", - "Codex launch contract:", - "", - "- This bootstrap was produced by `hapax-methodology-dispatch` after task authority validation.", - "- Read the workspace `CLAUDE.md` and `AGENTS.md` files before source mutation.", - "- Treat the task note and parent spec named above as the only work authority.", - "- Keep relay/status surfaces current while blocked, waiting, or idle.", - "", - ] - ), - encoding="utf-8", - ) - return path - - def normalize_profile(platform: str, profile: str) -> str: profile = profile.strip().lower().replace("_", "-") if profile in {"default", "max", "pro", "full-capability"}: @@ -878,34 +850,26 @@ def launch_codex_headless( validation: Validation, route: PlatformPath, ) -> int: + # Genuine headless path (reform §6 P0): `codex exec` via hapax-codex-headless, + # NOT a tmux pane. The tmux launcher (hapax-codex) remains the interactive + # analog (launch_codex_interactive). 
launcher = Path( os.environ.get( - "HAPAX_METHODOLOGY_CODEX_LAUNCHER", - str(Path(__file__).resolve().parent / "hapax-codex"), + "HAPAX_METHODOLOGY_CODEX_HEADLESS", + str(Path(__file__).resolve().parent / "hapax-codex-headless"), ) ) if not launcher.is_file(): - print(f"Codex launcher not found: {launcher}", file=sys.stderr) + print(f"Codex headless launcher not found: {launcher}", file=sys.stderr) return 8 status = "" if validation.task is not None: status = strip_scalar(str(validation.task.fields.get("status", ""))) - args = [ - str(launcher), - "--session", - lane, - "--terminal", - "tmux", - "--task", - task_id, - "--bootstrap", - str(write_codex_bootstrap(task_id, lane, prompt)), - "--task-gate", - "--force", - ] + args = [str(launcher), "--task", task_id, "--force"] if status in {"claimed", "in_progress"}: args.append("--no-claim") + args.extend([lane, prompt]) if route.profile == "spark": args.extend( [ @@ -916,7 +880,9 @@ def launch_codex_headless( 'model_reasoning_effort="high"', ] ) - return subprocess.call(args) + env = os.environ.copy() + env["HAPAX_METHODOLOGY_DISPATCH_TASK"] = task_id + return subprocess.call(args, env=env) def launch_vibe_headless(task_id: str, lane: str, prompt: str, validation: Validation) -> int: @@ -1074,6 +1040,11 @@ def main(argv: list[str] | None = None) -> int: action="store_true", help="List governed platform/mode/profile launch paths and exit", ) + parser.add_argument( + "--capabilities", + action="store_true", + help="Print the Executor adapter capability registry as JSON and exit", + ) parser.add_argument( "--print-prompt", action="store_true", help="Print the governed launch prompt" ) @@ -1110,8 +1081,14 @@ def main(argv: list[str] | None = None) -> int: print("\n".join(supported_route_lines())) return 0 + if args.capabilities: + print(json.dumps(capabilities_payload(), indent=2)) + return 0 + if not args.task or not args.lane: - parser.error("--task and --lane are required unless --list-platform-paths is used") + 
parser.error( + "--task and --lane are required unless --list-platform-paths or --capabilities is used" + ) task_root = Path( os.environ.get( @@ -1340,28 +1317,48 @@ def main(argv: list[str] | None = None) -> int: print(prompt) if args.launch: + # Launchable (platform, mode) -> adapter call. Eligibility is decided by + # the Executor capability registry (supports_route), NOT a hard if-ladder + # (reform §6 P1) — adding a runtime/mode is a one-line table + registry edit. + launchers: dict[tuple[str, str], Any] = { + ("claude", "headless"): lambda: launch_claude_headless( + args.task, args.lane, prompt or "", route + ), + ("claude", "interactive"): lambda: launch_claude_interactive( + args.task, args.lane, validation + ), + ("codex", "headless"): lambda: launch_codex_headless( + args.task, args.lane, prompt or "", validation, route + ), + ("vibe", "headless"): lambda: launch_vibe_headless( + args.task, args.lane, prompt or "", validation + ), + ("gemini", "headless"): lambda: launch_gemini_headless(args.lane, prompt or "", route), + ("antigrav", "interactive"): lambda: launch_antigrav_interactive( + args.task, args.lane, validation, prompt or "" + ), + } def run_platform_launch() -> int: - if args.platform == "claude" and args.mode == "headless": - return launch_claude_headless(args.task, args.lane, prompt or "", route) - if args.platform == "claude" and args.mode == "interactive": - return launch_claude_interactive(args.task, args.lane, validation) - if args.platform == "codex" and args.mode == "headless": - return launch_codex_headless(args.task, args.lane, prompt or "", validation, route) - if args.platform == "vibe" and args.mode == "headless": - return launch_vibe_headless(args.task, args.lane, prompt or "", validation) - if args.platform == "gemini" and args.mode == "headless": - return launch_gemini_headless(args.lane, prompt or "", route) - if args.platform == "antigrav" and args.mode == "interactive": - return launch_antigrav_interactive(args.task, 
args.lane, validation, prompt or "") - print( - "launch currently unsupported for " - f"--platform {args.platform} --mode {args.mode} --profile {args.profile}", - file=sys.stderr, - ) - print("Supported governed routes:", file=sys.stderr) - print("\n".join(supported_route_lines()), file=sys.stderr) - return 2 + launch = launchers.get((args.platform, args.mode)) + if launch is None or not supports_route(args.platform, args.mode): + caps = capabilities(args.platform) + print( + "launch unsupported for " + f"--platform {args.platform} --mode {args.mode} --profile {args.profile}", + file=sys.stderr, + ) + if caps is not None: + print( + f" {args.platform} capabilities: modes={list(caps.modes)} " + f"mutates={caps.mutates} hooks_wired={caps.hooks_wired} " + f"headless={caps.headless}", + file=sys.stderr, + ) + print("Supported governed routes:", file=sys.stderr) + print("\n".join(supported_route_lines()), file=sys.stderr) + return 2 + return launch() dispatch_launch: DispatchLaunchResult | None = None if route.mutable and not validation.exempt_read_only: diff --git a/scripts/hapax-pr-admission b/scripts/hapax-pr-admission index 1657579ef..453b8fcd3 100755 --- a/scripts/hapax-pr-admission +++ b/scripts/hapax-pr-admission @@ -132,7 +132,7 @@ def current_throttle_decision(open_count: int | None = None): return decide_fleet_throttle( records, open_pr_count=prs_count, - policy=FleetThrottlePolicy(advisory_open_pr_count=AUTO_FREEZE_THRESHOLD), + policy=FleetThrottlePolicy(advisory_open_pr_count=ADVISORY_OPEN_PR_COUNT), now=datetime.now(UTC), ) @@ -340,9 +340,11 @@ def is_admission_allowed(branch: str | None) -> tuple[bool, str]: return False, f"mode is {state['mode']} and branch {branch!r} not in snapshot" -# Admission throttle threshold used only as the advisory open-PR count in the -# failure-rate policy. Open count alone is never a freeze. -AUTO_FREEZE_THRESHOLD = 10 +# Advisory open-PR count fed to the failure-rate policy. 
Open count alone is +# NEVER a freeze (FM-3) — the only freeze is failure-rate based. The old +# ``AUTO_FREEZE_THRESHOLD`` name implied a count freeze that no longer exists. +ADVISORY_OPEN_PR_COUNT = 10 +AUTO_FREEZE_THRESHOLD = ADVISORY_OPEN_PR_COUNT # deprecated alias def cmd_auto(args: argparse.Namespace) -> int: diff --git a/shared/merge_queue_lineage.py b/shared/merge_queue_lineage.py index b1a307d4f..84973520b 100644 --- a/shared/merge_queue_lineage.py +++ b/shared/merge_queue_lineage.py @@ -18,7 +18,7 @@ from pydantic import BaseModel, Field if TYPE_CHECKING: - from collections.abc import Collection, Iterable, Sequence + from collections.abc import Collection, Iterable, Mapping, Sequence BottleneckKind = Literal[ "queue_admission", @@ -967,6 +967,113 @@ def bisect_failed_batch(batch: Sequence[int]) -> list[tuple[int, ...]]: return [items[:mid], items[mid:]] +class QuarantineReconciliation(BaseModel): + """Outcome of one quarantine write-side pass over the live store.""" + + records: list[FlakeQuarantine] # full set to persist (kept + lifted + newly opened) + newly_quarantined: list[int] + lifted: list[int] + active: list[int] # PR numbers whose quarantine is active AFTER reconciliation + + +def reconcile_flake_quarantines( + existing: Iterable[FlakeQuarantine], + lineage_records: Iterable[MergeQueueLineageRecord], + *, + candidate_prs: Collection[int], + policy: FleetThrottlePolicy | None = None, + now: datetime | None = None, +) -> QuarantineReconciliation: + """Open quarantines for flaky PRs and lift expired ones — the WRITE side. + + The merge-queue reconciler already READS quarantines to exclude flaky PRs + from the failure-rate signal; this computes the records to persist back. + Reversible by construction (FM-3/FM-4): a PR already actively quarantined is + never re-opened, and a quarantine whose cooldown has elapsed is lifted + (``released_at`` stamped) so the PR re-enters the queue. 
Pure: every result + is a function of ``existing``, ``lineage_records`` and ``now``. + """ + policy = policy or FleetThrottlePolicy() + now = now or datetime.now(UTC) + lineage = list(lineage_records) + counts = pr_failure_counts(lineage, window_seconds=policy.window_seconds, now=now) + + out: list[FlakeQuarantine] = [] + lifted: list[int] = [] + for record in existing: + if record.released_at is None and not quarantine_active(record, now=now): + out.append(lift_quarantine(record, now=now)) + lifted.append(record.pr_number) + else: + out.append(record) + + active_after_lift = active_quarantined_pr_numbers(out, now=now) + newly: list[int] = [] + window_hours = policy.window_seconds / 3600 + for pr_number in sorted(set(candidate_prs)): + if pr_number in active_after_lift: + continue + if should_quarantine_pr(pr_number, lineage, policy=policy, now=now): + out.append( + open_quarantine( + pr_number, + reason=( + f"{counts.get(pr_number, 0)} CI failures within " + f"{window_hours:.0f}h (>= {policy.quarantine_failures}); " + "reversible flake quarantine" + ), + now=now, + cooldown_seconds=policy.quarantine_cooldown_seconds, + ) + ) + newly.append(pr_number) + + active = sorted(active_quarantined_pr_numbers(out, now=now)) + return QuarantineReconciliation( + records=out, + newly_quarantined=sorted(newly), + lifted=sorted(lifted), + active=active, + ) + + +def bisection_plan_for_failed_runs( + failed_runs: Iterable[Mapping[str, Any]], +) -> list[dict[str, Any]]: + """Group failed merge-group runs by run id and propose the next split. + + A merge group batches several PRs into one CI run; when it fails, halving the + batch isolates the culprit (FM-3/FM-4 batch bisection via + :func:`bisect_failed_batch`). Single-PR groups are terminal (the culprit is + already known) and yield no further split. Rows missing ``run_id``/``pr`` are + skipped so malformed lineage never crashes the reconciler. 
+ """ + by_run: dict[str, list[int]] = defaultdict(list) + for run in failed_runs: + run_id = run.get("run_id") + pr = run.get("pr") + if run_id is None or pr is None: + continue + bucket = by_run[str(run_id)] + if int(pr) not in bucket: + bucket.append(int(pr)) + + plans: list[dict[str, Any]] = [] + for run_id, prs in sorted(by_run.items()): + batch = tuple(sorted(prs)) + splits = bisect_failed_batch(batch) + if not splits: + continue + plans.append( + { + "merge_group_run_id": run_id, + "failed_batch": list(batch), + "next_bisection": [list(split) for split in splits], + } + ) + return plans + + def recommend_max_entries_to_build( records: Iterable[MergeQueueLineageRecord], *, @@ -999,10 +1106,13 @@ def recommend_max_entries_to_build( "SUCCESS_CONCLUSIONS", "FleetThrottlePolicy", "FlakeQuarantine", + "QuarantineReconciliation", "ThrottleDecision", "active_quarantined_pr_numbers", "bisect_failed_batch", + "bisection_plan_for_failed_runs", "decide_fleet_throttle", + "reconcile_flake_quarantines", "lift_quarantine", "merge_failure_rate", "open_quarantine", diff --git a/shared/relay_mq.py b/shared/relay_mq.py index 533f06f8c..724b1aaf6 100644 --- a/shared/relay_mq.py +++ b/shared/relay_mq.py @@ -167,6 +167,47 @@ def _row_to_envelope(row: sqlite3.Row) -> Envelope: return Envelope.model_construct(**d) +# Canonical Claude coordination-lane names (greek slots). Codex lanes start +# ``cx-``; Gemini is ``iota``; Antigrav lanes start ``antigrav``; Vibe +# lanes start ``vbe``/``vibe``. These predicates are the single source of truth +# shared by the per-runtime broadcast groups and the cross-runtime ``workers`` +# group (reform §6 P1 — every runtime is reachable by a group broadcast).
+_CLAUDE_LANE_NAMES = frozenset( + {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta"} +) + + +def _is_claude_lane(peer: str) -> bool: + return peer in _CLAUDE_LANE_NAMES and not peer.startswith("cx-") + + +def _is_codex_lane(peer: str) -> bool: + return peer.startswith("cx-") + + +def _is_gemini_lane(peer: str) -> bool: + return peer == "iota" + + +def _is_antigrav_lane(peer: str) -> bool: + return peer.startswith("antigrav") + + +def _is_vibe_lane(peer: str) -> bool: + return peer.startswith("vbe") or peer.startswith("vibe") + + +def _is_worker_lane(peer: str) -> bool: + """A recognised executor lane across any of the five runtimes.""" + return ( + _is_claude_lane(peer) + or _is_codex_lane(peer) + or _is_gemini_lane(peer) + or _is_antigrav_lane(peer) + or _is_vibe_lane(peer) + ) + + def expand_recipients( spec: str, relay_dir: Path | None = None, @@ -182,27 +223,22 @@ def expand_recipients( if not peers: raise ValueError(f"Broadcast spec '{spec}' but no peers found in {relay_dir}") - claude_names = { - "alpha", - "beta", - "gamma", - "delta", - "epsilon", - "zeta", - "eta", - "theta", - } - if group == "all": return peers elif group == "coordinators": return [p for p in peers if p != "rte" and not p.startswith("timer:")] elif group == "claude": - return [p for p in peers if p in claude_names and not p.startswith("cx-")] + return [p for p in peers if _is_claude_lane(p)] elif group == "codex": - return [p for p in peers if p.startswith("cx-")] + return [p for p in peers if _is_codex_lane(p)] elif group == "gemini": - return [p for p in peers if p == "iota"] + return [p for p in peers if _is_gemini_lane(p)] + elif group == "antigrav": + return [p for p in peers if _is_antigrav_lane(p)] + elif group == "vibe": + return [p for p in peers if _is_vibe_lane(p)] + elif group == "workers": + return [p for p in peers if _is_worker_lane(p)] else: raise ValueError(f"Unknown broadcast group: '{group}'") diff --git a/tests/hooks/test_gate_manifest_check.py 
b/tests/hooks/test_gate_manifest_check.py index 665be70c8..04a1b8063 100644 --- a/tests/hooks/test_gate_manifest_check.py +++ b/tests/hooks/test_gate_manifest_check.py @@ -143,6 +143,23 @@ def test_gemini_capability_marker_drift_fails(tmp_path: Path) -> None: assert "--approval-mode" in result.stderr +def test_gate_manifest_check_is_wired_in_ci_and_precommit() -> None: + ci_text = (REPO_ROOT / ".github" / "workflows" / "ci.yml").read_text(encoding="utf-8") + precommit_text = (REPO_ROOT / ".pre-commit-config.yaml").read_text(encoding="utf-8") + # The checker runs in CI (codex/antigrav/vibe/gemini/ci wiring) and in + # pre-commit (the live ~/.claude/settings.json, the most drift-prone wiring). + assert "scripts/gate-manifest-check.py" in ci_text + assert "scripts/gate-manifest-check.py" in precommit_text + assert "--require-claude-settings" in precommit_text # live check in pre-commit + assert "--skip-claude-settings" in ci_text # CI runners have no live settings + # The manifest lists its own CI invocation so the checker self-verifies the + # CI wiring is present (check_ci run-marker round-trip). 
+ manifest = _manifest() + assert any( + "gate-manifest-check.py" in marker for marker in manifest["runtimes"]["ci"]["run_markers"] + ) + + def test_ci_job_drift_fails(tmp_path: Path) -> None: workflow = yaml.safe_load((REPO_ROOT / ".github" / "workflows" / "ci.yml").read_text()) workflow["jobs"].pop("security") diff --git a/tests/scripts/test_hapax_methodology_dispatch.py b/tests/scripts/test_hapax_methodology_dispatch.py index f8242d5a9..a1172c4de 100644 --- a/tests/scripts/test_hapax_methodology_dispatch.py +++ b/tests/scripts/test_hapax_methodology_dispatch.py @@ -761,7 +761,7 @@ def test_policy_hold_writes_route_decision_before_prompt_or_launch(tmp_path: Pat "headless", "--print-prompt", "--launch", - extra_env={"HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher)}, + extra_env={"HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher)}, ) assert result.returncode == 10 @@ -815,7 +815,7 @@ def test_launch_blocks_without_durable_mq_authority_binding(tmp_path: Path) -> N "--mode", "headless", "--launch", - extra_env={"HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher)}, + extra_env={"HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher)}, durable_mq=False, ) @@ -866,7 +866,7 @@ def test_launch_requires_strict_mq_message_id(tmp_path: Path) -> None: "headless", "--launch", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "HAPAX_METHODOLOGY_DISPATCH_MESSAGE_ID": "", }, ) @@ -911,7 +911,7 @@ def test_launch_blocks_mq_message_id_mismatch_without_consuming(tmp_path: Path) "headless", "--launch", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "HAPAX_METHODOLOGY_DISPATCH_MESSAGE_ID": "wrong-message-id", }, ) @@ -959,32 +959,23 @@ def test_launches_codex_headless_through_codex_launcher(tmp_path: Path) -> None: "headless", "--launch", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + 
"HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "XDG_CACHE_HOME": str(tmp_path / "cache"), }, ) assert result.returncode == 0, result.stderr - args = launcher_args.read_text(encoding="utf-8").splitlines() - assert args[:10] == [ - "--session", - "cx-green", - "--terminal", - "tmux", - "--task", - "governed-build", - "--bootstrap", - args[7], - "--task-gate", - "--force", - ] - bootstrap = Path(args[7]).read_text(encoding="utf-8") - assert "SDLC GOVERNED DISPATCH." in bootstrap - assert "Task: governed-build" in bootstrap - assert "AuthorityCase: CASE-TEST-001" in bootstrap - assert "If the launcher already claimed it" in bootstrap - assert "claim the next" not in bootstrap - assert "highest-WSJF" not in bootstrap + # hapax-codex-headless takes `--task --force `; the prompt + # is passed inline (multi-line), so assert the flag prefix then the prompt + # body in the raw recorded args. + recorded = launcher_args.read_text(encoding="utf-8") + assert recorded.startswith("--task\ngoverned-build\n--force\ncx-green\n") + assert "SDLC GOVERNED DISPATCH." 
in recorded + assert "Task: governed-build" in recorded + assert "AuthorityCase: CASE-TEST-001" in recorded + assert "If the launcher already claimed it" in recorded + assert "claim the next" not in recorded + assert "highest-WSJF" not in recorded line = ( (tmp_path / "ledger" / "methodology-dispatch.jsonl") @@ -1045,7 +1036,7 @@ def test_launch_idempotency_replays_without_second_launcher_call(tmp_path: Path) "--idempotency-key", "dispatch-test-key", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "XDG_CACHE_HOME": str(tmp_path / "cache"), }, ) @@ -1070,7 +1061,7 @@ def test_launch_idempotency_replays_without_second_launcher_call(tmp_path: Path) extra_env={ "HAPAX_RELAY_MQ_DB": str(tmp_path / "relay" / "messages.db"), "HAPAX_METHODOLOGY_DISPATCH_MESSAGE_ID": message_id, - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "XDG_CACHE_HOME": str(tmp_path / "cache"), }, ) @@ -1114,7 +1105,7 @@ def test_failed_launch_cleans_up_mq_state_and_records_failure(tmp_path: Path) -> "--mode", "headless", "--launch", - extra_env={"HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher)}, + extra_env={"HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher)}, ) assert result.returncode == 42 @@ -1193,7 +1184,7 @@ def test_launch_uses_subscription_receipt_without_policy_rollback(tmp_path: Path "headless", "--launch", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "HAPAX_PLATFORM_CAPABILITY_REGISTRY": str(REGISTRY), "HAPAX_PLATFORM_CAPABILITY_RECEIPT_DIR": str(receipt_dir), "XDG_CACHE_HOME": str(tmp_path / "cache"), @@ -1254,7 +1245,7 @@ def test_policy_rollback_is_retired_before_launcher( "--policy-rollback", "--launch", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), 
"HAPAX_PLATFORM_CAPABILITY_REGISTRY": str(REGISTRY), "XDG_CACHE_HOME": str(tmp_path / "cache"), }, @@ -1322,7 +1313,7 @@ def test_policy_rollback_holds_non_full_profile_before_launcher(tmp_path: Path) "--policy-rollback", "--launch", extra_env={ - "HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher), + "HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher), "XDG_CACHE_HOME": str(tmp_path / "cache"), }, ) @@ -1780,7 +1771,7 @@ def test_codex_launch_unsupported_mode_fails_closed(tmp_path: Path) -> None: "--mode", "interactive", "--launch", - extra_env={"HAPAX_METHODOLOGY_CODEX_LAUNCHER": str(fake_launcher)}, + extra_env={"HAPAX_METHODOLOGY_CODEX_HEADLESS": str(fake_launcher)}, ) assert result.returncode == 10 diff --git a/tests/shared/test_relay_mq.py b/tests/shared/test_relay_mq.py index 8957bfc6e..b035cb2f9 100644 --- a/tests/shared/test_relay_mq.py +++ b/tests/shared/test_relay_mq.py @@ -584,6 +584,49 @@ def test_expand_broadcast_codex(self) -> None: self.assertIn("cx-blue", result) self.assertNotIn("alpha", result) + def test_expand_broadcast_gemini(self) -> None: + with tempfile.TemporaryDirectory() as td: + relay_dir = _make_relay_dir(Path(td), ["alpha", "iota", "cx-red"]) + result = expand_recipients("*:gemini", relay_dir) + self.assertEqual(result, ["iota"]) + + def test_expand_broadcast_antigrav(self) -> None: + with tempfile.TemporaryDirectory() as td: + relay_dir = _make_relay_dir( + Path(td), ["alpha", "antigrav", "antigrav-2", "antigravity", "cx-red"] + ) + result = expand_recipients("*:antigrav", relay_dir) + self.assertIn("antigrav", result) + self.assertIn("antigrav-2", result) + self.assertIn("antigravity", result) + self.assertNotIn("alpha", result) + self.assertNotIn("cx-red", result) + + def test_expand_broadcast_vibe(self) -> None: + with tempfile.TemporaryDirectory() as td: + relay_dir = _make_relay_dir(Path(td), ["alpha", "vbe-1", "vbe-2", "cx-red"]) + result = expand_recipients("*:vibe", relay_dir) + self.assertIn("vbe-1", result) + 
self.assertIn("vbe-2", result) + self.assertNotIn("alpha", result) + self.assertNotIn("cx-red", result) + + def test_expand_broadcast_workers_spans_all_runtimes(self) -> None: + with tempfile.TemporaryDirectory() as td: + relay_dir = _make_relay_dir( + Path(td), + ["alpha", "cx-red", "iota", "antigrav", "vbe-1", "rte", "alpha-status"], + ) + result = expand_recipients("*:workers", relay_dir) + self.assertIn("alpha", result) + self.assertIn("cx-red", result) + self.assertIn("iota", result) + self.assertIn("antigrav", result) + self.assertIn("vbe-1", result) + # Coordinators-only and stray status-file stems are not worker lanes. + self.assertNotIn("rte", result) + self.assertNotIn("alpha-status", result) + def test_expand_broadcast_unknown_group(self) -> None: with tempfile.TemporaryDirectory() as td: relay_dir = _make_relay_dir(Path(td), ["alpha"]) diff --git a/tests/test_cc_pr_autoqueue.py b/tests/test_cc_pr_autoqueue.py index 9fe8c430a..1af816243 100644 --- a/tests/test_cc_pr_autoqueue.py +++ b/tests/test_cc_pr_autoqueue.py @@ -1333,3 +1333,61 @@ def test_already_release_authorized_task_is_not_rearmed(tmp_path: Path) -> None: assert ["gh", "pr", "merge", "705", "--repo", "owner/repo", "--merge"] in runner.calls decision = next(d for d in report["decisions"] if d["pr"] == 705) assert decision.get("auto_arm", False) is False + + +def test_flake_quarantine_write_side_persists_and_excludes_next_tick( + tmp_path: Path, +) -> None: + vault = _make_vault(tmp_path) + _write_task(vault, task_id="flaky-pr", pr=140, route_metadata_schema=None) + ledger = tmp_path / "merge-queue-lineage.jsonl" + write_jsonl_records( + ledger, + [ + MergeQueueLineageRecord( + observed_at=_recent_observed_at(i), + pr_number=140, + merge_group_run_id=8000 + i, + run_conclusion="failure", + run_outcome="failure", + ) + # 4 genuine failures: over the quarantine threshold (2) AND enough + # samples (min_samples 4) to also trip the failure-rate freeze. 
+ for i in range(4) + ], + ) + quarantine_path = tmp_path / "merge-queue-quarantine.jsonl" + runner = _FakeRunner() + runner.open_prs = [_pr(140)] + + # First apply: PR 140 is over the failure threshold → quarantine opened and + # persisted. The freshly-detected PR still counts toward THIS tick's rate. + report = autoqueue.run_reconciler( + repo="owner/repo", + repo_root=tmp_path, + vault_root=vault, + apply=True, + lineage_ledger_path=ledger, + quarantine_path=quarantine_path, + runner=runner, + ) + assert report["flake_quarantine"]["newly_quarantined"] == [140] + assert report["flake_quarantine"]["written"] is True + assert quarantine_path.exists() + assert report["storm_mode"]["rate_frozen"] is True + + # Second apply: the persisted quarantine is now active → PR 140 is excluded + # from the failure-rate signal, so the isolated flaky PR no longer freezes the + # fleet, and it is not re-opened. + report2 = autoqueue.run_reconciler( + repo="owner/repo", + repo_root=tmp_path, + vault_root=vault, + apply=True, + lineage_ledger_path=ledger, + quarantine_path=quarantine_path, + runner=runner, + ) + assert 140 in report2["flake_quarantine"]["active"] + assert report2["flake_quarantine"]["newly_quarantined"] == [] + assert report2["storm_mode"]["rate_frozen"] is False diff --git a/tests/test_executor_contract.py b/tests/test_executor_contract.py new file mode 100644 index 000000000..aeb805aae --- /dev/null +++ b/tests/test_executor_contract.py @@ -0,0 +1,116 @@ +"""Tests for the Executor adapter contract (reform §6 P1). + +The capability registry is the machine-legible surface the dispatcher consumes +instead of a hard ``(platform, mode)`` if-ladder, and that +``hapax-executor-capabilities`` emits as JSON. 
+""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPTS = REPO_ROOT / "scripts" +if str(SCRIPTS) not in sys.path: + sys.path.insert(0, str(SCRIPTS)) + +import executor_contract as ec # noqa: E402 + +ALL_PLATFORMS = {"claude", "codex", "gemini", "vibe", "antigrav"} + + +def test_registry_covers_all_five_runtimes() -> None: + assert set(ec.EXECUTOR_REGISTRY) == ALL_PLATFORMS + + +def test_headless_flag_matches_modes() -> None: + # headless capability is true iff a headless mode is launchable. + for caps in ec.EXECUTOR_REGISTRY.values(): + assert caps.headless == ("headless" in caps.modes), caps.platform + + +def test_read_only_implies_no_mutation() -> None: + for caps in ec.EXECUTOR_REGISTRY.values(): + if caps.read_only: + assert not caps.mutates, caps.platform + + +def test_supports_route_for_known_routes() -> None: + assert ec.supports_route("codex", "headless") + assert ec.supports_route("claude", "headless") + assert ec.supports_route("gemini", "headless") + assert ec.supports_route("vibe", "headless") + assert ec.supports_route("antigrav", "interactive") + + +def test_supports_route_rejects_unlaunchable_routes() -> None: + assert not ec.supports_route("gemini", "interactive") + assert not ec.supports_route("antigrav", "headless") + assert not ec.supports_route("vibe", "interactive") + assert not ec.supports_route("codex", "interactive") # tmux pane, not a dispatch route + assert not ec.supports_route("unknown", "headless") + # receipt-only is a dispatch validation mode, not an executor capability. 
+ assert not ec.supports_route("codex", "receipt-only") + + +def test_codex_has_a_genuine_headless_path() -> None: + codex = ec.capabilities("codex") + assert codex is not None + assert codex.headless is True + assert "hapax-codex-headless" in codex.notes + + +def test_antigrav_hook_gap_is_machine_legible() -> None: + antigrav = ec.capabilities("antigrav") + assert antigrav is not None + # The agy CLI path is gated; the residual IDE-surface gap is documented. + assert antigrav.headless is False + assert "IDE" in antigrav.notes + + +def test_capabilities_unknown_is_none() -> None: + assert ec.capabilities("nope") is None + + +def test_adapter_cli_contract_has_canonical_flags() -> None: + for flag in ("--lane", "--task", "--mode", "--prompt", "--no-claim", "--force"): + assert flag in ec.ADAPTER_CLI_CONTRACT + + +def test_capabilities_payload_is_json_serialisable_and_sorted() -> None: + payload = ec.capabilities_payload() + text = json.dumps(payload) # must not raise + assert set(payload) == ALL_PLATFORMS + assert list(payload) == sorted(payload) + assert "hooks_wired" in text + + +def test_standalone_capabilities_cli_emits_json() -> None: + result = subprocess.run( + [sys.executable, str(SCRIPTS / "hapax-executor-capabilities")], + capture_output=True, + text=True, + check=False, + timeout=30, + ) + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert set(payload) == ALL_PLATFORMS + assert payload["codex"]["modes"] == ["headless"] + + +def test_standalone_capabilities_cli_single_platform() -> None: + result = subprocess.run( + [sys.executable, str(SCRIPTS / "hapax-executor-capabilities"), "gemini"], + capture_output=True, + text=True, + check=False, + timeout=30, + ) + assert result.returncode == 0, result.stderr + caps = json.loads(result.stdout) + assert caps["platform"] == "gemini" + assert caps["read_only"] is True diff --git a/tests/test_merge_queue_lineage.py b/tests/test_merge_queue_lineage.py index 77219cda1..4485f470d 
100644 --- a/tests/test_merge_queue_lineage.py +++ b/tests/test_merge_queue_lineage.py @@ -239,3 +239,93 @@ def test_recommend_max_entries_storm_vs_healthy(): storm = [_rec(i, "failure") for i in range(8)] assert mql.recommend_max_entries_to_build(healthy, now=NOW) == mql.HEALTHY_MAX_ENTRIES assert mql.recommend_max_entries_to_build(storm, now=NOW) == mql.STORM_MAX_ENTRIES + + +# --------------------------------------------------------------------------- # +# reconcile_flake_quarantines (the WRITE side — FM-3/FM-4) # +# --------------------------------------------------------------------------- # + + +def test_reconcile_opens_quarantine_over_failure_threshold(): + # PR 5 has 2 genuine failures (default quarantine_failures=2) → quarantine. + records = [_rec(5, "failure"), _rec(5, "failure"), _rec(6, "success")] + result = mql.reconcile_flake_quarantines([], records, candidate_prs={5, 6}, now=NOW) + assert result.newly_quarantined == [5] + assert result.lifted == [] + assert result.active == [5] + assert len(result.records) == 1 + assert result.records[0].pr_number == 5 + assert result.records[0].released_at is None + + +def test_reconcile_below_threshold_opens_nothing(): + records = [_rec(5, "failure"), _rec(6, "success")] + result = mql.reconcile_flake_quarantines([], records, candidate_prs={5, 6}, now=NOW) + assert result.newly_quarantined == [] + assert result.active == [] + assert result.records == [] + + +def test_reconcile_does_not_reopen_active_quarantine(): + existing = [mql.open_quarantine(5, reason="prior", now=NOW, cooldown_seconds=6 * 3600)] + records = [_rec(5, "failure"), _rec(5, "failure")] + result = mql.reconcile_flake_quarantines( + existing, records, candidate_prs={5}, now=NOW + timedelta(hours=1) + ) + assert result.newly_quarantined == [] + assert result.active == [5] + assert len(result.records) == 1 # not duplicated + + +def test_reconcile_lifts_expired_quarantine_on_cooldown(): + existing = [mql.open_quarantine(5, reason="prior", now=NOW, 
cooldown_seconds=3600)] + # 2h later the 1h cooldown has elapsed → lift (release), and no fresh + # failures in window means it is not re-opened. + later = NOW + timedelta(hours=2) + result = mql.reconcile_flake_quarantines(existing, [], candidate_prs={5}, now=later) + assert result.lifted == [5] + assert result.active == [] + assert result.records[0].released_at == later + + +def test_reconcile_is_deterministic_for_multiple_prs(): + records = [ + _rec(7, "failure"), + _rec(7, "failure"), + _rec(3, "failure"), + _rec(3, "failure"), + ] + result = mql.reconcile_flake_quarantines([], records, candidate_prs={7, 3}, now=NOW) + assert result.newly_quarantined == [3, 7] # sorted + + +# --------------------------------------------------------------------------- # +# bisection_plan_for_failed_runs (wire bisect into the failure handler) # +# --------------------------------------------------------------------------- # + + +def test_bisection_plan_groups_by_run_and_splits(): + failed = [ + {"run_id": "100", "pr": 1}, + {"run_id": "100", "pr": 2}, + {"run_id": "100", "pr": 3}, + {"run_id": "100", "pr": 4}, + ] + plan = mql.bisection_plan_for_failed_runs(failed) + assert plan == [ + { + "merge_group_run_id": "100", + "failed_batch": [1, 2, 3, 4], + "next_bisection": [[1, 2], [3, 4]], + } + ] + + +def test_bisection_plan_single_pr_group_is_terminal(): + failed = [{"run_id": "200", "pr": 9}] + assert mql.bisection_plan_for_failed_runs(failed) == [] + + +def test_bisection_plan_skips_rows_missing_keys(): + failed = [{"run_id": None, "pr": 1}, {"pr": 2}, {"run_id": "300"}] + assert mql.bisection_plan_for_failed_runs(failed) == []