forkadarshp · forkadarshp · May 29, 2026 · May 29, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,3 +22,6 @@ jobs:
         uses: DavidAnson/markdownlint-cli2-action@v17
         with:
           globs: "**/*.md"
+      - name: Test benchmark harness
+        working-directory: harness
+        run: python3 -m unittest discover -s tests -v
diff --git a/.gitignore b/.gitignore
@@ -17,3 +17,6 @@ transcript.md
 
 # Internal launch/growth notes (not part of the published skill)
 LAUNCH_PREP.md
+
+# Benchmark harness run output
+harness/results.json
diff --git a/harness/README.md b/harness/README.md
@@ -0,0 +1,69 @@
+# Migration Benchmark Harness
+
+A runnable, closed-loop evaluator for the three-arm migration benchmark
+described in [`../references/benchmarking.md`](../references/benchmarking.md).
+Run a scenario, read the leaderboard and the failing cases, tighten the
+enhanced system, and re-run — iterating until the migrated system measurably
+beats both the baseline and a raw model swap.
+
+## The three arms
+
+| Arm | Config | Isolates |
+| --- | --- | --- |
+| A baseline | old model + baseline prompt | starting point |
+| B naive swap | new model + baseline prompt | raw model delta |
+| C ModelPort | new model + enhanced prompt | the skill's added value |
+
+## Quickstart
+
+```bash
+# offline, deterministic — no API key needed
+python3 run.py --provider sim
+
+# real measured numbers — needs ANTHROPIC_API_KEY and `pip install anthropic`
+python3 run.py --provider anthropic
+```
+
+Output: a leaderboard (task success, output-contract conformance, tool-call
+accuracy, p95 latency, cost/req, weighted composite), an attribution line
+(model delta B−A, skill delta C−B, net C−A), and the list of cases the
+ModelPort arm still fails — your to-do list for the next iteration.
+
+## The loop
+
+1. Run the harness.
+2. Read the leaderboard + failing cases.
+3. Tighten the enhanced prompt/config for the failing dimension(s).
+4. Re-run; confirm the composite and the skill delta (C−B) went up.
+
+This is exactly how the bundled scenario was tuned: a vague enhanced prompt
+scored a **negative** skill delta (it cost more without lifting quality); making
+the contract explicit (JSON-only, enumerated schema, no prose) moved ModelPort
+from last place to a clear win.
+
+## Providers
+
+- **`sim`** — offline and deterministic. Outputs are a *simulation* driven only
+  by prompt explicitness and a per-model "literalness" knob (the documented
+  Opus 4.7+ trait: newer models follow instructions more literally, punishing
+  vague prompts and rewarding precise ones). The grading, scoring, and
+  leaderboard pipeline is real; **the numbers are illustrative, not measured.**
+- **`anthropic`** — real Messages API calls. Same graders, real numbers.
+
+## Define your own scenario
+
+Copy `scenarios/support_triage.json` and edit:
+
+- `models` — old/new model IDs
+- `prompts.baseline` / `prompts.enhanced` — `baseline` is the prompt for arms
+  A and B; `enhanced` is the prompt for arm C
+- `tools`, `categories` — the task surface
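+- `eval_cases` — each case has an `input`, an `expected` object (`category`,
+  `tool`, optional `args`), and a numeric `difficulty`; under `--provider sim`
+  a case passes a dimension only when its `difficulty` is at or below the
+  simulated fidelity (which is clamped to 0.02–0.95)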
+- `sim` — per-model knobs for the offline simulator, keyed by model ID:
+  `literalness`, `base`, and `latency_ms` (ignored by `--provider anthropic`)
+
+## Files
+
+- `run.py` — orchestrates the three arms, grades, prints the leaderboard
+- `graders.py` — provider-agnostic scoring (contract, tool, task)
+- `providers.py` — `SimProvider` (offline) + `AnthropicProvider` (real)
+- `scenarios/` — scenario fixtures
+- `tests/` — `python3 -m unittest discover -s tests`
diff --git a/harness/graders.py b/harness/graders.py
@@ -0,0 +1,88 @@
+"""Provider-agnostic graders for the migration benchmark harness.
+
+These score a model Response against a case's expected outcome. They run on the
+real text/tool-call a model returns, so the same graders judge both the
+simulated provider and the real Anthropic provider.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Response:
+    """Normalized model output, independent of which provider produced it.
+
+    Providers build one of these per eval case; the graders in this module
+    read only these fields, so every provider is scored the same way.
+    """
+
+    # Raw message text; the contract graders parse this as a JSON object.
+    text: str = ""
+    # Name of the tool the model called; None is graded as the "none" tool.
+    tool_name: str | None = None
+    # Arguments supplied with the tool call (empty when no tool was called).
+    tool_args: dict = field(default_factory=dict)
+    # Request latency in milliseconds, as reported by the provider.
+    latency_ms: float = 0.0
+    # Input / output token counts, as reported by the provider.
+    tokens_in: int = 0
+    tokens_out: int = 0
+
+
+def _strip_fence(text: str) -> str:
+    """Return *text* without a surrounding Markdown code fence.
+
+    Text that does not open with a fence is only whitespace-trimmed. For
+    fenced text the whole opening fence line is dropped (so a fence with no
+    following line yields ""), then a trailing fence is removed if present.
+    """
+    trimmed = text.strip()
+    if not trimmed.startswith("```"):
+        return trimmed
+
+    # everything up to and including the first newline is the opening fence
+    _fence_line, newline, body = trimmed.partition("\n")
+    if not newline:
+        return ""
+
+    body = body.rstrip()
+    if body.endswith("```"):
+        body = body[:-3]
+    return body.strip()
+
+
+def parse_contract(text: str) -> dict | None:
+    """Parse a reply that must consist of exactly one JSON object.
+
+    A surrounding code fence is tolerated; anything else around the object
+    (for example prose before or after it, a common regression after a model
+    swap) makes the parse fail, which is the contract violation being caught.
+
+    Returns the decoded dict, or None when the text is not a lone JSON object.
+    """
+    body = _strip_fence(text)
+    is_braced = body.startswith("{") and body.endswith("}")
+    if not is_braced:
+        return None
+
+    try:
+        parsed = json.loads(body)
+    except (ValueError, TypeError):
+        return None
+
+    if isinstance(parsed, dict):
+        return parsed
+    return None
+
+
+def grade_contract(resp: Response) -> bool:
+    """Return True when the reply text honours the output contract.
+
+    The contract is a lone JSON object whose ``category`` and ``tool`` are
+    strings and whose ``args`` is an object.
+    """
+    parsed = parse_contract(resp.text)
+    if parsed is None:
+        return False
+
+    required_types = (("category", str), ("tool", str), ("args", dict))
+    return all(isinstance(parsed.get(key), kind) for key, kind in required_types)
+
+
+def grade_tool(resp: Response, expected: dict) -> bool:
+    """Return True when the model made the expected tool call.
+
+    The called tool must equal ``expected["tool"]`` (no call counts as the
+    tool "none"), and every expected argument must be present in the call
+    with an equal value. Values are compared as strings, so ``123`` matches
+    ``"123"``. Extra arguments supplied by the model are ignored.
+    """
+    want_tool = expected.get("tool", "none")
+    want_args = expected.get("args", {}) or {}
+    got_tool = resp.tool_name if resp.tool_name is not None else "none"
+    if got_tool != want_tool:
+        return False
+
+    got_args = resp.tool_args or {}
+    # Expected args must be a subset of what the model supplied. Check key
+    # presence explicitly: a missing key would otherwise stringify to "None"
+    # and falsely match an expected value of None / "None".
+    return all(
+        key in got_args and str(got_args[key]) == str(value)
+        for key, value in want_args.items()
+    )
+
+
+def grade_task(resp: Response, expected: dict) -> bool:
+    """Overall success: the reply is contract-valid AND names the right category."""
+    # a reply that breaks the contract (including unparseable text) can never
+    # count as a task success
+    if not grade_contract(resp):
+        return False
+
+    parsed = parse_contract(resp.text)
+    return parsed is not None and parsed.get("category") == expected.get("category")
+
+
+def grade_case(resp: Response, expected: dict) -> dict:
+    """Run all three graders on one response and return their verdicts by name."""
+    contract_ok = grade_contract(resp)
+    tool_ok = grade_tool(resp, expected)
+    task_ok = grade_task(resp, expected)
+    return {"contract": contract_ok, "tool": tool_ok, "task": task_ok}
diff --git a/harness/providers.py b/harness/providers.py
@@ -0,0 +1,171 @@
+"""Model providers for the benchmark harness.
+
+A provider's ``generate(model, system_prompt, case)`` returns a normalized
+``Response``; tool definitions come from the scenario the provider was built
+with. The graders then score that Response, so the scoring pipeline is
+identical regardless of provider.
+
+- ``SimProvider``  : offline, deterministic. No API key. Its outputs are a
+  *simulation* whose only inputs are prompt explicitness and a per-model
+  "literalness" knob (the documented Opus 4.7+ trait: newer models follow
+  instructions more literally, which punishes vague prompts and rewards precise
+  ones). Numbers it produces are illustrative, not measured.
+- ``AnthropicProvider`` : real Messages API calls. Produces real numbers. Needs
+  ``ANTHROPIC_API_KEY`` and ``pip install anthropic``.
+"""
+
+from __future__ import annotations
+
+import json
+import time
+
+from graders import Response
+
+# Simulator formula constants (transparent, tunable). See module docstring.
+# SimProvider._fidelity combines them, for explicitness e and literalness L, as
+#   clamp(base + _BOOST_EXPLICIT*e - _PENALTY_VAGUE*L*(1 - e)
+#         + _BONUS_LITERAL_EXPLICIT*L*e)
+_BOOST_EXPLICIT = 0.30      # precise prompt raises fidelity
+_PENALTY_VAGUE = 0.20       # literal model * vague prompt lowers fidelity
+_BONUS_LITERAL_EXPLICIT = 0.05  # literal model * precise prompt small bonus
+
+
+def _clamp(x: float, lo: float = 0.02, hi: float = 0.95) -> float:
+    """Limit *x* to the closed range [lo, hi]."""
+    capped = x if x < hi else hi
+    return capped if capped > lo else lo
+
+
+def explicitness(prompt: str, scenario: dict) -> dict:
+    """Score how precisely a prompt specifies the output contract and tools.
+
+    Matching is case-insensitive substring search over the prompt.
+
+    Returns a dict of scores in [0, 1]:
+      - ``contract``: fraction of the contract marker phrases present.
+      - ``tool``: 0.6 * fraction of the scenario's tool names mentioned
+        (the "none" placeholder excluded) + 0.4 if "args" is mentioned.
+      - ``task``: mean of the two scores above.
+    """
+    p = prompt.lower()
+    contract_markers = ("only", "json", "schema", "no prose", "do not wrap")
+    c = sum(m in p for m in contract_markers) / len(contract_markers)
+
+    tool_names = [t["name"] for t in scenario["tools"] if t["name"] != "none"]
+    # the prompt was lower-cased above, so lower-case the names too; otherwise
+    # a tool name containing any uppercase letter could never match
+    enum = sum(name.lower() in p for name in tool_names) / max(1, len(tool_names))
+    args_marker = 1.0 if "args" in p else 0.0
+    tool_e = 0.6 * enum + 0.4 * args_marker
+
+    return {"contract": c, "tool": tool_e, "task": (c + tool_e) / 2}
+
+
+class SimProvider:
+    """Offline, deterministic stand-in for a model: same input, same output.
+
+    No network is used. Outcomes are decided by comparing a case's difficulty
+    with a per-dimension "fidelity" derived from prompt explicitness and the
+    model's knobs in ``scenario["sim"]``.
+    """
+
+    def __init__(self, scenario: dict):
+        self.scenario = scenario
+        self.sim = scenario["sim"]
+
+    def _fidelity(self, e: float, literalness: float, base: float) -> float:
+        """Return the clamped fidelity for explicitness *e* and the model knobs."""
+        boost = _BOOST_EXPLICIT * e
+        vague_penalty = _PENALTY_VAGUE * literalness * (1 - e)
+        literal_bonus = _BONUS_LITERAL_EXPLICIT * literalness * e
+        return _clamp(base + boost - vague_penalty + literal_bonus)
+
+    def generate(self, model: str, system_prompt: str, case: dict) -> Response:
+        """Simulate *model* answering *case* under *system_prompt*.
+
+        A dimension (contract, tool, task) succeeds when the case difficulty
+        is at or below that dimension's fidelity; task success additionally
+        requires contract success. The returned text and tool call are built
+        to match those outcomes so the graders re-derive them from strings.
+        """
+        knobs = self.sim[model]
+        literalness = knobs["literalness"]
+        base = knobs["base"]
+        scores = explicitness(system_prompt, self.scenario)
+        difficulty = case["difficulty"]
+        expected = case["expected"]
+
+        fidelity = {
+            dim: self._fidelity(scores[dim], literalness, base)
+            for dim in ("contract", "tool", "task")
+        }
+        contract_ok = difficulty <= fidelity["contract"]
+        tool_ok = difficulty <= fidelity["tool"]
+        task_ok = contract_ok and (difficulty <= fidelity["task"])
+
+        # category: the expected one on success, otherwise a deliberate miss
+        if task_ok:
+            category = expected["category"]
+        else:
+            category = _wrong_category(
+                expected["category"], self.scenario["categories"]
+            )
+
+        # tool call: mirror the expectation on success, invert it on failure
+        wanted_tool = expected["tool"]
+        if tool_ok:
+            tool_name = wanted_tool if wanted_tool != "none" else None
+            tool_args = dict(expected.get("args", {}))
+            payload_tool = wanted_tool
+        else:
+            tool_name = "lookup_order" if wanted_tool == "none" else None
+            tool_args = {}
+            payload_tool = "none"
+
+        payload = {"category": category, "tool": payload_tool, "args": tool_args}
+        encoded = json.dumps(payload)
+        if contract_ok:
+            text = encoded
+        else:
+            # contract regression: prose around the JSON -> strict parse fails
+            text = f"Sure! Based on the ticket, here's my take: {encoded} Hope that helps."
+
+        # rough token estimate: about four characters per token, never zero
+        tokens_in = max(1, (len(system_prompt) + len(case["input"])) // 4)
+        tokens_out = max(1, len(text) // 4)
+        # deterministic latency: model base scaled by a difficulty-driven factor
+        latency_ms = knobs["latency_ms"] * (0.9 + 0.2 * difficulty)
+
+        return Response(
+            text=text,
+            tool_name=tool_name,
+            tool_args=tool_args,
+            latency_ms=latency_ms,
+            tokens_in=tokens_in,
+            tokens_out=tokens_out,
+        )
+
+
+def _wrong_category(correct: str, categories: list[str]) -> str:
+    """Return the first category that differs from *correct*.
+
+    Falls back to *correct* itself when no other category exists.
+    """
+    return next((name for name in categories if name != correct), correct)
+
+
+class AnthropicProvider:
+    """Real Anthropic Messages API backend. Requires ANTHROPIC_API_KEY.
+
+    Each case is sent as a single user message with the scenario's tools
+    (all but the "none" placeholder) offered as tool definitions, and the
+    reply is normalized into a ``Response``.
+    """
+
+    def __init__(self, scenario: dict):
+        import anthropic  # lazy: only needed for real runs
+
+        self.scenario = scenario
+        self.client = anthropic.Anthropic()
+
+    def generate(self, model: str, system_prompt: str, case: dict) -> Response:
+        """Call *model* on ``case["input"]`` and return the normalized reply.
+
+        Text blocks are concatenated in order. If the reply contains several
+        tool-use blocks, the last one is reported. Latency is the elapsed
+        time around the API call, in milliseconds.
+        """
+        tools = [
+            {
+                "name": t["name"],
+                "description": f"{t['name']} tool",
+                "input_schema": {
+                    "type": "object",
+                    # every scenario argument is declared as a required string
+                    "properties": {a: {"type": "string"} for a in t["args"]},
+                    "required": t["args"],
+                },
+            }
+            for t in self.scenario["tools"]
+            if t["name"] != "none"
+        ]
+        # perf_counter is monotonic; time.time() follows the wall clock and
+        # can jump (NTP, manual adjustment), corrupting measured latency
+        start = time.perf_counter()
+        msg = self.client.messages.create(
+            model=model,
+            max_tokens=512,
+            system=system_prompt,
+            tools=tools,
+            messages=[{"role": "user", "content": case["input"]}],
+        )
+        latency_ms = (time.perf_counter() - start) * 1000
+
+        text_parts, tool_name, tool_args = [], None, {}
+        for block in msg.content:
+            if block.type == "text":
+                text_parts.append(block.text)
+            elif block.type == "tool_use":
+                tool_name = block.name
+                tool_args = dict(block.input)
+
+        return Response(
+            text="".join(text_parts),
+            tool_name=tool_name,
+            tool_args=tool_args,
+            latency_ms=latency_ms,
+            tokens_in=msg.usage.input_tokens,
+            tokens_out=msg.usage.output_tokens,
+        )
+
+
+def get_provider(name: str, scenario: dict):
+    """Build the provider registered under *name* for *scenario*.
+
+    Raises ValueError for any name other than 'sim' or 'anthropic'.
+    """
+    registry = (("sim", SimProvider), ("anthropic", AnthropicProvider))
+    for key, provider_cls in registry:
+        if name == key:
+            return provider_cls(scenario)
+    raise ValueError(f"unknown provider: {name!r} (use 'sim' or 'anthropic')")