From 8181cd4d981ec38a9748816c7aa5adde6d6c5729 Mon Sep 17 00:00:00 2001
From: forkadarshp <foradarshpandita@gmail.com>
Date: Fri, 29 May 2026 06:00:07 +0530
Subject: [PATCH] feat: add runnable three-arm benchmark harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A closed-loop evaluator that operationalizes references/benchmarking.md:
runs baseline / naive-swap / ModelPort-enhanced arms over an eval set, grades
output-contract conformance, tool-calling accuracy, and task success, and
prints a leaderboard with attribution (model delta, skill delta, net).

- harness/run.py — orchestrates the three arms + leaderboard
- harness/graders.py — provider-agnostic scoring (real, runs on actual output)
- harness/providers.py — SimProvider (offline, deterministic) + AnthropicProvider
  (real Messages API; needs ANTHROPIC_API_KEY)
- harness/scenarios/support_triage.json — bundled scenario fixture
- harness/tests/ — 9 unit/smoke tests, wired into CI
- harness/README.md — usage + the iterate-on-failures loop

The simulator's numbers are illustrative (driven by prompt explicitness × a
per-model literalness knob); the grading/scoring pipeline is real, so the
Anthropic provider yields measured results with no other changes.
---
 .github/workflows/ci.yml              |   3 +
 .gitignore                            |   3 +
 harness/README.md                     |  69 +++++++++++
 harness/graders.py                    |  88 +++++++++++++
 harness/providers.py                  | 171 ++++++++++++++++++++++++++
 harness/run.py                        | 141 +++++++++++++++++++++
 harness/scenarios/support_triage.json |  36 ++++++
 harness/tests/test_harness.py         |  76 ++++++++++++
 8 files changed, 587 insertions(+)
 create mode 100644 harness/README.md
 create mode 100644 harness/graders.py
 create mode 100644 harness/providers.py
 create mode 100644 harness/run.py
 create mode 100644 harness/scenarios/support_triage.json
 create mode 100644 harness/tests/test_harness.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 69c4803..028f4e9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,3 +22,6 @@ jobs:
         uses: DavidAnson/markdownlint-cli2-action@v17
         with:
           globs: "**/*.md"
+      - name: Test benchmark harness
+        working-directory: harness
+        run: python3 -m unittest discover -s tests -v
diff --git a/.gitignore b/.gitignore
index 77984e3..06a045c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,6 @@ transcript.md
 
 # Internal launch/growth notes (not part of the published skill)
 LAUNCH_PREP.md
+
+# Benchmark harness run output
+harness/results.json
diff --git a/harness/README.md b/harness/README.md
new file mode 100644
index 0000000..f0d35ed
--- /dev/null
+++ b/harness/README.md
@@ -0,0 +1,69 @@
+# Migration Benchmark Harness
+
+A runnable, closed-loop evaluator for the three-arm migration benchmark
+described in [`../references/benchmarking.md`](../references/benchmarking.md).
+Run a scenario, read the leaderboard and the failing cases, tighten the
+enhanced system, and re-run — iterating until the migrated system measurably
+beats both the baseline and a raw model swap.
+
+## The three arms
+
+| Arm | Config | Isolates |
+| --- | --- | --- |
+| A baseline | old model + baseline prompt | starting point |
+| B naive swap | new model + baseline prompt | raw model delta |
+| C ModelPort | new model + enhanced prompt | the skill's added value |
+
+## Quickstart
+
+```bash
+# offline, deterministic — no API key needed
+python3 run.py --provider sim
+
+# real measured numbers — needs ANTHROPIC_API_KEY and `pip install anthropic`
+python3 run.py --provider anthropic
+```
+
+Output: a leaderboard (task success, output-contract conformance, tool-call
+accuracy, p95 latency, cost/req, weighted composite), an attribution line
+(model delta B−A, skill delta C−B, net C−A), and the list of cases the
+ModelPort arm still fails — your to-do list for the next iteration.
+
+## The loop
+
+1. Run the harness.
+2. Read the leaderboard + failing cases.
+3. Tighten the enhanced prompt/config for the failing dimension(s).
+4. Re-run; confirm the composite and the skill delta (C−B) went up.
+
+This is exactly how the bundled scenario was tuned: a vague enhanced prompt
+scored a **negative** skill delta (it cost more without lifting quality); making
+the contract explicit (JSON-only, enumerated schema, no prose) moved ModelPort
+from last place to a clear win.
+
+## Providers
+
+- **`sim`** — offline and deterministic. Outputs are a *simulation* driven only
+  by prompt explicitness and a per-model "literalness" knob (the documented
+  Opus 4.7+ trait: newer models follow instructions more literally, punishing
+  vague prompts and rewarding precise ones). The grading, scoring, and
+  leaderboard pipeline is real; **the numbers are illustrative, not measured.**
+- **`anthropic`** — real Messages API calls. Same graders, real numbers.
+
+## Define your own scenario
+
+Copy `scenarios/support_triage.json` and edit:
+
+- `models` — old/new model IDs
+- `prompts.baseline` / `prompts.enhanced` — the configs for arms A/B vs C
+- `tools`, `categories` — the task surface
+- `eval_cases` — inputs + expected `{category, tool, args}` and a `difficulty`
+- `sim` — per-model simulator knobs; `price_in`/`price_out` also price `anthropic` runs
+
+## Files
+
+- `run.py` — orchestrates the three arms, grades, prints the leaderboard
+- `graders.py` — provider-agnostic scoring (contract, tool, task)
+- `providers.py` — `SimProvider` (offline) + `AnthropicProvider` (real)
+- `scenarios/` — scenario fixtures
+- `tests/` — `python3 -m unittest discover -s tests`
diff --git a/harness/graders.py b/harness/graders.py
new file mode 100644
index 0000000..6babb25
--- /dev/null
+++ b/harness/graders.py
@@ -0,0 +1,88 @@
+"""Provider-agnostic graders for the migration benchmark harness.
+
+These score a model Response against a case's expected outcome. They run on the
+real text/tool-call a model returns, so the same graders judge both the
+simulated provider and the real Anthropic provider.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Response:
+    """Normalized model output, independent of which provider produced it."""
+
+    text: str = ""
+    tool_name: str | None = None
+    tool_args: dict = field(default_factory=dict)
+    latency_ms: float = 0.0
+    tokens_in: int = 0
+    tokens_out: int = 0
+
+
+def _strip_fence(text: str) -> str:
+    t = text.strip()
+    if t.startswith("```"):
+        # drop the opening fence line and a trailing fence if present
+        t = t.split("\n", 1)[1] if "\n" in t else ""
+        if t.rstrip().endswith("```"):
+            t = t.rstrip()[:-3]
+    return t.strip()
+
+
+def parse_contract(text: str) -> dict | None:
+    """Strict parse: the whole message must be a single JSON object.
+
+    One wrapping code fence is stripped first; other prose around the JSON (a
+    common regression after a model swap) returns None — the violation to catch.
+    """
+    candidate = _strip_fence(text)
+    if not (candidate.startswith("{") and candidate.endswith("}")):
+        return None
+    try:
+        obj = json.loads(candidate)
+    except (ValueError, TypeError):
+        return None
+    return obj if isinstance(obj, dict) else None
+
+
+def grade_contract(resp: Response) -> bool:
+    obj = parse_contract(resp.text)
+    if obj is None:
+        return False
+    return (
+        isinstance(obj.get("category"), str)
+        and isinstance(obj.get("tool"), str)
+        and isinstance(obj.get("args"), dict)
+    )
+
+
+def grade_tool(resp: Response, expected: dict) -> bool:
+    want_tool = expected.get("tool", "none")
+    want_args = expected.get("args", {}) or {}
+    got_tool = resp.tool_name if resp.tool_name is not None else "none"
+    if got_tool != want_tool:
+        return False
+    # expected args must be a subset of what the model supplied
+    return all(str(resp.tool_args.get(k)) == str(v) for k, v in want_args.items())
+
+
+def grade_task(resp: Response, expected: dict) -> bool:
+    """Overall success: valid contract AND the right category."""
+    obj = parse_contract(resp.text)
+    if obj is None:
+        return False
+    if not grade_contract(resp):
+        return False
+    return obj.get("category") == expected.get("category")
+
+
+def grade_case(resp: Response, expected: dict) -> dict:
+    return {
+        "contract": grade_contract(resp),
+        "tool": grade_tool(resp, expected),
+        "task": grade_task(resp, expected),
+    }
diff --git a/harness/providers.py b/harness/providers.py
new file mode 100644
index 0000000..4b43401
--- /dev/null
+++ b/harness/providers.py
@@ -0,0 +1,171 @@
+"""Model providers for the benchmark harness.
+
+A provider turns (model, system_prompt, tools, case) into a normalized
+``Response``. The graders then score that Response, so the scoring pipeline is
+identical regardless of provider.
+
+- ``SimProvider``  : offline, deterministic. No API key. Its outputs are a
+  *simulation* whose only inputs are prompt explicitness and a per-model
+  "literalness" knob (the documented Opus 4.7+ trait: newer models follow
+  instructions more literally, which punishes vague prompts and rewards precise
+  ones). Numbers it produces are illustrative, not measured.
+- ``AnthropicProvider`` : real Messages API calls. Produces real numbers. Needs
+  ``ANTHROPIC_API_KEY`` and ``pip install anthropic``.
+"""
+
+from __future__ import annotations
+
+import json
+import time
+
+from graders import Response
+
+# Simulator formula constants (transparent, tunable). See module docstring.
+_BOOST_EXPLICIT = 0.30      # precise prompt raises fidelity
+_PENALTY_VAGUE = 0.20       # literal model * vague prompt lowers fidelity
+_BONUS_LITERAL_EXPLICIT = 0.05  # literal model * precise prompt small bonus
+
+
+def _clamp(x: float, lo: float = 0.02, hi: float = 0.95) -> float:
+    return max(lo, min(hi, x))
+
+
+def explicitness(prompt: str, scenario: dict) -> dict:
+    """Score how precisely a prompt specifies the output contract and tools."""
+    p = prompt.lower()
+    contract_markers = ["only", "json", "schema", "no prose", "do not wrap"]
+    c = sum(m in p for m in contract_markers) / len(contract_markers)
+
+    tool_names = [t["name"] for t in scenario["tools"] if t["name"] != "none"]
+    enum = sum(name in p for name in tool_names) / max(1, len(tool_names))
+    args_marker = 1.0 if "args" in p else 0.0
+    tool_e = 0.6 * enum + 0.4 * args_marker
+
+    return {"contract": c, "tool": tool_e, "task": (c + tool_e) / 2}
+
+
+class SimProvider:
+    """Deterministic simulator. Same input -> same output, no network."""
+
+    def __init__(self, scenario: dict):
+        self.scenario = scenario
+        self.sim = scenario["sim"]
+
+    def _fidelity(self, e: float, literalness: float, base: float) -> float:
+        return _clamp(
+            base
+            + _BOOST_EXPLICIT * e
+            - _PENALTY_VAGUE * literalness * (1 - e)
+            + _BONUS_LITERAL_EXPLICIT * literalness * e
+        )
+
+    def generate(self, model: str, system_prompt: str, case: dict) -> Response:
+        params = self.sim[model]
+        L, base = params["literalness"], params["base"]
+        e = explicitness(system_prompt, self.scenario)
+        d = case["difficulty"]
+        expected = case["expected"]
+
+        fid = {m: self._fidelity(e[m], L, base) for m in ("contract", "tool", "task")}
+        contract_ok = d <= fid["contract"]
+        tool_ok = d <= fid["tool"]
+        task_ok = contract_ok and (d <= fid["task"])
+
+        # craft text + tool-call consistent with the chosen outcomes, so the
+        # real graders re-derive these scores from actual strings.
+        category = expected["category"] if task_ok else _wrong_category(
+            expected["category"], self.scenario["categories"]
+        )
+        if tool_ok:
+            tool_name = None if expected["tool"] == "none" else expected["tool"]
+            tool_args = dict(expected.get("args", {}))
+        else:
+            tool_name = None if expected["tool"] != "none" else "lookup_order"
+            tool_args = {}
+
+        payload = {"category": category, "tool": expected["tool"] if tool_ok else "none", "args": tool_args}
+        if contract_ok:
+            text = json.dumps(payload)
+        else:
+            # contract regression: prose around the JSON -> strict parse fails
+            text = f"Sure! Based on the ticket, here's my take: {json.dumps(payload)} Hope that helps."
+
+        tokens_in = max(1, (len(system_prompt) + len(case["input"])) // 4)
+        tokens_out = max(1, len(text) // 4)
+        # deterministic latency: model base + tiny difficulty-driven jitter
+        latency_ms = params["latency_ms"] * (0.9 + 0.2 * d)
+
+        return Response(
+            text=text,
+            tool_name=tool_name,
+            tool_args=tool_args,
+            latency_ms=latency_ms,
+            tokens_in=tokens_in,
+            tokens_out=tokens_out,
+        )
+
+
+def _wrong_category(correct: str, categories: list[str]) -> str:
+    for c in categories:
+        if c != correct:
+            return c
+    return correct
+
+
+class AnthropicProvider:
+    """Real Anthropic Messages API backend. Requires ANTHROPIC_API_KEY."""
+
+    def __init__(self, scenario: dict):
+        import anthropic  # lazy: only needed for real runs
+
+        self.scenario = scenario
+        self.client = anthropic.Anthropic()
+
+    def generate(self, model: str, system_prompt: str, case: dict) -> Response:
+        """Send one eval case to ``model`` and normalize the reply into a Response."""
+        tools = [
+            {
+                "name": t["name"],
+                "description": f"{t['name']} tool",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {a: {"type": "string"} for a in t["args"]},
+                    "required": t["args"],
+                },
+            }
+            for t in self.scenario["tools"]
+            if t["name"] != "none"  # the "none" entry stands for "no tool call"
+        ]
+        start = time.perf_counter()  # monotonic: immune to wall-clock adjustments
+        msg = self.client.messages.create(
+            model=model,
+            max_tokens=512,
+            system=system_prompt,
+            tools=tools,
+            messages=[{"role": "user", "content": case["input"]}],
+        )
+        latency_ms = (time.perf_counter() - start) * 1000
+        text, tool_name, tool_args = "", None, {}
+        for block in msg.content:
+            if block.type == "text":
+                text += block.text
+            elif block.type == "tool_use":  # if several tool calls arrive, the last one wins
+                tool_name = block.name
+                tool_args = dict(block.input)
+
+        return Response(
+            text=text,
+            tool_name=tool_name,
+            tool_args=tool_args,
+            latency_ms=latency_ms,
+            tokens_in=msg.usage.input_tokens,
+            tokens_out=msg.usage.output_tokens,
+        )
+
+
+def get_provider(name: str, scenario: dict):
+    if name == "sim":
+        return SimProvider(scenario)
+    if name == "anthropic":
+        return AnthropicProvider(scenario)
+    raise ValueError(f"unknown provider: {name!r} (use 'sim' or 'anthropic')")
diff --git a/harness/run.py b/harness/run.py
new file mode 100644
index 0000000..c4604f8
--- /dev/null
+++ b/harness/run.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Run the three-arm migration benchmark and print a leaderboard.
+
+Arms (same eval set, same graders):
+  A baseline    = old model + baseline prompt
+  B naive swap  = new model + baseline prompt   (raw model delta)
+  C ModelPort   = new model + enhanced prompt    (skill's added value)
+
+Usage:
+  python run.py --scenario scenarios/support_triage.json --provider sim
+  python run.py --provider anthropic   # real numbers; needs ANTHROPIC_API_KEY
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from pathlib import Path
+
+from graders import grade_case
+from providers import get_provider
+
+
+def run_arm(provider, model, prompt, scenario, prices):
+    """Run every eval case through one arm; return mean scores, latency, cost, and failures."""
+    cases, rows = scenario["eval_cases"], []
+    lat, costs, fails = [], [], []
+    for case in cases:
+        resp = provider.generate(model, prompt, case)
+        scores = grade_case(resp, case["expected"])
+        lat.append(resp.latency_ms)
+        costs.append(resp.tokens_in * prices["price_in"] + resp.tokens_out * prices["price_out"])
+        rows.append(scores)
+        if not all(scores.values()):
+            fails.append((case["id"], scores))
+    n = len(cases)
+    return {
+        "task": sum(r["task"] for r in rows) / n,
+        "contract": sum(r["contract"] for r in rows) / n,
+        "tool": sum(r["tool"] for r in rows) / n,
+        "p50_ms": statistics.median(lat),
+        "p95_ms": sorted(lat)[(95 * n + 99) // 100 - 1],  # nearest-rank: ceil(0.95 * n)-th smallest
+        "cost": sum(costs) / n,
+        "fails": fails,
+    }
+
+
+def composite(arms):
+    """0.50 quality + 0.30 cost-efficiency + 0.20 speed.
+
+    quality = mean(task, contract, tool). cost/speed are scored *relative to the
+    baseline arm* and bounded, so a small absolute cost/latency difference only
+    moves the score a little — quality stays dominant. (Min-max normalization
+    across arms is deliberately avoided: with near-equal costs it amplifies
+    rounding into a full-weight swing and can rank a worse arm first.)
+    """
+    base = arms[0]
+
+    def contrib(factor: float) -> float:  # factor > 1 == better than baseline
+        return max(0.0, min(1.0, 0.5 + 0.5 * (factor - 1.0)))
+
+    out = []
+    for a in arms:
+        quality = (a["task"] + a["contract"] + a["tool"]) / 3
+        cost_factor = base["cost"] / a["cost"] if a["cost"] else 1.0
+        speed_factor = base["p95_ms"] / a["p95_ms"] if a["p95_ms"] else 1.0
+        out.append(0.5 * quality + 0.3 * contrib(cost_factor) + 0.2 * contrib(speed_factor))
+    return out
+
+
+def leaderboard(arms):
+    M, V = 22, 12
+    bar = lambda l, m, r: l + "─" * M + m + ("─" * V + m) * 2 + "─" * V + r
+    def hrow(c0, c1, c2, c3):
+        cells = [" " + c0.ljust(M - 1)] + [" " + c.ljust(V - 1) for c in (c1, c2, c3)]
+        return "│" + cells[0] + "│" + cells[1] + "│" + cells[2] + "│" + cells[3] + "│"
+    def row(label, a, b, c):
+        cells = [label.ljust(M)] + [x.rjust(V - 1) + " " for x in (a, b, c)]
+        return "│" + cells[0] + "│" + cells[1] + "│" + cells[2] + "│" + cells[3] + "│"
+
+    pct = lambda x: f"{round(x * 100)}%"
+    sec = lambda ms: f"{ms / 1000:.1f}s"
+    usd = lambda c: f"${c:.4f}"
+    comp = composite(arms)
+
+    out = [bar("╭", "┬", "╮"),
+           hrow("metric", "baseline", "naive swap", "ModelPort"),
+           hrow("", "(old/old)", "(new/old)", "(new/enh.)"),
+           bar("├", "┼", "┤"),
+           row("  task success", *[pct(a["task"]) for a in arms]),
+           row("  output contract", *[pct(a["contract"]) for a in arms]),
+           row("  tool-call accuracy", *[pct(a["tool"]) for a in arms]),
+           row("  p95 latency", *[sec(a["p95_ms"]) for a in arms]),
+           row("  cost / req", *[usd(a["cost"]) for a in arms]),
+           bar("├", "┼", "┤"),
+           row("  composite (50/30/20)", *[f"{x:.2f}" for x in comp]),
+           bar("╰", "┴", "╯")]
+    return "\n".join(out), comp
+
+
+def main():
+    """Parse CLI args, run the three arms, print the leaderboard, and write results JSON."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--scenario", default=str(Path(__file__).parent / "scenarios/support_triage.json"))
+    ap.add_argument("--provider", default="sim", choices=["sim", "anthropic"])
+    ap.add_argument("--out", default=str(Path(__file__).parent / "results.json"))
+    args = ap.parse_args()
+    scenario = json.loads(Path(args.scenario).read_text(encoding="utf-8"))  # not the locale default
+    provider = get_provider(args.provider, scenario)
+    old, new = scenario["models"]["old"], scenario["models"]["new"]
+    base_p, enh_p = scenario["prompts"]["baseline"], scenario["prompts"]["enhanced"]
+
+    arm_specs = [("A baseline", old, base_p), ("B naive swap", new, base_p), ("C ModelPort", new, enh_p)]
+    # cost prices are read from scenario["sim"][model] for every provider, not only the simulator
+    arms = [run_arm(provider, m, p, scenario, scenario["sim"][m]) for _, m, p in arm_specs]
+    table, comp = leaderboard(arms)
+    print(f"\nScenario: {scenario['name']}  |  provider: {args.provider}  |  n={len(scenario['eval_cases'])}")
+    if args.provider == "sim":
+        print("(simulated numbers — illustrative; run --provider anthropic for measured results)")
+    print(table)
+    print(f"\nattribution:  model delta (B−A) {comp[1] - comp[0]:+.2f}   "
+          f"skill delta (C−B) {comp[2] - comp[1]:+.2f}   net (C−A) {comp[2] - comp[0]:+.2f}")
+
+    cfails = arms[2]["fails"]
+    if cfails:
+        broken = sorted({k for _, s in cfails for k, ok in s.items() if not ok})
+        print(f"\nModelPort arm still failing {len(cfails)} case(s) on: {', '.join(broken)}")
+        print("  cases: " + ", ".join(cid for cid, _ in cfails))
+        print("  -> next iteration: tighten the enhanced prompt for the failing dimension(s).")
+    else:
+        print("\nModelPort arm passes every case. ")
+
+    Path(args.out).write_text(json.dumps(
+        {"scenario": scenario["name"], "provider": args.provider,
+         "arms": {spec[0]: {k: v for k, v in a.items() if k != "fails"} for spec, a in zip(arm_specs, arms)},
+         "composite": dict(zip([s[0] for s in arm_specs], comp))}, indent=2), encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/harness/scenarios/support_triage.json b/harness/scenarios/support_triage.json
new file mode 100644
index 0000000..b63a3df
--- /dev/null
+++ b/harness/scenarios/support_triage.json
@@ -0,0 +1,36 @@
+{
+  "name": "support-ticket-triage",
+  "description": "Classify a support ticket, optionally call one tool, and return a strict JSON object. Exercises output-contract conformance and tool-calling accuracy across a model migration.",
+  "models": { "old": "claude-opus-4-6", "new": "claude-opus-4-8" },
+  "categories": ["billing", "technical", "account", "other"],
+  "tools": [
+    { "name": "lookup_order", "args": ["order_id"] },
+    { "name": "issue_refund", "args": ["order_id"] },
+    { "name": "reset_password", "args": ["email"] },
+    { "name": "none", "args": [] }
+  ],
+  "prompts": {
+    "baseline": "You are a helpful support assistant. Read the customer ticket and decide how to handle it. Tell me the category and whether a tool is needed.",
+    "enhanced": "You are a support-triage assistant. Return ONLY a JSON object and no prose. Schema: {\"category\": one of billing|technical|account|other, \"tool\": one of lookup_order|issue_refund|reset_password|none, \"args\": object}. Do not wrap the JSON in code fences or commentary. Choose exactly one category and exactly one tool from the lists above."
+  },
+  "eval_cases": [
+    { "id": "c01", "input": "I was charged twice for order 4471, please refund one.", "difficulty": 0.15, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "4471" } } },
+    { "id": "c02", "input": "I forgot my password and can't log in. Email is sam@acme.io", "difficulty": 0.2, "expected": { "category": "account", "tool": "reset_password", "args": { "email": "sam@acme.io" } } },
+    { "id": "c03", "input": "Where is my package for order 9920?", "difficulty": 0.3, "expected": { "category": "other", "tool": "lookup_order", "args": { "order_id": "9920" } } },
+    { "id": "c04", "input": "The checkout page throws a 500 error when I click pay.", "difficulty": 0.45, "expected": { "category": "technical", "tool": "none", "args": {} } },
+    { "id": "c05", "input": "Refund my duplicate charge on order 1203.", "difficulty": 0.25, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "1203" } } },
+    { "id": "c06", "input": "Can you check the status of order 5560?", "difficulty": 0.4, "expected": { "category": "other", "tool": "lookup_order", "args": { "order_id": "5560" } } },
+    { "id": "c07", "input": "My account is locked after too many login attempts; reset for jo@x.com", "difficulty": 0.55, "expected": { "category": "account", "tool": "reset_password", "args": { "email": "jo@x.com" } } },
+    { "id": "c08", "input": "The mobile app crashes on startup since the update.", "difficulty": 0.6, "expected": { "category": "technical", "tool": "none", "args": {} } },
+    { "id": "c09", "input": "I think I was overcharged but I'm not sure, can you look at order 7781?", "difficulty": 0.7, "expected": { "category": "billing", "tool": "lookup_order", "args": { "order_id": "7781" } } },
+    { "id": "c10", "input": "Just wanted to say your support team is great!", "difficulty": 0.5, "expected": { "category": "other", "tool": "none", "args": {} } },
+    { "id": "c11", "input": "Password reset link expired, send a new one to me@dev.io please", "difficulty": 0.8, "expected": { "category": "account", "tool": "reset_password", "args": { "email": "me@dev.io" } } },
+    { "id": "c12", "input": "Payment failed three times and now I have three pending charges on order 3030.", "difficulty": 0.9, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "3030" } } },
+    { "id": "c13", "input": "Charged for a subscription I cancelled (order 8800), I want a refund.", "difficulty": 0.95, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "8800" } } },
+    { "id": "c14", "input": "App shows a white screen, reinstalled twice, still broken.", "difficulty": 0.97, "expected": { "category": "technical", "tool": "none", "args": {} } }
+  ],
+  "sim": {
+    "claude-opus-4-6": { "literalness": 0.5, "base": 0.72, "latency_ms": 3800, "price_in": 0.000005, "price_out": 0.000025 },
+    "claude-opus-4-8": { "literalness": 0.9, "base": 0.76, "latency_ms": 2300, "price_in": 0.000005, "price_out": 0.000025 }
+  }
+}
diff --git a/harness/tests/test_harness.py b/harness/tests/test_harness.py
new file mode 100644
index 0000000..00f1ee0
--- /dev/null
+++ b/harness/tests/test_harness.py
@@ -0,0 +1,76 @@
+"""Unit + smoke tests for the benchmark harness. Run: python3 -m unittest."""
+
+import json
+import sys
+import unittest
+from pathlib import Path
+
+HARNESS = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(HARNESS))
+
+from graders import Response, grade_contract, grade_task, grade_tool  # noqa: E402
+from providers import SimProvider, explicitness  # noqa: E402
+import run  # noqa: E402
+
+
+class TestGraders(unittest.TestCase):
+    def test_strict_json_passes_contract(self):
+        r = Response(text='{"category": "billing", "tool": "none", "args": {}}')
+        self.assertTrue(grade_contract(r))
+
+    def test_prose_wrapped_json_fails_contract(self):
+        r = Response(text='Sure! Here you go: {"category": "billing", "tool": "none", "args": {}} thanks')
+        self.assertFalse(grade_contract(r))
+
+    def test_code_fenced_json_passes(self):
+        r = Response(text='```json\n{"category": "account", "tool": "none", "args": {}}\n```')
+        self.assertTrue(grade_contract(r))
+
+    def test_tool_match_and_args_subset(self):
+        r = Response(tool_name="issue_refund", tool_args={"order_id": "4471", "extra": "x"})
+        self.assertTrue(grade_tool(r, {"tool": "issue_refund", "args": {"order_id": "4471"}}))
+
+    def test_tool_name_mismatch_fails(self):
+        r = Response(tool_name="lookup_order", tool_args={"order_id": "4471"})
+        self.assertFalse(grade_tool(r, {"tool": "issue_refund", "args": {"order_id": "4471"}}))
+
+    def test_tool_none_expected(self):
+        r = Response(tool_name=None)
+        self.assertTrue(grade_tool(r, {"tool": "none", "args": {}}))
+
+    def test_task_requires_correct_category(self):
+        ok = Response(text='{"category": "billing", "tool": "none", "args": {}}')
+        wrong = Response(text='{"category": "technical", "tool": "none", "args": {}}')
+        self.assertTrue(grade_task(ok, {"category": "billing"}))
+        self.assertFalse(grade_task(wrong, {"category": "billing"}))
+
+
+class TestSimAndLoop(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.scenario = json.loads((HARNESS / "scenarios/support_triage.json").read_text())
+
+    def test_enhanced_prompt_is_more_explicit(self):
+        e_base = explicitness(self.scenario["prompts"]["baseline"], self.scenario)
+        e_enh = explicitness(self.scenario["prompts"]["enhanced"], self.scenario)
+        self.assertGreater(e_enh["contract"], e_base["contract"])
+        self.assertGreater(e_enh["task"], e_base["task"])
+
+    def test_modelport_arm_beats_baseline(self):
+        sc = self.scenario
+        prov = SimProvider(sc)
+        old, new = sc["models"]["old"], sc["models"]["new"]
+        arms = [
+            run.run_arm(prov, old, sc["prompts"]["baseline"], sc, sc["sim"][old]),
+            run.run_arm(prov, new, sc["prompts"]["baseline"], sc, sc["sim"][new]),
+            run.run_arm(prov, new, sc["prompts"]["enhanced"], sc, sc["sim"][new]),
+        ]
+        comp = run.composite(arms)
+        # enhanced arm should win on quality and on the composite
+        self.assertGreater(arms[2]["task"], arms[0]["task"])
+        self.assertGreater(comp[2], comp[0])
+        self.assertGreater(comp[2], comp[1])
+
+
+if __name__ == "__main__":
+    unittest.main()