From 8181cd4d981ec38a9748816c7aa5adde6d6c5729 Mon Sep 17 00:00:00 2001 From: forkadarshp Date: Fri, 29 May 2026 06:00:07 +0530 Subject: [PATCH] feat: add runnable three-arm benchmark harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A closed-loop evaluator that operationalizes references/benchmarking.md: runs baseline / naive-swap / ModelPort-enhanced arms over an eval set, grades output-contract conformance, tool-calling accuracy, and task success, and prints a leaderboard with attribution (model delta, skill delta, net). - harness/run.py — orchestrates the three arms + leaderboard - harness/graders.py — provider-agnostic scoring (real, runs on actual output) - harness/providers.py — SimProvider (offline, deterministic) + AnthropicProvider (real Messages API; needs ANTHROPIC_API_KEY) - harness/scenarios/support_triage.json — bundled scenario fixture - harness/tests/ — 9 unit/smoke tests, wired into CI - harness/README.md — usage + the iterate-on-failures loop The simulator's numbers are illustrative (driven by prompt explicitness × a per-model literalness knob); the grading/scoring pipeline is real, so the Anthropic provider yields measured results with no other changes. --- .github/workflows/ci.yml | 3 + .gitignore | 3 + harness/README.md | 69 +++++++++++ harness/graders.py | 88 +++++++++++++ harness/providers.py | 171 ++++++++++++++++++++++++++ harness/run.py | 141 +++++++++++++++++++++ harness/scenarios/support_triage.json | 36 ++++++ harness/tests/test_harness.py | 76 ++++++++++++ 8 files changed, 587 insertions(+) create mode 100644 harness/README.md create mode 100644 harness/graders.py create mode 100644 harness/providers.py create mode 100644 harness/run.py create mode 100644 harness/scenarios/support_triage.json create mode 100644 harness/tests/test_harness.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 69c4803..028f4e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,3 +22,6 @@ jobs: uses: DavidAnson/markdownlint-cli2-action@v17 with: globs: "**/*.md" + - name: Test benchmark harness + working-directory: harness + run: python3 -m unittest discover -s tests -v diff --git a/.gitignore b/.gitignore index 77984e3..06a045c 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ transcript.md # Internal launch/growth notes (not part of the published skill) LAUNCH_PREP.md + +# Benchmark harness run output +harness/results.json diff --git a/harness/README.md b/harness/README.md new file mode 100644 index 0000000..f0d35ed --- /dev/null +++ b/harness/README.md @@ -0,0 +1,69 @@ +# Migration Benchmark Harness + +A runnable, closed-loop evaluator for the three-arm migration benchmark +described in [`../references/benchmarking.md`](../references/benchmarking.md). +Run a scenario, read the leaderboard and the failing cases, tighten the +enhanced system, and re-run — iterating until the migrated system measurably +beats both the baseline and a raw model swap. + +## The three arms + +| Arm | Config | Isolates | +| --- | --- | --- | +| A baseline | old model + baseline prompt | starting point | +| B naive swap | new model + baseline prompt | raw model delta | +| C ModelPort | new model + enhanced prompt | the skill's added value | + +## Quickstart + +```bash +# offline, deterministic — no API key needed +python3 run.py --provider sim + +# real measured numbers — needs ANTHROPIC_API_KEY and `pip install anthropic` +python3 run.py --provider anthropic +``` + +Output: a leaderboard (task success, output-contract conformance, tool-call +accuracy, p95 latency, cost/req, weighted composite), an attribution line +(model delta B−A, skill delta C−B, net C−A), and the list of cases the +ModelPort arm still fails — your to-do list for the next iteration. + +## The loop + +1. Run the harness. +2. Read the leaderboard + failing cases. +3. Tighten the enhanced prompt/config for the failing dimension(s). +4. Re-run; confirm the composite and the skill delta (C−B) went up. + +This is exactly how the bundled scenario was tuned: a vague enhanced prompt +scored a **negative** skill delta (it cost more without lifting quality); making +the contract explicit (JSON-only, enumerated schema, no prose) moved ModelPort +from last place to a clear win. + +## Providers + +- **`sim`** — offline and deterministic. Outputs are a *simulation* driven only + by prompt explicitness and a per-model "literalness" knob (the documented + Opus 4.7+ trait: newer models follow instructions more literally, punishing + vague prompts and rewarding precise ones). The grading, scoring, and + leaderboard pipeline is real; **the numbers are illustrative, not measured.** +- **`anthropic`** — real Messages API calls. Same graders, real numbers. + +## Define your own scenario + +Copy `scenarios/support_triage.json` and edit: + +- `models` — old/new model IDs +- `prompts.baseline` / `prompts.enhanced` — the configs for arms A/B vs C +- `tools`, `categories` — the task surface +- `eval_cases` — inputs + expected `{category, tool, args}` and a `difficulty` +- `sim` — per-model knobs for the offline simulator (ignored by `--provider anthropic`) + +## Files + +- `run.py` — orchestrates the three arms, grades, prints the leaderboard +- `graders.py` — provider-agnostic scoring (contract, tool, task) +- `providers.py` — `SimProvider` (offline) + `AnthropicProvider` (real) +- `scenarios/` — scenario fixtures +- `tests/` — `python3 -m unittest discover -s tests` diff --git a/harness/graders.py b/harness/graders.py new file mode 100644 index 0000000..6babb25 --- /dev/null +++ b/harness/graders.py @@ -0,0 +1,88 @@ +"""Provider-agnostic graders for the migration benchmark harness. + +These score a model Response against a case's expected outcome. They run on the +real text/tool-call a model returns, so the same graders judge both the +simulated provider and the real Anthropic provider. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field + + +@dataclass +class Response: + """Normalized model output, independent of which provider produced it.""" + + text: str = "" + tool_name: str | None = None + tool_args: dict = field(default_factory=dict) + latency_ms: float = 0.0 + tokens_in: int = 0 + tokens_out: int = 0 + + +def _strip_fence(text: str) -> str: + t = text.strip() + if t.startswith("```"): + # drop the opening fence line and a trailing fence if present + t = t.split("\n", 1)[1] if "\n" in t else "" + if t.rstrip().endswith("```"): + t = t.rstrip()[:-3] + return t.strip() + + +def parse_contract(text: str) -> dict | None: + """Strict parse: the whole message must be a single JSON object. + + Prose around the JSON (a common regression after a model swap) fails here, + which is exactly the contract violation we want to catch. + """ + candidate = _strip_fence(text) + if not (candidate.startswith("{") and candidate.endswith("}")): + return None + try: + obj = json.loads(candidate) + except (ValueError, TypeError): + return None + return obj if isinstance(obj, dict) else None + + +def grade_contract(resp: Response) -> bool: + obj = parse_contract(resp.text) + if obj is None: + return False + return ( + isinstance(obj.get("category"), str) + and isinstance(obj.get("tool"), str) + and isinstance(obj.get("args"), dict) + ) + + +def grade_tool(resp: Response, expected: dict) -> bool: + want_tool = expected.get("tool", "none") + want_args = expected.get("args", {}) or {} + got_tool = resp.tool_name if resp.tool_name is not None else "none" + if got_tool != want_tool: + return False + # expected args must be a subset of what the model supplied + return all(str(resp.tool_args.get(k)) == str(v) for k, v in want_args.items()) + + +def grade_task(resp: Response, expected: dict) -> bool: + """Overall success: valid contract AND the right category.""" + obj = parse_contract(resp.text) + if obj is None: + return False + if not grade_contract(resp): + return False + return obj.get("category") == expected.get("category") + + +def grade_case(resp: Response, expected: dict) -> dict: + return { + "contract": grade_contract(resp), + "tool": grade_tool(resp, expected), + "task": grade_task(resp, expected), + } diff --git a/harness/providers.py b/harness/providers.py new file mode 100644 index 0000000..4b43401 --- /dev/null +++ b/harness/providers.py @@ -0,0 +1,171 @@ +"""Model providers for the benchmark harness. + +A provider turns (model, system_prompt, tools, case) into a normalized +``Response``. The graders then score that Response, so the scoring pipeline is +identical regardless of provider. + +- ``SimProvider`` : offline, deterministic. No API key. Its outputs are a + *simulation* whose only inputs are prompt explicitness and a per-model + "literalness" knob (the documented Opus 4.7+ trait: newer models follow + instructions more literally, which punishes vague prompts and rewards precise + ones). Numbers it produces are illustrative, not measured. +- ``AnthropicProvider`` : real Messages API calls. Produces real numbers. Needs + ``ANTHROPIC_API_KEY`` and ``pip install anthropic``. +""" + +from __future__ import annotations + +import json +import time + +from graders import Response + +# Simulator formula constants (transparent, tunable). See module docstring. +_BOOST_EXPLICIT = 0.30 # precise prompt raises fidelity +_PENALTY_VAGUE = 0.20 # literal model * vague prompt lowers fidelity +_BONUS_LITERAL_EXPLICIT = 0.05 # literal model * precise prompt small bonus + + +def _clamp(x: float, lo: float = 0.02, hi: float = 0.95) -> float: + return max(lo, min(hi, x)) + + +def explicitness(prompt: str, scenario: dict) -> dict: + """Score how precisely a prompt specifies the output contract and tools.""" + p = prompt.lower() + contract_markers = ["only", "json", "schema", "no prose", "do not wrap"] + c = sum(m in p for m in contract_markers) / len(contract_markers) + + tool_names = [t["name"] for t in scenario["tools"] if t["name"] != "none"] + enum = sum(name in p for name in tool_names) / max(1, len(tool_names)) + args_marker = 1.0 if "args" in p else 0.0 + tool_e = 0.6 * enum + 0.4 * args_marker + + return {"contract": c, "tool": tool_e, "task": (c + tool_e) / 2} + + +class SimProvider: + """Deterministic simulator. Same input -> same output, no network.""" + + def __init__(self, scenario: dict): + self.scenario = scenario + self.sim = scenario["sim"] + + def _fidelity(self, e: float, literalness: float, base: float) -> float: + return _clamp( + base + + _BOOST_EXPLICIT * e + - _PENALTY_VAGUE * literalness * (1 - e) + + _BONUS_LITERAL_EXPLICIT * literalness * e + ) + + def generate(self, model: str, system_prompt: str, case: dict) -> Response: + params = self.sim[model] + L, base = params["literalness"], params["base"] + e = explicitness(system_prompt, self.scenario) + d = case["difficulty"] + expected = case["expected"] + + fid = {m: self._fidelity(e[m], L, base) for m in ("contract", "tool", "task")} + contract_ok = d <= fid["contract"] + tool_ok = d <= fid["tool"] + task_ok = contract_ok and (d <= fid["task"]) + + # craft text + tool-call consistent with the chosen outcomes, so the + # real graders re-derive these scores from actual strings. + category = expected["category"] if task_ok else _wrong_category( + expected["category"], self.scenario["categories"] + ) + if tool_ok: + tool_name = None if expected["tool"] == "none" else expected["tool"] + tool_args = dict(expected.get("args", {})) + else: + tool_name = None if expected["tool"] != "none" else "lookup_order" + tool_args = {} + + payload = {"category": category, "tool": expected["tool"] if tool_ok else "none", "args": tool_args} + if contract_ok: + text = json.dumps(payload) + else: + # contract regression: prose around the JSON -> strict parse fails + text = f"Sure! Based on the ticket, here's my take: {json.dumps(payload)} Hope that helps." + + tokens_in = max(1, (len(system_prompt) + len(case["input"])) // 4) + tokens_out = max(1, len(text) // 4) + # deterministic latency: model base + tiny difficulty-driven jitter + latency_ms = params["latency_ms"] * (0.9 + 0.2 * d) + + return Response( + text=text, + tool_name=tool_name, + tool_args=tool_args, + latency_ms=latency_ms, + tokens_in=tokens_in, + tokens_out=tokens_out, + ) + + +def _wrong_category(correct: str, categories: list[str]) -> str: + for c in categories: + if c != correct: + return c + return correct + + +class AnthropicProvider: + """Real Anthropic Messages API backend. Requires ANTHROPIC_API_KEY.""" + + def __init__(self, scenario: dict): + import anthropic # lazy: only needed for real runs + + self.scenario = scenario + self.client = anthropic.Anthropic() + + def generate(self, model: str, system_prompt: str, case: dict) -> Response: + tools = [ + { + "name": t["name"], + "description": f"{t['name']} tool", + "input_schema": { + "type": "object", + "properties": {a: {"type": "string"} for a in t["args"]}, + "required": t["args"], + }, + } + for t in self.scenario["tools"] + if t["name"] != "none" + ] + start = time.time() + msg = self.client.messages.create( + model=model, + max_tokens=512, + system=system_prompt, + tools=tools, + messages=[{"role": "user", "content": case["input"]}], + ) + latency_ms = (time.time() - start) * 1000 + + text, tool_name, tool_args = "", None, {} + for block in msg.content: + if block.type == "text": + text += block.text + elif block.type == "tool_use": + tool_name = block.name + tool_args = dict(block.input) + + return Response( + text=text, + tool_name=tool_name, + tool_args=tool_args, + latency_ms=latency_ms, + tokens_in=msg.usage.input_tokens, + tokens_out=msg.usage.output_tokens, + ) + + +def get_provider(name: str, scenario: dict): + if name == "sim": + return SimProvider(scenario) + if name == "anthropic": + return AnthropicProvider(scenario) + raise ValueError(f"unknown provider: {name!r} (use 'sim' or 'anthropic')") diff --git a/harness/run.py b/harness/run.py new file mode 100644 index 0000000..c4604f8 --- /dev/null +++ b/harness/run.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Run the three-arm migration benchmark and print a leaderboard. + +Arms (same eval set, same graders): + A baseline = old model + baseline prompt + B naive swap = new model + baseline prompt (raw model delta) + C ModelPort = new model + enhanced prompt (skill's added value) + +Usage: + python run.py --scenario scenarios/support_triage.json --provider sim + python run.py --provider anthropic # real numbers; needs ANTHROPIC_API_KEY +""" + +from __future__ import annotations + +import argparse +import json +import statistics +from pathlib import Path + +from graders import grade_case +from providers import get_provider + + +def run_arm(provider, model, prompt, scenario, prices): + cases, rows = scenario["eval_cases"], [] + lat, costs = [], [] + fails = [] + for case in cases: + resp = provider.generate(model, prompt, case) + scores = grade_case(resp, case["expected"]) + lat.append(resp.latency_ms) + costs.append(resp.tokens_in * prices["price_in"] + resp.tokens_out * prices["price_out"]) + rows.append(scores) + if not all(scores.values()): + fails.append((case["id"], scores)) + n = len(cases) + return { + "task": sum(r["task"] for r in rows) / n, + "contract": sum(r["contract"] for r in rows) / n, + "tool": sum(r["tool"] for r in rows) / n, + "p50_ms": statistics.median(lat), + "p95_ms": sorted(lat)[max(0, int(0.95 * n) - 1)], + "cost": sum(costs) / n, + "fails": fails, + } + + +def composite(arms): + """0.50 quality + 0.30 cost-efficiency + 0.20 speed. + + quality = mean(task, contract, tool). cost/speed are scored *relative to the + baseline arm* and bounded, so a small absolute cost/latency difference only + moves the score a little — quality stays dominant. (Min-max normalization + across arms is deliberately avoided: with near-equal costs it amplifies + rounding into a full-weight swing and can rank a worse arm first.) + """ + base = arms[0] + + def contrib(factor: float) -> float: # factor > 1 == better than baseline + return max(0.0, min(1.0, 0.5 + 0.5 * (factor - 1.0))) + + out = [] + for a in arms: + quality = (a["task"] + a["contract"] + a["tool"]) / 3 + cost_factor = base["cost"] / a["cost"] if a["cost"] else 1.0 + speed_factor = base["p95_ms"] / a["p95_ms"] if a["p95_ms"] else 1.0 + out.append(0.5 * quality + 0.3 * contrib(cost_factor) + 0.2 * contrib(speed_factor)) + return out + + +def leaderboard(arms): + M, V = 22, 12 + bar = lambda l, m, r: l + "─" * M + m + ("─" * V + m) * 2 + "─" * V + r + def hrow(c0, c1, c2, c3): + cells = [" " + c0.ljust(M - 1)] + [" " + c.ljust(V - 1) for c in (c1, c2, c3)] + return "│" + cells[0] + "│" + cells[1] + "│" + cells[2] + "│" + cells[3] + "│" + def row(label, a, b, c): + cells = [label.ljust(M)] + [x.rjust(V - 1) + " " for x in (a, b, c)] + return "│" + cells[0] + "│" + cells[1] + "│" + cells[2] + "│" + cells[3] + "│" + + pct = lambda x: f"{round(x * 100)}%" + sec = lambda ms: f"{ms / 1000:.1f}s" + usd = lambda c: f"${c:.4f}" + comp = composite(arms) + + out = [bar("╭", "┬", "╮"), + hrow("metric", "baseline", "naive swap", "ModelPort"), + hrow("", "(old/old)", "(new/old)", "(new/enh.)"), + bar("├", "┼", "┤"), + row(" task success", *[pct(a["task"]) for a in arms]), + row(" output contract", *[pct(a["contract"]) for a in arms]), + row(" tool-call accuracy", *[pct(a["tool"]) for a in arms]), + row(" p95 latency", *[sec(a["p95_ms"]) for a in arms]), + row(" cost / req", *[usd(a["cost"]) for a in arms]), + bar("├", "┼", "┤"), + row(" composite (50/30/20)", *[f"{x:.2f}" for x in comp]), + bar("╰", "┴", "╯")] + return "\n".join(out), comp + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--scenario", default=str(Path(__file__).parent / "scenarios/support_triage.json")) + ap.add_argument("--provider", default="sim", choices=["sim", "anthropic"]) + ap.add_argument("--out", default=str(Path(__file__).parent / "results.json")) + args = ap.parse_args() + + scenario = json.loads(Path(args.scenario).read_text()) + provider = get_provider(args.provider, scenario) + old, new = scenario["models"]["old"], scenario["models"]["new"] + base_p, enh_p = scenario["prompts"]["baseline"], scenario["prompts"]["enhanced"] + + arm_specs = [("A baseline", old, base_p), ("B naive swap", new, base_p), ("C ModelPort", new, enh_p)] + arms = [run_arm(provider, m, p, scenario, scenario["sim"][m]) for _, m, p in arm_specs] + + table, comp = leaderboard(arms) + print(f"\nScenario: {scenario['name']} | provider: {args.provider} | n={len(scenario['eval_cases'])}") + if args.provider == "sim": + print("(simulated numbers — illustrative; run --provider anthropic for measured results)") + print(table) + print(f"\nattribution: model delta (B−A) {comp[1] - comp[0]:+.2f} " + f"skill delta (C−B) {comp[2] - comp[1]:+.2f} net (C−A) {comp[2] - comp[0]:+.2f}") + + cfails = arms[2]["fails"] + if cfails: + broken = sorted({k for _, s in cfails for k, ok in s.items() if not ok}) + print(f"\nModelPort arm still failing {len(cfails)} case(s) on: {', '.join(broken)}") + print(" cases: " + ", ".join(cid for cid, _ in cfails)) + print(" -> next iteration: tighten the enhanced prompt for the failing dimension(s).") + else: + print("\nModelPort arm passes every case. ") + + Path(args.out).write_text(json.dumps( + {"scenario": scenario["name"], "provider": args.provider, + "arms": {spec[0]: {k: v for k, v in a.items() if k != "fails"} for spec, a in zip(arm_specs, arms)}, + "composite": dict(zip([s[0] for s in arm_specs], comp))}, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/harness/scenarios/support_triage.json b/harness/scenarios/support_triage.json new file mode 100644 index 0000000..b63a3df --- /dev/null +++ b/harness/scenarios/support_triage.json @@ -0,0 +1,36 @@ +{ + "name": "support-ticket-triage", + "description": "Classify a support ticket, optionally call one tool, and return a strict JSON object. Exercises output-contract conformance and tool-calling accuracy across a model migration.", + "models": { "old": "claude-opus-4-6", "new": "claude-opus-4-8" }, + "categories": ["billing", "technical", "account", "other"], + "tools": [ + { "name": "lookup_order", "args": ["order_id"] }, + { "name": "issue_refund", "args": ["order_id"] }, + { "name": "reset_password", "args": ["email"] }, + { "name": "none", "args": [] } + ], + "prompts": { + "baseline": "You are a helpful support assistant. Read the customer ticket and decide how to handle it. Tell me the category and whether a tool is needed.", + "enhanced": "You are a support-triage assistant. Return ONLY a JSON object and no prose. Schema: {\"category\": one of billing|technical|account|other, \"tool\": one of lookup_order|issue_refund|reset_password|none, \"args\": object}. Do not wrap the JSON in code fences or commentary. Choose exactly one category and exactly one tool from the lists above." + }, + "eval_cases": [ + { "id": "c01", "input": "I was charged twice for order 4471, please refund one.", "difficulty": 0.15, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "4471" } } }, + { "id": "c02", "input": "I forgot my password and can't log in. Email is sam@acme.io", "difficulty": 0.2, "expected": { "category": "account", "tool": "reset_password", "args": { "email": "sam@acme.io" } } }, + { "id": "c03", "input": "Where is my package for order 9920?", "difficulty": 0.3, "expected": { "category": "other", "tool": "lookup_order", "args": { "order_id": "9920" } } }, + { "id": "c04", "input": "The checkout page throws a 500 error when I click pay.", "difficulty": 0.45, "expected": { "category": "technical", "tool": "none", "args": {} } }, + { "id": "c05", "input": "Refund my duplicate charge on order 1203.", "difficulty": 0.25, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "1203" } } }, + { "id": "c06", "input": "Can you check the status of order 5560?", "difficulty": 0.4, "expected": { "category": "other", "tool": "lookup_order", "args": { "order_id": "5560" } } }, + { "id": "c07", "input": "My account is locked after too many login attempts; reset for jo@x.com", "difficulty": 0.55, "expected": { "category": "account", "tool": "reset_password", "args": { "email": "jo@x.com" } } }, + { "id": "c08", "input": "The mobile app crashes on startup since the update.", "difficulty": 0.6, "expected": { "category": "technical", "tool": "none", "args": {} } }, + { "id": "c09", "input": "I think I was overcharged but I'm not sure, can you look at order 7781?", "difficulty": 0.7, "expected": { "category": "billing", "tool": "lookup_order", "args": { "order_id": "7781" } } }, + { "id": "c10", "input": "Just wanted to say your support team is great!", "difficulty": 0.5, "expected": { "category": "other", "tool": "none", "args": {} } }, + { "id": "c11", "input": "Password reset link expired, send a new one to me@dev.io please", "difficulty": 0.8, "expected": { "category": "account", "tool": "reset_password", "args": { "email": "me@dev.io" } } }, + { "id": "c12", "input": "Payment failed three times and now I have three pending charges on order 3030.", "difficulty": 0.9, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "3030" } } }, + { "id": "c13", "input": "Charged for a subscription I cancelled (order 8800), I want a refund.", "difficulty": 0.95, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "8800" } } }, + { "id": "c14", "input": "App shows a white screen, reinstalled twice, still broken.", "difficulty": 0.97, "expected": { "category": "technical", "tool": "none", "args": {} } } + ], + "sim": { + "claude-opus-4-6": { "literalness": 0.5, "base": 0.72, "latency_ms": 3800, "price_in": 0.000005, "price_out": 0.000025 }, + "claude-opus-4-8": { "literalness": 0.9, "base": 0.76, "latency_ms": 2300, "price_in": 0.000005, "price_out": 0.000025 } + } +} diff --git a/harness/tests/test_harness.py b/harness/tests/test_harness.py new file mode 100644 index 0000000..00f1ee0 --- /dev/null +++ b/harness/tests/test_harness.py @@ -0,0 +1,76 @@ +"""Unit + smoke tests for the benchmark harness. Run: python3 -m unittest.""" + +import json +import sys +import unittest +from pathlib import Path + +HARNESS = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(HARNESS)) + +from graders import Response, grade_contract, grade_task, grade_tool # noqa: E402 +from providers import SimProvider, explicitness # noqa: E402 +import run # noqa: E402 + + +class TestGraders(unittest.TestCase): + def test_strict_json_passes_contract(self): + r = Response(text='{"category": "billing", "tool": "none", "args": {}}') + self.assertTrue(grade_contract(r)) + + def test_prose_wrapped_json_fails_contract(self): + r = Response(text='Sure! Here you go: {"category": "billing", "tool": "none", "args": {}} thanks') + self.assertFalse(grade_contract(r)) + + def test_code_fenced_json_passes(self): + r = Response(text='```json\n{"category": "account", "tool": "none", "args": {}}\n```') + self.assertTrue(grade_contract(r)) + + def test_tool_match_and_args_subset(self): + r = Response(tool_name="issue_refund", tool_args={"order_id": "4471", "extra": "x"}) + self.assertTrue(grade_tool(r, {"tool": "issue_refund", "args": {"order_id": "4471"}})) + + def test_tool_name_mismatch_fails(self): + r = Response(tool_name="lookup_order", tool_args={"order_id": "4471"}) + self.assertFalse(grade_tool(r, {"tool": "issue_refund", "args": {"order_id": "4471"}})) + + def test_tool_none_expected(self): + r = Response(tool_name=None) + self.assertTrue(grade_tool(r, {"tool": "none", "args": {}})) + + def test_task_requires_correct_category(self): + ok = Response(text='{"category": "billing", "tool": "none", "args": {}}') + wrong = Response(text='{"category": "technical", "tool": "none", "args": {}}') + self.assertTrue(grade_task(ok, {"category": "billing"})) + self.assertFalse(grade_task(wrong, {"category": "billing"})) + + +class TestSimAndLoop(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.scenario = json.loads((HARNESS / "scenarios/support_triage.json").read_text()) + + def test_enhanced_prompt_is_more_explicit(self): + e_base = explicitness(self.scenario["prompts"]["baseline"], self.scenario) + e_enh = explicitness(self.scenario["prompts"]["enhanced"], self.scenario) + self.assertGreater(e_enh["contract"], e_base["contract"]) + self.assertGreater(e_enh["task"], e_base["task"]) + + def test_modelport_arm_beats_baseline(self): + sc = self.scenario + prov = SimProvider(sc) + old, new = sc["models"]["old"], sc["models"]["new"] + arms = [ + run.run_arm(prov, old, sc["prompts"]["baseline"], sc, sc["sim"][old]), + run.run_arm(prov, new, sc["prompts"]["baseline"], sc, sc["sim"][new]), + run.run_arm(prov, new, sc["prompts"]["enhanced"], sc, sc["sim"][new]), + ] + comp = run.composite(arms) + # enhanced arm should win on quality and on the composite + self.assertGreater(arms[2]["task"], arms[0]["task"]) + self.assertGreater(comp[2], comp[0]) + self.assertGreater(comp[2], comp[1]) + + +if __name__ == "__main__": + unittest.main()