From 6faef8c0cb9b303fb03aeabe9ddd61a0a69e587a Mon Sep 17 00:00:00 2001 From: Ousama Ben Younes Date: Thu, 18 Jun 2026 20:00:40 +0000 Subject: [PATCH] fix(benchmarks): portable plugin-dir resolution for agentic arms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ponytail/caveman arms hardcoded one machine's Windows plugin-cache paths (C:\Users\Dietr\...), so only baseline/yagni/yagni-oneliner were reproducible off the maintainer's box — undercutting the "fully reproducible" claim the rebuilt benchmark (#126) was meant to establish. Resolve per-arm at use-site: env override (PONYTAIL_PLUGIN_DIR / CAVEMAN_PLUGIN_DIR) -> latest version dir under ~/.claude/plugins/cache -> clear sys.exit. No pinned version/hash. Selftest extended to cover env-override and missing-install paths. Fixes #169 Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/agentic/run.py | 48 ++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/benchmarks/agentic/run.py b/benchmarks/agentic/run.py index 307cce6..cac56fa 100644 --- a/benchmarks/agentic/run.py +++ b/benchmarks/agentic/run.py @@ -22,7 +22,7 @@ ponytail: the claude CLI is the harness (already installed, we run inside it). No SDK dependency. The CLI's JSON output already carries cost/tokens/duration/permission_denials. """ -import argparse, concurrent.futures, datetime, json, re, shutil, statistics, subprocess, sys, tempfile +import argparse, concurrent.futures, datetime, json, os, re, shutil, statistics, subprocess, sys, tempfile from collections import defaultdict from pathlib import Path @@ -43,11 +43,22 @@ def _skill(rel): return (ROOT / rel).read_text(encoding="utf-8") # Skills are plugins activated by a SessionStart hook. To test exactly one at a time we exclude the # user's globally-enabled plugins (--setting-sources project,local) and load one plugin from its -# cache dir (--plugin-dir). Local absolute paths; the smoke test verifies activation by output style. -PLUGIN_DIRS = { - "ponytail": r"C:\Users\Dietr\.claude\plugins\cache\ponytail\ponytail\4.2.0", - "caveman": r"C:\Users\Dietr\.claude\plugins\cache\caveman\caveman\63e797cd753b", -} +# cache dir (--plugin-dir). The smoke test verifies activation by output style. +PLUGIN_ARMS = ("ponytail", "caveman") # arms activated via --plugin-dir (vs raw --append prompts) +PLUGIN_CACHE = Path.home() / ".claude" / "plugins" / "cache" + +def _plugin_dir(name): + """Resolve a plugin's cache dir portably -- hardcoding one machine's absolute path + (e.g. C:\\Users\\\\...) made the ponytail/caveman arms unreproducible off that box. + Order: env override -> latest version dir under ~/.claude/plugins/cache -> clear error. + Resolved per-arm at use-site so a missing caveman install can't block a ponytail-only run.""" + env = os.environ.get(f"{name.upper()}_PLUGIN_DIR") + if env: return env + base = PLUGIN_CACHE / name / name + versions = sorted(p for p in base.glob("*") if p.is_dir()) if base.exists() else [] + if not versions: + sys.exit(f"{name} plugin dir not found under {base}; install the plugin or set {name.upper()}_PLUGIN_DIR") + return str(versions[-1]) # latest version dir; not pinned to one machine's hash CELL_TIMEOUT = 300 # seconds per cell; a hung agent is force-killed (process tree) so the pool can't freeze @@ -146,9 +157,30 @@ def selftest(): print(f"{'ok ' if ok else 'XX '} {tid:12} {kind:4} correct={r['correct']} " f"safe={r['safe']} axis={axis} {r['reason']}") failures += 0 if ok else 1 + failures += _selftest_plugin_dir() print(f"\nselftest: {'all instruments valid' if not failures else str(failures) + ' BROKEN'}") return failures +def _selftest_plugin_dir(): + """Plugin-dir resolution must be portable: env override wins, and a missing install + fails loudly (sys.exit) instead of silently passing a non-existent path to --plugin-dir.""" + fails = 0 + sentinel = "/tmp/ponytail-selftest-plugin-dir" + os.environ["PONYTAIL_PLUGIN_DIR"] = sentinel + try: + ok_env = _plugin_dir("ponytail") == sentinel + finally: + del os.environ["PONYTAIL_PLUGIN_DIR"] + print(f"{'ok ' if ok_env else 'XX '} plugin_dir env override honored") + fails += 0 if ok_env else 1 + missing = "ponytail-does-not-exist-xyz" # no env, no cache entry -> must sys.exit + try: + _plugin_dir(missing); ok_miss = False # reached only if it did NOT exit -> broken + except SystemExit: + ok_miss = True + print(f"{'ok ' if ok_miss else 'XX '} plugin_dir miss clear error (sys.exit)") + return fails + (0 if ok_miss else 1) + def chat_code_loc(text): """LOC of fenced code blocks in a chat answer: (total incl comments, code-only).""" total = code = 0 @@ -217,8 +249,8 @@ def run_cell(task_id, arm, model, workdir: Path): "--setting-sources", "project,local", "--strict-mcp-config", "--disallowedTools", "Bash"] append = NO_RUN # all arms get NO_RUN, identically - if arm in PLUGIN_DIRS: - cmd += ["--plugin-dir", PLUGIN_DIRS[arm]] # real activation of exactly one plugin + if arm in PLUGIN_ARMS: + cmd += ["--plugin-dir", _plugin_dir(arm)] # real activation of exactly one plugin else: extra = ARMS[arm]() # baseline -> None; yagni-oneliner -> the prompt if extra: append = extra + "\n\n" + NO_RUN