From 7063428998ce8a3b9f412aa73967b63bcb1169e5 Mon Sep 17 00:00:00 2001 From: Ferran Torres Date: Wed, 17 Jun 2026 13:05:51 +0200 Subject: [PATCH 1/4] fix: correctness and mode-state bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mode-tracker: a prompt that both switches mode and says "stop ponytail" no longer emits two JSON objects on one stdout (broke the host JSON.parse in Codex/Copilot); the deactivation branch is now mutually exclusive. - mode-tracker / opencode: an unknown /ponytail arg (a typo) no longer silently resets the active level — it is left untouched, matching what the pi extension already does. Bare /ponytail still re-applies the default. - benchmarks/correctness gate: prose that name-drops the structural keywords no longer scores as a pass on the run-free checks (countdown/ratelimit), and a model demo that exits 0 before the appended asserts run no longer counts as a pass — the harness already prints a PASS sentinel; we require it. Co-Authored-By: Claude Opus 4.8 (1M context) --- .opencode/plugins/ponytail.mjs | 6 +++++- benchmarks/correctness.js | 35 +++++++++++++++++++++++++--------- hooks/ponytail-mode-tracker.js | 12 +++++++----- tests/hooks.test.js | 9 +++++++++ 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/.opencode/plugins/ponytail.mjs b/.opencode/plugins/ponytail.mjs index 2fd3625..ab7b7c0 100644 --- a/.opencode/plugins/ponytail.mjs +++ b/.opencode/plugins/ponytail.mjs @@ -56,8 +56,12 @@ export default async ({ client } = {}) => { // synchronous store if same-turn switching ever matters. 'command.execute.before': async (input) => { if (!input || input.command !== 'ponytail') return; + // bare /ponytail re-applies the default; a known level sets it; an unknown + // arg (a typo) is left alone rather than silently resetting the level. + const arg = (input.arguments || '').trim(); + const mode = arg === '' ? getDefaultMode() : normalizePersistedMode(arg); + if (!mode) return; // `off` is persisted like any mode; the transform reads it and stays silent. - const mode = normalizePersistedMode((input.arguments || '').trim()) || getDefaultMode(); writeMode(mode); log('info', 'ponytail ' + mode); }, diff --git a/benchmarks/correctness.js b/benchmarks/correctness.js index fc56611..2b896c0 100644 --- a/benchmarks/correctness.js +++ b/benchmarks/correctness.js @@ -17,10 +17,19 @@ function extractBlocks(text) { const matches = [...text.matchAll(/```(\w*)\r?\n([\s\S]*?)```/g)]; // ponytail: terse models often answer with bare, unfenced code. Treat the whole // response as one block so the gate scores the code instead of reporting "no block". - if (matches.length === 0 && text.trim()) return [{ lang: '', code: text }]; + if (matches.length === 0 && text.trim()) return [{ lang: '', code: text, unfenced: true }]; return matches.map((m) => ({ lang: (m[1] || '').toLowerCase(), code: m[2] })); } +// ponytail: a fenced block is code by the model's own delimiter; the unfenced +// fallback wraps the whole response, prose included. The structural-only checks +// (countdown/ratelimit) never run the code, so prose carrying the right keywords +// would score as a pass — require a real code construct on an unfenced block +// before trusting them. ceiling: keyword heuristic, not a parser. +function looksLikeCode(block) { + return !block.unfenced || /[{}]|=>|@\w|\bdef\b|\bimport\b/.test(block.code); +} + // Identify which task we're evaluating from vars.task. function identifyTask(task) { const t = task.toLowerCase(); @@ -35,13 +44,21 @@ function identifyTask(task) { // Run a command, return { ok, stderr }. function exec(cmd, opts = {}) { try { - execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts }); - return { ok: true, stderr: '' }; + const stdout = execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts }); + return { ok: true, stdout: stdout || '', stderr: '' }; } catch (e) { - return { ok: false, stderr: (e.stderr || e.message || '').slice(0, 500) }; + return { ok: false, stdout: e.stdout || '', stderr: (e.stderr || e.message || '').slice(0, 500) }; } } +// ponytail: exit-0 alone is not a pass. The model's own code can sys.exit(0) / +// process.exit(0) in a __main__/demo block before our appended asserts run, so +// each harness prints a PASS sentinel on success and we require it here. Without +// this, a skipped assertion masquerades as a passing answer. +function passed(result) { + return result.ok && /(^|\n)PASS\s*$/.test(result.stdout || ''); +} + // ponytail: probe once at load; macOS and many Linux images ship python3 only. let pythonCmd; function python() { @@ -120,7 +137,7 @@ print("PASS") const f = tmpFile('.py', harness); const result = exec(`${python()} "${f}"`); fs.unlinkSync(f); - if (result.ok) return { pass: true, reason: 'Email validator passes all checks' }; + if (passed(result)) return { pass: true, reason: 'Email validator passes all checks' }; return { pass: false, reason: result.stderr || 'Email validator failed' }; }, @@ -165,7 +182,7 @@ setTimeout(() => { const f = tmpFile('.mjs', harness); const result = exec(`node "${f}"`); fs.unlinkSync(f); - if (result.ok) return { pass: true, reason: 'Debounce passes all checks' }; + if (passed(result)) return { pass: true, reason: 'Debounce passes all checks' }; return { pass: false, reason: result.stderr || 'Debounce failed' }; }, @@ -215,7 +232,7 @@ else: const result = exec(`${python()} "${f}"`); try { fs.unlinkSync(f); } catch (e) {} try { fs.unlinkSync(csvPath); } catch (e) {} - if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' }; + if (passed(result)) return { pass: true, reason: 'CSV sum produces correct result (351)' }; return { pass: false, reason: result.stderr || 'CSV sum failed' }; }, @@ -223,7 +240,7 @@ else: // React components can't run in bare Node without a bundler. Structural check: // the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout). const code = blocks.find((b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer')); - if (!code) return { pass: false, reason: 'No countdown component found' }; + if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No countdown component found' }; const src = code.code; const hasState = /useState|useReducer|this\.state/.test(src); @@ -241,7 +258,7 @@ else: ratelimit(blocks) { const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && (b.code.includes('rate') || b.code.includes('limit')))); - if (!code) return { pass: false, reason: 'No Python code block found' }; + if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No Python code block found' }; // Structural check for rate limiting: must have some form of counter/time tracking. const src = code.code; diff --git a/hooks/ponytail-mode-tracker.js b/hooks/ponytail-mode-tracker.js index d4fda46..0209f50 100644 --- a/hooks/ponytail-mode-tracker.js +++ b/hooks/ponytail-mode-tracker.js @@ -28,7 +28,9 @@ process.stdin.on('end', () => { else if (arg === 'full') mode = 'full'; else if (arg === 'ultra') mode = 'ultra'; else if (arg === 'off') mode = 'off'; - else mode = getDefaultMode(); + else if (arg === '') mode = getDefaultMode(); + // else: unknown arg (a typo) — leave mode null so we don't silently + // reset the active level; pi already treats unknown args as a no-op. } if (mode && mode !== 'off') { @@ -42,10 +44,10 @@ process.stdin.on('end', () => { clearMode(); writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF'); } - } - - // Detect deactivation - if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) { + } else if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) { + // Deactivation phrase — but only when the prompt isn't itself a /ponytail + // command. A prompt matching both branches used to emit two hook outputs + // (two JSON objects on one stdout), breaking the host's JSON.parse. clearMode(); writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF'); } diff --git a/tests/hooks.test.js b/tests/hooks.test.js index 46e7be6..282b01f 100644 --- a/tests/hooks.test.js +++ b/tests/hooks.test.js @@ -138,5 +138,14 @@ assert.equal( output = JSON.parse(result.stdout); assert.deepEqual(output, {}); +// Unknown /ponytail arg must NOT silently reset the level: a typo leaves the +// active mode untouched and emits nothing (pi already treats it as a no-op). +run('ponytail-mode-tracker.js', codexEnv, JSON.stringify({ prompt: '@ponytail lite' })); +assert.equal(fs.readFileSync(codexState, 'utf8'), 'lite'); +result = run('ponytail-mode-tracker.js', codexEnv, JSON.stringify({ prompt: '@ponytail bogus' })); +assert.equal(result.status, 0, result.stderr); +assert.equal(fs.readFileSync(codexState, 'utf8'), 'lite', 'unknown arg must not change the persisted level'); +assert.equal(result.stdout, '', 'unknown arg must emit no mode-change output'); + fs.rmSync(temp, { recursive: true, force: true }); console.log('hook compatibility checks passed'); From 5d57c14179b1ebb676632b4b0634b335c4e22706 Mon Sep 17 00:00:00 2001 From: Ferran Torres Date: Wed, 17 Jun 2026 13:06:08 +0200 Subject: [PATCH 2/4] fix: honor CLAUDE_CONFIG_DIR in statusline and setup nudge The hooks write the mode flag under $CLAUDE_CONFIG_DIR when set (getClaudeDir), but the statusline scripts hardcoded ~/.claude, so the badge vanished whenever a user relocated Claude's config dir. Both the bash and PowerShell statusline now resolve the same dir. The SessionStart setup nudge likewise pointed at a literal ~/.claude/settings.json; it now interpolates the resolved settings path. Co-Authored-By: Claude Opus 4.8 (1M context) --- hooks/ponytail-activate.js | 4 +++- hooks/ponytail-statusline.ps1 | 4 +++- hooks/ponytail-statusline.sh | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/hooks/ponytail-activate.js b/hooks/ponytail-activate.js index 9610721..6f3ee50 100644 --- a/hooks/ponytail-activate.js +++ b/hooks/ponytail-activate.js @@ -61,7 +61,9 @@ if (!isCodex) try { output += "\n\n" + "STATUSLINE SETUP NEEDED: The ponytail plugin includes a statusline badge showing active mode " + "(e.g. [PONYTAIL], [PONYTAIL:ULTRA]). It is not configured yet. " + - "To enable, add this to ~/.claude/settings.json: " + + // settingsPath honors CLAUDE_CONFIG_DIR; the literal "~/.claude" misled + // anyone who relocated Claude's config dir to edit the wrong file. + "To enable, add this to " + settingsPath + ": " + statusLineSnippet + " " + "Proactively offer to set this up for the user on first interaction."; } diff --git a/hooks/ponytail-statusline.ps1 b/hooks/ponytail-statusline.ps1 index d9fe437..ba60ff1 100644 --- a/hooks/ponytail-statusline.ps1 +++ b/hooks/ponytail-statusline.ps1 @@ -1,4 +1,6 @@ -$Flag = Join-Path $HOME ".claude/.ponytail-active" +# Mirror ponytail-config.getClaudeDir(): honor CLAUDE_CONFIG_DIR, else ~/.claude. +$ConfigDir = if ($env:CLAUDE_CONFIG_DIR) { $env:CLAUDE_CONFIG_DIR } else { Join-Path $HOME ".claude" } +$Flag = Join-Path $ConfigDir ".ponytail-active" if (-not (Test-Path $Flag)) { exit 0 } diff --git a/hooks/ponytail-statusline.sh b/hooks/ponytail-statusline.sh index 5e83a27..d9ee87d 100644 --- a/hooks/ponytail-statusline.sh +++ b/hooks/ponytail-statusline.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash -flag="$HOME/.claude/.ponytail-active" +# Mirror ponytail-config.getClaudeDir(): the flag is written under +# $CLAUDE_CONFIG_DIR when set, else ~/.claude. Reading the wrong path hides the +# badge whenever the user relocates Claude's config dir. +flag="${CLAUDE_CONFIG_DIR:-$HOME/.claude}/.ponytail-active" [ -f "$flag" ] || exit 0 mode=$(head -n1 "$flag" | tr -d '[:space:]') From 1054b231f8d524e87714732c8faa69290470d739 Mon Sep 17 00:00:00 2001 From: Ferran Torres Date: Wed, 17 Jun 2026 13:06:08 +0200 Subject: [PATCH 3/4] test: close CI coverage gaps - Fold the orphaned benchmarks/correctness.test.js (issue #65 guard, never run by `npm test` or CI) into tests/correctness.test.js, and add regression cases for the prose-as-pass and exit-0-as-pass fixes. - Add a bash smoke test for the statusline badge (both CLAUDE_CONFIG_DIR and the ~/.claude fallback). - Extend the Windows hooks test to also validate copilot-hooks.json (script existence) and assert hooks.json and copilot-hooks.json wire the same scripts, so a hook added to one host manifest can't be silently forgotten in the other. - Add a parity gate over the three marketplace.json manifests (parse, plugin name, shared description) and a name+description gate over the skill source frontmatter. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/correctness.test.js | 26 ----------------- tests/correctness.test.js | 51 ++++++++++++++++++++++++++++++++++ tests/gemini-extension.test.js | 29 +++++++++++++++++++ tests/hooks-windows.test.js | 44 ++++++++++++++++++++++++----- tests/openclaw-skills.test.js | 11 ++++++++ tests/statusline.test.js | 49 ++++++++++++++++++++++++++++++++ 6 files changed, 177 insertions(+), 33 deletions(-) delete mode 100644 benchmarks/correctness.test.js create mode 100644 tests/statusline.test.js diff --git a/benchmarks/correctness.test.js b/benchmarks/correctness.test.js deleted file mode 100644 index e3103c4..0000000 --- a/benchmarks/correctness.test.js +++ /dev/null @@ -1,26 +0,0 @@ -// Regression guard for the gate fixes (issue #65). Run: node correctness.test.js -// Needs python + node on PATH, same as correctness.js itself. -const assert = require('assert'); -const check = require('./correctness.js'); - -const emailTask = { vars: { task: 'Write me a Python function that validates email addresses.' } }; -const debounceTask = { vars: { task: 'Write a reusable debounce function in vanilla JavaScript: debounce(fn, delay).' } }; - -const FENCED_EMAIL = '```python\nimport re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))\n```'; -const UNFENCED_EMAIL = 'import re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))'; -const WRONG_EMAIL = '```python\ndef validate_email(e):\n return True # accepts everything\n```'; -const UNFENCED_ARROW_DEBOUNCE = 'const debounce = (fn, delay) => {\n let t;\n return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};'; - -let pass = 0; -const cases = [ - ['fenced email still passes', check(FENCED_EMAIL, emailTask).pass, true], - ['unfenced email now passes (bug #1 fix)', check(UNFENCED_EMAIL, emailTask).pass, true], - ['broken email still fails', check(WRONG_EMAIL, emailTask).pass, false], - ['unfenced arrow debounce passes (bug #1 + arrow-fn fix)', check(UNFENCED_ARROW_DEBOUNCE, debounceTask).pass, true], -]; -for (const [name, got, want] of cases) { - assert.strictEqual(got, want, `FAILED: ${name} (got ${got}, want ${want})`); - console.log(`ok - ${name}`); - pass++; -} -console.log(`\n${pass}/${cases.length} passed`); diff --git a/tests/correctness.test.js b/tests/correctness.test.js index a3facc8..f1ee1d6 100644 --- a/tests/correctness.test.js +++ b/tests/correctness.test.js @@ -179,6 +179,57 @@ def endpoint(): assert.equal(result.score, 0); }); +// --- Unfenced fallback + prose / early-exit guards (gate integrity) --- + +// A terse model can answer with bare, unfenced code (issue #65); the gate must +// still score it. These reach the extractBlocks fallback, which the fenced +// `check` helper above never exercises. +test('email: unfenced bare code still passes', () => { + const result = correctness( + 'import re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))', + { vars: { task: 'Write me a Python function that validates email addresses.' } }, + ); + assert.equal(result.pass, true); +}); + +test('debounce: unfenced arrow function still passes', () => { + const result = correctness( + 'const debounce = (fn, delay) => {\n let t;\n return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};', + { vars: { task: 'Add debounce to a search input in vanilla JavaScript.' } }, + ); + assert.equal(result.pass, true); +}); + +// Prose that name-drops the structural keywords but contains no code must NOT +// pass the run-free checks (countdown/ratelimit), or the gate rewards talk. +test('countdown: prose without code fails the structural check', () => { + const result = correctness( + 'To build it, use useState for the count and useEffect with setInterval; each tick set count - 1 until zero.', + { vars: { task: 'Build me a countdown timer component in React.' } }, + ); + assert.equal(result.pass, false); +}); + +test('ratelimit: prose without code fails the structural check', () => { + const result = correctness( + 'Build a rate limiter in FastAPI that tracks requests per window and returns 429 when the limit is exceeded.', + { vars: { task: 'Add rate limiting to my FastAPI endpoint.' } }, + ); + assert.equal(result.pass, false); +}); + +// A model demo that exits 0 before our appended asserts run must not score as a +// pass just because the process exited cleanly. +test('email: code that exits 0 before the asserts run fails', () => { + const result = check( + 'Write me a Python function that validates email addresses.', + 'python', + 'import sys\ndef validate_email(e):\n return True # broken: accepts everything\nsys.exit(0)', + ); + assert.equal(result.pass, false); + assert.equal(result.score, 0); +}); + // --- Edge cases --- test('unknown task is gracefully skipped', () => { diff --git a/tests/gemini-extension.test.js b/tests/gemini-extension.test.js index 0f24338..833d8dc 100644 --- a/tests/gemini-extension.test.js +++ b/tests/gemini-extension.test.js @@ -22,6 +22,15 @@ const VERSIONED_MANIFESTS = [ '.codex-plugin/plugin.json', '.github/plugin/plugin.json', ]; +// The marketplace manifests for the three plugin ecosystems. Shapes differ per +// ecosystem, but all must parse, name the ponytail plugin, and — for the two +// that carry a shared plugin description — keep it identical, so a rename or +// copy-edit can't silently desync one marketplace listing from the others. +const MARKETPLACE_MANIFESTS = [ + '.claude-plugin/marketplace.json', + '.github/plugin/marketplace.json', + '.agents/plugins/marketplace.json', +]; // Gemini auto-discovers these by directory; the manifest is only useful if they exist. const REUSED_COMMANDS = ['commands/ponytail.toml', 'commands/ponytail-review.toml']; const REUSED_SKILLS = ['skills/ponytail/SKILL.md']; @@ -77,3 +86,23 @@ test('the commands and skills the adapter reuses are present', () => { assert.ok(fs.existsSync(path.join(root, rel)), `reused file missing: ${rel}`); } }); + +test('every marketplace manifest parses and names the ponytail plugin', () => { + for (const rel of MARKETPLACE_MANIFESTS) { + assert.ok(fs.existsSync(path.join(root, rel)), `marketplace manifest missing: ${rel}`); + const manifest = JSON.parse(read(rel)); + assert.equal(manifest.name, EXTENSION_NAME, `${rel}: top-level name must be ponytail`); + assert.ok(Array.isArray(manifest.plugins) && manifest.plugins.length > 0, `${rel}: must list a plugin`); + assert.equal(manifest.plugins[0].name, EXTENSION_NAME, `${rel}: plugins[0].name must be ponytail`); + } +}); + +test('marketplace manifests that carry a plugin description keep it identical', () => { + const descriptions = MARKETPLACE_MANIFESTS + .map((rel) => JSON.parse(read(rel)).plugins[0].description) + .filter(Boolean); + assert.ok(descriptions.length >= 2, 'expected a shared plugin description in at least two marketplaces'); + for (const d of descriptions) { + assert.equal(d, descriptions[0], 'a marketplace plugin description drifted from the others'); + } +}); diff --git a/tests/hooks-windows.test.js b/tests/hooks-windows.test.js index f7b5353..473afdf 100644 --- a/tests/hooks-windows.test.js +++ b/tests/hooks-windows.test.js @@ -12,6 +12,7 @@ const path = require('path'); const root = path.join(__dirname, '..'); const HOOKS_JSON = 'hooks/hooks.json'; +const COPILOT_HOOKS_JSON = 'hooks/copilot-hooks.json'; // cmd.exe variable syntax (%FOO%); PowerShell leaves it literal, breaking the path. const CMD_VAR_SYNTAX = /%[A-Za-z_][A-Za-z0-9_]*%/; // Pull the hooks/