From 7063428998ce8a3b9f412aa73967b63bcb1169e5 Mon Sep 17 00:00:00 2001
From: Ferran Torres <ai.automate.mail@gmail.com>
Date: Wed, 17 Jun 2026 13:05:51 +0200
Subject: [PATCH 1/4] fix: correctness and mode-state bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- mode-tracker: a prompt that both switches mode and says "stop ponytail"
  no longer emits two JSON objects on one stdout (broke the host JSON.parse
  in Codex/Copilot); the deactivation branch is now mutually exclusive.
- mode-tracker / opencode: an unknown /ponytail arg (a typo) no longer
  silently resets the active level — it is left untouched, matching what the
  pi extension already does. Bare /ponytail still re-applies the default.
- benchmarks/correctness gate: prose that name-drops the structural keywords
  no longer scores as a pass on the run-free checks (countdown/ratelimit),
  and a model demo that exits 0 before the appended asserts run no longer
  counts as a pass — the harness already prints a PASS sentinel; we require it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .opencode/plugins/ponytail.mjs |  6 +++++-
 benchmarks/correctness.js      | 35 +++++++++++++++++++++++++---------
 hooks/ponytail-mode-tracker.js | 12 +++++++-----
 tests/hooks.test.js            |  9 +++++++++
 4 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/.opencode/plugins/ponytail.mjs b/.opencode/plugins/ponytail.mjs
index 2fd3625..ab7b7c0 100644
--- a/.opencode/plugins/ponytail.mjs
+++ b/.opencode/plugins/ponytail.mjs
@@ -56,8 +56,12 @@ export default async ({ client } = {}) => {
     // synchronous store if same-turn switching ever matters.
     'command.execute.before': async (input) => {
       if (!input || input.command !== 'ponytail') return;
+      // bare /ponytail re-applies the default; a known level sets it; an unknown
+      // arg (a typo) is left alone rather than silently resetting the level.
+      const arg = (input.arguments || '').trim();
+      const mode = arg === '' ? getDefaultMode() : normalizePersistedMode(arg);
+      if (!mode) return;
       // `off` is persisted like any mode; the transform reads it and stays silent.
-      const mode = normalizePersistedMode((input.arguments || '').trim()) || getDefaultMode();
       writeMode(mode);
       log('info', 'ponytail ' + mode);
     },
diff --git a/benchmarks/correctness.js b/benchmarks/correctness.js
index fc56611..2b896c0 100644
--- a/benchmarks/correctness.js
+++ b/benchmarks/correctness.js
@@ -17,10 +17,19 @@ function extractBlocks(text) {
   const matches = [...text.matchAll(/```(\w*)\r?\n([\s\S]*?)```/g)];
   // ponytail: terse models often answer with bare, unfenced code. Treat the whole
   // response as one block so the gate scores the code instead of reporting "no block".
-  if (matches.length === 0 && text.trim()) return [{ lang: '', code: text }];
+  if (matches.length === 0 && text.trim()) return [{ lang: '', code: text, unfenced: true }];
   return matches.map((m) => ({ lang: (m[1] || '').toLowerCase(), code: m[2] }));
 }
 
+// ponytail: a fenced block is code by the model's own delimiter; the unfenced
+// fallback wraps the whole response, prose included. The structural-only checks
+// (countdown/ratelimit) never run the code, so keyword-laden prose would pass;
+// trust an unfenced block only if it shows a code construct. ceiling: regex
+// heuristic, not a parser — prose containing "import" or "@name" still matches.
+function looksLikeCode(block) {
+  return !block.unfenced || /[{}]|=>|@\w|\bdef\b|\bimport\b/.test(block.code);
+}
+
 // Identify which task we're evaluating from vars.task.
 function identifyTask(task) {
   const t = task.toLowerCase();
@@ -35,13 +44,21 @@ function identifyTask(task) {
 // Run a command, return { ok, stderr }.
 function exec(cmd, opts = {}) {
   try {
-    execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
-    return { ok: true, stderr: '' };
+    const stdout = execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
+    return { ok: true, stdout: stdout || '', stderr: '' };
   } catch (e) {
-    return { ok: false, stderr: (e.stderr || e.message || '').slice(0, 500) };
+    return { ok: false, stdout: e.stdout || '', stderr: (e.stderr || e.message || '').slice(0, 500) };
   }
 }
 
+// ponytail: exit-0 alone is not a pass. The model's own code can sys.exit(0) /
+// process.exit(0) in a __main__/demo block before our appended asserts run, so
+// each harness prints a PASS sentinel on success and we require it here. Without
+// this, a skipped assertion masquerades as a passing answer.
+function passed(result) {
+  return result.ok && /(^|\n)PASS\s*$/.test(result.stdout || '');
+}
+
 // ponytail: probe once at load; macOS and many Linux images ship python3 only.
 let pythonCmd;
 function python() {
@@ -120,7 +137,7 @@ print("PASS")
     const f = tmpFile('.py', harness);
     const result = exec(`${python()} "${f}"`);
     fs.unlinkSync(f);
-    if (result.ok) return { pass: true, reason: 'Email validator passes all checks' };
+    if (passed(result)) return { pass: true, reason: 'Email validator passes all checks' };
     return { pass: false, reason: result.stderr || 'Email validator failed' };
   },
 
@@ -165,7 +182,7 @@ setTimeout(() => {
     const f = tmpFile('.mjs', harness);
     const result = exec(`node "${f}"`);
     fs.unlinkSync(f);
-    if (result.ok) return { pass: true, reason: 'Debounce passes all checks' };
+    if (passed(result)) return { pass: true, reason: 'Debounce passes all checks' };
     return { pass: false, reason: result.stderr || 'Debounce failed' };
   },
 
@@ -215,7 +232,7 @@ else:
     const result = exec(`${python()} "${f}"`);
     try { fs.unlinkSync(f); } catch (e) {}
     try { fs.unlinkSync(csvPath); } catch (e) {}
-    if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' };
+    if (passed(result)) return { pass: true, reason: 'CSV sum produces correct result (351)' };
     return { pass: false, reason: result.stderr || 'CSV sum failed' };
   },
 
@@ -223,7 +240,7 @@ else:
     // React components can't run in bare Node without a bundler. Structural check:
     // the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout).
     const code = blocks.find((b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer'));
-    if (!code) return { pass: false, reason: 'No countdown component found' };
+    if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No countdown component found' };
 
     const src = code.code;
     const hasState = /useState|useReducer|this\.state/.test(src);
@@ -241,7 +258,7 @@ else:
 
   ratelimit(blocks) {
     const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && (b.code.includes('rate') || b.code.includes('limit'))));
-    if (!code) return { pass: false, reason: 'No Python code block found' };
+    if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No Python code block found' };
 
     // Structural check for rate limiting: must have some form of counter/time tracking.
     const src = code.code;
diff --git a/hooks/ponytail-mode-tracker.js b/hooks/ponytail-mode-tracker.js
index d4fda46..0209f50 100644
--- a/hooks/ponytail-mode-tracker.js
+++ b/hooks/ponytail-mode-tracker.js
@@ -28,7 +28,9 @@ process.stdin.on('end', () => {
         else if (arg === 'full') mode = 'full';
         else if (arg === 'ultra') mode = 'ultra';
         else if (arg === 'off') mode = 'off';
-        else mode = getDefaultMode();
+        else if (arg === '') mode = getDefaultMode();
+        // else: unknown arg (a typo) — leave mode null so we don't silently
+        // reset the active level; pi already treats unknown args as a no-op.
       }
 
       if (mode && mode !== 'off') {
@@ -42,10 +44,10 @@ process.stdin.on('end', () => {
         clearMode();
         writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF');
       }
-    }
-
-    // Detect deactivation
-    if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) {
+    } else if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) {
+      // Deactivation phrase — but only when the prompt isn't itself a /ponytail
+      // command. A prompt matching both branches used to emit two hook outputs
+      // (two JSON objects on one stdout), breaking the host's JSON.parse.
       clearMode();
       writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF');
     }
diff --git a/tests/hooks.test.js b/tests/hooks.test.js
index 46e7be6..282b01f 100644
--- a/tests/hooks.test.js
+++ b/tests/hooks.test.js
@@ -138,5 +138,14 @@ assert.equal(
 output = JSON.parse(result.stdout);
 assert.deepEqual(output, {});
 
+// Unknown /ponytail arg must NOT silently reset the level: a typo leaves the
+// active mode untouched and emits nothing (pi already treats it as a no-op).
+run('ponytail-mode-tracker.js', codexEnv, JSON.stringify({ prompt: '@ponytail lite' }));
+assert.equal(fs.readFileSync(codexState, 'utf8'), 'lite');
+result = run('ponytail-mode-tracker.js', codexEnv, JSON.stringify({ prompt: '@ponytail bogus' }));
+assert.equal(result.status, 0, result.stderr);
+assert.equal(fs.readFileSync(codexState, 'utf8'), 'lite', 'unknown arg must not change the persisted level');
+assert.equal(result.stdout, '', 'unknown arg must emit no mode-change output');
+
 fs.rmSync(temp, { recursive: true, force: true });
 console.log('hook compatibility checks passed');

From 5d57c14179b1ebb676632b4b0634b335c4e22706 Mon Sep 17 00:00:00 2001
From: Ferran Torres <ai.automate.mail@gmail.com>
Date: Wed, 17 Jun 2026 13:06:08 +0200
Subject: [PATCH 2/4] fix: honor CLAUDE_CONFIG_DIR in statusline and setup
 nudge

The hooks write the mode flag under $CLAUDE_CONFIG_DIR when set (getClaudeDir),
but the statusline scripts hardcoded ~/.claude, so the badge vanished whenever
a user relocated Claude's config dir. Both the bash and PowerShell statusline
now resolve the same dir. The SessionStart setup nudge likewise pointed at a
literal ~/.claude/settings.json; it now interpolates the resolved settings path.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 hooks/ponytail-activate.js    | 4 +++-
 hooks/ponytail-statusline.ps1 | 4 +++-
 hooks/ponytail-statusline.sh  | 5 ++++-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/hooks/ponytail-activate.js b/hooks/ponytail-activate.js
index 9610721..6f3ee50 100644
--- a/hooks/ponytail-activate.js
+++ b/hooks/ponytail-activate.js
@@ -61,7 +61,9 @@ if (!isCodex) try {
     output += "\n\n" +
       "STATUSLINE SETUP NEEDED: The ponytail plugin includes a statusline badge showing active mode " +
       "(e.g. [PONYTAIL], [PONYTAIL:ULTRA]). It is not configured yet. " +
-      "To enable, add this to ~/.claude/settings.json: " +
+      // settingsPath honors CLAUDE_CONFIG_DIR; the literal "~/.claude" sent
+      // anyone who had relocated Claude's config dir to the wrong file.
+      "To enable, add this to " + settingsPath + ": " +
       statusLineSnippet + " " +
       "Proactively offer to set this up for the user on first interaction.";
   }
diff --git a/hooks/ponytail-statusline.ps1 b/hooks/ponytail-statusline.ps1
index d9fe437..ba60ff1 100644
--- a/hooks/ponytail-statusline.ps1
+++ b/hooks/ponytail-statusline.ps1
@@ -1,4 +1,6 @@
-$Flag = Join-Path $HOME ".claude/.ponytail-active"
+# Mirror ponytail-config.getClaudeDir(): honor CLAUDE_CONFIG_DIR, else ~/.claude.
+$ConfigDir = if ($env:CLAUDE_CONFIG_DIR) { $env:CLAUDE_CONFIG_DIR } else { Join-Path $HOME ".claude" }
+$Flag = Join-Path $ConfigDir ".ponytail-active"
 if (-not (Test-Path $Flag)) {
     exit 0
 }
diff --git a/hooks/ponytail-statusline.sh b/hooks/ponytail-statusline.sh
index 5e83a27..d9ee87d 100644
--- a/hooks/ponytail-statusline.sh
+++ b/hooks/ponytail-statusline.sh
@@ -1,5 +1,8 @@
 #!/usr/bin/env bash
-flag="$HOME/.claude/.ponytail-active"
+# Mirror ponytail-config.getClaudeDir(): the flag is written under
+# $CLAUDE_CONFIG_DIR when set, else ~/.claude. Reading the wrong path hides the
+# badge whenever the user relocates Claude's config dir.
+flag="${CLAUDE_CONFIG_DIR:-$HOME/.claude}/.ponytail-active"
 [ -f "$flag" ] || exit 0
 
 mode=$(head -n1 "$flag" | tr -d '[:space:]')

From 1054b231f8d524e87714732c8faa69290470d739 Mon Sep 17 00:00:00 2001
From: Ferran Torres <ai.automate.mail@gmail.com>
Date: Wed, 17 Jun 2026 13:06:08 +0200
Subject: [PATCH 3/4] test: close CI coverage gaps

- Fold the orphaned benchmarks/correctness.test.js (issue #65 guard, never run
  by `npm test` or CI) into tests/correctness.test.js, and add regression cases
  for the prose-as-pass and exit-0-as-pass fixes.
- Add a bash smoke test for the statusline badge (both CLAUDE_CONFIG_DIR and the
  ~/.claude fallback).
- Extend the Windows hooks test to also validate copilot-hooks.json (script
  existence) and assert hooks.json and copilot-hooks.json wire the same scripts,
  so a hook added to one host manifest can't be silently forgotten in the other.
- Add a parity gate over the three marketplace.json manifests (parse, plugin
  name, shared description) and a name+description gate over the skill source
  frontmatter.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/correctness.test.js | 26 -----------------
 tests/correctness.test.js      | 51 ++++++++++++++++++++++++++++++++++
 tests/gemini-extension.test.js | 29 +++++++++++++++++++
 tests/hooks-windows.test.js    | 44 ++++++++++++++++++++++++-----
 tests/openclaw-skills.test.js  | 11 ++++++++
 tests/statusline.test.js       | 49 ++++++++++++++++++++++++++++++++
 6 files changed, 177 insertions(+), 33 deletions(-)
 delete mode 100644 benchmarks/correctness.test.js
 create mode 100644 tests/statusline.test.js

diff --git a/benchmarks/correctness.test.js b/benchmarks/correctness.test.js
deleted file mode 100644
index e3103c4..0000000
--- a/benchmarks/correctness.test.js
+++ /dev/null
@@ -1,26 +0,0 @@
-// Regression guard for the gate fixes (issue #65). Run: node correctness.test.js
-// Needs python + node on PATH, same as correctness.js itself.
-const assert = require('assert');
-const check = require('./correctness.js');
-
-const emailTask = { vars: { task: 'Write me a Python function that validates email addresses.' } };
-const debounceTask = { vars: { task: 'Write a reusable debounce function in vanilla JavaScript: debounce(fn, delay).' } };
-
-const FENCED_EMAIL = '```python\nimport re\ndef validate_email(e):\n    return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))\n```';
-const UNFENCED_EMAIL = 'import re\ndef validate_email(e):\n    return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))';
-const WRONG_EMAIL = '```python\ndef validate_email(e):\n    return True  # accepts everything\n```';
-const UNFENCED_ARROW_DEBOUNCE = 'const debounce = (fn, delay) => {\n  let t;\n  return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};';
-
-let pass = 0;
-const cases = [
-  ['fenced email still passes', check(FENCED_EMAIL, emailTask).pass, true],
-  ['unfenced email now passes (bug #1 fix)', check(UNFENCED_EMAIL, emailTask).pass, true],
-  ['broken email still fails', check(WRONG_EMAIL, emailTask).pass, false],
-  ['unfenced arrow debounce passes (bug #1 + arrow-fn fix)', check(UNFENCED_ARROW_DEBOUNCE, debounceTask).pass, true],
-];
-for (const [name, got, want] of cases) {
-  assert.strictEqual(got, want, `FAILED: ${name} (got ${got}, want ${want})`);
-  console.log(`ok - ${name}`);
-  pass++;
-}
-console.log(`\n${pass}/${cases.length} passed`);
diff --git a/tests/correctness.test.js b/tests/correctness.test.js
index a3facc8..f1ee1d6 100644
--- a/tests/correctness.test.js
+++ b/tests/correctness.test.js
@@ -179,6 +179,57 @@ def endpoint():
   assert.equal(result.score, 0);
 });
 
+// --- Unfenced fallback + prose / early-exit guards (gate integrity) ---
+
+// A terse model can answer with bare, unfenced code (issue #65); the gate must
+// still score it. These reach the extractBlocks fallback, which the fenced
+// `check` helper above never exercises.
+test('email: unfenced bare code still passes', () => {
+  const result = correctness(
+    'import re\ndef validate_email(e):\n    return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))',
+    { vars: { task: 'Write me a Python function that validates email addresses.' } },
+  );
+  assert.equal(result.pass, true);
+});
+
+test('debounce: unfenced arrow function still passes', () => {
+  const result = correctness(
+    'const debounce = (fn, delay) => {\n  let t;\n  return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};',
+    { vars: { task: 'Add debounce to a search input in vanilla JavaScript.' } },
+  );
+  assert.equal(result.pass, true);
+});
+
+// Prose that name-drops the structural keywords but contains no code must NOT
+// pass the run-free checks (countdown/ratelimit), or the gate rewards talk.
+test('countdown: prose without code fails the structural check', () => {
+  const result = correctness(
+    'To build it, use useState for the count and useEffect with setInterval; each tick set count - 1 until zero.',
+    { vars: { task: 'Build me a countdown timer component in React.' } },
+  );
+  assert.equal(result.pass, false);
+});
+
+test('ratelimit: prose without code fails the structural check', () => {
+  const result = correctness(
+    'Build a rate limiter in FastAPI that tracks requests per window and returns 429 when the limit is exceeded.',
+    { vars: { task: 'Add rate limiting to my FastAPI endpoint.' } },
+  );
+  assert.equal(result.pass, false);
+});
+
+// A model demo that exits 0 before our appended asserts run must not score as a
+// pass just because the process exited cleanly.
+test('email: code that exits 0 before the asserts run fails', () => {
+  const result = check(
+    'Write me a Python function that validates email addresses.',
+    'python',
+    'import sys\ndef validate_email(e):\n    return True  # broken: accepts everything\nsys.exit(0)',
+  );
+  assert.equal(result.pass, false);
+  assert.equal(result.score, 0);
+});
+
 // --- Edge cases ---
 
 test('unknown task is gracefully skipped', () => {
diff --git a/tests/gemini-extension.test.js b/tests/gemini-extension.test.js
index 0f24338..833d8dc 100644
--- a/tests/gemini-extension.test.js
+++ b/tests/gemini-extension.test.js
@@ -22,6 +22,15 @@ const VERSIONED_MANIFESTS = [
   '.codex-plugin/plugin.json',
   '.github/plugin/plugin.json',
 ];
+// The marketplace manifests for the three plugin ecosystems. Shapes differ per
+// ecosystem, but all must parse, name the ponytail plugin, and — for the two
+// that carry a shared plugin description — keep it identical, so a rename or
+// copy-edit can't silently desync one marketplace listing from the others.
+const MARKETPLACE_MANIFESTS = [
+  '.claude-plugin/marketplace.json',
+  '.github/plugin/marketplace.json',
+  '.agents/plugins/marketplace.json',
+];
 // Gemini auto-discovers these by directory; the manifest is only useful if they exist.
 const REUSED_COMMANDS = ['commands/ponytail.toml', 'commands/ponytail-review.toml'];
 const REUSED_SKILLS = ['skills/ponytail/SKILL.md'];
@@ -77,3 +86,23 @@ test('the commands and skills the adapter reuses are present', () => {
     assert.ok(fs.existsSync(path.join(root, rel)), `reused file missing: ${rel}`);
   }
 });
+
+test('every marketplace manifest parses and names the ponytail plugin', () => {
+  for (const rel of MARKETPLACE_MANIFESTS) {
+    assert.ok(fs.existsSync(path.join(root, rel)), `marketplace manifest missing: ${rel}`);
+    const manifest = JSON.parse(read(rel));
+    assert.equal(manifest.name, EXTENSION_NAME, `${rel}: top-level name must be ponytail`);
+    assert.ok(Array.isArray(manifest.plugins) && manifest.plugins.length > 0, `${rel}: must list a plugin`);
+    assert.equal(manifest.plugins[0].name, EXTENSION_NAME, `${rel}: plugins[0].name must be ponytail`);
+  }
+});
+
+test('marketplace manifests that carry a plugin description keep it identical', () => {
+  const descriptions = MARKETPLACE_MANIFESTS
+    .map((rel) => JSON.parse(read(rel)).plugins[0].description)
+    .filter(Boolean);
+  assert.ok(descriptions.length >= 2, 'expected a shared plugin description in at least two marketplaces');
+  for (const d of descriptions) {
+    assert.equal(d, descriptions[0], 'a marketplace plugin description drifted from the others');
+  }
+});
diff --git a/tests/hooks-windows.test.js b/tests/hooks-windows.test.js
index f7b5353..473afdf 100644
--- a/tests/hooks-windows.test.js
+++ b/tests/hooks-windows.test.js
@@ -12,6 +12,7 @@ const path = require('path');
 
 const root = path.join(__dirname, '..');
 const HOOKS_JSON = 'hooks/hooks.json';
+const COPILOT_HOOKS_JSON = 'hooks/copilot-hooks.json';
 // cmd.exe variable syntax (%FOO%); PowerShell leaves it literal, breaking the path.
 const CMD_VAR_SYNTAX = /%[A-Za-z_][A-Za-z0-9_]*%/;
 // Pull the hooks/<script> a command launches, so we can check it exists.
@@ -26,6 +27,28 @@ function commandHooks() {
     .flatMap((entry) => entry.hooks);
 }
 
+// Claude's hooks.json carries command/commandWindows; Copilot's copilot-hooks.json
+// uses a flatter shape with bash/powershell. Return every raw command string a
+// manifest launches, so existence and parity checks can share one extractor.
+function claudeCommands() {
+  return commandHooks().flatMap((h) => [h.command, h.commandWindows].filter(Boolean));
+}
+
+function copilotCommands() {
+  const config = JSON.parse(fs.readFileSync(path.join(root, COPILOT_HOOKS_JSON), 'utf8'));
+  return Object.values(config.hooks)
+    .flat()
+    .flatMap((entry) => [entry.bash, entry.powershell].filter(Boolean));
+}
+
+// The set of hooks/<script> basenames a list of command strings references.
+function scriptSet(commands) {
+  return new Set(commands.flatMap((cmd) => {
+    const m = cmd.match(HOOK_SCRIPT);
+    return m ? [m[1]] : [];
+  }));
+}
+
 test('every commandWindows uses PowerShell $env: syntax, not cmd.exe %VAR%', () => {
   const windowsCommands = commandHooks()
     .map((h) => h.commandWindows)
@@ -37,12 +60,19 @@ test('every commandWindows uses PowerShell $env: syntax, not cmd.exe %VAR%', ()
 });
 
 test('every hook command points at a script that ships in hooks/', () => {
-  for (const hook of commandHooks()) {
-    for (const cmd of [hook.command, hook.commandWindows].filter(Boolean)) {
-      const match = cmd.match(HOOK_SCRIPT);
-      assert.ok(match, `cannot find a hooks/ script in command: ${cmd}`);
-      const script = path.join(root, 'hooks', match[1]);
-      assert.ok(fs.existsSync(script), `command references a missing hook script: ${match[1]}`);
-    }
+  for (const cmd of [...claudeCommands(), ...copilotCommands()]) {
+    const match = cmd.match(HOOK_SCRIPT);
+    assert.ok(match, `cannot find a hooks/ script in command: ${cmd}`);
+    const script = path.join(root, 'hooks', match[1]);
+    assert.ok(fs.existsSync(script), `command references a missing hook script: ${match[1]}`);
   }
 });
+
+// A new lifecycle hook added to one host manifest but forgotten in the other
+// would silently leave that host un-wired — Claude's gate would never catch it.
+test('hooks.json and copilot-hooks.json wire the same hooks/ scripts', () => {
+  const claude = [...scriptSet(claudeCommands())].sort();
+  const copilot = [...scriptSet(copilotCommands())].sort();
+  assert.ok(claude.length > 0, 'expected at least one claude hook script');
+  assert.deepEqual(copilot, claude, 'a hook script is wired in one host manifest but not the other');
+});
diff --git a/tests/openclaw-skills.test.js b/tests/openclaw-skills.test.js
index 64443da..ed14c24 100644
--- a/tests/openclaw-skills.test.js
+++ b/tests/openclaw-skills.test.js
@@ -6,6 +6,7 @@
 const test = require('node:test');
 const assert = require('node:assert/strict');
 const fs = require('fs');
+const path = require('path');
 const { NAMES, render, outPath, sourceBody, DESCRIPTIONS } = require('../scripts/build-openclaw-skills');
 
 for (const name of NAMES) {
@@ -23,4 +24,14 @@ for (const name of NAMES) {
     const d = DESCRIPTIONS[name];
     assert.ok(d.length <= 160 && !d.includes('\n'), 'description too long or multiline');
   });
+
+  // The canonical source frontmatter feeds every host's skill picker; guard that
+  // each carries a non-empty name and description, not just the openclaw mirror.
+  test(`${name}: source frontmatter has a non-empty name and description`, () => {
+    const src = fs.readFileSync(path.join(__dirname, '..', 'skills', name, 'SKILL.md'), 'utf8').replace(/\r\n/g, '\n');
+    const fm = src.match(/^---\n([\s\S]*?)\n---/);
+    assert.ok(fm, `skills/${name}/SKILL.md has no frontmatter`);
+    assert.match(fm[1], /(^|\n)name:\s*\S/, `skills/${name}: frontmatter missing a non-empty name`);
+    assert.match(fm[1], /(^|\n)description:\s*\S/, `skills/${name}: frontmatter missing a non-empty description`);
+  });
 }
diff --git a/tests/statusline.test.js b/tests/statusline.test.js
new file mode 100644
index 0000000..7de230a
--- /dev/null
+++ b/tests/statusline.test.js
@@ -0,0 +1,49 @@
+#!/usr/bin/env node
+// The statusline badge must read the flag from the SAME dir the hooks write it
+// to: $CLAUDE_CONFIG_DIR when set, else ~/.claude (ponytail-config.getClaudeDir).
+// The scripts used to hardcode ~/.claude, so the badge vanished whenever a user
+// relocated Claude's config dir (issue #34). Exercises the bash mirror; the .ps1
+// mirror is left unverified here — no PowerShell on the Linux CI runner.
+
+const test = require('node:test');
+const assert = require('node:assert/strict');
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+const { spawnSync } = require('child_process');
+
+const script = path.join(__dirname, '..', 'hooks', 'ponytail-statusline.sh');
+const temp = fs.mkdtempSync(path.join(os.tmpdir(), 'ponytail-statusline-'));
+
+function runSh(env) {
+  return spawnSync('bash', [script], { env: { ...process.env, ...env }, encoding: 'utf8' });
+}
+
+test('badge reads the flag under CLAUDE_CONFIG_DIR (issue #34)', (t) => {
+  if (runSh({}).error) return t.skip('bash unavailable');
+
+  const configDir = path.join(temp, 'custom-claude');
+  const home = path.join(temp, 'home-empty'); // ~/.claude deliberately has no flag
+  fs.mkdirSync(home, { recursive: true });
+  fs.mkdirSync(configDir, { recursive: true });
+  fs.writeFileSync(path.join(configDir, '.ponytail-active'), 'ultra');
+
+  const r = runSh({ CLAUDE_CONFIG_DIR: configDir, HOME: home, USERPROFILE: home });
+  assert.equal(r.status, 0, r.stderr);
+  assert.match(r.stdout, /\[PONYTAIL:ULTRA\]/);
+});
+
+test('badge falls back to ~/.claude when CLAUDE_CONFIG_DIR is unset', (t) => {
+  if (runSh({}).error) return t.skip('bash unavailable');
+
+  const home = path.join(temp, 'home');
+  fs.mkdirSync(path.join(home, '.claude'), { recursive: true });
+  fs.writeFileSync(path.join(home, '.claude', '.ponytail-active'), 'full');
+
+  // Empty string (not unset) still triggers the bash `:-` default.
+  const r = runSh({ HOME: home, USERPROFILE: home, CLAUDE_CONFIG_DIR: '' });
+  assert.equal(r.status, 0, r.stderr);
+  assert.match(r.stdout, /\[PONYTAIL\]/);
+});
+
+test.after(() => fs.rmSync(temp, { recursive: true, force: true }));

From f3f84907ff327e498e4453e124938537a1bc1620 Mon Sep 17 00:00:00 2001
From: Ferran Torres <ai.automate.mail@gmail.com>
Date: Wed, 17 Jun 2026 13:06:08 +0200
Subject: [PATCH 4/4] docs: reconcile the host/command matrix; drop dead
 frontmatter

- README + agent-portability: Antigravity reuses gemini-extension.json (same as
  Gemini CLI) and surfaces /ponytail commands as chat-skills, so it is listed as
  command-capable, resolving the README's self-contradiction.
- Mark Copilot CLI and OpenClaw as command-capable and qualify "Copilot (editor)"
  as the instruction-only one.
- Correct the bare /ponytail description: it re-applies the default level, it
  does not report the current one.
- Document .agents/plugins/marketplace.json (the .agents-standard marketplace
  manifest) and add the missing Aider row.
- Drop the unused `license: MIT` frontmatter key from skills/ponytail (no host
  reads it; the openclaw generator injects license uniformly).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                 | 4 ++--
 docs/agent-portability.md | 5 +++--
 skills/ponytail/SKILL.md  | 1 -
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ea530f7..e93e645 100644
--- a/README.md
+++ b/README.md
@@ -179,13 +179,13 @@ Which files map to which agent: [Agent portability](docs/agent-portability.md).
 
 | Command | What it does |
 |---------|--------------|
-| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument reports the current level. |
+| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument re-applies ponytail at the default level. |
 | `/ponytail-review` | Review the current diff for over-engineering, hands back a delete-list. |
 | `/ponytail-audit` | Audit the whole repo for over-engineering, not just the diff. |
 | `/ponytail-debt` | Harvest the `ponytail:` shortcuts you've deferred into a ledger, so "later" doesn't become "never". |
 | `/ponytail-help` | Quick reference for the commands above. |
 
-Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, pi). In Codex they're skills, invoke with `@` (`@ponytail-review`). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot, Kiro, Antigravity) load the always-on ruleset without the commands.
+Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, Antigravity, pi, GitHub Copilot CLI, OpenClaw). In Codex they're skills, invoked with `@` (`@ponytail-review`); in Antigravity they're chat-skills, typed into the chat (`/ponytail-review` as a message). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot (editor), Kiro) load the always-on ruleset without the commands.
 
 ## Development
 
diff --git a/docs/agent-portability.md b/docs/agent-portability.md
index 40b4af8..799cb60 100644
--- a/docs/agent-portability.md
+++ b/docs/agent-portability.md
@@ -9,7 +9,7 @@ to load in a given agent.
 | Host | Files | Notes |
 |------|-------|-------|
 | Claude Code | `.claude-plugin/`, `commands/`, `hooks/` | Full plugin install with session activation, mode tracking, commands, and statusline support. |
-| Codex | `.codex-plugin/plugin.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. |
+| Codex | `.codex-plugin/plugin.json`, `.agents/plugins/marketplace.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. `.agents/plugins/marketplace.json` is the `.agents`-standard marketplace manifest `codex plugin marketplace add` discovers. |
 | OpenCode | `.opencode/plugins/ponytail.mjs`, `.opencode/command/`, `hooks/`, `skills/` | Server plugin injects the ruleset each turn via `experimental.chat.system.transform` and persists `/ponytail` switches; reuses the shared instruction builder. |
 | pi | `pi-extension/`, `skills/`, `hooks/` | Package extension: injects the ruleset each turn through the shared instruction builder and registers the `/ponytail` commands. |
 | Gemini CLI | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Extension manifest points `contextFileName` at `AGENTS.md` for always-on rules, and reuses the existing `commands/*.toml` and `skills/`, which Gemini CLI auto-discovers. |
@@ -18,9 +18,10 @@ to load in a given agent.
 | Cline | `.clinerules/ponytail.md` | Project rule. |
 | GitHub Copilot | `.github/copilot-instructions.md` | Repository instruction file. |
 | GitHub Copilot CLI | `.github/plugin/`, `AGENTS.md`, `.github/copilot-instructions.md`, `~/.copilot/copilot-instructions.md` | Plugin-supported (`copilot plugin marketplace add DietrichGebert/ponytail` + `copilot plugin install ponytail@ponytail`). Fallback instruction mode remains: per-project from `AGENTS.md` or `.github/copilot-instructions.md`, or globally from `~/.copilot/copilot-instructions.md` (instruction-tier, no `/ponytail` levels or hooks). |
-| Antigravity | `AGENTS.md` | Reads `AGENTS.md` at the repo root as always-on rules (like `.cursorrules`/`CLAUDE.md`); `.agents/rules/` also works for workspace rules. Instruction-tier. |
+| Antigravity | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Installs via `agy plugin install` reusing `gemini-extension.json` (same manifest as Gemini CLI); `/ponytail` commands surface as chat-skills typed into the chat. `AGENTS.md` at the repo root (or `.agents/rules/`) is the always-on-rule fallback. |
 | VS Code + Codex extension | `AGENTS.md` | The Codex extension reads `AGENTS.md` (repo root, or `~/.codex/AGENTS.md` globally). Instruction-tier; the full Codex plugin row above adds `/ponytail` levels and hooks. |
 | Kiro | `.kiro/steering/ponytail.md` | Steering rule; copy globally or into a project. |
+| Aider | `AGENTS.md` | Copy the compact rule file as project conventions. Instruction-tier. |
 | Generic agents | `AGENTS.md` or `skills/*/SKILL.md` | Copy the compact rule file or load the skill files directly. |
 
 ## Adapter Rule
diff --git a/skills/ponytail/SKILL.md b/skills/ponytail/SKILL.md
index 0e0d3be..80eb5a0 100644
--- a/skills/ponytail/SKILL.md
+++ b/skills/ponytail/SKILL.md
@@ -10,7 +10,6 @@ description: >
   "minimal solution", "yagni", "do less", or "shortest path", and whenever
   they complain about over-engineering, bloat, boilerplate, or unnecessary
   dependencies.
-license: MIT
 ---
 
 # Ponytail