DietrichGebert · ferran-valvia · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.opencode/plugins/ponytail.mjs b/.opencode/plugins/ponytail.mjs
@@ -56,8 +56,12 @@ export default async ({ client } = {}) => {
     // synchronous store if same-turn switching ever matters.
     'command.execute.before': async (input) => {
       if (!input || input.command !== 'ponytail') return;
+      // bare /ponytail re-applies the default; a known level sets it; an unknown
+      // arg (a typo) is left alone rather than silently resetting the level.
+      const arg = (input.arguments || '').trim();
+      const mode = arg === '' ? getDefaultMode() : normalizePersistedMode(arg);
+      if (!mode) return;
       // `off` is persisted like any mode; the transform reads it and stays silent.
-      const mode = normalizePersistedMode((input.arguments || '').trim()) || getDefaultMode();
       writeMode(mode);
       log('info', 'ponytail ' + mode);
     },

diff --git a/README.md b/README.md
@@ -179,13 +179,13 @@ Which files map to which agent: [Agent portability](docs/agent-portability.md).
 
 | Command | What it does |
 |---------|--------------|
-| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument reports the current level. |
+| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument re-applies ponytail at the default level. |
 | `/ponytail-review` | Review the current diff for over-engineering, hands back a delete-list. |
 | `/ponytail-audit` | Audit the whole repo for over-engineering, not just the diff. |
 | `/ponytail-debt` | Harvest the `ponytail:` shortcuts you've deferred into a ledger, so "later" doesn't become "never". |
 | `/ponytail-help` | Quick reference for the commands above. |
 
-Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, pi). In Codex they're skills, invoke with `@` (`@ponytail-review`). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot, Kiro, Antigravity) load the always-on ruleset without the commands.
+Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, Antigravity, pi, GitHub Copilot CLI, OpenClaw). In Codex they're skills, invoked with `@` (`@ponytail-review`); in Antigravity they're chat-skills, typed into the chat (`/ponytail-review` as a message). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot editor, Kiro) load the always-on ruleset without the commands.
 
 ## Development
 

diff --git a/benchmarks/correctness.js b/benchmarks/correctness.js
@@ -17,10 +17,19 @@ function extractBlocks(text) {
   const matches = [...text.matchAll(/```(\w*)\r?\n([\s\S]*?)```/g)];
   // ponytail: terse models often answer with bare, unfenced code. Treat the whole
   // response as one block so the gate scores the code instead of reporting "no block".
-  if (matches.length === 0 && text.trim()) return [{ lang: '', code: text }];
+  if (matches.length === 0 && text.trim()) return [{ lang: '', code: text, unfenced: true }];
   return matches.map((m) => ({ lang: (m[1] || '').toLowerCase(), code: m[2] }));
 }
 
+// ponytail: a fenced block is code by the model's own delimiter; the unfenced
+// fallback wraps the whole response, prose included. countdown/ratelimit only
+// inspect text and never run it, so prose carrying the right keywords would
+// pass — so an unfenced block must contain a brace, `=>`, `@`+word char, `def`
+// or `import`. ceiling: token heuristic, not a parser; prose with those passes.
+function looksLikeCode(block) {
+  return !block.unfenced || /[{}]|=>|@\w|\bdef\b|\bimport\b/.test(block.code);
+}
+
 // Identify which task we're evaluating from vars.task.
 function identifyTask(task) {
   const t = task.toLowerCase();
@@ -35,13 +44,21 @@ function identifyTask(task) {
-// Run a command, return { ok, stderr }.
+// Run a command, return { ok, stdout, stderr }.
 function exec(cmd, opts = {}) {
   try {
-    execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
-    return { ok: true, stderr: '' };
+    const stdout = execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
+    return { ok: true, stdout: stdout || '', stderr: '' };
   } catch (e) {
-    return { ok: false, stderr: (e.stderr || e.message || '').slice(0, 500) };
+    return { ok: false, stdout: e.stdout || '', stderr: (e.stderr || e.message || '').slice(0, 500) };
   }
 }
 
+// ponytail: exit-0 alone is not a pass. The model's own code can sys.exit(0) /
+// process.exit(0) in a __main__/demo block before our appended asserts run, so
+// each harness prints a PASS sentinel on success and we require it here: PASS
+// alone must be the last non-blank stdout line, or a skipped assertion passes.
+function passed(result) {
+  return result.ok && /(^|\n)PASS\s*$/.test(result.stdout || '');
+}
+
 // ponytail: probe once at load; macOS and many Linux images ship python3 only.
 let pythonCmd;
 function python() {
@@ -120,7 +137,7 @@ print("PASS")
     const f = tmpFile('.py', harness);
     const result = exec(`${python()} "${f}"`);
     fs.unlinkSync(f);
-    if (result.ok) return { pass: true, reason: 'Email validator passes all checks' };
+    if (passed(result)) return { pass: true, reason: 'Email validator passes all checks' };
     return { pass: false, reason: result.stderr || 'Email validator failed' };
   },
 
@@ -165,7 +182,7 @@ setTimeout(() => {
     const f = tmpFile('.mjs', harness);
     const result = exec(`node "${f}"`);
     fs.unlinkSync(f);
-    if (result.ok) return { pass: true, reason: 'Debounce passes all checks' };
+    if (passed(result)) return { pass: true, reason: 'Debounce passes all checks' };
     return { pass: false, reason: result.stderr || 'Debounce failed' };
   },
 
@@ -215,15 +232,15 @@ else:
     const result = exec(`${python()} "${f}"`);
     try { fs.unlinkSync(f); } catch (e) {}
     try { fs.unlinkSync(csvPath); } catch (e) {}
-    if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' };
+    if (passed(result)) return { pass: true, reason: 'CSV sum produces correct result (351)' };
     return { pass: false, reason: result.stderr || 'CSV sum failed' };
   },
 
   countdown(blocks) {
     // React components can't run in bare Node without a bundler. Structural check:
     // the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout).
     const code = blocks.find((b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer'));
-    if (!code) return { pass: false, reason: 'No countdown component found' };
+    if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No countdown component found' };
 
     const src = code.code;
     const hasState = /useState|useReducer|this\.state/.test(src);
@@ -241,7 +258,7 @@ else:
 
   ratelimit(blocks) {
     const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && (b.code.includes('rate') || b.code.includes('limit'))));
-    if (!code) return { pass: false, reason: 'No Python code block found' };
+    if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No Python code block found' };
 
     // Structural check for rate limiting: must have some form of counter/time tracking.
     const src = code.code;

diff --git a/benchmarks/correctness.test.js b/benchmarks/correctness.test.js
diff --git a/docs/agent-portability.md b/docs/agent-portability.md
@@ -9,7 +9,7 @@ to load in a given agent.
 | Host | Files | Notes |
 |------|-------|-------|
 | Claude Code | `.claude-plugin/`, `commands/`, `hooks/` | Full plugin install with session activation, mode tracking, commands, and statusline support. |
-| Codex | `.codex-plugin/plugin.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. |
+| Codex | `.codex-plugin/plugin.json`, `.agents/plugins/marketplace.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. `.agents/plugins/marketplace.json` is the `.agents`-standard marketplace manifest that `codex plugin marketplace add` discovers. |
 | OpenCode | `.opencode/plugins/ponytail.mjs`, `.opencode/command/`, `hooks/`, `skills/` | Server plugin injects the ruleset each turn via `experimental.chat.system.transform` and persists `/ponytail` switches; reuses the shared instruction builder. |
 | pi | `pi-extension/`, `skills/`, `hooks/` | Package extension: injects the ruleset each turn through the shared instruction builder and registers the `/ponytail` commands. |
 | Gemini CLI | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Extension manifest points `contextFileName` at `AGENTS.md` for always-on rules, and reuses the existing `commands/*.toml` and `skills/`, which Gemini CLI auto-discovers. |
@@ -18,9 +18,10 @@ to load in a given agent.
 | Cline | `.clinerules/ponytail.md` | Project rule. |
 | GitHub Copilot | `.github/copilot-instructions.md` | Repository instruction file. |
 | GitHub Copilot CLI | `.github/plugin/`, `AGENTS.md`, `.github/copilot-instructions.md`, `~/.copilot/copilot-instructions.md` | Plugin-supported (`copilot plugin marketplace add DietrichGebert/ponytail` + `copilot plugin install ponytail@ponytail`). Fallback instruction mode remains: per-project from `AGENTS.md` or `.github/copilot-instructions.md`, or globally from `~/.copilot/copilot-instructions.md` (instruction-tier, no `/ponytail` levels or hooks). |
-| Antigravity | `AGENTS.md` | Reads `AGENTS.md` at the repo root as always-on rules (like `.cursorrules`/`CLAUDE.md`); `.agents/rules/` also works for workspace rules. Instruction-tier. |
+| Antigravity | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Installs via `agy plugin install` reusing `gemini-extension.json` (same manifest as Gemini CLI); `/ponytail` commands surface as chat-skills typed into the chat. `AGENTS.md` at the repo root (or `.agents/rules/`) is the always-on-rule fallback. |
 | VS Code + Codex extension | `AGENTS.md` | The Codex extension reads `AGENTS.md` (repo root, or `~/.codex/AGENTS.md` globally). Instruction-tier; the full Codex plugin row above adds `/ponytail` levels and hooks. |
 | Kiro | `.kiro/steering/ponytail.md` | Steering rule; copy globally or into a project. |
+| Aider | `AGENTS.md` | Copy the compact rule file as project conventions. Instruction-tier. |
 | Generic agents | `AGENTS.md` or `skills/*/SKILL.md` | Copy the compact rule file or load the skill files directly. |
 
 ## Adapter Rule

diff --git a/hooks/ponytail-activate.js b/hooks/ponytail-activate.js
@@ -61,7 +61,9 @@ if (!isCodex) try {
     output += "\n\n" +
       "STATUSLINE SETUP NEEDED: The ponytail plugin includes a statusline badge showing active mode " +
       "(e.g. [PONYTAIL], [PONYTAIL:ULTRA]). It is not configured yet. " +
-      "To enable, add this to ~/.claude/settings.json: " +
+      // settingsPath honors CLAUDE_CONFIG_DIR; the literal "~/.claude" misled
+      // anyone who relocated Claude's config dir to edit the wrong file.
+      "To enable, add this to " + settingsPath + ": " +
       statusLineSnippet + " " +
       "Proactively offer to set this up for the user on first interaction.";
   }

diff --git a/hooks/ponytail-mode-tracker.js b/hooks/ponytail-mode-tracker.js
@@ -28,7 +28,9 @@ process.stdin.on('end', () => {
         else if (arg === 'full') mode = 'full';
         else if (arg === 'ultra') mode = 'ultra';
         else if (arg === 'off') mode = 'off';
-        else mode = getDefaultMode();
+        else if (arg === '') mode = getDefaultMode();
+        // else: unknown arg (a typo) — leave mode null so we don't silently
+        // reset the active level; pi already treats unknown args as a no-op.
       }
 
       if (mode && mode !== 'off') {
@@ -42,10 +44,10 @@ process.stdin.on('end', () => {
         clearMode();
         writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF');
       }
-    }
-
-    // Detect deactivation
-    if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) {
+    } else if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) {
+      // Deactivation phrase — but only when the prompt isn't itself a /ponytail
+      // command. A prompt matching both branches used to emit two hook outputs
+      // (two JSON objects on one stdout), breaking the host's JSON.parse.
       clearMode();
       writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF');
     }

diff --git a/hooks/ponytail-statusline.ps1 b/hooks/ponytail-statusline.ps1
@@ -1,4 +1,6 @@
-$Flag = Join-Path $HOME ".claude/.ponytail-active"
+# Mirror ponytail-config.getClaudeDir(): honor CLAUDE_CONFIG_DIR, else ~/.claude.
+$ConfigDir = if ($env:CLAUDE_CONFIG_DIR) { $env:CLAUDE_CONFIG_DIR } else { Join-Path $HOME ".claude" }
+$Flag = Join-Path $ConfigDir ".ponytail-active"
 if (-not (Test-Path $Flag)) {
     exit 0
 }

diff --git a/hooks/ponytail-statusline.sh b/hooks/ponytail-statusline.sh
@@ -1,5 +1,8 @@
 #!/usr/bin/env bash
-flag="$HOME/.claude/.ponytail-active"
+# Mirror ponytail-config.getClaudeDir(): the flag is written under
+# $CLAUDE_CONFIG_DIR when set, else ~/.claude. Reading the wrong path hides the
+# badge whenever the user relocates Claude's config dir.
+flag="${CLAUDE_CONFIG_DIR:-$HOME/.claude}/.ponytail-active"
 [ -f "$flag" ] || exit 0
 
 mode=$(head -n1 "$flag" | tr -d '[:space:]')

diff --git a/skills/ponytail/SKILL.md b/skills/ponytail/SKILL.md
@@ -10,7 +10,6 @@ description: >
   "minimal solution", "yagni", "do less", or "shortest path", and whenever
   they complain about over-engineering, bloat, boilerplate, or unnecessary
   dependencies.
-license: MIT
 ---
 
 # Ponytail

diff --git a/tests/correctness.test.js b/tests/correctness.test.js
@@ -179,6 +179,57 @@ def endpoint():
   assert.equal(result.score, 0);
 });
 
+// --- Unfenced fallback + prose / early-exit guards (gate integrity) ---
+
+// A terse model can answer with bare, unfenced code (issue #65); the gate must
+// still score it. These reach the extractBlocks fallback, which the fenced
+// `check` helper above never exercises.
+test('email: unfenced bare code still passes', () => {
+  const result = correctness(
+    'import re\ndef validate_email(e):\n    return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))',
+    { vars: { task: 'Write me a Python function that validates email addresses.' } },
+  );
+  assert.equal(result.pass, true);
+});
+
+test('debounce: unfenced arrow function still passes', () => {
+  const result = correctness(
+    'const debounce = (fn, delay) => {\n  let t;\n  return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};',
+    { vars: { task: 'Add debounce to a search input in vanilla JavaScript.' } },
+  );
+  assert.equal(result.pass, true);
+});
+
+// Prose that name-drops the structural keywords but contains no code must NOT
+// pass the run-free checks (countdown/ratelimit), or the gate rewards talk.
+test('countdown: prose without code fails the structural check', () => {
+  const result = correctness(
+    'To build it, use useState for the count and useEffect with setInterval; each tick set count - 1 until zero.',
+    { vars: { task: 'Build me a countdown timer component in React.' } },
+  );
+  assert.equal(result.pass, false);
+});
+
+test('ratelimit: prose without code fails the structural check', () => {
+  const result = correctness(
+    'Build a rate limiter in FastAPI that tracks requests per window and returns 429 when the limit is exceeded.',
+    { vars: { task: 'Add rate limiting to my FastAPI endpoint.' } },
+  );
+  assert.equal(result.pass, false);
+});
+
+// A model demo that exits 0 before our appended asserts run must not score as a
+// pass just because the process exited cleanly.
+test('email: code that exits 0 before the asserts run fails', () => {
+  const result = check(
+    'Write me a Python function that validates email addresses.',
+    'python',
+    'import sys\ndef validate_email(e):\n    return True  # broken: accepts everything\nsys.exit(0)',
+  );
+  assert.equal(result.pass, false);
+  assert.equal(result.score, 0);
+});
+
 // --- Edge cases ---
 
 test('unknown task is gracefully skipped', () => {

diff --git a/tests/gemini-extension.test.js b/tests/gemini-extension.test.js
@@ -22,6 +22,15 @@ const VERSIONED_MANIFESTS = [
   '.codex-plugin/plugin.json',
   '.github/plugin/plugin.json',
 ];
+// The marketplace manifests for the three plugin ecosystems. Shapes differ per
+// ecosystem, but all must parse, name the ponytail plugin, and — for the two
+// that carry a shared plugin description — keep it identical, so a rename or
+// copy-edit can't silently desync one marketplace listing from the others.
+const MARKETPLACE_MANIFESTS = [
+  '.claude-plugin/marketplace.json',
+  '.github/plugin/marketplace.json',
+  '.agents/plugins/marketplace.json',
+];
 // Gemini auto-discovers these by directory; the manifest is only useful if they exist.
 const REUSED_COMMANDS = ['commands/ponytail.toml', 'commands/ponytail-review.toml'];
 const REUSED_SKILLS = ['skills/ponytail/SKILL.md'];
@@ -77,3 +86,23 @@ test('the commands and skills the adapter reuses are present', () => {
     assert.ok(fs.existsSync(path.join(root, rel)), `reused file missing: ${rel}`);
   }
 });
+
+test('every marketplace manifest parses and names the ponytail plugin', () => {
+  for (const rel of MARKETPLACE_MANIFESTS) {
+    assert.ok(fs.existsSync(path.join(root, rel)), `marketplace manifest missing: ${rel}`);
+    const manifest = JSON.parse(read(rel));
+    assert.equal(manifest.name, EXTENSION_NAME, `${rel}: top-level name must be ponytail`);
+    assert.ok(Array.isArray(manifest.plugins) && manifest.plugins.length > 0, `${rel}: must list a plugin`);
+    assert.equal(manifest.plugins[0].name, EXTENSION_NAME, `${rel}: plugins[0].name must be ponytail`);
+  }
+});
+
+test('marketplace manifests that carry a plugin description keep it identical', () => {
+  const descriptions = MARKETPLACE_MANIFESTS
+    .map((rel) => JSON.parse(read(rel)).plugins[0].description)
+    .filter(Boolean);
+  assert.ok(descriptions.length >= 2, 'expected a shared plugin description in at least two marketplaces');
+  for (const d of descriptions) {
+    assert.equal(d, descriptions[0], 'a marketplace plugin description drifted from the others');
+  }
+});