diff --git a/.opencode/plugins/ponytail.mjs b/.opencode/plugins/ponytail.mjs index 2fd3625..ab7b7c0 100644 --- a/.opencode/plugins/ponytail.mjs +++ b/.opencode/plugins/ponytail.mjs @@ -56,8 +56,12 @@ export default async ({ client } = {}) => { // synchronous store if same-turn switching ever matters. 'command.execute.before': async (input) => { if (!input || input.command !== 'ponytail') return; + // bare /ponytail re-applies the default; a known level sets it; an unknown + // arg (a typo) is left alone rather than silently resetting the level. + const arg = (input.arguments || '').trim(); + const mode = arg === '' ? getDefaultMode() : normalizePersistedMode(arg); + if (!mode) return; // `off` is persisted like any mode; the transform reads it and stays silent. - const mode = normalizePersistedMode((input.arguments || '').trim()) || getDefaultMode(); writeMode(mode); log('info', 'ponytail ' + mode); }, diff --git a/README.md b/README.md index ea530f7..e93e645 100644 --- a/README.md +++ b/README.md @@ -179,13 +179,13 @@ Which files map to which agent: [Agent portability](docs/agent-portability.md). | Command | What it does | |---------|--------------| -| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument reports the current level. | +| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument re-applies ponytail at the default level. | | `/ponytail-review` | Review the current diff for over-engineering, hands back a delete-list. | | `/ponytail-audit` | Audit the whole repo for over-engineering, not just the diff. | | `/ponytail-debt` | Harvest the `ponytail:` shortcuts you've deferred into a ledger, so "later" doesn't become "never". | | `/ponytail-help` | Quick reference for the commands above. | -Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, pi). In Codex they're skills, invoke with `@` (`@ponytail-review`). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot, Kiro, Antigravity) load the always-on ruleset without the commands. +Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, Antigravity, pi, GitHub Copilot CLI, OpenClaw). In Codex they're skills, invoke with `@` (`@ponytail-review`); in Antigravity they're chat-skills, typed into the chat (`/ponytail-review` as a message). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot editor, Kiro) load the always-on ruleset without the commands. ## Development diff --git a/benchmarks/correctness.js b/benchmarks/correctness.js index fc56611..2b896c0 100644 --- a/benchmarks/correctness.js +++ b/benchmarks/correctness.js @@ -17,10 +17,19 @@ function extractBlocks(text) { const matches = [...text.matchAll(/```(\w*)\r?\n([\s\S]*?)```/g)]; // ponytail: terse models often answer with bare, unfenced code. Treat the whole // response as one block so the gate scores the code instead of reporting "no block". - if (matches.length === 0 && text.trim()) return [{ lang: '', code: text }]; + if (matches.length === 0 && text.trim()) return [{ lang: '', code: text, unfenced: true }]; return matches.map((m) => ({ lang: (m[1] || '').toLowerCase(), code: m[2] })); } +// ponytail: a fenced block is code by the model's own delimiter; the unfenced +// fallback wraps the whole response, prose included. The structural-only checks +// (countdown/ratelimit) never run the code, so prose carrying the right keywords +// would score as a pass — require a real code construct on an unfenced block +// before trusting them. ceiling: keyword heuristic, not a parser. +function looksLikeCode(block) { + return !block.unfenced || /[{}]|=>|@\w|\bdef\b|\bimport\b/.test(block.code); +} + // Identify which task we're evaluating from vars.task. function identifyTask(task) { const t = task.toLowerCase(); @@ -35,13 +44,21 @@ function identifyTask(task) { // Run a command, return { ok, stderr }. function exec(cmd, opts = {}) { try { - execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts }); - return { ok: true, stderr: '' }; + const stdout = execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts }); + return { ok: true, stdout: stdout || '', stderr: '' }; } catch (e) { - return { ok: false, stderr: (e.stderr || e.message || '').slice(0, 500) }; + return { ok: false, stdout: e.stdout || '', stderr: (e.stderr || e.message || '').slice(0, 500) }; } } +// ponytail: exit-0 alone is not a pass. The model's own code can sys.exit(0) / +// process.exit(0) in a __main__/demo block before our appended asserts run, so +// each harness prints a PASS sentinel on success and we require it here. Without +// this, a skipped assertion masquerades as a passing answer. +function passed(result) { + return result.ok && /(^|\n)PASS\s*$/.test(result.stdout || ''); +} + // ponytail: probe once at load; macOS and many Linux images ship python3 only. let pythonCmd; function python() { @@ -120,7 +137,7 @@ print("PASS") const f = tmpFile('.py', harness); const result = exec(`${python()} "${f}"`); fs.unlinkSync(f); - if (result.ok) return { pass: true, reason: 'Email validator passes all checks' }; + if (passed(result)) return { pass: true, reason: 'Email validator passes all checks' }; return { pass: false, reason: result.stderr || 'Email validator failed' }; }, @@ -165,7 +182,7 @@ setTimeout(() => { const f = tmpFile('.mjs', harness); const result = exec(`node "${f}"`); fs.unlinkSync(f); - if (result.ok) return { pass: true, reason: 'Debounce passes all checks' }; + if (passed(result)) return { pass: true, reason: 'Debounce passes all checks' }; return { pass: false, reason: result.stderr || 'Debounce failed' }; }, @@ -215,7 +232,7 @@ else: const result = exec(`${python()} "${f}"`); try { fs.unlinkSync(f); } catch (e) {} try { fs.unlinkSync(csvPath); } catch (e) {} - if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' }; + if (passed(result)) return { pass: true, reason: 'CSV sum produces correct result (351)' }; return { pass: false, reason: result.stderr || 'CSV sum failed' }; }, @@ -223,7 +240,7 @@ else: // React components can't run in bare Node without a bundler. Structural check: // the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout). const code = blocks.find((b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer')); - if (!code) return { pass: false, reason: 'No countdown component found' }; + if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No countdown component found' }; const src = code.code; const hasState = /useState|useReducer|this\.state/.test(src); @@ -241,7 +258,7 @@ else: ratelimit(blocks) { const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && (b.code.includes('rate') || b.code.includes('limit')))); - if (!code) return { pass: false, reason: 'No Python code block found' }; + if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No Python code block found' }; // Structural check for rate limiting: must have some form of counter/time tracking. const src = code.code; diff --git a/benchmarks/correctness.test.js b/benchmarks/correctness.test.js deleted file mode 100644 index e3103c4..0000000 --- a/benchmarks/correctness.test.js +++ /dev/null @@ -1,26 +0,0 @@ -// Regression guard for the gate fixes (issue #65). Run: node correctness.test.js -// Needs python + node on PATH, same as correctness.js itself. -const assert = require('assert'); -const check = require('./correctness.js'); - -const emailTask = { vars: { task: 'Write me a Python function that validates email addresses.' } }; -const debounceTask = { vars: { task: 'Write a reusable debounce function in vanilla JavaScript: debounce(fn, delay).' } }; - -const FENCED_EMAIL = '```python\nimport re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))\n```'; -const UNFENCED_EMAIL = 'import re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))'; -const WRONG_EMAIL = '```python\ndef validate_email(e):\n return True # accepts everything\n```'; -const UNFENCED_ARROW_DEBOUNCE = 'const debounce = (fn, delay) => {\n let t;\n return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};'; - -let pass = 0; -const cases = [ - ['fenced email still passes', check(FENCED_EMAIL, emailTask).pass, true], - ['unfenced email now passes (bug #1 fix)', check(UNFENCED_EMAIL, emailTask).pass, true], - ['broken email still fails', check(WRONG_EMAIL, emailTask).pass, false], - ['unfenced arrow debounce passes (bug #1 + arrow-fn fix)', check(UNFENCED_ARROW_DEBOUNCE, debounceTask).pass, true], -]; -for (const [name, got, want] of cases) { - assert.strictEqual(got, want, `FAILED: ${name} (got ${got}, want ${want})`); - console.log(`ok - ${name}`); - pass++; -} -console.log(`\n${pass}/${cases.length} passed`); diff --git a/docs/agent-portability.md b/docs/agent-portability.md index 40b4af8..799cb60 100644 --- a/docs/agent-portability.md +++ b/docs/agent-portability.md @@ -9,7 +9,7 @@ to load in a given agent. | Host | Files | Notes | |------|-------|-------| | Claude Code | `.claude-plugin/`, `commands/`, `hooks/` | Full plugin install with session activation, mode tracking, commands, and statusline support. | -| Codex | `.codex-plugin/plugin.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. | +| Codex | `.codex-plugin/plugin.json`, `.agents/plugins/marketplace.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. `.agents/plugins/marketplace.json` is the `.agents`-standard marketplace manifest `codex plugin marketplace add` discovers. | | OpenCode | `.opencode/plugins/ponytail.mjs`, `.opencode/command/`, `hooks/`, `skills/` | Server plugin injects the ruleset each turn via `experimental.chat.system.transform` and persists `/ponytail` switches; reuses the shared instruction builder. | | pi | `pi-extension/`, `skills/`, `hooks/` | Package extension: injects the ruleset each turn through the shared instruction builder and registers the `/ponytail` commands. | | Gemini CLI | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Extension manifest points `contextFileName` at `AGENTS.md` for always-on rules, and reuses the existing `commands/*.toml` and `skills/`, which Gemini CLI auto-discovers. | @@ -18,9 +18,10 @@ to load in a given agent. | Cline | `.clinerules/ponytail.md` | Project rule. | | GitHub Copilot | `.github/copilot-instructions.md` | Repository instruction file. | | GitHub Copilot CLI | `.github/plugin/`, `AGENTS.md`, `.github/copilot-instructions.md`, `~/.copilot/copilot-instructions.md` | Plugin-supported (`copilot plugin marketplace add DietrichGebert/ponytail` + `copilot plugin install ponytail@ponytail`). Fallback instruction mode remains: per-project from `AGENTS.md` or `.github/copilot-instructions.md`, or globally from `~/.copilot/copilot-instructions.md` (instruction-tier, no `/ponytail` levels or hooks). | -| Antigravity | `AGENTS.md` | Reads `AGENTS.md` at the repo root as always-on rules (like `.cursorrules`/`CLAUDE.md`); `.agents/rules/` also works for workspace rules. Instruction-tier. | +| Antigravity | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Installs via `agy plugin install` reusing `gemini-extension.json` (same manifest as Gemini CLI); `/ponytail` commands surface as chat-skills typed into the chat. `AGENTS.md` at the repo root (or `.agents/rules/`) is the always-on-rule fallback. | | VS Code + Codex extension | `AGENTS.md` | The Codex extension reads `AGENTS.md` (repo root, or `~/.codex/AGENTS.md` globally). Instruction-tier; the full Codex plugin row above adds `/ponytail` levels and hooks. | | Kiro | `.kiro/steering/ponytail.md` | Steering rule; copy globally or into a project. | +| Aider | `AGENTS.md` | Copy the compact rule file as project conventions. Instruction-tier. | | Generic agents | `AGENTS.md` or `skills/*/SKILL.md` | Copy the compact rule file or load the skill files directly. | ## Adapter Rule diff --git a/hooks/ponytail-activate.js b/hooks/ponytail-activate.js index 9610721..6f3ee50 100644 --- a/hooks/ponytail-activate.js +++ b/hooks/ponytail-activate.js @@ -61,7 +61,9 @@ if (!isCodex) try { output += "\n\n" + "STATUSLINE SETUP NEEDED: The ponytail plugin includes a statusline badge showing active mode " + "(e.g. [PONYTAIL], [PONYTAIL:ULTRA]). It is not configured yet. " + - "To enable, add this to ~/.claude/settings.json: " + + // settingsPath honors CLAUDE_CONFIG_DIR; the literal "~/.claude" misled + // anyone who relocated Claude's config dir to edit the wrong file. + "To enable, add this to " + settingsPath + ": " + statusLineSnippet + " " + "Proactively offer to set this up for the user on first interaction."; } diff --git a/hooks/ponytail-mode-tracker.js b/hooks/ponytail-mode-tracker.js index d4fda46..0209f50 100644 --- a/hooks/ponytail-mode-tracker.js +++ b/hooks/ponytail-mode-tracker.js @@ -28,7 +28,9 @@ process.stdin.on('end', () => { else if (arg === 'full') mode = 'full'; else if (arg === 'ultra') mode = 'ultra'; else if (arg === 'off') mode = 'off'; - else mode = getDefaultMode(); + else if (arg === '') mode = getDefaultMode(); + // else: unknown arg (a typo) — leave mode null so we don't silently + // reset the active level; pi already treats unknown args as a no-op. } if (mode && mode !== 'off') { @@ -42,10 +44,10 @@ process.stdin.on('end', () => { clearMode(); writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF'); } - } - - // Detect deactivation - if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) { + } else if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) { + // Deactivation phrase — but only when the prompt isn't itself a /ponytail + // command. A prompt matching both branches used to emit two hook outputs + // (two JSON objects on one stdout), breaking the host's JSON.parse. clearMode(); writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF'); } diff --git a/hooks/ponytail-statusline.ps1 b/hooks/ponytail-statusline.ps1 index d9fe437..ba60ff1 100644 --- a/hooks/ponytail-statusline.ps1 +++ b/hooks/ponytail-statusline.ps1 @@ -1,4 +1,6 @@ -$Flag = Join-Path $HOME ".claude/.ponytail-active" +# Mirror ponytail-config.getClaudeDir(): honor CLAUDE_CONFIG_DIR, else ~/.claude. +$ConfigDir = if ($env:CLAUDE_CONFIG_DIR) { $env:CLAUDE_CONFIG_DIR } else { Join-Path $HOME ".claude" } +$Flag = Join-Path $ConfigDir ".ponytail-active" if (-not (Test-Path $Flag)) { exit 0 } diff --git a/hooks/ponytail-statusline.sh b/hooks/ponytail-statusline.sh index 5e83a27..d9ee87d 100644 --- a/hooks/ponytail-statusline.sh +++ b/hooks/ponytail-statusline.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash -flag="$HOME/.claude/.ponytail-active" +# Mirror ponytail-config.getClaudeDir(): the flag is written under +# $CLAUDE_CONFIG_DIR when set, else ~/.claude. Reading the wrong path hides the +# badge whenever the user relocates Claude's config dir. +flag="${CLAUDE_CONFIG_DIR:-$HOME/.claude}/.ponytail-active" [ -f "$flag" ] || exit 0 mode=$(head -n1 "$flag" | tr -d '[:space:]') diff --git a/skills/ponytail/SKILL.md b/skills/ponytail/SKILL.md index 0e0d3be..80eb5a0 100644 --- a/skills/ponytail/SKILL.md +++ b/skills/ponytail/SKILL.md @@ -10,7 +10,6 @@ description: > "minimal solution", "yagni", "do less", or "shortest path", and whenever they complain about over-engineering, bloat, boilerplate, or unnecessary dependencies. -license: MIT --- # Ponytail diff --git a/tests/correctness.test.js b/tests/correctness.test.js index a3facc8..f1ee1d6 100644 --- a/tests/correctness.test.js +++ b/tests/correctness.test.js @@ -179,6 +179,57 @@ def endpoint(): assert.equal(result.score, 0); }); +// --- Unfenced fallback + prose / early-exit guards (gate integrity) --- + +// A terse model can answer with bare, unfenced code (issue #65); the gate must +// still score it. These reach the extractBlocks fallback, which the fenced +// `check` helper above never exercises. +test('email: unfenced bare code still passes', () => { + const result = correctness( + 'import re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))', + { vars: { task: 'Write me a Python function that validates email addresses.' } }, + ); + assert.equal(result.pass, true); +}); + +test('debounce: unfenced arrow function still passes', () => { + const result = correctness( + 'const debounce = (fn, delay) => {\n let t;\n return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};', + { vars: { task: 'Add debounce to a search input in vanilla JavaScript.' } }, + ); + assert.equal(result.pass, true); +}); + +// Prose that name-drops the structural keywords but contains no code must NOT +// pass the run-free checks (countdown/ratelimit), or the gate rewards talk. +test('countdown: prose without code fails the structural check', () => { + const result = correctness( + 'To build it, use useState for the count and useEffect with setInterval; each tick set count - 1 until zero.', + { vars: { task: 'Build me a countdown timer component in React.' } }, + ); + assert.equal(result.pass, false); +}); + +test('ratelimit: prose without code fails the structural check', () => { + const result = correctness( + 'Build a rate limiter in FastAPI that tracks requests per window and returns 429 when the limit is exceeded.', + { vars: { task: 'Add rate limiting to my FastAPI endpoint.' } }, + ); + assert.equal(result.pass, false); +}); + +// A model demo that exits 0 before our appended asserts run must not score as a +// pass just because the process exited cleanly. +test('email: code that exits 0 before the asserts run fails', () => { + const result = check( + 'Write me a Python function that validates email addresses.', + 'python', + 'import sys\ndef validate_email(e):\n return True # broken: accepts everything\nsys.exit(0)', + ); + assert.equal(result.pass, false); + assert.equal(result.score, 0); +}); + // --- Edge cases --- test('unknown task is gracefully skipped', () => { diff --git a/tests/gemini-extension.test.js b/tests/gemini-extension.test.js index 0f24338..833d8dc 100644 --- a/tests/gemini-extension.test.js +++ b/tests/gemini-extension.test.js @@ -22,6 +22,15 @@ const VERSIONED_MANIFESTS = [ '.codex-plugin/plugin.json', '.github/plugin/plugin.json', ]; +// The marketplace manifests for the three plugin ecosystems. Shapes differ per +// ecosystem, but all must parse, name the ponytail plugin, and — for the two +// that carry a shared plugin description — keep it identical, so a rename or +// copy-edit can't silently desync one marketplace listing from the others. +const MARKETPLACE_MANIFESTS = [ + '.claude-plugin/marketplace.json', + '.github/plugin/marketplace.json', + '.agents/plugins/marketplace.json', +]; // Gemini auto-discovers these by directory; the manifest is only useful if they exist. const REUSED_COMMANDS = ['commands/ponytail.toml', 'commands/ponytail-review.toml']; const REUSED_SKILLS = ['skills/ponytail/SKILL.md']; @@ -77,3 +86,23 @@ test('the commands and skills the adapter reuses are present', () => { assert.ok(fs.existsSync(path.join(root, rel)), `reused file missing: ${rel}`); } }); + +test('every marketplace manifest parses and names the ponytail plugin', () => { + for (const rel of MARKETPLACE_MANIFESTS) { + assert.ok(fs.existsSync(path.join(root, rel)), `marketplace manifest missing: ${rel}`); + const manifest = JSON.parse(read(rel)); + assert.equal(manifest.name, EXTENSION_NAME, `${rel}: top-level name must be ponytail`); + assert.ok(Array.isArray(manifest.plugins) && manifest.plugins.length > 0, `${rel}: must list a plugin`); + assert.equal(manifest.plugins[0].name, EXTENSION_NAME, `${rel}: plugins[0].name must be ponytail`); + } +}); + +test('marketplace manifests that carry a plugin description keep it identical', () => { + const descriptions = MARKETPLACE_MANIFESTS + .map((rel) => JSON.parse(read(rel)).plugins[0].description) + .filter(Boolean); + assert.ok(descriptions.length >= 2, 'expected a shared plugin description in at least two marketplaces'); + for (const d of descriptions) { + assert.equal(d, descriptions[0], 'a marketplace plugin description drifted from the others'); + } +}); diff --git a/tests/hooks-windows.test.js b/tests/hooks-windows.test.js index f7b5353..473afdf 100644 --- a/tests/hooks-windows.test.js +++ b/tests/hooks-windows.test.js @@ -12,6 +12,7 @@ const path = require('path'); const root = path.join(__dirname, '..'); const HOOKS_JSON = 'hooks/hooks.json'; +const COPILOT_HOOKS_JSON = 'hooks/copilot-hooks.json'; // cmd.exe variable syntax (%FOO%); PowerShell leaves it literal, breaking the path. const CMD_VAR_SYNTAX = /%[A-Za-z_][A-Za-z0-9_]*%/; // Pull the hooks/