Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .opencode/plugins/ponytail.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,12 @@ export default async ({ client } = {}) => {
// synchronous store if same-turn switching ever matters.
'command.execute.before': async (input) => {
if (!input || input.command !== 'ponytail') return;
// bare /ponytail re-applies the default; a known level sets it; an unknown
// arg (a typo) is left alone rather than silently resetting the level.
const arg = (input.arguments || '').trim();
const mode = arg === '' ? getDefaultMode() : normalizePersistedMode(arg);
if (!mode) return;
// `off` is persisted like any mode; the transform reads it and stays silent.
const mode = normalizePersistedMode((input.arguments || '').trim()) || getDefaultMode();
writeMode(mode);
log('info', 'ponytail ' + mode);
},
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,13 @@ Which files map to which agent: [Agent portability](docs/agent-portability.md).

| Command | What it does |
|---------|--------------|
| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument reports the current level. |
| `/ponytail [lite \| full \| ultra \| off]` | Set the intensity, or turn it off. No argument re-applies ponytail at the default level. |
| `/ponytail-review` | Review the current diff for over-engineering, hands back a delete-list. |
| `/ponytail-audit` | Audit the whole repo for over-engineering, not just the diff. |
| `/ponytail-debt` | Harvest the `ponytail:` shortcuts you've deferred into a ledger, so "later" doesn't become "never". |
| `/ponytail-help` | Quick reference for the commands above. |

Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, pi). In Codex they're skills, invoke with `@` (`@ponytail-review`). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot, Kiro, Antigravity) load the always-on ruleset without the commands.
Commands need a skill-capable host (Claude Code, Codex, OpenCode, Gemini, Antigravity, pi, GitHub Copilot CLI, OpenClaw). In Codex they're skills; invoke them with `@` (`@ponytail-review`). In Antigravity they're chat-skills, typed into the chat (`/ponytail-review` as a message). The instruction-only adapters (Cursor, Windsurf, Cline, Copilot editor, Kiro) load the always-on ruleset without the commands.

## Development

Expand Down
35 changes: 26 additions & 9 deletions benchmarks/correctness.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,19 @@ function extractBlocks(text) {
const matches = [...text.matchAll(/```(\w*)\r?\n([\s\S]*?)```/g)];
// ponytail: terse models often answer with bare, unfenced code. Treat the whole
// response as one block so the gate scores the code instead of reporting "no block".
if (matches.length === 0 && text.trim()) return [{ lang: '', code: text }];
if (matches.length === 0 && text.trim()) return [{ lang: '', code: text, unfenced: true }];
return matches.map((m) => ({ lang: (m[1] || '').toLowerCase(), code: m[2] }));
}

// ponytail: fenced blocks are trusted as code because the model delimited them
// itself. The unfenced fallback is the entire response, prose and all, and the
// structural-only checks (countdown/ratelimit) never execute anything — so prose
// that merely mentions the right keywords would pass. An unfenced block must
// therefore show at least one real code construct before those checks trust it.
// ceiling: keyword heuristic, not a parser.
function looksLikeCode(block) {
  if (!block.unfenced) return true;
  // Braces, an arrow, a decorator/at-sign, or a Python `def`/`import` keyword.
  const codeConstruct = /[{}]|=>|@\w|\bdef\b|\bimport\b/;
  return codeConstruct.test(block.code);
}

// Identify which task we're evaluating from vars.task.
function identifyTask(task) {
const t = task.toLowerCase();
Expand All @@ -35,13 +44,21 @@ function identifyTask(task) {
// Run a shell command synchronously and return { ok, stdout, stderr }.
//   cmd  - command line handed to execSync.
//   opts - extra execSync options; spread last, so they override the defaults
//          (10 s timeout, utf8 encoding, piped stdio).
// Never throws: a failing command yields ok=false with whatever stdout the
// process produced and its stderr (or the error message) cut to 500 chars.
// stdout is captured on both paths so callers can look for a success sentinel
// instead of trusting the exit code alone.
function exec(cmd, opts = {}) {
  try {
    const stdout = execSync(cmd, { timeout: 10_000, encoding: 'utf8', stdio: 'pipe', ...opts });
    return { ok: true, stdout: stdout || '', stderr: '' };
  } catch (e) {
    return { ok: false, stdout: e.stdout || '', stderr: (e.stderr || e.message || '').slice(0, 500) };
  }
}

// ponytail: a clean exit code does not prove the asserts ran. Model-written code
// may call sys.exit(0) / process.exit(0) from a __main__/demo block ahead of the
// asserts we append, so every harness prints a PASS sentinel when it succeeds
// and this helper insists on seeing it as the final line of stdout. Otherwise a
// skipped assertion would be scored as a correct answer.
function passed(result) {
  const output = result.stdout || '';
  // PASS must start a line and be followed only by trailing whitespace.
  const endsWithSentinel = /(^|\n)PASS\s*$/.test(output);
  return result.ok && endsWithSentinel;
}

// ponytail: probe once at load; macOS and many Linux images ship python3 only.
let pythonCmd;
function python() {
Expand Down Expand Up @@ -120,7 +137,7 @@ print("PASS")
const f = tmpFile('.py', harness);
const result = exec(`${python()} "${f}"`);
fs.unlinkSync(f);
if (result.ok) return { pass: true, reason: 'Email validator passes all checks' };
if (passed(result)) return { pass: true, reason: 'Email validator passes all checks' };
return { pass: false, reason: result.stderr || 'Email validator failed' };
},

Expand Down Expand Up @@ -165,7 +182,7 @@ setTimeout(() => {
const f = tmpFile('.mjs', harness);
const result = exec(`node "${f}"`);
fs.unlinkSync(f);
if (result.ok) return { pass: true, reason: 'Debounce passes all checks' };
if (passed(result)) return { pass: true, reason: 'Debounce passes all checks' };
return { pass: false, reason: result.stderr || 'Debounce failed' };
},

Expand Down Expand Up @@ -215,15 +232,15 @@ else:
const result = exec(`${python()} "${f}"`);
try { fs.unlinkSync(f); } catch (e) {}
try { fs.unlinkSync(csvPath); } catch (e) {}
if (result.ok) return { pass: true, reason: 'CSV sum produces correct result (351)' };
if (passed(result)) return { pass: true, reason: 'CSV sum produces correct result (351)' };
return { pass: false, reason: result.stderr || 'CSV sum failed' };
},

countdown(blocks) {
// React components can't run in bare Node without a bundler. Structural check:
// the code must contain timer/countdown logic (useState/useEffect/setInterval/setTimeout).
const code = blocks.find((b) => b.code.includes('ount') || b.code.includes('timer') || b.code.includes('Timer'));
if (!code) return { pass: false, reason: 'No countdown component found' };
if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No countdown component found' };

const src = code.code;
const hasState = /useState|useReducer|this\.state/.test(src);
Expand All @@ -241,7 +258,7 @@ else:

ratelimit(blocks) {
const code = blocks.find((b) => b.lang === 'python' || b.lang === 'py' || (!b.lang && (b.code.includes('rate') || b.code.includes('limit'))));
if (!code) return { pass: false, reason: 'No Python code block found' };
if (!code || !looksLikeCode(code)) return { pass: false, reason: 'No Python code block found' };

// Structural check for rate limiting: must have some form of counter/time tracking.
const src = code.code;
Expand Down
26 changes: 0 additions & 26 deletions benchmarks/correctness.test.js

This file was deleted.

5 changes: 3 additions & 2 deletions docs/agent-portability.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ to load in a given agent.
| Host | Files | Notes |
|------|-------|-------|
| Claude Code | `.claude-plugin/`, `commands/`, `hooks/` | Full plugin install with session activation, mode tracking, commands, and statusline support. |
| Codex | `.codex-plugin/plugin.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. |
| Codex | `.codex-plugin/plugin.json`, `.agents/plugins/marketplace.json`, `hooks/hooks.json`, `hooks/`, `skills/` | Plugin install with the same skills plus lifecycle hooks for activation and mode tracking. `.agents/plugins/marketplace.json` is the `.agents`-standard marketplace manifest `codex plugin marketplace add` discovers. |
| OpenCode | `.opencode/plugins/ponytail.mjs`, `.opencode/command/`, `hooks/`, `skills/` | Server plugin injects the ruleset each turn via `experimental.chat.system.transform` and persists `/ponytail` switches; reuses the shared instruction builder. |
| pi | `pi-extension/`, `skills/`, `hooks/` | Package extension: injects the ruleset each turn through the shared instruction builder and registers the `/ponytail` commands. |
| Gemini CLI | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Extension manifest points `contextFileName` at `AGENTS.md` for always-on rules, and reuses the existing `commands/*.toml` and `skills/`, which Gemini CLI auto-discovers. |
Expand All @@ -18,9 +18,10 @@ to load in a given agent.
| Cline | `.clinerules/ponytail.md` | Project rule. |
| GitHub Copilot | `.github/copilot-instructions.md` | Repository instruction file. |
| GitHub Copilot CLI | `.github/plugin/`, `AGENTS.md`, `.github/copilot-instructions.md`, `~/.copilot/copilot-instructions.md` | Plugin-supported (`copilot plugin marketplace add DietrichGebert/ponytail` + `copilot plugin install ponytail@ponytail`). Fallback instruction mode remains: per-project from `AGENTS.md` or `.github/copilot-instructions.md`, or globally from `~/.copilot/copilot-instructions.md` (instruction-tier, no `/ponytail` levels or hooks). |
| Antigravity | `AGENTS.md` | Reads `AGENTS.md` at the repo root as always-on rules (like `.cursorrules`/`CLAUDE.md`); `.agents/rules/` also works for workspace rules. Instruction-tier. |
| Antigravity | `gemini-extension.json`, `AGENTS.md`, `commands/`, `skills/` | Installs via `agy plugin install` reusing `gemini-extension.json` (same manifest as Gemini CLI); `/ponytail` commands surface as chat-skills typed into the chat. `AGENTS.md` at the repo root (or `.agents/rules/`) is the always-on-rule fallback. |
| VS Code + Codex extension | `AGENTS.md` | The Codex extension reads `AGENTS.md` (repo root, or `~/.codex/AGENTS.md` globally). Instruction-tier; the full Codex plugin row above adds `/ponytail` levels and hooks. |
| Kiro | `.kiro/steering/ponytail.md` | Steering rule; copy globally or into a project. |
| Aider | `AGENTS.md` | Copy the compact rule file as project conventions. Instruction-tier. |
| Generic agents | `AGENTS.md` or `skills/*/SKILL.md` | Copy the compact rule file or load the skill files directly. |

## Adapter Rule
Expand Down
4 changes: 3 additions & 1 deletion hooks/ponytail-activate.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ if (!isCodex) try {
output += "\n\n" +
"STATUSLINE SETUP NEEDED: The ponytail plugin includes a statusline badge showing active mode " +
"(e.g. [PONYTAIL], [PONYTAIL:ULTRA]). It is not configured yet. " +
"To enable, add this to ~/.claude/settings.json: " +
// settingsPath honors CLAUDE_CONFIG_DIR; the literal "~/.claude" misled
// anyone who relocated Claude's config dir to edit the wrong file.
"To enable, add this to " + settingsPath + ": " +
statusLineSnippet + " " +
"Proactively offer to set this up for the user on first interaction.";
}
Expand Down
12 changes: 7 additions & 5 deletions hooks/ponytail-mode-tracker.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ process.stdin.on('end', () => {
else if (arg === 'full') mode = 'full';
else if (arg === 'ultra') mode = 'ultra';
else if (arg === 'off') mode = 'off';
else mode = getDefaultMode();
else if (arg === '') mode = getDefaultMode();
// else: unknown arg (a typo) — leave mode null so we don't silently
// reset the active level; pi already treats unknown args as a no-op.
}

if (mode && mode !== 'off') {
Expand All @@ -42,10 +44,10 @@ process.stdin.on('end', () => {
clearMode();
writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF');
}
}

// Detect deactivation
if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) {
} else if (/\b(stop ponytail|normal mode)\b/i.test(prompt)) {
// Deactivation phrase — but only when the prompt isn't itself a /ponytail
// command. A prompt matching both branches used to emit two hook outputs
// (two JSON objects on one stdout), breaking the host's JSON.parse.
clearMode();
writeHookOutput('UserPromptSubmit', 'off', 'PONYTAIL MODE OFF');
}
Expand Down
4 changes: 3 additions & 1 deletion hooks/ponytail-statusline.ps1
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
$Flag = Join-Path $HOME ".claude/.ponytail-active"
# Mirror ponytail-config.getClaudeDir(): honor CLAUDE_CONFIG_DIR, else ~/.claude.
$ConfigDir = if ($env:CLAUDE_CONFIG_DIR) { $env:CLAUDE_CONFIG_DIR } else { Join-Path $HOME ".claude" }
$Flag = Join-Path $ConfigDir ".ponytail-active"
if (-not (Test-Path $Flag)) {
exit 0
}
Expand Down
5 changes: 4 additions & 1 deletion hooks/ponytail-statusline.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/usr/bin/env bash
flag="$HOME/.claude/.ponytail-active"
# Mirror ponytail-config.getClaudeDir(): the flag is written under
# $CLAUDE_CONFIG_DIR when set, else ~/.claude. Reading the wrong path hides the
# badge whenever the user relocates Claude's config dir.
flag="${CLAUDE_CONFIG_DIR:-$HOME/.claude}/.ponytail-active"
[ -f "$flag" ] || exit 0

mode=$(head -n1 "$flag" | tr -d '[:space:]')
Expand Down
1 change: 0 additions & 1 deletion skills/ponytail/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ description: >
"minimal solution", "yagni", "do less", or "shortest path", and whenever
they complain about over-engineering, bloat, boilerplate, or unnecessary
dependencies.
license: MIT
---

# Ponytail
Expand Down
51 changes: 51 additions & 0 deletions tests/correctness.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,57 @@ def endpoint():
assert.equal(result.score, 0);
});

// --- Unfenced fallback + prose / early-exit guards (gate integrity) ---

// A terse model can answer with bare, unfenced code (issue #65); the gate must
// still score it. These reach the extractBlocks fallback, which the fenced
// `check` helper above never exercises.
test('email: unfenced bare code still passes', () => {
  // Response carries no ``` fence at all, so it is scored via the unfenced path.
  const response = 'import re\ndef validate_email(e):\n return bool(re.match(r"^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$", e))';
  const context = { vars: { task: 'Write me a Python function that validates email addresses.' } };

  const outcome = correctness(response, context);

  assert.equal(outcome.pass, true);
});

test('debounce: unfenced arrow function still passes', () => {
  // Bare arrow-function answer with no code fence around it.
  const response = 'const debounce = (fn, delay) => {\n let t;\n return (...a) => { clearTimeout(t); t = setTimeout(() => fn(...a), delay); };\n};';
  const context = { vars: { task: 'Add debounce to a search input in vanilla JavaScript.' } };

  const outcome = correctness(response, context);

  assert.equal(outcome.pass, true);
});

// Prose that name-drops the structural keywords but contains no code must NOT
// pass the run-free checks (countdown/ratelimit), or the gate rewards talk.
test('countdown: prose without code fails the structural check', () => {
  // Mentions every keyword the structural check looks for, yet contains no code.
  const response = 'To build it, use useState for the count and useEffect with setInterval; each tick set count - 1 until zero.';
  const context = { vars: { task: 'Build me a countdown timer component in React.' } };

  const outcome = correctness(response, context);

  assert.equal(outcome.pass, false);
});

test('ratelimit: prose without code fails the structural check', () => {
  // A plain-English description of a rate limiter, with no code construct in it.
  const response = 'Build a rate limiter in FastAPI that tracks requests per window and returns 429 when the limit is exceeded.';
  const context = { vars: { task: 'Add rate limiting to my FastAPI endpoint.' } };

  const outcome = correctness(response, context);

  assert.equal(outcome.pass, false);
});

// A model demo that exits 0 before our appended asserts run must not score as a
// pass just because the process exited cleanly.
test('email: code that exits 0 before the asserts run fails', () => {
  const task = 'Write me a Python function that validates email addresses.';
  // The trailing sys.exit(0) ends the process cleanly before any appended assert.
  const earlyExitCode = 'import sys\ndef validate_email(e):\n return True # broken: accepts everything\nsys.exit(0)';

  const outcome = check(task, 'python', earlyExitCode);

  assert.equal(outcome.pass, false);
  assert.equal(outcome.score, 0);
});

// --- Edge cases ---

test('unknown task is gracefully skipped', () => {
Expand Down
29 changes: 29 additions & 0 deletions tests/gemini-extension.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ const VERSIONED_MANIFESTS = [
'.codex-plugin/plugin.json',
'.github/plugin/plugin.json',
];
// The marketplace manifests for the three plugin ecosystems. Shapes differ per
// ecosystem, but all must parse, name the ponytail plugin, and — for the two
// that carry a shared plugin description — keep it identical, so a rename or
// copy-edit can't silently desync one marketplace listing from the others.
// Paths are relative to the repository root.
const MARKETPLACE_MANIFESTS = [
'.claude-plugin/marketplace.json', // Claude Code plugin marketplace
'.github/plugin/marketplace.json', // GitHub Copilot CLI plugin marketplace
'.agents/plugins/marketplace.json', // `.agents`-standard manifest used by Codex
];
// Gemini auto-discovers these by directory; the manifest is only useful if they exist.
const REUSED_COMMANDS = ['commands/ponytail.toml', 'commands/ponytail-review.toml'];
const REUSED_SKILLS = ['skills/ponytail/SKILL.md'];
Expand Down Expand Up @@ -77,3 +86,23 @@ test('the commands and skills the adapter reuses are present', () => {
assert.ok(fs.existsSync(path.join(root, rel)), `reused file missing: ${rel}`);
}
});

test('every marketplace manifest parses and names the ponytail plugin', () => {
  MARKETPLACE_MANIFESTS.forEach((rel) => {
    // Fail with a readable message before JSON.parse can throw on a missing file.
    const manifestPath = path.join(root, rel);
    assert.ok(fs.existsSync(manifestPath), `marketplace manifest missing: ${rel}`);

    const manifest = JSON.parse(read(rel));
    assert.equal(manifest.name, EXTENSION_NAME, `${rel}: top-level name must be ponytail`);

    const listsPlugin = Array.isArray(manifest.plugins) && manifest.plugins.length > 0;
    assert.ok(listsPlugin, `${rel}: must list a plugin`);
    assert.equal(manifest.plugins[0].name, EXTENSION_NAME, `${rel}: plugins[0].name must be ponytail`);
  });
});

test('marketplace manifests that carry a plugin description keep it identical', () => {
  // Gather the first plugin's description from each manifest, skipping any
  // manifest that does not carry one.
  const descriptions = [];
  for (const rel of MARKETPLACE_MANIFESTS) {
    const description = JSON.parse(read(rel)).plugins[0].description;
    if (description) descriptions.push(description);
  }

  assert.ok(descriptions.length >= 2, 'expected a shared plugin description in at least two marketplaces');

  const [reference] = descriptions;
  descriptions.forEach((description) => {
    assert.equal(description, reference, 'a marketplace plugin description drifted from the others');
  });
});
Loading