diff --git a/.env.example b/.env.example index 23d4820..fb759a3 100644 --- a/.env.example +++ b/.env.example @@ -5,3 +5,7 @@ GEMINI_API_KEY= # Optional overrides (also settable via config file / CLI flags) # TINY_CODE_PROVIDER=anthropic # anthropic | gemini # TINY_CODE_MODEL=claude-opus-4-8 + +# Self-improvement: reflect on sessions and propose markdown-only improvement PRs. +# On by default; set to 0 to disable. Requires the `gh` CLI installed + authed. +# TINY_CODE_IMPROVE=1 # 1 | 0 diff --git a/AGENTS.md b/AGENTS.md index 2b900ff..05ae519 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,3 +26,12 @@ runaway costs. - No business logic. This is a general-purpose tool. - Don't add a second state paradigm or heavy dependencies without a clear reason. - New deferred features go in `TODO.md` with a rationale and rough approach. + +## Self-improvement (`src/improve/`) +- Proposals are markdown-only PRs (`improvements/<slug>.md`). The "never code" + guarantee is structural — the PR creator validates the slug, writes one file, + and stages exactly one explicit path (never `git add -A`). Preserve this; do + not loosen `src/improve/pr.ts` to stage arbitrary paths. +- Reflection (`src/improve/reflect.ts`) must call the provider with `tools: []` + so it can never execute anything from a transcript. +- Opening PRs shells out to the `gh` CLI (assumed installed + authenticated). diff --git a/README.md b/README.md index 0aca72d..9d1363f 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ In the REPL: type a request, watch it work. Mutating actions (writes, edits, shell commands) prompt for approval unless pre-approved in config. - `/help` — list commands +- `/improve` — reflect on the session and propose an improvement PR (see below) - `/<name> [args]` — run a custom command (see below) - `/exit` — quit @@ -127,6 +128,37 @@ the savings where it can: (see `TODO.md`), which will keep input-token counts from compounding across many turns without any user action. 
+## Self-improvement + +tiny-code can learn from how it's used. When you end a session with `/exit` or `/quit` (or run +`/improve`), it reflects on the conversation transcript looking for recurring +friction — tool errors, repeated retries, denied permissions, missing +capabilities. If it finds a concrete improvement, it asks for your permission to +open a pull request. + +That PR contains **only a single markdown file** under `improvements/` +describing the proposed change, targeting `main` (or the configured `improve.baseBranch`) for a maintainer to review and +implement separately. **It never contains code changes** — this is enforced +structurally (the PR creator only ever stages one regex-validated markdown path), +so a prompt-injected session cannot smuggle code into a PR. + +PRs are opened via the [`gh` CLI](https://cli.github.com/), which must be +installed and authenticated (`gh auth login`); the working tree must be clean. + +```json +{ + "improve": { + "enabled": true, + "baseBranch": "main", + "onSessionEnd": true + } +} +``` + +The feature is **on by default**. Set `improve.enabled` to `false` (or export +`TINY_CODE_IMPROVE=0`) to disable it entirely; set `onSessionEnd` to `false` to +keep `/improve` but skip the automatic reflection at exit. + ## Development ```bash diff --git a/src/config/load.ts b/src/config/load.ts index f71112b..d2ce512 100644 --- a/src/config/load.ts +++ b/src/config/load.ts @@ -27,6 +27,17 @@ export interface ResolvedConfig { maxIterations: number; commandDirs: string[]; allow: AllowRules; + improve: ImproveConfig; +} + +/** Settings for the self-improvement / proposal-PR feature. */ +export interface ImproveConfig { + /** Master switch for the whole feature (manual and automatic). */ + enabled: boolean; + /** Branch PRs target. */ + baseBranch: string; + /** Whether to reflect automatically when the session ends. 
*/ + onSessionEnd: boolean; } export interface CliOverrides { @@ -56,6 +67,13 @@ const FileConfigSchema = z write: z.array(z.string()).optional(), }) .optional(), + improve: z + .object({ + enabled: z.boolean().optional(), + baseBranch: z.string().optional(), + onSessionEnd: z.boolean().optional(), + }) + .optional(), }) .strict(); @@ -118,5 +136,15 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c bash: file.allow?.bash ?? [], write: file.allow?.write ?? [], }, + improve: { + enabled: + env.TINY_CODE_IMPROVE === '0' + ? false + : env.TINY_CODE_IMPROVE === '1' + ? true + : (file.improve?.enabled ?? true), + baseBranch: file.improve?.baseBranch ?? 'main', + onSessionEnd: file.improve?.onSessionEnd ?? true, + }, }; } diff --git a/src/improve/pr.ts b/src/improve/pr.ts new file mode 100644 index 0000000..698bf17 --- /dev/null +++ b/src/improve/pr.ts @@ -0,0 +1,133 @@ +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; +import { mkdirSync, writeFileSync } from 'node:fs'; +import { join, resolve, sep } from 'node:path'; +import { SLUG_RE } from './slug.js'; + +const run = promisify(execFile); + +export interface CreatePrOptions { + cwd: string; + /** Already-slugified identifier (validated again here, defensively). */ + slug: string; + title: string; + markdown: string; + baseBranch?: string; +} + +export interface PrResult { + ok: boolean; + url?: string; + reason?: string; +} + +const IMPROVEMENTS_DIR = 'improvements'; + +/** + * Open a PR containing exactly one markdown file (`improvements/<slug>.md`). + * + * The "markdown-only, never code" guarantee is structural, not advisory: + * - filenames and branch names derive solely from a regex-validated slug; + * - the file is the only thing written to disk; + * - staging is a single explicit path (`git add improvements/<slug>.md`), + * never `git add -A`/`.`; + * - the branch is cut from the freshly fetched `origin/<baseBranch>`, never from + * the caller's current HEAD, so unrelated local commits cannot ride along; + * - the staged set is asserted to be exactly that one path before committing. 
+ * The model only ever influences the file's *contents* and the PR title. + * + * @returns `ok: true` (with `url` when `gh` prints one) or `ok: false` with a `reason`. + */ +export async function createImprovementPr(opts: CreatePrOptions): Promise<PrResult> { + const { cwd, slug, title, markdown } = opts; + const baseBranch = opts.baseBranch ?? 'main'; + + // Defense in depth: never trust the caller's slug. + if (!SLUG_RE.test(slug)) { + return { ok: false, reason: `Refusing unsafe slug: ${slug}` }; + } + + const relPath = `${IMPROVEMENTS_DIR}/${slug}.md`; + const dirAbs = join(cwd, IMPROVEMENTS_DIR); + const fileAbs = join(dirAbs, `${slug}.md`); + // Path-traversal guard (redundant with SLUG_RE, kept as a hard boundary). + if (resolve(fileAbs) !== fileAbs || !fileAbs.startsWith(dirAbs + sep)) { + return { ok: false, reason: 'Resolved path escaped the improvements directory.' }; + } + + const branch = `improve/${slug}`; + + // --- Preflight: fail gracefully rather than throw into the exit path. --- + try { + await run('gh', ['--version'], { cwd }); + } catch { + return { ok: false, reason: 'gh CLI not found — install and authenticate it to open improvement PRs.' }; + } + try { + await run('git', ['rev-parse', '--is-inside-work-tree'], { cwd }); + } catch { + return { ok: false, reason: 'Not inside a git repository.' }; + } + try { + await run('gh', ['auth', 'status'], { cwd }); + } catch { + return { ok: false, reason: 'gh CLI is not authenticated (run `gh auth login`).' }; + } + + const dirty = (await run('git', ['status', '--porcelain'], { cwd })).stdout.trim(); + if (dirty.length > 0) { + return { + ok: false, + reason: 'Working tree has uncommitted changes — commit or stash them before proposing an improvement.', + }; + } + + const original = (await run('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { cwd })).stdout.trim(); + + try { + // Branch from the freshly fetched base rather than the current HEAD, so commits + // on the user's working branch can never ride along into the PR diff. + await run('git', ['fetch', 'origin', baseBranch], { cwd }); + await run('git', ['checkout', '-b', branch, `origin/${baseBranch}`], { cwd }); + + mkdirSync(dirAbs, { recursive: true }); + writeFileSync(fileAbs, markdown.endsWith('\n') ? 
markdown : `${markdown}\n`); + + // The single, explicit staged path — never `git add -A`/`.`. + await run('git', ['add', relPath], { cwd }); + + // Belt-and-suspenders: confirm nothing else got staged. + const staged = (await run('git', ['diff', '--cached', '--name-only'], { cwd })).stdout + .split('\n') + .map((s) => s.trim()) + .filter(Boolean); + if (staged.length !== 1 || staged[0] !== relPath) { + throw new Error(`Unexpected staged files: ${staged.join(', ') || '(none)'}`); + } + + await run('git', ['commit', '-m', `docs: propose improvement — ${title}`], { cwd }); + await run('git', ['push', '-u', 'origin', branch], { cwd }); + + const body = + `Automated improvement proposal generated by tiny-code from session usage.\n\n` + + `This PR intentionally contains a single markdown file under \`${IMPROVEMENTS_DIR}/\` and no code changes — ` + + `it is for a maintainer to review and implement separately.`; + const created = await run( + 'gh', + ['pr', 'create', '--base', baseBranch, '--head', branch, '--title', title, '--body', body], + { cwd }, + ); + const url = created.stdout.trim().split('\n').filter(Boolean).pop(); + + return url ? { ok: true, url } : { ok: true }; + } catch (err) { + return { ok: false, reason: err instanceof Error ? err.message : String(err) }; + } finally { + // Best-effort restore of the user's original branch. + try { + await run('git', ['checkout', original], { cwd }); + } catch { + /* leave them on the improve branch rather than masking the real result */ + } + } +} diff --git a/src/improve/reflect.ts b/src/improve/reflect.ts new file mode 100644 index 0000000..bf8a1a6 --- /dev/null +++ b/src/improve/reflect.ts @@ -0,0 +1,95 @@ +import type { ModelProvider } from '../providers/types.js'; +import type { Message } from '../agent/types.js'; + +/** Sentinel the model emits when a session yields nothing worth proposing. 
*/ +export const NO_IMPROVEMENT = 'NO_IMPROVEMENT'; + +const MAX_RESULT_CHARS = 2_000; +const MAX_TRANSCRIPT_CHARS = 60_000; + +const REFLECTION_SYSTEM = `You are a contributor reviewing how the "tiny-code" CLI coding agent itself performed in the session below. You are NOT here to finish the user's coding task — you are looking for ways to improve the agent (its prompts, tools, ergonomics, or docs). + +Look for recurring friction: tool errors, repeated retries on the same file, denied permissions, confusion, hitting the iteration limit, or missing capabilities. + +If — and only if — you find a concrete, worthwhile improvement, respond with a SINGLE markdown document and nothing else, in exactly this structure: + +# <short, imperative title> + +## Summary +<one or two sentences describing the improvement> + +## Motivation +<the friction observed in this session that motivates it> + +## Proposed change +<what should change, described in prose> + +## Affected areas +<prompts, tools, docs, or modules likely involved> + +## Risks +<trade-offs or ways this could go wrong> + +If there is no clear improvement worth filing, respond with exactly: +${NO_IMPROVEMENT} + +Do not propose code. Do not include anything outside the document or the sentinel.`; + +/** Flatten the conversation into a compact, readable transcript for reflection. */ +export function serializeTranscript(messages: readonly Message[]): string { + const lines: string[] = []; + + for (const message of messages) { + for (const block of message.content) { + if (block.type === 'text') { + if (block.text.trim().length > 0) { + lines.push(`[${message.role}] ${block.text.trim()}`); + } + } else if (block.type === 'tool_use') { + lines.push(`[tool_use] ${block.name} ${JSON.stringify(block.input ?? {})}`); + } else { + const marker = block.isError ? ' (error)' : ''; + lines.push(`[tool_result${marker}] ${truncate(block.content, MAX_RESULT_CHARS)}`); + } + } + } + + const transcript = lines.join('\n'); + return transcript.length > MAX_TRANSCRIPT_CHARS + ? 
transcript.slice(transcript.length - MAX_TRANSCRIPT_CHARS) + : transcript; +} + +export interface ReflectOptions { + provider: ModelProvider; + transcript: string; + cwd: string; +} + +/** + * Run a single tool-free reflection pass. Returns the proposal markdown, or + * `null` when the model declines (sentinel, even if wrapped in quotes/backticks) or produces nothing usable. + * + * No tools are passed, so this call cannot execute anything — it can only emit + * text, which keeps reflection safe regardless of what the transcript contains. + */ +export async function reflect(opts: ReflectOptions): Promise<string | null> { + const userText = `Working directory: ${opts.cwd}\n\nSession transcript:\n\n${opts.transcript}`; + + let text = ''; + for await (const event of opts.provider.send({ + system: REFLECTION_SYSTEM, + messages: [{ role: 'user', content: [{ type: 'text', text: userText }] }], + tools: [], + })) { + if (event.type === 'text') text += event.delta; + } + + const trimmed = text.trim(); + if (trimmed.length === 0 || trimmed.replace(/^[\s`'"*]+|[\s`'"*.]+$/g, '') === NO_IMPROVEMENT) return null; + return trimmed; +} + +function truncate(s: string, n: number): string { + return s.length > n ? `${s.slice(0, n)}…` : s; +} diff --git a/src/improve/run.ts b/src/improve/run.ts new file mode 100644 index 0000000..65d97c8 --- /dev/null +++ b/src/improve/run.ts @@ -0,0 +1,66 @@ +import type { ModelProvider } from '../providers/types.js'; +import type { Message } from '../agent/types.js'; +import { reflect, serializeTranscript } from './reflect.js'; +import { slugify } from './slug.js'; +import { createImprovementPr } from './pr.js'; + +export interface RunImprovementOptions { + provider: ModelProvider; + messages: readonly Message[]; + cwd: string; + baseBranch: string; + /** Surface a line of status to the user. */ + log: (line: string) => void; + /** Ask the user to approve opening a PR; returns true to proceed. */ + confirm: (title: string) => Promise<boolean>; +} + +/** First `# ` heading in the markdown, used as the PR title and slug seed. 
*/ +function extractTitle(markdown: string): string { + const match = markdown.match(/^#\s+(.+)$/m); + return match && match[1] ? match[1].trim() : 'tiny-code improvement'; +} + +/** + * End-to-end improvement flow: reflect on the session, and if there's a + * proposal, ask the user before opening a markdown-only PR. Never throws — it + * is safe to call from the REPL's exit path. + */ +export async function runImprovement(opts: RunImprovementOptions): Promise<void> { + try { + const transcript = serializeTranscript(opts.messages); + if (transcript.trim().length === 0) { + opts.log('No session activity to reflect on.'); + return; + } + + const proposal = await reflect({ provider: opts.provider, transcript, cwd: opts.cwd }); + if (!proposal) { + opts.log('No improvements suggested for this session.'); + return; + } + + const title = extractTitle(proposal); + const approved = await opts.confirm(title); + if (!approved) { + opts.log('Skipped — no PR created.'); + return; + } + + const result = await createImprovementPr({ + cwd: opts.cwd, + slug: slugify(title), + title, + markdown: proposal, + baseBranch: opts.baseBranch, + }); + + if (result.ok) { + opts.log(`Opened improvement PR${result.url ? `: ${result.url}` : '.'}`); + } else { + opts.log(`Could not open PR: ${result.reason ?? 'unknown error'}`); + } + } catch (err) { + opts.log(`Improvement step failed: ${err instanceof Error ? err.message : String(err)}`); + } +} diff --git a/src/improve/slug.ts b/src/improve/slug.ts new file mode 100644 index 0000000..8b8f86e --- /dev/null +++ b/src/improve/slug.ts @@ -0,0 +1,34 @@ +/** + * Security-critical filename derivation for improvement proposals. + * + * The PR creator only ever writes/stages a path built from this slug, so the + * slug pattern is the single source of truth that keeps a (possibly injected) + * model from influencing anything beyond a single markdown file's contents. + */ + +/** A slug is lowercase alphanumerics joined by single dashes — no `/`, no `.`. 
*/ +export const SLUG_RE = /^[a-z0-9]+(?:-[a-z0-9]+)*$/; + +const MAX_BASE_LENGTH = 50; + +/** + * Turn an arbitrary title into a safe, unique slug guaranteed to match + * {@link SLUG_RE}. Falls back to `improvement-<suffix>` when the title yields + * nothing usable (e.g. all punctuation). + */ +export function slugify(title: string): string { + const suffix = Date.now().toString(36); + + const base = title + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, MAX_BASE_LENGTH) + .replace(/^-+|-+$/g, ''); + + const slug = base.length > 0 ? `${base}-${suffix}` : `improvement-${suffix}`; + + // The construction above should always satisfy SLUG_RE, but assert rather + // than trust it — this value becomes a filename and a branch name. + return SLUG_RE.test(slug) ? slug : `improvement-${suffix}`; +} diff --git a/src/repl.ts b/src/repl.ts index 3e9990e..f660036 100644 --- a/src/repl.ts +++ b/src/repl.ts @@ -12,10 +12,12 @@ import { loadProjectContext } from './config/context.js'; import { buildSystemPrompt } from './agent/systemPrompt.js'; import { loadCommands, renderCommand } from './commands/loader.js'; import type { Command } from './commands/types.js'; +import { runImprovement } from './improve/run.js'; function printHelp(commands: Map<string, Command>): void { console.log(pc.bold('\nBuilt-in:')); console.log(' /help Show this help'); + console.log(' /improve Reflect on this session and propose an improvement PR'); console.log(' /exit, /quit Leave the session'); if (commands.size > 0) { console.log(pc.bold('\nCustom commands:')); @@ -60,6 +62,44 @@ export async function startRepl(overrides: CliOverrides): Promise<void> { maxIterations: config.maxIterations, }); + // Tracks the transcript length at the last reflection, so the auto-trigger on + // exit doesn't re-run when nothing happened since a manual /improve. 
+ let lastImprovedAt = 0; + + const confirmPr = (title: string): Promise<boolean> => + new Promise<boolean>((resolve) => { + const label = pc.yellow('\nOpen a PR with this improvement?'); + rl.question(`${label} ${pc.dim(title)} [y/N] `, (answer) => { + resolve(/^y(es)?$/i.test(answer.trim())); + }); + }); + + const improve = async (): Promise<void> => { + lastImprovedAt = agent.getMessages().length; + await runImprovement({ + provider, + messages: agent.getMessages(), + cwd, + baseBranch: config.improve.baseBranch, + log: (line) => console.log(pc.dim(line)), + confirm: confirmPr, + }); + }; + + // Auto-reflect when leaving via /exit or /quit — runs while readline is still + // open so the confirmation prompt works. Skipped if nothing happened since the + // last manual /improve. + const maybeAutoImprove = async (): Promise<void> => { + if ( + config.improve.enabled && + config.improve.onSessionEnd && + agent.getMessages().length > lastImprovedAt + ) { + console.log(pc.dim('\nReflecting on this session…')); + await improve(); + } + }; + console.log( pc.bold('tiny-code') + pc.dim(` · ${provider.name}:${provider.model} · ${cwd}`), ); @@ -75,6 +115,7 @@ export async function startRepl(overrides: CliOverrides): Promise<void> { return; } if (input === '/exit' || input === '/quit') { + await maybeAutoImprove(); rl.close(); return; } @@ -83,6 +124,15 @@ export async function startRepl(overrides: CliOverrides): Promise<void> { ask(); return; } + if (input === '/improve') { + if (config.improve.enabled) { + await improve(); + } else { + console.log(pc.dim('Self-improvement is disabled in config.')); + } + ask(); + return; + } let userMessage = input; if (input.startsWith('/')) { diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts index f1a5829..2b73592 100644 --- a/tests/config/load.test.ts +++ b/tests/config/load.test.ts @@ -11,6 +11,7 @@ const ENV_KEYS = [ 'TINY_CODE_MODEL', 'TINY_CODE_MAX_TOKENS', 'TINY_CODE_EFFORT', + 'TINY_CODE_IMPROVE', 'HOME', ]; @@ -76,6 +77,34 @@ describe('loadConfig', () => 
{ expect(cfg.allow.write).toEqual(['src/**']); }); + it('enables self-improvement by default', () => { + const cfg = loadConfig({}, cwd); + expect(cfg.improve.enabled).toBe(true); + expect(cfg.improve.baseBranch).toBe('main'); + expect(cfg.improve.onSessionEnd).toBe(true); + }); + + it('lets TINY_CODE_IMPROVE=0 disable the feature over a config file', async () => { + await writeFile( + join(cwd, 'tiny-code.config.json'), + JSON.stringify({ improve: { enabled: true } }), + ); + process.env.TINY_CODE_IMPROVE = '0'; + const cfg = loadConfig({}, cwd); + expect(cfg.improve.enabled).toBe(false); + }); + + it('reads improve settings from a config file', async () => { + await writeFile( + join(cwd, 'tiny-code.config.json'), + JSON.stringify({ improve: { enabled: false, baseBranch: 'develop', onSessionEnd: false } }), + ); + const cfg = loadConfig({}, cwd); + expect(cfg.improve.enabled).toBe(false); + expect(cfg.improve.baseBranch).toBe('develop'); + expect(cfg.improve.onSessionEnd).toBe(false); + }); + it('lets env override the config file model', async () => { await writeFile( join(cwd, 'tiny-code.config.json'), diff --git a/tests/improve/pr.test.ts b/tests/improve/pr.test.ts new file mode 100644 index 0000000..fb8f2f0 --- /dev/null +++ b/tests/improve/pr.test.ts @@ -0,0 +1,142 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +interface Call { + cmd: string; + args: string[]; +} + +// Shared mock state, reset per test. +const calls: Call[] = []; +let handler: (cmd: string, args: string[]) => { stdout: string; stderr: string }; + +vi.mock('node:child_process', () => { + const execFile = function execFile(): void { + /* unused — only the promisified custom path is exercised */ + }; + // promisify(execFile) returns this custom function, resolving to {stdout,stderr}. 
+ (execFile as unknown as Record<symbol, unknown>)[ + Symbol.for('nodejs.util.promisify.custom') + ] = async (cmd: string, args: string[]) => { + calls.push({ cmd, args }); + return handler(cmd, args); + }; + return { execFile }; +}); + +// Imported after the mock is registered. +const { createImprovementPr } = await import('../../src/improve/pr.js'); + +/** A handler simulating a clean repo with an authed gh, tracking staged files. */ +function happyHandler() { + const staged: string[] = []; + return (cmd: string, args: string[]) => { + if (cmd === 'git' && args[0] === 'add') staged.push(args[1] ?? ''); + if (cmd === 'git' && args.includes('--cached') && args.includes('--name-only')) { + return { stdout: staged.join('\n'), stderr: '' }; + } + if (cmd === 'git' && args[0] === 'rev-parse' && args.includes('--abbrev-ref')) { + return { stdout: 'work-branch', stderr: '' }; + } + if (cmd === 'git' && args[0] === 'status') return { stdout: '', stderr: '' }; + if (cmd === 'gh' && args[0] === 'pr') { + return { stdout: 'https://github.com/o/r/pull/7', stderr: '' }; + } + return { stdout: '', stderr: '' }; + }; +} + +let cwd: string; + +beforeEach(async () => { + cwd = await mkdtemp(join(tmpdir(), 'tiny-code-pr-')); + calls.length = 0; + handler = happyHandler(); +}); + +afterEach(async () => { + await rm(cwd, { recursive: true, force: true }); +}); + +describe('createImprovementPr', () => { + it('opens a PR staging only the single markdown path', async () => { + const result = await createImprovementPr({ + cwd, + slug: 'better-grep-abc', + title: 'Better grep', + markdown: '# Better grep\n', + }); + + expect(result.ok).toBe(true); + expect(result.url).toBe('https://github.com/o/r/pull/7'); + + const adds = calls.filter((c) => c.cmd === 'git' && c.args[0] === 'add'); + expect(adds).toHaveLength(1); + expect(adds[0]?.args).toEqual(['add', 'improvements/better-grep-abc.md']); + }); + + it('never stages with -A or .', async () => { + await createImprovementPr({ cwd, slug: 'x-1', 
title: 'X', markdown: '# X' }); + for (const c of calls) { + if (c.cmd === 'git' && c.args[0] === 'add') { + expect(c.args).not.toContain('-A'); + expect(c.args).not.toContain('.'); + } + } + }); + + it('passes title to gh as a discrete argument (no shell interpolation)', async () => { + const evil = 'X"; rm -rf / #'; + await createImprovementPr({ cwd, slug: 'x-2', title: evil, markdown: '# X' }); + const prCall = calls.find((c) => c.cmd === 'gh' && c.args[0] === 'pr'); + expect(prCall?.args).toContain(evil); // intact, as one arg — not concatenated into a shell line + }); + + it('refuses an unsafe slug before running anything', async () => { + const result = await createImprovementPr({ + cwd, + slug: '../../etc/passwd', + title: 'X', + markdown: '# X', + }); + expect(result.ok).toBe(false); + expect(result.reason).toMatch(/unsafe slug/); + expect(calls).toHaveLength(0); + }); + + it('fails gracefully when gh is missing', async () => { + handler = (cmd) => { + if (cmd === 'gh') throw Object.assign(new Error('not found'), { code: 'ENOENT' }); + return { stdout: '', stderr: '' }; + }; + const result = await createImprovementPr({ cwd, slug: 'x-3', title: 'X', markdown: '# X' }); + expect(result.ok).toBe(false); + expect(result.reason).toMatch(/gh CLI not found/); + }); + + it('fails gracefully when the working tree is dirty', async () => { + const base = happyHandler(); + handler = (cmd, args) => { + if (cmd === 'git' && args[0] === 'status') return { stdout: ' M src/x.ts', stderr: '' }; + return base(cmd, args); + }; + const result = await createImprovementPr({ cwd, slug: 'x-4', title: 'X', markdown: '# X' }); + expect(result.ok).toBe(false); + expect(result.reason).toMatch(/uncommitted changes/); + }); + + it('aborts if anything beyond the markdown file gets staged', async () => { + const base = happyHandler(); + handler = (cmd, args) => { + if (cmd === 'git' && args.includes('--cached') && args.includes('--name-only')) { + return { stdout: 
'improvements/x-5.md\nsrc/evil.ts', stderr: '' }; + } + return base(cmd, args); + }; + const result = await createImprovementPr({ cwd, slug: 'x-5', title: 'X', markdown: '# X' }); + expect(result.ok).toBe(false); + expect(result.reason).toMatch(/Unexpected staged files/); + }); +}); diff --git a/tests/improve/reflect.test.ts b/tests/improve/reflect.test.ts new file mode 100644 index 0000000..fd1fb17 --- /dev/null +++ b/tests/improve/reflect.test.ts @@ -0,0 +1,62 @@ +import { describe, it, expect } from 'vitest'; +import { reflect, serializeTranscript, NO_IMPROVEMENT } from '../../src/improve/reflect.js'; +import type { Message } from '../../src/agent/types.js'; +import type { ModelProvider, ProviderEvent, SendRequest } from '../../src/providers/types.js'; + +class TextProvider implements ModelProvider { + readonly name = 'anthropic' as const; + readonly model = 'fake'; + readonly sent: SendRequest[] = []; + + constructor(private readonly chunks: string[]) {} + + async *send(req: SendRequest): AsyncIterable<ProviderEvent> { + this.sent.push(req); + for (const delta of this.chunks) yield { type: 'text', delta }; + yield { type: 'done', usage: { inputTokens: 0, outputTokens: 0 }, stopReason: 'end_turn' }; + } +} + +describe('serializeTranscript', () => { + it('flattens text, tool_use, and tool_result blocks', () => { + const messages: Message[] = [ + { role: 'user', content: [{ type: 'text', text: 'fix the bug' }] }, + { + role: 'assistant', + content: [{ type: 'tool_use', id: '1', name: 'bash', input: { command: 'ls' } }], + }, + { + role: 'user', + content: [{ type: 'tool_result', toolUseId: '1', content: 'boom', isError: true }], + }, + ]; + const out = serializeTranscript(messages); + expect(out).toContain('[user] fix the bug'); + expect(out).toContain('[tool_use] bash {"command":"ls"}'); + expect(out).toContain('[tool_result (error)] boom'); + }); +}); + +describe('reflect', () => { + it('returns trimmed markdown when the model proposes something', async () => { + const 
provider = new TextProvider(['# Better grep\n', '\n## Summary\nuse rg']); + const result = await reflect({ provider, transcript: 'session', cwd: '/x' }); + expect(result).toBe('# Better grep\n\n## Summary\nuse rg'); + }); + + it('passes no tools to the provider', async () => { + const provider = new TextProvider(['# x']); + await reflect({ provider, transcript: 'session', cwd: '/x' }); + expect(provider.sent[0]?.tools).toEqual([]); + }); + + it('returns null on the sentinel', async () => { + const provider = new TextProvider([NO_IMPROVEMENT]); + expect(await reflect({ provider, transcript: 's', cwd: '/x' })).toBeNull(); + }); + + it('returns null on empty output', async () => { + const provider = new TextProvider([' ']); + expect(await reflect({ provider, transcript: 's', cwd: '/x' })).toBeNull(); + }); +}); diff --git a/tests/improve/run.test.ts b/tests/improve/run.test.ts new file mode 100644 index 0000000..4337329 --- /dev/null +++ b/tests/improve/run.test.ts @@ -0,0 +1,80 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest'; +import type { Message } from '../../src/agent/types.js'; +import type { ModelProvider, ProviderEvent, SendRequest } from '../../src/providers/types.js'; + +const createImprovementPr = vi.fn(); +vi.mock('../../src/improve/pr.js', () => ({ createImprovementPr })); + +const { runImprovement } = await import('../../src/improve/run.js'); + +class TextProvider implements ModelProvider { + readonly name = 'anthropic' as const; + readonly model = 'fake'; + constructor(private readonly text: string) {} + async *send(_req: SendRequest): AsyncIterable<ProviderEvent> { + yield { type: 'text', delta: this.text }; + yield { type: 'done', usage: { inputTokens: 0, outputTokens: 0 }, stopReason: 'end_turn' }; + } +} + +const userMsg: Message[] = [{ role: 'user', content: [{ type: 'text', text: 'hi' }] }]; + +function harness(provider: ModelProvider, confirmValue: boolean) { + const logs: string[] = []; + return { + logs, + opts: { + provider, + messages: 
userMsg, + cwd: '/x', + baseBranch: 'main', + log: (l: string) => logs.push(l), + confirm: async () => confirmValue, + }, + }; +} + +beforeEach(() => { + createImprovementPr.mockReset(); +}); + +describe('runImprovement', () => { + it('reports no activity for an empty transcript', async () => { + const { logs, opts } = harness(new TextProvider('# X'), true); + await runImprovement({ ...opts, messages: [] }); + expect(logs.join()).toMatch(/No session activity/); + expect(createImprovementPr).not.toHaveBeenCalled(); + }); + + it('reports when reflection yields nothing', async () => { + const { logs, opts } = harness(new TextProvider('NO_IMPROVEMENT'), true); + await runImprovement(opts); + expect(logs.join()).toMatch(/No improvements suggested/); + expect(createImprovementPr).not.toHaveBeenCalled(); + }); + + it('skips PR creation when the user declines', async () => { + const { logs, opts } = harness(new TextProvider('# Better grep\nbody'), false); + await runImprovement(opts); + expect(logs.join()).toMatch(/Skipped/); + expect(createImprovementPr).not.toHaveBeenCalled(); + }); + + it('creates a PR and logs the url on approval', async () => { + createImprovementPr.mockResolvedValue({ ok: true, url: 'https://example/pr/1' }); + const { logs, opts } = harness(new TextProvider('# Better grep\nbody'), true); + await runImprovement(opts); + expect(createImprovementPr).toHaveBeenCalledOnce(); + const arg = createImprovementPr.mock.calls[0]?.[0]; + expect(arg.title).toBe('Better grep'); + expect(arg.markdown).toContain('# Better grep'); + expect(logs.join()).toMatch(/https:\/\/example\/pr\/1/); + }); + + it('logs the failure reason when PR creation fails', async () => { + createImprovementPr.mockResolvedValue({ ok: false, reason: 'gh CLI not found' }); + const { logs, opts } = harness(new TextProvider('# Title\nbody'), true); + await runImprovement(opts); + expect(logs.join()).toMatch(/gh CLI not found/); + }); +}); diff --git a/tests/improve/slug.test.ts 
b/tests/improve/slug.test.ts new file mode 100644 index 0000000..fa5131f --- /dev/null +++ b/tests/improve/slug.test.ts @@ -0,0 +1,39 @@ +import { describe, it, expect } from 'vitest'; +import { SLUG_RE, slugify } from '../../src/improve/slug.js'; + +describe('slugify', () => { + it('produces a SLUG_RE-valid slug from a normal title', () => { + const slug = slugify('Improve the grep tool'); + expect(SLUG_RE.test(slug)).toBe(true); + expect(slug).toMatch(/^improve-the-grep-tool-/); + }); + + it('strips punctuation and collapses separators', () => { + const slug = slugify('Add ??? web_fetch!! tool'); + expect(SLUG_RE.test(slug)).toBe(true); + expect(slug.startsWith('-')).toBe(false); + expect(slug).not.toContain('_'); + }); + + it('neutralizes path-traversal attempts', () => { + for (const evil of ['../../etc/passwd', '..\\..\\win', '/abs/path', 'a/b/c.md']) { + const slug = slugify(evil); + expect(SLUG_RE.test(slug)).toBe(true); + expect(slug).not.toContain('/'); + expect(slug).not.toContain('.'); + } + }); + + it('falls back to improvement-<suffix> when nothing usable remains', () => { + const slug = slugify('!!! ...'); + expect(SLUG_RE.test(slug)).toBe(true); + expect(slug).toMatch(/^improvement-/); + }); + + it('caps the base length', () => { + const slug = slugify('x'.repeat(200)); + // base (<=50) + '-' + timestamp suffix + expect(slug.length).toBeLessThan(70); + expect(SLUG_RE.test(slug)).toBe(true); + }); +});