diff --git a/CLAUDE.md b/CLAUDE.md index d7fdfc2..b89a2d9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -103,10 +103,12 @@ Users configure persistent scan preferences via `/pixelslop settings` (interacti | Key | Type | Default | What it does | |-----|------|---------|-------------| | `headed` | boolean | `false` | Open visible browser window during scans | -| `deep` | boolean | `false` | Extended collection — doubled budgets, more elements tested | -| `thorough` | boolean | `false` | Show lower-confidence findings (50% vs 65% threshold) | +| `deep` | boolean | `true` | Extended collection — doubled budgets, more elements tested | +| `thorough` | boolean | `true` | Show lower-confidence findings, tagged with confidence | | `personas` | string | `all` | Persona IDs to evaluate (comma-separated, `all`, or `none`) | +**Exhaustive by default.** `deep` and `thorough` default to `true` because Pixelslop is usually driven by an AI agent that won't remember to pass the flags — the default has to be the thorough one. `--fast` is the opt-out (sets `deep: false`, `thorough: false` for a quick, high-confidence-only pass). The cost of `deep: true` is a slower scan; `--fast` is there when speed matters. + **Merge priority:** CLI args > saved settings > defaults. A user who runs `/pixelslop --thorough` gets thorough mode regardless of what's in `.pixelslop.md`. **Commands:** @@ -133,6 +135,7 @@ Agents use `pixelslop-tools` (bin/pixelslop-tools.cjs) for all state operations. - **`browser analyze-page`** classifies page type (landing-page, e-commerce, content, form-heavy, app-like, general) and suggests relevant personas. Fast (< 2s), no screenshots. - **`config read-tokens` / `config write-tokens`** read and write the project's normative design tokens — a `## Design Tokens` section in `.pixelslop.md` holding flat `key: value` lines (`color-primary: #b8422e`, `font-body: Inter`, `type-scale: 1.25`, `space-unit: 4px`). 
The setup agent captures them from the codebase; the fixer reads them so a fix moves *toward* the project's real palette/type/spacing instead of a generic default. `write-tokens` merges (unspecified keys preserved) and only touches the Design Tokens section — `config write` stays the initializer, tokens layer on top like settings do. - **`scan trend`** reports the score progression across runs. `scan save-results` now appends each run's /20 total (plus per-pillar scores) to `.pixelslop/scan-history.json`; `scan trend [--target ] [--last ]` reads it back (`11 -> 13 -> 14 (+3)`). History is best-effort — a corrupt history file self-heals and never blocks the actual save. The orchestrator surfaces the trend in its scan summary. +- **`personas write` / `personas list`** manage project-specific personas. `write --json ''` validates (required fields, slug-only id, no built-in collision, no path traversal) and saves to `.pixelslop/personas/.json`; `list` returns the 8 built-ins plus any custom ones. The orchestrator generates 1-2 personas from the project's audience/brand and evaluates them alongside the built-ins, so persona findings fit the real users instead of only the generic profiles. ## Voice & Persona diff --git a/bin/pixelslop-tools.cjs b/bin/pixelslop-tools.cjs index 26d64ea..de42af4 100644 --- a/bin/pixelslop-tools.cjs +++ b/bin/pixelslop-tools.cjs @@ -1100,6 +1100,72 @@ function configWriteTokens(args = {}) { : `Design tokens written (${Object.keys(merged).length}): ${configPath}`); } +// The 8 shipped persona profiles. Custom (project-specific) personas live in +// .pixelslop/personas/ and must not collide with these ids. +const BUILTIN_PERSONA_IDS = [ + 'screen-reader-user', 'low-vision-user', 'keyboard-user', 'rushed-mobile-user', + 'slow-connection-user', 'non-native-english', 'design-critic', 'first-time-visitor' +]; + +/** + * Write a project-specific persona to .pixelslop/personas/.json after + * validating it. 
The setup agent generates these from the project's audience, + * and the orchestrator evaluates them alongside the built-ins. + */ +function personasWrite(args = {}) { + try { + if (!args.json) return { ok: false, error: '--json is required' }; + let persona; + try { persona = JSON.parse(args.json); } catch (e) { return { ok: false, error: `Invalid --json: ${e.message}` }; } + if (!persona || typeof persona !== 'object' || Array.isArray(persona)) { + return { ok: false, error: '--json must be a persona object' }; + } + + const required = ['id', 'name', 'category', 'description', 'designPriorities', 'frustrationTriggers', 'positiveSignals']; + const missing = required.filter((k) => persona[k] == null); + if (missing.length) return { ok: false, error: `Missing persona fields: ${missing.join(', ')}` }; + if (!Array.isArray(persona.frustrationTriggers) || !Array.isArray(persona.positiveSignals)) { + return { ok: false, error: 'frustrationTriggers and positiveSignals must be arrays' }; + } + + const id = String(persona.id); + // id doubles as the filename, so it must be a safe slug — no traversal, no surprises. + if (!/^[a-z0-9][a-z0-9-]{1,40}$/.test(id)) { + return { ok: false, error: `Persona id must be a lowercase slug [a-z0-9-], 2-41 chars: got "${id}"` }; + } + if (BUILTIN_PERSONA_IDS.includes(id)) { + return { ok: false, error: `"${id}" collides with a built-in persona; use a project-specific id` }; + } + + const dir = path.join(resolveProjectRoot(args.root), '.pixelslop', 'personas'); + const outPath = path.join(dir, `${id}.json`); + // Defence in depth: the written file must stay inside the personas dir. 
+ if (path.dirname(path.resolve(outPath)) !== path.resolve(dir)) { + return { ok: false, error: 'unsafe persona path' }; + } + fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(outPath, JSON.stringify(persona, null, 2), 'utf-8'); + return { ok: true, id, path: outPath }; + } catch (err) { + return { ok: false, error: err.message }; + } +} + +/** + * List available personas: the built-ins plus any custom ones in + * .pixelslop/personas/. Lets the orchestrator discover generated personas. + */ +function personasList(args = {}) { + const dir = path.join(resolveProjectRoot(args.root), '.pixelslop', 'personas'); + let custom = []; + if (fs.existsSync(dir)) { + custom = fs.readdirSync(dir) + .filter((f) => f.endsWith('.json') && !f.startsWith('._')) + .map((f) => f.replace(/\.json$/, '')); + } + return { ok: true, builtin: BUILTIN_PERSONA_IDS, custom, dir }; +} + /** * Check if .pixelslop.md exists. */ @@ -1113,10 +1179,14 @@ function configExists(args = {}) { // ───────────────────────────────────────────── /** Valid setting keys and their value types/defaults */ +// Defaults are exhaustive on purpose. Pixelslop is usually driven by an AI agent +// that won't remember to pass --thorough or --deep, so the default has to be the +// thorough one. `--fast` (handled in SKILL.md) is the opt-out that turns deep and +// thorough back off for a quick pass. 
const SETTING_DEFS = { headed: { type: 'boolean', default: false, description: 'Open visible browser window' }, - deep: { type: 'boolean', default: false, description: 'Extended collection with doubled budgets' }, - thorough: { type: 'boolean', default: false, description: 'Show lower-confidence findings' }, + deep: { type: 'boolean', default: true, description: 'Extended collection with doubled budgets (off with --fast)' }, + thorough: { type: 'boolean', default: true, description: 'Show lower-confidence findings, tagged (off with --fast)' }, personas: { type: 'string', default: 'all', description: 'Persona IDs (comma-separated, "all", or "none")' }, }; @@ -3077,32 +3147,53 @@ function reportGenerate(flags) { `; } - // ── Findings table ── - let findingsHtml; - if (hasFixData && findings.length > 0) { - // Full table with category + status columns - const rows = findings.map(f => { - const text = typeof f === 'string' ? f : (f.description || ''); - const priority = typeof f === 'object' ? (f.priority || 'P2') : 'P2'; + // ── Findings table (split into measured evidence vs design judgment) ── + // kind defaults to 'measured' so existing scans render exactly as before; + // the design-director pass is the only producer of 'judgment' findings. + const kindOf = (f) => (typeof f === 'object' && f.kind === 'judgment') ? 'judgment' : 'measured'; + const measuredFindings = findings.filter(f => kindOf(f) === 'measured'); + const judgmentFindings = findings.filter(f => kindOf(f) === 'judgment'); + + const renderRows = (list) => list.map(f => { + const text = typeof f === 'string' ? f : (f.description || ''); + const priority = typeof f === 'object' ? (f.priority || 'P2') : 'P2'; + // Judgment findings carry a confidence the report surfaces inline. + const conf = (typeof f === 'object' && f.confidence != null) + ? ` (${escapeHtml(String(f.confidence))})` : ''; + if (hasFixData) { const category = typeof f === 'object' ? 
(f.category || '') : ''; - // Try to match finding to plan issue for status let fixStatus = 'OPEN'; if (typeof f === 'object' && f.id && issueMap.has(f.id)) { fixStatus = (issueMap.get(f.id).status || 'pending').toUpperCase(); } - return `${escapeHtml(priority)}${escapeHtml(category)}${escapeHtml(text)}${escapeHtml(fixStatus)}`; - }).join('\n '); - findingsHtml = `\n ${rows}\n
PriorityCategoryFindingStatus
`; - } else if (findings.length > 0) { - // Simple table without category/status - const rows = findings.map(f => { - const text = typeof f === 'string' ? f : (f.description || ''); - const priority = typeof f === 'object' ? (f.priority || 'P2') : 'P2'; - return `${escapeHtml(priority)}${escapeHtml(text)}`; - }).join('\n '); - findingsHtml = `\n ${rows}\n
PriorityFinding
`; - } else { + return `${escapeHtml(priority)}${escapeHtml(category)}${escapeHtml(text)}${conf}${escapeHtml(fixStatus)}`; + } + return `${escapeHtml(priority)}${escapeHtml(text)}${conf}`; + }).join('\n '); + + const tableFor = (list) => { + const head = hasFixData + ? 'PriorityCategoryFindingStatus' + : 'PriorityFinding'; + return `${head}\n ${renderRows(list)}\n
`; + }; + const layerHeading = (title, note) => + `

${escapeHtml(title)} — ${escapeHtml(note)}

`; + + let findingsHtml; + if (findings.length === 0) { findingsHtml = '

No findings

'; + } else if (judgmentFindings.length === 0) { + // Only measured findings — render the single table, no layer headings (unchanged look). + findingsHtml = tableFor(measuredFindings.length ? measuredFindings : findings); + } else { + // Both layers present — label and separate them so judgment never reads as measured fact. + const sections = []; + if (measuredFindings.length > 0) { + sections.push(layerHeading('Measured', 'evidence-backed') + tableFor(measuredFindings)); + } + sections.push(layerHeading('Design judgment', "a design director's read, not measured") + tableFor(judgmentFindings)); + findingsHtml = sections.join('\n '); } // ── Fix section (entire tab-section div, or empty) ── @@ -3398,6 +3489,15 @@ async function main() { break; } + case 'personas': { + switch (command) { + case 'write': return output(personasWrite(flags), true); + case 'list': return output(personasList(flags), true); + default: fail(`Unknown personas command: ${command}. Valid: write, list`); + } + break; + } + default: fail(`Unknown group: ${group}. Valid: plan, checkpoint, gate, config, log, discover, serve, init, verify, browser, scan, report`); } diff --git a/dist/agents/internal/pixelslop-eval-design-director.md b/dist/agents/internal/pixelslop-eval-design-director.md new file mode 100644 index 0000000..f80a56b --- /dev/null +++ b/dist/agents/internal/pixelslop-eval-design-director.md @@ -0,0 +1,95 @@ +--- +name: pixelslop-eval-design-director +description: > + The subjective design-judgment pass. Looks at the screenshots and reads the + page like a design director — composition, distinctiveness, emotional fit, + missed opportunities — then argues against its own findings before returning + them. Produces judgment findings only. Does NOT touch the /20 score. +model: sonnet +tools: + - Read +--- + +You're the design director. The other six evaluators measure things — contrast ratios, type scales, overflow. 
You do the thing a measurement can't: you look at the page and say whether it's actually *good*, and where a real designer would push back. + +This is the subjective pass on purpose. You are allowed to have taste and opinions. But you are also the one evaluator most at risk of producing noise — vague, unfalsifiable, "make it pop" feedback that wastes everyone's time. So you do two passes: first you say what you see, then you argue against yourself and throw out everything you can't defend. What survives is what you return. + +**You never touch the /20 score.** The score stays measured. Your findings are a separate layer, labeled as judgment. Your job is coverage and taste, not grading. + +## Setup: Load Your Knowledge + +``` +Read dist/skill/resources/scoring.md # The whole rubric — know what's already measured so you don't repeat it +Read dist/skill/resources/ai-slop-patterns.md # The visual fingerprints of AI-generated design +Read dist/skill/resources/heuristics.md # Nielsen's 10, adapted — the UX lens +Read dist/skill/resources/cognitive-load.md # When a page asks too much of the user +``` + +## Input + +- **evidence_path** (required) — absolute path to the evidence bundle JSON +- **thorough** (optional, default: false) — when true, also keep low-confidence findings; when false, only high- and medium-confidence + +## Protocol + +1. **Read your resource files.** All four. You need to know what's already measured so you don't just restate it in prose. + +2. **Read the evidence bundle** at `evidence_path`. Note the pillar evidence, the slop patterns already detected, the persona checks. + +3. **Look at the screenshots.** This is the part the measured evaluators can't do. The bundle has `viewports.desktop.screenshot`, `viewports.tablet.screenshot`, `viewports.mobile.screenshot` (and scroll-fold screenshots if present). `Read` each PNG path. A screenshot you didn't open doesn't count — don't opine on a layout you haven't seen. + +4. 
**First pass — say what you see.** Look like a design director reviewing a junior's work. Draft findings across these lenses: + - **Does this look AI-generated?** Be honest. Generic hero, icon-heading-paragraph-button rows, no point of view, every section the same rhythm. The `ai-slop-patterns.md` fingerprints, but as a gestalt, not a checklist. + - **Composition & distinctiveness** — does the page have a point of view, or is it a template? Is there a focal point, a reason the eye goes where it goes? Would anyone remember this page? + - **Emotional fit** — does the feeling match the job? A funeral home that feels like a fintech startup is wrong even if every contrast ratio passes. + - **Missed opportunities** — the strongest design-director move. Not "this is broken" but "this is fine and forgettable, and here's the version that isn't." + - **UX heuristics & cognitive load** — where the page makes the user think too hard, in ways the measured pillars don't already flag. + +5. **Second pass — argue against yourself.** For every finding from pass 1, ask: + - *Is this falsifiable, or is it "make it pop"?* If you can't point at the screenshot and say what specifically and why, cut it. + - *Is a measured evaluator already saying this?* If contrast/typography/hierarchy already flagged it, drop yours — it's their finding, measured beats judgment. + - *Am I imposing one taste, or is this a real problem?* A bold, deliberate choice you personally wouldn't make is not a finding. Respect intent. Pixelslop does not punish distinctive design for being distinctive. + - *Would a second design director agree?* If you're only ~60% sure, tag it `low`. If you'd bet on it, `high`. + + Kill everything that fails. Be ruthless — a short list of sharp, defensible reads beats a long list of vibes. Returning two real findings is a success. Inventing eight to look thorough is the failure mode this pass exists to prevent. + +6. 
**Return JSON.** Findings that survived, each tagged `kind: "judgment"` and a confidence. + +## Output Format + +Return exactly this. Nothing else. + +```json +{ + "kind": "design-director", + "verdict": "One honest sentence: does this look designed, or generated?", + "findings": [ + { + "criterion": "distinctiveness", + "kind": "judgment", + "confidence": "high", + "detail": "Every section is icon / heading / paragraph / button at the same rhythm — the page reads as a template with the content swapped in, not as a designed page.", + "evidence": "desktop screenshot: features, testimonials, and pricing sections share identical structure and spacing", + "opportunity": "Break the rhythm — let one section be full-bleed, vary the grid, give the hero a real focal object instead of centered text over a gradient." + } + ] +} +``` + +Each finding needs: +- `criterion` — the lens (`ai-slop`, `distinctiveness`, `composition`, `emotional-fit`, `missed-opportunity`, `cognitive-load`, `ux-heuristic`) +- `kind` — always `"judgment"` +- `confidence` — `"high"` or `"medium"` (or `"low"` only in thorough mode) +- `detail` — what you see, specific enough to point at in the screenshot +- `evidence` — which screenshot/viewport, and what in it +- `opportunity` — optional but encouraged; the better version, concretely + +## Rules + +1. **Judgment only — never a score.** You do not return a `score` or `pillar`. The /20 is measured. If you find yourself wanting to grade, stop. +2. **You looked, or you don't speak.** Every finding cites a specific screenshot. No opining on layouts you didn't open. +3. **Don't restate measured findings.** If a pillar evaluator measured it, it's theirs. You cover what measurement can't. +4. **Respect intent.** Distinctive ≠ wrong. Bold ≠ broken. A choice you wouldn't make is not a defect. +5. **The second pass is mandatory.** Returning pass-1 findings without arguing against them is the one thing you must never do. Noise is worse than silence here. +6. 
**Confidence is honest.** `high` means you'd defend it in a studio review. Don't inflate. +7. **Return JSON only.** No markdown, no preamble. diff --git a/dist/agents/pixelslop.md b/dist/agents/pixelslop.md index dd01a4f..c7cc5a5 100644 --- a/dist/agents/pixelslop.md +++ b/dist/agents/pixelslop.md @@ -159,6 +159,24 @@ node bin/pixelslop-tools.cjs config write \ If the user wants to skip setup, proceed without it — config is optional. +### Step 5b: Generate Project-Specific Personas + +If you have a real audience and brand for this project (from design context above or an existing `.pixelslop.md`), generate 1-2 personas tuned to *this* project's actual users — not just the 8 generic built-ins. A wedding-planner site should be tested by "the stressed bride three weeks out," not only "first-time visitor." + +First check whether project personas already exist (don't regenerate every run): + +```bash +node bin/pixelslop-tools.cjs personas list --root "$ROOT" --raw +``` + +If `custom` is empty and you have audience/brand, synthesize 1-2 personas following `dist/skill/resources/personas/schema.md` (a real `humanName`, the project's actual user in `description`, `frustrationTriggers` and `positiveSignals` specific to this audience), and write each via: + +```bash +node bin/pixelslop-tools.cjs personas write --root "$ROOT" --raw --json '' +``` + +Use a project-specific `id` slug (e.g. `stressed-bride`, not a built-in id). Only generate what the audience genuinely supports — one sharp project persona beats two generic ones. Skip this step entirely when there's no real audience to work from. + ### Step 6: Collect Evidence **Log before collection:** @@ -185,22 +203,24 @@ node bin/pixelslop-tools.cjs log write --agent orchestrator --level info --messa ### Step 6b: Spawn Specialist Evaluators -Spawn all 6 specialist evaluators from `dist/agents/internal/`. Each receives the evidence file path and reads its own domain resource files. 
+Spawn the 6 measured specialists plus the design-director from `dist/agents/internal/`. Each receives the evidence file path and reads its own domain resource files. ``` Spawn agents (parallel where runtime supports it): - - pixelslop-eval-hierarchy (evidence_path, thorough flag) - - pixelslop-eval-typography (evidence_path, thorough flag) - - pixelslop-eval-color (evidence_path, thorough flag) - - pixelslop-eval-responsiveness (evidence_path, thorough flag) - - pixelslop-eval-accessibility (evidence_path, thorough flag) - - pixelslop-eval-slop (evidence_path, thorough flag) + - pixelslop-eval-hierarchy (evidence_path, thorough flag) + - pixelslop-eval-typography (evidence_path, thorough flag) + - pixelslop-eval-color (evidence_path, thorough flag) + - pixelslop-eval-responsiveness (evidence_path, thorough flag) + - pixelslop-eval-accessibility (evidence_path, thorough flag) + - pixelslop-eval-slop (evidence_path, thorough flag) + - pixelslop-eval-design-director (evidence_path, thorough flag) ``` Each pillar specialist returns JSON: `{ "pillar": "...", "score": N, "evidence": "...", "findings": [...] }` The slop classifier returns JSON: `{ "band": "...", "patternCount": N, "patterns": [...] }` +The design-director returns JSON: `{ "kind": "design-director", "verdict": "...", "findings": [...] }` where every finding is `kind: "judgment"` with a `confidence`. It returns **no score** — it never affects the /20. -Collect all 6 results. +Collect all 7 results. The 6 measured specialists feed the scores and measured findings; the design-director feeds only the judgment layer. 
### Step 6c: Aggregate Report @@ -227,14 +247,21 @@ Patterns detected: [patternCount] [patterns list from eval-slop] ### Findings -[merge all specialist findings, sort by priority] + +**Measured** [evidence-backed] +[merge the 6 measured specialists' findings, sort by priority — each carries kind: "measured"] + +**Design judgment** [the design director's read, not measured] +[the design-director's verdict line, then its findings — each carries kind: "judgment" and a confidence. Omit this whole sub-section if the director returned no findings. These never change the /20.] ### Persona Insights -[For each evaluated persona: read the persona JSON's humanName, name, narrationStyle.voice, and sampleReactions. +[Evaluate the selected built-in personas (per the `personas` setting) AND every project-specific persona. Discover the project ones with `personas list` — read each `custom` id's JSON from `.pixelslop/personas/<id>.json`. Built-in JSONs live in `dist/skill/resources/personas/`. Custom personas use the exact same schema, so evaluate them identically. + +For each evaluated persona: read the persona JSON's humanName, name, narrationStyle.voice, and sampleReactions. Match frustrationTriggers and positiveSignals against specialist findings and personaChecks data from the evidence bundle. Write a 1-3 paragraph narrative in the persona's voice — see scoring.md Persona Report Format for contract and examples. End each persona section with the **Issues:** and **Worked well:** machine-parseable anchors. -Skip personas with zero issues and no notable positives.] +Skip personas with zero issues and no notable positives. A project-specific persona that surfaces a real audience issue is the most valuable one in the report — lead with it.] 
### Screenshots [reference from evidence bundle] diff --git a/dist/skill/SKILL.md b/dist/skill/SKILL.md index 5fb872b..0768f1f 100644 --- a/dist/skill/SKILL.md +++ b/dist/skill/SKILL.md @@ -1,9 +1,12 @@ --- name: pixelslop description: > - Browser-first design quality review and fix. Scans pages with Playwright, - scores 5 design pillars, detects AI slop patterns, fixes issues with - checkpoint-based rollback. + Browser-first design quality review and fix. Scans real pages with Playwright, + scores 5 measured pillars, detects AI slop patterns, and runs a design-director + pass for subjective judgment findings. Evaluates against 8 built-in personas + plus project-specific ones generated from your audience, tracks score trends + across runs, and fixes issues toward your design tokens with checkpoint-based + rollback. Exhaustive by default (--fast for a quick pass). user-invokable: true args: - name: url @@ -22,7 +25,13 @@ args: description: Persona IDs to evaluate (comma-separated, "all", or "none"). Default all required: false - name: thorough - description: Show lower-confidence findings (threshold 50% instead of 65%) + description: Show lower-confidence findings, tagged with confidence. Default true (exhaustive) + required: false + - name: deep + description: Extended collection with doubled budgets and more elements tested. Default true (exhaustive) + required: false + - name: fast + description: Quick pass — turns deep and thorough off for a faster, high-confidence-only scan required: false - name: debug description: Enable session logging to .pixelslop-session.log for troubleshooting @@ -35,6 +44,16 @@ args: required: false --- +## Asking the user (works in any harness) + +Pixelslop runs under different harnesses (Claude Code, Codex CLI, and others), and they ask the user questions differently. 
Wherever this skill says to ask the user — including every `AskUserQuestion(...)` block below — present the **same question and the same options** using whatever your harness supports: + +- **Claude Code:** use the `AskUserQuestion` tool with the listed options (structured, selectable). +- **Codex CLI, or any harness with no choice-prompt tool:** print the question and its options as a short numbered list, then **stop and wait** for the user to reply with a number or text. Codex has no `AskUserQuestion`-style popup (it's an open request upstream), so a plain numbered menu is the equivalent. Don't silently pick a default and continue — the point is to let the user choose. +- **Non-interactive runs** (`codex exec`, CI, or `--quick`): don't ask at all. Use the saved setting or the documented default and proceed. + +The `AskUserQuestion(...)` snippets in this file are the question **content** — the exact wording and options to surface. *How* you render them is your harness's call; *what* you ask is not. If you're not on Claude Code, read each block as "ask this question, offer these options" and present it your way. + ## Settings Mode When `--settings` is passed (e.g., `/pixelslop settings`), run the interactive settings configurator and stop — don't scan anything. @@ -132,6 +151,60 @@ Tell them: "These settings apply to all future `/pixelslop` runs in this project --- +## Capabilities & Options (the full menu) + +Everything Pixelslop can do, in one place. Read this so you can tell the user what's available — most people (and most agents) don't know half of it. When a scan finishes, mention the one or two options that fit their situation. + +**What a scan produces:** +- **5 measured pillars** (hierarchy, typography, color, responsiveness, accessibility), scored /20 from real browser evidence. +- **AI slop detection** — 25 visual patterns + source patterns. 
+- **Design-director judgment** — a subjective pass that looks at the screenshots and flags what measurement can't (generic composition, AI-generated feel, missed opportunities). Shown in a separate "Design judgment" layer; never affects the /20. +- **Persona evaluation** — 8 built-in personas, plus 1-2 project-specific personas generated from your audience/brand. +- **Score trends** — each run's score is recorded; repeat scans show movement (`scan trend`). +- **Self-contained HTML report** with screenshots and the measured/judgment split. + +**Run options (flags):** +- `--fast` — quick pass; turns off deep + thorough (Pixelslop is exhaustive by default). +- `--thorough` / `--deep` — both default **on**; `--fast` is the opt-out. +- `--personas all|none|` — which personas to evaluate (default all). +- `--code-check` — source-only analysis, no browser. +- `--quick` — skip the per-run config prompt, use saved settings/defaults. +- `--headed` — visible browser window. +- `--settings` — open the interactive settings configurator. +- `--debug` — session logging for troubleshooting. + +**Beyond scanning:** +- **Fix loop** — locates the source, fixes *toward your design tokens*, checkpoints before editing, rolls back if the build breaks. +- **Design tokens** — `config read-tokens` / `write-tokens` hold your real palette/type/spacing so fixes match the project. +- **Custom personas** — `personas write` adds your own; the orchestrator also generates project-specific ones automatically. +- **Settings** — `/pixelslop settings` saves preferences per project so you don't pass flags every run. + +If a scan was slow, mention `--fast`. If the user has a clear audience, project personas are already working for them. If they've scanned before, point at the trend. Surface what's relevant; don't dump the whole list every time. + +## Advise, don't interrogate (read this before asking the user anything) + +You are an advisor, not a config form. 
Before you scan, work out what the user is actually trying to do and **lead with a recommendation**, then offer the alternative. Don't open with a wall of settings questions, and don't silently run defaults on a request that implies something else. + +Infer intent from how they asked, then match it: + +| What they said / the situation | Recommend | Why | +|--------------------------------|-----------|-----| +| "quick look", "does this look ok", a glance | **`--fast`** | high-confidence findings only, ~10s — respects "quick" | +| "review", "before launch", "audit", or unspecified | **the default** (exhaustive: 5 pillars + design-director + personas) | catches the soft stuff, not just what's measurable | +| First scan of this project (no `.pixelslop.md`) | **setup first**, then scan | gathering audience/brand unlocks project personas + token-aware fixes | +| Clear audience/brand mentioned | default + **let it generate a project persona** | tests against their real users, not just generic profiles | +| No URL, local project | help resolve a dev-server URL, or **`--code-check`** | code-check needs no browser | +| "in CI", "automate", "for every PR" | **`--fast --quick --personas none`** | fast and deterministic, no prompts | +| "is it getting better?", iterating | scan, then **`scan trend`** | shows the /20 climbing across runs | +| Wants fixes, not just a report | scan → **fix loop** → re-scan | fixes move toward their tokens; the trend confirms it | + +How to actually advise: +1. **State your recommendation and the one tradeoff**, in a sentence. "You're pre-launch, so I'll run the full exhaustive scan with a persona tuned to your audience — it's thorough so ~30-40s. Want a fast gut-check instead?" +2. **Only ask when there's a real fork.** If the intent is clear, recommend and proceed. If it's genuinely ambiguous (quick vs thorough, fix vs report-only), present 2-3 concrete options with their tradeoff and let them pick — don't ask about individual flags. +3. 
**Never** present the raw settings questions (personas? deep? thorough?) as the opening move. Those are for `/pixelslop settings`, not for advising a scan. Translate intent into the flags yourself. + +The point: the user shouldn't need to know the flags exist. You know them. Recommend the right run, explain it in one line, and let them redirect. + ## How This Works You (the main session) handle all user-facing decisions **before** spawning the orchestrator. The orchestrator runs to completion — no mid-execution pauses, no SendMessage relay. This keeps things reliable. @@ -349,7 +422,9 @@ A lightweight pre-scan step that lets the user tweak settings for this specific | 1 | CLI flags | This run only — e.g., `--personas none --thorough` | | 2 | Per-run answers | This run only — user picks in Phase 2b | | 3 | Saved settings | All runs — from `.pixelslop.md` | -| 4 | Defaults | Fallback — `personas: all`, `thorough: false`, etc. | +| 4 | Defaults | Fallback — exhaustive by default: `personas: all`, `thorough: true`, `deep: true` | + +**Exhaustive by default.** Pixelslop is usually driven by an AI agent that won't remember to pass `--thorough` or `--deep`, so those default to **on**. `thorough: true` shows lower-confidence findings tagged with their confidence rather than hiding them; `deep: true` doubles collection budgets for more evidence (at the cost of a slower scan). The opt-out is **`--fast`**: when the user passes `--fast`, set `thorough: false` and `deep: false` for that run (a quick, high-confidence-only pass). `--fast` is a CLI flag, so it wins over saved settings for this run, same as any other flag. ### Skip conditions diff --git a/dist/skill/resources/scoring.md b/dist/skill/resources/scoring.md index 1fd53f4..42fff29 100644 --- a/dist/skill/resources/scoring.md +++ b/dist/skill/resources/scoring.md @@ -310,6 +310,11 @@ The Evidence column in the scores table is not optional. 
A score without evidenc Findings should be ordered by impact -- the thing that hurts the site the most goes first. Each finding should reference which pillar it affects and include the specific browser observation that surfaced it. +**Finding kinds — measured vs judgment.** Every finding carries a `kind`: + +- `kind: "measured"` (the default) — backed by a specific browser measurement. The six measured evaluators (the five pillar evaluators plus the slop classifier) and the persona checks only ever produce these. If `kind` is absent, it is measured. +- `kind: "judgment"` — a subjective read from the design-director pass (composition, distinctiveness, emotional fit, missed opportunities). These do **not** affect the /20 score; the score stays measured-only. They carry a `confidence` field (`low`/`medium`/`high`) and render in a separate "Design judgment" layer of the report, clearly labeled as opinion, not measured fact. This is how Pixelslop stays exhaustive without letting judgment masquerade as measurement. + Screenshots are references to captured images, not inline data. If a screenshot was not captured for a given viewport, note it as `[not captured]` and that gap should be reflected in the confidence score. --- diff --git a/tests/design-director.test.js b/tests/design-director.test.js new file mode 100644 index 0000000..ef254a4 --- /dev/null +++ b/tests/design-director.test.js @@ -0,0 +1,65 @@ +/** + * Design Director Contract Tests + * + * The design-director is the subjective judgment pass — the one evaluator that + * looks at screenshots and opines.
Its whole value depends on a few invariants + * that are easy to erode in editing, so they're pinned here: + * - it produces judgment findings only and never a /20 score + * - it actually looks at the screenshots + * - it runs the adversarial second pass (the anti-noise guard) + * - the orchestrator spawns it and routes its output to the judgment layer + * + * Run: node --test tests/design-director.test.js + */ + +import { describe, it } from 'node:test'; +import { strict as assert } from 'node:assert'; +import { readFileSync, existsSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const ROOT = join(dirname(fileURLToPath(import.meta.url)), '..'); +const DIRECTOR = join(ROOT, 'dist', 'agents', 'internal', 'pixelslop-eval-design-director.md'); +const ORCH = join(ROOT, 'dist', 'agents', 'pixelslop.md'); + +describe('design-director spec', () => { + assert.ok(existsSync(DIRECTOR), 'design-director spec must exist'); + const spec = readFileSync(DIRECTOR, 'utf-8'); + + it('has read-only frontmatter (no Write/Edit)', () => { + const fm = spec.slice(0, spec.indexOf('---', 3)); + assert.ok(/name:\s*pixelslop-eval-design-director/.test(spec), 'name set'); + assert.ok(/tools:[\s\S]*-\s*Read/.test(spec), 'has Read tool'); + assert.ok(!/-\s*Write/.test(fm) && !/-\s*Edit/.test(fm), 'must not have Write or Edit'); + }); + + it('produces judgment only and never a score', () => { + assert.ok(/never.{0,20}(score|\/20)|no score|stays measured/i.test(spec), 'states it never scores'); + assert.ok(spec.includes('"kind": "judgment"') || /kind.{0,4}judgment/.test(spec), 'findings are kind judgment'); + assert.ok(!/"score"\s*:/.test(spec) || /do not return a `?score/i.test(spec), 'no score field in output, or explicitly forbidden'); + }); + + it('actually looks at the screenshots', () => { + assert.ok(/screenshot/i.test(spec), 'references screenshots'); + assert.ok(/Read.{0,40}(PNG|screenshot)|screenshot you didn/i.test(spec), 
'instructed to open the screenshot'); + }); + + it('runs the adversarial second pass (anti-noise guard)', () => { + assert.ok(/argue against (yourself|your own)|second pass/i.test(spec), 'has the self-argument pass'); + assert.ok(/confidence/i.test(spec), 'tags findings with confidence'); + assert.ok(/respect intent|distinctive.{0,4}(!=|≠|is not).{0,10}wrong/i.test(spec), 'respects intentional bold design'); + }); +}); + +describe('orchestrator wiring', () => { + const orch = readFileSync(ORCH, 'utf-8'); + + it('spawns the design-director', () => { + assert.ok(orch.includes('pixelslop-eval-design-director'), 'orchestrator spawns the director'); + }); + + it('routes its findings to a separate judgment layer, not the score', () => { + assert.ok(/no score|never affects the \/20|never change the \/20/i.test(orch), 'director does not affect the /20'); + assert.ok(/Design judgment/i.test(orch), 'findings go to a Design judgment section'); + }); +}); diff --git a/tests/evaluator.test.js b/tests/evaluator.test.js index dd92397..382f7d1 100644 --- a/tests/evaluator.test.js +++ b/tests/evaluator.test.js @@ -90,10 +90,11 @@ describe('Internal evaluator agents directory', () => { assert.ok(existsSync(INTERNAL), 'Missing: dist/agents/internal/'); }); - it('contains exactly 6 evaluator specs', () => { + it('contains the 6 measured evaluators plus the design-director (7 total)', () => { const files = readdirSync(INTERNAL).filter(f => f.endsWith('.md') && !f.startsWith('._')); - assert.equal(files.length, 6, - `Expected 6 internal evaluator specs, found ${files.length}: ${files.join(', ')}`); + assert.equal(files.length, 7, + `Expected 7 internal evaluator specs (6 measured + design-director), found ${files.length}: ${files.join(', ')}`); + assert.ok(files.includes('pixelslop-eval-design-director.md'), 'design-director spec must be present'); }); }); @@ -189,9 +190,11 @@ describe('Pillar coverage', () => { assert.ok(existsSync(INTERNAL), `INTERNAL dir missing: ${INTERNAL}`); const 
raw = readdirSync(INTERNAL); const mdFiles = raw.filter(f => f.endsWith('.md') && !f.startsWith('._')); - const pillarFiles = mdFiles.filter(f => !f.includes('eval-slop')); + // Pillar evaluators exclude the slop classifier and the design-director — + // the director is the subjective judgment pass, not a scored pillar. + const pillarFiles = mdFiles.filter(f => !f.includes('eval-slop') && !f.includes('eval-design-director')); assert.equal(pillarFiles.length, 5, - `Expected 5 pillar evaluators (excluding slop), found ${pillarFiles.length}. Raw dir: ${raw.join(', ')}. MD files: ${mdFiles.join(', ')}. Pillar files: ${pillarFiles.join(', ')}`); + `Expected 5 pillar evaluators (excluding slop + design-director), found ${pillarFiles.length}. Raw dir: ${raw.join(', ')}. MD files: ${mdFiles.join(', ')}. Pillar files: ${pillarFiles.join(', ')}`); }); }); diff --git a/tests/personas-tool.test.js b/tests/personas-tool.test.js new file mode 100644 index 0000000..23d0953 --- /dev/null +++ b/tests/personas-tool.test.js @@ -0,0 +1,104 @@ +/** + * Personas Tool Tests + * + * `personas write` validates and saves a project-specific persona to + * .pixelslop/personas/, and `personas list` reports built-ins + custom ones so + * the orchestrator can discover generated personas. The id doubles as the + * filename, so validation (slug-only, no built-in collisions, no traversal) is + * a safety boundary, not a nicety. 
+ * + * Run: node --test tests/personas-tool.test.js + */ + +import { describe, it, beforeEach } from 'node:test'; +import { strict as assert } from 'node:assert'; +import { execFileSync } from 'node:child_process'; +import { existsSync, mkdtempSync, rmSync, readFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const ROOT = join(dirname(fileURLToPath(import.meta.url)), '..'); +const TOOLS = join(ROOT, 'bin', 'pixelslop-tools.cjs'); + +function run(args) { + const stdout = execFileSync('node', [TOOLS, ...args], { encoding: 'utf-8' }); + try { return JSON.parse(stdout); } catch { return { _raw: stdout }; } +} +const persona = (over = {}) => JSON.stringify({ + id: 'stressed-bride', name: 'Stressed Bride', category: 'context', + description: 'A bride three weeks from her wedding, evaluating a planner', + designPriorities: { hierarchy: 4 }, + frustrationTriggers: ['buried pricing'], positiveSignals: ['clear timeline'], + ...over +}); + +describe('personas write / list', () => { + let dir; + beforeEach(() => { dir = mkdtempSync(join(tmpdir(), 'pxs-personas-')); }); + const cleanup = () => { try { rmSync(dir, { recursive: true, force: true }); } catch {} }; + + it('lists the 8 built-ins and no custom on a fresh project', () => { + const r = run(['personas', 'list', '--root', dir, '--raw']); + assert.equal(r.builtin.length, 8); + assert.deepEqual(r.custom, []); + cleanup(); + }); + + it('writes a valid persona and lists it', () => { + const w = run(['personas', 'write', '--root', dir, '--raw', '--json', persona()]); + assert.equal(w.ok, true); + assert.equal(w.id, 'stressed-bride'); + assert.ok(existsSync(join(dir, '.pixelslop', 'personas', 'stressed-bride.json'))); + const l = run(['personas', 'list', '--root', dir, '--raw']); + assert.deepEqual(l.custom, ['stressed-bride']); + cleanup(); + }); + + it('rejects an id that collides with a built-in', () => { + const w = 
run(['personas', 'write', '--root', dir, '--raw', '--json', persona({ id: 'design-critic' })]); + assert.equal(w.ok, false); + assert.match(w.error, /collides/i); + cleanup(); + }); + + it('rejects a non-slug / path-traversal id', () => { + for (const bad of ['../evil', 'Has Spaces', 'a/b', 'UPPER']) { + const w = run(['personas', 'write', '--root', dir, '--raw', '--json', persona({ id: bad })]); + assert.equal(w.ok, false, `id "${bad}" must be rejected`); + } + // and nothing escaped the personas dir + assert.ok(!existsSync(join(dir, 'evil.json'))); + cleanup(); + }); + + it('rejects a persona missing required fields', () => { + const w = run(['personas', 'write', '--root', dir, '--raw', '--json', '{"id":"x"}']); + assert.equal(w.ok, false); + assert.match(w.error, /Missing persona fields/i); + cleanup(); + }); + + it('rejects non-array frustrationTriggers', () => { + const w = run(['personas', 'write', '--root', dir, '--raw', '--json', persona({ frustrationTriggers: 'nope' })]); + assert.equal(w.ok, false); + cleanup(); + }); + + it('rejects invalid JSON cleanly', () => { + const w = run(['personas', 'write', '--root', dir, '--raw', '--json', '{not json']); + assert.equal(w.ok, false); + assert.match(w.error, /Invalid --json/i); + cleanup(); + }); +}); + +describe('persona generation is wired into the orchestrator', () => { + const orch = readFileSync(join(ROOT, 'dist', 'agents', 'pixelslop.md'), 'utf-8'); + + it('the orchestrator generates project personas and discovers them', () => { + assert.ok(orch.includes('personas write'), 'orchestrator writes generated personas'); + assert.ok(orch.includes('personas list'), 'orchestrator discovers custom personas'); + assert.ok(/project-specific persona|project's actual users|tuned to/i.test(orch), 'frames them as project-specific'); + }); +}); diff --git a/tests/report-layers.test.js b/tests/report-layers.test.js new file mode 100644 index 0000000..74e6f6a --- /dev/null +++ b/tests/report-layers.test.js @@ -0,0 +1,80 @@ 
+/** + * Report Layer Tests + * + * Findings now carry a `kind`: "measured" (evidence-backed, the default) or + * "judgment" (the design-director's subjective read). The HTML report keeps the + * two visually separate so judgment never reads as measured fact, and a scan with + * only measured findings looks exactly as it did before (no extra headings). + * + * Run: node --test tests/report-layers.test.js + */ + +import { describe, it, beforeEach } from 'node:test'; +import { strict as assert } from 'node:assert'; +import { execFileSync } from 'node:child_process'; +import { readFileSync, writeFileSync, mkdtempSync, mkdirSync, rmSync, readdirSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const ROOT = join(dirname(fileURLToPath(import.meta.url)), '..'); +const TOOLS = join(ROOT, 'bin', 'pixelslop-tools.cjs'); + +function generate(dir, findings) { + mkdirSync(join(dir, '.pixelslop'), { recursive: true }); + const scan = { + title: 'T', url: 'http://x', timestamp: '2026-06-10T00:00:00Z', + scores: { hierarchy: { score: 3 }, typography: { score: 2 }, color: { score: 2 }, responsiveness: { score: 3 }, accessibility: { score: 2 } }, + findings + }; + const scanPath = join(dir, '.pixelslop', 'scan-results.json'); + writeFileSync(scanPath, JSON.stringify(scan), 'utf-8'); + execFileSync('node', [TOOLS, 'report', 'generate', '--scan-results', scanPath, '--root', dir, '--raw'], { encoding: 'utf-8' }); + const reportsDir = join(dir, '.pixelslop', 'reports'); + const file = readdirSync(reportsDir).find((f) => f.endsWith('.html')); + return readFileSync(join(reportsDir, file), 'utf-8'); +} + +describe('report layers (measured vs judgment)', () => { + let dir; + beforeEach(() => { dir = mkdtempSync(join(tmpdir(), 'pxs-layers-')); }); + const cleanup = () => { try { rmSync(dir, { recursive: true, force: true }); } catch {} }; + + it('renders no layer headings when every finding is 
measured (unchanged look)', () => { + const html = generate(dir, [ + { priority: 'P1', description: 'Contrast weak', kind: 'measured' }, + { priority: 'P2', description: 'No focus ring' } // kind omitted -> measured + ]); + assert.ok(html.includes('Contrast weak') && html.includes('No focus ring'), 'measured findings render'); + assert.ok(!html.includes('Design judgment'), 'no judgment section when there are no judgment findings'); + cleanup(); + }); + + it('separates measured and judgment findings into labeled layers', () => { + const html = generate(dir, [ + { priority: 'P1', description: 'Contrast weak', kind: 'measured' }, + { priority: 'P2', description: 'Hero feels generic', kind: 'judgment', confidence: 'medium' } + ]); + assert.ok(html.includes('Measured'), 'measured layer heading present'); + assert.ok(html.includes('Design judgment'), 'judgment layer heading present'); + assert.ok(/design director.{0,8}s read, not measured/.test(html), 'judgment labeled as opinion (apostrophe may be HTML-escaped)'); + assert.ok(html.includes('Contrast weak') && html.includes('Hero feels generic'), 'both findings render'); + cleanup(); + }); + + it('surfaces a judgment finding confidence inline', () => { + const html = generate(dir, [ + { priority: 'P2', description: 'Composition is safe', kind: 'judgment', confidence: 'low' } + ]); + assert.ok(html.includes('Composition is safe'), 'judgment finding renders'); + assert.ok(html.includes('(low)'), 'confidence shown inline'); + cleanup(); + }); + + it('treats a string finding as measured', () => { + const html = generate(dir, ['Plain string finding']); + assert.ok(html.includes('Plain string finding')); + assert.ok(!html.includes('Design judgment')); + cleanup(); + }); +}); diff --git a/tests/skill-discoverability.test.js b/tests/skill-discoverability.test.js new file mode 100644 index 0000000..878f5ba --- /dev/null +++ b/tests/skill-discoverability.test.js @@ -0,0 +1,109 @@ +/** + * Skill Discoverability Tests + * + * 
SKILL.md is what an AI agent reads when it invokes /pixelslop — it's the only + * place the agent (and through it, the user) learns what Pixelslop can do. The + * failure mode is drift: we add a flag or a command in the code, and forget to + * advertise it in SKILL.md, so nobody ever uses it. + * + * These tests are the guard. The setting keys are extracted live from + * pixelslop-tools.cjs, so adding a setting and forgetting to document it fails + * the build. The flag/command/capability lists are curated — when you add one, + * add it here and to SKILL.md together. That coupling is the point. + * + * Run: node --test tests/skill-discoverability.test.js + */ + +import { describe, it } from 'node:test'; +import { strict as assert } from 'node:assert'; +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const ROOT = join(dirname(fileURLToPath(import.meta.url)), '..'); +const SKILL = readFileSync(join(ROOT, 'dist', 'skill', 'SKILL.md'), 'utf-8'); +const TOOLS = readFileSync(join(ROOT, 'bin', 'pixelslop-tools.cjs'), 'utf-8'); + +// Pull the real setting keys straight from SETTING_DEFS so the test tracks code. +function settingKeys() { + const block = TOOLS.match(/const SETTING_DEFS = \{([\s\S]*?)\};/); + assert.ok(block, 'SETTING_DEFS block must exist in pixelslop-tools.cjs'); + return [...block[1].matchAll(/^\s*([a-z]+):\s*\{/gm)].map((m) => m[1]); +} + +describe('SKILL.md advertises every setting', () => { + for (const key of settingKeys()) { + it(`mentions the "${key}" setting`, () => { + assert.ok(SKILL.includes(key), + `SKILL.md never mentions the "${key}" setting — an agent won't know it exists. Add it to the Capabilities section and args.`); + }); + } +}); + +describe('SKILL.md advertises every run flag', () => { + // Curated: when you add a flag, add it here and to SKILL.md together. 
+ const flags = ['--fast', '--thorough', '--deep', '--personas', '--code-check', '--quick', '--headed', '--settings', '--debug']; + for (const flag of flags) { + it(`mentions ${flag}`, () => { + assert.ok(SKILL.includes(flag), `SKILL.md never mentions ${flag}`); + }); + } +}); + +describe('SKILL.md advertises every major capability', () => { + const capabilities = { + 'design-director / judgment layer': /design.director|design judgment|judgment finding/i, + 'project-specific personas': /project-specific persona|personas write|generated from your audience/i, + 'score trends': /scan trend|score trend/i, + 'design tokens': /read-tokens|design tokens/i, + 'fix loop': /fix loop|checkpoint/i, + 'code-check mode': /code-check/i, + }; + for (const [name, re] of Object.entries(capabilities)) { + it(`mentions ${name}`, () => { + assert.ok(re.test(SKILL), `SKILL.md never mentions ${name} — it's invisible to agents and users.`); + }); + } +}); + +describe('the frontmatter description sells the breadth', () => { + const fm = SKILL.slice(0, SKILL.indexOf('user-invokable')); + it('the trigger description mentions personas, judgment, trends, tokens, and fast', () => { + for (const word of ['persona', 'judgment', 'trend', 'token', 'fast']) { + assert.ok(new RegExp(word, 'i').test(fm), + `frontmatter description omits "${word}" — that surface is what agents see in the skill list before invoking.`); + } + }); +}); + +describe('a capabilities overview section exists', () => { + it('SKILL.md has a Capabilities & Options menu', () => { + assert.ok(/## Capabilities & Options/i.test(SKILL), + 'SKILL.md must have a single canonical Capabilities & Options section'); + }); +}); + +describe('the asking protocol is harness-neutral (works under Codex too)', () => { + it('has an "Asking the user" protocol', () => { + assert.ok(/## Asking the user/i.test(SKILL), + 'SKILL.md must define how to ask the user across harnesses — AskUserQuestion is Claude Code only'); + }); + it('tells non-Claude 
harnesses what to do instead of AskUserQuestion', () => { + assert.ok(/AskUserQuestion/i.test(SKILL), 'still describes the Claude Code mechanism'); + assert.ok(/Codex/i.test(SKILL), 'names Codex specifically'); + assert.ok(/numbered (list|menu)/i.test(SKILL), 'gives the Codex/plain-text equivalent (a numbered menu)'); + assert.ok(/wait/i.test(SKILL), 'tells the agent to stop and wait for the reply'); + }); +}); + +describe('the skill drives advisory behaviour, not a config form', () => { + it('has an advise-the-user playbook', () => { + assert.ok(/## Advise/i.test(SKILL), + 'SKILL.md must have an advisory section so any harness leads with a recommendation, not a settings form'); + }); + it('tells the agent to recommend by intent and not open with raw settings questions', () => { + assert.ok(/lead with a recommendation/i.test(SKILL), 'must instruct leading with a recommendation'); + assert.ok(/intent/i.test(SKILL), 'must map user intent to a run'); + assert.ok(/advisor, not a config form|advise, don.t interrogate/i.test(SKILL), 'must frame the agent as an advisor'); + }); +}); diff --git a/tests/tools.test.js b/tests/tools.test.js index 000d4f0..ef5b2f7 100644 --- a/tests/tools.test.js +++ b/tests/tools.test.js @@ -1497,8 +1497,8 @@ describe('config settings', () => { const result = runJson(`config get --root "${dir}"`, dir); assert.ok(result.settings, 'should return settings object'); assert.equal(result.settings.headed, false, 'headed default is false'); - assert.equal(result.settings.deep, false, 'deep default is false'); - assert.equal(result.settings.thorough, false, 'thorough default is false'); + assert.equal(result.settings.deep, true, 'deep default is true (exhaustive by default)'); + assert.equal(result.settings.thorough, true, 'thorough default is true (exhaustive by default)'); assert.equal(result.settings.personas, 'all', 'personas default is all'); assert.deepEqual(result.defined, [], 'no keys explicitly defined'); }); @@ -1528,7 +1528,7 @@ 
describe('config settings', () => { runJson(`config set headed false --root "${dir}"`, dir); const result = runJson(`config get thorough --root "${dir}"`, dir); assert.equal(result.key, 'thorough'); - assert.equal(result.value, false); + assert.equal(result.value, true); assert.equal(result.source, 'default'); }); @@ -1537,8 +1537,8 @@ describe('config settings', () => { const result = runJson(`config get --root "${dir}"`, dir); assert.ok(result.settings, 'should have settings object'); assert.equal(result.settings.headed, true); - assert.equal(result.settings.deep, false, 'unset deep should default to false'); - assert.equal(result.settings.thorough, false, 'unset thorough should default to false'); + assert.equal(result.settings.deep, true, 'unset deep should default to true'); + assert.equal(result.settings.thorough, true, 'unset thorough should default to true'); assert.equal(result.settings.personas, 'all', 'unset personas should default to all'); assert.deepEqual(result.defined, ['headed'], 'only headed was explicitly set'); });