diff --git a/package-lock.json b/package-lock.json index 67a4112..dd0a472 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,7 +12,8 @@ "react": "^19.2.4", "react-chartjs-2": "^5.3.1", "react-dom": "^19.2.4", - "react-force-graph-2d": "^1.29.1" + "react-force-graph-2d": "^1.29.1", + "zod": "^4.3.6" }, "bin": { "crune": "bin/crune.js" @@ -4300,7 +4301,6 @@ "version": "4.3.6", "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", - "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/package.json b/package.json index 82791af..2a066b6 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,8 @@ "react": "^19.2.4", "react-chartjs-2": "^5.3.1", "react-dom": "^19.2.4", - "react-force-graph-2d": "^1.29.1" + "react-force-graph-2d": "^1.29.1", + "zod": "^4.3.6" }, "devDependencies": { "@eslint/js": "^9.39.4", diff --git a/scripts/__tests__/cli.test.ts b/scripts/__tests__/cli.test.ts index f2fa2aa..26548c8 100644 --- a/scripts/__tests__/cli.test.ts +++ b/scripts/__tests__/cli.test.ts @@ -10,6 +10,8 @@ describe("parseCliArgs", () => { expect(result.model).toBeUndefined(); expect(result.skipSynthesis).toBe(false); expect(result.dryRun).toBe(false); + expect(result.skipEval).toBe(false); + expect(result.evalModel).toBeUndefined(); }); it("sets sessionsDir with --sessions-dir", () => { @@ -47,6 +49,16 @@ describe("parseCliArgs", () => { expect(result.dryRun).toBe(true); }); + it("sets skipEval with --skip-eval", () => { + const result = parseCliArgs(["node", "cli.ts", "--skip-eval"]); + expect(result.skipEval).toBe(true); + }); + + it("sets evalModel with --eval-model", () => { + const result = parseCliArgs(["node", "cli.ts", "--eval-model", "haiku"]); + expect(result.evalModel).toBe("haiku"); + }); + it("handles multiple flags combined", () => { const result = parseCliArgs([ "node", diff --git a/scripts/__tests__/skill-evaluator.test.ts b/scripts/__tests__/skill-evaluator.test.ts new file mode 100644 index 0000000..f913595 --- /dev/null +++ b/scripts/__tests__/skill-evaluator.test.ts @@ -0,0 +1,290 @@ +import { describe, it, expect } from "vitest"; +import { + validateStructure, + extractFrontmatter, + extractFirstJsonObject, + buildRubricPrompt, + evaluateSkill, + smokeFireTest, +} from "../skill-evaluator.js"; + +const goodSkill = `--- +name: refactor-tests +description: Use when the user wants to refactor a Vitest suite to use shared fixtures. Triggers on phrases like "refactor tests" or "extract fixture". +allowed-tools: [Read, Edit, Bash] +--- + +## Overview +Refactors Vitest tests to share fixtures. + +## When to Use +When tests duplicate setup logic. + +## Workflow +1. Read failing test files. +2. Extract fixtures. +3. Run vitest. +`; + +describe("validateStructure", () => { + it("accepts well-formed frontmatter", () => { + const result = validateStructure(goodSkill); + expect(result.valid).toBe(true); + expect(result.issues).toEqual([]); + expect(result.parsed?.name).toBe("refactor-tests"); + expect(result.parsed?.allowedTools).toEqual(["Read", "Edit", "Bash"]); + }); + + it("flags missing name", () => { + const md = `--- +description: Long enough description for the validator triggers and hints. +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "name")).toBe(true); + }); + + it("flags missing description", () => { + const md = `--- +name: my-skill +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "description")).toBe(true); + }); + + it("flags oversize description (>500 chars)", () => { + const longDesc = "x".repeat(501); + const md = `--- +name: my-skill +description: ${longDesc} +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "description" && /500/.test(i.message))).toBe( + true + ); + }); + + it("flags too-short description (<20 chars)", () => { + const md = `--- +name: my-skill +description: short +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "description")).toBe(true); + }); + + it("flags forbidden characters in name", () => { + const md = `--- +name: My_Skill! +description: A reasonable description with enough detail to pass minimum length check. +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "name")).toBe(true); + }); + + it("flags malformed YAML — missing closing ---", () => { + const md = `--- +name: my-skill +description: A reasonable description with enough detail to pass minimum length check. + +body without closing fence +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues[0].field).toBe("frontmatter"); + expect(result.issues[0].message).toMatch(/closing/); + }); + + it("flags malformed YAML — no frontmatter at all", () => { + const md = `# Just a heading\n\nNo frontmatter here.`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues[0].field).toBe("frontmatter"); + }); + + it("flags malformed YAML — bad scalar line", () => { + const md = `--- +name: ok-name +this line has no colon +description: A reasonable description with enough detail to pass minimum length check. +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues[0].field).toBe("frontmatter"); + }); + + it("flags empty body after frontmatter", () => { + const md = `--- +name: my-skill +description: A reasonable description with enough detail to pass minimum length check. +--- + +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "body")).toBe(true); + }); + + it("accepts body when file ends without trailing newline after closing fence", () => { + // No trailing newline after the body and no blank line between fence + // and body — previously misreported as "empty body". + const md = + "---\nname: my-skill\ndescription: A reasonable description with enough detail to pass minimum length check.\n---\nbody content"; + const result = validateStructure(md); + expect(result.valid).toBe(true); + expect(result.issues).toEqual([]); + }); + + it("accepts indented list form for allowed-tools", () => { + const md = `--- +name: my-skill +description: A reasonable description with enough detail to pass minimum length check. +allowed-tools: + - Read + - Edit +--- + +body content +`; + const result = validateStructure(md); + expect(result.valid).toBe(true); + expect(result.parsed?.allowedTools).toEqual(["Read", "Edit"]); + }); + + it("accepts requires/next workflow-continuation arrays", () => { + const md = `--- +name: deploy-app +description: Use when the user wants to deploy the staging environment after running tests. +requires: [setup-env] +next: [smoke-test] +--- + +deploy steps +`; + const result = validateStructure(md); + expect(result.valid).toBe(true); + expect(result.parsed?.requires).toEqual(["setup-env"]); + expect(result.parsed?.next).toEqual(["smoke-test"]); + }); +}); + +describe("extractFrontmatter", () => { + it("parses simple frontmatter", () => { + const { data } = extractFrontmatter(goodSkill); + expect(data.name).toBe("refactor-tests"); + expect(typeof data.description).toBe("string"); + }); + + it("strips quotes around values", () => { + const md = `--- +name: "quoted-name" +description: 'single quoted long enough description for the validator' +--- + +body +`; + const { data } = extractFrontmatter(md); + expect(data.name).toBe("quoted-name"); + expect(data.description).toBe("single quoted long enough description for the validator"); + }); +}); + +describe("extractFirstJsonObject", () => { + it("returns the first balanced JSON object", () => { + const s = 'preamble {"score": 80, "nested": {"a": 1}} trailing'; + expect(extractFirstJsonObject(s)).toBe('{"score": 80, "nested": {"a": 1}}'); + }); + + it("handles strings containing braces", () => { + const s = '{"hint": "look at {curly} braces", "score": 50}'; + expect(extractFirstJsonObject(s)).toBe(s); + }); + + it("returns null when no object present", () => { + expect(extractFirstJsonObject("just text")).toBeNull(); + }); +}); + +describe("buildRubricPrompt", () => { + it("includes the skill markdown and rubric instructions", () => { + const prompt = buildRubricPrompt(goodSkill); + expect(prompt).toContain("STRICT JSON"); + expect(prompt).toContain("nameQuality"); + expect(prompt).toContain("descriptionTriggering"); + expect(prompt).toContain("instructionsConcrete"); + expect(prompt).toContain("noPreambleNoise"); + expect(prompt).toContain("refactor-tests"); + }); + + it("uses an enclosing fence longer than any backtick run inside the markdown", () => { + // Skill body containing triple-backtick code blocks must not break the + // outer fence — pick a longer fence (>=4 backticks). + const skillWithFences = `--- +name: my-skill +description: A reasonable description with enough detail to pass the minimum length check. +--- + +## Example + +\`\`\`bash +echo "hello" +\`\`\` +`; + const prompt = buildRubricPrompt(skillWithFences); + // The outer fence must be at least 4 backticks long to wrap the inner ```. + expect(prompt).toMatch(/\n````+\n[\s\S]*?\n````+\n?/); + // The inner ``` must still be present verbatim. + expect(prompt).toContain("```bash"); + }); +}); + +describe("smokeFireTest", () => { + it("returns skipped: true with a follow-up message (stub)", async () => { + const result = await smokeFireTest(goodSkill); + expect(result.skipped).toBe(true); + expect(typeof result.message).toBe("string"); + }); +}); + +describe("evaluateSkill (orchestrator)", () => { + it("scores 0 when structural validation fails and does not call the LLM", async () => { + const broken = `# no frontmatter at all`; + const result = await evaluateSkill(broken, { skipRubric: true }); + expect(result.structural.valid).toBe(false); + expect(result.overallScore).toBe(0); + // smoke firing is always present (stub) + expect(result.smokeFiring.skipped).toBe(true); + // rubric is marked skipped, never invoked + expect(result.rubric?.skipped).toBe(true); + expect(result.rubric?.ok).toBe(false); + }); + + it("scores 50 when structural passes but rubric is skipped", async () => { + const result = await evaluateSkill(goodSkill, { skipRubric: true }); + expect(result.structural.valid).toBe(true); + expect(result.overallScore).toBe(50); + expect(result.rubric?.skipped).toBe(true); + }); +}); diff --git a/scripts/cli.ts b/scripts/cli.ts index 1137f00..1daee41 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -23,6 +23,7 @@ import { stripSynthesisPreamble, type TopicNode as SynthTopicNode, } from "./skill-synthesizer.js"; +import { evaluateSkill } from "./skill-evaluator.js"; // ─── CLI argument parsing ───────────────────────────────────────── @@ -33,6 +34,8 @@ interface CliArgs { model?: string; skipSynthesis: boolean; dryRun: boolean; + skipEval: boolean; + evalModel?: string; } export function parseCliArgs(argv: string[]): CliArgs { @@ -43,6 +46,8 @@ export function parseCliArgs(argv: string[]): CliArgs { let model: string | undefined; let skipSynthesis = false; let dryRun = false; + let skipEval = false; + let evalModel: string | undefined; for (let i = 0; i < args.length; i++) { if (args[i] === "--sessions-dir" && args[i + 1]) { @@ -58,13 +63,17 @@ export function parseCliArgs(argv: string[]): CliArgs { skipSynthesis = true; } else if (args[i] === "--dry-run") { dryRun = true; + } else if (args[i] === "--skip-eval") { + skipEval = true; + } else if (args[i] === "--eval-model" && args[i + 1]) { + evalModel = args[++i]; } else if (args[i] === "--help" || args[i] === "-h") { printUsage(); process.exit(0); } } - return { sessionsDir, outputDir, count, model, skipSynthesis, dryRun }; + return { sessionsDir, outputDir, count, model, skipSynthesis, dryRun, skipEval, evalModel }; } function printUsage(): void { @@ -79,6 +88,8 @@ Options: --model Claude model for synthesis (e.g., haiku, sonnet) --skip-synthesis Skip LLM synthesis, output heuristic skills only --dry-run Show candidates without writing files + --skip-eval Skip skill evaluation pipeline (structural + rubric) + --eval-model Claude model for rubric scoring (defaults to --model) -h, --help Show this help message`); } @@ -196,6 +207,7 @@ async function main(): Promise { console.error(` -> ${label}`); let markdown = candidate.skillMarkdown; + let synthesisHappened = false; if (!config.skipSynthesis && topic) { // Find enriched sequences related to this topic's sessions @@ -216,6 +228,7 @@ async function main(): Promise { if (result.success) { markdown = stripSynthesisPreamble(result.stdout); + synthesisHappened = true; console.error(` Synthesized`); } else { console.error( @@ -226,6 +239,48 @@ async function main(): Promise { console.error(` Heuristic only`); } + // Evaluate skill (structural always; rubric only if synthesis ran). + if (!config.skipEval) { + const evalOptions = { + model: config.evalModel ?? config.model, + // Skip the LLM rubric when synthesis was skipped/failed — heuristic + // markdown is not the target of the rubric. + skipRubric: !synthesisHappened, + }; + const evalResult = await evaluateSkill(markdown, evalOptions); + if (!evalResult.structural.valid) { + console.error( + ` Eval: structural FAIL (${evalResult.structural.issues.length} issue(s))` + ); + for (const issue of evalResult.structural.issues.slice(0, 5)) { + console.error(` - [${issue.field}] ${issue.message}`); + } + console.error( + " Hint: review SKILL.md frontmatter (name/description); file written anyway" + ); + } else if (evalResult.rubric?.ok && typeof evalResult.rubric.score === "number") { + const score = evalResult.rubric.score; + const overall = evalResult.overallScore ?? score; + console.error( + ` Eval: structural OK | rubric ${score}/100 | overall ${overall}/100` + ); + if (score < 60) { + console.error(" Hint: low rubric score — consider re-synthesizing"); + for (const h of (evalResult.rubric.hints ?? []).slice(0, 3)) { + console.error(` - ${h}`); + } + } + } else if (evalResult.rubric?.skipped) { + console.error(` Eval: structural OK | rubric skipped`); + } else if (evalResult.rubric && !evalResult.rubric.ok) { + console.error( + ` Eval: structural OK | rubric error: ${evalResult.rubric.error ?? "unknown"}` + ); + } else { + console.error(` Eval: structural OK`); + } + } + // Write skill file as //SKILL.md const skillName = extractSkillName(markdown, label); const skillDir = path.join(config.outputDir, skillName); diff --git a/scripts/skill-evaluator.ts b/scripts/skill-evaluator.ts new file mode 100644 index 0000000..046af88 --- /dev/null +++ b/scripts/skill-evaluator.ts @@ -0,0 +1,534 @@ +/** + * Skill evaluator: validates SKILL.md output from skill synthesis. + * + * Three layers (issue #20): + * 1. Deterministic structural validation (zod-backed YAML frontmatter checks). + * 2. LLM rubric scoring via `claude -p` (skill-creator-aligned). + * 3. Smoke firing test (stubbed for now — needs Claude Code itself to run). + * + * The evaluator surfaces hints; gating / regenerate-loop is intentionally left + * to a follow-up issue. Callers may still write the file when scores are low. + */ +import { spawn } from "node:child_process"; +import { z } from "zod"; + +// ---------- Types ---------- + +export interface StructuralIssue { + field: string; + message: string; +} + +export interface StructuralResult { + valid: boolean; + issues: StructuralIssue[]; + parsed?: { + name: string; + description: string; + allowedTools?: string[]; + requires?: string[]; + next?: string[]; + }; +} + +export interface RubricResult { + ok: boolean; + score?: number; // 0-100 + breakdown?: { + nameQuality: number; + descriptionTriggering: number; + instructionsConcrete: number; + noPreambleNoise: number; + }; + hints?: string[]; + rawResponse?: string; + error?: string; + skipped?: boolean; +} + +export interface SmokeFiringResult { + skipped: boolean; + message?: string; +} + +export interface EvaluationResult { + structural: StructuralResult; + rubric?: RubricResult; + smokeFiring: SmokeFiringResult; + overallScore?: number; // structural pass (0/50) + rubric (0..50) when available +} + +export interface EvaluatorOptions { + model?: string; // claude model for rubric scoring + skipRubric?: boolean; // local-only structural validation + timeoutMs?: number; // rubric LLM call timeout +} + +// ---------- Frontmatter parsing ---------- + +/** + * Minimal YAML frontmatter parser tailored to SKILL.md (no full YAML lib needed). + * Supports: + * - simple `key: value` scalars + * - inline arrays `key: [a, b, c]` + * - indented list items under a key: + * key: + * - a + * - b + * - quoted strings (single or double) + * Throws on malformed structure. + */ +export function extractFrontmatter(markdown: string): { + raw: string; + data: Record; +} { + // Strip optional UTF-8 BOM (U+FEFF) before parsing. + const trimmed = markdown.replace(/^\uFEFF/, ""); + if (!trimmed.startsWith("---")) { + throw new Error("missing frontmatter: file must start with '---'"); + } + const rest = trimmed.slice(3); + const newlineAfterOpen = rest.indexOf("\n"); + if (newlineAfterOpen === -1) { + throw new Error("malformed frontmatter: no newline after opening '---'"); + } + const after = rest.slice(newlineAfterOpen + 1); + const closeIdx = after.search(/^---\s*$/m); + if (closeIdx === -1) { + throw new Error("malformed frontmatter: missing closing '---'"); + } + const raw = after.slice(0, closeIdx); + + const data: Record = {}; + const lines = raw.split("\n"); + let currentListKey: string | null = null; + let listAccumulator: string[] = []; + + const flushList = () => { + if (currentListKey !== null) { + data[currentListKey] = listAccumulator; + currentListKey = null; + listAccumulator = []; + } + }; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (line.trim() === "") continue; + + // Indented list item under previous key. + if (currentListKey !== null && /^\s+-\s+/.test(line)) { + const value = line.replace(/^\s+-\s+/, "").trim(); + listAccumulator.push(unquote(value)); + continue; + } + + // New key — flush any pending list. + flushList(); + + const m = line.match(/^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*)$/); + if (!m) { + throw new Error(`malformed frontmatter line: ${JSON.stringify(line)}`); + } + const [, key, rawValue] = m; + const value = rawValue.trim(); + + if (value === "") { + // Possibly a list opener. + currentListKey = key; + listAccumulator = []; + continue; + } + + if (value.startsWith("[") && value.endsWith("]")) { + const inner = value.slice(1, -1).trim(); + data[key] = inner === "" + ? [] + : inner.split(",").map((s) => unquote(s.trim())); + continue; + } + + data[key] = unquote(value); + } + flushList(); + + return { raw, data }; +} + +function unquote(s: string): string { + if (s.length >= 2) { + if ((s.startsWith("\"") && s.endsWith("\"")) || (s.startsWith("'") && s.endsWith("'"))) { + return s.slice(1, -1); + } + } + return s; +} + +// ---------- Structural schema ---------- + +// Skill-creator conventions: kebab-case-ish name, short, descriptive. +const NAME_RE = /^[a-z0-9][a-z0-9-]{0,62}[a-z0-9]$|^[a-z0-9]$/; +const FORBIDDEN_NAME_CHARS_RE = /[^a-z0-9-]/; + +const FrontmatterSchema = z.object({ + name: z + .string({ message: "name is required" }) + .min(2, { message: "name must be at least 2 characters" }) + .max(64, { message: "name must be at most 64 characters" }) + .refine((v) => !FORBIDDEN_NAME_CHARS_RE.test(v), { + message: "name must use lowercase letters, digits, and hyphens only", + }) + .refine((v) => NAME_RE.test(v), { + message: "name must start and end with [a-z0-9] and use kebab-case", + }), + description: z + .string({ message: "description is required" }) + .min(20, { message: "description should be at least 20 characters (encode trigger conditions)" }) + .max(500, { message: "description must be at most 500 characters" }), + allowedTools: z.array(z.string().min(1)).optional(), + requires: z.array(z.string().min(1)).optional(), + next: z.array(z.string().min(1)).optional(), +}).passthrough(); + +// ---------- Structural validation ---------- + +export function validateStructure(markdown: string): StructuralResult { + const issues: StructuralIssue[] = []; + + let parsed: { raw: string; data: Record }; + try { + parsed = extractFrontmatter(markdown); + } catch (err) { + return { + valid: false, + issues: [ + { + field: "frontmatter", + message: err instanceof Error ? err.message : String(err), + }, + ], + }; + } + + // Normalize allowed-tools key variants (e.g. `allowed-tools`, `allowed_tools`). + const data = { ...parsed.data }; + for (const altKey of ["allowed-tools", "allowed_tools"]) { + if (altKey in data && !("allowedTools" in data)) { + data.allowedTools = data[altKey]; + } + } + + const result = FrontmatterSchema.safeParse(data); + if (!result.success) { + for (const issue of result.error.issues) { + issues.push({ + field: issue.path.join(".") || "frontmatter", + message: issue.message, + }); + } + return { valid: false, issues }; + } + + const fm = result.data; + + // Body sanity: must contain something past the closing '---'. + // Tolerate optional UTF-8 BOM and files that end immediately after the + // closing fence (no trailing newline) — the previous regex required a + // newline after the closing fence and falsely reported "empty body" when + // a real body was present without a trailing blank line. + const stripped = markdown.replace(/^\uFEFF/, ""); + const closeMatch = stripped.match( + /^---\s*\n[\s\S]*?\n---\s*(?:\n([\s\S]*))?$/ + ); + const body = closeMatch?.[1] ?? ""; + if (body.trim().length === 0) { + issues.push({ + field: "body", + message: "skill body is empty after frontmatter", + }); + } + + return { + valid: issues.length === 0, + issues, + parsed: { + name: fm.name, + description: fm.description, + allowedTools: fm.allowedTools, + requires: fm.requires, + next: fm.next, + }, + }; +} + +// ---------- LLM rubric ---------- + +const RUBRIC_PROMPT_HEADER = `You are an evaluator for Claude Code "Skill" definitions (SKILL.md files). +Your task is to score a single SKILL.md against the skill-creator authoring rubric. + +Output requirements: +- Return STRICT JSON only. No markdown fences. No prose before or after. +- The JSON must match this shape exactly: + { + "score": , + "breakdown": { + "nameQuality": , + "descriptionTriggering": , + "instructionsConcrete": , + "noPreambleNoise": + }, + "hints": [] + } + +Scoring rubric (skill-creator alignment): +- nameQuality (25): kebab-case, descriptive of the workflow, not generic ("helper", "utils"). +- descriptionTriggering (25): description encodes WHEN to use the skill, with concrete trigger phrases. Penalize vague descriptions. +- instructionsConcrete (25): body contains step-by-step actionable instructions, references specific tools, and has examples. +- noPreambleNoise (25): no LLM preamble outside the SKILL.md content (e.g. "Here is your skill:"); content begins with frontmatter. + +If the SKILL.md is fundamentally broken (not a SKILL.md), return score 0 with a hint explaining why.`; + +export function buildRubricPrompt(markdown: string): string { + // SKILL.md content commonly contains triple-backtick code fences in its + // body. Wrap the embedded markdown with a longer fence so nested triple + // backticks do not terminate the outer block (CommonMark allows the inner + // fences as long as their length is shorter than the outer). + const fence = pickEnclosingFence(markdown); + return [ + RUBRIC_PROMPT_HEADER, + "", + "## SKILL.md to evaluate", + fence, + markdown, + fence, + ].join("\n"); +} + +function pickEnclosingFence(markdown: string): string { + // Find the longest run of backticks at line start in the input, then use + // a fence that is one backtick longer (minimum 4). + let longest = 0; + for (const line of markdown.split("\n")) { + const m = line.match(/^(`+)/); + if (m && m[1].length > longest) longest = m[1].length; + } + return "`".repeat(Math.max(4, longest + 1)); +} + +const RubricResponseSchema = z.object({ + score: z.number().min(0).max(100), + breakdown: z.object({ + nameQuality: z.number().min(0).max(25), + descriptionTriggering: z.number().min(0).max(25), + instructionsConcrete: z.number().min(0).max(25), + noPreambleNoise: z.number().min(0).max(25), + }), + hints: z.array(z.string()).default([]), +}); + +/** + * Extract the first balanced JSON object from a possibly-noisy string. + * Handles cases where the model wraps the JSON in stray text. + */ +export function extractFirstJsonObject(s: string): string | null { + const start = s.indexOf("{"); + if (start === -1) return null; + let depth = 0; + let inString = false; + let escape = false; + for (let i = start; i < s.length; i++) { + const ch = s[i]; + if (escape) { + escape = false; + continue; + } + if (ch === "\\") { + escape = true; + continue; + } + if (ch === "\"") { + inString = !inString; + continue; + } + if (inString) continue; + if (ch === "{") depth++; + else if (ch === "}") { + depth--; + if (depth === 0) { + return s.slice(start, i + 1); + } + } + } + return null; +} + +/** + * Spawn `claude -p` to score a SKILL.md against the rubric. + * Mirrors `synthesizeWithClaude` style for consistency. + */ +export function scoreWithRubric( + markdown: string, + options: EvaluatorOptions = {} +): Promise { + const timeoutMs = options.timeoutMs ?? 180_000; + const prompt = buildRubricPrompt(markdown); + + return new Promise((resolve) => { + const args = [ + "-p", + "--output-format", + "text", + "--permission-mode", + "acceptEdits", + "--no-session-persistence", + ]; + if (options.model) { + args.push("--model", options.model); + } + + const child = spawn("claude", args, { stdio: ["pipe", "pipe", "pipe"] }); + + let spawnFailed = false; + + child.on("error", (err: NodeJS.ErrnoException) => { + // Capture any spawn-time failure (ENOENT, EACCES, EPERM, ...). The + // 'close' event still fires after 'error' with code = null, so we set + // a flag and let the close handler short-circuit with a meaningful + // message instead of "claude exited with code null". + if (spawnFailed) return; + spawnFailed = true; + const message = + err.code === "ENOENT" + ? "claude CLI not found. Install Claude Code first." + : `failed to spawn claude: ${err.message}`; + resolve({ ok: false, error: message }); + }); + + // Without an 'error' listener on stdin, a spawn failure causes Node to + // crash the host process when we call stdin.write below. Swallow it — + // the 'error' handler on the child already produced a structured result. + child.stdin.on("error", () => {}); + + const stdoutChunks: Buffer[] = []; + const stderrChunks: Buffer[] = []; + child.stdout.on("data", (c: Buffer) => stdoutChunks.push(c)); + child.stderr.on("data", (c: Buffer) => stderrChunks.push(c)); + + const timeout = setTimeout(() => { + child.kill("SIGKILL"); + resolve({ + ok: false, + error: `Rubric scoring timed out (${timeoutMs / 1000}s)`, + }); + }, timeoutMs); + + child.on("close", (code) => { + clearTimeout(timeout); + if (spawnFailed) return; + const stdout = Buffer.concat(stdoutChunks).toString("utf-8"); + const stderr = Buffer.concat(stderrChunks).toString("utf-8"); + if (code !== 0) { + resolve({ + ok: false, + rawResponse: stdout, + error: `claude exited with code ${code}: ${stderr}`, + }); + return; + } + const json = extractFirstJsonObject(stdout); + if (!json) { + resolve({ + ok: false, + rawResponse: stdout, + error: "rubric response did not contain a JSON object", + }); + return; + } + let parsedJson: unknown; + try { + parsedJson = JSON.parse(json); + } catch (err) { + resolve({ + ok: false, + rawResponse: stdout, + error: `failed to parse rubric JSON: ${err instanceof Error ? err.message : String(err)}`, + }); + return; + } + const validated = RubricResponseSchema.safeParse(parsedJson); + if (!validated.success) { + resolve({ + ok: false, + rawResponse: stdout, + error: `rubric JSON shape mismatch: ${validated.error.issues.map((i) => i.message).join("; ")}`, + }); + return; + } + resolve({ + ok: true, + score: validated.data.score, + breakdown: validated.data.breakdown, + hints: validated.data.hints, + rawResponse: stdout, + }); + }); + + // Best-effort write. If spawn failed, stdin may already be destroyed — + // the registered 'error' listeners catch the resulting EPIPE/ECONNRESET. + try { + child.stdin.write(prompt); + child.stdin.end(); + } catch { + // Already surfaced via the child 'error' handler above. + } + }); +} + +// ---------- Smoke firing test (stub) ---------- + +/** + * Smoke firing test: feed fixture prompts that should trigger the skill and + * verify Claude Code activates it. Real implementation requires Claude Code + * itself as runner, so this is intentionally a stub for issue #20. Follow-up + * work will wire fixtures + activation tracing. + */ +export function smokeFireTest(markdown: string): Promise { + // markdown is accepted for API stability — real implementation will feed + // fixture prompts and inspect activation traces against this content. + void markdown; + return Promise.resolve({ + skipped: true, + message: "smoke firing test is not yet implemented (issue #20 follow-up)", + }); +} + +// ---------- Orchestrator ---------- + +export async function evaluateSkill( + markdown: string, + options: EvaluatorOptions = {} +): Promise { + const structural = validateStructure(markdown); + + let rubric: RubricResult | undefined; + if (!options.skipRubric) { + rubric = await scoreWithRubric(markdown, options); + } else { + rubric = { ok: false, skipped: true }; + } + + const smokeFiring = await smokeFireTest(markdown); + + // Overall score: 50 for structural pass + scaled rubric (0-50). + let overallScore: number | undefined; + if (structural.valid) { + overallScore = 50; + if (rubric?.ok && typeof rubric.score === "number") { + overallScore += Math.round(rubric.score / 2); + } + } else { + overallScore = 0; + } + + return { structural, rubric, smokeFiring, overallScore }; +} diff --git a/src/types/session.ts b/src/types/session.ts index 03f79c7..8aca3b0 100644 --- a/src/types/session.ts +++ b/src/types/session.ts @@ -224,6 +224,46 @@ export interface SkillCandidate { skillMarkdown: string synthesizedMarkdown?: string hookJson?: string + evaluation?: SkillEvaluation +} + +// === Skill Evaluation (issue #20) === +export interface SkillEvaluationStructuralIssue { + field: string + message: string +} + +export interface SkillEvaluationStructural { + valid: boolean + issues: SkillEvaluationStructuralIssue[] +} + +export interface SkillEvaluationRubricBreakdown { + nameQuality: number + descriptionTriggering: number + instructionsConcrete: number + noPreambleNoise: number +} + +export interface SkillEvaluationRubric { + ok: boolean + score?: number + breakdown?: SkillEvaluationRubricBreakdown + hints?: string[] + error?: string + skipped?: boolean +} + +export interface SkillEvaluationSmokeFiring { + skipped: boolean + message?: string +} + +export interface SkillEvaluation { + structural: SkillEvaluationStructural + rubric?: SkillEvaluationRubric + smokeFiring?: SkillEvaluationSmokeFiring + overallScore?: number } // === Tacit Knowledge ===