From 7312c2e993850d44b676c8878b55255f739a46a5 Mon Sep 17 00:00:00 2001 From: Kazuki Chigita Date: Sun, 26 Apr 2026 02:05:35 +0900 Subject: [PATCH 1/5] Add skill evaluation pipeline with structural and rubric checks - New `scripts/skill-evaluator.ts` exposing `validateStructure`, `scoreWithRubric`, `smokeFireTest` (stub), and `evaluateSkill`. Structural validation uses zod over a minimal frontmatter parser (name, description, allowed-tools/requires/next). Rubric scoring spawns `claude -p` with a strict-JSON prompt aligned to skill-creator conventions. Smoke firing test is intentionally a stub for now. - Wire evaluator into `scripts/cli.ts` with `--skip-eval` and `--eval-model` flags. Per-candidate console output now reports structural pass/fail, rubric score, and improvement hints. Files are still written on failure so reviewers can inspect; gating is left to a follow-up issue. - Extend `SkillCandidate` in `src/types/session.ts` with optional `evaluation` field so persisted data still parses. - Unit tests cover frontmatter happy path, missing required fields, oversize/short description, malformed YAML, and JSON extraction helpers. Tests do not call `claude -p`. Refs #20 Co-Authored-By: Claude Opus 4.7 (1M context) --- package-lock.json | 4 +- package.json | 3 +- scripts/__tests__/cli.test.ts | 12 + scripts/__tests__/skill-evaluator.test.ts | 228 ++++++++++ scripts/cli.ts | 57 ++- scripts/skill-evaluator.ts | 496 ++++++++++++++++++++++ src/types/session.ts | 40 ++ 7 files changed, 836 insertions(+), 4 deletions(-) create mode 100644 scripts/__tests__/skill-evaluator.test.ts create mode 100644 scripts/skill-evaluator.ts diff --git a/package-lock.json b/package-lock.json index 67a4112..dd0a472 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,7 +12,8 @@ "react": "^19.2.4", "react-chartjs-2": "^5.3.1", "react-dom": "^19.2.4", - "react-force-graph-2d": "^1.29.1" + "react-force-graph-2d": "^1.29.1", + "zod": "^4.3.6" }, "bin": { "crune": "bin/crune.js" @@ -4300,7 +4301,6 @@ "version": "4.3.6", "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", - "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/package.json b/package.json index 82791af..2a066b6 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,8 @@ "react": "^19.2.4", "react-chartjs-2": "^5.3.1", "react-dom": "^19.2.4", - "react-force-graph-2d": "^1.29.1" + "react-force-graph-2d": "^1.29.1", + "zod": "^4.3.6" }, "devDependencies": { "@eslint/js": "^9.39.4", diff --git a/scripts/__tests__/cli.test.ts b/scripts/__tests__/cli.test.ts index f2fa2aa..26548c8 100644 --- a/scripts/__tests__/cli.test.ts +++ b/scripts/__tests__/cli.test.ts @@ -10,6 +10,8 @@ describe("parseCliArgs", () => { expect(result.model).toBeUndefined(); expect(result.skipSynthesis).toBe(false); expect(result.dryRun).toBe(false); + expect(result.skipEval).toBe(false); + expect(result.evalModel).toBeUndefined(); }); it("sets sessionsDir with --sessions-dir", () => { @@ -47,6 +49,16 @@ describe("parseCliArgs", () => { expect(result.dryRun).toBe(true); }); + it("sets skipEval with --skip-eval", () => { + const result = parseCliArgs(["node", "cli.ts", "--skip-eval"]); + expect(result.skipEval).toBe(true); + }); + + it("sets evalModel with --eval-model", () => { + const result = parseCliArgs(["node", "cli.ts", "--eval-model", "haiku"]); + expect(result.evalModel).toBe("haiku"); + }); + it("handles multiple flags combined", () => { const result = parseCliArgs([ "node", diff --git a/scripts/__tests__/skill-evaluator.test.ts b/scripts/__tests__/skill-evaluator.test.ts new file mode 100644 index 0000000..85b7e8f --- /dev/null +++ b/scripts/__tests__/skill-evaluator.test.ts @@ -0,0 +1,228 @@ +import { describe, it, expect } from "vitest"; +import { + validateStructure, + extractFrontmatter, + extractFirstJsonObject, + buildRubricPrompt, +} from "../skill-evaluator.js"; + +const goodSkill = `--- +name: refactor-tests +description: Use when the user wants to refactor a Vitest suite to use shared fixtures. Triggers on phrases like "refactor tests" or "extract fixture". +allowed-tools: [Read, Edit, Bash] +--- + +## Overview +Refactors Vitest tests to share fixtures. + +## When to Use +When tests duplicate setup logic. + +## Workflow +1. Read failing test files. +2. Extract fixtures. +3. Run vitest. +`; + +describe("validateStructure", () => { + it("accepts well-formed frontmatter", () => { + const result = validateStructure(goodSkill); + expect(result.valid).toBe(true); + expect(result.issues).toEqual([]); + expect(result.parsed?.name).toBe("refactor-tests"); + expect(result.parsed?.allowedTools).toEqual(["Read", "Edit", "Bash"]); + }); + + it("flags missing name", () => { + const md = `--- +description: Long enough description for the validator triggers and hints. +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "name")).toBe(true); + }); + + it("flags missing description", () => { + const md = `--- +name: my-skill +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "description")).toBe(true); + }); + + it("flags oversize description (>500 chars)", () => { + const longDesc = "x".repeat(501); + const md = `--- +name: my-skill +description: ${longDesc} +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "description" && /500/.test(i.message))).toBe( + true + ); + }); + + it("flags too-short description (<20 chars)", () => { + const md = `--- +name: my-skill +description: short +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "description")).toBe(true); + }); + + it("flags forbidden characters in name", () => { + const md = `--- +name: My_Skill! +description: A reasonable description with enough detail to pass minimum length check. +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "name")).toBe(true); + }); + + it("flags malformed YAML — missing closing ---", () => { + const md = `--- +name: my-skill +description: A reasonable description with enough detail to pass minimum length check. + +body without closing fence +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues[0].field).toBe("frontmatter"); + expect(result.issues[0].message).toMatch(/closing/); + }); + + it("flags malformed YAML — no frontmatter at all", () => { + const md = `# Just a heading\n\nNo frontmatter here.`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues[0].field).toBe("frontmatter"); + }); + + it("flags malformed YAML — bad scalar line", () => { + const md = `--- +name: ok-name +this line has no colon +description: A reasonable description with enough detail to pass minimum length check. +--- + +body +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues[0].field).toBe("frontmatter"); + }); + + it("flags empty body after frontmatter", () => { + const md = `--- +name: my-skill +description: A reasonable description with enough detail to pass minimum length check. +--- + +`; + const result = validateStructure(md); + expect(result.valid).toBe(false); + expect(result.issues.some((i) => i.field === "body")).toBe(true); + }); + + it("accepts indented list form for allowed-tools", () => { + const md = `--- +name: my-skill +description: A reasonable description with enough detail to pass minimum length check. +allowed-tools: + - Read + - Edit +--- + +body content +`; + const result = validateStructure(md); + expect(result.valid).toBe(true); + expect(result.parsed?.allowedTools).toEqual(["Read", "Edit"]); + }); + + it("accepts requires/next workflow-continuation arrays", () => { + const md = `--- +name: deploy-app +description: Use when the user wants to deploy the staging environment after running tests. +requires: [setup-env] +next: [smoke-test] +--- + +deploy steps +`; + const result = validateStructure(md); + expect(result.valid).toBe(true); + expect(result.parsed?.requires).toEqual(["setup-env"]); + expect(result.parsed?.next).toEqual(["smoke-test"]); + }); +}); + +describe("extractFrontmatter", () => { + it("parses simple frontmatter", () => { + const { data } = extractFrontmatter(goodSkill); + expect(data.name).toBe("refactor-tests"); + expect(typeof data.description).toBe("string"); + }); + + it("strips quotes around values", () => { + const md = `--- +name: "quoted-name" +description: 'single quoted long enough description for the validator' +--- + +body +`; + const { data } = extractFrontmatter(md); + expect(data.name).toBe("quoted-name"); + expect(data.description).toBe("single quoted long enough description for the validator"); + }); +}); + +describe("extractFirstJsonObject", () => { + it("returns the first balanced JSON object", () => { + const s = 'preamble {"score": 80, "nested": {"a": 1}} trailing'; + expect(extractFirstJsonObject(s)).toBe('{"score": 80, "nested": {"a": 1}}'); + }); + + it("handles strings containing braces", () => { + const s = '{"hint": "look at {curly} braces", "score": 50}'; + expect(extractFirstJsonObject(s)).toBe(s); + }); + + it("returns null when no object present", () => { + expect(extractFirstJsonObject("just text")).toBeNull(); + }); +}); + +describe("buildRubricPrompt", () => { + it("includes the skill markdown and rubric instructions", () => { + const prompt = buildRubricPrompt(goodSkill); + expect(prompt).toContain("STRICT JSON"); + expect(prompt).toContain("nameQuality"); + expect(prompt).toContain("descriptionTriggering"); + expect(prompt).toContain("instructionsConcrete"); + expect(prompt).toContain("noPreambleNoise"); + expect(prompt).toContain("refactor-tests"); + }); +}); diff --git a/scripts/cli.ts b/scripts/cli.ts index 1137f00..1daee41 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -23,6 +23,7 @@ import { stripSynthesisPreamble, type TopicNode as SynthTopicNode, } from "./skill-synthesizer.js"; +import { evaluateSkill } from "./skill-evaluator.js"; // ─── CLI argument parsing ───────────────────────────────────────── @@ -33,6 +34,8 @@ interface CliArgs { model?: string; skipSynthesis: boolean; dryRun: boolean; + skipEval: boolean; + evalModel?: string; } export function parseCliArgs(argv: string[]): CliArgs { @@ -43,6 +46,8 @@ export function parseCliArgs(argv: string[]): CliArgs { let model: string | undefined; let skipSynthesis = false; let dryRun = false; + let skipEval = false; + let evalModel: string | undefined; for (let i = 0; i < args.length; i++) { if (args[i] === "--sessions-dir" && args[i + 1]) { @@ -58,13 +63,17 @@ export function parseCliArgs(argv: string[]): CliArgs { skipSynthesis = true; } else if (args[i] === "--dry-run") { dryRun = true; + } else if (args[i] === "--skip-eval") { + skipEval = true; + } else if (args[i] === "--eval-model" && args[i + 1]) { + evalModel = args[++i]; } else if (args[i] === "--help" || args[i] === "-h") { printUsage(); process.exit(0); } } - return { sessionsDir, outputDir, count, model, skipSynthesis, dryRun }; + return { sessionsDir, outputDir, count, model, skipSynthesis, dryRun, skipEval, evalModel }; } function printUsage(): void { @@ -79,6 +88,8 @@ Options: --model Claude model for synthesis (e.g., haiku, sonnet) --skip-synthesis Skip LLM synthesis, output heuristic skills only --dry-run Show candidates without writing files + --skip-eval Skip skill evaluation pipeline (structural + rubric) + --eval-model Claude model for rubric scoring (defaults to --model) -h, --help Show this help message`); } @@ -196,6 +207,7 @@ async function main(): Promise { console.error(` -> ${label}`); let markdown = candidate.skillMarkdown; + let synthesisHappened = false; if (!config.skipSynthesis && topic) { // Find enriched sequences related to this topic's sessions @@ -216,6 +228,7 @@ async function main(): Promise { if (result.success) { markdown = stripSynthesisPreamble(result.stdout); + synthesisHappened = true; console.error(` Synthesized`); } else { console.error( @@ -226,6 +239,48 @@ async function main(): Promise { console.error(` Heuristic only`); } + // Evaluate skill (structural always; rubric only if synthesis ran). + if (!config.skipEval) { + const evalOptions = { + model: config.evalModel ?? config.model, + // Skip the LLM rubric when synthesis was skipped/failed — heuristic + // markdown is not the target of the rubric. + skipRubric: !synthesisHappened, + }; + const evalResult = await evaluateSkill(markdown, evalOptions); + if (!evalResult.structural.valid) { + console.error( + ` Eval: structural FAIL (${evalResult.structural.issues.length} issue(s))` + ); + for (const issue of evalResult.structural.issues.slice(0, 5)) { + console.error(` - [${issue.field}] ${issue.message}`); + } + console.error( + " Hint: review SKILL.md frontmatter (name/description); file written anyway" + ); + } else if (evalResult.rubric?.ok && typeof evalResult.rubric.score === "number") { + const score = evalResult.rubric.score; + const overall = evalResult.overallScore ?? score; + console.error( + ` Eval: structural OK | rubric ${score}/100 | overall ${overall}/100` + ); + if (score < 60) { + console.error(" Hint: low rubric score — consider re-synthesizing"); + for (const h of (evalResult.rubric.hints ?? []).slice(0, 3)) { + console.error(` - ${h}`); + } + } + } else if (evalResult.rubric?.skipped) { + console.error(` Eval: structural OK | rubric skipped`); + } else if (evalResult.rubric && !evalResult.rubric.ok) { + console.error( + ` Eval: structural OK | rubric error: ${evalResult.rubric.error ?? "unknown"}` + ); + } else { + console.error(` Eval: structural OK`); + } + } + // Write skill file as //SKILL.md const skillName = extractSkillName(markdown, label); const skillDir = path.join(config.outputDir, skillName); diff --git a/scripts/skill-evaluator.ts b/scripts/skill-evaluator.ts new file mode 100644 index 0000000..daedcb2 --- /dev/null +++ b/scripts/skill-evaluator.ts @@ -0,0 +1,496 @@ +/** + * Skill evaluator: validates SKILL.md output from skill synthesis. + * + * Three layers (issue #20): + * 1. Deterministic structural validation (zod-backed YAML frontmatter checks). + * 2. LLM rubric scoring via `claude -p` (skill-creator-aligned). + * 3. Smoke firing test (stubbed for now — needs Claude Code itself to run). + * + * The evaluator surfaces hints; gating / regenerate-loop is intentionally left + * to a follow-up issue. Callers may still write the file when scores are low. + */ +import { spawn } from "node:child_process"; +import { z } from "zod"; + +// ---------- Types ---------- + +export interface StructuralIssue { + field: string; + message: string; +} + +export interface StructuralResult { + valid: boolean; + issues: StructuralIssue[]; + parsed?: { + name: string; + description: string; + allowedTools?: string[]; + requires?: string[]; + next?: string[]; + }; +} + +export interface RubricResult { + ok: boolean; + score?: number; // 0-100 + breakdown?: { + nameQuality: number; + descriptionTriggering: number; + instructionsConcrete: number; + noPreambleNoise: number; + }; + hints?: string[]; + rawResponse?: string; + error?: string; + skipped?: boolean; +} + +export interface SmokeFiringResult { + skipped: boolean; + message?: string; +} + +export interface EvaluationResult { + structural: StructuralResult; + rubric?: RubricResult; + smokeFiring: SmokeFiringResult; + overallScore?: number; // structural pass (0/50) + rubric (0..50) when available +} + +export interface EvaluatorOptions { + model?: string; // claude model for rubric scoring + skipRubric?: boolean; // local-only structural validation + timeoutMs?: number; // rubric LLM call timeout +} + +// ---------- Frontmatter parsing ---------- + +/** + * Minimal YAML frontmatter parser tailored to SKILL.md (no full YAML lib needed). + * Supports: + * - simple `key: value` scalars + * - inline arrays `key: [a, b, c]` + * - indented list items under a key: + * key: + * - a + * - b + * - quoted strings (single or double) + * Throws on malformed structure. + */ +export function extractFrontmatter(markdown: string): { + raw: string; + data: Record; +} { + // Strip optional UTF-8 BOM (U+FEFF) before parsing. + const trimmed = markdown.replace(/^\uFEFF/, ""); + if (!trimmed.startsWith("---")) { + throw new Error("missing frontmatter: file must start with '---'"); + } + const rest = trimmed.slice(3); + const newlineAfterOpen = rest.indexOf("\n"); + if (newlineAfterOpen === -1) { + throw new Error("malformed frontmatter: no newline after opening '---'"); + } + const after = rest.slice(newlineAfterOpen + 1); + const closeIdx = after.search(/^---\s*$/m); + if (closeIdx === -1) { + throw new Error("malformed frontmatter: missing closing '---'"); + } + const raw = after.slice(0, closeIdx); + + const data: Record = {}; + const lines = raw.split("\n"); + let currentListKey: string | null = null; + let listAccumulator: string[] = []; + + const flushList = () => { + if (currentListKey !== null) { + data[currentListKey] = listAccumulator; + currentListKey = null; + listAccumulator = []; + } + }; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (line.trim() === "") continue; + + // Indented list item under previous key. + if (currentListKey !== null && /^\s+-\s+/.test(line)) { + const value = line.replace(/^\s+-\s+/, "").trim(); + listAccumulator.push(unquote(value)); + continue; + } + + // New key — flush any pending list. + flushList(); + + const m = line.match(/^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*)$/); + if (!m) { + throw new Error(`malformed frontmatter line: ${JSON.stringify(line)}`); + } + const [, key, rawValue] = m; + const value = rawValue.trim(); + + if (value === "") { + // Possibly a list opener. + currentListKey = key; + listAccumulator = []; + continue; + } + + if (value.startsWith("[") && value.endsWith("]")) { + const inner = value.slice(1, -1).trim(); + data[key] = inner === "" + ? [] + : inner.split(",").map((s) => unquote(s.trim())); + continue; + } + + data[key] = unquote(value); + } + flushList(); + + return { raw, data }; +} + +function unquote(s: string): string { + if (s.length >= 2) { + if ((s.startsWith("\"") && s.endsWith("\"")) || (s.startsWith("'") && s.endsWith("'"))) { + return s.slice(1, -1); + } + } + return s; +} + +// ---------- Structural schema ---------- + +// Skill-creator conventions: kebab-case-ish name, short, descriptive. +const NAME_RE = /^[a-z0-9][a-z0-9-]{0,62}[a-z0-9]$|^[a-z0-9]$/; +const FORBIDDEN_NAME_CHARS_RE = /[^a-z0-9-]/; + +const FrontmatterSchema = z.object({ + name: z + .string({ message: "name is required" }) + .min(2, { message: "name must be at least 2 characters" }) + .max(64, { message: "name must be at most 64 characters" }) + .refine((v) => !FORBIDDEN_NAME_CHARS_RE.test(v), { + message: "name must use lowercase letters, digits, and hyphens only", + }) + .refine((v) => NAME_RE.test(v), { + message: "name must start and end with [a-z0-9] and use kebab-case", + }), + description: z + .string({ message: "description is required" }) + .min(20, { message: "description should be at least 20 characters (encode trigger conditions)" }) + .max(500, { message: "description must be at most 500 characters" }), + allowedTools: z.array(z.string().min(1)).optional(), + requires: z.array(z.string().min(1)).optional(), + next: z.array(z.string().min(1)).optional(), +}).passthrough(); + +// ---------- Structural validation ---------- + +export function validateStructure(markdown: string): StructuralResult { + const issues: StructuralIssue[] = []; + + let parsed: { raw: string; data: Record }; + try { + parsed = extractFrontmatter(markdown); + } catch (err) { + return { + valid: false, + issues: [ + { + field: "frontmatter", + message: err instanceof Error ? err.message : String(err), + }, + ], + }; + } + + // Normalize allowed-tools key variants (e.g. `allowed-tools`, `allowed_tools`). + const data = { ...parsed.data }; + for (const altKey of ["allowed-tools", "allowed_tools"]) { + if (altKey in data && !("allowedTools" in data)) { + data.allowedTools = data[altKey]; + } + } + + const result = FrontmatterSchema.safeParse(data); + if (!result.success) { + for (const issue of result.error.issues) { + issues.push({ + field: issue.path.join(".") || "frontmatter", + message: issue.message, + }); + } + return { valid: false, issues }; + } + + const fm = result.data; + + // Body sanity: must contain something past the closing '---'. + const closeMatch = markdown.match(/^---\s*\n[\s\S]*?\n---\s*\n([\s\S]*)$/); + const body = closeMatch?.[1] ?? ""; + if (body.trim().length === 0) { + issues.push({ + field: "body", + message: "skill body is empty after frontmatter", + }); + } + + return { + valid: issues.length === 0, + issues, + parsed: { + name: fm.name, + description: fm.description, + allowedTools: fm.allowedTools, + requires: fm.requires, + next: fm.next, + }, + }; +} + +// ---------- LLM rubric ---------- + +const RUBRIC_PROMPT_HEADER = `You are an evaluator for Claude Code "Skill" definitions (SKILL.md files). +Your task is to score a single SKILL.md against the skill-creator authoring rubric. + +Output requirements: +- Return STRICT JSON only. No markdown fences. No prose before or after. +- The JSON must match this shape exactly: + { + "score": , + "breakdown": { + "nameQuality": , + "descriptionTriggering": , + "instructionsConcrete": , + "noPreambleNoise": + }, + "hints": [] + } + +Scoring rubric (skill-creator alignment): +- nameQuality (25): kebab-case, descriptive of the workflow, not generic ("helper", "utils"). +- descriptionTriggering (25): description encodes WHEN to use the skill, with concrete trigger phrases. Penalize vague descriptions. +- instructionsConcrete (25): body contains step-by-step actionable instructions, references specific tools, and has examples. +- noPreambleNoise (25): no LLM preamble outside the SKILL.md content (e.g. "Here is your skill:"); content begins with frontmatter. + +If the SKILL.md is fundamentally broken (not a SKILL.md), return score 0 with a hint explaining why.`; + +export function buildRubricPrompt(markdown: string): string { + return [ + RUBRIC_PROMPT_HEADER, + "", + "## SKILL.md to evaluate", + "```", + markdown, + "```", + ].join("\n"); +} + +const RubricResponseSchema = z.object({ + score: z.number().min(0).max(100), + breakdown: z.object({ + nameQuality: z.number().min(0).max(25), + descriptionTriggering: z.number().min(0).max(25), + instructionsConcrete: z.number().min(0).max(25), + noPreambleNoise: z.number().min(0).max(25), + }), + hints: z.array(z.string()).default([]), +}); + +/** + * Extract the first balanced JSON object from a possibly-noisy string. + * Handles cases where the model wraps the JSON in stray text. + */ +export function extractFirstJsonObject(s: string): string | null { + const start = s.indexOf("{"); + if (start === -1) return null; + let depth = 0; + let inString = false; + let escape = false; + for (let i = start; i < s.length; i++) { + const ch = s[i]; + if (escape) { + escape = false; + continue; + } + if (ch === "\\") { + escape = true; + continue; + } + if (ch === "\"") { + inString = !inString; + continue; + } + if (inString) continue; + if (ch === "{") depth++; + else if (ch === "}") { + depth--; + if (depth === 0) { + return s.slice(start, i + 1); + } + } + } + return null; +} + +/** + * Spawn `claude -p` to score a SKILL.md against the rubric. + * Mirrors `synthesizeWithClaude` style for consistency. + */ +export function scoreWithRubric( + markdown: string, + options: EvaluatorOptions = {} +): Promise { + const timeoutMs = options.timeoutMs ?? 180_000; + const prompt = buildRubricPrompt(markdown); + + return new Promise((resolve) => { + const args = [ + "-p", + "--output-format", + "text", + "--permission-mode", + "acceptEdits", + "--no-session-persistence", + ]; + if (options.model) { + args.push("--model", options.model); + } + + const child = spawn("claude", args, { stdio: ["pipe", "pipe", "pipe"] }); + + let notFound = false; + + child.on("error", (err: NodeJS.ErrnoException) => { + if (err.code === "ENOENT") { + notFound = true; + resolve({ + ok: false, + error: "claude CLI not found. Install Claude Code first.", + }); + } + }); + + const stdoutChunks: Buffer[] = []; + const stderrChunks: Buffer[] = []; + child.stdout.on("data", (c: Buffer) => stdoutChunks.push(c)); + child.stderr.on("data", (c: Buffer) => stderrChunks.push(c)); + + const timeout = setTimeout(() => { + child.kill("SIGKILL"); + resolve({ + ok: false, + error: `Rubric scoring timed out (${timeoutMs / 1000}s)`, + }); + }, timeoutMs); + + child.on("close", (code) => { + clearTimeout(timeout); + if (notFound) return; + const stdout = Buffer.concat(stdoutChunks).toString("utf-8"); + const stderr = Buffer.concat(stderrChunks).toString("utf-8"); + if (code !== 0) { + resolve({ + ok: false, + rawResponse: stdout, + error: `claude exited with code ${code}: ${stderr}`, + }); + return; + } + const json = extractFirstJsonObject(stdout); + if (!json) { + resolve({ + ok: false, + rawResponse: stdout, + error: "rubric response did not contain a JSON object", + }); + return; + } + let parsedJson: unknown; + try { + parsedJson = JSON.parse(json); + } catch (err) { + resolve({ + ok: false, + rawResponse: stdout, + error: `failed to parse rubric JSON: ${err instanceof Error ? err.message : String(err)}`, + }); + return; + } + const validated = RubricResponseSchema.safeParse(parsedJson); + if (!validated.success) { + resolve({ + ok: false, + rawResponse: stdout, + error: `rubric JSON shape mismatch: ${validated.error.issues.map((i) => i.message).join("; ")}`, + }); + return; + } + resolve({ + ok: true, + score: validated.data.score, + breakdown: validated.data.breakdown, + hints: validated.data.hints, + rawResponse: stdout, + }); + }); + + child.stdin.write(prompt); + child.stdin.end(); + }); +} + +// ---------- Smoke firing test (stub) ---------- + +/** + * Smoke firing test: feed fixture prompts that should trigger the skill and + * verify Claude Code activates it. Real implementation requires Claude Code + * itself as runner, so this is intentionally a stub for issue #20. Follow-up + * work will wire fixtures + activation tracing. + */ +export function smokeFireTest(markdown: string): Promise { + // markdown is accepted for API stability — real implementation will feed + // fixture prompts and inspect activation traces against this content. + void markdown; + return Promise.resolve({ + skipped: true, + message: "smoke firing test is not yet implemented (issue #20 follow-up)", + }); +} + +// ---------- Orchestrator ---------- + +export async function evaluateSkill( + markdown: string, + options: EvaluatorOptions = {} +): Promise { + const structural = validateStructure(markdown); + + let rubric: RubricResult | undefined; + if (!options.skipRubric) { + rubric = await scoreWithRubric(markdown, options); + } else { + rubric = { ok: false, skipped: true }; + } + + const smokeFiring = await smokeFireTest(markdown); + + // Overall score: 50 for structural pass + scaled rubric (0-50). + let overallScore: number | undefined; + if (structural.valid) { + overallScore = 50; + if (rubric?.ok && typeof rubric.score === "number") { + overallScore += Math.round(rubric.score / 2); + } + } else { + overallScore = 0; + } + + return { structural, rubric, smokeFiring, overallScore }; +} diff --git a/src/types/session.ts b/src/types/session.ts index 03f79c7..8aca3b0 100644 --- a/src/types/session.ts +++ b/src/types/session.ts @@ -224,6 +224,46 @@ export interface SkillCandidate { skillMarkdown: string synthesizedMarkdown?: string hookJson?: string + evaluation?: SkillEvaluation +} + +// === Skill Evaluation (issue #20) === +export interface SkillEvaluationStructuralIssue { + field: string + message: string +} + +export interface SkillEvaluationStructural { + valid: boolean + issues: SkillEvaluationStructuralIssue[] +} + +export interface SkillEvaluationRubricBreakdown { + nameQuality: number + descriptionTriggering: number + instructionsConcrete: number + noPreambleNoise: number +} + +export interface SkillEvaluationRubric { + ok: boolean + score?: number + breakdown?: SkillEvaluationRubricBreakdown + hints?: string[] + error?: string + skipped?: boolean +} + +export interface SkillEvaluationSmokeFiring { + skipped: boolean + message?: string +} + +export interface SkillEvaluation { + structural: SkillEvaluationStructural + rubric?: SkillEvaluationRubric + smokeFiring?: SkillEvaluationSmokeFiring + overallScore?: number } // === Tacit Knowledge === From a76a2787d6aa433ff6521a1f315ab0fea127a9c2 Mon Sep 17 00:00:00 2001 From: Kazuki Chigita Date: Sun, 26 Apr 2026 02:24:04 +0900 Subject: [PATCH 2/5] fix(nirami): accept SKILL.md body when no trailing newline follows closing fence The body-extraction regex in validateStructure required a newline after the closing '---' fence, so a SKILL.md ending immediately after its body content (no trailing newline) was incorrectly reported as having an empty body. Make the trailing newline optional and add regression coverage. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/__tests__/skill-evaluator.test.ts | 10 ++++++++++ scripts/skill-evaluator.ts | 9 ++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/__tests__/skill-evaluator.test.ts b/scripts/__tests__/skill-evaluator.test.ts index 85b7e8f..6d8de48 100644 --- a/scripts/__tests__/skill-evaluator.test.ts +++ b/scripts/__tests__/skill-evaluator.test.ts @@ -145,6 +145,16 @@ description: A reasonable description with enough detail to pass minimum length expect(result.issues.some((i) => i.field === "body")).toBe(true); }); + it("accepts body when file ends without trailing newline after closing fence", () => { + // No trailing newline after the body and no blank line between fence + // and body — previously misreported as "empty body". + const md = + "---\nname: my-skill\ndescription: A reasonable description with enough detail to pass minimum length check.\n---\nbody content"; + const result = validateStructure(md); + expect(result.valid).toBe(true); + expect(result.issues).toEqual([]); + }); + it("accepts indented list form for allowed-tools", () => { const md = `--- name: my-skill diff --git a/scripts/skill-evaluator.ts b/scripts/skill-evaluator.ts index daedcb2..4eecc9e 100644 --- a/scripts/skill-evaluator.ts +++ b/scripts/skill-evaluator.ts @@ -232,7 +232,14 @@ export function validateStructure(markdown: string): StructuralResult { const fm = result.data; // Body sanity: must contain something past the closing '---'. - const closeMatch = markdown.match(/^---\s*\n[\s\S]*?\n---\s*\n([\s\S]*)$/); + // Tolerate optional UTF-8 BOM and files that end immediately after the + // closing fence (no trailing newline) — the previous regex required a + // newline after the closing fence and falsely reported "empty body" when + // a real body was present without a trailing blank line. + const stripped = markdown.replace(/^\uFEFF/, ""); + const closeMatch = stripped.match( + /^---\s*\n[\s\S]*?\n---\s*(?:\n([\s\S]*))?$/ + ); const body = closeMatch?.[1] ?? ""; if (body.trim().length === 0) { issues.push({ From ef3bfb0316ab826d83e30ad32fd18906d3c5c669 Mon Sep 17 00:00:00 2001 From: Kazuki Chigita Date: Sun, 26 Apr 2026 02:24:47 +0900 Subject: [PATCH 3/5] fix(nirami): use longer fence in rubric prompt to escape nested code blocks SKILL.md bodies routinely contain triple-backtick code blocks. The previous buildRubricPrompt wrapped the markdown in a triple-backtick fence, so any nested ``` would close the outer block prematurely and confuse the rubric LLM. Pick a fence one backtick longer than the longest run in the input (minimum 4) so nested fences stay verbatim. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/__tests__/skill-evaluator.test.ts | 21 +++++++++++++++++++++ scripts/skill-evaluator.ts | 20 ++++++++++++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/scripts/__tests__/skill-evaluator.test.ts b/scripts/__tests__/skill-evaluator.test.ts index 6d8de48..5b16147 100644 --- a/scripts/__tests__/skill-evaluator.test.ts +++ b/scripts/__tests__/skill-evaluator.test.ts @@ -235,4 +235,25 @@ describe("buildRubricPrompt", () => { expect(prompt).toContain("noPreambleNoise"); expect(prompt).toContain("refactor-tests"); }); + + it("uses an enclosing fence longer than any backtick run inside the markdown", () => { + // Skill body containing triple-backtick code blocks must not break the + // outer fence — pick a longer fence (>=4 backticks). + const skillWithFences = `--- +name: my-skill +description: A reasonable description with enough detail to pass the minimum length check. +--- + +## Example + +\`\`\`bash +echo "hello" +\`\`\` +`; + const prompt = buildRubricPrompt(skillWithFences); + // The outer fence must be at least 4 backticks long to wrap the inner ```. + expect(prompt).toMatch(/\n````+\n[\s\S]*?\n````+\n?/); + // The inner ``` must still be present verbatim. + expect(prompt).toContain("```bash"); + }); }); diff --git a/scripts/skill-evaluator.ts b/scripts/skill-evaluator.ts index 4eecc9e..088853f 100644 --- a/scripts/skill-evaluator.ts +++ b/scripts/skill-evaluator.ts @@ -289,16 +289,32 @@ Scoring rubric (skill-creator alignment): If the SKILL.md is fundamentally broken (not a SKILL.md), return score 0 with a hint explaining why.`; export function buildRubricPrompt(markdown: string): string { + // SKILL.md content commonly contains triple-backtick code fences in its + // body. Wrap the embedded markdown with a longer fence so nested triple + // backticks do not terminate the outer block (CommonMark allows the inner + // fences as long as their length is shorter than the outer). + const fence = pickEnclosingFence(markdown); return [ RUBRIC_PROMPT_HEADER, "", "## SKILL.md to evaluate", - "```", + fence, markdown, - "```", + fence, ].join("\n"); } +function pickEnclosingFence(markdown: string): string { + // Find the longest run of backticks at line start in the input, then use + // a fence that is one backtick longer (minimum 4). + let longest = 0; + for (const line of markdown.split("\n")) { + const m = line.match(/^(`+)/); + if (m && m[1].length > longest) longest = m[1].length; + } + return "`".repeat(Math.max(4, longest + 1)); +} + const RubricResponseSchema = z.object({ score: z.number().min(0).max(100), breakdown: z.object({ From 08f6c0454f481b99212ede3c2df48628984837c9 Mon Sep 17 00:00:00 2001 From: Kazuki Chigita Date: Sun, 26 Apr 2026 02:25:28 +0900 Subject: [PATCH 4/5] fix(nirami): handle non-ENOENT spawn failures and stdin EPIPE in scoreWithRubric Two failure modes were unsafe in the rubric path: 1. The child 'error' handler only treated ENOENT specially. For other spawn errors (EACCES, EPERM, ...) the 'close' callback fired with code=null and produced the misleading message "claude exited with code null", obscuring the real reason. 2. There was no 'error' listener on child.stdin. When spawn fails, the subsequent stdin.write triggers an unhandled 'error' event that crashes the host process before we can return the structured RubricResult. Capture all spawn errors with a structured message, attach a no-op stdin 'error' listener, and wrap the write/end in try/catch so the failure is always reported through the resolved promise. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/skill-evaluator.ts | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/scripts/skill-evaluator.ts b/scripts/skill-evaluator.ts index 088853f..046af88 100644 --- a/scripts/skill-evaluator.ts +++ b/scripts/skill-evaluator.ts @@ -388,18 +388,27 @@ export function scoreWithRubric( const child = spawn("claude", args, { stdio: ["pipe", "pipe", "pipe"] }); - let notFound = false; + let spawnFailed = false; child.on("error", (err: NodeJS.ErrnoException) => { - if (err.code === "ENOENT") { - notFound = true; - resolve({ - ok: false, - error: "claude CLI not found. Install Claude Code first.", - }); - } + // Capture any spawn-time failure (ENOENT, EACCES, EPERM, ...). The + // 'close' event still fires after 'error' with code = null, so we set + // a flag and let the close handler short-circuit with a meaningful + // message instead of "claude exited with code null". + if (spawnFailed) return; + spawnFailed = true; + const message = + err.code === "ENOENT" + ? "claude CLI not found. Install Claude Code first." + : `failed to spawn claude: ${err.message}`; + resolve({ ok: false, error: message }); }); + // Without an 'error' listener on stdin, a spawn failure causes Node to + // crash the host process when we call stdin.write below. Swallow it — + // the 'error' handler on the child already produced a structured result. + child.stdin.on("error", () => {}); + const stdoutChunks: Buffer[] = []; const stderrChunks: Buffer[] = []; child.stdout.on("data", (c: Buffer) => stdoutChunks.push(c)); @@ -415,7 +424,7 @@ export function scoreWithRubric( child.on("close", (code) => { clearTimeout(timeout); - if (notFound) return; + if (spawnFailed) return; const stdout = Buffer.concat(stdoutChunks).toString("utf-8"); const stderr = Buffer.concat(stderrChunks).toString("utf-8"); if (code !== 0) { @@ -464,8 +473,14 @@ export function scoreWithRubric( }); }); - child.stdin.write(prompt); - child.stdin.end(); + // Best-effort write. If spawn failed, stdin may already be destroyed — + // the registered 'error' listeners catch the resulting EPIPE/ECONNRESET. + try { + child.stdin.write(prompt); + child.stdin.end(); + } catch { + // Already surfaced via the child 'error' handler above. + } }); } From 7dfd407c06d5af965bcb8caa7ed3b37b6d8ebed0 Mon Sep 17 00:00:00 2001 From: Kazuki Chigita Date: Sun, 26 Apr 2026 02:26:18 +0900 Subject: [PATCH 5/5] fix(nirami): cover evaluateSkill orchestrator and smokeFireTest stub with tests The orchestrator's overallScore branches (structural-fail -> 0, structural pass + skipped rubric -> 50) and the smokeFireTest stub had no test coverage. Add deterministic cases that exercise both code paths without spawning the claude CLI (skipRubric: true), so a regression in either the score combination or the stub return shape is now caught locally. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/__tests__/skill-evaluator.test.ts | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/__tests__/skill-evaluator.test.ts b/scripts/__tests__/skill-evaluator.test.ts index 5b16147..f913595 100644 --- a/scripts/__tests__/skill-evaluator.test.ts +++ b/scripts/__tests__/skill-evaluator.test.ts @@ -4,6 +4,8 @@ import { extractFrontmatter, extractFirstJsonObject, buildRubricPrompt, + evaluateSkill, + smokeFireTest, } from "../skill-evaluator.js"; const goodSkill = `--- @@ -257,3 +259,32 @@ echo "hello" expect(prompt).toContain("```bash"); }); }); + +describe("smokeFireTest", () => { + it("returns skipped: true with a follow-up message (stub)", async () => { + const result = await smokeFireTest(goodSkill); + expect(result.skipped).toBe(true); + expect(typeof result.message).toBe("string"); + }); +}); + +describe("evaluateSkill (orchestrator)", () => { + it("scores 0 when structural validation fails and does not call the LLM", async () => { + const broken = `# no frontmatter at all`; + const result = await evaluateSkill(broken, { skipRubric: true }); + expect(result.structural.valid).toBe(false); + expect(result.overallScore).toBe(0); + // smoke firing is always present (stub) + expect(result.smokeFiring.skipped).toBe(true); + // rubric is marked skipped, never invoked + expect(result.rubric?.skipped).toBe(true); + expect(result.rubric?.ok).toBe(false); + }); + + it("scores 50 when structural passes but rubric is skipped", async () => { + const result = await evaluateSkill(goodSkill, { skipRubric: true }); + expect(result.structural.valid).toBe(true); + expect(result.overallScore).toBe(50); + expect(result.rubric?.skipped).toBe(true); + }); +});