chigichan24 · chigichan24 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -29,7 +29,8 @@
     "react": "^19.2.4",
     "react-chartjs-2": "^5.3.1",
     "react-dom": "^19.2.4",
-    "react-force-graph-2d": "^1.29.1"
+    "react-force-graph-2d": "^1.29.1",
+    "zod": "^4.3.6"
   },
   "devDependencies": {
     "@eslint/js": "^9.39.4",

diff --git a/scripts/__tests__/cli.test.ts b/scripts/__tests__/cli.test.ts
@@ -10,6 +10,8 @@ describe("parseCliArgs", () => {
     expect(result.model).toBeUndefined();
     expect(result.skipSynthesis).toBe(false);
     expect(result.dryRun).toBe(false);
+    expect(result.skipEval).toBe(false);
+    expect(result.evalModel).toBeUndefined();
   });
 
   it("sets sessionsDir with --sessions-dir", () => {
@@ -47,6 +49,16 @@ describe("parseCliArgs", () => {
     expect(result.dryRun).toBe(true);
   });
 
+  // The bare --skip-eval switch is expected to turn the skipEval flag on.
+  it("sets skipEval with --skip-eval", () => {
+    const argv = ["node", "cli.ts", "--skip-eval"];
+    const { skipEval } = parseCliArgs(argv);
+    expect(skipEval).toBe(true);
+  });
+
+  // The argument following --eval-model is expected to land in evalModel.
+  it("sets evalModel with --eval-model", () => {
+    const argv = ["node", "cli.ts", "--eval-model", "haiku"];
+    const { evalModel } = parseCliArgs(argv);
+    expect(evalModel).toBe("haiku");
+  });
+
   it("handles multiple flags combined", () => {
     const result = parseCliArgs([
       "node",

diff --git a/scripts/__tests__/skill-evaluator.test.ts b/scripts/__tests__/skill-evaluator.test.ts
@@ -0,0 +1,290 @@
+import { describe, it, expect } from "vitest";
+import {
+  validateStructure,
+  extractFrontmatter,
+  extractFirstJsonObject,
+  buildRubricPrompt,
+  evaluateSkill,
+  smokeFireTest,
+} from "../skill-evaluator.js";
+
+/**
+ * Known-good skill fixture shared by the suites in this file: frontmatter with
+ * a name, a description and an inline allowed-tools list, followed by a
+ * non-empty markdown body.
+ */
+const goodSkill = `---
+name: refactor-tests
+description: Use when the user wants to refactor a Vitest suite to use shared fixtures. Triggers on phrases like "refactor tests" or "extract fixture".
+allowed-tools: [Read, Edit, Bash]
+---
+
+## Overview
+Refactors Vitest tests to share fixtures.
+
+## When to Use
+When tests duplicate setup logic.
+
+## Workflow
+1. Read failing test files.
+2. Extract fixtures.
+3. Run vitest.
+`;
+
+/**
+ * Structural validation of skill markdown. Each case feeds one markdown
+ * document to validateStructure and checks `valid`, the reported `issues`
+ * (by `field`, and by `message` where the wording matters) and, for accepted
+ * documents, the values exposed on `parsed`.
+ */
+describe("validateStructure", () => {
+  it("accepts well-formed frontmatter", () => {
+    const result = validateStructure(goodSkill);
+    expect(result.valid).toBe(true);
+    expect(result.issues).toEqual([]);
+    expect(result.parsed?.name).toBe("refactor-tests");
+    expect(result.parsed?.allowedTools).toEqual(["Read", "Edit", "Bash"]);
+  });
+
+  // --- Required fields -----------------------------------------------------
+
+  it("flags missing name", () => {
+    const md = `---
+description: Long enough description for the validator triggers and hints.
+---
+
+body
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues.some((i) => i.field === "name")).toBe(true);
+  });
+
+  it("flags missing description", () => {
+    const md = `---
+name: my-skill
+---
+
+body
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues.some((i) => i.field === "description")).toBe(true);
+  });
+
+  // --- Field constraints ---------------------------------------------------
+
+  it("flags oversize description (>500 chars)", () => {
+    // 501 characters: one past the limit named in the test title.
+    const longDesc = "x".repeat(501);
+    const md = `---
+name: my-skill
+description: ${longDesc}
+---
+
+body
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    // The issue message is expected to mention the 500 limit.
+    expect(result.issues.some((i) => i.field === "description" && /500/.test(i.message))).toBe(
+      true
+    );
+  });
+
+  it("flags too-short description (<20 chars)", () => {
+    const md = `---
+name: my-skill
+description: short
+---
+
+body
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues.some((i) => i.field === "description")).toBe(true);
+  });
+
+  it("flags forbidden characters in name", () => {
+    const md = `---
+name: My_Skill!
+description: A reasonable description with enough detail to pass minimum length check.
+---
+
+body
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues.some((i) => i.field === "name")).toBe(true);
+  });
+
+  // --- Malformed frontmatter -----------------------------------------------
+  // These cases read issues[0] through optional chaining so that an empty
+  // issues array surfaces as an ordinary assertion failure rather than a
+  // TypeError.
+
+  it("flags malformed YAML — missing closing ---", () => {
+    const md = `---
+name: my-skill
+description: A reasonable description with enough detail to pass minimum length check.
+
+body without closing fence
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues[0]?.field).toBe("frontmatter");
+    expect(result.issues[0]?.message).toMatch(/closing/);
+  });
+
+  it("flags malformed YAML — no frontmatter at all", () => {
+    const md = `# Just a heading\n\nNo frontmatter here.`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues[0]?.field).toBe("frontmatter");
+  });
+
+  it("flags malformed YAML — bad scalar line", () => {
+    const md = `---
+name: ok-name
+this line has no colon
+description: A reasonable description with enough detail to pass minimum length check.
+---
+
+body
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues[0]?.field).toBe("frontmatter");
+  });
+
+  // --- Body handling -------------------------------------------------------
+
+  it("flags empty body after frontmatter", () => {
+    const md = `---
+name: my-skill
+description: A reasonable description with enough detail to pass minimum length check.
+---
+
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(false);
+    expect(result.issues.some((i) => i.field === "body")).toBe(true);
+  });
+
+  it("accepts body when file ends without trailing newline after closing fence", () => {
+    // No trailing newline after the body and no blank line between fence
+    // and body — previously misreported as "empty body".
+    const md =
+      "---\nname: my-skill\ndescription: A reasonable description with enough detail to pass minimum length check.\n---\nbody content";
+    const result = validateStructure(md);
+    expect(result.valid).toBe(true);
+    expect(result.issues).toEqual([]);
+  });
+
+  // --- Optional list fields ------------------------------------------------
+
+  it("accepts indented list form for allowed-tools", () => {
+    const md = `---
+name: my-skill
+description: A reasonable description with enough detail to pass minimum length check.
+allowed-tools:
+  - Read
+  - Edit
+---
+
+body content
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(true);
+    expect(result.parsed?.allowedTools).toEqual(["Read", "Edit"]);
+  });
+
+  it("accepts requires/next workflow-continuation arrays", () => {
+    const md = `---
+name: deploy-app
+description: Use when the user wants to deploy the staging environment after running tests.
+requires: [setup-env]
+next: [smoke-test]
+---
+
+deploy steps
+`;
+    const result = validateStructure(md);
+    expect(result.valid).toBe(true);
+    expect(result.parsed?.requires).toEqual(["setup-env"]);
+    expect(result.parsed?.next).toEqual(["smoke-test"]);
+  });
+});
+
+/**
+ * extractFrontmatter: checks the `data` record returned for a plain fixture
+ * and for values wrapped in double or single quotes.
+ */
+describe("extractFrontmatter", () => {
+  it("parses simple frontmatter", () => {
+    const frontmatter = extractFrontmatter(goodSkill).data;
+    expect(frontmatter.name).toBe("refactor-tests");
+    expect(typeof frontmatter.description).toBe("string");
+  });
+
+  it("strips quotes around values", () => {
+    // One double-quoted and one single-quoted scalar; both should come back bare.
+    const quotedSkill = `---
+name: "quoted-name"
+description: 'single quoted long enough description for the validator'
+---
+
+body
+`;
+    const frontmatter = extractFrontmatter(quotedSkill).data;
+    expect(frontmatter.name).toBe("quoted-name");
+    expect(frontmatter.description).toBe("single quoted long enough description for the validator");
+  });
+});
+
+/**
+ * extractFirstJsonObject: pulls the first balanced `{...}` span out of free
+ * text, or yields null when the text contains no object.
+ */
+describe("extractFirstJsonObject", () => {
+  it("returns the first balanced JSON object", () => {
+    const wrapped = 'preamble {"score": 80, "nested": {"a": 1}} trailing';
+    const extracted = extractFirstJsonObject(wrapped);
+    expect(extracted).toBe('{"score": 80, "nested": {"a": 1}}');
+  });
+
+  it("handles strings containing braces", () => {
+    // Braces inside a string value must not be counted as nesting.
+    const wholeObject = '{"hint": "look at {curly} braces", "score": 50}';
+    const extracted = extractFirstJsonObject(wholeObject);
+    expect(extracted).toBe(wholeObject);
+  });
+
+  it("returns null when no object present", () => {
+    const extracted = extractFirstJsonObject("just text");
+    expect(extracted).toBeNull();
+  });
+});
+
+/**
+ * buildRubricPrompt: the prompt must carry the rubric keywords and the skill
+ * text, and must wrap the skill in a fence that inner code fences cannot close.
+ */
+describe("buildRubricPrompt", () => {
+  it("includes the skill markdown and rubric instructions", () => {
+    const prompt = buildRubricPrompt(goodSkill);
+    // Checked in order; the last entry comes from the skill's own frontmatter.
+    const requiredFragments = [
+      "STRICT JSON",
+      "nameQuality",
+      "descriptionTriggering",
+      "instructionsConcrete",
+      "noPreambleNoise",
+      "refactor-tests",
+    ];
+    for (const fragment of requiredFragments) {
+      expect(prompt).toContain(fragment);
+    }
+  });
+
+  it("uses an enclosing fence longer than any backtick run inside the markdown", () => {
+    // A skill body that itself contains a triple-backtick code block: the
+    // wrapping fence has to be longer (>=4 backticks) or it would be closed early.
+    const fencedSkill = `---
+name: my-skill
+description: A reasonable description with enough detail to pass the minimum length check.
+---
+
+## Example
+
+\`\`\`bash
+echo "hello"
+\`\`\`
+`;
+    const renderedPrompt = buildRubricPrompt(fencedSkill);
+    // Outer fence: a run of four or more backticks on its own line, twice.
+    const outerFencePattern = /\n````+\n[\s\S]*?\n````+\n?/;
+    expect(renderedPrompt).toMatch(outerFencePattern);
+    // The embedded ``` block has to survive untouched.
+    expect(renderedPrompt).toContain("```bash");
+  });
+});
+
+/**
+ * smokeFireTest: per the test title this is currently a stub, so the only
+ * contract checked is `skipped: true` plus a string `message`.
+ */
+describe("smokeFireTest", () => {
+  it("returns skipped: true with a follow-up message (stub)", async () => {
+    const { skipped, message } = await smokeFireTest(goodSkill);
+    expect(skipped).toBe(true);
+    expect(typeof message).toBe("string");
+  });
+});
+
+/**
+ * evaluateSkill: checks the combined result (structural, rubric, smokeFiring,
+ * overallScore) with the rubric step disabled via `skipRubric`.
+ */
+describe("evaluateSkill (orchestrator)", () => {
+  it("scores 0 when structural validation fails and does not call the LLM", async () => {
+    // NOTE(review): skipRubric is set here, so this case alone cannot show
+    // that a structural failure suppresses the LLM call — confirm whether a
+    // variant without skipRubric is needed.
+    const noFrontmatter = `# no frontmatter at all`;
+    const { structural, overallScore, smokeFiring, rubric } = await evaluateSkill(noFrontmatter, {
+      skipRubric: true,
+    });
+    expect(structural.valid).toBe(false);
+    expect(overallScore).toBe(0);
+    // The smoke-firing result is populated even on failure (stubbed as skipped).
+    expect(smokeFiring.skipped).toBe(true);
+    // The rubric entry is reported as skipped and not ok.
+    expect(rubric?.skipped).toBe(true);
+    expect(rubric?.ok).toBe(false);
+  });
+
+  it("scores 50 when structural passes but rubric is skipped", async () => {
+    const { structural, overallScore, rubric } = await evaluateSkill(goodSkill, {
+      skipRubric: true,
+    });
+    expect(structural.valid).toBe(true);
+    expect(overallScore).toBe(50);
+    expect(rubric?.skipped).toBe(true);
+  });
+});