Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
"react": "^19.2.4",
"react-chartjs-2": "^5.3.1",
"react-dom": "^19.2.4",
"react-force-graph-2d": "^1.29.1"
"react-force-graph-2d": "^1.29.1",
"zod": "^4.3.6"
},
"devDependencies": {
"@eslint/js": "^9.39.4",
Expand Down
12 changes: 12 additions & 0 deletions scripts/__tests__/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ describe("parseCliArgs", () => {
expect(result.model).toBeUndefined();
expect(result.skipSynthesis).toBe(false);
expect(result.dryRun).toBe(false);
expect(result.skipEval).toBe(false);
expect(result.evalModel).toBeUndefined();
});

it("sets sessionsDir with --sessions-dir", () => {
Expand Down Expand Up @@ -47,6 +49,16 @@ describe("parseCliArgs", () => {
expect(result.dryRun).toBe(true);
});

// `--skip-eval` is passed with no value; its presence alone must turn skipEval on.
it("sets skipEval with --skip-eval", () => {
  const argv = ["node", "cli.ts", "--skip-eval"];
  const { skipEval } = parseCliArgs(argv);
  expect(skipEval).toBe(true);
});

// `--eval-model` is followed by its value ("haiku"), which must land in evalModel.
it("sets evalModel with --eval-model", () => {
  const argv = ["node", "cli.ts", "--eval-model", "haiku"];
  const { evalModel } = parseCliArgs(argv);
  expect(evalModel).toBe("haiku");
});

it("handles multiple flags combined", () => {
const result = parseCliArgs([
"node",
Expand Down
290 changes: 290 additions & 0 deletions scripts/__tests__/skill-evaluator.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
import { describe, it, expect } from "vitest";
import {
validateStructure,
extractFrontmatter,
extractFirstJsonObject,
buildRubricPrompt,
evaluateSkill,
smokeFireTest,
} from "../skill-evaluator.js";

/**
 * Baseline well-formed skill document shared by the tests in this file:
 * `---`-fenced frontmatter (name, description, inline `allowed-tools` list)
 * followed by a markdown body with Overview / When to Use / Workflow sections.
 */
const goodSkill = `---
name: refactor-tests
description: Use when the user wants to refactor a Vitest suite to use shared fixtures. Triggers on phrases like "refactor tests" or "extract fixture".
allowed-tools: [Read, Edit, Bash]
---

## Overview
Refactors Vitest tests to share fixtures.

## When to Use
When tests duplicate setup logic.

## Workflow
1. Read failing test files.
2. Extract fixtures.
3. Run vitest.
`;

/**
 * Structural validation of a skill document: frontmatter fencing, required
 * fields (`name`, `description`), per-field constraints, and a non-empty body.
 *
 * Fixture template literals are intentionally flush-left: their content is the
 * document under test, so any leading whitespace would become part of it.
 */
describe("validateStructure", () => {
  it("accepts well-formed frontmatter", () => {
    const result = validateStructure(goodSkill);
    expect(result.valid).toBe(true);
    expect(result.issues).toEqual([]);
    expect(result.parsed?.name).toBe("refactor-tests");
    expect(result.parsed?.allowedTools).toEqual(["Read", "Edit", "Bash"]);
  });

  it("flags missing name", () => {
    const md = `---
description: Long enough description for the validator triggers and hints.
---

body
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues.some((i) => i.field === "name")).toBe(true);
  });

  it("flags missing description", () => {
    const md = `---
name: my-skill
---

body
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues.some((i) => i.field === "description")).toBe(true);
  });

  it("flags oversize description (>500 chars)", () => {
    // 501 characters: one past the limit named in the test title.
    const longDesc = "x".repeat(501);
    const md = `---
name: my-skill
description: ${longDesc}
---

body
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    // The issue message is expected to mention the 500 limit explicitly.
    expect(result.issues.some((i) => i.field === "description" && /500/.test(i.message))).toBe(
      true
    );
  });

  it("flags too-short description (<20 chars)", () => {
    const md = `---
name: my-skill
description: short
---

body
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues.some((i) => i.field === "description")).toBe(true);
  });

  it("flags forbidden characters in name", () => {
    const md = `---
name: My_Skill!
description: A reasonable description with enough detail to pass minimum length check.
---

body
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues.some((i) => i.field === "name")).toBe(true);
  });

  it("flags malformed YAML — missing closing ---", () => {
    const md = `---
name: my-skill
description: A reasonable description with enough detail to pass minimum length check.

body without closing fence
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    // Optional chaining: an unexpectedly empty `issues` array should surface as
    // a failed expectation, not as a TypeError on `undefined.field`.
    expect(result.issues[0]?.field).toBe("frontmatter");
    expect(result.issues[0]?.message).toMatch(/closing/);
  });

  it("flags malformed YAML — no frontmatter at all", () => {
    const md = `# Just a heading\n\nNo frontmatter here.`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues[0]?.field).toBe("frontmatter");
  });

  it("flags malformed YAML — bad scalar line", () => {
    const md = `---
name: ok-name
this line has no colon
description: A reasonable description with enough detail to pass minimum length check.
---

body
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues[0]?.field).toBe("frontmatter");
  });

  it("flags empty body after frontmatter", () => {
    const md = `---
name: my-skill
description: A reasonable description with enough detail to pass minimum length check.
---

`;
    const result = validateStructure(md);
    expect(result.valid).toBe(false);
    expect(result.issues.some((i) => i.field === "body")).toBe(true);
  });

  it("accepts body when file ends without trailing newline after closing fence", () => {
    // No trailing newline after the body and no blank line between fence
    // and body — previously misreported as "empty body".
    const md =
      "---\nname: my-skill\ndescription: A reasonable description with enough detail to pass minimum length check.\n---\nbody content";
    const result = validateStructure(md);
    expect(result.valid).toBe(true);
    expect(result.issues).toEqual([]);
  });

  it("accepts indented list form for allowed-tools", () => {
    // The list items are indented under the key so the fixture actually
    // exercises the "indented list" form named in the test title.
    const md = `---
name: my-skill
description: A reasonable description with enough detail to pass minimum length check.
allowed-tools:
  - Read
  - Edit
---

body content
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(true);
    expect(result.parsed?.allowedTools).toEqual(["Read", "Edit"]);
  });

  it("accepts requires/next workflow-continuation arrays", () => {
    const md = `---
name: deploy-app
description: Use when the user wants to deploy the staging environment after running tests.
requires: [setup-env]
next: [smoke-test]
---

deploy steps
`;
    const result = validateStructure(md);
    expect(result.valid).toBe(true);
    expect(result.parsed?.requires).toEqual(["setup-env"]);
    expect(result.parsed?.next).toEqual(["smoke-test"]);
  });
});

// extractFrontmatter: the `data` it returns exposes the frontmatter keys as properties.
describe("extractFrontmatter", () => {
  it("parses simple frontmatter", () => {
    const parsed = extractFrontmatter(goodSkill).data;
    expect(parsed.name).toBe("refactor-tests");
    expect(typeof parsed.description).toBe("string");
  });

  it("strips quotes around values", () => {
    // One double-quoted and one single-quoted value; both must come back bare.
    const quotedDoc = `---
name: "quoted-name"
description: 'single quoted long enough description for the validator'
---

body
`;
    const parsed = extractFrontmatter(quotedDoc).data;
    expect(parsed.name).toBe("quoted-name");
    expect(parsed.description).toBe("single quoted long enough description for the validator");
  });
});

// extractFirstJsonObject: pulls the first balanced `{…}` out of surrounding text.
describe("extractFirstJsonObject", () => {
  it("returns the first balanced JSON object", () => {
    const noisy = 'preamble {"score": 80, "nested": {"a": 1}} trailing';
    const expected = '{"score": 80, "nested": {"a": 1}}';
    expect(extractFirstJsonObject(noisy)).toBe(expected);
  });

  it("handles strings containing braces", () => {
    // Braces inside a JSON string value must not affect brace balancing.
    const whole = '{"hint": "look at {curly} braces", "score": 50}';
    const extracted = extractFirstJsonObject(whole);
    expect(extracted).toBe(whole);
  });

  it("returns null when no object present", () => {
    const extracted = extractFirstJsonObject("just text");
    expect(extracted).toBeNull();
  });
});

// buildRubricPrompt: the prompt must carry the rubric keys and the skill text itself.
describe("buildRubricPrompt", () => {
  it("includes the skill markdown and rubric instructions", () => {
    const rendered = buildRubricPrompt(goodSkill);
    // Checked in order: output-format marker, the four rubric keys, then the
    // skill name to prove the markdown was embedded.
    const requiredSnippets = [
      "STRICT JSON",
      "nameQuality",
      "descriptionTriggering",
      "instructionsConcrete",
      "noPreambleNoise",
      "refactor-tests",
    ];
    for (const snippet of requiredSnippets) {
      expect(rendered).toContain(snippet);
    }
  });

  it("uses an enclosing fence longer than any backtick run inside the markdown", () => {
    // A skill body that itself contains a triple-backtick block: the wrapper
    // fence has to be longer (four or more backticks) or it would be closed early.
    const fencedDoc = `---
name: my-skill
description: A reasonable description with enough detail to pass the minimum length check.
---

## Example

\`\`\`bash
echo "hello"
\`\`\`
`;
    const rendered = buildRubricPrompt(fencedDoc);
    // Opening and closing outer fences, each at least four backticks on their own line.
    const outerFence = /\n````+\n[\s\S]*?\n````+\n?/;
    expect(rendered).toMatch(outerFence);
    // The embedded triple-backtick fence survives unchanged.
    expect(rendered).toContain("```bash");
  });
});

// smokeFireTest: expected here to report itself as skipped, with a string message.
describe("smokeFireTest", () => {
  it("returns skipped: true with a follow-up message (stub)", async () => {
    const { skipped, message } = await smokeFireTest(goodSkill);
    expect(skipped).toBe(true);
    expect(typeof message).toBe("string");
  });
});

// evaluateSkill: combined result of the structural, rubric and smoke-firing stages.
// Both cases pass `skipRubric: true`, so only the skipped-rubric path is exercised.
describe("evaluateSkill (orchestrator)", () => {
  it("scores 0 when structural validation fails and does not call the LLM", async () => {
    const noFrontmatter = `# no frontmatter at all`;
    const report = await evaluateSkill(noFrontmatter, { skipRubric: true });
    const { structural, overallScore, smokeFiring, rubric } = report;

    expect(structural.valid).toBe(false);
    expect(overallScore).toBe(0);
    // The smoke-firing result is reported even when structure is invalid.
    expect(smokeFiring.skipped).toBe(true);
    // The rubric stage reports itself as skipped and not ok.
    expect(rubric?.skipped).toBe(true);
    expect(rubric?.ok).toBe(false);
  });

  it("scores 50 when structural passes but rubric is skipped", async () => {
    const report = await evaluateSkill(goodSkill, { skipRubric: true });
    const { structural, overallScore, rubric } = report;

    expect(structural.valid).toBe(true);
    expect(overallScore).toBe(50);
    expect(rubric?.skipped).toBe(true);
  });
});
Loading
Loading