From 709b0f5db71bd1706a36db4ae421eac5b68d3c91 Mon Sep 17 00:00:00 2001 From: Jory Irving Date: Wed, 1 Jul 2026 15:22:59 -0600 Subject: [PATCH] feat(groomer): schema-constrained LLM output (json_schema) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The groomer used response_format {type:"json_object"}, which guarantees syntactically valid JSON but not conformance to the groomer's schema, so a model could emit wrong or missing fields. Switch to {type:"json_schema"} with a JSON Schema for the groomer output, so a llama.cpp backend (via litellm) grammar-constrains decoding to the exact shape. This is what lets a small model (e.g. the 4B vision model) produce reliably structured output. - buildGroomerResponseSchema() constrains structure + fixed enums, and builds lane.id as a dynamic enum from the configured lanes (getLaneIds) so the model can't emit a hallucinated lane. If no lanes are configured, lane.id falls back to an unconstrained string. - Falls back to json_object when the first request returns HTTP 400 (treated as a backend without json_schema support), so grooming keeps working on such backends. validateGroomerOutput still runs as the safety net and handles enum alias canonicalization — belt and suspenders. Related: #498 (dispatch audit). Enables running the groomer on a small local model. 
--- src/lib/groomer/llm.test.ts | 45 +++++++++++++++- src/lib/groomer/llm.ts | 103 +++++++++++++++++++++++++++++------- 2 files changed, 128 insertions(+), 20 deletions(-) diff --git a/src/lib/groomer/llm.test.ts b/src/lib/groomer/llm.test.ts index 142dddee..7dae9e6a 100644 --- a/src/lib/groomer/llm.test.ts +++ b/src/lib/groomer/llm.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it, vi, beforeEach } from "vitest"; -import { callGroomerLLM } from "./llm"; +import { callGroomerLLM, buildGroomerResponseSchema } from "./llm"; +import { getLaneIds } from "@/lib/lane-config"; describe("callGroomerLLM", () => { const originalFetch = global.fetch; @@ -30,7 +31,7 @@ describe("callGroomerLLM", () => { capturedBody = JSON.parse((global.fetch as any).mock.calls[0][1].body); expect(capturedUrl).toBe("https://llm.example.com/chat/completions"); expect(capturedBody.model).toBe("gpt-4o-mini"); - expect(capturedBody.response_format?.type).toBe("json_object"); + expect(capturedBody.response_format?.type).toBe("json_schema"); }); it("returns parsed JSON from LLM response", async () => { @@ -186,3 +187,43 @@ describe("callGroomerLLM", () => { expect(result.lane.id).toBe("local"); }); }); + +describe("buildGroomerResponseSchema", () => { + it("requires lane/labels, forbids extra props, and constrains lane.id to configured lanes", () => { + const schema = buildGroomerResponseSchema() as any; + expect(schema.required).toEqual(expect.arrayContaining(["labelsToAdd", "labelsToRemove", "lane"])); + expect(schema.additionalProperties).toBe(false); + expect(schema.properties.lane.required).toEqual(["id", "confidence", "reason"]); + const laneId = schema.properties.lane.properties.id; + expect(laneId.enum).toEqual(getLaneIds()); + expect(laneId.enum.length).toBeGreaterThan(0); + }); +}); + +describe("callGroomerLLM response_format", () => { + beforeEach(() => vi.restoreAllMocks()); + + const okContent = () => + 
`{"labelsToAdd":[],"labelsToRemove":[],"lane":{"id":"${getLaneIds()[0]}","confidence":"high","reason":"r"}}`; + + it("sends json_schema (name + dynamic lane enum) on the first attempt", async () => { + global.fetch = vi.fn().mockResolvedValue({ ok: true, json: async () => ({ choices: [{ message: { content: okContent() } }] }) }); + await callGroomerLLM({ baseUrl: "https://llm.example.com", apiKey: "k", model: "vision", prompt: "p", timeoutMs: 1000 }); + const body = JSON.parse((global.fetch as any).mock.calls[0][1].body); + expect(body.response_format.type).toBe("json_schema"); + expect(body.response_format.json_schema.name).toBe("groomer_output"); + expect(body.response_format.json_schema.schema.properties.lane.properties.id.enum).toEqual(getLaneIds()); + }); + + it("falls back to json_object when the backend rejects json_schema (400)", async () => { + const fetchMock = vi.fn() + .mockResolvedValueOnce({ ok: false, status: 400, text: async () => "unsupported response_format" }) + .mockResolvedValueOnce({ ok: true, status: 200, json: async () => ({ choices: [{ message: { content: okContent() } }] }) }); + global.fetch = fetchMock as any; + const result = await callGroomerLLM({ baseUrl: "https://llm.example.com", apiKey: "k", model: "vision", prompt: "p", timeoutMs: 1000 }); + expect(fetchMock).toHaveBeenCalledTimes(2); + expect(JSON.parse(fetchMock.mock.calls[0][1].body).response_format.type).toBe("json_schema"); + expect(JSON.parse(fetchMock.mock.calls[1][1].body).response_format.type).toBe("json_object"); + expect(result.lane.confidence).toBe("high"); + }); +}); diff --git a/src/lib/groomer/llm.ts b/src/lib/groomer/llm.ts index 22d77c27..d0f48e0a 100644 --- a/src/lib/groomer/llm.ts +++ b/src/lib/groomer/llm.ts @@ -1,5 +1,5 @@ import type { GroomerOutput } from "./schema"; -import { getConfiguredLanes, getClaimableLanes, getBacklogLane } from "@/lib/lane-config"; +import { getConfiguredLanes, getClaimableLanes, getBacklogLane, getLaneIds } from 
"@/lib/lane-config"; import { STATUS_LABELS, PRIORITY_LABELS } from "@/types"; export interface CallLlmOptions { @@ -61,6 +61,78 @@ Body enrichment rules: - Keep enriched body under 10000 characters`; } +const CONFIDENCE_ENUM = ["high", "medium", "low"] as const; + +/** + * JSON Schema for the groomer's output, used as an OpenAI-style `json_schema` + * response_format. On a self-hosted llama.cpp backend (via litellm) this + * grammar-constrains decoding to the exact shape — the key to reliable output + * from a small model. `lane.id` is a dynamic enum built from the configured + * lanes so the model can only emit a real lane, never a hallucinated one. + * `validateGroomerOutput` still runs afterward as the safety net (and handles + * enum alias canonicalization), so this is belt-and-suspenders. + */ +export function buildGroomerResponseSchema(): Record { + const laneIds = getLaneIds(); + const confidence = { type: "string", enum: [...CONFIDENCE_ENUM] }; + return { + type: "object", + additionalProperties: false, + required: ["labelsToAdd", "labelsToRemove", "lane"], + properties: { + actionability: { type: "string", enum: ["ready", "needs_info", "blocked", "backlog", "already_done"] }, + confidence, + labelsToAdd: { type: "array", items: { type: "string" } }, + labelsToRemove: { type: "array", items: { type: "string" } }, + lane: { + type: "object", + additionalProperties: false, + required: ["id", "confidence", "reason"], + properties: { + id: laneIds.length > 0 ? 
{ type: "string", enum: laneIds } : { type: "string" }, + confidence, + reason: { type: "string" }, + }, + }, + summary: { type: "string" }, + githubComment: { type: "string" }, + needsInfoReason: { type: "string" }, + blockedReason: { type: "string" }, + nextGroomingAction: { + type: "string", + enum: ["promote_to_ready", "escalate", "mark_not_ready", "mark_needs_info", "mark_blocked"], + }, + proposedTitle: { type: "string" }, + proposedBody: { type: "string" }, + }, + }; +} + +function postChatCompletion( + url: string, + options: CallLlmOptions, + responseFormat: unknown, + signal: AbortSignal, +): Promise { + return fetch(url, { + method: "POST", + headers: { + Authorization: `Bearer ${options.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: options.model, + messages: [ + { role: "system", content: buildSystemPrompt() }, + { role: "user", content: options.prompt }, + ], + response_format: responseFormat, + temperature: 0.1, + }), + signal, + }); +} + export async function callGroomerLLM(options: CallLlmOptions): Promise { const url = `${options.baseUrl}/chat/completions`; @@ -68,23 +140,18 @@ export async function callGroomerLLM(options: CallLlmOptions): Promise controller.abort(), options.timeoutMs); try { - const response = await fetch(url, { - method: "POST", - headers: { - Authorization: `Bearer ${options.apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: options.model, - messages: [ - { role: "system", content: buildSystemPrompt() }, - { role: "user", content: options.prompt }, - ], - response_format: { type: "json_object" }, - temperature: 0.1, - }), - signal: controller.signal, - }); + // Prefer schema-constrained decoding. Fall back to plain JSON mode if the + // backend rejects json_schema (400), so grooming never breaks on a serving + // stack that doesn't support it; validateGroomerOutput repairs content either way. 
+ let response = await postChatCompletion( + url, + options, + { type: "json_schema", json_schema: { name: "groomer_output", schema: buildGroomerResponseSchema() } }, + controller.signal, + ); + if (response.status === 400) { + response = await postChatCompletion(url, options, { type: "json_object" }, controller.signal); + } if (!response.ok) { const text = await response.text();