From 68c4bcf09951c6b348cefc168f9b9623d889c3ba Mon Sep 17 00:00:00 2001 From: stxkxs <139715017+stxkxs@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:12:33 -0700 Subject: [PATCH] feat(quality): persist the gate's grade signal as a cross-engagement trend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merge gate grades every PR across the 9 QUALITY_RUBRIC dimensions and the external-reviewer re-grades cold, but those grades only ever drove a single ship/block decision and were then discarded. The factory could not answer the one question that tells it whether it is improving: are my grades trending up or down across engagements? This wires the existing signal into an append-only record and surfaces the trend. ─────────────────────────── What changed ─────────────────────────── src/quality.ts (new) — the quality log. - QualityRun record: timestamp, workflow, gate profile, final decision, revision attempts, the aggregate internal grades, and (for calibrated code runs) the external-reviewer grades + drift. - appendQualityRun / loadQualityRuns over an append-only JSONL at ~/.fab/quality.jsonl — next to state.json, so the signal spans every repo the factory ships rather than one working tree. Overridable with FAB_QUALITY_FILE, mirroring FAB_STATE_FILE. - gradeToGpa maps letters (with +/-) to a 0–4.3 scale; N/A is excluded. - formatQualityTrend renders a per-dimension table (overall vs recent-window GPA with a direction arrow — declining dimensions show in red) plus a footer with approval rate, calibration coverage, and drift rate. src/gate.ts — extracted aggregateGrades(verdicts), the internal-grade aggregation the external calibration already did inline (advisory verdicts skipped, later verdict wins on collision). Now shared by the gate and the quality record so they cannot diverge. src/workflows.ts — capture at the gate, not a new pass. - runExternalCalibration now returns the internal + external grades and the drift alongside its blocking result, instead of returning null and throwing the grades away on the aligned path. - runMergeGate records exactly one QualityRun on every terminal path (approve, drift-block reject, gate reject, exhausted revisions) via a best-effort recordQuality helper — a metrics write is wrapped so it can never break the gate. src/bin/fab.ts — `fab perf` now prints the quality trend below the agent performance table. One command, the whole picture. src/index.ts — exports the quality surface. ─────────────────────────── Scope ─────────────────────────── This is the loop-closing wire-up, deliberately not a new repo or datastore. A frozen golden-brief benchmark and a re-grading harness become worthwhile only once the JSONL has real run density — they are explicitly deferred. collectSessionMetrics (the per-role token table behind `fab perf`) remains unwired; it is managed-agents-API-specific and orthogonal to the grade trend. Tracked separately. Tests: quality.test.ts (roundtrip, gradeToGpa, trend formatting/footer) and aggregateGrades cases in gate.test.ts. Full suite, typecheck, lint, and prettier all green. --- __tests__/gate.test.ts | 36 ++++++++++ __tests__/quality.test.ts | 102 +++++++++++++++++++++++++++ __tests__/setup.ts | 3 + src/bin/fab.ts | 5 +- src/gate.ts | 17 +++++ src/index.ts | 2 + src/quality.ts | 142 ++++++++++++++++++++++++++++++++++++++ src/workflows.ts | 110 ++++++++++++++++++++++++----- 8 files changed, 397 insertions(+), 20 deletions(-) create mode 100644 __tests__/quality.test.ts create mode 100644 src/quality.ts diff --git a/__tests__/gate.test.ts b/__tests__/gate.test.ts index 61aad5d..25cec34 100644 --- a/__tests__/gate.test.ts +++ b/__tests__/gate.test.ts @@ -5,6 +5,7 @@ import { applySelfReviewDowngrade, parseQualityGrades, compareGrades, + aggregateGrades, parseCitations, verifyCitations, } from '../src/gate.js'; @@ -194,6 +195,41 @@ describe('mergeGateVerdicts', () => { }); }); +describe('aggregateGrades', () => { + const v = (role: TeamRole, grades?: Record, advisory = false): GateVerdict => ({ + role, + verdict: 'APPROVE', + feedback: '', + advisory, + grades, + }); + + it('merges disjoint per-role dimensions into one map', () => { + const merged = aggregateGrades([ + v('pr-reviewer', { architecture: 'B+', code_quality: 'A-' }), + v('qa-security', { security: 'A' }), + ]); + expect(merged).toEqual({ architecture: 'B+', code_quality: 'A-', security: 'A' }); + }); + + it('skips advisory verdicts (self-review downgrades carry no weight)', () => { + const merged = aggregateGrades([ + v('pr-reviewer', { architecture: 'A' }, true), + v('qa-security', { security: 'B' }), + ]); + expect(merged).toEqual({ security: 'B' }); + }); + + it('lets a later verdict win on a collision', () => { + const merged = aggregateGrades([v('pr-reviewer', { architecture: 'C' }), v('qa-security', { architecture: 'A' })]); + expect(merged).toEqual({ architecture: 'A' }); + }); + + it('tolerates verdicts without grades', () => { + expect(aggregateGrades([v('pr-reviewer')])).toEqual({}); + }); +}); + describe('applySelfReviewDowngrade', () => { it('downgrades conflicted role to advisory', () => { const verdicts: GateVerdict[] = [ diff --git a/__tests__/quality.test.ts b/__tests__/quality.test.ts new file mode 100644 index 0000000..4ad2838 --- /dev/null +++ b/__tests__/quality.test.ts @@ -0,0 +1,102 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { unlink } from 'node:fs/promises'; +import { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa, type QualityRun } from '../src/quality.js'; + +const QUALITY_FILE = process.env.FAB_QUALITY_FILE!; + +async function cleanup() { + try { + await unlink(QUALITY_FILE); + } catch { + /* ignore */ + } +} + +function run(overrides: Partial = {}): QualityRun { + return { + ts: '2026-06-22T00:00:00.000Z', + workflow: 'feature-build', + profile: 'code', + decision: 'approve', + attempts: 1, + internal: { architecture: 'B+', security: 'A-' }, + ...overrides, + }; +} + +describe('quality', () => { + beforeEach(cleanup); + afterEach(cleanup); + + it('loadQualityRuns returns [] when no file exists', async () => { + expect(await loadQualityRuns()).toEqual([]); + }); + + it('appendQualityRun + loadQualityRuns roundtrips records in order', async () => { + await appendQualityRun(run({ ts: '2026-06-20T00:00:00.000Z', workflow: 'a' })); + await appendQualityRun(run({ ts: '2026-06-21T00:00:00.000Z', workflow: 'b' })); + const runs = await loadQualityRuns(); + expect(runs.map((r) => r.workflow)).toEqual(['a', 'b']); + expect(runs[0].internal).toEqual({ architecture: 'B+', security: 'A-' }); + }); + + it('appends as JSONL (one record per line), tolerating a trailing newline', async () => { + await appendQualityRun(run()); + await appendQualityRun(run()); + const runs = await loadQualityRuns(); + expect(runs).toHaveLength(2); + }); + + it('preserves optional external grades and drift', async () => { + await appendQualityRun( + run({ + external: { architecture: 'B', security: 'A-' }, + drift: { drifted: ['architecture'], maxDrift: 1 }, + }), + ); + const [r] = await loadQualityRuns(); + expect(r.external).toEqual({ architecture: 'B', security: 'A-' }); + expect(r.drift).toEqual({ drifted: ['architecture'], maxDrift: 1 }); + }); + + describe('gradeToGpa', () => { + it('maps letters with +/- to a 0–4.3 scale', () => { + expect(gradeToGpa('A+')).toBeCloseTo(4.3); + expect(gradeToGpa('A')).toBe(4); + expect(gradeToGpa('A-')).toBeCloseTo(3.7); + expect(gradeToGpa('B')).toBe(3); + expect(gradeToGpa('F')).toBe(0); + }); + + it('returns null for N/A so it is excluded from averages', () => { + expect(gradeToGpa('N/A')).toBeNull(); + }); + + it('floors at 0 (no negative GPA)', () => { + expect(gradeToGpa('F')).toBe(0); + }); + }); + + describe('formatQualityTrend', () => { + it('reports an empty-state message with no runs', () => { + expect(formatQualityTrend([])).toMatch(/No quality runs recorded/); + }); + + it('prefers external grades over internal for the trend', () => { + const out = formatQualityTrend([run({ internal: { architecture: 'A' }, external: { architecture: 'C' } })]); + // External C (2.00) wins over internal A (4.00). + expect(out).toMatch(/architecture\s+1\s+2\.00\s+2\.00/); + }); + + it('counts approvals, calibration coverage and drift in the footer', () => { + const out = formatQualityTrend([ + run({ decision: 'approve', external: { architecture: 'B' }, drift: { drifted: [], maxDrift: 0 } }), + run({ decision: 'reject' }), + ]); + expect(out).toMatch(/2 runs/); + expect(out).toMatch(/50% approved/); + expect(out).toMatch(/50% calibrated/); + expect(out).toMatch(/0% drifted/); + }); + }); +}); diff --git a/__tests__/setup.ts b/__tests__/setup.ts index e94ecae..0e1e96e 100644 --- a/__tests__/setup.ts +++ b/__tests__/setup.ts @@ -4,3 +4,6 @@ import { join } from 'node:path'; // Point fab's state file at a throwaway temp path so the test suite never // reads or writes the real ~/.fab/state.json. process.env.FAB_STATE_FILE = join(tmpdir(), `fab-test-state-${process.pid}.json`); + +// Same for the quality log — never touch the real ~/.fab/quality.jsonl. +process.env.FAB_QUALITY_FILE = join(tmpdir(), `fab-test-quality-${process.pid}.jsonl`); diff --git a/src/bin/fab.ts b/src/bin/fab.ts index e7b37bf..f49edd3 100644 --- a/src/bin/fab.ts +++ b/src/bin/fab.ts @@ -45,6 +45,7 @@ import { executeRoleSession } from '../runtimes/role-session.js'; import { ADVISOR_TOOL, hasAdvisorAccess } from '../advisor.js'; import { aggregateUsage, formatUsageReport } from '../usage.js'; import { loadPerf, formatPerfReport } from '../perf.js'; +import { loadQualityRuns, formatQualityTrend } from '../quality.js'; import { deliverResult } from '../webhook.js'; import { parseArgs, type ParsedArgs } from '../args.js'; import type { @@ -1159,6 +1160,8 @@ async function model(args: ParsedArgs): Promise { async function perf(): Promise { const data = await loadPerf(); console.log(formatPerfReport(data)); + const runs = await loadQualityRuns(); + console.log('\n' + formatQualityTrend(runs)); } // ── Budget command ────────────────────────────────────────────────── @@ -1447,7 +1450,7 @@ USAGE fab usage [--since ] Token usage and cost report fab budget [set|clear] Per-session cost limit fab export Extract artifacts to local disk - fab perf Agent performance metrics + fab perf Agent performance metrics + quality trend fab scaffold Full product scaffold [--deploy] [--timeline] [--client] [--webhook ] diff --git a/src/gate.ts b/src/gate.ts index 36c7eee..0151ebc 100644 --- a/src/gate.ts +++ b/src/gate.ts @@ -464,6 +464,23 @@ export interface GradeDrift { maxDrift: number; // largest letter-level gap observed } +/** + * Aggregate per-dimension QUALITY_GRADES across a set of gate verdicts into + * a single map. Advisory verdicts (self-review downgrades) carry no weight + * and are skipped. The gate roles own disjoint dimensions, so collisions are + * rare; when they do occur the later verdict wins. + */ +export function aggregateGrades(verdicts: GateVerdict[]): Record { + const grades: Record = {}; + for (const v of verdicts) { + if (v.advisory) continue; + for (const [dim, grade] of Object.entries(v.grades ?? {})) { + grades[dim] = grade; + } + } + return grades; +} + /** * Compare internal gate grades against external-reviewer grades. * diff --git a/src/index.ts b/src/index.ts index 4f068eb..9d7328b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -7,6 +7,8 @@ export { resolveMcpServers, getRegistry } from './mcp.js'; export { getWorkflow, listWorkflows, executeWorkflow, reviseWorkflow, streamWithAdvisor } from './workflows.js'; export { ADVISOR_TOOL, callAdvisor } from './advisor.js'; export { loadPerf, collectSessionMetrics, formatPerfReport } from './perf.js'; +export { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa } from './quality.js'; +export type { QualityRun } from './quality.js'; export { deliverResult } from './webhook.js'; export { aggregateUsage, formatUsageReport } from './usage.js'; export { getAllSkillDefs, getSkillDef, loadSkillContent, previewSkillContent, resolveNanohypePath } from './skills.js'; diff --git a/src/quality.ts b/src/quality.ts new file mode 100644 index 0000000..65f8e5b --- /dev/null +++ b/src/quality.ts @@ -0,0 +1,142 @@ +import { appendFile, readFile, mkdir } from 'node:fs/promises'; +import { homedir } from 'node:os'; +import { dirname, join } from 'node:path'; +import type { Grade, GradeDrift } from './gate.js'; +import type { GateDecision } from './types.js'; + +// ── Quality trend — the factory's own grade record ────────────────── +// +// The merge gate grades every PR across the 9 QUALITY_RUBRIC dimensions, +// and the external-reviewer calibration re-grades cold. Until now those +// grades drove a single ship/block decision and were discarded. This +// module appends one record per gated run so the factory can answer the +// question it could not before: are my grades trending up or down across +// engagements? +// +// The log lives next to state.json under ~/.fab so the signal spans every +// repo the factory ships — a cross-engagement trend, not one working tree. +// Override with FAB_QUALITY_FILE (used by tests; an escape hatch) — mirrors +// FAB_STATE_FILE. Append-only JSONL: one self-describing record per line, +// cheap to write mid-pipeline and trivial to tail. + +function qualityFile(): string { + return process.env.FAB_QUALITY_FILE ?? join(homedir(), '.fab', 'quality.jsonl'); +} + +export interface QualityRun { + ts: string; // ISO timestamp the run was recorded + workflow: string; // workflow name (e.g. 'feature-build') + profile: 'code' | 'docs'; // gate profile + decision: GateDecision; // final gate outcome + attempts: number; // 1-based revision attempts the gate took + internal: Record; // aggregate of the gate roles' QUALITY_GRADES + external?: Record; // external-reviewer cold grades (code profile, when parseable) + drift?: GradeDrift; // calibration drift vs internal (present iff external is) +} + +export async function appendQualityRun(run: QualityRun): Promise { + const file = qualityFile(); + await mkdir(dirname(file), { recursive: true }); + await appendFile(file, JSON.stringify(run) + '\n', 'utf-8'); +} + +export async function loadQualityRuns(): Promise { + try { + const raw = await readFile(qualityFile(), 'utf-8'); + return raw + .split('\n') + .filter((line) => line.trim().length > 0) + .map((line) => JSON.parse(line) as QualityRun); + } catch { + return []; + } +} + +/** + * Map a letter grade to a 0–4.3 GPA so per-dimension trends are comparable. + * +/- shift the base letter by 0.3 (F has no minus). N/A and anything + * unrecognized return null and are excluded from averages — a dimension that + * doesn't apply carries no signal. + */ +export function gradeToGpa(grade: Grade): number | null { + if (grade === 'N/A') return null; + const base: Record = { A: 4, B: 3, C: 2, D: 1, F: 0 }; + const letter = grade.charAt(0); + if (!(letter in base)) return null; + let value = base[letter]; + if (grade.endsWith('+')) value += 0.3; + if (grade.endsWith('-')) value -= 0.3; + return Math.max(0, Math.min(4.3, value)); +} + +// The effective grade for a dimension prefers the cold external calibration +// (the more objective signal) and falls back to the internal gate grade. +function effectiveGrade(run: QualityRun, dim: string): Grade | undefined { + return run.external?.[dim] ?? run.internal[dim]; +} + +const RECENT_WINDOW = 5; + +function mean(values: number[]): number { + return values.length === 0 ? 0 : values.reduce((sum, v) => sum + v, 0) / values.length; +} + +/** + * Render the quality trend across recorded runs as a per-dimension table: + * overall GPA vs the recent window, with a direction arrow. Declining + * dimensions are the point — they surface in red. + */ +export function formatQualityTrend(runs: QualityRun[]): string { + const DIM = process.stdout.isTTY ? '\x1b[2m' : ''; + const BOLD = process.stdout.isTTY ? '\x1b[1m' : ''; + const RED = process.stdout.isTTY ? '\x1b[31m' : ''; + const GREEN = process.stdout.isTTY ? '\x1b[32m' : ''; + const RESET = process.stdout.isTTY ? '\x1b[0m' : ''; + + if (runs.length === 0) return 'No quality runs recorded yet. Run a gated workflow first.'; + + // Chronological order (records are appended in order, but sort defensively). + const ordered = [...runs].sort((a, b) => a.ts.localeCompare(b.ts)); + + // Per-dimension GPA series in run order, using the effective grade. + const series = new Map(); + for (const run of ordered) { + const dims = new Set([...Object.keys(run.internal), ...Object.keys(run.external ?? {})]); + for (const dim of dims) { + const grade = effectiveGrade(run, dim); + if (!grade) continue; + const gpa = gradeToGpa(grade); + if (gpa === null) continue; + if (!series.has(dim)) series.set(dim, []); + series.get(dim)!.push(gpa); + } + } + + const lines: string[] = []; + lines.push(`${BOLD}QUALITY TREND${RESET}`); + lines.push( + `${BOLD}${'DIMENSION'.padEnd(18)} ${'N'.padStart(4)} ${'OVERALL'.padStart(8)} ${'RECENT'.padStart(8)} ${'TREND'.padStart(6)}${RESET}`, + ); + + for (const [dim, values] of [...series.entries()].sort((a, b) => a[0].localeCompare(b[0]))) { + const overall = mean(values); + const recent = mean(values.slice(-RECENT_WINDOW)); + const delta = recent - overall; + const arrow = delta > 0.15 ? `${GREEN}↑${RESET}` : delta < -0.15 ? `${RED}↓${RESET}` : `${DIM}→${RESET}`; + lines.push( + `${dim.padEnd(18)} ${String(values.length).padStart(4)} ${overall.toFixed(2).padStart(8)} ${recent.toFixed(2).padStart(8)} ${arrow.padStart(6)}`, + ); + } + + // Summary footer: run count, approval rate, calibration coverage, drift rate. + const total = ordered.length; + const approved = ordered.filter((r) => r.decision === 'approve').length; + const calibrated = ordered.filter((r) => r.external && Object.keys(r.external).length > 0); + const drifted = calibrated.filter((r) => (r.drift?.drifted.length ?? 0) > 0).length; + const pct = (n: number, d: number) => (d === 0 ? 'n/a' : `${Math.round((n / d) * 100)}%`); + lines.push( + `${DIM}${total} runs · ${pct(approved, total)} approved · ${pct(calibrated.length, total)} calibrated · ${pct(drifted, calibrated.length)} drifted${RESET}`, + ); + + return lines.join('\n'); +} diff --git a/src/workflows.ts b/src/workflows.ts index 61f5e24..7818a66 100644 --- a/src/workflows.ts +++ b/src/workflows.ts @@ -6,8 +6,16 @@ import { formatEvent } from './stream.js'; import { callAdvisor } from './advisor.js'; import { getAgentByRole, getBudgetLimit, getPrimaryRepo, setProjectLanguage, setSourceDirs } from './state.js'; import { CODE_GATE_ROLES, DOCS_GATE_ROLES } from './standards.js'; -import { parseGateVerdict, mergeGateVerdicts, parseQualityGrades, compareGrades, parseCitations } from './gate.js'; -import type { GateVerdict, Grade, FileReader } from './gate.js'; +import { + parseGateVerdict, + mergeGateVerdicts, + parseQualityGrades, + compareGrades, + parseCitations, + aggregateGrades, +} from './gate.js'; +import type { GateVerdict, Grade, GradeDrift, FileReader } from './gate.js'; +import { appendQualityRun } from './quality.js'; import { slugForBranch, createBranchIfMissing, fetchRepoFile } from './git.js'; import { estimateCost } from './pricing.js'; import { normalizeDelimiters, spotlight } from './guardrails.js'; @@ -984,6 +992,7 @@ async function runMergeGate( const gateRoles = profile === 'code' ? CODE_GATE_ROLES : DOCS_GATE_ROLES; let context = initialContext; let lastResult: GateResult = { decision: 'reject', feedback: 'Gate did not run.' }; + let lastInternal: Record = {}; for (let attempt = 0; attempt < 3; attempt++) { console.log( @@ -1009,26 +1018,82 @@ Review the PR candidate against your role's merge-gate criteria per FACTORY_PREA } lastResult = mergeGateVerdicts(verdicts); + lastInternal = aggregateGrades(verdicts); if (lastResult.decision === 'approve') { // External-reviewer calibration runs only for code profile — it's // a heavy step and docs workflows don't grade enough dimensions // to need cold triangulation. + let external: Record | undefined; + let drift: GradeDrift | undefined; if (profile === 'code') { const calibration = await runExternalCalibration(runtime, workflowName, verdicts, context); - if (calibration) return calibration; // blocking REJECT from drift + if (calibration) { + external = calibration.external; + drift = calibration.drift; + if (calibration.block) { + // Blocking REJECT from drift — record the graded run, then fail. + await recordQuality( + workflowName, + profile, + calibration.block.decision, + attempt + 1, + lastInternal, + external, + drift, + ); + return calibration.block; + } + } } + await recordQuality(workflowName, profile, 'approve', attempt + 1, lastInternal, external, drift); + return lastResult; + } + if (lastResult.decision === 'reject') { + await recordQuality(workflowName, profile, 'reject', attempt + 1, lastInternal); return lastResult; } - if (lastResult.decision === 'reject') return lastResult; // revise — append feedback and retry context += `\n\nMERGE GATE REVISION REQUESTED:\n${lastResult.feedback ?? ''}`; } + // Exhausted the revision attempts still asking for changes. + await recordQuality(workflowName, profile, lastResult.decision, 3, lastInternal); return lastResult; } +/** + * Append one record of a gated run to the cross-engagement quality log. + * Persistence is best-effort — a metrics write must never break the gate, + * so failures are logged and swallowed. + */ +async function recordQuality( + workflow: string, + profile: GateProfile, + decision: GateResult['decision'], + attempts: number, + internal: Record, + external?: Record, + drift?: GradeDrift, +): Promise { + try { + await appendQualityRun({ + ts: new Date().toISOString(), + workflow, + profile, + decision, + attempts, + internal, + ...(external ? { external } : {}), + ...(drift ? { drift } : {}), + }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.log(`${DIM}Quality run not recorded (${msg}).${RESET}`); + } +} + /** * Cold-context external-reviewer calibration. Runs AFTER the four gate * roles approve. The external-reviewer grades the 9 QUALITY_RUBRIC @@ -1036,17 +1101,25 @@ Review the PR candidate against your role's merge-gate criteria per FACTORY_PREA * verdicts. The pipeline compares its grades against the aggregate of * internal grades; >1-letter drift on any dimension blocks release. * - * Returns null if calibration passes. Returns a blocking GateResult - * (decision: 'reject') if drift is detected — the feedback names which - * dimension(s) diverged so the next attempt can re-invoke the right - * role. + * Returns null only when the external-reviewer produced no parseable + * grades. Otherwise returns the internal + external grades and the drift, + * with `block` set to a blocking GateResult (decision: 'reject') when drift + * is detected — the feedback names which dimension(s) diverged so the next + * attempt can re-invoke the right role, and the grades feed the quality log. */ +interface CalibrationResult { + block: GateResult | null; // blocking REJECT when drift detected, else null + internal: Record; + external: Record; + drift: GradeDrift; +} + async function runExternalCalibration( runtime: AgentRuntime, workflowName: string, internalVerdicts: GateVerdict[], context: string, -): Promise { +): Promise { console.log(`${CYAN}── External-reviewer calibration ──${RESET}\n`); const output = await runRoleSession( @@ -1070,18 +1143,12 @@ Apply the 9-dimension QUALITY_RUBRIC to the post-merge tree. Output the QUALITY_ return null; } - const internalGrades: Record = {}; - for (const v of internalVerdicts) { - if (v.advisory) continue; - for (const [dim, grade] of Object.entries(v.grades ?? {})) { - internalGrades[dim] = grade; - } - } + const internalGrades = aggregateGrades(internalVerdicts); const drift = compareGrades(internalGrades, externalGrades); if (drift.drifted.length === 0) { console.log(`${GREEN}External calibration aligned (max drift ${drift.maxDrift} letter).${RESET}\n`); - return null; + return { block: null, internal: internalGrades, external: externalGrades, drift }; } const fmt = (g: Record) => @@ -1090,8 +1157,13 @@ Apply the 9-dimension QUALITY_RUBRIC to the post-merge tree. Output the QUALITY_ .join('\n'); return { - decision: 'reject', - feedback: `External-reviewer calibration flagged ${drift.drifted.length} dimension(s) with >1-letter drift: ${drift.drifted.join(', ')}. Max drift: ${drift.maxDrift} letter(s). Re-invoke the diverged role(s) with the external-reviewer's citations.\n\nInternal grades:\n${fmt(internalGrades)}\n\nExternal grades:\n${fmt(externalGrades)}`, + block: { + decision: 'reject', + feedback: `External-reviewer calibration flagged ${drift.drifted.length} dimension(s) with >1-letter drift: ${drift.drifted.join(', ')}. Max drift: ${drift.maxDrift} letter(s). Re-invoke the diverged role(s) with the external-reviewer's citations.\n\nInternal grades:\n${fmt(internalGrades)}\n\nExternal grades:\n${fmt(externalGrades)}`, + }, + internal: internalGrades, + external: externalGrades, + drift, }; }