From 68c4bcf09951c6b348cefc168f9b9623d889c3ba Mon Sep 17 00:00:00 2001
From: stxkxs <139715017+stxkxs@users.noreply.github.com>
Date: Mon, 22 Jun 2026 16:12:33 -0700
Subject: [PATCH] feat(quality): persist the gate's grade signal as a
 cross-engagement trend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The merge gate grades every PR across the 9 QUALITY_RUBRIC dimensions and the
external-reviewer re-grades cold, but those grades only ever drove a single
ship/block decision and were then discarded. The factory could not answer the
one question that tells it whether it is improving: are my grades trending up
or down across engagements? This wires the existing signal into an append-only
record and surfaces the trend.

─────────────────────────── What changed ───────────────────────────

src/quality.ts (new) — the quality log.
  - QualityRun record: timestamp, workflow, gate profile, final decision,
    revision attempts, the aggregate internal grades, and (for calibrated
    code runs) the external-reviewer grades + drift.
  - appendQualityRun / loadQualityRuns over an append-only JSONL at
    ~/.fab/quality.jsonl — next to state.json, so the signal spans every repo
    the factory ships rather than one working tree. Overridable with
    FAB_QUALITY_FILE, mirroring FAB_STATE_FILE.
  - gradeToGpa maps letters (with +/-) to a 0–4.3 scale; N/A and unrecognized
    grades map to null and are excluded from averages.
  - formatQualityTrend renders a per-dimension table (overall GPA vs the GPA
    of the 5 most recent grades, with a direction arrow once they differ by
    more than 0.15 — declining dimensions show in red) plus a footer with
    approval rate, calibration coverage, and drift rate.

src/gate.ts — extracted aggregateGrades(verdicts), the internal-grade
  aggregation the external calibration already did inline (advisory verdicts
  skipped, later verdict wins on collision). Now shared by the gate and the
  quality record so they cannot diverge.

src/workflows.ts — capture at the gate, not a new pass.
  - runExternalCalibration now returns the internal + external grades and the
    drift alongside its blocking result, instead of returning null and
    throwing the grades away on the aligned path. It still returns null when
    the external-reviewer output has no parseable grades.
  - runMergeGate records exactly one QualityRun on every terminal path
    (approve, drift-block reject, gate reject, exhausted revisions) via a
    best-effort recordQuality helper — a metrics write is wrapped so it can
    never break the gate.

src/bin/fab.ts — `fab perf` now prints the quality trend below the agent
  performance table. One command, the whole picture.

src/index.ts — exports the quality surface.

─────────────────────────── Scope ───────────────────────────

This is the loop-closing wire-up, deliberately not a new repo or datastore.
A frozen golden-brief benchmark and a re-grading harness become worthwhile
only once the JSONL has real run density — they are explicitly deferred.

collectSessionMetrics (the per-role token table behind `fab perf`) remains
unwired; it is managed-agents-API-specific and orthogonal to the grade trend.
Tracked separately.

Tests: quality.test.ts (roundtrip, gradeToGpa, trend formatting/footer) and
aggregateGrades cases in gate.test.ts. Full suite, typecheck, lint, and
prettier all green.
---
 __tests__/gate.test.ts    |  36 ++++++++++
 __tests__/quality.test.ts | 102 +++++++++++++++++++++++++++
 __tests__/setup.ts        |   3 +
 src/bin/fab.ts            |   5 +-
 src/gate.ts               |  17 +++++
 src/index.ts              |   2 +
 src/quality.ts            | 142 ++++++++++++++++++++++++++++++++++++++
 src/workflows.ts          | 110 ++++++++++++++++++++++++-----
 8 files changed, 397 insertions(+), 20 deletions(-)
 create mode 100644 __tests__/quality.test.ts
 create mode 100644 src/quality.ts

diff --git a/__tests__/gate.test.ts b/__tests__/gate.test.ts
index 61aad5d..25cec34 100644
--- a/__tests__/gate.test.ts
+++ b/__tests__/gate.test.ts
@@ -5,6 +5,7 @@ import {
   applySelfReviewDowngrade,
   parseQualityGrades,
   compareGrades,
+  aggregateGrades,
   parseCitations,
   verifyCitations,
 } from '../src/gate.js';
@@ -194,6 +195,41 @@ describe('mergeGateVerdicts', () => {
   });
 });
 
+describe('aggregateGrades', () => {
+  const v = (role: TeamRole, grades?: Record<string, Grade>, advisory = false): GateVerdict => ({
+    role,
+    verdict: 'APPROVE', // aggregateGrades only reads `advisory` and `grades`; the verdict value is filler
+    feedback: '',
+    advisory,
+    grades,
+  });
+
+  it('merges disjoint per-role dimensions into one map', () => {
+    const merged = aggregateGrades([
+      v('pr-reviewer', { architecture: 'B+', code_quality: 'A-' }),
+      v('qa-security', { security: 'A' }),
+    ]);
+    expect(merged).toEqual({ architecture: 'B+', code_quality: 'A-', security: 'A' });
+  });
+
+  it('skips advisory verdicts (self-review downgrades carry no weight)', () => {
+    const merged = aggregateGrades([
+      v('pr-reviewer', { architecture: 'A' }, true),
+      v('qa-security', { security: 'B' }),
+    ]);
+    expect(merged).toEqual({ security: 'B' });
+  });
+
+  it('lets a later verdict win on a collision', () => {
+    const merged = aggregateGrades([v('pr-reviewer', { architecture: 'C' }), v('qa-security', { architecture: 'A' })]);
+    expect(merged).toEqual({ architecture: 'A' }); // qa-security comes later in the array, so its grade is kept
+  });
+
+  it('tolerates verdicts without grades', () => {
+    expect(aggregateGrades([v('pr-reviewer')])).toEqual({});
+  });
+});
+
 describe('applySelfReviewDowngrade', () => {
   it('downgrades conflicted role to advisory', () => {
     const verdicts: GateVerdict[] = [
diff --git a/__tests__/quality.test.ts b/__tests__/quality.test.ts
new file mode 100644
index 0000000..4ad2838
--- /dev/null
+++ b/__tests__/quality.test.ts
@@ -0,0 +1,102 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { unlink } from 'node:fs/promises';
+import { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa, type QualityRun } from '../src/quality.js';
+
+const QUALITY_FILE = process.env.FAB_QUALITY_FILE!;
+
+async function cleanup() {
+  try {
+    await unlink(QUALITY_FILE);
+  } catch {
+    /* best-effort: unlink rejects when the log was never created, which is fine between tests */
+  }
+}
+
+function run(overrides: Partial<QualityRun> = {}): QualityRun {
+  return {
+    ts: '2026-06-22T00:00:00.000Z',
+    workflow: 'feature-build',
+    profile: 'code',
+    decision: 'approve',
+    attempts: 1,
+    internal: { architecture: 'B+', security: 'A-' }, // no external/drift by default: an uncalibrated run
+    ...overrides, // spread last so caller-supplied fields replace the defaults above
+  };
+}
+
+describe('quality', () => {
+  beforeEach(cleanup);
+  afterEach(cleanup);
+
+  it('loadQualityRuns returns [] when no file exists', async () => {
+    expect(await loadQualityRuns()).toEqual([]);
+  });
+
+  it('appendQualityRun + loadQualityRuns roundtrips records in order', async () => {
+    await appendQualityRun(run({ ts: '2026-06-20T00:00:00.000Z', workflow: 'a' }));
+    await appendQualityRun(run({ ts: '2026-06-21T00:00:00.000Z', workflow: 'b' }));
+    const runs = await loadQualityRuns();
+    expect(runs.map((r) => r.workflow)).toEqual(['a', 'b']);
+    expect(runs[0].internal).toEqual({ architecture: 'B+', security: 'A-' });
+  });
+
+  it('appends as JSONL (one record per line), tolerating a trailing newline', async () => {
+    await appendQualityRun(run());
+    await appendQualityRun(run());
+    const runs = await loadQualityRuns();
+    expect(runs).toHaveLength(2);
+  });
+
+  it('preserves optional external grades and drift', async () => {
+    await appendQualityRun(
+      run({
+        external: { architecture: 'B', security: 'A-' },
+        drift: { drifted: ['architecture'], maxDrift: 1 },
+      }),
+    );
+    const [r] = await loadQualityRuns();
+    expect(r.external).toEqual({ architecture: 'B', security: 'A-' });
+    expect(r.drift).toEqual({ drifted: ['architecture'], maxDrift: 1 });
+  });
+
+  describe('gradeToGpa', () => {
+    it('maps letters with +/- to a 0–4.3 scale', () => {
+      expect(gradeToGpa('A+')).toBeCloseTo(4.3); // toBeCloseTo: the ±0.3 shift is floating-point arithmetic
+      expect(gradeToGpa('A')).toBe(4);
+      expect(gradeToGpa('A-')).toBeCloseTo(3.7);
+      expect(gradeToGpa('B')).toBe(3);
+      expect(gradeToGpa('F')).toBe(0);
+    });
+
+    it('returns null for N/A so it is excluded from averages', () => {
+      expect(gradeToGpa('N/A')).toBeNull();
+    });
+
+    it('floors at 0 (no negative GPA)', () => {
+      expect(gradeToGpa('F')).toBe(0); // NOTE(review): 'F' is already 0 before clamping, so the floor itself is not exercised
+    });
+  });
+
+  describe('formatQualityTrend', () => {
+    it('reports an empty-state message with no runs', () => {
+      expect(formatQualityTrend([])).toMatch(/No quality runs recorded/);
+    });
+
+    it('prefers external grades over internal for the trend', () => {
+      const out = formatQualityTrend([run({ internal: { architecture: 'A' }, external: { architecture: 'C' } })]);
+      // External C (2.00) wins over internal A (4.00).
+      expect(out).toMatch(/architecture\s+1\s+2\.00\s+2\.00/);
+    });
+
+    it('counts approvals, calibration coverage and drift in the footer', () => {
+      const out = formatQualityTrend([
+        run({ decision: 'approve', external: { architecture: 'B' }, drift: { drifted: [], maxDrift: 0 } }),
+        run({ decision: 'reject' }),
+      ]);
+      expect(out).toMatch(/2 runs/);
+      expect(out).toMatch(/50% approved/);
+      expect(out).toMatch(/50% calibrated/);
+      expect(out).toMatch(/0% drifted/);
+    });
+  });
+});
diff --git a/__tests__/setup.ts b/__tests__/setup.ts
index e94ecae..0e1e96e 100644
--- a/__tests__/setup.ts
+++ b/__tests__/setup.ts
@@ -4,3 +4,6 @@ import { join } from 'node:path';
 // Point fab's state file at a throwaway temp path so the test suite never
 // reads or writes the real ~/.fab/state.json.
 process.env.FAB_STATE_FILE = join(tmpdir(), `fab-test-state-${process.pid}.json`);
+
+// Same for the quality log — never touch the real ~/.fab/quality.jsonl.
+process.env.FAB_QUALITY_FILE = join(tmpdir(), `fab-test-quality-${process.pid}.jsonl`);
diff --git a/src/bin/fab.ts b/src/bin/fab.ts
index e7b37bf..f49edd3 100644
--- a/src/bin/fab.ts
+++ b/src/bin/fab.ts
@@ -45,6 +45,7 @@ import { executeRoleSession } from '../runtimes/role-session.js';
 import { ADVISOR_TOOL, hasAdvisorAccess } from '../advisor.js';
 import { aggregateUsage, formatUsageReport } from '../usage.js';
 import { loadPerf, formatPerfReport } from '../perf.js';
+import { loadQualityRuns, formatQualityTrend } from '../quality.js';
 import { deliverResult } from '../webhook.js';
 import { parseArgs, type ParsedArgs } from '../args.js';
 import type {
@@ -1159,6 +1160,8 @@ async function model(args: ParsedArgs): Promise<void> {
 async function perf(): Promise<void> {
   const data = await loadPerf();
   console.log(formatPerfReport(data));
+  const runs = await loadQualityRuns();
+  console.log('\n' + formatQualityTrend(runs)); // leading newline separates the trend from the perf table
 }
 
 // ── Budget command ──────────────────────────────────────────────────
@@ -1447,7 +1450,7 @@ USAGE
   fab usage [--since <date>]        Token usage and cost report
   fab budget [set|clear] <dollars>  Per-session cost limit
   fab export <session-id>           Extract artifacts to local disk
-  fab perf                          Agent performance metrics
+  fab perf                          Agent performance metrics + quality trend
 
   fab scaffold <description...>     Full product scaffold [--deploy] [--timeline] [--client] [--webhook <url>]
 
diff --git a/src/gate.ts b/src/gate.ts
index 36c7eee..0151ebc 100644
--- a/src/gate.ts
+++ b/src/gate.ts
@@ -464,6 +464,23 @@ export interface GradeDrift {
   maxDrift: number; // largest letter-level gap observed
 }
 
+/**
+ * Fold the QUALITY_GRADES of every binding gate verdict into one
+ * dimension → grade map. Verdicts flagged `advisory` contribute nothing.
+ * Verdicts are applied in array order, so when two of them grade the same
+ * dimension the one that appears later in `verdicts` is the one kept.
+ */
+export function aggregateGrades(verdicts: GateVerdict[]): Record<string, Grade> {
+  const merged: Record<string, Grade> = {};
+  const binding = verdicts.filter((verdict) => !verdict.advisory);
+  for (const { grades } of binding) {
+    Object.entries(grades ?? {}).forEach(([dimension, letter]) => {
+      merged[dimension] = letter;
+    });
+  }
+  return merged;
+}
+
 /**
  * Compare internal gate grades against external-reviewer grades.
  *
diff --git a/src/index.ts b/src/index.ts
index 4f068eb..9d7328b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -7,6 +7,8 @@ export { resolveMcpServers, getRegistry } from './mcp.js';
 export { getWorkflow, listWorkflows, executeWorkflow, reviseWorkflow, streamWithAdvisor } from './workflows.js';
 export { ADVISOR_TOOL, callAdvisor } from './advisor.js';
 export { loadPerf, collectSessionMetrics, formatPerfReport } from './perf.js';
+export { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa } from './quality.js';
+export type { QualityRun } from './quality.js';
 export { deliverResult } from './webhook.js';
 export { aggregateUsage, formatUsageReport } from './usage.js';
 export { getAllSkillDefs, getSkillDef, loadSkillContent, previewSkillContent, resolveNanohypePath } from './skills.js';
diff --git a/src/quality.ts b/src/quality.ts
new file mode 100644
index 0000000..65f8e5b
--- /dev/null
+++ b/src/quality.ts
@@ -0,0 +1,142 @@
+import { appendFile, readFile, mkdir } from 'node:fs/promises';
+import { homedir } from 'node:os';
+import { dirname, join } from 'node:path';
+import type { Grade, GradeDrift } from './gate.js';
+import type { GateDecision } from './types.js';
+
+// ── Quality trend — the factory's own grade record ──────────────────
+//
+// The merge gate grades every PR across the 9 QUALITY_RUBRIC dimensions,
+// and the external-reviewer calibration re-grades cold. Until now those
+// grades drove a single ship/block decision and were discarded. This
+// module appends one record per gated run so the factory can answer the
+// question it could not before: are my grades trending up or down across
+// engagements?
+//
+// The log defaults to ~/.fab/quality.jsonl, next to state.json, so the signal
+// spans every repo the factory ships — a cross-engagement trend, not one tree.
+// FAB_QUALITY_FILE, when set, replaces that path (the test setup uses it),
+// mirroring FAB_STATE_FILE; `??` means an empty-string value is used as-is.
+// Append-only JSONL: one self-describing record per line, trivial to tail.
+
+function qualityFile(): string {
+  return process.env.FAB_QUALITY_FILE ?? join(homedir(), '.fab', 'quality.jsonl');
+}
+
+export interface QualityRun {
+  ts: string; // ISO timestamp of recording; formatQualityTrend orders runs by comparing these strings
+  workflow: string; // workflow name (e.g. 'feature-build')
+  profile: 'code' | 'docs'; // gate profile the run was graded under
+  decision: GateDecision; // final gate outcome recorded for the run
+  attempts: number; // 1-based count of gate passes the run used
+  internal: Record<string, Grade>; // aggregate of the gate roles' QUALITY_GRADES (advisory verdicts excluded)
+  external?: Record<string, Grade>; // external-reviewer cold grades (code profile, when parseable)
+  drift?: GradeDrift; // calibration drift vs internal (present iff external is)
+}
+
+export async function appendQualityRun(run: QualityRun): Promise<void> {
+  const target = qualityFile();
+  await mkdir(dirname(target), { recursive: true }); // the first write creates the directory
+  await appendFile(target, `${JSON.stringify(run)}\n`, 'utf-8'); // exactly one JSON record per line
+}
+
+/** Load recorded runs in file order; a missing or unreadable file yields [], bad lines are skipped. */
+export async function loadQualityRuns(): Promise<QualityRun[]> {
+  const raw = await readFile(qualityFile(), 'utf-8').catch(() => '');
+  return raw.split('\n').flatMap((line) => {
+    try {
+      return [JSON.parse(line) as QualityRun];
+    } catch {
+      return []; // blank or torn line: drop it alone so one bad record cannot hide the whole history
+    }
+  });
+}
+
+/**
+ * Convert a letter grade into GPA points (0–4.3) so grades can be averaged.
+ * The first character picks the base (A=4 … F=0); a trailing '+' adds 0.3 and
+ * a trailing '-' subtracts 0.3, with the result clamped to [0, 4.3]. Returns
+ * null for 'N/A' or when the first character is not one of A, B, C, D, F.
+ */
+export function gradeToGpa(grade: Grade): number | null {
+  const pointsByLetter: Record<string, number> = { A: 4, B: 3, C: 2, D: 1, F: 0 };
+  const letter = grade.charAt(0);
+  if (grade === 'N/A' || !(letter in pointsByLetter)) return null;
+  const modifier = grade.endsWith('+') ? 0.3 : grade.endsWith('-') ? -0.3 : 0;
+  const points = pointsByLetter[letter] + modifier;
+  // Clamp to the scale: nothing below 0, nothing above A+.
+  if (points < 0) return 0;
+  return points > 4.3 ? 4.3 : points;
+}
+
+// Prefer the cold external grade (the more objective signal); else the internal gate grade.
+function effectiveGrade(run: QualityRun, dim: string): Grade | undefined {
+  const cold = run.external ? run.external[dim] : undefined;
+  return cold ?? run.internal[dim];
+}
+
+const RECENT_WINDOW = 5; // trailing grades per dimension that count as "recent"
+function mean(values: number[]): number {
+  if (values.length === 0) return 0; // an empty series averages to 0 rather than NaN
+  return values.reduce((total, value) => total + value, 0) / values.length;
+}
+
+/**
+ * Render the quality trend as text: one row per dimension (overall mean GPA vs the mean of its
+ * last RECENT_WINDOW grades; ↑/↓ once they differ by more than 0.15, ↓ in red) plus a footer of
+ * rates. The arrow is padded before it is colored so TREND aligns with or without ANSI codes.
+ */
+export function formatQualityTrend(runs: QualityRun[]): string {
+  const DIM = process.stdout.isTTY ? '\x1b[2m' : '';
+  const BOLD = process.stdout.isTTY ? '\x1b[1m' : '';
+  const RED = process.stdout.isTTY ? '\x1b[31m' : '';
+  const GREEN = process.stdout.isTTY ? '\x1b[32m' : '';
+  const RESET = process.stdout.isTTY ? '\x1b[0m' : '';
+
+  if (runs.length === 0) return 'No quality runs recorded yet. Run a gated workflow first.';
+
+  // Chronological order (records are appended in order, but sort defensively).
+  const ordered = [...runs].sort((a, b) => a.ts.localeCompare(b.ts));
+
+  // Per-dimension GPA series in run order, using the effective grade.
+  const series = new Map<string, number[]>();
+  for (const run of ordered) {
+    const dims = new Set([...Object.keys(run.internal), ...Object.keys(run.external ?? {})]);
+    for (const dim of dims) {
+      const grade = effectiveGrade(run, dim);
+      if (!grade) continue;
+      const gpa = gradeToGpa(grade);
+      if (gpa === null) continue;
+      if (!series.has(dim)) series.set(dim, []);
+      series.get(dim)!.push(gpa);
+    }
+  }
+
+  const lines: string[] = [];
+  lines.push(`${BOLD}QUALITY TREND${RESET}`);
+  lines.push(
+    `${BOLD}${'DIMENSION'.padEnd(18)} ${'N'.padStart(4)} ${'OVERALL'.padStart(8)} ${'RECENT'.padStart(8)} ${'TREND'.padStart(6)}${RESET}`,
+  );
+
+  for (const [dim, values] of [...series.entries()].sort((a, b) => a[0].localeCompare(b[0]))) {
+    const overall = mean(values);
+    const recent = mean(values.slice(-RECENT_WINDOW));
+    const delta = recent - overall;
+    const [tint, glyph] = delta > 0.15 ? [GREEN, '↑'] : delta < -0.15 ? [RED, '↓'] : [DIM, '→'];
+    lines.push(
+      `${dim.padEnd(18)} ${String(values.length).padStart(4)} ${overall.toFixed(2).padStart(8)} ${recent.toFixed(2).padStart(8)} ${tint}${glyph.padStart(6)}${RESET}`,
+    );
+  }
+
+  // Summary footer: run count, approval rate, calibration coverage, drift rate.
+  const total = ordered.length;
+  const approved = ordered.filter((r) => r.decision === 'approve').length;
+  const calibrated = ordered.filter((r) => r.external && Object.keys(r.external).length > 0);
+  const drifted = calibrated.filter((r) => (r.drift?.drifted.length ?? 0) > 0).length;
+  const pct = (n: number, d: number) => (d === 0 ? 'n/a' : `${Math.round((n / d) * 100)}%`);
+  lines.push(
+    `${DIM}${total} runs · ${pct(approved, total)} approved · ${pct(calibrated.length, total)} calibrated · ${pct(drifted, calibrated.length)} drifted${RESET}`,
+  );
+
+  return lines.join('\n');
+}
diff --git a/src/workflows.ts b/src/workflows.ts
index 61f5e24..7818a66 100644
--- a/src/workflows.ts
+++ b/src/workflows.ts
@@ -6,8 +6,16 @@ import { formatEvent } from './stream.js';
 import { callAdvisor } from './advisor.js';
 import { getAgentByRole, getBudgetLimit, getPrimaryRepo, setProjectLanguage, setSourceDirs } from './state.js';
 import { CODE_GATE_ROLES, DOCS_GATE_ROLES } from './standards.js';
-import { parseGateVerdict, mergeGateVerdicts, parseQualityGrades, compareGrades, parseCitations } from './gate.js';
-import type { GateVerdict, Grade, FileReader } from './gate.js';
+import {
+  parseGateVerdict,
+  mergeGateVerdicts,
+  parseQualityGrades,
+  compareGrades,
+  parseCitations,
+  aggregateGrades,
+} from './gate.js';
+import type { GateVerdict, Grade, GradeDrift, FileReader } from './gate.js';
+import { appendQualityRun } from './quality.js';
 import { slugForBranch, createBranchIfMissing, fetchRepoFile } from './git.js';
 import { estimateCost } from './pricing.js';
 import { normalizeDelimiters, spotlight } from './guardrails.js';
@@ -984,6 +992,7 @@ async function runMergeGate(
   const gateRoles = profile === 'code' ? CODE_GATE_ROLES : DOCS_GATE_ROLES;
   let context = initialContext;
   let lastResult: GateResult = { decision: 'reject', feedback: 'Gate did not run.' };
+  let lastInternal: Record<string, Grade> = {};
 
   for (let attempt = 0; attempt < 3; attempt++) {
     console.log(
@@ -1009,26 +1018,82 @@ Review the PR candidate against your role's merge-gate criteria per FACTORY_PREA
     }
 
     lastResult = mergeGateVerdicts(verdicts);
+    lastInternal = aggregateGrades(verdicts);
 
     if (lastResult.decision === 'approve') {
       // External-reviewer calibration runs only for code profile — it's
       // a heavy step and docs workflows don't grade enough dimensions
       // to need cold triangulation.
+      let external: Record<string, Grade> | undefined;
+      let drift: GradeDrift | undefined;
       if (profile === 'code') {
         const calibration = await runExternalCalibration(runtime, workflowName, verdicts, context);
-        if (calibration) return calibration; // blocking REJECT from drift
+        if (calibration) {
+          external = calibration.external;
+          drift = calibration.drift;
+          if (calibration.block) {
+            // Blocking REJECT from drift — record the graded run, then fail.
+            await recordQuality(
+              workflowName,
+              profile,
+              calibration.block.decision,
+              attempt + 1,
+              lastInternal,
+              external,
+              drift,
+            );
+            return calibration.block;
+          }
+        }
       }
+      await recordQuality(workflowName, profile, 'approve', attempt + 1, lastInternal, external, drift);
+      return lastResult;
+    }
+    if (lastResult.decision === 'reject') {
+      await recordQuality(workflowName, profile, 'reject', attempt + 1, lastInternal);
       return lastResult;
     }
-    if (lastResult.decision === 'reject') return lastResult;
 
     // revise — append feedback and retry
     context += `\n\nMERGE GATE REVISION REQUESTED:\n${lastResult.feedback ?? ''}`;
   }
 
+  // Exhausted the revision attempts still asking for changes.
+  await recordQuality(workflowName, profile, lastResult.decision, 3, lastInternal);
   return lastResult;
 }
 
+/**
+ * Best-effort append of one gated run to the cross-engagement quality log: a failed
+ * write is logged and swallowed, because a metrics write must never break the gate.
+ */
+async function recordQuality(
+  workflow: string,
+  profile: GateProfile,
+  decision: GateResult['decision'],
+  attempts: number,
+  internal: Record<string, Grade>,
+  external?: Record<string, Grade>,
+  drift?: GradeDrift,
+): Promise<void> {
+  const entry: Parameters<typeof appendQualityRun>[0] = {
+    ts: new Date().toISOString(),
+    workflow,
+    profile,
+    decision,
+    attempts,
+    internal,
+  };
+  // Calibration fields are attached only when supplied, so an uncalibrated run carries no such keys.
+  if (external) entry.external = external;
+  if (drift) entry.drift = drift;
+  try {
+    await appendQualityRun(entry);
+  } catch (failure) {
+    const reason = failure instanceof Error ? failure.message : String(failure);
+    console.log(`${DIM}Quality run not recorded (${reason}).${RESET}`);
+  }
+}
+
 /**
  * Cold-context external-reviewer calibration. Runs AFTER the four gate
  * roles approve. The external-reviewer grades the 9 QUALITY_RUBRIC
@@ -1036,17 +1101,25 @@ Review the PR candidate against your role's merge-gate criteria per FACTORY_PREA
  * verdicts. The pipeline compares its grades against the aggregate of
  * internal grades; >1-letter drift on any dimension blocks release.
  *
- * Returns null if calibration passes. Returns a blocking GateResult
- * (decision: 'reject') if drift is detected — the feedback names which
- * dimension(s) diverged so the next attempt can re-invoke the right
- * role.
+ * Returns null only when the external-reviewer produced no parseable
+ * grades. Otherwise returns the internal + external grades and the drift,
+ * with `block` set to a blocking GateResult (decision: 'reject') when drift
+ * is detected — the feedback names which dimension(s) diverged so the next
+ * attempt can re-invoke the right role, and the grades feed the quality log.
  */
+interface CalibrationResult {
+  block: GateResult | null; // blocking REJECT when drift detected, else null
+  internal: Record<string, Grade>; // aggregateGrades() over the internal gate verdicts
+  external: Record<string, Grade>; // grades the external-reviewer produced for this run
+  drift: GradeDrift; // compareGrades(internal, external)
+}
+
 async function runExternalCalibration(
   runtime: AgentRuntime,
   workflowName: string,
   internalVerdicts: GateVerdict[],
   context: string,
-): Promise<GateResult | null> {
+): Promise<CalibrationResult | null> {
   console.log(`${CYAN}── External-reviewer calibration ──${RESET}\n`);
 
   const output = await runRoleSession(
@@ -1070,18 +1143,12 @@ Apply the 9-dimension QUALITY_RUBRIC to the post-merge tree. Output the QUALITY_
     return null;
   }
 
-  const internalGrades: Record<string, Grade> = {};
-  for (const v of internalVerdicts) {
-    if (v.advisory) continue;
-    for (const [dim, grade] of Object.entries(v.grades ?? {})) {
-      internalGrades[dim] = grade;
-    }
-  }
+  const internalGrades = aggregateGrades(internalVerdicts);
 
   const drift = compareGrades(internalGrades, externalGrades);
   if (drift.drifted.length === 0) {
     console.log(`${GREEN}External calibration aligned (max drift ${drift.maxDrift} letter).${RESET}\n`);
-    return null;
+    return { block: null, internal: internalGrades, external: externalGrades, drift };
   }
 
   const fmt = (g: Record<string, Grade>) =>
@@ -1090,8 +1157,13 @@ Apply the 9-dimension QUALITY_RUBRIC to the post-merge tree. Output the QUALITY_
       .join('\n');
 
   return {
-    decision: 'reject',
-    feedback: `External-reviewer calibration flagged ${drift.drifted.length} dimension(s) with >1-letter drift: ${drift.drifted.join(', ')}. Max drift: ${drift.maxDrift} letter(s). Re-invoke the diverged role(s) with the external-reviewer's citations.\n\nInternal grades:\n${fmt(internalGrades)}\n\nExternal grades:\n${fmt(externalGrades)}`,
+    block: {
+      decision: 'reject',
+      feedback: `External-reviewer calibration flagged ${drift.drifted.length} dimension(s) with >1-letter drift: ${drift.drifted.join(', ')}. Max drift: ${drift.maxDrift} letter(s). Re-invoke the diverged role(s) with the external-reviewer's citations.\n\nInternal grades:\n${fmt(internalGrades)}\n\nExternal grades:\n${fmt(externalGrades)}`,
+    },
+    internal: internalGrades,
+    external: externalGrades,
+    drift,
   };
 }