nanohype · stxkxs · Jun 24, 2026 · Jun 22, 2026
diff --git a/__tests__/gate.test.ts b/__tests__/gate.test.ts
@@ -5,6 +5,7 @@ import {
   applySelfReviewDowngrade,
   parseQualityGrades,
   compareGrades,
+  aggregateGrades,
   parseCitations,
   verifyCitations,
 } from '../src/gate.js';
@@ -194,6 +195,41 @@ describe('mergeGateVerdicts', () => {
   });
 });
 
+describe('aggregateGrades', () => {
+  const v = (role: TeamRole, grades?: Record<string, Grade>, advisory = false): GateVerdict => ({
+    role,
+    verdict: 'APPROVE', // fixed filler: aggregateGrades only reads `advisory` and `grades`
+    feedback: '',
+    advisory,
+    grades,
+  });
+
+  it('merges disjoint per-role dimensions into one map', () => {
+    const merged = aggregateGrades([
+      v('pr-reviewer', { architecture: 'B+', code_quality: 'A-' }),
+      v('qa-security', { security: 'A' }),
+    ]);
+    expect(merged).toEqual({ architecture: 'B+', code_quality: 'A-', security: 'A' });
+  });
+
+  it('skips advisory verdicts (self-review downgrades carry no weight)', () => {
+    const merged = aggregateGrades([
+      v('pr-reviewer', { architecture: 'A' }, true), // advisory: must not reach the merged map
+      v('qa-security', { security: 'B' }),
+    ]);
+    expect(merged).toEqual({ security: 'B' });
+  });
+
+  it('lets a later verdict win on a collision', () => {
+    const merged = aggregateGrades([v('pr-reviewer', { architecture: 'C' }), v('qa-security', { architecture: 'A' })]);
+    expect(merged).toEqual({ architecture: 'A' }); // array order, not grade value, decides the winner
+  });
+
+  it('tolerates verdicts without grades', () => {
+    expect(aggregateGrades([v('pr-reviewer')])).toEqual({}); // `grades` is undefined here
+  });
+});
+
 describe('applySelfReviewDowngrade', () => {
   it('downgrades conflicted role to advisory', () => {
     const verdicts: GateVerdict[] = [

diff --git a/__tests__/quality.test.ts b/__tests__/quality.test.ts
@@ -0,0 +1,102 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { unlink } from 'node:fs/promises';
+import { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa, type QualityRun } from '../src/quality.js';
+
+const QUALITY_FILE = process.env.FAB_QUALITY_FILE!;
+
+async function cleanup(): Promise<void> {
+  // Best-effort removal of the quality log between tests: the file may not
+  // exist, so a failed unlink is deliberately swallowed.
+  await unlink(QUALITY_FILE).catch(() => {
+    /* ignore */
+  });
+}
+
+function run(overrides: Partial<QualityRun> = {}): QualityRun {
+  const baseline: QualityRun = { // defaults; a test overrides only the fields it cares about
+    ts: '2026-06-22T00:00:00.000Z',
+    workflow: 'feature-build',
+    profile: 'code',
+    decision: 'approve',
+    attempts: 1,
+    internal: { architecture: 'B+', security: 'A-' },
+  };
+  return { ...baseline, ...overrides };
+}
+
+describe('quality', () => {
+  beforeEach(cleanup); // every test starts without a quality log on disk
+  afterEach(cleanup);
+
+  it('loadQualityRuns returns [] when no file exists', async () => {
+    expect(await loadQualityRuns()).toEqual([]);
+  });
+
+  it('appendQualityRun + loadQualityRuns roundtrips records in order', async () => {
+    await appendQualityRun(run({ ts: '2026-06-20T00:00:00.000Z', workflow: 'a' }));
+    await appendQualityRun(run({ ts: '2026-06-21T00:00:00.000Z', workflow: 'b' }));
+    const runs = await loadQualityRuns();
+    expect(runs.map((r) => r.workflow)).toEqual(['a', 'b']);
+    expect(runs[0].internal).toEqual({ architecture: 'B+', security: 'A-' });
+  });
+
+  it('appends as JSONL (one record per line), tolerating a trailing newline', async () => {
+    await appendQualityRun(run());
+    await appendQualityRun(run());
+    const runs = await loadQualityRuns();
+    expect(runs).toHaveLength(2); // the final '\n' must not produce a third, empty record
+  });
+
+  it('preserves optional external grades and drift', async () => {
+    await appendQualityRun(
+      run({
+        external: { architecture: 'B', security: 'A-' },
+        drift: { drifted: ['architecture'], maxDrift: 1 },
+      }),
+    );
+    const [r] = await loadQualityRuns();
+    expect(r.external).toEqual({ architecture: 'B', security: 'A-' });
+    expect(r.drift).toEqual({ drifted: ['architecture'], maxDrift: 1 });
+  });
+
+  describe('gradeToGpa', () => {
+    it('maps letters with +/- to a 0–4.3 scale', () => {
+      expect(gradeToGpa('A+')).toBeCloseTo(4.3);
+      expect(gradeToGpa('A')).toBe(4);
+      expect(gradeToGpa('A-')).toBeCloseTo(3.7);
+      expect(gradeToGpa('B')).toBe(3);
+      expect(gradeToGpa('F')).toBe(0);
+    });
+
+    it('returns null for N/A so it is excluded from averages', () => {
+      expect(gradeToGpa('N/A')).toBeNull();
+    });
+
+    it('floors at 0 (no negative GPA)', () => {
+      expect(gradeToGpa('F')).toBe(0); // NOTE(review): repeats the 'F' check above; only 'F-' would hit the clamp — confirm Grade allows it
+    });
+  });
+
+  describe('formatQualityTrend', () => {
+    it('reports an empty-state message with no runs', () => {
+      expect(formatQualityTrend([])).toMatch(/No quality runs recorded/);
+    });
+
+    it('prefers external grades over internal for the trend', () => {
+      const out = formatQualityTrend([run({ internal: { architecture: 'A' }, external: { architecture: 'C' } })]);
+      // External C (2.00) wins over internal A (4.00).
+      expect(out).toMatch(/architecture\s+1\s+2\.00\s+2\.00/);
+    });
+
+    it('counts approvals, calibration coverage and drift in the footer', () => {
+      const out = formatQualityTrend([
+        run({ decision: 'approve', external: { architecture: 'B' }, drift: { drifted: [], maxDrift: 0 } }),
+        run({ decision: 'reject' }), // no external grades: counts as uncalibrated
+      ]);
+      expect(out).toMatch(/2 runs/);
+      expect(out).toMatch(/50% approved/);
+      expect(out).toMatch(/50% calibrated/);
+      expect(out).toMatch(/0% drifted/); // drift rate is taken over calibrated runs only
+    });
+  });
+});
diff --git a/__tests__/setup.ts b/__tests__/setup.ts
@@ -4,3 +4,6 @@ import { join } from 'node:path';
 // Point fab's state file at a throwaway temp path so the test suite never
 // reads or writes the real ~/.fab/state.json.
 process.env.FAB_STATE_FILE = join(tmpdir(), `fab-test-state-${process.pid}.json`);
+
+// Same for the quality log — never touch the real ~/.fab/quality.jsonl.
+process.env.FAB_QUALITY_FILE = join(tmpdir(), `fab-test-quality-${process.pid}.jsonl`);
diff --git a/src/bin/fab.ts b/src/bin/fab.ts
@@ -45,6 +45,7 @@ import { executeRoleSession } from '../runtimes/role-session.js';
 import { ADVISOR_TOOL, hasAdvisorAccess } from '../advisor.js';
 import { aggregateUsage, formatUsageReport } from '../usage.js';
 import { loadPerf, formatPerfReport } from '../perf.js';
+import { loadQualityRuns, formatQualityTrend } from '../quality.js';
 import { deliverResult } from '../webhook.js';
 import { parseArgs, type ParsedArgs } from '../args.js';
 import type {
@@ -1159,6 +1160,8 @@ async function model(args: ParsedArgs): Promise<void> {
 async function perf(): Promise<void> {
   const data = await loadPerf();
   console.log(formatPerfReport(data));
+  const qualityRuns = await loadQualityRuns();
+  console.log(`\n${formatQualityTrend(qualityRuns)}`);
 }
 
 // ── Budget command ──────────────────────────────────────────────────
@@ -1447,7 +1450,7 @@ USAGE
   fab usage [--since <date>]        Token usage and cost report
   fab budget [set|clear] <dollars>  Per-session cost limit
   fab export <session-id>           Extract artifacts to local disk
-  fab perf                          Agent performance metrics
+  fab perf                          Agent performance metrics + quality trend
 
   fab scaffold <description...>     Full product scaffold [--deploy] [--timeline] [--client] [--webhook <url>]
 

diff --git a/src/gate.ts b/src/gate.ts
@@ -464,6 +464,23 @@ export interface GradeDrift {
   maxDrift: number; // largest letter-level gap observed
 }
 
+/**
+ * Fold the QUALITY_GRADES of a set of gate verdicts into one per-dimension
+ * map. Advisory verdicts (self-review downgrades) carry no weight and are
+ * left out, as are verdicts without grades. Gate roles own disjoint
+ * dimensions, so collisions are rare; on one, the later verdict wins.
+ */
+export function aggregateGrades(verdicts: GateVerdict[]): Record<string, Grade> {
+  const merged: Record<string, Grade> = {};
+  for (const { grades } of verdicts.filter((verdict) => !verdict.advisory)) {
+    // Array order decides collisions: a later assignment overwrites an earlier one.
+    for (const [dimension, grade] of Object.entries(grades ?? {})) {
+      merged[dimension] = grade;
+    }
+  }
+  return merged;
+}
+
 /**
  * Compare internal gate grades against external-reviewer grades.
  *

diff --git a/src/index.ts b/src/index.ts
@@ -7,6 +7,8 @@ export { resolveMcpServers, getRegistry } from './mcp.js';
 export { getWorkflow, listWorkflows, executeWorkflow, reviseWorkflow, streamWithAdvisor } from './workflows.js';
 export { ADVISOR_TOOL, callAdvisor } from './advisor.js';
 export { loadPerf, collectSessionMetrics, formatPerfReport } from './perf.js';
+export { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa } from './quality.js';
+export type { QualityRun } from './quality.js';
 export { deliverResult } from './webhook.js';
 export { aggregateUsage, formatUsageReport } from './usage.js';
 export { getAllSkillDefs, getSkillDef, loadSkillContent, previewSkillContent, resolveNanohypePath } from './skills.js';

diff --git a/src/quality.ts b/src/quality.ts
@@ -0,0 +1,142 @@
+import { appendFile, readFile, mkdir } from 'node:fs/promises';
+import { homedir } from 'node:os';
+import { dirname, join } from 'node:path';
+import type { Grade, GradeDrift } from './gate.js';
+import type { GateDecision } from './types.js';
+
+// ── Quality trend — the factory's own grade record ──────────────────
+//
+// The merge gate grades every PR across the 9 QUALITY_RUBRIC dimensions,
+// and the external-reviewer calibration re-grades cold. Until now those
+// grades drove a single ship/block decision and were discarded. This
+// module appends one record per gated run so the factory can answer the
+// question it could not before: are my grades trending up or down across
+// engagements?
+//
+// The log lives next to state.json under ~/.fab so the signal spans every
+// repo the factory ships — a cross-engagement trend, not one working tree.
+// Override with FAB_QUALITY_FILE (used by tests; an escape hatch) — mirrors
+// FAB_STATE_FILE. Append-only JSONL: one self-describing record per line,
+// cheap to write mid-pipeline and trivial to tail.
+
+function qualityFile(): string {
+  return process.env.FAB_QUALITY_FILE ?? join(homedir(), '.fab', 'quality.jsonl'); // env override, else ~/.fab/quality.jsonl
+}
+
+export interface QualityRun {
+  ts: string; // ISO timestamp the run was recorded; formatQualityTrend orders runs by comparing this string
+  workflow: string; // workflow name (e.g. 'feature-build')
+  profile: 'code' | 'docs'; // gate profile
+  decision: GateDecision; // final gate outcome
+  attempts: number; // 1-based revision attempts the gate took
+  internal: Record<string, Grade>; // aggregate of the gate roles' QUALITY_GRADES
+  external?: Record<string, Grade>; // external-reviewer cold grades (code profile, when parseable); preferred over internal in the trend
+  drift?: GradeDrift; // calibration drift vs internal (present iff external is)
+}
+
+export async function appendQualityRun(run: QualityRun): Promise<void> {
+  const target = qualityFile();
+  await mkdir(dirname(target), { recursive: true }); // the parent directory may not exist yet
+  await appendFile(target, `${JSON.stringify(run)}\n`, 'utf-8'); // one JSONL record per line
+}
+
+export async function loadQualityRuns(): Promise<QualityRun[]> {
+  const raw = await readFile(qualityFile(), 'utf-8').catch(() => ''); // missing/unreadable log → empty history
+  return raw.split('\n').flatMap((line) => {
+    if (line.trim().length === 0) return []; // blank lines, including the trailing newline
+    try {
+      return [JSON.parse(line) as QualityRun];
+    } catch {
+      return []; // skip a torn/corrupt record instead of discarding every run
+    }
+  });
+}
+
+/**
+ * Map a letter grade to a 0–4.3 GPA so per-dimension trends are comparable.
+ * +/- shift the base letter by 0.3, clamped to the scale (so F- floors at 0).
+ * N/A and anything that is not exactly A–D or F with at most one +/- return
+ * null and are excluded from averages — such a grade carries no signal.
+ */
+export function gradeToGpa(grade: Grade): number | null {
+  // Anchor the whole string: charAt(0)/endsWith alone would score 'Bogus' or 'A++'.
+  const match = /^([ABCDF])([+-]?)$/.exec(grade);
+  if (match === null) return null; // 'N/A' and unrecognized values
+  const base: Record<string, number> = { A: 4, B: 3, C: 2, D: 1, F: 0 };
+  const shift = match[2] === '+' ? 0.3 : match[2] === '-' ? -0.3 : 0;
+  const value = (base[match[1] ?? ''] ?? 0) + shift;
+  // Clamp to the documented scale; this is what floors F- at 0.
+  return Math.max(0, Math.min(4.3, value));
+}
+
+// Effective grade for a dimension: the cold external calibration (the more objective signal) first, the internal gate grade as fallback.
+function effectiveGrade(run: QualityRun, dim: string): Grade | undefined {
+  const { external, internal } = run;
+  return external?.[dim] ?? internal[dim];
+}
+
+const RECENT_WINDOW = 5;
+
+function mean(values: number[]): number {
+  return values.length > 0 ? values.reduce((acc, v) => acc + v, 0) / values.length : 0; // empty input → 0
+}
+
+/**
+ * Render the quality trend as a per-dimension table: all-time mean GPA vs the
+ * mean of the last RECENT_WINDOW runs, an arrow when they differ by more than
+ * 0.15 (declines in red), and a footer of approval/calibration/drift rates.
+ * Takes runs in any order (sorted by `ts` here); ANSI styling is TTY-only.
+ */
+export function formatQualityTrend(runs: QualityRun[]): string {
+  const DIM = process.stdout.isTTY ? '\x1b[2m' : '';
+  const BOLD = process.stdout.isTTY ? '\x1b[1m' : '';
+  const RED = process.stdout.isTTY ? '\x1b[31m' : '';
+  const GREEN = process.stdout.isTTY ? '\x1b[32m' : '';
+  const RESET = process.stdout.isTTY ? '\x1b[0m' : '';
+  if (runs.length === 0) return 'No quality runs recorded yet. Run a gated workflow first.';
+
+  // Chronological order (records are appended in order, but sort defensively).
+  const ordered = [...runs].sort((a, b) => a.ts.localeCompare(b.ts));
+
+  // Per-dimension GPA series in run order, using the effective grade.
+  const series = new Map<string, number[]>();
+  for (const run of ordered) {
+    const dims = new Set([...Object.keys(run.internal), ...Object.keys(run.external ?? {})]);
+    for (const dim of dims) {
+      const grade = effectiveGrade(run, dim);
+      if (!grade) continue;
+      const gpa = gradeToGpa(grade);
+      if (gpa === null) continue; // N/A or unrecognized: excluded from the averages
+      if (!series.has(dim)) series.set(dim, []);
+      series.get(dim)!.push(gpa);
+    }
+  }
+
+  const lines: string[] = [];
+  lines.push(`${BOLD}QUALITY TREND${RESET}`);
+  lines.push(
+    `${BOLD}${'DIMENSION'.padEnd(18)} ${'N'.padStart(4)} ${'OVERALL'.padStart(8)} ${'RECENT'.padStart(8)} ${'TREND'.padStart(6)}${RESET}`,
+  );
+  // Pad the bare glyph, then colorize: padStart on an ANSI-wrapped string counts the escape bytes and never pads on a TTY.
+  const trend = (glyph: string, color: string): string => `${color}${glyph.padStart(6)}${RESET}`;
+  for (const [dim, values] of [...series.entries()].sort((a, b) => a[0].localeCompare(b[0]))) {
+    const overall = mean(values);
+    const recent = mean(values.slice(-RECENT_WINDOW));
+    const delta = recent - overall;
+    const arrow = delta > 0.15 ? trend('↑', GREEN) : delta < -0.15 ? trend('↓', RED) : trend('→', DIM);
+    lines.push(
+      `${dim.padEnd(18)} ${String(values.length).padStart(4)} ${overall.toFixed(2).padStart(8)} ${recent.toFixed(2).padStart(8)} ${arrow}`,
+    );
+  }
+
+  // Summary footer: run count, approval rate, calibration coverage, drift rate.
+  const total = ordered.length;
+  const approved = ordered.filter((r) => r.decision === 'approve').length;
+  const calibrated = ordered.filter((r) => r.external && Object.keys(r.external).length > 0);
+  const drifted = calibrated.filter((r) => (r.drift?.drifted.length ?? 0) > 0).length;
+  const pct = (n: number, d: number) => (d === 0 ? 'n/a' : `${Math.round((n / d) * 100)}%`);
+  lines.push(
+    `${DIM}${total} runs · ${pct(approved, total)} approved · ${pct(calibrated.length, total)} calibrated · ${pct(drifted, calibrated.length)} drifted${RESET}`,
+  );
+  return lines.join('\n');
+}