Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions __tests__/gate.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
applySelfReviewDowngrade,
parseQualityGrades,
compareGrades,
aggregateGrades,
parseCitations,
verifyCitations,
} from '../src/gate.js';
Expand Down Expand Up @@ -194,6 +195,41 @@ describe('mergeGateVerdicts', () => {
});
});

describe('aggregateGrades', () => {
  /** Build an approving verdict for `role`; only grades and the advisory flag vary between cases. */
  function verdictFor(role: TeamRole, grades?: Record<string, Grade>, advisory = false): GateVerdict {
    return { role, verdict: 'APPROVE', feedback: '', advisory, grades };
  }

  it('merges disjoint per-role dimensions into one map', () => {
    const reviewer = verdictFor('pr-reviewer', { architecture: 'B+', code_quality: 'A-' });
    const security = verdictFor('qa-security', { security: 'A' });

    const merged = aggregateGrades([reviewer, security]);

    expect(merged).toEqual({ architecture: 'B+', code_quality: 'A-', security: 'A' });
  });

  it('skips advisory verdicts (self-review downgrades carry no weight)', () => {
    const advisoryReviewer = verdictFor('pr-reviewer', { architecture: 'A' }, true);
    const security = verdictFor('qa-security', { security: 'B' });

    const merged = aggregateGrades([advisoryReviewer, security]);

    // Only the non-advisory verdict contributes.
    expect(merged).toEqual({ security: 'B' });
  });

  it('lets a later verdict win on a collision', () => {
    const earlier = verdictFor('pr-reviewer', { architecture: 'C' });
    const later = verdictFor('qa-security', { architecture: 'A' });

    const merged = aggregateGrades([earlier, later]);

    expect(merged).toEqual({ architecture: 'A' });
  });

  it('tolerates verdicts without grades', () => {
    const gradeless = verdictFor('pr-reviewer');

    expect(aggregateGrades([gradeless])).toEqual({});
  });
});

describe('applySelfReviewDowngrade', () => {
it('downgrades conflicted role to advisory', () => {
const verdicts: GateVerdict[] = [
Expand Down
102 changes: 102 additions & 0 deletions __tests__/quality.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { unlink } from 'node:fs/promises';
import { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa, type QualityRun } from '../src/quality.js';

// Path of the throwaway quality log; the value is read from FAB_QUALITY_FILE.
const QUALITY_FILE = process.env.FAB_QUALITY_FILE!;

/**
 * Best-effort removal of the quality log between tests.
 * Any failure to unlink is deliberately ignored and the promise still resolves.
 */
function cleanup(): Promise<void> {
  return Promise.resolve()
    .then(() => unlink(QUALITY_FILE))
    .catch(() => undefined);
}

/**
 * Fixture factory: a baseline approved code-profile run, with any field
 * replaceable through `overrides`. The baseline is rebuilt on every call so
 * tests never share a grades object.
 */
function run(overrides: Partial<QualityRun> = {}): QualityRun {
  const baseline: QualityRun = {
    ts: '2026-06-22T00:00:00.000Z',
    workflow: 'feature-build',
    profile: 'code',
    decision: 'approve',
    attempts: 1,
    internal: { architecture: 'B+', security: 'A-' },
  };
  return { ...baseline, ...overrides };
}

// Suite for the quality log: append/load round-trips, the grade-to-GPA mapping,
// and the rendered trend table.
describe('quality', () => {
// Remove the log file before and after every case so each one starts from "no file".
beforeEach(cleanup);
afterEach(cleanup);

it('loadQualityRuns returns [] when no file exists', async () => {
expect(await loadQualityRuns()).toEqual([]);
});

it('appendQualityRun + loadQualityRuns roundtrips records in order', async () => {
await appendQualityRun(run({ ts: '2026-06-20T00:00:00.000Z', workflow: 'a' }));
await appendQualityRun(run({ ts: '2026-06-21T00:00:00.000Z', workflow: 'b' }));
const runs = await loadQualityRuns();
expect(runs.map((r) => r.workflow)).toEqual(['a', 'b']);
expect(runs[0].internal).toEqual({ architecture: 'B+', security: 'A-' });
});

it('appends as JSONL (one record per line), tolerating a trailing newline', async () => {
await appendQualityRun(run());
await appendQualityRun(run());
const runs = await loadQualityRuns();
// Two appends must read back as exactly two records.
expect(runs).toHaveLength(2);
});

it('preserves optional external grades and drift', async () => {
await appendQualityRun(
run({
external: { architecture: 'B', security: 'A-' },
drift: { drifted: ['architecture'], maxDrift: 1 },
}),
);
const [r] = await loadQualityRuns();
expect(r.external).toEqual({ architecture: 'B', security: 'A-' });
expect(r.drift).toEqual({ drifted: ['architecture'], maxDrift: 1 });
});

describe('gradeToGpa', () => {
it('maps letters with +/- to a 0–4.3 scale', () => {
// toBeCloseTo is used where a 0.3 offset is involved, to tolerate float rounding.
expect(gradeToGpa('A+')).toBeCloseTo(4.3);
expect(gradeToGpa('A')).toBe(4);
expect(gradeToGpa('A-')).toBeCloseTo(3.7);
expect(gradeToGpa('B')).toBe(3);
expect(gradeToGpa('F')).toBe(0);
});

it('returns null for N/A so it is excluded from averages', () => {
expect(gradeToGpa('N/A')).toBeNull();
});

// NOTE(review): this repeats the 'F' assertion from the first case; it never
// feeds gradeToGpa a grade whose value would fall below zero before clamping.
it('floors at 0 (no negative GPA)', () => {
expect(gradeToGpa('F')).toBe(0);
});
});

describe('formatQualityTrend', () => {
it('reports an empty-state message with no runs', () => {
expect(formatQualityTrend([])).toMatch(/No quality runs recorded/);
});

it('prefers external grades over internal for the trend', () => {
const out = formatQualityTrend([run({ internal: { architecture: 'A' }, external: { architecture: 'C' } })]);
// External C (2.00) wins over internal A (4.00).
// The pattern pins the table row: dimension name, N = 1, then overall and recent GPA.
expect(out).toMatch(/architecture\s+1\s+2\.00\s+2\.00/);
});

it('counts approvals, calibration coverage and drift in the footer', () => {
// Only the first run carries external grades (1 of 2 calibrated), and its
// drift list is empty (0 of 1 calibrated runs drifted).
const out = formatQualityTrend([
run({ decision: 'approve', external: { architecture: 'B' }, drift: { drifted: [], maxDrift: 0 } }),
run({ decision: 'reject' }),
]);
expect(out).toMatch(/2 runs/);
expect(out).toMatch(/50% approved/);
expect(out).toMatch(/50% calibrated/);
expect(out).toMatch(/0% drifted/);
});
});
});
3 changes: 3 additions & 0 deletions __tests__/setup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ import { join } from 'node:path';
// Build a per-process throwaway path in the OS temp directory.
const scratchFile = (stem: string, extension: string): string =>
  join(tmpdir(), `fab-test-${stem}-${process.pid}.${extension}`);

// Redirect fab's state file so the suite never reads or writes the real
// ~/.fab/state.json.
process.env.FAB_STATE_FILE = scratchFile('state', 'json');

// Likewise for the quality log: the real ~/.fab/quality.jsonl stays untouched.
process.env.FAB_QUALITY_FILE = scratchFile('quality', 'jsonl');
5 changes: 4 additions & 1 deletion src/bin/fab.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import { executeRoleSession } from '../runtimes/role-session.js';
import { ADVISOR_TOOL, hasAdvisorAccess } from '../advisor.js';
import { aggregateUsage, formatUsageReport } from '../usage.js';
import { loadPerf, formatPerfReport } from '../perf.js';
import { loadQualityRuns, formatQualityTrend } from '../quality.js';
import { deliverResult } from '../webhook.js';
import { parseArgs, type ParsedArgs } from '../args.js';
import type {
Expand Down Expand Up @@ -1159,6 +1160,8 @@ async function model(args: ParsedArgs): Promise<void> {
/**
 * `fab perf`: print the agent performance report, then the quality trend
 * separated by a blank line.
 */
async function perf(): Promise<void> {
  const perfData = await loadPerf();
  console.log(formatPerfReport(perfData));

  const qualityRuns = await loadQualityRuns();
  const trend = formatQualityTrend(qualityRuns);
  console.log(`\n${trend}`);
}

// ── Budget command ──────────────────────────────────────────────────
Expand Down Expand Up @@ -1447,7 +1450,7 @@ USAGE
fab usage [--since <date>] Token usage and cost report
fab budget [set|clear] <dollars> Per-session cost limit
fab export <session-id> Extract artifacts to local disk
fab perf Agent performance metrics
fab perf Agent performance metrics + quality trend

fab scaffold <description...> Full product scaffold [--deploy] [--timeline] [--client] [--webhook <url>]

Expand Down
17 changes: 17 additions & 0 deletions src/gate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,23 @@ export interface GradeDrift {
maxDrift: number; // largest letter-level gap observed
}

/**
 * Fold the per-dimension QUALITY_GRADES of several gate verdicts into one map.
 *
 * Advisory verdicts (self-review downgrades) are dropped before merging.
 * Verdicts are applied in array order, so when two of them grade the same
 * dimension the later one overwrites the earlier one. A verdict without
 * grades contributes nothing.
 *
 * @param verdicts gate verdicts in the order they should be applied
 * @returns a fresh dimension → grade map
 */
export function aggregateGrades(verdicts: GateVerdict[]): Record<string, Grade> {
  return verdicts
    .filter((verdict) => !verdict.advisory)
    .reduce<Record<string, Grade>>((merged, verdict) => Object.assign(merged, verdict.grades), {});
}

/**
* Compare internal gate grades against external-reviewer grades.
*
Expand Down
2 changes: 2 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ export { resolveMcpServers, getRegistry } from './mcp.js';
export { getWorkflow, listWorkflows, executeWorkflow, reviseWorkflow, streamWithAdvisor } from './workflows.js';
export { ADVISOR_TOOL, callAdvisor } from './advisor.js';
export { loadPerf, collectSessionMetrics, formatPerfReport } from './perf.js';
export { appendQualityRun, loadQualityRuns, formatQualityTrend, gradeToGpa } from './quality.js';
export type { QualityRun } from './quality.js';
export { deliverResult } from './webhook.js';
export { aggregateUsage, formatUsageReport } from './usage.js';
export { getAllSkillDefs, getSkillDef, loadSkillContent, previewSkillContent, resolveNanohypePath } from './skills.js';
Expand Down
142 changes: 142 additions & 0 deletions src/quality.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { appendFile, readFile, mkdir } from 'node:fs/promises';
import { homedir } from 'node:os';
import { dirname, join } from 'node:path';
import type { Grade, GradeDrift } from './gate.js';
import type { GateDecision } from './types.js';

// ── Quality trend — the factory's own grade record ──────────────────
//
// The merge gate grades every PR across the 9 QUALITY_RUBRIC dimensions,
// and the external-reviewer calibration re-grades cold. Until now those
// grades drove a single ship/block decision and were discarded. This
// module appends one record per gated run so the factory can answer the
// question it could not before: are my grades trending up or down across
// engagements?
//
// The log lives next to state.json under ~/.fab so the signal spans every
// repo the factory ships — a cross-engagement trend, not one working tree.
// Override with FAB_QUALITY_FILE (used by tests; an escape hatch) — mirrors
// FAB_STATE_FILE. Append-only JSONL: one self-describing record per line,
// cheap to write mid-pipeline and trivial to tail.

/**
 * Resolve where the quality log lives: the FAB_QUALITY_FILE override when it
 * is set, otherwise quality.jsonl inside the .fab directory of the user's home.
 */
function qualityFile(): string {
  const override = process.env.FAB_QUALITY_FILE;
  if (override !== undefined) return override;
  return join(homedir(), '.fab', 'quality.jsonl');
}

/**
 * One record of the quality log, serialized as a single JSON line per run.
 * `external` and `drift` are optional; the remaining fields are always present.
 */
export interface QualityRun {
ts: string; // ISO timestamp the run was recorded
workflow: string; // workflow name (e.g. 'feature-build')
profile: 'code' | 'docs'; // gate profile
decision: GateDecision; // final gate outcome
attempts: number; // 1-based revision attempts the gate took
internal: Record<string, Grade>; // aggregate of the gate roles' QUALITY_GRADES
external?: Record<string, Grade>; // external-reviewer cold grades (code profile, when parseable)
drift?: GradeDrift; // calibration drift vs internal (present iff external is)
}

/**
 * Append one run to the quality log as a single JSON line, creating the
 * log's parent directory first if it does not exist yet.
 *
 * @param run the record to persist
 */
export async function appendQualityRun(run: QualityRun): Promise<void> {
  const target = qualityFile();
  const parent = dirname(target);
  await mkdir(parent, { recursive: true });

  const line = `${JSON.stringify(run)}\n`;
  await appendFile(target, line, { encoding: 'utf-8' });
}

/** Minimal structural check: the fields the trend renderer dereferences unconditionally. */
function isQualityRun(value: unknown): value is QualityRun {
  if (typeof value !== 'object' || value === null) return false;
  const record = value as Record<string, unknown>;
  return typeof record.ts === 'string' && typeof record.internal === 'object' && record.internal !== null;
}

/**
 * Load every run from the quality log, in file order.
 *
 * A missing or unreadable log reads as "no history" and yields `[]`.
 * Lines are parsed independently: blank lines, lines that are not valid JSON
 * (for example a record truncated by an interrupted write) and records
 * lacking a string `ts` or an object `internal` are skipped, so one bad line
 * no longer discards the rest of the history.
 *
 * @returns the parseable records, oldest append first
 */
export async function loadQualityRuns(): Promise<QualityRun[]> {
  let raw: string;
  try {
    raw = await readFile(qualityFile(), 'utf-8');
  } catch {
    // Best-effort read: no file (or no access) means nothing recorded yet.
    return [];
  }

  const runs: QualityRun[] = [];
  for (const line of raw.split('\n')) {
    if (line.trim().length === 0) continue;
    try {
      const parsed: unknown = JSON.parse(line);
      if (isQualityRun(parsed)) runs.push(parsed);
    } catch {
      // Corrupt line: skip it and keep the remaining records.
    }
  }
  return runs;
}

// A grade is exactly one base letter with an optional single +/- modifier.
const GRADE_PATTERN = /^([A-DF])([+-])?$/;
const LETTER_POINTS: Readonly<Record<string, number>> = { A: 4, B: 3, C: 2, D: 1, F: 0 };

/**
 * Map a letter grade to a 0–4.3 GPA so per-dimension trends are comparable.
 *
 * `+` / `-` shift the base letter by 0.3 and the result is clamped to
 * [0, 4.3], so a minus on F cannot go negative. `N/A` and anything that is
 * not exactly a letter with an optional modifier (including non-string values
 * read back from the log) return null and are excluded from averages — a
 * dimension that doesn't apply carries no signal.
 *
 * @param grade the letter grade to convert
 * @returns the GPA value, or null when the grade carries no signal
 */
export function gradeToGpa(grade: Grade): number | null {
  // Records come from JSON on disk, so the static type is not a runtime guarantee.
  if (typeof grade !== 'string') return null;

  const match = GRADE_PATTERN.exec(grade);
  if (match === null) return null;

  const [, letter = '', modifier] = match;
  const base = LETTER_POINTS[letter];
  if (base === undefined) return null;

  let value = base;
  if (modifier === '+') value += 0.3;
  if (modifier === '-') value -= 0.3;
  return Math.max(0, Math.min(4.3, value));
}

/**
 * Pick the grade that represents `dim` for a run: the external calibration
 * grade when the run has one for that dimension, otherwise the internal gate
 * grade (which may itself be absent).
 */
function effectiveGrade(run: QualityRun, dim: string): Grade | undefined {
  const calibrated = run.external?.[dim];
  if (calibrated != null) return calibrated;
  return run.internal[dim];
}

// How many of a dimension's latest data points feed the RECENT average in the trend table.
const RECENT_WINDOW = 5;

/** Arithmetic mean of `values`; an empty list averages to 0 rather than NaN. */
function mean(values: number[]): number {
  if (values.length === 0) return 0;

  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / values.length;
}

/**
 * Render the quality trend across recorded runs as a per-dimension table:
 * overall GPA vs the recent window, with a direction arrow, followed by a
 * one-line summary footer. Declining dimensions are the point — they surface
 * in red.
 *
 * Each dimension uses its effective grade per run (external when present,
 * otherwise internal); grades with no GPA value are left out of the series.
 * ANSI styling is emitted only when stdout is a TTY.
 *
 * @param runs recorded runs in any order; they are sorted by timestamp here
 * @returns the table as a newline-joined string, or an empty-state message
 */
export function formatQualityTrend(runs: QualityRun[]): string {
  if (runs.length === 0) return 'No quality runs recorded yet. Run a gated workflow first.';

  const tty = Boolean(process.stdout.isTTY);
  const DIM = tty ? '\x1b[2m' : '';
  const BOLD = tty ? '\x1b[1m' : '';
  const RED = tty ? '\x1b[31m' : '';
  const GREEN = tty ? '\x1b[32m' : '';
  const RESET = tty ? '\x1b[0m' : '';

  // Chronological order (records are appended in order, but sort defensively).
  const ordered = [...runs].sort((a, b) => a.ts.localeCompare(b.ts));

  // Per-dimension GPA series in run order, using the effective grade.
  const series = new Map<string, number[]>();
  for (const run of ordered) {
    const dims = new Set([...Object.keys(run.internal), ...Object.keys(run.external ?? {})]);
    for (const dim of dims) {
      const grade = effectiveGrade(run, dim);
      if (!grade) continue;
      const gpa = gradeToGpa(grade);
      if (gpa === null) continue;
      const points = series.get(dim);
      if (points === undefined) series.set(dim, [gpa]);
      else points.push(gpa);
    }
  }

  const lines: string[] = [];
  lines.push(`${BOLD}QUALITY TREND${RESET}`);
  lines.push(
    `${BOLD}${'DIMENSION'.padEnd(18)} ${'N'.padStart(4)} ${'OVERALL'.padStart(8)} ${'RECENT'.padStart(8)} ${'TREND'.padStart(6)}${RESET}`,
  );

  const rows = [...series.entries()].sort((a, b) => a[0].localeCompare(b[0]));
  for (const [dim, values] of rows) {
    const overall = mean(values);
    const recent = mean(values.slice(-RECENT_WINDOW));
    const delta = recent - overall;

    let glyph = '→';
    let color = DIM;
    if (delta > 0.15) {
      glyph = '↑';
      color = GREEN;
    } else if (delta < -0.15) {
      glyph = '↓';
      color = RED;
    }
    // Pad the bare glyph, then colour it: padding the coloured string would
    // count the invisible escape bytes and leave the column unaligned on a TTY.
    const arrow = `${color}${glyph.padStart(6)}${RESET}`;

    lines.push(
      `${dim.padEnd(18)} ${String(values.length).padStart(4)} ${overall.toFixed(2).padStart(8)} ${recent.toFixed(2).padStart(8)} ${arrow}`,
    );
  }

  // Summary footer: run count, approval rate, calibration coverage, drift rate.
  const total = ordered.length;
  const approved = ordered.filter((r) => r.decision === 'approve').length;
  const calibrated = ordered.filter((r) => r.external && Object.keys(r.external).length > 0);
  const drifted = calibrated.filter((r) => (r.drift?.drifted.length ?? 0) > 0).length;
  // Drift is a share of calibrated runs only; with none calibrated it reads 'n/a'.
  const pct = (n: number, d: number) => (d === 0 ? 'n/a' : `${Math.round((n / d) * 100)}%`);
  lines.push(
    `${DIM}${total} runs · ${pct(approved, total)} approved · ${pct(calibrated.length, total)} calibrated · ${pct(drifted, calibrated.length)} drifted${RESET}`,
  );

  return lines.join('\n');
}
Loading
Loading