From 7f660df7022a32fb246eb9218998147c5fa08b93 Mon Sep 17 00:00:00 2001
From: stxkxs <139715017+stxkxs@users.noreply.github.com>
Date: Wed, 3 Jun 2026 17:24:36 -0700
Subject: [PATCH] =?UTF-8?q?feat(p2):=20mid-2026=20capability=20evaluations?=
 =?UTF-8?q?=20=E2=80=94=20advisor,=20tool=20surface,=20compaction,=20tieri?=
 =?UTF-8?q?ng?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The four P2 evaluations from the mid-2026 Claude/Anthropic capabilities review,
each researched against primary docs + the installed SDK types and adversarially
verified before deciding. Recurring finding: fab's default (managed-agents)
transport and Bedrock lag several newer Messages-API features, so three of the
four "adopt native feature X" ideas correctly resolve to keep / can't-yet /
already-handled — recorded in-code so the decisions aren't silently reversed.

─── P2.b · advisor tool → keep custom consult_advisor ───

Anthropic's native advisor tool (beta advisor-tool-2026-03-01) can't preserve
fab's invariants: it offers only a per-REQUEST max_uses (no per-session budget),
and it is in beta on the Claude API + Claude Platform on AWS only — not Bedrock,
not the Managed Agents toolset. Rationale recorded as a header comment in
src/advisor.ts.

─── P2.c · Tool Search / defer_loading → can't adopt yet; cost made visible ───

Corrected a wrong assumption (that the eager tool surface was hypothetical):
23 of 83 roles wire ≥4 MCP servers (github alone exposes ~50 tools), so a role
can load 100+ tool definitions eagerly. But defer_loading / Tool Search is
Messages-API-only — not exposed by the Managed Agents API. Added
summarizeToolSurface() plus a single deploy-time line surfacing the latent
context cost (no per-role noise), and documented the constraint and revisit
trigger in src/mcp.ts. Tested, including against the live roster.

─── P2.d · compaction → already automatic; nothing to wire ───

The installed Agent SDK 0.3.x types show the Claude Code loop auto-compacts
(SDKStatus 'compacting' + Pre/PostCompact hooks); it is not a query() option and
the SDK betas union only accepts context-1m-2025-08-07. managed-agents handles
long context via its durable session log. Recorded at the sdk.ts query() site.

─── P2.e · Haiku / effort tiering → pilot methodology, no blind flips ───

fab runs 82 roles on Sonnet, 0 on Haiku (a real $1/$5 vs $3/$15 lever), but
flipping defaults without eval data silently risks quality, and firm roles skip
the merge gate. docs/roster.md gains a Model tiering section: candidate roles,
the data-driven pilot path via `fab model set`, and the effort deferral.

No behavior change beyond the deploy-time tool-surface line. Verified: npm run
lint / build / format:check clean; npm test passes.

Co-authored-by: stxkxsbot <275011021+stxkxsbot@users.noreply.github.com>
---
 __tests__/mcp.test.ts | 37 ++++++++++++++++++++++++++++++++++++-
 docs/roster.md        | 19 +++++++++++++++++++
 src/advisor.ts        | 27 +++++++++++++++++++++++++++
 src/bin/fab.ts        | 13 ++++++++++++-
 src/mcp.ts            | 43 +++++++++++++++++++++++++++++++++++++++++++
 src/runtimes/sdk.ts   |  9 +++++++++
 6 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/__tests__/mcp.test.ts b/__tests__/mcp.test.ts
index ec7dcda..c5c023b 100644
--- a/__tests__/mcp.test.ts
+++ b/__tests__/mcp.test.ts
@@ -1,5 +1,12 @@
 import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
-import { resolveMcpServers, getRegistry, parseTunnelRegistry } from '../src/mcp.js';
+import {
+  resolveMcpServers,
+  getRegistry,
+  parseTunnelRegistry,
+  summarizeToolSurface,
+  HEAVY_TOOL_SURFACE,
+} from '../src/mcp.js';
+import { TEAM } from '../src/team.js';
 
 describe('mcp', () => {
   it('getRegistry returns all servers', () => {
@@ -131,4 +138,32 @@ describe('mcp', () => {
       expect(servers[0].url).not.toBe('https://attacker.example/mcp');
     });
   });
+
+  describe('summarizeToolSurface', () => {
+    it('counts roles at or above the heavy threshold and tracks the max', () => {
+      const roles = [
+        { mcpServers: ['github'] },
+        { mcpServers: ['github', 'linear'] },
+        { mcpServers: ['github', 'linear', 'slack', 'sentry'] }, // heavy (4)
+        { mcpServers: ['github', 'linear', 'slack', 'sentry', 'notion'] }, // heavy (5)
+      ];
+      const s = summarizeToolSurface(roles);
+      expect(HEAVY_TOOL_SURFACE).toBe(4);
+      expect(s.totalRoles).toBe(4);
+      expect(s.heavyRoles).toBe(2);
+      expect(s.maxServers).toBe(5);
+    });
+
+    it('reports zero heavy roles when all are light', () => {
+      const s = summarizeToolSurface([{ mcpServers: [] }, { mcpServers: ['github'] }, { mcpServers: ['a', 'b', 'c'] }]);
+      expect(s.heavyRoles).toBe(0);
+      expect(s.maxServers).toBe(3);
+    });
+
+    it('reflects the live roster — TEAM carries heavy roles (the eager tool surface is real, not hypothetical)', () => {
+      const s = summarizeToolSurface(TEAM);
+      expect(s.totalRoles).toBeGreaterThan(0);
+      expect(s.heavyRoles).toBeGreaterThan(0);
+    });
+  });
 });
diff --git a/docs/roster.md b/docs/roster.md
index eb092e3..6ac0c3e 100644
--- a/docs/roster.md
+++ b/docs/roster.md
@@ -136,3 +136,22 @@ $FAB_SKILLS_DIR → ~/.fab/skills/ → <cwd>/.fab/skills/ → bundled fab/skills
 ```
 
 Curators and engineers have bundled baselines at `fab/skills/<def.name>.md`. Override any of them via `~/.fab/skills/<def.name>.md` (replace) or `<def.name>.append.md` (append). The brief-typed roles (`product`, `design-lead`, `sales-lead`, `marketing-lead`) resolve to nanohype brief templates by default.
+
+## Model tiering
+
+Every role declares a `model` in `src/team/<phase>/<area>.ts`. The current spread is deliberate but not yet cost-tuned:
+
+- **82 roles on `claude-sonnet-4-6`** — the default for all factory + firm work.
+- **2 lab roles on Opus** (`external-reviewer`, `prompt-optimizer`) plus the `consult_advisor` escalation (`src/advisor.ts`) — Opus where deep reasoning or cold calibration earns it.
+- **0 roles on `claude-haiku-4-5`** — an open cost opportunity. Haiku is $1/$5 per MTok vs Sonnet's $3/$15 (3× cheaper), a good fit for classification / routing / filter / low-stakes-high-volume work.
+
+**Haiku candidates** (a shape, not a mandate — pilot before promoting): `lead-research-curator`, `lead-outbound`, `lead-events`, `seo-engineer`, and similar firm roles whose output is short-form, templated, or a filter step. Caveat: firm roles do **not** pass the merge gate, so a quality regression there isn't caught automatically — pilot deliberately rather than flipping defaults blind.
+
+**Pilot methodology** (don't change a default on a guess):
+
+1. Override at runtime, no redeploy of defaults: `fab model set <role> claude-haiku-4-5`.
+2. Run the role through representative workflows.
+3. Grade the output — the merge gate + `external-reviewer` calibration for factory roles; a manual read (or `external-reviewer`) for firm roles.
+4. Promote (edit the role's `model` in `src/team/*`) only if quality holds; otherwise `fab model clear <role>` to roll back.
+
+**The `effort` parameter** (GA on the Messages API for Opus 4.6+) is deferred: it would need an `AgentCreateParams` shape change and — like context compaction and the Tool Search tool — is not currently exposed on the Managed Agents agent-create surface that fab's default transport uses. Revisit once the Managed Agents API exposes it.
diff --git a/src/advisor.ts b/src/advisor.ts
index 28fb7fc..e54e040 100644
--- a/src/advisor.ts
+++ b/src/advisor.ts
@@ -1,5 +1,32 @@
 import type { CustomTool, TeamRole } from './types.js';
 
+// ── Senior advisor escalation — fab's own mechanism, deliberately NOT the
+// native advisor tool ──────────────────────────────────────────────────
+//
+// `consult_advisor` is a custom, client-executed tool: a gated role escalates
+// one hard decision to a senior Opus advisor (`callAdvisor` makes a separate
+// `/v1/messages` call). Three invariants make this fab-specific rather than a
+// drop-in for Anthropic's native advisor tool (beta `advisor-tool-2026-03-01`):
+//
+//   1. Role-gating — only `ADVISOR_ROLES` get the tool, applied at deploy time
+//      by `advisorToolsFor` (bin/fab.ts). The native tool is just an entry in a
+//      request/agent toolset and carries no role policy of its own.
+//   2. Per-SESSION call budget — `streamWithAdvisor` caps escalations across a
+//      whole multi-turn session (`maxAdvisorCalls`, default 3). The native
+//      tool's `max_uses` is a per-REQUEST cap only; the docs state it has "no
+//      built-in conversation-level cap", so a per-session budget there means
+//      hand-stripping `advisor_tool_result` blocks from history (fragile).
+//   3. Separate pinned Opus — `callAdvisor` always targets `ADVISOR_MODEL`
+//      regardless of the caller's model, uniformly across all four transports
+//      (the interception lives in the shared stream consumer).
+//
+// Availability also rules out a swap: the native advisor tool is beta on the
+// Claude API and Claude Platform on AWS only — not Bedrock/Vertex/Foundry and
+// not in the Managed Agents toolset, i.e. neither fab's default (managed-agents)
+// nor regulated (Bedrock) path. Verified against platform.claude.com docs,
+// 2026-06. Revisit only if the native tool gains a per-conversation budget AND
+// Managed Agents support.
+
 const BASE = 'https://api.anthropic.com';
 const ADVISOR_MODEL = 'claude-opus-4-8';
 
diff --git a/src/bin/fab.ts b/src/bin/fab.ts
index 623c6d0..e7b37bf 100644
--- a/src/bin/fab.ts
+++ b/src/bin/fab.ts
@@ -36,7 +36,7 @@ import {
   setBudgetLimit,
 } from '../state.js';
 import { getAllSkillDefs, getSkillDef, loadSkillContent, previewSkillContent, resolveNanohypePath } from '../skills.js';
-import { resolveMcpServers } from '../mcp.js';
+import { resolveMcpServers, summarizeToolSurface, HEAVY_TOOL_SURFACE } from '../mcp.js';
 import { buildSystemPrompt } from '../prompts.js';
 import { resolveSandboxMode, environmentConfig } from '../sandbox.js';
 import { getWorkflow, listWorkflows, executeWorkflow, reviseWorkflow, streamWithAdvisor } from '../workflows.js';
@@ -319,6 +319,17 @@ async function deploy(args: ParsedArgs): Promise<void> {
 
   const skillCount = Object.keys(skillRefs).length;
   console.log(`\nDeployed — ${state.agents.length} agents, ${skillCount} skills`);
+
+  // Surface eager-loaded MCP tool-surface pressure (see mcp.ts). Managed Agents
+  // has no defer_loading/Tool Search yet, so heavy rosters pay it in context.
+  const surface = summarizeToolSurface(TEAM);
+  if (surface.heavyRoles > 0) {
+    console.log(
+      `Tool surface — ${surface.heavyRoles}/${surface.totalRoles} roles wire ≥${HEAVY_TOOL_SURFACE} MCP servers ` +
+        `(max ${surface.maxServers}); MCP tool definitions load eagerly. Tool Search/defer_loading (the native ` +
+        `~85% reduction) is not yet available on the Managed Agents transport.`,
+    );
+  }
   if (failedSkills.length > 0) {
     console.error(`\nWARNING: ${failedSkills.length} skills failed to upload: ${failedSkills.join(', ')}`);
     console.error('These agents were deployed without their domain skills.');
diff --git a/src/mcp.ts b/src/mcp.ts
index 0661c1c..0e5cc9a 100644
--- a/src/mcp.ts
+++ b/src/mcp.ts
@@ -229,3 +229,46 @@ export function resolveMcpServers(serverNames: string[]): { servers: McpServer[]
 export function getRegistry(): Record<string, McpServerDef> {
   return fullRegistry();
 }
+
+// ── Tool-surface visibility (P2.c — Tool Search / defer_loading) ───────
+//
+// resolveMcpServers wires every server's tools as an always-on `mcp_toolset`,
+// so all of a role's MCP tools load into context eagerly at session start.
+// Because one heavy server is already large (github alone exposes ~50 tools),
+// a role wiring 4-5 servers can carry well over a hundred tool definitions —
+// tens of thousands of tokens — before it does any work, and tool-selection
+// accuracy degrades past ~30-50 available tools.
+//
+// Anthropic's native fix is the Tool Search Tool + `defer_loading` (tools load
+// on demand; ~85% definition-token reduction). As of 2026-06 that is a
+// Messages-API feature — available on the Claude API, Claude Platform on AWS,
+// and Microsoft Foundry, but NOT exposed by the Managed Agents API (whose
+// mcp_toolset config supports only `enabled`, no `defer_loading`) and NOT on
+// Bedrock. So fab cannot defer tool loading on its default (managed-agents) or
+// regulated (bedrock) paths today. The lever fab DOES have on Managed Agents is
+// per-tool `enabled` via the mcp_toolset `configs` array — a future curation
+// pass could allow-list the tools a heavy role actually uses.
+//
+// summarizeToolSurface makes the current pressure visible at deploy time;
+// revisit defer_loading adoption when the Managed Agents API exposes it.
+
+/** Roles wiring at least this many MCP servers carry a heavy eager-loaded tool surface. */
+export const HEAVY_TOOL_SURFACE = 4;
+
+export interface ToolSurfaceSummary {
+  totalRoles: number;
+  heavyRoles: number; // roles wiring >= HEAVY_TOOL_SURFACE servers
+  maxServers: number;
+}
+
+/** Summarize eager-loaded MCP tool-surface pressure across the roster. */
+export function summarizeToolSurface(roles: ReadonlyArray<{ mcpServers: string[] }>): ToolSurfaceSummary {
+  let heavyRoles = 0;
+  let maxServers = 0;
+  for (const r of roles) {
+    const n = r.mcpServers.length;
+    if (n >= HEAVY_TOOL_SURFACE) heavyRoles++;
+    if (n > maxServers) maxServers = n;
+  }
+  return { totalRoles: roles.length, heavyRoles, maxServers };
+}
diff --git a/src/runtimes/sdk.ts b/src/runtimes/sdk.ts
index 6df170e..9f74dcc 100644
--- a/src/runtimes/sdk.ts
+++ b/src/runtimes/sdk.ts
@@ -119,6 +119,15 @@ class SdkAgentSession implements AgentSession {
     });
 
     const backendEnv = inferenceEnv(this.backend);
+    // Context compaction is automatic here, so there is nothing to wire: the
+    // Agent SDK runs the Claude Code agent loop, which auto-compacts when the
+    // window fills (surfaced as a `compacting` status + Pre/PostCompact hooks).
+    // It is NOT a query() option — there is no `context_management` knob, and
+    // the SDK's `betas` option only accepts `context-1m-2025-08-07`, never
+    // `compact-2026-01-12` — so the raw Messages-API compaction config does not
+    // apply to this runtime. managed-agents handles long context via its durable
+    // session log. Verified against the installed @anthropic-ai/claude-agent-sdk
+    // 0.3.x types, 2026-06.
     this.sdkQuery = this.sdk.query({
       prompt: inputs,
       options: {