From 7f660df7022a32fb246eb9218998147c5fa08b93 Mon Sep 17 00:00:00 2001 From: stxkxs <139715017+stxkxs@users.noreply.github.com> Date: Wed, 3 Jun 2026 17:24:36 -0700 Subject: [PATCH] =?UTF-8?q?feat(p2):=20mid-2026=20capability=20evaluations?= =?UTF-8?q?=20=E2=80=94=20advisor,=20tool=20surface,=20compaction,=20tieri?= =?UTF-8?q?ng?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The four P2 evaluations from the mid-2026 Claude/Anthropic capabilities review, each researched against primary docs + the installed SDK types and adversarially verified before deciding. Recurring finding: fab's default (managed-agents) transport and Bedrock lag several newer Messages-API features, so three of the four "adopt native feature X" ideas correctly resolve to keep / can't-yet / already-handled — recorded in-code so the decisions aren't silently reversed. ─── P2.b · advisor tool → keep custom consult_advisor ─── Anthropic's native advisor tool (beta advisor-tool-2026-03-01) can't preserve fab's invariants: only a per-REQUEST max_uses (no per-session budget), and it's beta on the Claude API + Claude Platform on AWS only — not Bedrock, not the Managed Agents toolset. Rationale recorded as a header comment in src/advisor.ts. ─── P2.c · Tool Search / defer_loading → can't adopt yet; cost made visible ─── Corrected a wrong assumption: 23 of 83 roles wire ≥4 MCP servers (github alone exposes ~50 tools), so a role can load 100+ tool definitions eagerly. But defer_loading / Tool Search is Messages-API-only — not exposed by the Managed Agents API. Added summarizeToolSurface() + a single deploy-time line surfacing the latent context cost (no per-role noise), documented the constraint + revisit trigger in src/mcp.ts. Tested, incl. against the live roster. 
─── P2.d · compaction → already automatic; nothing to wire ─── The installed Agent SDK 0.3.x types show the Claude Code loop auto-compacts (SDKStatus 'compacting' + Pre/PostCompact hooks); it is not a query() option and the SDK betas union only accepts context-1m-2025-08-07. managed-agents handles long context via its durable session log. Recorded at the sdk.ts query() site. ─── P2.e · Haiku / effort tiering → pilot methodology, no blind flips ─── fab runs 82 roles on Sonnet, 0 on Haiku (a real $1/$5 vs $3/$15 lever), but flipping defaults without eval data silently risks quality and firm roles skip the merge gate. docs/roster.md gains a Model tiering section: candidate roles, the data-driven pilot path via `fab model set`, and the effort deferral. No behavior change beyond the deploy-time tool-surface line. Verified: npm run lint / build / format:check clean; npm test passes. Co-authored-by: stxkxsbot <275011021+stxkxsbot@users.noreply.github.com> --- __tests__/mcp.test.ts | 37 ++++++++++++++++++++++++++++++++++++- docs/roster.md | 19 +++++++++++++++++++ src/advisor.ts | 27 +++++++++++++++++++++++++++ src/bin/fab.ts | 13 ++++++++++++- src/mcp.ts | 43 +++++++++++++++++++++++++++++++++++++++++++ src/runtimes/sdk.ts | 9 +++++++++ 6 files changed, 146 insertions(+), 2 deletions(-) diff --git a/__tests__/mcp.test.ts b/__tests__/mcp.test.ts index ec7dcda..c5c023b 100644 --- a/__tests__/mcp.test.ts +++ b/__tests__/mcp.test.ts @@ -1,5 +1,12 @@ import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; -import { resolveMcpServers, getRegistry, parseTunnelRegistry } from '../src/mcp.js'; +import { + resolveMcpServers, + getRegistry, + parseTunnelRegistry, + summarizeToolSurface, + HEAVY_TOOL_SURFACE, +} from '../src/mcp.js'; +import { TEAM } from '../src/team.js'; describe('mcp', () => { it('getRegistry returns all servers', () => { @@ -131,4 +138,32 @@ describe('mcp', () => { expect(servers[0].url).not.toBe('https://attacker.example/mcp'); }); }); + + 
describe('summarizeToolSurface', () => { + it('counts roles at or above the heavy threshold and tracks the max', () => { + const roles = [ + { mcpServers: ['github'] }, + { mcpServers: ['github', 'linear'] }, + { mcpServers: ['github', 'linear', 'slack', 'sentry'] }, // heavy (4) + { mcpServers: ['github', 'linear', 'slack', 'sentry', 'notion'] }, // heavy (5) + ]; + const s = summarizeToolSurface(roles); + expect(HEAVY_TOOL_SURFACE).toBe(4); + expect(s.totalRoles).toBe(4); + expect(s.heavyRoles).toBe(2); + expect(s.maxServers).toBe(5); + }); + + it('reports zero heavy roles when all are light', () => { + const s = summarizeToolSurface([{ mcpServers: [] }, { mcpServers: ['github'] }, { mcpServers: ['a', 'b', 'c'] }]); + expect(s.heavyRoles).toBe(0); + expect(s.maxServers).toBe(3); + }); + + it('reflects the live roster — TEAM carries heavy roles (the eager tool surface is real, not hypothetical)', () => { + const s = summarizeToolSurface(TEAM); + expect(s.totalRoles).toBeGreaterThan(0); + expect(s.heavyRoles).toBeGreaterThan(0); + }); + }); }); diff --git a/docs/roster.md b/docs/roster.md index eb092e3..6ac0c3e 100644 --- a/docs/roster.md +++ b/docs/roster.md @@ -136,3 +136,22 @@ $FAB_SKILLS_DIR → ~/.fab/skills/ → <repo>/.fab/skills/ → bundled fab/skills ``` Curators and engineers have bundled baselines at `fab/skills/<role>.md`. Override any of them via `~/.fab/skills/<role>.md` (replace) or `<role>.append.md` (append). The brief-typed roles (`product`, `design-lead`, `sales-lead`, `marketing-lead`) resolve to nanohype brief templates by default. + +## Model tiering + +Every role declares a `model` in `src/team/<group>/<role>.ts`. The current spread is deliberate but not yet cost-tuned: + +- **82 roles on `claude-sonnet-4-6`** — the default for all factory + firm work. +- **2 lab roles on Opus** (`external-reviewer`, `prompt-optimizer`) plus the `consult_advisor` escalation (`src/advisor.ts`) — Opus where deep reasoning or cold calibration earns it. 
+- **0 roles on `claude-haiku-4-5`** — an open cost opportunity. Haiku is $1/$5 per MTok vs Sonnet's $3/$15 (3× cheaper), a good fit for classification / routing / filter / low-stakes-high-volume work. + +**Haiku candidates** (a shape, not a mandate — pilot before promoting): `lead-research-curator`, `lead-outbound`, `lead-events`, `seo-engineer`, and similar firm roles whose output is short-form, templated, or a filter step. Caveat: firm roles do **not** go through the merge gate, so a quality regression there isn't caught automatically — pilot deliberately rather than flipping defaults blind. + +**Pilot methodology** (don't change a default on a guess): + +1. Override at runtime, no redeploy of defaults: `fab model set <role> claude-haiku-4-5`. +2. Run the role through representative workflows. +3. Grade the output — the merge gate + `external-reviewer` calibration for factory roles; a manual read (or `external-reviewer`) for firm roles. +4. Promote (edit the role's `model` in `src/team/*`) only if quality holds; otherwise `fab model clear <role>` to roll back. + +**The `effort` parameter** (GA on the Messages API for Opus 4.6+) is deferred: it would need an `AgentCreateParams` shape change and — like context compaction and the Tool Search tool — is not currently exposed on the Managed Agents agent-create surface fab's default transport uses. Revisit once the Managed Agents API carries it. diff --git a/src/advisor.ts b/src/advisor.ts index 28fb7fc..e54e040 100644 --- a/src/advisor.ts +++ b/src/advisor.ts @@ -1,5 +1,32 @@ import type { CustomTool, TeamRole } from './types.js'; +// ── Senior advisor escalation — fab's own mechanism, deliberately NOT the +// native advisor tool ────────────────────────────────────────────────── +// +// `consult_advisor` is a custom, client-executed tool: a gated role escalates +// one hard decision to a senior Opus advisor (`callAdvisor` makes a separate +// `/v1/messages` call). 
Three invariants make this fab-specific rather than a +// drop-in for Anthropic's native advisor tool (beta `advisor-tool-2026-03-01`): +// +// 1. Role-gating — only `ADVISOR_ROLES` get the tool, applied at deploy time +// by `advisorToolsFor` (bin/fab.ts). The native tool is just an entry in a +// request/agent toolset and carries no role policy of its own. +// 2. Per-SESSION call budget — `streamWithAdvisor` caps escalations across a +// whole multi-turn session (`maxAdvisorCalls`, default 3). The native +// tool's `max_uses` is a per-REQUEST cap only; the docs state it has "no +// built-in conversation-level cap", so a per-session budget there means +// hand-stripping `advisor_tool_result` blocks from history (fragile). +// 3. Separate pinned Opus — `callAdvisor` always targets `ADVISOR_MODEL` +// regardless of the caller's model, uniformly across all four transports +// (the interception lives in the shared stream consumer). +// +// Availability also rules out a swap: the native advisor tool is beta on the +// Claude API and Claude Platform on AWS only — not Bedrock/Vertex/Foundry and +// not in the Managed Agents toolset, i.e. neither fab's default (managed-agents) +// nor regulated (Bedrock) path. Verified against platform.claude.com docs, +// 2026-06. Revisit only if the native tool gains a per-conversation budget AND +// Managed Agents support. 
+ const BASE = 'https://api.anthropic.com'; const ADVISOR_MODEL = 'claude-opus-4-8'; diff --git a/src/bin/fab.ts b/src/bin/fab.ts index 623c6d0..e7b37bf 100644 --- a/src/bin/fab.ts +++ b/src/bin/fab.ts @@ -36,7 +36,7 @@ import { setBudgetLimit, } from '../state.js'; import { getAllSkillDefs, getSkillDef, loadSkillContent, previewSkillContent, resolveNanohypePath } from '../skills.js'; -import { resolveMcpServers } from '../mcp.js'; +import { resolveMcpServers, summarizeToolSurface, HEAVY_TOOL_SURFACE } from '../mcp.js'; import { buildSystemPrompt } from '../prompts.js'; import { resolveSandboxMode, environmentConfig } from '../sandbox.js'; import { getWorkflow, listWorkflows, executeWorkflow, reviseWorkflow, streamWithAdvisor } from '../workflows.js'; @@ -319,6 +319,17 @@ async function deploy(args: ParsedArgs): Promise { const skillCount = Object.keys(skillRefs).length; console.log(`\nDeployed — ${state.agents.length} agents, ${skillCount} skills`); + + // Surface eager-loaded MCP tool-surface pressure (see mcp.ts). Managed Agents + // has no defer_loading/Tool Search yet, so heavy rosters pay it in context. + const surface = summarizeToolSurface(TEAM); + if (surface.heavyRoles > 0) { + console.log( + `Tool surface — ${surface.heavyRoles}/${surface.totalRoles} roles wire ≥${HEAVY_TOOL_SURFACE} MCP servers ` + + `(max ${surface.maxServers}); MCP tool definitions load eagerly. 
Tool Search/defer_loading (the native ` + + `~85% reduction) is not yet available on the Managed Agents transport.`, + ); + } if (failedSkills.length > 0) { console.error(`\nWARNING: ${failedSkills.length} skills failed to upload: ${failedSkills.join(', ')}`); console.error('These agents were deployed without their domain skills.'); diff --git a/src/mcp.ts b/src/mcp.ts index 0661c1c..0e5cc9a 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -229,3 +229,46 @@ export function resolveMcpServers(serverNames: string[]): { servers: McpServer[] export function getRegistry(): Record { return fullRegistry(); } + +// ── Tool-surface visibility (P2.c — Tool Search / defer_loading) ─────── +// +// resolveMcpServers wires every server's tools as an always-on `mcp_toolset`, +// so all of a role's MCP tools load into context eagerly at session start. +// A single heavy server (github exposes ~50 tools) means a role wiring 4-5 +// servers can carry well over a hundred tool definitions — tens of thousands +// of tokens — before it does any work, and tool-selection accuracy degrades +// past ~30-50 available tools. +// +// Anthropic's native fix is the Tool Search Tool + `defer_loading` (tools load +// on demand; ~85% definition-token reduction). As of 2026-06 that is a +// Messages-API feature — available on the Claude API, Claude Platform on AWS, +// and Microsoft Foundry, but NOT exposed by the Managed Agents API (whose +// mcp_toolset config supports only `enabled`, no `defer_loading`) and NOT on +// Bedrock. So fab cannot defer tool loading on its default (managed-agents) or +// regulated (bedrock) paths today. The lever fab DOES have on Managed Agents is +// per-tool `enabled` via the mcp_toolset `configs` array — a future curation +// pass could allow-list the tools a heavy role actually uses. +// +// summarizeToolSurface makes the current pressure visible at deploy time; +// revisit defer_loading adoption when the Managed Agents API exposes it. 
+ +/** Roles wiring at least this many MCP servers carry a heavy eager-loaded tool surface. */ +export const HEAVY_TOOL_SURFACE = 4; + +export interface ToolSurfaceSummary { + totalRoles: number; + heavyRoles: number; // roles wiring >= HEAVY_TOOL_SURFACE servers + maxServers: number; +} + +/** Summarize eager-loaded MCP tool-surface pressure across the roster. */ +export function summarizeToolSurface(roles: ReadonlyArray<{ mcpServers: string[] }>): ToolSurfaceSummary { + let heavyRoles = 0; + let maxServers = 0; + for (const r of roles) { + const n = r.mcpServers.length; + if (n >= HEAVY_TOOL_SURFACE) heavyRoles++; + if (n > maxServers) maxServers = n; + } + return { totalRoles: roles.length, heavyRoles, maxServers }; +} diff --git a/src/runtimes/sdk.ts b/src/runtimes/sdk.ts index 6df170e..9f74dcc 100644 --- a/src/runtimes/sdk.ts +++ b/src/runtimes/sdk.ts @@ -119,6 +119,15 @@ class SdkAgentSession implements AgentSession { }); const backendEnv = inferenceEnv(this.backend); + // Context compaction is automatic here, so there is nothing to wire: the + // Agent SDK runs the Claude Code agent loop, which auto-compacts when the + // window fills (surfaced as a `compacting` status + Pre/PostCompact hooks). + // It is NOT a query() option — there is no `context_management` knob, and + // the SDK's `betas` option only accepts `context-1m-2025-08-07`, never + // `compact-2026-01-12` — so the raw Messages-API compaction config does not + // apply to this runtime. managed-agents handles long context via its durable + // session log. Verified against the installed @anthropic-ai/claude-agent-sdk + // 0.3.x types, 2026-06. this.sdkQuery = this.sdk.query({ prompt: inputs, options: {