From b20074578419c1102f002a0e1e38eb43e035bf5a Mon Sep 17 00:00:00 2001 From: Frederik Prijck Date: Tue, 16 Jun 2026 15:37:15 +0200 Subject: [PATCH] feat(mcp): add protected HTTP MCP server support Adds OAuth-protected HTTP MCP server support so evals can target servers that require an Authorization: Bearer token (e.g. the Auth0 hosted MCP server, which authenticates with a Management API token). - mintMcpToken: per-job client-credentials token mint for HTTP MCP servers - MCPHttpServerConfig.auth (tokenUrl/clientId/clientSecret/audience) - All four runners forward the minted token: claude-code, copilot, and gemini-cli set an Authorization header on the server config; codex writes a bearer_token_env_var reference into config.toml and injects the token into its process env (an inline bearer_token is rejected by codex) - A failed token mint skips the server with a warning rather than registering it unauthenticated - sandbox.passthroughEnv: forward named host env vars into the Docker sandbox so MCP credentials reach the container - docs/PROTECTED_MCP.md setup guide --- AGENTS.md | 2 + apps/auth0-evals/eval.config.js | 23 ++++ docs/PROTECTED_MCP.md | 126 ++++++++++++++++++ packages/eval-core/src/config/defaults.ts | 2 + packages/eval-core/src/config/framework.ts | 29 ++++ packages/eval-core/src/config/mcp-auth.ts | 41 ++++++ packages/eval-core/src/index.ts | 3 + .../eval-core/tests/config/mcp-auth.test.ts | 56 ++++++++ packages/eval/src/cli/run.ts | 1 + .../eval/src/runners/claude-code/agent.ts | 24 +++- packages/eval/src/runners/codex/agent.ts | 59 +++++++- packages/eval/src/runners/copilot/agent.ts | 35 ++++- packages/eval/src/runners/gemini-cli/agent.ts | 34 ++++- packages/eval/src/sandbox/docker.ts | 16 ++- packages/eval/tests/docker.test.ts | 41 ++++++ .../eval/tests/runners/codex-agent.test.ts | 76 +++++++++++ .../eval/tests/runners/copilot-agent.test.ts | 79 +++++++++-- .../tests/runners/gemini-cli-agent.test.ts | 72 +++++++++- 18 files changed, 685 insertions(+), 
34 deletions(-) create mode 100644 docs/PROTECTED_MCP.md create mode 100644 packages/eval-core/src/config/mcp-auth.ts create mode 100644 packages/eval-core/tests/config/mcp-auth.test.ts diff --git a/AGENTS.md b/AGENTS.md index d5845885..bf491216 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -344,6 +344,8 @@ All agent runners have access to file/shell tools in their respective environmen When MCP tools are enabled (`--tools mcp`), MCP server tool definitions are appended to the tool list. +Authenticated HTTP MCP servers are configured with an `auth` block (`tokenUrl`, `clientId`, `clientSecret`, `audience`). The framework mints a Management API token per agent job via a client-credentials exchange and forwards it to the MCP server. All four runners support this: claude-code, copilot, and gemini-cli forward it as an `Authorization: Bearer` header in their server config; codex passes it via a `bearer_token_env_var` reference in `config.toml` (Codex rejects an inline token, so the secret stays out of the file). A failed token mint skips the server with a warning rather than registering it unauthenticated. Full setup guide: [docs/PROTECTED_MCP.md](docs/PROTECTED_MCP.md). + --- ## Models diff --git a/apps/auth0-evals/eval.config.js b/apps/auth0-evals/eval.config.js index 5d96cc0a..0d2eec42 100644 --- a/apps/auth0-evals/eval.config.js +++ b/apps/auth0-evals/eval.config.js @@ -44,6 +44,22 @@ export default { type: 'http', url: 'https://auth0.com/docs/mcp', }, + ...(process.env.MCP_TENANT_DOMAIN && + process.env.MCP_CLIENT_ID && + process.env.MCP_CLIENT_SECRET + ? 
{ + 'auth0-hosted-mcp': { + type: 'http', + url: `https://${process.env.MCP_TENANT_DOMAIN}/v1/mcp`, + auth: { + tokenUrl: `https://${process.env.MCP_TENANT_DOMAIN}/oauth/token`, + clientId: process.env.MCP_CLIENT_ID, + clientSecret: process.env.MCP_CLIENT_SECRET, + audience: `https://${process.env.MCP_TENANT_DOMAIN}/api/v2/`, + }, + }, + } + : {}), }, }, @@ -81,6 +97,13 @@ export default { }, + sandbox: { + // Host env vars forwarded into the Docker sandbox (names only; values resolved + // from process.env at launch). Needed so the authenticated auth0-hosted-mcp + // server can mint its token inside the container. + passthroughEnv: ['MCP_TENANT_DOMAIN', 'MCP_CLIENT_ID', 'MCP_CLIENT_SECRET'], + }, + braintrust: { projectId: '38395851-dd41-46ec-a971-a30402db6921', datasetName: 'auth0-evals', diff --git a/docs/PROTECTED_MCP.md b/docs/PROTECTED_MCP.md new file mode 100644 index 00000000..eda5fa65 --- /dev/null +++ b/docs/PROTECTED_MCP.md @@ -0,0 +1,126 @@ +# Protected MCP Servers + +This guide covers how to wire up a **protected HTTP MCP server** — one that requires an `Authorization: Bearer` token, such as the Auth0 hosted MCP server which authenticates with a Management API token. It explains the credentials you need, the config to add, and how the framework mints and forwards the token to every runner. + +--- + +## When to use this + +- You want an agent to use an MCP server that requires an `Authorization: Bearer` token rather than being publicly reachable. +- The credentials come from a Machine-to-Machine (client-credentials) application, and you want a fresh token minted per job rather than a long-lived secret baked into config. + +> **Runner support:** token forwarding is implemented for **all runners** — claude-code, copilot, gemini-cli, and codex. 
The first three forward the token as an `Authorization: Bearer` header in their MCP server config; codex passes it via a `bearer_token_env_var` reference in `config.toml` (Codex rejects an inline token, so the secret never lands in the file). + +--- + +## Prerequisites + +You need an Auth0 tenant with a **Machine-to-Machine application** authorized for the **Management API**: + +1. In the Auth0 Dashboard, create (or reuse) an M2M application. +2. Authorize it for the **Auth0 Management API** (`https://YOUR_TENANT/api/v2/`) with the scopes the task needs — e.g. `read:clients` to list applications. +3. Note the application's **Client ID** and **Client Secret**. + +> **Audience matters.** The hosted MCP server authenticates with a **Management API** token (`/api/v2/` audience). The `/v1/mcp` audience is reserved by Auth0 and returns `access_denied` for client credentials — so the `audience` field below points at `/api/v2/`, not at the MCP URL. + +--- + +## Step 1 — Set the environment variables + +The server entry in `eval.config.js` is gated on three env vars. If any is missing, the server is omitted (see [Troubleshooting](#troubleshooting)). + +```bash +export MCP_TENANT_DOMAIN="your-tenant.us.auth0.com" # no scheme, no trailing slash +export MCP_CLIENT_ID="your-m2m-client-id" +export MCP_CLIENT_SECRET="your-m2m-client-secret" +``` + +You can also set the LLM `--model` you intend to run; the proxy/model setup is unchanged from any other eval. + +--- + +## Step 2 — Register the MCP server in `eval.config.js` + +The Auth0 hosted MCP server is already registered in `apps/auth0-evals/eval.config.js`, gated on the env vars above: + +```js +mcp: { + servers: { + 'auth0-docs': { type: 'http', url: 'https://auth0.com/docs/mcp' }, + + ...(process.env.MCP_TENANT_DOMAIN && + process.env.MCP_CLIENT_ID && + process.env.MCP_CLIENT_SECRET + ? 
{ + 'auth0-hosted-mcp': { + type: 'http', + url: `https://${process.env.MCP_TENANT_DOMAIN}/v1/mcp`, + auth: { + tokenUrl: `https://${process.env.MCP_TENANT_DOMAIN}/oauth/token`, + clientId: process.env.MCP_CLIENT_ID, + clientSecret: process.env.MCP_CLIENT_SECRET, + audience: `https://${process.env.MCP_TENANT_DOMAIN}/api/v2/`, + }, + }, + } + : {}), + }, +}, +``` + +To wire up **a different** protected HTTP MCP server, add another entry with an `auth` block. The `auth` field is typed as `MCPOAuthConfig`: + +| Field | Meaning | +|---|---| +| `tokenUrl` | OAuth token endpoint, e.g. `https://TENANT/oauth/token` | +| `clientId` | Client ID for the client-credentials grant | +| `clientSecret` | Client secret for the client-credentials grant | +| `audience` | API audience the token is minted for, e.g. `https://TENANT/api/v2/` | + +Servers **without** an `auth` block (like `auth0-docs`) continue to work unauthenticated. + +--- + +## Step 3 — How the token is minted and forwarded + +You don't write any token code — the framework does it per job: + +1. When a job starts with `--tools mcp`, the active runner walks the configured MCP servers. +2. For each HTTP server with an `auth` block, it calls `mintMcpToken(auth)` — a **client-credentials** exchange (`grant_type=client_credentials`) against `tokenUrl` for the given `audience`. +3. The resulting token is forwarded to the MCP server. claude-code, copilot, and gemini-cli set it as an `Authorization: Bearer <token>` header in the server config; codex writes a `bearer_token_env_var` reference into `config.toml` and injects the token into the Codex process env under that name (Codex rejects an inline `bearer_token`, so the secret stays out of the config file). + +The token is minted **per job**, not at config-load time, so a long `--model all --mode all` matrix never reuses an expired token. 
+ +**Loud failure:** if the token mint fails (bad creds, network error, missing field), the server is **skipped with a `logger.warn`** rather than registered unauthenticated. This makes a misconfigured run look like "MCP wasn't available" — not a silent "the agent chose not to use MCP." + +--- + +## Sandbox credential passthrough + +When evals run in the Docker sandbox (the default), the framework can only mint a token inside the container if the credentials reach it. The three `MCP_*` vars are forwarded via `sandbox.passthroughEnv` in `eval.config.js`: + +```js +sandbox: { + passthroughEnv: ['MCP_TENANT_DOMAIN', 'MCP_CLIENT_ID', 'MCP_CLIENT_SECRET'], +}, +``` + +Only the **names** are listed here; values are resolved from `process.env` at job launch and never stored in config. Vars that aren't currently set on the host are skipped. + +--- + +## Troubleshooting + +| Symptom | Likely cause | +|---|---| +| Log: `MCP server 'auth0-hosted-mcp' skipped — token mint failed or creds missing` | The token mint failed: the token endpoint rejected the credentials, the request errored (e.g. a network failure), or the response had no `access_token`. With the env-gated entry above, unset `MCP_*` vars produce the next row instead; an ungated entry with an empty `auth` field also logs this. | +| `auth0-hosted-mcp` not registered at all (only `auth0-docs`) | The env-var gate in `eval.config.js` evaluated false — at least one of the three vars is empty. | +| Token mint returns `access_denied` | `audience` points at `/v1/mcp` instead of `/api/v2/`, or the M2M app isn't authorized for the Management API. | +| `401` late in a very long job | The minted token's TTL expired mid-job. Management API tokens are typically long-lived (hours) vs. the 30-min job timeout, so this is rare. | + +--- + +## Related docs + +- [docs/ADDING_EVALS.md](ADDING_EVALS.md) — grader primitives and how evals are structured. +- [AGENTS.md](../AGENTS.md) — framework overview, runner details, and the MCP auth summary. 
diff --git a/packages/eval-core/src/config/defaults.ts b/packages/eval-core/src/config/defaults.ts index 07e6ba27..db402c75 100644 --- a/packages/eval-core/src/config/defaults.ts +++ b/packages/eval-core/src/config/defaults.ts @@ -50,4 +50,6 @@ export const DEFAULT_FRAMEWORK_CONFIG: Required = { }, scoring: {}, + + sandbox: {}, }; diff --git a/packages/eval-core/src/config/framework.ts b/packages/eval-core/src/config/framework.ts index 09666de0..995fc537 100644 --- a/packages/eval-core/src/config/framework.ts +++ b/packages/eval-core/src/config/framework.ts @@ -37,11 +37,28 @@ export interface MCPStdioServerConfig { env?: Record; } +export interface MCPOAuthConfig { + /** OAuth token endpoint, e.g. https://TENANT/oauth/token */ + tokenUrl: string; + /** OAuth client ID for the client-credentials grant. */ + clientId: string; + /** OAuth client secret for the client-credentials grant. */ + clientSecret: string; + /** API audience the token is requested for, e.g. https://TENANT/api/v2/ */ + audience: string; +} + export interface MCPHttpServerConfig { /** URL-based MCP server. */ type: 'http'; /** HTTP URL for the remote MCP server. */ url: string; + /** + * Optional OAuth config. When present, the framework mints a fresh Bearer + * token per agent job and injects it as an Authorization header. If any + * field is empty (e.g. a missing env var), the server is omitted with a warning. + */ + auth?: MCPOAuthConfig; } /** Discriminated union — either a stdio (command-based) or http (URL-based) MCP server. */ @@ -119,6 +136,16 @@ export interface ScoringConfig { // ── Root config ────────────────────────────────────────────────────────────── +export interface SandboxConfig { + /** + * Names of host environment variables to forward into the Docker sandbox. + * Each name is resolved from `process.env` at job launch; only currently-set + * vars are forwarded. Use for app-specific secrets the framework can't know + * about (e.g. MCP server credentials). 
Names only — values are never stored here. + */ + passthroughEnv?: string[]; +} + export interface FrameworkConfig { /** Directory containing evaluation definitions (required). */ evalsDir: string; @@ -140,4 +167,6 @@ export interface FrameworkConfig { braintrust?: BraintrustConfig; /** Scoring behaviour overrides (e.g. custom doc URL allowlist). */ scoring?: ScoringConfig; + /** Docker sandbox settings (e.g. env vars to forward into the container). */ + sandbox?: SandboxConfig; } diff --git a/packages/eval-core/src/config/mcp-auth.ts b/packages/eval-core/src/config/mcp-auth.ts new file mode 100644 index 00000000..2c6e2ed2 --- /dev/null +++ b/packages/eval-core/src/config/mcp-auth.ts @@ -0,0 +1,41 @@ +/** + * OAuth token minting for authenticated HTTP MCP servers. + * + * Performs a client-credentials exchange to obtain a short-lived Bearer token. + * Called once per agent job so a long matrix run never reuses an expired token. + */ + +import type { MCPOAuthConfig } from './framework.js'; +import { logger } from '../utils/logger.js'; + +export async function mintMcpToken(auth: MCPOAuthConfig): Promise { + if (!auth.tokenUrl || !auth.clientId || !auth.clientSecret || !auth.audience) { + logger.warn('[mcp-auth] Incomplete OAuth config — skipping token mint'); + return undefined; + } + try { + const res = await fetch(auth.tokenUrl, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + grant_type: 'client_credentials', + client_id: auth.clientId, + client_secret: auth.clientSecret, + audience: auth.audience, + }), + }); + if (!res.ok) { + logger.warn(`[mcp-auth] Token request failed: ${res.status}`); + return undefined; + } + const { access_token } = (await res.json()) as { access_token?: string }; + if (!access_token) { + logger.warn('[mcp-auth] Token response missing access_token'); + return undefined; + } + return access_token; + } catch (err) { + logger.warn(`[mcp-auth] Token request error: ${err instanceof Error ? 
err.message : String(err)}`); + return undefined; + } +} diff --git a/packages/eval-core/src/index.ts b/packages/eval-core/src/index.ts index e6147132..b9cc5ae9 100644 --- a/packages/eval-core/src/index.ts +++ b/packages/eval-core/src/index.ts @@ -62,6 +62,7 @@ export type { MCPServerConfig, MCPStdioServerConfig, MCPHttpServerConfig, + MCPOAuthConfig, SkillsConfig, RemoteSkillRepo, JudgeConfig, @@ -69,10 +70,12 @@ export type { WorkspaceConfig, BraintrustConfig, ScoringConfig, + SandboxConfig, } from './config/framework.js'; export { DEFAULT_FRAMEWORK_CONFIG } from './config/defaults.js'; export { defineConfig, loadConfig, deepMerge } from './config/loader.js'; export type { LoadConfigOptions } from './config/loader.js'; +export { mintMcpToken } from './config/mcp-auth.js'; // Workspace export { diff --git a/packages/eval-core/tests/config/mcp-auth.test.ts b/packages/eval-core/tests/config/mcp-auth.test.ts new file mode 100644 index 00000000..1b6418bb --- /dev/null +++ b/packages/eval-core/tests/config/mcp-auth.test.ts @@ -0,0 +1,56 @@ +import { describe, it, expect, vi, afterEach } from 'vitest'; +import { mintMcpToken } from '../../src/config/mcp-auth.js'; +import type { MCPOAuthConfig } from '../../src/config/framework.js'; + +const validAuth: MCPOAuthConfig = { + tokenUrl: 'https://tenant.us.auth0.com/oauth/token', + clientId: 'client-id', + clientSecret: 'client-secret', + audience: 'https://tenant.us.auth0.com/api/v2/', +}; + +afterEach(() => { + vi.restoreAllMocks(); +}); + +describe('mintMcpToken', () => { + it('returns the access_token on a successful exchange', async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + json: async () => ({ access_token: 'tok-123' }), + }); + vi.stubGlobal('fetch', fetchMock); + + const token = await mintMcpToken(validAuth); + + expect(token).toBe('tok-123'); + expect(fetchMock).toHaveBeenCalledOnce(); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe(validAuth.tokenUrl); + const body = 
JSON.parse((init as RequestInit).body as string); + expect(body).toMatchObject({ + grant_type: 'client_credentials', + client_id: 'client-id', + client_secret: 'client-secret', + audience: validAuth.audience, + }); + }); + + it('returns undefined when the response is not ok', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ ok: false, json: async () => ({}) })); + expect(await mintMcpToken(validAuth)).toBeUndefined(); + }); + + it('returns undefined without calling fetch when a credential is missing', async () => { + const fetchMock = vi.fn(); + vi.stubGlobal('fetch', fetchMock); + const token = await mintMcpToken({ ...validAuth, clientSecret: '' }); + expect(token).toBeUndefined(); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it('returns undefined when the body has no access_token', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ ok: true, json: async () => ({}) })); + expect(await mintMcpToken(validAuth)).toBeUndefined(); + }); +}); diff --git a/packages/eval/src/cli/run.ts b/packages/eval/src/cli/run.ts index 19da6f06..23f4b123 100644 --- a/packages/eval/src/cli/run.ts +++ b/packages/eval/src/cli/run.ts @@ -153,6 +153,7 @@ async function runAgentJob( agentType, apiKey, ghToken: process.env.GH_TOKEN, + passthroughEnv: getFrameworkConfig().sandbox.passthroughEnv, }); } diff --git a/packages/eval/src/runners/claude-code/agent.ts b/packages/eval/src/runners/claude-code/agent.ts index 0cd40adb..e48aa3f9 100644 --- a/packages/eval/src/runners/claude-code/agent.ts +++ b/packages/eval/src/runners/claude-code/agent.ts @@ -30,6 +30,7 @@ import { logger, makeSessionId, filteredEnv, + mintMcpToken, } from '@a0/eval-core'; import { classifyActionType, classifyErrorCategory, detectRetry } from '@a0/eval-core'; import { LLM_API_KEY_ENV } from '../../cli/constants.js'; @@ -137,12 +138,25 @@ export async function runClaudeCodeAgent( } // Build MCP server config when --tools mcp is requested. 
- let mcpServers: Record | undefined; + // Token is minted here (job start) so a long matrix run never reuses an expired token. + let mcpServers: Record }> | undefined; if (tools.includes('mcp')) { const configServers = getFrameworkConfig().mcp.servers; - const httpServers: Record = {}; + const httpServers: Record }> = {}; for (const [name, server] of Object.entries(configServers)) { - if (server.type === 'http') { + if (server.type !== 'http') continue; + if (server.auth) { + const token = await mintMcpToken(server.auth); + if (!token) { + logger.warn(`[ClaudeCode] MCP server '${name}' skipped — token mint failed or creds missing`); + continue; + } + httpServers[name] = { + type: 'http' as const, + url: server.url, + headers: { Authorization: `Bearer ${token}` }, + }; + } else { httpServers[name] = { type: 'http' as const, url: server.url }; } } @@ -305,9 +319,7 @@ export function handleMessage( const usage = msg.usage; const turnInput = - (usage?.input_tokens ?? 0) + - (usage?.cache_read_input_tokens ?? 0) + - (usage?.cache_creation_input_tokens ?? 0); + (usage?.input_tokens ?? 0) + (usage?.cache_read_input_tokens ?? 0) + (usage?.cache_creation_input_tokens ?? 0); const turnOutput = usage?.output_tokens ?? 0; record.inputTokens += turnInput; record.outputTokens += turnOutput; diff --git a/packages/eval/src/runners/codex/agent.ts b/packages/eval/src/runners/codex/agent.ts index 016e5f0b..7253630f 100644 --- a/packages/eval/src/runners/codex/agent.ts +++ b/packages/eval/src/runners/codex/agent.ts @@ -36,6 +36,7 @@ import { logger, filteredEnv, readWorkspaceFile, + mintMcpToken, } from '@a0/eval-core'; import { classifyActionType, classifyErrorCategory, detectRetry } from '@a0/eval-core'; import { LLM_API_KEY_ENV } from '../../cli/constants.js'; @@ -56,12 +57,27 @@ function tomlEscape(s: string): string { return s.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); } -function buildMcpToml(servers: Record): string { +/** + * Builds the `[mcp_servers.*]` TOML blocks. 
+ * + * For HTTP servers with an entry in `bearerTokenEnvVars`, emits + * `bearer_token_env_var = "<ENV_VAR_NAME>"` so Codex reads the Bearer token from that + * env var at runtime. Codex rejects an inline `bearer_token` key, so the token + * is never written to the config file — only the env-var name is. + */ +function buildMcpToml( + servers: Record, + bearerTokenEnvVars: Record = {}, +): string { let toml = ''; for (const [name, server] of Object.entries(servers)) { const safeName = tomlEscape(name); if (server.type === 'http') { toml += `\n[mcp_servers."${safeName}"]\nurl = "${tomlEscape(server.url)}"\n`; + const envVar = bearerTokenEnvVars[name]; + if (envVar) { + toml += `bearer_token_env_var = "${tomlEscape(envVar)}"\n`; + } } else { toml += `\n[mcp_servers."${safeName}"]\ncommand = "${tomlEscape(server.command)}"\n`; if (server.args && server.args.length > 0) { @@ -84,6 +100,7 @@ function writeCodexConfig( proxyBaseUrl: string, workspace: string, mcpServers: Record = {}, + bearerTokenEnvVars: Record = {}, ): void { mkdirSync(codexHome, { recursive: true }); // Resolve canonical path — on macOS /var is a symlink to /private/var. @@ -101,7 +118,7 @@ wire_api = "responses" [projects."${resolvedWorkspace}"] trust_level = "trusted" -${buildMcpToml(mcpServers)}`; +${buildMcpToml(mcpServers, bearerTokenEnvVars)}`; writeFileSync(join(codexHome, 'config.toml'), configToml, 'utf-8'); } @@ -212,8 +229,7 @@ function handleItem(item: ThreadItem, record: RunRecord, ctx: RunCtx, now: numbe const isError = item.status === 'failed'; for (const change of item.changes) { const rawName = change.kind === 'delete' ? 'delete_file' : 'write_file'; - const content = - !isError && change.kind !== 'delete' ? 
readWorkspaceFile(ctx.workspace, change.path) : ''; ctx.turnToolCount++; ctx.toolCallsInTurn++; pushToolCall(record, rawName, { path: change.path, content }, '', isError, now); @@ -433,11 +449,36 @@ export async function runCodexAgent( mkdirSync(codexHome, { recursive: true }); // Resolve MCP servers from framework config when --tools mcp is requested. - const mcpServers: Record = tools.includes('mcp') ? getFrameworkConfig().mcp.servers : {}; + const configuredServers: Record = tools.includes('mcp') + ? getFrameworkConfig().mcp.servers + : {}; + + // Mint a Bearer token per HTTP server that declares an `auth` block. The token + // is passed to Codex via an env var (referenced by `bearer_token_env_var` in + // config.toml) — Codex rejects an inline `bearer_token`, so the secret never + // touches the config file. Minting per job avoids reusing an expired token on + // a long matrix run. A failed mint drops the server rather than registering it + // unauthenticated, so a misconfigured run looks like "MCP wasn't available". + const mcpServers: Record = {}; + const bearerTokenEnvVars: Record = {}; + const bearerTokens: Record = {}; + for (const [name, server] of Object.entries(configuredServers)) { + if (server.type === 'http' && server.auth) { + const token = await mintMcpToken(server.auth); + if (!token) { + logger.warn(`[Codex] MCP server '${name}' skipped — token mint failed or creds missing`); + continue; + } + const envVar = `MCP_BEARER_${name.replace(/[^A-Za-z0-9]/g, '_').toUpperCase()}`; + bearerTokenEnvVars[name] = envVar; + bearerTokens[envVar] = token; + } + mcpServers[name] = server; + } const normalizedBaseUrl = proxyBaseUrl.replace(/\/+$/, ''); const codexApiUrl = normalizedBaseUrl.endsWith('/v1') ? 
normalizedBaseUrl : `${normalizedBaseUrl}/v1`; - writeCodexConfig(codexHome, codexApiUrl, workspace, mcpServers); + writeCodexConfig(codexHome, codexApiUrl, workspace, mcpServers, bearerTokenEnvVars); logger.info(`[Codex] Proxy: ${proxyBaseUrl}`); logger.info(`[Codex] CODEX_HOME: ${codexHome}`); if (Object.keys(mcpServers).length > 0) { @@ -462,6 +503,12 @@ export async function runCodexAgent( } } + // Inject minted Bearer tokens so Codex can resolve each authed server's + // `bearer_token_env_var` reference at runtime. + for (const [key, value] of Object.entries(bearerTokens)) { + codexEnv[key] = value; + } + // Skills are injected into the workspace by CodexRunner.prepareSkills(). const ctx: RunCtx = { diff --git a/packages/eval/src/runners/copilot/agent.ts b/packages/eval/src/runners/copilot/agent.ts index 638d9578..f9f2a53a 100644 --- a/packages/eval/src/runners/copilot/agent.ts +++ b/packages/eval/src/runners/copilot/agent.ts @@ -26,6 +26,7 @@ import { logger, makeSessionId, filteredEnv, + mintMcpToken, } from '@a0/eval-core'; import { classifyActionType, classifyErrorCategory, detectRetry } from '@a0/eval-core'; import { CopilotCliTranslator } from './translator.js'; @@ -49,12 +50,33 @@ export interface CopilotRunOptions { model?: string; } -/** Returns MCP server config for the Auth0 docs server. */ -export function getMcpServers(): Record { +/** + * Builds the Copilot MCP server config from the framework config. + * + * For HTTP servers with an `auth` block, mints a fresh Bearer token per job + * (client-credentials exchange) and forwards it as an `Authorization` header. + * If the token mint fails, the server is skipped with a warning rather than + * registered unauthenticated — a misconfigured run looks like "MCP wasn't + * available", not a silent "the agent chose not to use MCP". 
+ */ +export async function getMcpServers(): Promise> { const servers = getFrameworkConfig().mcp.servers; const result: Record = {}; for (const [name, server] of Object.entries(servers)) { - if (server.type === 'http') { + if (server.type !== 'http') continue; + if (server.auth) { + const token = await mintMcpToken(server.auth); + if (!token) { + logger.warn(`[Copilot] MCP server '${name}' skipped — token mint failed or creds missing`); + continue; + } + result[name] = { + type: 'http', + url: server.url, + tools: ['*'], + headers: { Authorization: `Bearer ${token}` }, + }; + } else { result[name] = { type: 'http', url: server.url, tools: ['*'] }; } } @@ -125,7 +147,10 @@ export async function runCopilotAgent( if (model) { logger.info(`[Copilot] Model: ${model}`); } - if (tools.includes('mcp')) logger.info(`[Copilot] MCP: ${Object.keys(getMcpServers()).join(', ')}`); + + // Mint tokens once per job so a long matrix run never reuses an expired token. + const mcpServers = tools.includes('mcp') ? await getMcpServers() : {}; + if (tools.includes('mcp')) logger.info(`[Copilot] MCP: ${Object.keys(mcpServers).join(', ')}`); if (tools.includes('skills')) logger.info('[Copilot] Skills: .github/skills/'); const session = await client.createSession({ @@ -134,7 +159,7 @@ export async function runCopilotAgent( onPermissionRequest: approveAll, // Suppress ask_user to prevent eval runs from blocking on interactive input. excludedTools: ['ask_user'], - ...(tools.includes('mcp') ? { mcpServers: getMcpServers() } : {}), + ...(tools.includes('mcp') ? { mcpServers } : {}), // Skill files are pre-copied to .github/skills/ by CopySkillsStrategy. ...(tools.includes('skills') ? { skillDirectories: [join(workspace, '.github', 'skills')] } : {}), // Disable infinite sessions — each eval run is a clean, isolated session. 
diff --git a/packages/eval/src/runners/gemini-cli/agent.ts b/packages/eval/src/runners/gemini-cli/agent.ts index 3c4d879e..c6d81254 100644 --- a/packages/eval/src/runners/gemini-cli/agent.ts +++ b/packages/eval/src/runners/gemini-cli/agent.ts @@ -28,6 +28,7 @@ import { estimateCost, logger, filteredEnv, + mintMcpToken, } from '@a0/eval-core'; import { classifyActionType, classifyErrorCategory, detectRetry } from '@a0/eval-core'; import { LLM_API_KEY_ENV } from '../../cli/constants.js'; @@ -76,21 +77,44 @@ function isAutoCancelled(output: string): boolean { * MCP tool calls appear in the stream-json output as tool_use events with names * using the format `mcp____` (e.g. `mcp__auth0-docs__search_auth0_docs`). * + * For HTTP servers with an `auth` block, mints a fresh Bearer token per job + * (client-credentials exchange) and writes it as an `Authorization` header into + * the server config. If the token mint fails, the server is skipped with a + * warning rather than registered unauthenticated. + * * Returns the names of the registered MCP servers (empty when MCP is disabled). 
*/ -function writeGeminiSettings(workspace: string, includeMcp: boolean): string[] { +interface GeminiMcpServer { + httpUrl: string; + timeout: number; + headers?: Record; +} + +async function writeGeminiSettings(workspace: string, includeMcp: boolean): Promise { const settings: { security: { auth: { selectedType: string } }; - mcpServers?: Record; + mcpServers?: Record; } = { security: { auth: { selectedType: GEMINI_AUTH_TYPE } }, }; - const mcpServers: Record = {}; + const mcpServers: Record = {}; if (includeMcp) { const configServers = getFrameworkConfig().mcp.servers; for (const [name, server] of Object.entries(configServers)) { - if (server.type === 'http') { + if (server.type !== 'http') continue; + if (server.auth) { + const token = await mintMcpToken(server.auth); + if (!token) { + logger.warn(`[GeminiCLI] MCP server '${name}' skipped — token mint failed or creds missing`); + continue; + } + mcpServers[name] = { + httpUrl: server.url, + timeout: 30000, + headers: { Authorization: `Bearer ${token}` }, + }; + } else { mcpServers[name] = { httpUrl: server.url, timeout: 30000 }; } } @@ -154,7 +178,7 @@ export async function runGeminiCliAgent( logger.info(`\n[GeminiCLI] Starting task: ${evalDef.id}`); logger.info(`[GeminiCLI] Workspace: ${workspace}`); logger.info(`[GeminiCLI] Model: ${model}`); - const mcpNames = writeGeminiSettings(workspace, tools.includes('mcp')); + const mcpNames = await writeGeminiSettings(workspace, tools.includes('mcp')); if (mcpNames.length > 0) logger.info(`[GeminiCLI] MCP: ${mcpNames.join(', ')}`); // Trust only this workspace so YOLO mode isn't overridden in CI/headless environments. diff --git a/packages/eval/src/sandbox/docker.ts b/packages/eval/src/sandbox/docker.ts index ad87a304..f360af90 100644 --- a/packages/eval/src/sandbox/docker.ts +++ b/packages/eval/src/sandbox/docker.ts @@ -30,6 +30,11 @@ export interface DockerRunOptions { apiKey: string; /** Optional GitHub token (for copilot runner). 
*/ ghToken?: string; + /** + * Names of host env vars to forward into the container (from `sandbox.passthroughEnv`). + * Each is resolved from `process.env` here; only currently-set vars are forwarded. + */ + passthroughEnv?: string[]; } // Serialises concurrent ensureDockerImage calls so only one build runs at a time. @@ -115,7 +120,7 @@ function findRepoRoot(): string { * after the container exits. */ export async function runJobInDocker(options: DockerRunOptions): Promise { - const { workspace, evalId, model, mode, tools, agentType, apiKey, ghToken } = options; + const { workspace, evalId, model, mode, tools, agentType, apiKey, ghToken, passthroughEnv } = options; await ensureDockerImage(); @@ -150,6 +155,15 @@ export async function runJobInDocker(options: DockerRunOptions): Promise { rmSync(workspace, { recursive: true, force: true }); }); + + it('forwards passthroughEnv vars that are set on the host and skips unset ones', async () => { + const runJobInDocker = await getRunJobInDocker(); + const workspace = mkdtempSync(join(tmpdir(), 'docker-passthrough-')); + + const origDomain = process.env.MCP_TENANT_DOMAIN; + const origSecret = process.env.MCP_CLIENT_SECRET; + process.env.MCP_TENANT_DOMAIN = 'tenant.us.auth0.com'; + delete process.env.MCP_CLIENT_SECRET; + + mockExecFileSync.mockReturnValue(''); + + let capturedArgs: string[] = []; + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + capturedArgs = args; + writeFileSync(join(workspace, '.eval-results.json'), JSON.stringify({ ok: true })); + return makeCloseEmitter(); + }); + + await runJobInDocker({ + workspace, + evalId: 'test_eval', + model: 'gpt-5.4', + mode: 'agent' as const, + tools: ['mcp'], + agentType: 'claude-code' as const, + apiKey: 'test-key', + passthroughEnv: ['MCP_TENANT_DOMAIN', 'MCP_CLIENT_SECRET'], + }); + + const envPairs = extractEnvPairs(capturedArgs); + + // Set var is forwarded; unset var is skipped entirely. 
+ expect(envPairs).toContain('MCP_TENANT_DOMAIN=tenant.us.auth0.com'); + expect(envPairs.some((e) => e.startsWith('MCP_CLIENT_SECRET='))).toBe(false); + + if (origDomain === undefined) delete process.env.MCP_TENANT_DOMAIN; + else process.env.MCP_TENANT_DOMAIN = origDomain; + if (origSecret !== undefined) process.env.MCP_CLIENT_SECRET = origSecret; + rmSync(workspace, { recursive: true, force: true }); + }); }); // ── Host-side timeout ──────────────────────────────────────────────────────── diff --git a/packages/eval/tests/runners/codex-agent.test.ts b/packages/eval/tests/runners/codex-agent.test.ts index bb778fe8..0539e14e 100644 --- a/packages/eval/tests/runners/codex-agent.test.ts +++ b/packages/eval/tests/runners/codex-agent.test.ts @@ -51,10 +51,13 @@ const mockGetFrameworkConfig = vi.hoisted(() => }), ); +const mintMcpTokenMock = vi.hoisted(() => vi.fn()); + vi.mock('@a0/eval-core', async () => ({ ...(await vi.importActual('@a0/eval-core')), getAgentProxyBaseUrl: vi.fn().mockReturnValue('https://your-llm-proxy.example.com'), getFrameworkConfig: mockGetFrameworkConfig, + mintMcpToken: mintMcpTokenMock, })); // ── Mock @openai/codex-sdk ────────────────────────────────────────────────── @@ -752,6 +755,79 @@ describe('MCP integration', () => { expect(env['MY_SECRET_TOKEN']).toBe('secret123'); }); + it('mints a token and writes bearer_token_env_var for authed http servers', async () => { + mintMcpTokenMock.mockResolvedValueOnce('minted-token'); + mockGetFrameworkConfig.mockReturnValue({ + proxy: { baseUrl: 'https://your-llm-proxy.example.com/v1' }, + mcp: { + servers: { + 'auth0-hosted-mcp': { + type: 'http', + url: 'https://tenant.auth0.com/v1/mcp', + auth: { + tokenUrl: 'https://tenant.auth0.com/oauth/token', + clientId: 'cid', + clientSecret: 'secret', + audience: 'https://tenant.auth0.com/api/v2/', + }, + }, + }, + }, + }); + queueTurns([{ type: 'item.completed', item: { type: 'agent_message', text: 'Done.' 
} }, turnCompleted()]); + + await runCodexAgent(evalDef, workspace, { tools: ['mcp'] }); + + const written = (writeFileSync as ReturnType).mock.calls.find( + (c: unknown[]) => typeof c[0] === 'string' && (c[0] as string).endsWith('config.toml'), + ); + expect(written).toBeDefined(); + if (!written) return; + const toml = written[1] as string; + expect(toml).toContain('[mcp_servers."auth0-hosted-mcp"]'); + expect(toml).toContain('url = "https://tenant.auth0.com/v1/mcp"'); + expect(toml).toContain('bearer_token_env_var = "MCP_BEARER_AUTH0_HOSTED_MCP"'); + // The token itself must never be written to the config file. + expect(toml).not.toContain('minted-token'); + + // The minted token is injected into the Codex env under the referenced name. + const codexOptions = sdk.state.constructorCalls[0]; + const env = codexOptions.env as Record; + expect(env['MCP_BEARER_AUTH0_HOSTED_MCP']).toBe('minted-token'); + }); + + it('skips an authed server when the token mint fails', async () => { + mintMcpTokenMock.mockResolvedValueOnce(undefined); + mockGetFrameworkConfig.mockReturnValue({ + proxy: { baseUrl: 'https://your-llm-proxy.example.com/v1' }, + mcp: { + servers: { + 'auth0-hosted-mcp': { + type: 'http', + url: 'https://tenant.auth0.com/v1/mcp', + auth: { + tokenUrl: 'https://tenant.auth0.com/oauth/token', + clientId: 'cid', + clientSecret: 'secret', + audience: 'https://tenant.auth0.com/api/v2/', + }, + }, + }, + }, + }); + queueTurns([{ type: 'item.completed', item: { type: 'agent_message', text: 'Done.' 
} }, turnCompleted()]); + + await runCodexAgent(evalDef, workspace, { tools: ['mcp'] }); + + const written = (writeFileSync as ReturnType).mock.calls.find( + (c: unknown[]) => typeof c[0] === 'string' && (c[0] as string).endsWith('config.toml'), + ); + expect(written).toBeDefined(); + if (!written) return; + const toml = written[1] as string; + expect(toml).not.toContain('auth0-hosted-mcp'); + }); + it('does not write MCP sections when tools does not include mcp', async () => { mockGetFrameworkConfig.mockReturnValue({ proxy: { baseUrl: 'https://your-llm-proxy.example.com/v1' }, diff --git a/packages/eval/tests/runners/copilot-agent.test.ts b/packages/eval/tests/runners/copilot-agent.test.ts index a054c76e..c2c28ecb 100644 --- a/packages/eval/tests/runners/copilot-agent.test.ts +++ b/packages/eval/tests/runners/copilot-agent.test.ts @@ -8,6 +8,14 @@ import { describe, it, expect, vi, beforeAll, beforeEach, afterEach } from 'vitest'; import { EventEmitter } from 'node:events'; + +// Mock only mintMcpToken so authed-server tests don't perform a real OAuth fetch. 
+const mintMcpTokenMock = vi.hoisted(() => vi.fn()); +vi.mock('@a0/eval-core', async () => ({ + ...(await vi.importActual('@a0/eval-core')), + mintMcpToken: mintMcpTokenMock, +})); + import { setFrameworkConfig } from '@a0/eval-core'; import { TEST_CONFIG } from '../test-config.js'; @@ -123,16 +131,71 @@ describe('COPILOT_DEFAULT_MODEL', () => { }); describe('getMcpServers', () => { - it('returns auth0-docs remote MCP server config', () => { - const servers = getMcpServers(); + it('returns auth0-docs remote MCP server config', async () => { + const servers = await getMcpServers(); expect(servers).toHaveProperty('auth0-docs'); expect(servers['auth0-docs'].type).toBe('http'); - expect(servers['auth0-docs'].url).toBe('https://auth0.com/docs/mcp'); - }); - - it('includes all tools via wildcard', () => { - const servers = getMcpServers(); - expect(servers['auth0-docs'].tools).toContain('*'); + expect((servers['auth0-docs'] as { url: string }).url).toBe('https://auth0.com/docs/mcp'); + }); + + it('includes all tools via wildcard', async () => { + const servers = await getMcpServers(); + expect((servers['auth0-docs'] as { tools: string[] }).tools).toContain('*'); + }); + + it('does not set an Authorization header for unauthenticated servers', async () => { + const servers = await getMcpServers(); + expect((servers['auth0-docs'] as { headers?: Record }).headers).toBeUndefined(); + }); + + it('mints a token and forwards it as an Authorization header for authed servers', async () => { + mintMcpTokenMock.mockResolvedValueOnce('minted-token'); + setFrameworkConfig({ + ...TEST_CONFIG, + mcp: { + servers: { + 'auth0-hosted-mcp': { + type: 'http', + url: 'https://tenant.auth0.com/v1/mcp', + auth: { + tokenUrl: 'https://tenant.auth0.com/oauth/token', + clientId: 'cid', + clientSecret: 'secret', + audience: 'https://tenant.auth0.com/api/v2/', + }, + }, + }, + }, + }); + const servers = await getMcpServers(); + expect((servers['auth0-hosted-mcp'] as { headers?: Record 
}).headers).toEqual({ + Authorization: 'Bearer minted-token', + }); + setFrameworkConfig(TEST_CONFIG); + }); + + it('skips an authed server when the token mint fails', async () => { + mintMcpTokenMock.mockResolvedValueOnce(undefined); + setFrameworkConfig({ + ...TEST_CONFIG, + mcp: { + servers: { + 'auth0-hosted-mcp': { + type: 'http', + url: 'https://tenant.auth0.com/v1/mcp', + auth: { + tokenUrl: 'https://tenant.auth0.com/oauth/token', + clientId: 'cid', + clientSecret: 'secret', + audience: 'https://tenant.auth0.com/api/v2/', + }, + }, + }, + }, + }); + const servers = await getMcpServers(); + expect(servers).not.toHaveProperty('auth0-hosted-mcp'); + setFrameworkConfig(TEST_CONFIG); }); }); diff --git a/packages/eval/tests/runners/gemini-cli-agent.test.ts b/packages/eval/tests/runners/gemini-cli-agent.test.ts index 2cc38182..aae9e7e2 100644 --- a/packages/eval/tests/runners/gemini-cli-agent.test.ts +++ b/packages/eval/tests/runners/gemini-cli-agent.test.ts @@ -20,9 +20,8 @@ import { join } from 'node:path'; // ── Mock framework config ──────────────────────────────────────────────────── -vi.mock('@a0/eval-core', async () => ({ - ...(await vi.importActual('@a0/eval-core')), - getFrameworkConfig: vi.fn().mockReturnValue({ +const mockGetFrameworkConfig = vi.hoisted(() => + vi.fn().mockReturnValue({ proxy: { baseUrl: 'https://llm.example.com/v1' }, mcp: { servers: { @@ -30,6 +29,13 @@ vi.mock('@a0/eval-core', async () => ({ }, }, }), +); +const mintMcpTokenMock = vi.hoisted(() => vi.fn()); + +vi.mock('@a0/eval-core', async () => ({ + ...(await vi.importActual('@a0/eval-core')), + getFrameworkConfig: mockGetFrameworkConfig, + mintMcpToken: mintMcpTokenMock, })); // ── Mock spawn ──────────────────────────────────────────────────────────────── @@ -568,6 +574,66 @@ describe('.gemini/settings.json', () => { expect(existsSync(join(tmpWorkspace, '.gemini', 'settings.json'))).toBe(true); }); + + it('mints a token and writes an Authorization header for authed servers', 
async () => { + mockSpawn.mockReturnValue(makeChild([resultEvent()])); + mintMcpTokenMock.mockResolvedValueOnce('minted-token'); + mockGetFrameworkConfig.mockReturnValueOnce({ + proxy: { baseUrl: 'https://llm.example.com/v1' }, + mcp: { + servers: { + 'auth0-hosted-mcp': { + type: 'http', + url: 'https://tenant.auth0.com/v1/mcp', + auth: { + tokenUrl: 'https://tenant.auth0.com/oauth/token', + clientId: 'cid', + clientSecret: 'secret', + audience: 'https://tenant.auth0.com/api/v2/', + }, + }, + }, + }, + }); + + await runGeminiCliAgent(evalDef, tmpWorkspace, { tools: ['mcp'] }); + + const settings = readSettings(); + expect(settings.mcpServers).toEqual({ + 'auth0-hosted-mcp': { + httpUrl: 'https://tenant.auth0.com/v1/mcp', + timeout: 30000, + headers: { Authorization: 'Bearer minted-token' }, + }, + }); + }); + + it('skips an authed server when the token mint fails', async () => { + mockSpawn.mockReturnValue(makeChild([resultEvent()])); + mintMcpTokenMock.mockResolvedValueOnce(undefined); + mockGetFrameworkConfig.mockReturnValueOnce({ + proxy: { baseUrl: 'https://llm.example.com/v1' }, + mcp: { + servers: { + 'auth0-hosted-mcp': { + type: 'http', + url: 'https://tenant.auth0.com/v1/mcp', + auth: { + tokenUrl: 'https://tenant.auth0.com/oauth/token', + clientId: 'cid', + clientSecret: 'secret', + audience: 'https://tenant.auth0.com/api/v2/', + }, + }, + }, + }, + }); + + await runGeminiCliAgent(evalDef, tmpWorkspace, { tools: ['mcp'] }); + + const settings = readSettings(); + expect(settings).not.toHaveProperty('mcpServers'); + }); }); // ── GH_TOKEN env forwarding ──────────────────────────────────────────────────