diff --git a/benchmarks/terminal_bench/prepare_leaderboard_submission.py b/benchmarks/terminal_bench/prepare_leaderboard_submission.py index 89a099e1b5..26bdd48ac4 100755 --- a/benchmarks/terminal_bench/prepare_leaderboard_submission.py +++ b/benchmarks/terminal_bench/prepare_leaderboard_submission.py @@ -109,6 +109,13 @@ "model_org_display_name": "Anthropic", "folder_name": "Claude-Opus-4.7", }, + "anthropic/claude-opus-4-8": { + "model_name": "claude-opus-4-8", + "model_provider": "anthropic", + "model_display_name": "Claude Opus 4.8", + "model_org_display_name": "Anthropic", + "folder_name": "Claude-Opus-4.8", + }, # Keep historical GPT metadata alongside the current GPT-5.5 bench target # so mixed or older artifact sets still map to the canonical leaderboard names. "openai/gpt-5.2": { @@ -442,7 +449,7 @@ def main(): parser.add_argument( "--models", nargs="+", - help="Only process specific models (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)", + help="Only process specific models (e.g., anthropic/claude-opus-4-8, openai/gpt-5.5)", ) args = parser.parse_args() diff --git a/src/browser/features/Settings/Sections/ModelsSection.stories.tsx b/src/browser/features/Settings/Sections/ModelsSection.stories.tsx index 6748693956..07df912205 100644 --- a/src/browser/features/Settings/Sections/ModelsSection.stories.tsx +++ b/src/browser/features/Settings/Sections/ModelsSection.stories.tsx @@ -76,7 +76,7 @@ export const ModelsConfigured: Story = { isEnabled: true, isConfigured: true, baseUrl: "", - models: ["claude-sonnet-4-20250514", "claude-opus-4-7"], + models: ["claude-sonnet-4-20250514", "claude-opus-4-8"], }, openai: { apiKeySet: true, diff --git a/src/common/routing/types.ts b/src/common/routing/types.ts index e50cdedb7c..85c92d744c 100644 --- a/src/common/routing/types.ts +++ b/src/common/routing/types.ts @@ -1,7 +1,7 @@ import type { ProviderName } from "@/common/constants/providers"; export interface RouteContext { - /** Canonical model string (e.g., "anthropic:claude-opus-4-7") */ + /** Canonical model string (e.g., "anthropic:claude-opus-4-8") */ canonical: string; /** Origin provider — who made the model. Determines capabilities. */ origin: ProviderName; diff --git a/src/common/types/thinking.ts b/src/common/types/thinking.ts index ff3e120744..2e2927377c 100644 --- a/src/common/types/thinking.ts +++ b/src/common/types/thinking.ts @@ -167,7 +167,7 @@ export const ANTHROPIC_THINKING_BUDGETS: Record = { * Note: Opus 4.7 introduced a native "xhigh" effort level in the API, but the * SDK's Zod validator still rejects "xhigh". Mux handles this by sending "max" * through the SDK and rewriting `output_config.effort` to "xhigh" in a fetch - * wrapper for Opus 4.7 when the user selected the xhigh ThinkingLevel. + * wrapper for Opus 4.7+ when the user selected the xhigh ThinkingLevel. * See `wrapFetchWithAnthropicCacheControl` and `buildRequestHeaders`. */ export type AnthropicEffortLevel = "low" | "medium" | "high" | "max"; @@ -189,7 +189,7 @@ const ANTHROPIC_EFFORT: Record = { low: "low", medium: "medium", high: "high", - xhigh: "max", // SDK placeholder; fetch wrapper rewrites to "xhigh" on Opus 4.7 + xhigh: "max", // SDK placeholder; fetch wrapper rewrites to "xhigh" on Opus 4.7+ max: "max", }; diff --git a/src/common/utils/ai/models.test.ts b/src/common/utils/ai/models.test.ts index db7d98b3e4..38def5e5b6 100644 --- a/src/common/utils/ai/models.test.ts +++ b/src/common/utils/ai/models.test.ts @@ -122,7 +122,9 @@ describe("Anthropic 1M context classification", () => { expect(hasNative1MContext("anthropic:claude-sonnet-4-5")).toBe(false); }); - it("treats Opus 4.6 and Sonnet 4.6 as native 1M models", () => { + it("treats Opus 4.6 through 4.8 and Sonnet 4.6 as native 1M models", () => { + expect(getAnthropic1MContextMode("anthropic:claude-opus-4-8")).toBe("native"); + expect(getAnthropic1MContextMode("anthropic:claude-opus-4-8-20260528")).toBe("native"); expect(getAnthropic1MContextMode("anthropic:claude-opus-4-6")).toBe("native"); expect(getAnthropic1MContextMode("anthropic:claude-opus-4-6-20260201")).toBe("native"); expect(getAnthropic1MContextMode("anthropic:claude-sonnet-4-6")).toBe("native"); diff --git a/src/common/utils/ai/providerOptions.test.ts b/src/common/utils/ai/providerOptions.test.ts index 58b9a63fa8..1434aaca3a 100644 --- a/src/common/utils/ai/providerOptions.test.ts +++ b/src/common/utils/ai/providerOptions.test.ts @@ -962,7 +962,7 @@ describe("buildRequestHeaders", () => { }); } - describe("Opus 4.7 xhigh effort override", () => { + describe("Opus 4.7+ xhigh effort override", () => { for (const { name, model, routeProvider, thinkingLevel, expected } of [ { name: "emits override header when thinkingLevel=xhigh for Opus 4.7", @@ -979,8 +979,7 @@ describe("buildRequestHeaders", () => { expected: { [MUX_ANTHROPIC_EFFORT_OVERRIDE_HEADER]: "xhigh" }, }, { - name: "emits override header for hypothetical Opus 4.8", - // Detection should match future versions so xhigh doesn't silently collapse to max. + name: "emits override header for Opus 4.8", model: "anthropic:claude-opus-4-8", routeProvider: undefined, thinkingLevel: "xhigh", diff --git a/src/common/utils/ai/providerOptions.ts b/src/common/utils/ai/providerOptions.ts index 9c8e76c68b..26710dbd42 100644 --- a/src/common/utils/ai/providerOptions.ts +++ b/src/common/utils/ai/providerOptions.ts @@ -32,7 +32,7 @@ import { normalizeToCanonical, supports1MContext } from "./models"; /** * Request header used to override Anthropic's `output_config.effort` at the * wire level. The @ai-sdk/anthropic Zod schema rejects "xhigh", so for - * Opus 4.7 + xhigh ThinkingLevel we send "max" through the SDK and ask the + * Opus 4.7+ with xhigh ThinkingLevel we send "max" through the SDK and ask the * fetch wrapper to rewrite it to "xhigh" via this header (which is then stripped * before the request reaches Anthropic). */ @@ -283,7 +283,7 @@ export function buildProviderOptions( const usesAdaptiveThinking = isOpus46 || isOpus47Plus || isSonnet46; if (isOpus45 || usesAdaptiveThinking) { - // Map to SDK-accepted effort. For Opus 4.7 + xhigh ThinkingLevel, the SDK + // Map to SDK-accepted effort. For Opus 4.7+ with xhigh ThinkingLevel, the SDK // gets "max" as a placeholder and the Anthropic fetch wrapper rewrites // `output_config.effort` to "xhigh" via the X-Mux-Anthropic-Effort header // (added in buildRequestHeaders). @@ -305,10 +305,10 @@ export function buildProviderOptions( thinkingLevel: effectiveThinking, }); - // Note: Opus 4.7 requires `thinking.display: "summarized"` to receive + // Note: Opus 4.7+ requires `thinking.display: "summarized"` to receive // thinking content, but the SDK's Zod schema strips unknown keys so we // can't add it here. The Anthropic fetch wrapper injects `display` on the - // wire for Opus 4.7 + adaptive thinking requests. + // wire for Opus 4.7+ adaptive thinking requests. const anthropicOptions: AnthropicProviderOptions = { disableParallelToolUse: false, sendReasoning: true, diff --git a/src/common/utils/thinking/policy.test.ts b/src/common/utils/thinking/policy.test.ts index aae7e1cb62..9e58fb0fc5 100644 --- a/src/common/utils/thinking/policy.test.ts +++ b/src/common/utils/thinking/policy.test.ts @@ -342,8 +342,8 @@ describe("getThinkingPolicyForModel", () => { ]); }); - test("returns all 6 levels for future Opus versions (4.8+, 5+)", () => { - // Detection should extend forward so future models don't regress to the default policy. + test("returns all 6 levels for Opus 4.8 and future Opus versions", () => { + // Detection should extend forward so new Opus models don't regress to the default policy. expect(getThinkingPolicyForModel("anthropic:claude-opus-4-8")).toEqual([ "off", "low", diff --git a/src/node/services/providerModelFactory.test.ts b/src/node/services/providerModelFactory.test.ts index 25ea46abd1..c8d997328f 100644 --- a/src/node/services/providerModelFactory.test.ts +++ b/src/node/services/providerModelFactory.test.ts @@ -1746,20 +1746,22 @@ function parseSentBody(call: CapturedFetchCall): Record { return JSON.parse(call.init.body as string) as Record; } -describe("wrapFetchWithAnthropicCacheControl — Opus 4.7 wire transforms", () => { - it("injects thinking.display=summarized for Opus 4.7 adaptive thinking", async () => { - const { calls, fakeFetch } = createCapturingFetch(); - const wrapped = wrapFetchWithAnthropicCacheControl(fakeFetch); - const body = JSON.stringify({ - model: "claude-opus-4-7", - thinking: { type: "adaptive" }, - output_config: { effort: "medium" }, +describe("wrapFetchWithAnthropicCacheControl — Opus 4.7+ wire transforms", () => { + for (const model of ["claude-opus-4-7", "claude-opus-4-8"] as const) { + it(`injects thinking.display=summarized for ${model} adaptive thinking`, async () => { + const { calls, fakeFetch } = createCapturingFetch(); + const wrapped = wrapFetchWithAnthropicCacheControl(fakeFetch); + const body = JSON.stringify({ + model, + thinking: { type: "adaptive" }, + output_config: { effort: "medium" }, + }); + await wrapped("https://api.anthropic.com/v1/messages", { method: "POST", body }); + expect(calls.length).toBe(1); + const sent = parseSentBody(calls[0]); + expect(sent.thinking).toEqual({ type: "adaptive", display: "summarized" }); }); - await wrapped("https://api.anthropic.com/v1/messages", { method: "POST", body }); - expect(calls.length).toBe(1); - const sent = parseSentBody(calls[0]); - expect(sent.thinking).toEqual({ type: "adaptive", display: "summarized" }); - }); + } it("preserves a user-supplied display value on Opus 4.7", async () => { const { calls, fakeFetch } = createCapturingFetch(); diff --git a/src/node/services/providerModelFactory.ts b/src/node/services/providerModelFactory.ts index 29a901aa0b..cf3a64c8cb 100644 --- a/src/node/services/providerModelFactory.ts +++ b/src/node/services/providerModelFactory.ts @@ -1453,7 +1453,7 @@ export class ProviderModelFactory { // Use getProviderFetch to preserve any user-configured custom fetch (e.g., proxies) const baseFetch = getProviderFetch(providerConfig); const disableBeta = muxProviderOptions?.anthropic?.disableBetaFeatures === true; - // Always wrap to apply Opus 4.7 wire-level transforms (display + effort + // Always wrap to apply Opus 4.7+ wire-level transforms (display + effort // override); skip cache_control injection when beta features are off. const fetchWithCacheControl = wrapFetchWithAnthropicCacheControl( baseFetch, @@ -1965,7 +1965,7 @@ export class ProviderModelFactory { const baseFetch = getProviderFetch(providerConfig); const isAnthropicModel = modelId.startsWith("anthropic/"); const disableBeta = muxProviderOptions?.anthropic?.disableBetaFeatures === true; - // For Anthropic models via gateway, always wrap to apply Opus 4.7 wire + // For Anthropic models via gateway, always wrap to apply Opus 4.7+ wire // transforms; skip cache_control injection when beta features are off. const fetchWithCacheControl = isAnthropicModel ? wrapFetchWithAnthropicCacheControl(baseFetch, effectiveAnthropicCacheTtl, {