enyst · smolpaws · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/package.json b/package.json
@@ -535,6 +535,7 @@
     "lint:cycles": "node scripts/check-circular-deps.mjs",
     "lint:duplication": "node scripts/check-duplication.mjs",
     "lint:fix": "eslint . --fix",
+    "smoke:anthropic-cache": "npm run build -w @smolpaws/agent-sdk && node scripts/anthropic-cache-smoke.mjs",
     "build:webview": "node esbuild.webview.mjs",
     "agent-server": "bash scripts/start-agent-server.sh",
     "agent-server:prepare": "PREPARE=1 bash scripts/start-agent-server.sh",

diff --git a/packages/agent-sdk/src/sdk/llm/__tests__/promptCaching.test.ts b/packages/agent-sdk/src/sdk/llm/__tests__/promptCaching.test.ts
@@ -0,0 +1,202 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+import { LLMStreamer } from '../../runtime';
+import { AnthropicClient, OpenAICompatibleClient } from '../index';
+import type { ChatCompletionRequest, LLMConfiguration } from '../types';
+
+const encoder = new TextEncoder(); // shared encoder that turns fixture strings into stream bytes
+const EPHEMERAL_CACHE_CONTROL = { type: 'ephemeral' }; // the cache_control value the assertions below expect on cached blocks
+
+const createStreamResponse = (payload: string, status = 200): Response => // Wraps `payload` in a streamed text/event-stream Response for the fetch mock.
+  new Response(
+    new ReadableStream({
+      start(controller) {
+        controller.enqueue(encoder.encode(payload)); // the whole payload is delivered as a single chunk
+        controller.close(); // close immediately so the consumer sees the end of the stream
+      },
+    }),
+    { status, headers: { 'content-type': 'text/event-stream' } },
+  );
+
+const anthropicSse = [ // Anthropic-style SSE fixture: one text delta ("Done") followed by a message_delta with stop_reason end_turn.
+  'event: content_block_delta',
+  'data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Done"}}',
+  '', // empty entry produces the blank line that separates the two events
+  'event: message_delta',
+  'data: {"type":"message_delta","delta":{"stop_reason":"end_turn"}}',
+  '',
+].join('\n');
+
+const openAiSse = [ // OpenAI-style streaming fixture: a content delta ("Done"), a finish chunk, then the [DONE] sentinel.
+  'data: {"choices":[{"delta":{"content":"Done"}}]}',
+  'data: {"choices":[{"delta":{},"finish_reason":"stop"}]}',
+  'data: [DONE]',
+].join('\n\n'); // blank line between events: SSE ends an event at an empty line, matching how anthropicSse is framed
+
+const splitSystemPromptRequest = ( // Builds a request whose system prompt is given both combined and split into static/dynamic parts.
+  overrides: Partial<ChatCompletionRequest> = {},
+): ChatCompletionRequest => ({
+  systemPrompt: 'STATIC\n\nDYNAMIC', // combined form of the two parts below
+  cacheableSystemPrompt: 'STATIC',
+  dynamicSystemPrompt: 'DYNAMIC',
+  messages: [{ role: 'user', content: [{ type: 'text', text: 'hello' }] }],
+  ...overrides, // spread last so a caller can replace any default, e.g. messages or tools
+});
+
+afterEach(() => { // runs after every test in this file
+  vi.restoreAllMocks(); // undo the fetch spy so each test installs its own
+});
+
+describe('Anthropic prompt caching', () => { // Checks where AnthropicClient places cache_control in the JSON body it passes to fetch.
+  const baseConfig: LLMConfiguration = {
+    model: 'claude-opus-4-7',
+    provider: 'anthropic',
+  };
+
+  it('marks only the static system block and last user block for caching', async () => {
+    const fetchMock = vi
+      .spyOn(global, 'fetch') // replace fetch so the outgoing request is captured instead of sent
+      .mockResolvedValue(createStreamResponse(anthropicSse));
+
+    const client = new AnthropicClient(baseConfig, 'test-key');
+    const streamer = new LLMStreamer(client);
+
+    await streamer.runChat(splitSystemPromptRequest());
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const init = fetchMock.mock.calls[0]?.[1] as { body?: unknown } | undefined; // second fetch argument carries the request body
+    const body = typeof init?.body === 'string' ? JSON.parse(init.body) : null; // parsed only when the body was sent as a string; otherwise null
+
+    expect(body?.system).toEqual([ // static part carries the cache marker, dynamic part does not
+      { type: 'text', text: 'STATIC', cache_control: EPHEMERAL_CACHE_CONTROL },
+      { type: 'text', text: 'DYNAMIC' },
+    ]);
+    expect(body?.messages?.[0]).toMatchObject({ // subset match: additional keys on the message or block are tolerated
+      role: 'user',
+      content: [{ type: 'text', text: 'hello', cache_control: EPHEMERAL_CACHE_CONTROL }],
+    });
+  });
+
+  it('moves the cache marker to the tool_result block', async () => {
+    const fetchMock = vi
+      .spyOn(global, 'fetch')
+      .mockResolvedValue(createStreamResponse(anthropicSse));
+
+    const client = new AnthropicClient(baseConfig, 'test-key');
+    const streamer = new LLMStreamer(client);
+
+    await streamer.runChat(splitSystemPromptRequest({ // conversation ends with a tool message rather than a user message
+      messages: [
+        { role: 'user', content: [{ type: 'text', text: 'hello' }] },
+        {
+          role: 'assistant',
+          content: [],
+          tool_calls: [
+            {
+              id: 'call_1',
+              type: 'function',
+              function: { name: 'bash', arguments: '{"command":"echo hi"}' },
+            },
+          ],
+        },
+        {
+          role: 'tool',
+          content: [{ type: 'text', text: 'hi' }],
+          tool_call_id: 'call_1', // pairs this result with the assistant tool call above
+        },
+      ],
+      tools: [{ type: 'function', function: { name: 'bash' } }],
+    }));
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const init = fetchMock.mock.calls[0]?.[1] as { body?: unknown } | undefined;
+    const body = typeof init?.body === 'string' ? JSON.parse(init.body) : null;
+
+    expect(body?.messages?.at(-1)).toEqual({ // deep equality on the final message of the request
+      role: 'user', // expected wire form: the tool result travels in a user-role message
+      content: [{
+        type: 'tool_result',
+        tool_use_id: 'call_1',
+        content: 'hi',
+        cache_control: EPHEMERAL_CACHE_CONTROL, // marker expected on the tool_result block itself
+      }],
+    });
+  });
+});
+
+describe('OpenAI-compatible Anthropic prompt caching', () => { // Same checks as the Anthropic suite, but for the body OpenAICompatibleClient passes to fetch.
+  const baseConfig: LLMConfiguration = {
+    model: 'claude-opus-4-7',
+    provider: 'litellm_proxy',
+    baseUrl: 'http://localhost:4000', // never contacted: fetch is mocked in every test below
+  };
+
+  it('marks only the static system block and last user block for caching', async () => {
+    const fetchMock = vi
+      .spyOn(global, 'fetch') // replace fetch so the outgoing request is captured instead of sent
+      .mockResolvedValue(createStreamResponse(openAiSse));
+
+    const client = new OpenAICompatibleClient(baseConfig, 'test-key');
+    const streamer = new LLMStreamer(client);
+
+    await streamer.runChat(splitSystemPromptRequest());
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const init = fetchMock.mock.calls[0]?.[1] as { body?: unknown } | undefined; // second fetch argument carries the request body
+    const body = typeof init?.body === 'string' ? JSON.parse(init.body) : null; // parsed only when the body was sent as a string; otherwise null
+
+    expect(body?.messages?.[0]).toEqual({ // here the system prompt is expected as the first chat message
+      role: 'system',
+      content: [
+        { type: 'text', text: 'STATIC', cache_control: EPHEMERAL_CACHE_CONTROL },
+        { type: 'text', text: 'DYNAMIC' },
+      ],
+    });
+    expect(body?.messages?.[1]).toMatchObject({ // subset match: additional keys on the message or block are tolerated
+      role: 'user',
+      content: [{ type: 'text', text: 'hello', cache_control: EPHEMERAL_CACHE_CONTROL }],
+    });
+  });
+
+  it('moves the cache marker to the tool message level', async () => {
+    const fetchMock = vi
+      .spyOn(global, 'fetch')
+      .mockResolvedValue(createStreamResponse(openAiSse));
+
+    const client = new OpenAICompatibleClient(baseConfig, 'test-key');
+    const streamer = new LLMStreamer(client);
+
+    await streamer.runChat(splitSystemPromptRequest({ // conversation ends with a tool message rather than a user message
+      messages: [
+        { role: 'user', content: [{ type: 'text', text: 'hello' }] },
+        {
+          role: 'assistant',
+          content: [{ type: 'text', text: '' }],
+          tool_calls: [
+            {
+              id: 'call_1',
+              type: 'function',
+              function: { name: 'bash', arguments: '{"command":"echo hi"}' },
+            },
+          ],
+        },
+        {
+          role: 'tool',
+          content: [{ type: 'text', text: 'hi' }],
+          tool_call_id: 'call_1', // pairs this result with the assistant tool call above
+        },
+      ],
+      tools: [{ type: 'function', function: { name: 'bash' } }],
+    }));
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const init = fetchMock.mock.calls[0]?.[1] as { body?: unknown } | undefined;
+    const body = typeof init?.body === 'string' ? JSON.parse(init.body) : null;
+
+    expect(body?.messages?.at(-1)).toEqual({ // deep equality on the final message of the request
+      role: 'tool',
+      content: 'hi', // expected wire form: tool content as a plain string
+      tool_call_id: 'call_1',
+      cache_control: EPHEMERAL_CACHE_CONTROL, // marker expected on the message object, not inside a content block
+    });
+  });
+});
diff --git a/packages/agent-sdk/src/sdk/llm/__tests__/providerQuirks.test.ts b/packages/agent-sdk/src/sdk/llm/__tests__/providerQuirks.test.ts
@@ -1,6 +1,12 @@
 import { describe, expect, it } from 'vitest';
 import type { LLMConfiguration } from '../types';
-import { normalizeGenerationParamsForModel, isAnthropicModel, supportsThinkingBlocks, getAnthropicThinkingBudget } from '../providerQuirks';
+import {
+  normalizeGenerationParamsForModel,
+  isAnthropicModel,
+  supportsPromptCaching,
+  supportsThinkingBlocks,
+  getAnthropicThinkingBudget,
+} from '../providerQuirks';
 
 const makeConfig = (overrides: Partial<LLMConfiguration> = {}): LLMConfiguration => ({
   model: 'gpt-4o',
@@ -135,6 +141,40 @@ describe('supportsThinkingBlocks', () => {
   });
 });
 
+describe('supportsPromptCaching', () => { // Pins which model/provider combinations supportsPromptCaching accepts.
+  it('returns true for supported Anthropic cacheable models', () => {
+    expect(supportsPromptCaching(makeConfig({
+      model: 'claude-sonnet-4-5-20250929', // model id with a date suffix
+      provider: 'anthropic',
+    }))).toBe(true);
+    expect(supportsPromptCaching(makeConfig({
+      model: 'claude-opus-4-7', // model id without a date suffix
+      provider: 'anthropic',
+    }))).toBe(true);
+  });
+
+  it('returns true for LiteLLM Anthropic routing with supported cacheable models', () => {
+    expect(supportsPromptCaching(makeConfig({
+      model: 'anthropic/claude-3-5-sonnet-20241022', // model id carrying an "anthropic/" prefix
+      provider: 'litellm_proxy',
+    }))).toBe(true);
+  });
+
+  it('returns false for Anthropic models outside the prompt-cache allowlist', () => {
+    expect(supportsPromptCaching(makeConfig({
+      model: 'claude-2.1', // an Anthropic provider alone is not sufficient; the model must also qualify
+      provider: 'anthropic',
+    }))).toBe(false);
+  });
+
+  it('returns false for non-Anthropic models', () => {
+    expect(supportsPromptCaching(makeConfig({
+      model: 'gpt-4o',
+      provider: 'openai',
+    }))).toBe(false);
+  });
+});
+
 describe('getAnthropicThinkingBudget', () => {
   it('returns undefined for non-Anthropic models', () => {
     expect(getAnthropicThinkingBudget(makeConfig({

diff --git a/packages/agent-sdk/src/sdk/llm/__tests__/thinkingBlocks.test.ts b/packages/agent-sdk/src/sdk/llm/__tests__/thinkingBlocks.test.ts
@@ -371,7 +371,7 @@ describe('AnthropicClient thinking blocks', () => {
     expect(toolResultMsg).toBeDefined();
 
     const toolResultBlock = toolResultMsg.content.find((b: { type: string }) => b.type === 'tool_result');
-    expect(toolResultBlock).toEqual({
+    expect(toolResultBlock).toMatchObject({
       type: 'tool_result',
       tool_use_id: 'call_1',
       content: 'hi',

diff --git a/packages/agent-sdk/src/sdk/llm/anthropic.ts b/packages/agent-sdk/src/sdk/llm/anthropic.ts
@@ -1,8 +1,11 @@
 import { reduceTextContent, DEFAULT_RETRY_OPTIONS, DEFAULT_TIMEOUT_MS, type ChatCompletionRequest, type LLMClient, type LLMConfiguration, type LLMStreamChunk, type LLMToolDefinition, type RetryOptions, type ToolCallAccumulator } from './types';
-import { getAnthropicThinkingBudget } from './providerQuirks';
+import { getAnthropicThinkingBudget, supportsPromptCaching } from './providerQuirks';
 import { NonRetryableHttpStatusError, requestWithRetry } from './httpRetry';
 
 const decoder = new TextDecoder();
+const EPHEMERAL_CACHE_CONTROL = { type: 'ephemeral' } as const; // marker object attached as `cache_control` to blocks that should be cached
+
+type AnthropicCacheControl = typeof EPHEMERAL_CACHE_CONTROL; // derived from the constant so the type always matches the value
 
 // Anthropic content block types
 type AnthropicThinkingBlock = {
@@ -14,6 +17,7 @@ type AnthropicThinkingBlock = {
 type AnthropicTextBlock = {
   type: 'text';
   text: string;
+  cache_control?: AnthropicCacheControl; // optional cache marker; present only on blocks marked for caching
 };
 
 type AnthropicToolUseBlock = {
@@ -27,11 +31,13 @@ type AnthropicToolResultBlock = {
   type: 'tool_result';
   tool_use_id: string;
   content: string;
+  cache_control?: AnthropicCacheControl;
 };
 
 type AnthropicImageBlock = {
   type: 'image';
   source: { type: 'base64'; media_type: string; data: string };
+  cache_control?: AnthropicCacheControl; // optional cache marker; set when an image is the last block of the cached user message
 };
 
 type AnthropicContentBlock =
@@ -137,10 +143,25 @@ const parseBase64DataUrl = (url: string): { mediaType: string; base64: string }
   return { mediaType: match[1].toLowerCase(), base64: match[2] };
 };
 
-const toAnthropicMessages = (request: ChatCompletionRequest): AnthropicMessage[] => {
+const toAnthropicMessages = (
+  request: ChatCompletionRequest,
+  options?: { cacheLastMessage?: boolean },
+): AnthropicMessage[] => {
   const result: AnthropicMessage[] = [];
+  const lastCacheableMessageIndex = options?.cacheLastMessage
+    ? (() => {
+        for (let index = request.messages.length - 1; index >= 0; index -= 1) {
+          const role = request.messages[index]?.role;
+          if (role === 'user' || role === 'tool') {
+            return index;
+          }
+        }
+        return -1;
+      })()
+    : -1;
 
-  for (const message of request.messages) {
+  for (const [index, message] of request.messages.entries()) {
+    const shouldCacheMessage = index === lastCacheableMessageIndex;
     if (message.role === 'user') {
       const contentBlocks: AnthropicContentBlock[] = [];
       for (const part of message.content) {
@@ -162,6 +183,14 @@ const toAnthropicMessages = (request: ChatCompletionRequest): AnthropicMessage[]
       if (contentBlocks.length === 0) {
         contentBlocks.push({ type: 'text', text: '' });
       }
+      const lastBlock = contentBlocks.at(-1);
+      if (
+        shouldCacheMessage &&
+        lastBlock &&
+        (lastBlock.type === 'text' || lastBlock.type === 'image')
+      ) {
+        lastBlock.cache_control = EPHEMERAL_CACHE_CONTROL;
+      }
       result.push({ role: 'user', content: contentBlocks });
     } else if (message.role === 'assistant') {
       // Assistant messages: may have thinking + tool_use
@@ -218,6 +247,7 @@ const toAnthropicMessages = (request: ChatCompletionRequest): AnthropicMessage[]
         type: 'tool_result',
         tool_use_id: message.tool_call_id ?? '',
         content: reduceTextContent(message),
+        ...(shouldCacheMessage ? { cache_control: EPHEMERAL_CACHE_CONTROL } : {}),
       };
 
       if (lastMessage?.role === 'user') {
@@ -411,15 +441,30 @@ export class AnthropicClient implements LLMClient {
   private requestBody(request: ChatCompletionRequest): Record<string, unknown> {
     const anthropicTools = toAnthropicTools(request.tools);
     const thinkingBudget = getAnthropicThinkingBudget(this.config);
+    const cacheableSystemPrompt =
+      typeof request.cacheableSystemPrompt === 'string' && request.cacheableSystemPrompt.trim()
+        ? request.cacheableSystemPrompt
+        : request.systemPrompt;
+    const dynamicSystemPrompt =
+      typeof request.dynamicSystemPrompt === 'string' && request.dynamicSystemPrompt.trim()
+        ? request.dynamicSystemPrompt
+        : undefined;
+    const promptCachingEnabled = supportsPromptCaching(this.config);
+    const system = promptCachingEnabled
+      ? [
+          { type: 'text' as const, text: cacheableSystemPrompt, cache_control: EPHEMERAL_CACHE_CONTROL },
+          ...(dynamicSystemPrompt ? [{ type: 'text' as const, text: dynamicSystemPrompt }] : []),
+        ]
+      : [{ type: 'text' as const, text: request.systemPrompt }];
 
     return {
       model: this.config.model,
       max_tokens: this.config.maxOutputTokens ?? 16000,
       // Note: temperature is normalized by providerQuirks.normalizeGenerationParamsForModel()
       // which sets temperature=1 when thinking is enabled (Anthropic requirement)
       temperature: this.config.temperature ?? 0,
-      system: [{ type: 'text', text: request.systemPrompt }],
-      messages: toAnthropicMessages(request),
+      system,
+      messages: toAnthropicMessages(request, { cacheLastMessage: promptCachingEnabled }),
       stream: true,
       ...(anthropicTools ? { tools: anthropicTools, tool_choice: { type: 'auto' } } : {}),
       thinking: thinkingBudget !== undefined