Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@
"lint:cycles": "node scripts/check-circular-deps.mjs",
"lint:duplication": "node scripts/check-duplication.mjs",
"lint:fix": "eslint . --fix",
"smoke:anthropic-cache": "npm run build -w @smolpaws/agent-sdk && node scripts/anthropic-cache-smoke.mjs",
"build:webview": "node esbuild.webview.mjs",
"agent-server": "bash scripts/start-agent-server.sh",
"agent-server:prepare": "PREPARE=1 bash scripts/start-agent-server.sh",
Expand Down
202 changes: 202 additions & 0 deletions packages/agent-sdk/src/sdk/llm/__tests__/promptCaching.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import { afterEach, describe, expect, it, vi } from 'vitest';
import { LLMStreamer } from '../../runtime';
import { AnthropicClient, OpenAICompatibleClient } from '../index';
import type { ChatCompletionRequest, LLMConfiguration } from '../types';

// Shared encoder used to turn the canned SSE text fixtures into bytes for the stubbed response body.
const encoder = new TextEncoder();
// Expected `cache_control` value that the assertions below look for on cached blocks/messages.
const EPHEMERAL_CACHE_CONTROL = { type: 'ephemeral' };

/**
 * Builds a streaming `Response` whose body emits `payload` as a single encoded
 * chunk and then closes, labelled as `text/event-stream`.
 *
 * @param payload - Raw text to deliver as the response body.
 * @param status - HTTP status for the response (defaults to 200).
 * @returns A `Response` backed by a one-shot `ReadableStream`.
 */
const createStreamResponse = (payload: string, status = 200): Response => {
  const singleChunkBody = new ReadableStream({
    start(controller) {
      // Deliver everything at once, then signal end-of-stream.
      const bytes = encoder.encode(payload);
      controller.enqueue(bytes);
      controller.close();
    },
  });
  const responseInit = { status, headers: { 'content-type': 'text/event-stream' } };
  return new Response(singleChunkBody, responseInit);
};

// Canned Anthropic SSE payload: one text delta ("Done") followed by a message_delta
// carrying the `end_turn` stop reason. Written as a literal so the wire text is visible as-is.
const anthropicSse = `event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Done"}}

event: message_delta
data: {"type":"message_delta","delta":{"stop_reason":"end_turn"}}
`;

// Canned OpenAI-compatible SSE payload: one content delta ("Done"), one chunk carrying
// `finish_reason: "stop"`, then the `[DONE]` sentinel.
// Every event is terminated by a blank line, which is what delimits events in the SSE wire
// format. Without these separators the three `data:` lines form a single multi-line event
// for a spec-following parser, so the fixture would not resemble a real streamed response.
const openAiSse = [
  'data: {"choices":[{"delta":{"content":"Done"}}]}',
  '',
  'data: {"choices":[{"delta":{},"finish_reason":"stop"}]}',
  '',
  'data: [DONE]',
  '',
  '',
].join('\n');

/**
 * Produces a chat request whose system prompt is supplied both as the combined
 * string and as separate cacheable/dynamic halves, with a single user "hello" message.
 *
 * @param overrides - Fields that replace the defaults (applied last, shallowly).
 * @returns A fresh request object on every call.
 */
const splitSystemPromptRequest = (
  overrides: Partial<ChatCompletionRequest> = {},
): ChatCompletionRequest => {
  // Built inside the function so each caller gets its own `messages` array.
  const defaults: ChatCompletionRequest = {
    systemPrompt: 'STATIC\n\nDYNAMIC',
    cacheableSystemPrompt: 'STATIC',
    dynamicSystemPrompt: 'DYNAMIC',
    messages: [{ role: 'user', content: [{ type: 'text', text: 'hello' }] }],
  };
  return { ...defaults, ...overrides };
};

// Restore everything mocked via `vi` after each test so the `fetch` spies installed
// by individual cases do not carry over into the next one.
afterEach(() => {
vi.restoreAllMocks();
});

describe('Anthropic prompt caching', () => {
  // Direct-Anthropic configuration shared by every case in this suite.
  const anthropicConfig: LLMConfiguration = {
    provider: 'anthropic',
    model: 'claude-opus-4-7',
  };

  /**
   * Stubs `fetch` with the canned Anthropic SSE stream, drives one chat turn
   * through an `AnthropicClient`, and returns the spy for inspection.
   */
  const runChatAgainstStub = async (request: ChatCompletionRequest) => {
    const fetchSpy = vi
      .spyOn(global, 'fetch')
      .mockResolvedValue(createStreamResponse(anthropicSse));
    const llmClient = new AnthropicClient(anthropicConfig, 'test-key');
    await new LLMStreamer(llmClient).runChat(request);
    return fetchSpy;
  };

  /** Parses the JSON body of the first recorded `fetch` call, or yields null when it is not a string. */
  const sentJsonBody = (fetchSpy: Awaited<ReturnType<typeof runChatAgainstStub>>) => {
    const requestInit = fetchSpy.mock.calls[0]?.[1] as { body?: unknown } | undefined;
    return typeof requestInit?.body === 'string' ? JSON.parse(requestInit.body) : null;
  };

  it('marks only the static system block and last user block for caching', async () => {
    const fetchSpy = await runChatAgainstStub(splitSystemPromptRequest());

    expect(fetchSpy).toHaveBeenCalledTimes(1);
    const payload = sentJsonBody(fetchSpy);

    // Static half carries the marker; dynamic half must not.
    const expectedSystem = [
      { type: 'text', text: 'STATIC', cache_control: EPHEMERAL_CACHE_CONTROL },
      { type: 'text', text: 'DYNAMIC' },
    ];
    expect(payload?.system).toEqual(expectedSystem);

    const expectedFirstMessage = {
      role: 'user',
      content: [{ type: 'text', text: 'hello', cache_control: EPHEMERAL_CACHE_CONTROL }],
    };
    expect(payload?.messages?.[0]).toMatchObject(expectedFirstMessage);
  });

  it('moves the cache marker to the tool_result block', async () => {
    // user -> assistant tool call -> tool output: the tool message is the last cacheable one.
    const bashCall = {
      id: 'call_1',
      type: 'function',
      function: { name: 'bash', arguments: '{"command":"echo hi"}' },
    };
    const fetchSpy = await runChatAgainstStub(
      splitSystemPromptRequest({
        messages: [
          { role: 'user', content: [{ type: 'text', text: 'hello' }] },
          { role: 'assistant', content: [], tool_calls: [bashCall] },
          { role: 'tool', content: [{ type: 'text', text: 'hi' }], tool_call_id: 'call_1' },
        ],
        tools: [{ type: 'function', function: { name: 'bash' } }],
      }),
    );

    expect(fetchSpy).toHaveBeenCalledTimes(1);
    const payload = sentJsonBody(fetchSpy);

    const expectedLastMessage = {
      role: 'user',
      content: [
        {
          type: 'tool_result',
          tool_use_id: 'call_1',
          content: 'hi',
          cache_control: EPHEMERAL_CACHE_CONTROL,
        },
      ],
    };
    expect(payload?.messages?.at(-1)).toEqual(expectedLastMessage);
  });
});

describe('OpenAI-compatible Anthropic prompt caching', () => {
  // Claude model reached through a LiteLLM proxy using the OpenAI-compatible client.
  const proxyConfig: LLMConfiguration = {
    provider: 'litellm_proxy',
    model: 'claude-opus-4-7',
    baseUrl: 'http://localhost:4000',
  };

  /**
   * Stubs `fetch` with the canned OpenAI-style SSE stream, drives one chat turn
   * through an `OpenAICompatibleClient`, and returns the spy for inspection.
   */
  const runChatAgainstStub = async (request: ChatCompletionRequest) => {
    const fetchSpy = vi
      .spyOn(global, 'fetch')
      .mockResolvedValue(createStreamResponse(openAiSse));
    const llmClient = new OpenAICompatibleClient(proxyConfig, 'test-key');
    await new LLMStreamer(llmClient).runChat(request);
    return fetchSpy;
  };

  /** Parses the JSON body of the first recorded `fetch` call, or yields null when it is not a string. */
  const sentJsonBody = (fetchSpy: Awaited<ReturnType<typeof runChatAgainstStub>>) => {
    const requestInit = fetchSpy.mock.calls[0]?.[1] as { body?: unknown } | undefined;
    return typeof requestInit?.body === 'string' ? JSON.parse(requestInit.body) : null;
  };

  it('marks only the static system block and last user block for caching', async () => {
    const fetchSpy = await runChatAgainstStub(splitSystemPromptRequest());

    expect(fetchSpy).toHaveBeenCalledTimes(1);
    const payload = sentJsonBody(fetchSpy);

    // The system prompt travels as the first message, split into static + dynamic parts.
    const expectedSystemMessage = {
      role: 'system',
      content: [
        { type: 'text', text: 'STATIC', cache_control: EPHEMERAL_CACHE_CONTROL },
        { type: 'text', text: 'DYNAMIC' },
      ],
    };
    expect(payload?.messages?.[0]).toEqual(expectedSystemMessage);

    const expectedUserMessage = {
      role: 'user',
      content: [{ type: 'text', text: 'hello', cache_control: EPHEMERAL_CACHE_CONTROL }],
    };
    expect(payload?.messages?.[1]).toMatchObject(expectedUserMessage);
  });

  it('moves the cache marker to the tool message level', async () => {
    // user -> assistant tool call -> tool output: the tool message is the last cacheable one.
    const bashCall = {
      id: 'call_1',
      type: 'function',
      function: { name: 'bash', arguments: '{"command":"echo hi"}' },
    };
    const fetchSpy = await runChatAgainstStub(
      splitSystemPromptRequest({
        messages: [
          { role: 'user', content: [{ type: 'text', text: 'hello' }] },
          { role: 'assistant', content: [{ type: 'text', text: '' }], tool_calls: [bashCall] },
          { role: 'tool', content: [{ type: 'text', text: 'hi' }], tool_call_id: 'call_1' },
        ],
        tools: [{ type: 'function', function: { name: 'bash' } }],
      }),
    );

    expect(fetchSpy).toHaveBeenCalledTimes(1);
    const payload = sentJsonBody(fetchSpy);

    const expectedLastMessage = {
      role: 'tool',
      content: 'hi',
      tool_call_id: 'call_1',
      cache_control: EPHEMERAL_CACHE_CONTROL,
    };
    expect(payload?.messages?.at(-1)).toEqual(expectedLastMessage);
  });
});
42 changes: 41 additions & 1 deletion packages/agent-sdk/src/sdk/llm/__tests__/providerQuirks.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import { describe, expect, it } from 'vitest';
import type { LLMConfiguration } from '../types';
import { normalizeGenerationParamsForModel, isAnthropicModel, supportsThinkingBlocks, getAnthropicThinkingBudget } from '../providerQuirks';
import {
normalizeGenerationParamsForModel,
isAnthropicModel,
supportsPromptCaching,
supportsThinkingBlocks,
getAnthropicThinkingBudget,
} from '../providerQuirks';

const makeConfig = (overrides: Partial<LLMConfiguration> = {}): LLMConfiguration => ({
model: 'gpt-4o',
Expand Down Expand Up @@ -135,6 +141,40 @@ describe('supportsThinkingBlocks', () => {
});
});

describe('supportsPromptCaching', () => {
  // Evaluates the predicate for a config built from the suite defaults plus `overrides`.
  const cachingSupportedFor = (overrides: Partial<LLMConfiguration>) =>
    supportsPromptCaching(makeConfig(overrides));

  it('returns true for supported Anthropic cacheable models', () => {
    const sonnet = cachingSupportedFor({
      model: 'claude-sonnet-4-5-20250929',
      provider: 'anthropic',
    });
    expect(sonnet).toBe(true);

    const opus = cachingSupportedFor({ model: 'claude-opus-4-7', provider: 'anthropic' });
    expect(opus).toBe(true);
  });

  it('returns true for LiteLLM Anthropic routing with supported cacheable models', () => {
    const viaProxy = cachingSupportedFor({
      model: 'anthropic/claude-3-5-sonnet-20241022',
      provider: 'litellm_proxy',
    });
    expect(viaProxy).toBe(true);
  });

  it('returns false for Anthropic models outside the prompt-cache allowlist', () => {
    const legacyModel = cachingSupportedFor({ model: 'claude-2.1', provider: 'anthropic' });
    expect(legacyModel).toBe(false);
  });

  it('returns false for non-Anthropic models', () => {
    const openAiModel = cachingSupportedFor({ model: 'gpt-4o', provider: 'openai' });
    expect(openAiModel).toBe(false);
  });
});

describe('getAnthropicThinkingBudget', () => {
it('returns undefined for non-Anthropic models', () => {
expect(getAnthropicThinkingBudget(makeConfig({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ describe('AnthropicClient thinking blocks', () => {
expect(toolResultMsg).toBeDefined();

const toolResultBlock = toolResultMsg.content.find((b: { type: string }) => b.type === 'tool_result');
expect(toolResultBlock).toEqual({
expect(toolResultBlock).toMatchObject({
type: 'tool_result',
tool_use_id: 'call_1',
content: 'hi',
Expand Down
55 changes: 50 additions & 5 deletions packages/agent-sdk/src/sdk/llm/anthropic.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import { reduceTextContent, DEFAULT_RETRY_OPTIONS, DEFAULT_TIMEOUT_MS, type ChatCompletionRequest, type LLMClient, type LLMConfiguration, type LLMStreamChunk, type LLMToolDefinition, type RetryOptions, type ToolCallAccumulator } from './types';
import { getAnthropicThinkingBudget } from './providerQuirks';
import { getAnthropicThinkingBudget, supportsPromptCaching } from './providerQuirks';
import { NonRetryableHttpStatusError, requestWithRetry } from './httpRetry';

// Module-level TextDecoder instance.
const decoder = new TextDecoder();
// Single shared `cache_control` marker value; `as const` keeps `type` as the literal 'ephemeral'
// rather than widening it to `string`.
const EPHEMERAL_CACHE_CONTROL = { type: 'ephemeral' } as const;

// Type of the marker above, used for the optional `cache_control` field on content blocks.
type AnthropicCacheControl = typeof EPHEMERAL_CACHE_CONTROL;

// Anthropic content block types
type AnthropicThinkingBlock = {
Expand All @@ -14,6 +17,7 @@ type AnthropicThinkingBlock = {
/** Text content block sent to Anthropic. */
type AnthropicTextBlock = {
type: 'text';
text: string;
// Optional cache marker; omitted unless this block is chosen as a cache point.
cache_control?: AnthropicCacheControl;
};

type AnthropicToolUseBlock = {
Expand All @@ -27,11 +31,13 @@ type AnthropicToolResultBlock = {
type: 'tool_result';
tool_use_id: string;
content: string;
cache_control?: AnthropicCacheControl;
};

/** Image content block carrying base64-encoded image data plus its media type. */
type AnthropicImageBlock = {
type: 'image';
source: { type: 'base64'; media_type: string; data: string };
// Optional cache marker; omitted unless this block is chosen as a cache point.
cache_control?: AnthropicCacheControl;
};

type AnthropicContentBlock =
Expand Down Expand Up @@ -137,10 +143,25 @@ const parseBase64DataUrl = (url: string): { mediaType: string; base64: string }
return { mediaType: match[1].toLowerCase(), base64: match[2] };
};

const toAnthropicMessages = (request: ChatCompletionRequest): AnthropicMessage[] => {
const toAnthropicMessages = (
request: ChatCompletionRequest,
options?: { cacheLastMessage?: boolean },
): AnthropicMessage[] => {
const result: AnthropicMessage[] = [];
const lastCacheableMessageIndex = options?.cacheLastMessage
? (() => {
for (let index = request.messages.length - 1; index >= 0; index -= 1) {
const role = request.messages[index]?.role;
if (role === 'user' || role === 'tool') {
return index;
}
}
return -1;
})()
: -1;

for (const message of request.messages) {
for (const [index, message] of request.messages.entries()) {
const shouldCacheMessage = index === lastCacheableMessageIndex;
if (message.role === 'user') {
const contentBlocks: AnthropicContentBlock[] = [];
for (const part of message.content) {
Expand All @@ -162,6 +183,14 @@ const toAnthropicMessages = (request: ChatCompletionRequest): AnthropicMessage[]
if (contentBlocks.length === 0) {
contentBlocks.push({ type: 'text', text: '' });
}
const lastBlock = contentBlocks.at(-1);
if (
shouldCacheMessage &&
lastBlock &&
(lastBlock.type === 'text' || lastBlock.type === 'image')
) {
lastBlock.cache_control = EPHEMERAL_CACHE_CONTROL;
}
result.push({ role: 'user', content: contentBlocks });
} else if (message.role === 'assistant') {
// Assistant messages: may have thinking + tool_use
Expand Down Expand Up @@ -218,6 +247,7 @@ const toAnthropicMessages = (request: ChatCompletionRequest): AnthropicMessage[]
type: 'tool_result',
tool_use_id: message.tool_call_id ?? '',
content: reduceTextContent(message),
...(shouldCacheMessage ? { cache_control: EPHEMERAL_CACHE_CONTROL } : {}),
};

if (lastMessage?.role === 'user') {
Expand Down Expand Up @@ -411,15 +441,30 @@ export class AnthropicClient implements LLMClient {
private requestBody(request: ChatCompletionRequest): Record<string, unknown> {
const anthropicTools = toAnthropicTools(request.tools);
const thinkingBudget = getAnthropicThinkingBudget(this.config);
const cacheableSystemPrompt =
typeof request.cacheableSystemPrompt === 'string' && request.cacheableSystemPrompt.trim()
? request.cacheableSystemPrompt
: request.systemPrompt;
const dynamicSystemPrompt =
typeof request.dynamicSystemPrompt === 'string' && request.dynamicSystemPrompt.trim()
? request.dynamicSystemPrompt
: undefined;
const promptCachingEnabled = supportsPromptCaching(this.config);
const system = promptCachingEnabled
? [
{ type: 'text' as const, text: cacheableSystemPrompt, cache_control: EPHEMERAL_CACHE_CONTROL },
...(dynamicSystemPrompt ? [{ type: 'text' as const, text: dynamicSystemPrompt }] : []),
]
: [{ type: 'text' as const, text: request.systemPrompt }];

return {
model: this.config.model,
max_tokens: this.config.maxOutputTokens ?? 16000,
// Note: temperature is normalized by providerQuirks.normalizeGenerationParamsForModel()
// which sets temperature=1 when thinking is enabled (Anthropic requirement)
temperature: this.config.temperature ?? 0,
system: [{ type: 'text', text: request.systemPrompt }],
messages: toAnthropicMessages(request),
system,
messages: toAnthropicMessages(request, { cacheLastMessage: promptCachingEnabled }),
stream: true,
...(anthropicTools ? { tools: anthropicTools, tool_choice: { type: 'auto' } } : {}),
thinking: thinkingBudget !== undefined
Expand Down
Loading
Loading