diff --git a/py/src/braintrust/integrations/anthropic/_utils.py b/py/src/braintrust/integrations/anthropic/_utils.py
index df5a412d..c485cbec 100644
--- a/py/src/braintrust/integrations/anthropic/_utils.py
+++ b/py/src/braintrust/integrations/anthropic/_utils.py
@@ -27,11 +27,6 @@ def __getattr__(self, name: str) -> Any:
     ("ephemeral_1h_input_tokens", "prompt_cache_creation_ephemeral_1h_tokens"),
 )
 
-_ANTHROPIC_SERVER_TOOL_USE_METRIC_FIELDS = (
-    ("web_search_requests", "server_tool_use_web_search_requests"),
-    ("web_fetch_requests", "server_tool_use_web_fetch_requests"),
-)
-
 _ANTHROPIC_USAGE_METADATA_FIELDS = frozenset(
     {
         "service_tier",
@@ -95,8 +90,8 @@ def extract_anthropic_usage(usage: Any) -> tuple[dict[str, float], dict[str, Any
 
     server_tool_use = _try_to_dict(usage.get("server_tool_use"))
     if server_tool_use is not None:
-        for source_name, metric_name in _ANTHROPIC_SERVER_TOOL_USE_METRIC_FIELDS:
-            _set_numeric_metric(metrics, metric_name, server_tool_use.get(source_name))
+        for source_name, value in server_tool_use.items():
+            _set_numeric_metric(metrics, f"server_tool_use_{source_name}", value)
 
     if "prompt_cache_creation_tokens" not in metrics and cache_creation_breakdown:
         metrics["prompt_cache_creation_tokens"] = sum(cache_creation_breakdown)
diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py
index e187365a..d2db7719 100644
--- a/py/src/braintrust/integrations/anthropic/test_anthropic.py
+++ b/py/src/braintrust/integrations/anthropic/test_anthropic.py
@@ -53,6 +53,10 @@ def test_log_message_to_span_includes_stop_reason_and_stop_sequence():
             "output_tokens": 7,
             "cache_read_input_tokens": 0,
             "cache_creation_input_tokens": 0,
+            "server_tool_use": {
+                "web_search_requests": 2,
+                "web_fetch_requests": 1,
+            },
         },
     )
 
@@ -71,6 +75,8 @@ def test_log_message_to_span_includes_stop_reason_and_stop_sequence():
             "completion_tokens": 7.0,
             "prompt_cached_tokens": 0.0,
             "prompt_cache_creation_tokens": 0.0,
+            "server_tool_use_web_search_requests": 2.0,
+            "server_tool_use_web_fetch_requests": 1.0,
             "tokens": 18.0,
             "time_to_first_token": 0.123,
         },
@@ -78,6 +84,34 @@
     )
 
 
+def test_extract_anthropic_usage_includes_server_tool_use_metrics_from_objects():
+    usage = SimpleNamespace(
+        input_tokens=11,
+        output_tokens=7,
+        cache_read_input_tokens=3,
+        cache_creation_input_tokens=2,
+        server_tool_use=SimpleNamespace(
+            web_search_requests=2,
+            web_fetch_requests=1,
+            code_execution_requests=4,
+        ),
+    )
+
+    metrics, metadata = extract_anthropic_usage(usage)
+
+    assert metrics == {
+        "prompt_tokens": 16.0,
+        "completion_tokens": 7.0,
+        "prompt_cached_tokens": 3.0,
+        "prompt_cache_creation_tokens": 2.0,
+        "server_tool_use_web_search_requests": 2.0,
+        "server_tool_use_web_fetch_requests": 1.0,
+        "server_tool_use_code_execution_requests": 4.0,
+        "tokens": 23.0,
+    }
+    assert metadata == {}
+
+
 @pytest.mark.vcr
 def test_anthropic_messages_create_stream_true(memory_logger):
     assert not memory_logger.pop()