Skip to content

Commit e03dead

Browse files
committed
fix(anthropic): capture server-side tool usage metrics
Flatten Anthropic usage.server_tool_use into Braintrust span metrics so server-side tool invocations are preserved for tracing and cost analysis. Add regression tests using a red/green workflow to cover dict-backed span logging and object-backed usage extraction. Fixes #171
1 parent a656bab commit e03dead

File tree

2 files changed

+81
-32
lines changed

2 files changed

+81
-32
lines changed

py/src/braintrust/integrations/anthropic/_utils.py

Lines changed: 50 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Shared utilities for Anthropic API wrappers."""
22

3+
from collections.abc import Iterator
34
from typing import Any
45

56

@@ -13,6 +14,41 @@ def __getattr__(self, name: str) -> Any:
1314
return getattr(self.__wrapped, name)
1415

1516

17+
def _get_usage_value(usage: Any, key: str) -> Any:
18+
if isinstance(usage, dict):
19+
return usage.get(key)
20+
return getattr(usage, key, None)
21+
22+
23+
def _iter_usage_items(usage: Any) -> Iterator[tuple[str, Any]]:
24+
if isinstance(usage, dict):
25+
yield from usage.items()
26+
return
27+
28+
try:
29+
yield from vars(usage).items()
30+
return
31+
except TypeError:
32+
pass
33+
34+
for key in dir(usage):
35+
if key.startswith("_"):
36+
continue
37+
value = getattr(usage, key, None)
38+
if callable(value):
39+
continue
40+
yield key, value
41+
42+
43+
def _maybe_add_metric(metrics: dict[str, float], key: str, value: Any) -> None:
44+
if value is None or isinstance(value, bool):
45+
return
46+
try:
47+
metrics[key] = float(value)
48+
except (ValueError, TypeError):
49+
pass
50+
51+
1652
def extract_anthropic_usage(usage: Any) -> dict[str, float]:
    """Extract and normalize usage metrics from Anthropic usage object or dict.

    Maps Anthropic field names onto Braintrust metric names:

    - prompt_tokens (from input_tokens)
    - completion_tokens (from output_tokens)
    - prompt_cached_tokens (from cache_read_input_tokens)
    - prompt_cache_creation_tokens (from cache_creation_input_tokens)
    - server_tool_use_* (flattened from server_tool_use)

    Missing or non-numeric fields are skipped; a falsy *usage* yields {}.
    """
    metrics: dict[str, float] = {}

    if not usage:
        return metrics

    _maybe_add_metric(metrics, "prompt_tokens", _get_usage_value(usage, "input_tokens"))
    _maybe_add_metric(metrics, "completion_tokens", _get_usage_value(usage, "output_tokens"))
    _maybe_add_metric(metrics, "prompt_cached_tokens", _get_usage_value(usage, "cache_read_input_tokens"))
    _maybe_add_metric(
        metrics,
        "prompt_cache_creation_tokens",
        _get_usage_value(usage, "cache_creation_input_tokens"),
    )

    # Server-side tool invocations (e.g. web search) arrive nested one level
    # deep; flatten them so each counter surfaces as an individual metric.
    server_tool_use = _get_usage_value(usage, "server_tool_use")
    if server_tool_use:
        for key, value in _iter_usage_items(server_tool_use):
            _maybe_add_metric(metrics, f"server_tool_use_{key}", value)

    return metrics
7189

py/src/braintrust/integrations/anthropic/test_anthropic.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pytest
1212
from braintrust import logger
1313
from braintrust.integrations.anthropic import AnthropicIntegration, wrap_anthropic
14+
from braintrust.integrations.anthropic._utils import extract_anthropic_usage
1415
from braintrust.integrations.anthropic.tracing import _log_message_to_span
1516
from braintrust.test_helpers import init_test_logger
1617

@@ -52,6 +53,10 @@ def test_log_message_to_span_includes_stop_reason_and_stop_sequence():
5253
"output_tokens": 7,
5354
"cache_read_input_tokens": 0,
5455
"cache_creation_input_tokens": 0,
56+
"server_tool_use": {
57+
"web_search_requests": 2,
58+
"web_fetch_requests": 1,
59+
},
5560
},
5661
)
5762

@@ -70,12 +75,38 @@ def test_log_message_to_span_includes_stop_reason_and_stop_sequence():
7075
"completion_tokens": 7.0,
7176
"prompt_cached_tokens": 0.0,
7277
"prompt_cache_creation_tokens": 0.0,
78+
"server_tool_use_web_search_requests": 2.0,
79+
"server_tool_use_web_fetch_requests": 1.0,
7380
"tokens": 18.0,
7481
"time_to_first_token": 0.123,
7582
},
7683
)
7784

7885

86+
def test_extract_anthropic_usage_includes_server_tool_use_metrics_from_objects():
    """Object-backed server_tool_use attributes are flattened into span metrics."""
    server_tools = SimpleNamespace(
        web_search_requests=2,
        web_fetch_requests=1,
        code_execution_requests=4,
    )
    usage = SimpleNamespace(
        input_tokens=11,
        output_tokens=7,
        cache_read_input_tokens=3,
        cache_creation_input_tokens=2,
        server_tool_use=server_tools,
    )

    expected = {
        "prompt_tokens": 11.0,
        "completion_tokens": 7.0,
        "prompt_cached_tokens": 3.0,
        "prompt_cache_creation_tokens": 2.0,
        "server_tool_use_web_search_requests": 2.0,
        "server_tool_use_web_fetch_requests": 1.0,
        "server_tool_use_code_execution_requests": 4.0,
    }
    assert extract_anthropic_usage(usage) == expected
108+
109+
79110
@pytest.mark.vcr
80111
def test_anthropic_messages_create_stream_true(memory_logger):
81112
assert not memory_logger.pop()

0 commit comments

Comments
 (0)