From dfb2f09dedae3e7671588196de67e615275d3881 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Fri, 3 Apr 2026 22:35:21 -0400 Subject: [PATCH] fix(anthropic): move ephemeral cache creation fields to metadata The ephemeral cache creation breakdown fields (ephemeral_5m_input_tokens, ephemeral_1h_input_tokens) were being recorded as metrics, which required adding them to the standard metrics allowlists. Move them to span metadata instead, since they are informational breakdowns of the already-tracked prompt_cache_creation_tokens metric. This fixes the CI failure in test_standard_metrics_crawl where the integration tests reject unrecognized standard metric names. --- .../integrations/anthropic/_utils.py | 19 +++++++++---------- .../integrations/anthropic/test_anthropic.py | 10 ++++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/py/src/braintrust/integrations/anthropic/_utils.py b/py/src/braintrust/integrations/anthropic/_utils.py index c485cbec..cc0a0242 100644 --- a/py/src/braintrust/integrations/anthropic/_utils.py +++ b/py/src/braintrust/integrations/anthropic/_utils.py @@ -22,9 +22,9 @@ def __getattr__(self, name: str) -> Any: ("cache_creation_input_tokens", "prompt_cache_creation_tokens"), ) -_ANTHROPIC_CACHE_CREATION_METRIC_FIELDS = ( - ("ephemeral_5m_input_tokens", "prompt_cache_creation_ephemeral_5m_tokens"), - ("ephemeral_1h_input_tokens", "prompt_cache_creation_ephemeral_1h_tokens"), +_ANTHROPIC_CACHE_CREATION_METADATA_FIELDS = ( + ("ephemeral_5m_input_tokens", "cache_creation_ephemeral_5m_input_tokens"), + ("ephemeral_1h_input_tokens", "cache_creation_ephemeral_1h_input_tokens"), ) _ANTHROPIC_USAGE_METADATA_FIELDS = frozenset( @@ -76,16 +76,17 @@ def extract_anthropic_usage(usage: Any) -> tuple[dict[str, float], dict[str, Any return {}, {} metrics: dict[str, float] = {} + metadata: dict[str, Any] = {} for source_name, metric_name in _ANTHROPIC_USAGE_METRIC_FIELDS: _set_numeric_metric(metrics, metric_name, usage.get(source_name)) cache_creation = _try_to_dict(usage.get("cache_creation")) cache_creation_breakdown: list[float] = [] if cache_creation is not None: - for source_name, metric_name in _ANTHROPIC_CACHE_CREATION_METRIC_FIELDS: + for source_name, metadata_key in _ANTHROPIC_CACHE_CREATION_METADATA_FIELDS: value = cache_creation.get(source_name) - _set_numeric_metric(metrics, metric_name, value) if is_numeric(value): + metadata[metadata_key] = int(value) cache_creation_breakdown.append(float(value)) server_tool_use = _try_to_dict(usage.get("server_tool_use")) @@ -105,9 +106,7 @@ def extract_anthropic_usage(usage: Any) -> tuple[dict[str, float], dict[str, Any metrics["prompt_tokens"] = total_prompt_tokens metrics["tokens"] = total_prompt_tokens + metrics.get("completion_tokens", 0) - metadata = { - f"usage_{name}": value - for name, value in usage.items() - if name in _ANTHROPIC_USAGE_METADATA_FIELDS and value is not None - } + for name, value in usage.items(): + if name in _ANTHROPIC_USAGE_METADATA_FIELDS and value is not None: + metadata[f"usage_{name}"] = value return metrics, metadata diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py index d2db7719..7d54a533 100644 --- a/py/src/braintrust/integrations/anthropic/test_anthropic.py +++ b/py/src/braintrust/integrations/anthropic/test_anthropic.py @@ -608,8 +608,8 @@ def test_setup_creates_spans(memory_logger): ) assert metrics["completion_tokens"] == usage.output_tokens assert metrics["prompt_cache_creation_tokens"] == usage.cache_creation_input_tokens - assert metrics["prompt_cache_creation_ephemeral_5m_tokens"] == ephemeral_5m - assert metrics["prompt_cache_creation_ephemeral_1h_tokens"] == ephemeral_1h + assert span["metadata"]["cache_creation_ephemeral_5m_input_tokens"] == ephemeral_5m + assert span["metadata"]["cache_creation_ephemeral_1h_input_tokens"] == ephemeral_1h assert "service_tier" not in metrics @@ -634,12 +634,14 @@ def test_extract_anthropic_usage_preserves_nested_numeric_fields(): assert metrics["completion_tokens"] == 12 assert metrics["tokens"] == 27 assert metrics["prompt_cache_creation_tokens"] == 7 - assert metrics["prompt_cache_creation_ephemeral_5m_tokens"] == 3 - assert metrics["prompt_cache_creation_ephemeral_1h_tokens"] == 4 + assert metadata["cache_creation_ephemeral_5m_input_tokens"] == 3 + assert metadata["cache_creation_ephemeral_1h_input_tokens"] == 4 assert metrics["server_tool_use_web_search_requests"] == 2 assert metrics["server_tool_use_web_fetch_requests"] == 1 assert "service_tier" not in metrics assert metadata == { + "cache_creation_ephemeral_5m_input_tokens": 3, + "cache_creation_ephemeral_1h_input_tokens": 4, "usage_service_tier": "standard", "usage_inference_geo": "not_available", }