From dfb2f09dedae3e7671588196de67e615275d3881 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Fri, 3 Apr 2026 22:35:21 -0400
Subject: [PATCH] fix(anthropic): move ephemeral cache creation fields to
 metadata

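The ephemeral cache creation breakdown fields
(ephemeral_5m_input_tokens, ephemeral_1h_input_tokens) were being
recorded as metrics, which required adding them to the standard
metrics allowlists. Move them to span metadata instead (as
cache_creation_ephemeral_5m_input_tokens and
cache_creation_ephemeral_1h_input_tokens), since they are
informational breakdowns of the already-tracked
prompt_cache_creation_tokens metric. The former
prompt_cache_creation_ephemeral_{5m,1h}_tokens metrics are no longer
emitted.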

This fixes the CI failure in test_standard_metrics_crawl where the
integration tests reject unrecognized standard metric names.
---
 .../integrations/anthropic/_utils.py          | 19 +++++++++----------
 .../integrations/anthropic/test_anthropic.py  | 10 ++++++----
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/py/src/braintrust/integrations/anthropic/_utils.py b/py/src/braintrust/integrations/anthropic/_utils.py
index c485cbec..cc0a0242 100644
--- a/py/src/braintrust/integrations/anthropic/_utils.py
+++ b/py/src/braintrust/integrations/anthropic/_utils.py
@@ -22,9 +22,9 @@ def __getattr__(self, name: str) -> Any:
     ("cache_creation_input_tokens", "prompt_cache_creation_tokens"),
 )
 
-_ANTHROPIC_CACHE_CREATION_METRIC_FIELDS = (
-    ("ephemeral_5m_input_tokens", "prompt_cache_creation_ephemeral_5m_tokens"),
-    ("ephemeral_1h_input_tokens", "prompt_cache_creation_ephemeral_1h_tokens"),
+_ANTHROPIC_CACHE_CREATION_METADATA_FIELDS = (
+    ("ephemeral_5m_input_tokens", "cache_creation_ephemeral_5m_input_tokens"),
+    ("ephemeral_1h_input_tokens", "cache_creation_ephemeral_1h_input_tokens"),
 )
 
 _ANTHROPIC_USAGE_METADATA_FIELDS = frozenset(
@@ -76,16 +76,17 @@ def extract_anthropic_usage(usage: Any) -> tuple[dict[str, float], dict[str, Any
         return {}, {}
 
     metrics: dict[str, float] = {}
+    metadata: dict[str, Any] = {}
     for source_name, metric_name in _ANTHROPIC_USAGE_METRIC_FIELDS:
         _set_numeric_metric(metrics, metric_name, usage.get(source_name))
 
     cache_creation = _try_to_dict(usage.get("cache_creation"))
     cache_creation_breakdown: list[float] = []
     if cache_creation is not None:
-        for source_name, metric_name in _ANTHROPIC_CACHE_CREATION_METRIC_FIELDS:
+        for source_name, metadata_key in _ANTHROPIC_CACHE_CREATION_METADATA_FIELDS:
             value = cache_creation.get(source_name)
-            _set_numeric_metric(metrics, metric_name, value)
             if is_numeric(value):
+                metadata[metadata_key] = int(value)
                 cache_creation_breakdown.append(float(value))
 
     server_tool_use = _try_to_dict(usage.get("server_tool_use"))
@@ -105,9 +106,7 @@ def extract_anthropic_usage(usage: Any) -> tuple[dict[str, float], dict[str, Any
         metrics["prompt_tokens"] = total_prompt_tokens
         metrics["tokens"] = total_prompt_tokens + metrics.get("completion_tokens", 0)
 
-    metadata = {
-        f"usage_{name}": value
-        for name, value in usage.items()
-        if name in _ANTHROPIC_USAGE_METADATA_FIELDS and value is not None
-    }
+    for name, value in usage.items():
+        if name in _ANTHROPIC_USAGE_METADATA_FIELDS and value is not None:
+            metadata[f"usage_{name}"] = value
     return metrics, metadata
diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py
index d2db7719..7d54a533 100644
--- a/py/src/braintrust/integrations/anthropic/test_anthropic.py
+++ b/py/src/braintrust/integrations/anthropic/test_anthropic.py
@@ -608,8 +608,8 @@ def test_setup_creates_spans(memory_logger):
     )
     assert metrics["completion_tokens"] == usage.output_tokens
     assert metrics["prompt_cache_creation_tokens"] == usage.cache_creation_input_tokens
-    assert metrics["prompt_cache_creation_ephemeral_5m_tokens"] == ephemeral_5m
-    assert metrics["prompt_cache_creation_ephemeral_1h_tokens"] == ephemeral_1h
+    assert span["metadata"]["cache_creation_ephemeral_5m_input_tokens"] == ephemeral_5m
+    assert span["metadata"]["cache_creation_ephemeral_1h_input_tokens"] == ephemeral_1h
     assert "service_tier" not in metrics
 
 
@@ -634,12 +634,14 @@ def test_extract_anthropic_usage_preserves_nested_numeric_fields():
     assert metrics["completion_tokens"] == 12
     assert metrics["tokens"] == 27
     assert metrics["prompt_cache_creation_tokens"] == 7
-    assert metrics["prompt_cache_creation_ephemeral_5m_tokens"] == 3
-    assert metrics["prompt_cache_creation_ephemeral_1h_tokens"] == 4
+    assert metadata["cache_creation_ephemeral_5m_input_tokens"] == 3
+    assert metadata["cache_creation_ephemeral_1h_input_tokens"] == 4
     assert metrics["server_tool_use_web_search_requests"] == 2
     assert metrics["server_tool_use_web_fetch_requests"] == 1
     assert "service_tier" not in metrics
     assert metadata == {
+        "cache_creation_ephemeral_5m_input_tokens": 3,
+        "cache_creation_ephemeral_1h_input_tokens": 4,
         "usage_service_tier": "standard",
         "usage_inference_geo": "not_available",
     }