@@ -190,6 +190,19 @@ export interface SpanLogger {
190190 reportProgress : ( progress : string ) => void ;
191191}
192192
/**
 * Payload delivered to the `onBillingEvent` callback after a proxied
 * inference request finishes, carrying token-usage counters for
 * billing/metering.
 */
export type BillingEvent = {
  /** Discriminator tag; the only event kind emitted here. */
  event_name: "NativeInferenceTokenUsageEvent";
  /** Auth token of the caller the usage is attributed to. */
  auth_token: string;
  /** Billing org id, forwarded from the `billingOrgId` proxy option. */
  org_id?: string;
  /** Model as requested by the caller — presumably the pre-resolution name; TODO confirm against caller. */
  model?: string | null;
  /** Model name reported back in the provider's response body, if any. */
  resolved_model?: string | null;
  /** Org name from the request header, or from the first secret's org_name when the header is absent. */
  org_name?: string;
  /** Prompt tokens consumed (`prompt_tokens` / `input_tokens`). */
  input_tokens?: number;
  /** Completion tokens produced (`completion_tokens` / `output_tokens`). */
  output_tokens?: number;
  /** Prompt tokens served from the provider cache (`prompt_tokens_details.cached_tokens`). */
  cached_input_tokens?: number;
  /** Prompt tokens written into the provider cache (`prompt_tokens_details.cache_creation_tokens`). */
  cache_write_input_tokens?: number;
};
193206// This is an isomorphic implementation of proxyV1, which is used by both edge functions
194207// in CloudFlare and by the node proxy (locally and in lambda).
195208export async function proxyV1 ( {
@@ -208,6 +221,8 @@ export async function proxyV1({
208221 cacheKeyOptions = { } ,
209222 decompressFetch = false ,
210223 spanLogger,
224+ billingOrgId,
225+ onBillingEvent,
211226 signal,
212227 fetch = globalThis . fetch ,
213228} : {
@@ -237,6 +252,8 @@ export async function proxyV1({
237252 cacheKeyOptions ?: CacheKeyOptions ;
238253 decompressFetch ?: boolean ;
239254 spanLogger ?: SpanLogger ;
255+ billingOrgId ?: string ;
256+ onBillingEvent ?: ( event : BillingEvent ) => void ;
240257 signal ?: AbortSignal ;
241258 fetch ?: FetchFn ;
242259} ) : Promise < void > {
@@ -299,6 +316,7 @@ export async function proxyV1({
299316 ) ;
300317
301318 let orgName : string | undefined = proxyHeaders [ ORG_NAME_HEADER ] ?? undefined ;
319+ let resolvedOrgName : string | undefined = orgName ;
302320 const projectId : string | undefined =
303321 proxyHeaders [ PROJECT_ID_HEADER ] ?? undefined ;
304322
@@ -649,6 +667,7 @@ export async function proxyV1({
649667
650668 if ( secrets . length > 0 && ! orgName && secrets [ 0 ] . org_name ) {
651669 baseAttributes . org_name = secrets [ 0 ] . org_name ;
670+ resolvedOrgName = secrets [ 0 ] . org_name ;
652671 }
653672 logRequest ( ) ;
654673
@@ -759,6 +778,11 @@ export async function proxyV1({
759778 if ( stream ) {
760779 let first = true ;
761780 const allChunks : Uint8Array [ ] = [ ] ;
781+ let resolvedModel : string | undefined = undefined ;
782+ let inputTokens : number | undefined = undefined ;
783+ let outputTokens : number | undefined = undefined ;
784+ let cachedInputTokens : number | undefined = undefined ;
785+ let cacheWriteInputTokens : number | undefined = undefined ;
762786
763787 // These parameters are for the streaming case
764788 let reasoning : OpenAIReasoning [ ] | undefined = undefined ;
@@ -787,10 +811,20 @@ export async function proxyV1({
787811 | OpenAIChatCompletionChunk
788812 | undefined ;
789813 if ( result ) {
814+ if ( typeof result . model === "string" && result . model ) {
815+ resolvedModel = result . model ;
816+ }
790817 const extendedUsage = completionUsageSchema . safeParse (
791818 result . usage ,
792819 ) ;
793820 if ( extendedUsage . success ) {
821+ inputTokens = extendedUsage . data . prompt_tokens ;
822+ outputTokens = extendedUsage . data . completion_tokens ;
823+ cachedInputTokens =
824+ extendedUsage . data . prompt_tokens_details ?. cached_tokens ;
825+ cacheWriteInputTokens =
826+ extendedUsage . data . prompt_tokens_details
827+ ?. cache_creation_tokens ;
794828 spanLogger ?. log ( {
795829 metrics : {
796830 tokens : extendedUsage . data . total_tokens ,
@@ -978,10 +1012,20 @@ export async function proxyV1({
9781012 case "chat" :
9791013 case "completion" : {
9801014 const data = dataRaw as ChatCompletion ;
1015+ if ( typeof data . model === "string" && data . model ) {
1016+ resolvedModel = data . model ;
1017+ }
9811018 const extendedUsage = completionUsageSchema . safeParse (
9821019 data . usage ,
9831020 ) ;
9841021 if ( extendedUsage . success ) {
1022+ inputTokens = extendedUsage . data . prompt_tokens ;
1023+ outputTokens = extendedUsage . data . completion_tokens ;
1024+ cachedInputTokens =
1025+ extendedUsage . data . prompt_tokens_details ?. cached_tokens ;
1026+ cacheWriteInputTokens =
1027+ extendedUsage . data . prompt_tokens_details
1028+ ?. cache_creation_tokens ;
9851029 spanLogger ?. log ( {
9861030 output : data . choices ,
9871031 metrics : {
@@ -1041,6 +1085,15 @@ export async function proxyV1({
10411085 }
10421086 case "response" : {
10431087 const data = dataRaw as OpenAIResponse ;
1088+ if ( typeof data . model === "string" && data . model ) {
1089+ resolvedModel = data . model ;
1090+ }
1091+ if ( data . usage ) {
1092+ inputTokens = data . usage . input_tokens ;
1093+ outputTokens = data . usage . output_tokens ;
1094+ cachedInputTokens =
1095+ data . usage . input_tokens_details ?. cached_tokens ;
1096+ }
10441097 spanLogger ?. log ( {
10451098 output : data . output ,
10461099 metrics : {
@@ -1089,6 +1142,27 @@ export async function proxyV1({
10891142 } ) ;
10901143
10911144 spanLogger ?. end ( ) ;
1145+ if ( ! responseFailed ) {
1146+ try {
1147+ if ( typeof onBillingEvent !== "function" ) {
1148+ return ;
1149+ }
1150+ onBillingEvent ( {
1151+ event_name : "NativeInferenceTokenUsageEvent" ,
1152+ auth_token : authToken ,
1153+ org_id : billingOrgId ,
1154+ model,
1155+ resolved_model : resolvedModel ,
1156+ org_name : resolvedOrgName ,
1157+ input_tokens : inputTokens ,
1158+ output_tokens : outputTokens ,
1159+ cached_input_tokens : cachedInputTokens ,
1160+ cache_write_input_tokens : cacheWriteInputTokens ,
1161+ } ) ;
1162+ } catch ( error ) {
1163+ console . warn ( "billing callback failed" , error ) ;
1164+ }
1165+ }
10921166 controller . terminate ( ) ;
10931167 } ,
10941168 } ) ;
0 commit comments