From 073649ea8953b9c322718c85870edbc0c93f3272 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Tue, 17 Mar 2026 07:45:28 -0700 Subject: [PATCH 1/5] Initial commit Signed-off-by: Vinay Damodaran --- .../openai/test_request_stats_headers.py | 68 +++++++++++++++++++ .../openai/chat_completion/api_router.py | 9 ++- .../openai/chat_completion/serving.py | 2 + vllm/entrypoints/openai/cli_args.py | 5 +- .../openai/completion/api_router.py | 9 ++- vllm/entrypoints/openai/completion/serving.py | 5 ++ vllm/entrypoints/openai/engine/protocol.py | 9 ++- .../openai/request_stats_headers.py | 68 +++++++++++++++++++ .../openai/responses/api_router.py | 8 ++- vllm/entrypoints/openai/responses/serving.py | 6 ++ 10 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 tests/entrypoints/openai/test_request_stats_headers.py create mode 100644 vllm/entrypoints/openai/request_stats_headers.py diff --git a/tests/entrypoints/openai/test_request_stats_headers.py b/tests/entrypoints/openai/test_request_stats_headers.py new file mode 100644 index 000000000000..1f9b43db6f95 --- /dev/null +++ b/tests/entrypoints/openai/test_request_stats_headers.py @@ -0,0 +1,68 @@ +import time +from vllm.entrypoints.openai.request_stats_headers import ( + build_request_stats_headers, +) +from vllm.entrypoints.openai.engine.protocol import UsageInfo +from vllm.v1.metrics.stats import RequestStateStats + + +def test_build_request_stats_headers_basic(): + """Headers are computed correctly from known timestamps.""" + now = time.time() + stats = RequestStateStats( + arrival_time=now - 1.0, + queued_ts=100.0, + scheduled_ts=100.05, + first_token_ts=100.15, + last_token_ts=100.45, + num_generation_tokens=10, + ) + usage = UsageInfo( + prompt_tokens=50, + completion_tokens=10, + total_tokens=60, + ) + headers = build_request_stats_headers( + metrics=stats, + usage=usage, + num_cached_tokens=5, + ) + + assert "x-total-time" in headers + assert "x-queue-time" in headers + assert "x-inference-time" in headers + assert "x-prefill-time" in headers + assert "x-decode-time" in headers + assert "x-prompt-tokens" in headers + assert "x-completion-tokens" in headers + assert "x-cached-tokens" in headers + + assert float(headers["x-queue-time"]) == round((100.05 - 100.0) * 1000, 2) + assert float(headers["x-prefill-time"]) == round((100.15 - 100.05) * 1000, 2) + assert float(headers["x-decode-time"]) == round((100.45 - 100.15) * 1000, 2) + assert float(headers["x-inference-time"]) == round((100.45 - 100.05) * 1000, 2) + + assert headers["x-prompt-tokens"] == "50" + assert headers["x-completion-tokens"] == "10" + assert headers["x-cached-tokens"] == "5" + + total_time = float(headers["x-total-time"]) + assert 900 < total_time < 1500 + + +def test_build_request_stats_headers_zero_timestamps(): + """When timestamps are 0 (not set), timing headers show 0.""" + stats = RequestStateStats( + arrival_time=time.time(), + queued_ts=0.0, + scheduled_ts=0.0, + first_token_ts=0.0, + last_token_ts=0.0, + ) + usage = UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15) + headers = build_request_stats_headers(metrics=stats, usage=usage, num_cached_tokens=0) + + assert headers["x-queue-time"] == "0.00" + assert headers["x-prefill-time"] == "0.00" + assert headers["x-decode-time"] == "0.00" + assert headers["x-inference-time"] == "0.00" diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py index cdaaa27fcdab..0b09a353ecc2 100644 --- a/vllm/entrypoints/openai/chat_completion/api_router.py +++ b/vllm/entrypoints/openai/chat_completion/api_router.py @@ -16,6 +16,9 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.request_stats_headers import ( + maybe_build_request_stats_headers, +) from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( load_aware_call, @@ -66,9 +69,13 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re ) elif isinstance(generator, ChatCompletionResponse): + headers = {**(metrics_header(metrics_header_format) or {})} + stats_headers = maybe_build_request_stats_headers(raw_request) + if stats_headers: + headers.update(stats_headers) return JSONResponse( content=generator.model_dump(), - headers=metrics_header(metrics_header_format), + headers=headers or None, ) return StreamingResponse(content=generator, media_type="text/event-stream") diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index a426836afd35..20b0a17138cc 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1605,6 +1605,8 @@ async def chat_completion_full_generator( ) request_metadata.final_usage_info = usage + request_metadata.request_stats = final_res.metrics + request_metadata.num_cached_tokens = final_res.num_cached_tokens or 0 response = ChatCompletionResponse( id=request_id, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 2bd991b0010e..1505a3f5c09d 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -153,9 +153,12 @@ class BaseFrontendArgs: """If set to True, log the stack trace of error responses""" tokens_only: bool = False """ - If set to True, only enable the Tokens In<>Out endpoint. + If set to True, only enable the Tokens In<>Out endpoint. This is intended for use in a Disaggregated Everything setup. """ + enable_request_stats_headers: bool = False + """If set to True, include per-request timing and compute stats as + x- response headers on non-streaming completion responses.""" @classmethod def _customize_cli_kwargs( diff --git a/vllm/entrypoints/openai/completion/api_router.py b/vllm/entrypoints/openai/completion/api_router.py index 4d8e0f885837..11fa47a620c2 100644 --- a/vllm/entrypoints/openai/completion/api_router.py +++ b/vllm/entrypoints/openai/completion/api_router.py @@ -14,6 +14,9 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.request_stats_headers import ( + maybe_build_request_stats_headers, +) from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( load_aware_call, @@ -58,9 +61,13 @@ async def create_completion(request: CompletionRequest, raw_request: Request): content=generator.model_dump(), status_code=generator.error.code ) elif isinstance(generator, CompletionResponse): + headers = {**(metrics_header(metrics_header_format) or {})} + stats_headers = maybe_build_request_stats_headers(raw_request) + if stats_headers: + headers.update(stats_headers) return JSONResponse( content=generator.model_dump(), - headers=metrics_header(metrics_header_format), + headers=headers or None, ) return StreamingResponse(content=generator, media_type="text/event-stream") diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index fb7f253c7ea3..3d2a197da3aa 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -554,6 +554,11 @@ def request_output_to_completion_response( ) request_metadata.final_usage_info = usage + if last_final_res is not None: + request_metadata.request_stats = last_final_res.metrics + request_metadata.num_cached_tokens = ( + last_final_res.num_cached_tokens or 0 + ) if final_res_batch: kv_transfer_params = final_res_batch[0].kv_transfer_params return CompletionResponse( diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index 8f6cdb3e6241..6b4791f4e539 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -5,7 +5,7 @@ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import time from http import HTTPStatus -from typing import Any, ClassVar, Literal, TypeAlias +from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeAlias import regex as re from pydantic import ( @@ -20,6 +20,9 @@ from vllm.utils import random_uuid from vllm.utils.import_utils import resolve_obj_by_qualname +if TYPE_CHECKING: + from vllm.v1.metrics.stats import RequestStateStats + logger = init_logger(__name__) @@ -109,8 +112,12 @@ class UsageInfo(OpenAIBaseModel): class RequestResponseMetadata(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + request_id: str final_usage_info: UsageInfo | None = None + request_stats: "RequestStateStats | None" = None + num_cached_tokens: int = 0 class JsonSchemaResponseFormat(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/request_stats_headers.py b/vllm/entrypoints/openai/request_stats_headers.py new file mode 100644 index 000000000000..956470f8f042 --- /dev/null +++ b/vllm/entrypoints/openai/request_stats_headers.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import time +from typing import TYPE_CHECKING + +from vllm.entrypoints.openai.engine.protocol import UsageInfo + +if TYPE_CHECKING: + from fastapi import Request + + from vllm.v1.metrics.stats import RequestStateStats + + +def build_request_stats_headers( + metrics: RequestStateStats, + usage: UsageInfo, + num_cached_tokens: int, +) -> dict[str, str]: + """Build HTTP response headers with per-request timing and compute stats. + + Times are in milliseconds, rounded to 2 decimal places. + """ + total_time_ms = round((time.time() - metrics.arrival_time) * 1000, 2) + queue_time_ms = round((metrics.scheduled_ts - metrics.queued_ts) * 1000, 2) + prefill_time_ms = round( + (metrics.first_token_ts - metrics.scheduled_ts) * 1000, 2 + ) + decode_time_ms = round( + (metrics.last_token_ts - metrics.first_token_ts) * 1000, 2 + ) + inference_time_ms = round( + (metrics.last_token_ts - metrics.scheduled_ts) * 1000, 2 + ) + + return { + "x-total-time": f"{total_time_ms:.2f}", + "x-queue-time": f"{queue_time_ms:.2f}", + "x-inference-time": f"{inference_time_ms:.2f}", + "x-prefill-time": f"{prefill_time_ms:.2f}", + "x-decode-time": f"{decode_time_ms:.2f}", + "x-prompt-tokens": str(usage.prompt_tokens), + "x-completion-tokens": str(usage.completion_tokens or 0), + "x-cached-tokens": str(num_cached_tokens), + } + + +def maybe_build_request_stats_headers( + raw_request: Request, +) -> dict[str, str] | None: + """Build stats headers if enabled and stats are available. + + Returns None if the feature is disabled or stats are not available. + """ + if not getattr( + raw_request.app.state.args, "enable_request_stats_headers", False + ): + return None + metadata = getattr(raw_request.state, "request_metadata", None) + if metadata is None or metadata.request_stats is None: + return None + return build_request_stats_headers( + metrics=metadata.request_stats, + usage=metadata.final_usage_info, + num_cached_tokens=metadata.num_cached_tokens, + ) diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py index 88d821260940..6fd72e0dcf1d 100644 --- a/vllm/entrypoints/openai/responses/api_router.py +++ b/vllm/entrypoints/openai/responses/api_router.py @@ -14,6 +14,9 @@ ResponsesResponse, StreamingResponsesResponse, ) +from vllm.entrypoints.openai.request_stats_headers import ( + maybe_build_request_stats_headers, +) from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( @@ -68,7 +71,10 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): content=generator.model_dump(), status_code=generator.error.code ) elif isinstance(generator, ResponsesResponse): - return JSONResponse(content=generator.model_dump()) + headers = maybe_build_request_stats_headers(raw_request) + return JSONResponse( + content=generator.model_dump(), headers=headers + ) return StreamingResponse( content=_convert_stream_to_sse_events(generator), media_type="text/event-stream" diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index df94848e3b9b..8e402bdb7059 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -861,6 +861,12 @@ async def responses_full_generator( ], ), ) + # Populate request stats for timing headers (SimpleContext only) + last_output = getattr(context, "last_output", None) + if last_output is not None and hasattr(last_output, "metrics"): + request_metadata.request_stats = last_output.metrics + request_metadata.num_cached_tokens = num_cached_tokens + response = ResponsesResponse.from_request( request, sampling_params, From 266dc162e30732a54bd501590de804d87ea5c3e5 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Sun, 29 Mar 2026 13:17:26 -0700 Subject: [PATCH 2/5] Working POC for headers Signed-off-by: Vinay Damodaran --- .../openai/test_request_stats_headers.py | 25 +++++++++++++++++++ vllm/entrypoints/openai/engine/protocol.py | 8 +++--- .../openai/request_stats_headers.py | 20 +++++++++------ 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/tests/entrypoints/openai/test_request_stats_headers.py b/tests/entrypoints/openai/test_request_stats_headers.py index 1f9b43db6f95..bb7db21f46a4 100644 --- a/tests/entrypoints/openai/test_request_stats_headers.py +++ b/tests/entrypoints/openai/test_request_stats_headers.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import time from vllm.entrypoints.openai.request_stats_headers import ( build_request_stats_headers, @@ -66,3 +69,25 @@ def test_build_request_stats_headers_zero_timestamps(): assert headers["x-prefill-time"] == "0.00" assert headers["x-decode-time"] == "0.00" assert headers["x-inference-time"] == "0.00" + + +def test_build_request_stats_headers_partial_timestamps(): + """When scheduled but cancelled before tokens, timing values clamp to 0.""" + stats = RequestStateStats( + arrival_time=time.time() - 0.5, + queued_ts=100.0, + scheduled_ts=100.05, + first_token_ts=0.0, # no tokens generated + last_token_ts=0.0, + ) + usage = UsageInfo(prompt_tokens=20, completion_tokens=0, total_tokens=20) + headers = build_request_stats_headers( + metrics=stats, usage=usage, num_cached_tokens=0 + ) + + # These would be negative without clamping + assert float(headers["x-prefill-time"]) == 0.0 + assert float(headers["x-decode-time"]) == 0.0 + assert float(headers["x-inference-time"]) == 0.0 + # Queue time should still be valid + assert float(headers["x-queue-time"]) == 50.0 diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index 6b4791f4e539..e4a69688278b 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -5,7 +5,7 @@ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import time from http import HTTPStatus -from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeAlias +from typing import Any, ClassVar, Literal, TypeAlias import regex as re from pydantic import ( @@ -19,9 +19,7 @@ from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.utils.import_utils import resolve_obj_by_qualname - -if TYPE_CHECKING: - from vllm.v1.metrics.stats import RequestStateStats +from vllm.v1.metrics.stats import RequestStateStats logger = init_logger(__name__) @@ -116,7 +114,7 @@ class RequestResponseMetadata(BaseModel): request_id: str final_usage_info: UsageInfo | None = None - request_stats: "RequestStateStats | None" = None + request_stats: RequestStateStats | None = None num_cached_tokens: int = 0 diff --git a/vllm/entrypoints/openai/request_stats_headers.py b/vllm/entrypoints/openai/request_stats_headers.py index 956470f8f042..8ca56a2026ae 100644 --- a/vllm/entrypoints/openai/request_stats_headers.py +++ b/vllm/entrypoints/openai/request_stats_headers.py @@ -23,16 +23,20 @@ def build_request_stats_headers( Times are in milliseconds, rounded to 2 decimal places. """ - total_time_ms = round((time.time() - metrics.arrival_time) * 1000, 2) - queue_time_ms = round((metrics.scheduled_ts - metrics.queued_ts) * 1000, 2) - prefill_time_ms = round( - (metrics.first_token_ts - metrics.scheduled_ts) * 1000, 2 + total_time_ms = max( + round((time.time() - metrics.arrival_time) * 1000, 2), 0 ) - decode_time_ms = round( - (metrics.last_token_ts - metrics.first_token_ts) * 1000, 2 + queue_time_ms = max( + round((metrics.scheduled_ts - metrics.queued_ts) * 1000, 2), 0 ) - inference_time_ms = round( - (metrics.last_token_ts - metrics.scheduled_ts) * 1000, 2 + prefill_time_ms = max( + round((metrics.first_token_ts - metrics.scheduled_ts) * 1000, 2), 0 + ) + decode_time_ms = max( + round((metrics.last_token_ts - metrics.first_token_ts) * 1000, 2), 0 + ) + inference_time_ms = max( + round((metrics.last_token_ts - metrics.scheduled_ts) * 1000, 2), 0 ) return { From ef92ec4b795cbfe5bb622174c0b90b64768d4eb2 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Mon, 30 Mar 2026 14:44:12 -0700 Subject: [PATCH 3/5] Changed to middleware approach Signed-off-by: Vinay Damodaran --- .../openai/test_request_stats_headers.py | 229 +++++++++++++++--- vllm/entrypoints/openai/api_server.py | 26 +- .../openai/chat_completion/api_router.py | 9 +- .../openai/completion/api_router.py | 9 +- .../openai/request_stats_headers.py | 56 ++--- .../openai/responses/api_router.py | 8 +- vllm/entrypoints/openai/responses/serving.py | 7 +- 7 files changed, 247 insertions(+), 97 deletions(-) diff --git a/tests/entrypoints/openai/test_request_stats_headers.py b/tests/entrypoints/openai/test_request_stats_headers.py index bb7db21f46a4..cf3ea40b3440 100644 --- a/tests/entrypoints/openai/test_request_stats_headers.py +++ b/tests/entrypoints/openai/test_request_stats_headers.py @@ -1,11 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + import time + +import pytest +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse +from httpx import ASGITransport, AsyncClient + +from vllm.entrypoints.openai.engine.protocol import ( + RequestResponseMetadata, + UsageInfo, +) from vllm.entrypoints.openai.request_stats_headers import ( build_request_stats_headers, ) -from vllm.entrypoints.openai.engine.protocol import UsageInfo from vllm.v1.metrics.stats import RequestStateStats @@ -31,25 +42,25 @@ def test_build_request_stats_headers_basic(): num_cached_tokens=5, ) - assert "x-total-time" in headers - assert "x-queue-time" in headers - assert "x-inference-time" in headers - assert "x-prefill-time" in headers - assert "x-decode-time" in headers - assert "x-prompt-tokens" in headers - assert "x-completion-tokens" in headers - assert "x-cached-tokens" in headers - - assert float(headers["x-queue-time"]) == round((100.05 - 100.0) * 1000, 2) - assert float(headers["x-prefill-time"]) == round((100.15 - 100.05) * 1000, 2) - assert float(headers["x-decode-time"]) == round((100.45 - 100.15) * 1000, 2) - assert float(headers["x-inference-time"]) == round((100.45 - 100.05) * 1000, 2) - - assert headers["x-prompt-tokens"] == "50" - assert headers["x-completion-tokens"] == "10" - assert headers["x-cached-tokens"] == "5" - - total_time = float(headers["x-total-time"]) + # All headers use x-vllm- prefix + for key in headers: + assert key.startswith("x-vllm-"), f"Header {key} missing x-vllm- prefix" + + assert float(headers["x-vllm-queue-time"]) == round((100.05 - 100.0) * 1000, 2) + assert float(headers["x-vllm-prefill-time"]) == round((100.15 - 100.05) * 1000, 2) + assert float(headers["x-vllm-decode-time"]) == round((100.45 - 100.15) * 1000, 2) + assert float(headers["x-vllm-inference-time"]) == round((100.45 - 100.05) * 1000, 2) + + assert headers["x-vllm-prompt-tokens"] == "50" + assert headers["x-vllm-completion-tokens"] == "10" + assert headers["x-vllm-cached-tokens"] == "5" + + # tokens-per-second: 10 tokens / 0.3s decode = 33.33 + decode_time_s = 100.45 - 100.15 + expected_tps = round(10 / decode_time_s, 2) + assert float(headers["x-vllm-tokens-per-second"]) == expected_tps + + total_time = float(headers["x-vllm-total-time"]) assert 900 < total_time < 1500 @@ -63,12 +74,15 @@ def test_build_request_stats_headers_zero_timestamps(): last_token_ts=0.0, ) usage = UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15) - headers = build_request_stats_headers(metrics=stats, usage=usage, num_cached_tokens=0) + headers = build_request_stats_headers( + metrics=stats, usage=usage, num_cached_tokens=0 + ) - assert headers["x-queue-time"] == "0.00" - assert headers["x-prefill-time"] == "0.00" - assert headers["x-decode-time"] == "0.00" - assert headers["x-inference-time"] == "0.00" + assert headers["x-vllm-queue-time"] == "0.00" + assert headers["x-vllm-prefill-time"] == "0.00" + assert headers["x-vllm-decode-time"] == "0.00" + assert headers["x-vllm-inference-time"] == "0.00" + assert headers["x-vllm-tokens-per-second"] == "0.00" def test_build_request_stats_headers_partial_timestamps(): @@ -77,7 +91,7 @@ def test_build_request_stats_headers_partial_timestamps(): arrival_time=time.time() - 0.5, queued_ts=100.0, scheduled_ts=100.05, - first_token_ts=0.0, # no tokens generated + first_token_ts=0.0, last_token_ts=0.0, ) usage = UsageInfo(prompt_tokens=20, completion_tokens=0, total_tokens=20) @@ -85,9 +99,160 @@ def test_build_request_stats_headers_partial_timestamps(): metrics=stats, usage=usage, num_cached_tokens=0 ) - # These would be negative without clamping - assert float(headers["x-prefill-time"]) == 0.0 - assert float(headers["x-decode-time"]) == 0.0 - assert float(headers["x-inference-time"]) == 0.0 - # Queue time should still be valid - assert float(headers["x-queue-time"]) == 50.0 + assert float(headers["x-vllm-prefill-time"]) == 0.0 + assert float(headers["x-vllm-decode-time"]) == 0.0 + assert float(headers["x-vllm-inference-time"]) == 0.0 + assert float(headers["x-vllm-tokens-per-second"]) == 0.0 + assert float(headers["x-vllm-queue-time"]) == 50.0 + + +def test_build_request_stats_headers_zero_decode_time_with_tokens(): + """Division by zero guard: tokens exist but decode time is 0.""" + stats = RequestStateStats( + arrival_time=time.time() - 0.1, + queued_ts=100.0, + scheduled_ts=100.05, + first_token_ts=100.10, + last_token_ts=100.10, # same as first_token_ts + ) + usage = UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15) + headers = build_request_stats_headers( + metrics=stats, usage=usage, num_cached_tokens=0 + ) + + assert headers["x-vllm-decode-time"] == "0.00" + assert headers["x-vllm-tokens-per-second"] == "0.00" + + +def _create_test_app(enable_headers: bool) -> FastAPI: + """Create a minimal FastAPI app with the stats middleware.""" + from vllm.entrypoints.openai.request_stats_headers import ( + build_request_stats_headers, + ) + + app = FastAPI() + + class Args: + enable_request_stats_headers = enable_headers + + app.state.args = Args() + + @app.middleware("http") + async def request_stats_headers_middleware(request: Request, call_next): + response = await call_next(request) + if not getattr( + request.app.state.args, + "enable_request_stats_headers", + False, + ): + return response + metadata = getattr(request.state, "request_metadata", None) + if ( + metadata is None + or metadata.request_stats is None + or metadata.final_usage_info is None + ): + return response + headers = build_request_stats_headers( + metrics=metadata.request_stats, + usage=metadata.final_usage_info, + num_cached_tokens=metadata.num_cached_tokens, + ) + for key, value in headers.items(): + response.headers[key] = value + return response + + @app.get("/test-with-stats") + async def test_with_stats(request: Request): + metadata = RequestResponseMetadata(request_id="test-123") + metadata.final_usage_info = UsageInfo( + prompt_tokens=50, completion_tokens=10, total_tokens=60 + ) + metadata.request_stats = RequestStateStats( + arrival_time=time.time() - 1.0, + queued_ts=100.0, + scheduled_ts=100.05, + first_token_ts=100.15, + last_token_ts=100.45, + num_generation_tokens=10, + ) + metadata.num_cached_tokens = 5 + request.state.request_metadata = metadata + return JSONResponse(content={"ok": True}) + + @app.get("/test-no-stats") + async def test_no_stats(request: Request): + return JSONResponse(content={"ok": True}) + + @app.get("/test-partial-stats") + async def test_partial_stats(request: Request): + metadata = RequestResponseMetadata(request_id="test-456") + metadata.request_stats = RequestStateStats( + arrival_time=time.time(), + queued_ts=100.0, + scheduled_ts=100.05, + ) + # final_usage_info is None + request.state.request_metadata = metadata + return JSONResponse(content={"ok": True}) + + return app + + +@pytest.mark.asyncio +async def test_middleware_flag_disabled(): + """No headers when flag is disabled.""" + app = _create_test_app(enable_headers=False) + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + resp = await client.get("/test-with-stats") + assert resp.status_code == 200 + assert "x-vllm-total-time" not in resp.headers + + +@pytest.mark.asyncio +async def test_middleware_no_metadata(): + """No headers when request_metadata is not set.""" + app = _create_test_app(enable_headers=True) + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + resp = await client.get("/test-no-stats") + assert resp.status_code == 200 + assert "x-vllm-total-time" not in resp.headers + + +@pytest.mark.asyncio +async def test_middleware_missing_usage(): + """No headers when final_usage_info is None.""" + app = _create_test_app(enable_headers=True) + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + resp = await client.get("/test-partial-stats") + assert resp.status_code == 200 + assert "x-vllm-total-time" not in resp.headers + + +@pytest.mark.asyncio +async def test_middleware_full_stats(): + """All headers present when flag enabled and stats available.""" + app = _create_test_app(enable_headers=True) + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + resp = await client.get("/test-with-stats") + assert resp.status_code == 200 + assert "x-vllm-total-time" in resp.headers + assert "x-vllm-queue-time" in resp.headers + assert "x-vllm-inference-time" in resp.headers + assert "x-vllm-prefill-time" in resp.headers + assert "x-vllm-decode-time" in resp.headers + assert "x-vllm-prompt-tokens" in resp.headers + assert "x-vllm-completion-tokens" in resp.headers + assert "x-vllm-cached-tokens" in resp.headers + assert "x-vllm-tokens-per-second" in resp.headers + assert resp.headers["x-vllm-prompt-tokens"] == "50" + assert resp.headers["x-vllm-completion-tokens"] == "10" + assert resp.headers["x-vllm-cached-tokens"] == "5" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 95e831b51ec0..deb097c084a7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -16,7 +16,7 @@ from typing import Any import uvloop -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from starlette.datastructures import State @@ -32,6 +32,9 @@ from vllm.entrypoints.openai.engine.protocol import GenerationError from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.openai.request_stats_headers import ( + build_request_stats_headers, +) from vllm.entrypoints.openai.server_utils import ( engine_error_handler, exception_handler, @@ -273,6 +276,27 @@ def build_app( app.add_middleware(XRequestIdMiddleware) + if args.enable_request_stats_headers: + + @app.middleware("http") + async def request_stats_headers_middleware(request: Request, call_next): + response = await call_next(request) + metadata = getattr(request.state, "request_metadata", None) + if ( + metadata is None + or metadata.request_stats is None + or metadata.final_usage_info is None + ): + return response + headers = build_request_stats_headers( + metrics=metadata.request_stats, + usage=metadata.final_usage_info, + num_cached_tokens=metadata.num_cached_tokens, + ) + for key, value in headers.items(): + response.headers[key] = value + return response + # Add scaling middleware to check for scaling state app.add_middleware(ScalingMiddleware) diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py index 0b09a353ecc2..cdaaa27fcdab 100644 --- a/vllm/entrypoints/openai/chat_completion/api_router.py +++ b/vllm/entrypoints/openai/chat_completion/api_router.py @@ -16,9 +16,6 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.orca_metrics import metrics_header -from vllm.entrypoints.openai.request_stats_headers import ( - maybe_build_request_stats_headers, -) from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( load_aware_call, @@ -69,13 +66,9 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re ) elif isinstance(generator, ChatCompletionResponse): - headers = {**(metrics_header(metrics_header_format) or {})} - stats_headers = maybe_build_request_stats_headers(raw_request) - if stats_headers: - headers.update(stats_headers) return JSONResponse( content=generator.model_dump(), - headers=headers or None, + headers=metrics_header(metrics_header_format), ) return StreamingResponse(content=generator, media_type="text/event-stream") diff --git a/vllm/entrypoints/openai/completion/api_router.py b/vllm/entrypoints/openai/completion/api_router.py index 11fa47a620c2..4d8e0f885837 100644 --- a/vllm/entrypoints/openai/completion/api_router.py +++ b/vllm/entrypoints/openai/completion/api_router.py @@ -14,9 +14,6 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.orca_metrics import metrics_header -from vllm.entrypoints.openai.request_stats_headers import ( - maybe_build_request_stats_headers, -) from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( load_aware_call, @@ -61,13 +58,9 @@ async def create_completion(request: CompletionRequest, raw_request: Request): content=generator.model_dump(), status_code=generator.error.code ) elif isinstance(generator, CompletionResponse): - headers = {**(metrics_header(metrics_header_format) or {})} - stats_headers = maybe_build_request_stats_headers(raw_request) - if stats_headers: - headers.update(stats_headers) return JSONResponse( content=generator.model_dump(), - headers=headers or None, + headers=metrics_header(metrics_header_format), ) return StreamingResponse(content=generator, media_type="text/event-stream") diff --git a/vllm/entrypoints/openai/request_stats_headers.py b/vllm/entrypoints/openai/request_stats_headers.py index 8ca56a2026ae..9c640ec05563 100644 --- a/vllm/entrypoints/openai/request_stats_headers.py +++ b/vllm/entrypoints/openai/request_stats_headers.py @@ -9,8 +9,6 @@ from vllm.entrypoints.openai.engine.protocol import UsageInfo if TYPE_CHECKING: - from fastapi import Request - from vllm.v1.metrics.stats import RequestStateStats @@ -22,13 +20,10 @@ def build_request_stats_headers( """Build HTTP response headers with per-request timing and compute stats. Times are in milliseconds, rounded to 2 decimal places. + Tokens-per-second is decode throughput (completion_tokens / decode_time). """ - total_time_ms = max( - round((time.time() - metrics.arrival_time) * 1000, 2), 0 - ) - queue_time_ms = max( - round((metrics.scheduled_ts - metrics.queued_ts) * 1000, 2), 0 - ) + total_time_ms = max(round((time.time() - metrics.arrival_time) * 1000, 2), 0) + queue_time_ms = max(round((metrics.scheduled_ts - metrics.queued_ts) * 1000, 2), 0) prefill_time_ms = max( round((metrics.first_token_ts - metrics.scheduled_ts) * 1000, 2), 0 ) @@ -39,34 +34,21 @@ def build_request_stats_headers( round((metrics.last_token_ts - metrics.scheduled_ts) * 1000, 2), 0 ) + decode_time_s = decode_time_ms / 1000.0 + completion_tokens = usage.completion_tokens or 0 + if decode_time_s > 0 and completion_tokens > 0: + tokens_per_second = round(completion_tokens / decode_time_s, 2) + else: + tokens_per_second = 0.0 + return { - "x-total-time": f"{total_time_ms:.2f}", - "x-queue-time": f"{queue_time_ms:.2f}", - "x-inference-time": f"{inference_time_ms:.2f}", - "x-prefill-time": f"{prefill_time_ms:.2f}", - "x-decode-time": f"{decode_time_ms:.2f}", - "x-prompt-tokens": str(usage.prompt_tokens), - "x-completion-tokens": str(usage.completion_tokens or 0), - "x-cached-tokens": str(num_cached_tokens), + "x-vllm-total-time": f"{total_time_ms:.2f}", + "x-vllm-queue-time": f"{queue_time_ms:.2f}", + "x-vllm-inference-time": f"{inference_time_ms:.2f}", + "x-vllm-prefill-time": f"{prefill_time_ms:.2f}", + "x-vllm-decode-time": f"{decode_time_ms:.2f}", + "x-vllm-prompt-tokens": str(usage.prompt_tokens), + "x-vllm-completion-tokens": str(completion_tokens), + "x-vllm-cached-tokens": str(num_cached_tokens), + "x-vllm-tokens-per-second": f"{tokens_per_second:.2f}", } - - -def maybe_build_request_stats_headers( - raw_request: Request, -) -> dict[str, str] | None: - """Build stats headers if enabled and stats are available. - - Returns None if the feature is disabled or stats are not available. - """ - if not getattr( - raw_request.app.state.args, "enable_request_stats_headers", False - ): - return None - metadata = getattr(raw_request.state, "request_metadata", None) - if metadata is None or metadata.request_stats is None: - return None - return build_request_stats_headers( - metrics=metadata.request_stats, - usage=metadata.final_usage_info, - num_cached_tokens=metadata.num_cached_tokens, - ) diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py index 6fd72e0dcf1d..88d821260940 100644 --- a/vllm/entrypoints/openai/responses/api_router.py +++ b/vllm/entrypoints/openai/responses/api_router.py @@ -14,9 +14,6 @@ ResponsesResponse, StreamingResponsesResponse, ) -from vllm.entrypoints.openai.request_stats_headers import ( - maybe_build_request_stats_headers, -) from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( @@ -71,10 +68,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): content=generator.model_dump(), status_code=generator.error.code ) elif isinstance(generator, ResponsesResponse): - headers = maybe_build_request_stats_headers(raw_request) - return JSONResponse( - content=generator.model_dump(), headers=headers - ) + return JSONResponse(content=generator.model_dump()) return StreamingResponse( content=_convert_stream_to_sse_events(generator), media_type="text/event-stream" diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 8e402bdb7059..0505cd01c010 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -861,10 +861,9 @@ async def responses_full_generator( ], ), ) - # Populate request stats for timing headers (SimpleContext only) - last_output = getattr(context, "last_output", None) - if last_output is not None and hasattr(last_output, "metrics"): - request_metadata.request_stats = last_output.metrics + # Populate request stats for timing headers + if context.last_output is not None and context.last_output.metrics is not None: + request_metadata.request_stats = context.last_output.metrics request_metadata.num_cached_tokens = num_cached_tokens response = ResponsesResponse.from_request( From 98e05bd4b200dd901f74f3c5d1d1648f0d0812be Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Mon, 30 Mar 2026 15:09:32 -0700 Subject: [PATCH 4/5] add usage info mapping for responses Signed-off-by: Vinay Damodaran --- vllm/entrypoints/openai/responses/serving.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 0505cd01c010..d71bc875b04d 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -54,6 +54,7 @@ DeltaMessage, ErrorResponse, RequestResponseMetadata, + UsageInfo, ) from vllm.entrypoints.openai.engine.serving import ( GenerationError, @@ -865,6 +866,11 @@ async def responses_full_generator( if context.last_output is not None and context.last_output.metrics is not None: request_metadata.request_stats = context.last_output.metrics request_metadata.num_cached_tokens = num_cached_tokens + request_metadata.final_usage_info = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + num_generated_tokens, + ) response = ResponsesResponse.from_request( request, From 6bceab52bf6a9b4043590f225d5e04c804ecf984 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Mon, 30 Mar 2026 15:40:19 -0700 Subject: [PATCH 5/5] clean up and address codex comments Signed-off-by: Vinay Damodaran --- .../openai/test_request_stats_headers.py | 63 +++++++++---------- vllm/entrypoints/openai/api_server.py | 24 +------ vllm/entrypoints/openai/completion/serving.py | 7 ++- .../openai/request_stats_headers.py | 34 +++++++++- vllm/entrypoints/openai/responses/serving.py | 5 +- 5 files changed, 73 insertions(+), 60 deletions(-) diff --git a/tests/entrypoints/openai/test_request_stats_headers.py b/tests/entrypoints/openai/test_request_stats_headers.py index cf3ea40b3440..84ee5a1b3902 100644 --- a/tests/entrypoints/openai/test_request_stats_headers.py +++ b/tests/entrypoints/openai/test_request_stats_headers.py @@ -16,6 +16,7 @@ ) from vllm.entrypoints.openai.request_stats_headers import ( build_request_stats_headers, + request_stats_headers_middleware, ) from vllm.v1.metrics.stats import RequestStateStats @@ -125,42 +126,12 @@ def test_build_request_stats_headers_zero_decode_time_with_tokens(): def _create_test_app(enable_headers: bool) -> FastAPI: - """Create a minimal FastAPI app with the stats middleware.""" - from vllm.entrypoints.openai.request_stats_headers import ( - build_request_stats_headers, - ) - + """Create a minimal FastAPI app with the production middleware.""" app = FastAPI() - class Args: - enable_request_stats_headers = enable_headers - - app.state.args = Args() - - @app.middleware("http") - async def request_stats_headers_middleware(request: Request, call_next): - response = await call_next(request) - if not getattr( - request.app.state.args, - "enable_request_stats_headers", - False, - ): - return response - metadata = getattr(request.state, "request_metadata", None) - if ( - metadata is None - or metadata.request_stats is None - or metadata.final_usage_info is None - ): - return response - headers = build_request_stats_headers( - metrics=metadata.request_stats, - usage=metadata.final_usage_info, - num_cached_tokens=metadata.num_cached_tokens, - ) - for key, value in headers.items(): - response.headers[key] = value - return response + # Mirror production: only register middleware when flag is enabled + if enable_headers: + app.middleware("http")(request_stats_headers_middleware) @app.get("/test-with-stats") async def test_with_stats(request: Request): @@ -196,6 +167,16 @@ async def test_partial_stats(request: Request): request.state.request_metadata = metadata return JSONResponse(content={"ok": True}) + @app.get("/test-missing-request-stats") + async def test_missing_request_stats(request: Request): + metadata = RequestResponseMetadata(request_id="test-789") + metadata.final_usage_info = UsageInfo( + prompt_tokens=10, completion_tokens=5, total_tokens=15 + ) + # request_stats is None + request.state.request_metadata = metadata + return JSONResponse(content={"ok": True}) + return app @@ -256,3 +237,17 @@ async def test_middleware_full_stats(): assert resp.headers["x-vllm-prompt-tokens"] == "50" assert resp.headers["x-vllm-completion-tokens"] == "10" assert resp.headers["x-vllm-cached-tokens"] == "5" + # 10 tokens / 0.3s decode = 33.33 tok/s + assert float(resp.headers["x-vllm-tokens-per-second"]) == round(10 / 0.3, 2) + + +@pytest.mark.asyncio +async def test_middleware_missing_request_stats(): + """No headers when request_stats is None but usage is present.""" + app = _create_test_app(enable_headers=True) + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + resp = await client.get("/test-missing-request-stats") + assert resp.status_code == 200 + assert "x-vllm-total-time" not in resp.headers diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index deb097c084a7..6a00b705f2c3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -16,7 +16,7 @@ from typing import Any import uvloop -from fastapi import FastAPI, HTTPException, Request +from fastapi import FastAPI, HTTPException from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from starlette.datastructures import State @@ -33,7 +33,7 @@ from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.request_stats_headers import ( - build_request_stats_headers, + request_stats_headers_middleware, ) from vllm.entrypoints.openai.server_utils import ( engine_error_handler, @@ -277,25 +277,7 @@ def build_app( app.add_middleware(XRequestIdMiddleware) if args.enable_request_stats_headers: - - @app.middleware("http") - async def request_stats_headers_middleware(request: Request, call_next): - response = await call_next(request) - metadata = getattr(request.state, "request_metadata", None) - if ( - metadata is None - or metadata.request_stats is None - or metadata.final_usage_info is None - ): - return response - headers = build_request_stats_headers( - metrics=metadata.request_stats, - usage=metadata.final_usage_info, - num_cached_tokens=metadata.num_cached_tokens, - ) - for key, value in headers.items(): - response.headers[key] = value - return response + app.middleware("http")(request_stats_headers_middleware) # Add scaling middleware to check for scaling state app.add_middleware(ScalingMiddleware) diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index 3d2a197da3aa..ce5a5cfa5cf0 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -555,10 +555,11 @@ def request_output_to_completion_response( request_metadata.final_usage_info = usage if last_final_res is not None: + # Known limitation: for multi-prompt batch requests, timing + # headers reflect only the last prompt's metrics, not the + # aggregate. Token counts in usage are correctly summed. request_metadata.request_stats = last_final_res.metrics - request_metadata.num_cached_tokens = ( - last_final_res.num_cached_tokens or 0 - ) + request_metadata.num_cached_tokens = last_final_res.num_cached_tokens or 0 if final_res_batch: kv_transfer_params = final_res_batch[0].kv_transfer_params return CompletionResponse( diff --git a/vllm/entrypoints/openai/request_stats_headers.py b/vllm/entrypoints/openai/request_stats_headers.py index 9c640ec05563..24dadcc73603 100644 --- a/vllm/entrypoints/openai/request_stats_headers.py +++ b/vllm/entrypoints/openai/request_stats_headers.py @@ -6,9 +6,14 @@ import time from typing import TYPE_CHECKING +from starlette.requests import Request +from starlette.responses import Response + from vllm.entrypoints.openai.engine.protocol import UsageInfo if TYPE_CHECKING: + from collections.abc import Callable + from vllm.v1.metrics.stats import RequestStateStats @@ -47,8 +52,35 @@ def build_request_stats_headers( "x-vllm-inference-time": f"{inference_time_ms:.2f}", "x-vllm-prefill-time": f"{prefill_time_ms:.2f}", "x-vllm-decode-time": f"{decode_time_ms:.2f}", - "x-vllm-prompt-tokens": str(usage.prompt_tokens), + "x-vllm-prompt-tokens": str(usage.prompt_tokens or 0), "x-vllm-completion-tokens": str(completion_tokens), "x-vllm-cached-tokens": str(num_cached_tokens), "x-vllm-tokens-per-second": f"{tokens_per_second:.2f}", } + + +async def request_stats_headers_middleware( + request: Request, + call_next: Callable, +) -> Response: + """Middleware that injects x-vllm-* timing headers into responses. + + Reads request_metadata from request.state (populated by serving layers). + Returns response unchanged if metadata or stats are missing. + """ + response = await call_next(request) + metadata = getattr(request.state, "request_metadata", None) + if ( + metadata is None + or metadata.request_stats is None + or metadata.final_usage_info is None + ): + return response + headers = build_request_stats_headers( + metrics=metadata.request_stats, + usage=metadata.final_usage_info, + num_cached_tokens=metadata.num_cached_tokens, + ) + for key, value in headers.items(): + response.headers[key] = value + return response diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index d71bc875b04d..3523925769ac 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -862,7 +862,10 @@ async def responses_full_generator( ], ), ) - # Populate request stats for timing headers + # Populate request stats for timing headers. + # Known limitation: for multi-turn tool-calling flows, + # timing breakdown (queue/prefill/decode) reflects only the + # final turn. Total wall-clock time is still correct. if context.last_output is not None and context.last_output.metrics is not None: request_metadata.request_stats = context.last_output.metrics request_metadata.num_cached_tokens = num_cached_tokens