From e725b27b4293ca964fe1987d0f890e214d1f0771 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Sat, 9 May 2026 18:20:16 -0700 Subject: [PATCH 1/3] [Frontend] Add x-vllm-* response headers for per-request stats Adds an opt-in --enable-request-stats-headers flag that attaches per-request timing and token-count headers to non-streaming OpenAI responses (chat, completion, responses). The timing intervals (queue, prefill, decode, inference, e2e) and mean time per output token are computed in exactly one place: IterationStats.update_from_finished_request, which now returns the FinishedRequestStats it builds. Both the Prometheus path and the new headers middleware consume the same object - the middleware recomputes no intervals; it only converts seconds to milliseconds and formats the values. Headers added (all opt-in via --enable-request-stats-headers; times are in milliseconds): x-vllm-total-time, x-vllm-queue-time, x-vllm-prefill-time, x-vllm-decode-time, x-vllm-inference-time, x-vllm-prompt-tokens, x-vllm-completion-tokens, x-vllm-cached-tokens, x-vllm-time-per-output-token Streaming responses are unchanged. Error responses are unchanged.
Co-Authored-By: Claude Signed-off-by: Vinay Damodaran --- .../openai/test_request_stats_headers.py | 115 ++++++++++++++++++ tests/v1/metrics/test_stats.py | 27 ++++ vllm/entrypoints/openai/api_server.py | 6 + .../openai/chat_completion/serving.py | 1 + vllm/entrypoints/openai/cli_args.py | 5 + vllm/entrypoints/openai/completion/serving.py | 5 + vllm/entrypoints/openai/engine/protocol.py | 15 +++ .../openai/request_stats_headers.py | 54 ++++++++ vllm/entrypoints/openai/responses/serving.py | 7 ++ vllm/outputs.py | 7 +- vllm/v1/engine/output_processor.py | 13 +- vllm/v1/metrics/stats.py | 4 +- 12 files changed, 256 insertions(+), 3 deletions(-) create mode 100644 tests/entrypoints/openai/test_request_stats_headers.py create mode 100644 vllm/entrypoints/openai/request_stats_headers.py diff --git a/tests/entrypoints/openai/test_request_stats_headers.py b/tests/entrypoints/openai/test_request_stats_headers.py new file mode 100644 index 000000000000..b7ca0769b42d --- /dev/null +++ b/tests/entrypoints/openai/test_request_stats_headers.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import pytest +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse +from httpx import ASGITransport, AsyncClient + +from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata +from vllm.entrypoints.openai.request_stats_headers import ( + build_request_stats_headers, + request_stats_headers_middleware, +) +from vllm.v1.engine import FinishReason +from vllm.v1.metrics.stats import FinishedRequestStats + + +def _stats(**overrides) -> FinishedRequestStats: + base = dict( + finish_reason=FinishReason.STOP, + request_id="req-1", + e2e_latency=1.0, + num_prompt_tokens=50, + num_generation_tokens=10, + max_tokens_param=None, + queued_time=0.05, + prefill_time=0.10, + inference_time=0.40, + decode_time=0.30, + 
mean_time_per_output_token=0.030, + is_corrupted=False, + num_cached_tokens=5, + ) + base.update(overrides) + return FinishedRequestStats(**base) + + +def test_build_headers_basic(): + headers = build_request_stats_headers(_stats()) + + for key in headers: + assert key.startswith("x-vllm-"), f"{key} missing x-vllm- prefix" + + assert headers["x-vllm-total-time"] == "1000.00" + assert headers["x-vllm-queue-time"] == "50.00" + assert headers["x-vllm-prefill-time"] == "100.00" + assert headers["x-vllm-inference-time"] == "400.00" + assert headers["x-vllm-decode-time"] == "300.00" + assert headers["x-vllm-prompt-tokens"] == "50" + assert headers["x-vllm-completion-tokens"] == "10" + assert headers["x-vllm-cached-tokens"] == "5" + assert headers["x-vllm-time-per-output-token"] == "30.00" + + +def test_build_headers_zero_decode(): + """Single-token completion: mean_time_per_output_token is 0.""" + headers = build_request_stats_headers( + _stats(num_generation_tokens=1, decode_time=0.0, mean_time_per_output_token=0.0) + ) + assert headers["x-vllm-time-per-output-token"] == "0.00" + assert headers["x-vllm-completion-tokens"] == "1" + + +def _create_test_app() -> FastAPI: + app = FastAPI() + app.middleware("http")(request_stats_headers_middleware) + + @app.get("/with-stats") + async def with_stats(request: Request) -> JSONResponse: + meta = RequestResponseMetadata(request_id="r") + meta.finished_stats = _stats() + request.state.request_metadata = meta + return JSONResponse({"ok": True}) + + @app.get("/no-stats") + async def no_stats(request: Request) -> JSONResponse: + meta = RequestResponseMetadata(request_id="r") + request.state.request_metadata = meta + return JSONResponse({"ok": True}) + + @app.get("/no-metadata") + async def no_metadata() -> JSONResponse: + return JSONResponse({"ok": True}) + + return app + + +@pytest.mark.asyncio +async def test_middleware_attaches_headers_when_stats_present(): + app = _create_test_app() + async with 
AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c: + resp = await c.get("/with-stats") + assert resp.status_code == 200 + assert resp.headers["x-vllm-decode-time"] == "300.00" + assert resp.headers["x-vllm-prompt-tokens"] == "50" + + +@pytest.mark.asyncio +async def test_middleware_passes_through_when_finished_stats_missing(): + app = _create_test_app() + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c: + resp = await c.get("/no-stats") + assert resp.status_code == 200 + assert "x-vllm-decode-time" not in resp.headers + + +@pytest.mark.asyncio +async def test_middleware_passes_through_when_metadata_missing(): + app = _create_test_app() + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c: + resp = await c.get("/no-metadata") + assert resp.status_code == 200 + assert "x-vllm-decode-time" not in resp.headers diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py index 21f496ea4aea..3c0cbda15891 100644 --- a/tests/v1/metrics/test_stats.py +++ b/tests/v1/metrics/test_stats.py @@ -244,3 +244,30 @@ def test_prompt_token_stats_full_external_transfer_recompute(): assert stats.external_kv_transfer == 999 assert stats.cached_tokens == 999 assert stats.total == 1000 + + +def test_update_from_finished_request_returns_finished_stats(): + """update_from_finished_request returns the same FinishedRequestStats it appends.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats( + arrival_time=100.0, + queued_ts=100.05, + scheduled_ts=100.10, + first_token_ts=100.20, + last_token_ts=100.50, + num_generation_tokens=5, + ) + + returned = iteration_stats.update_from_finished_request( + finish_reason=FinishReason.STOP, + request_id="req-1", + num_prompt_tokens=10, + max_tokens_param=None, + req_stats=req_stats, + num_cached_tokens=3, + ) + + assert returned is not None + assert iteration_stats.finished_requests[-1] is returned + assert returned.request_id == 
"req-1" + assert returned.num_cached_tokens == 3 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index da2ec10284c5..7f7157b15c1b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -32,6 +32,9 @@ from vllm.entrypoints.openai.engine.protocol import GenerationError from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.openai.request_stats_headers import ( + request_stats_headers_middleware, +) from vllm.entrypoints.openai.server_utils import ( engine_error_handler, exception_handler, @@ -279,6 +282,9 @@ def build_app( app.add_middleware(XRequestIdMiddleware) + if args.enable_request_stats_headers: + app.middleware("http")(request_stats_headers_middleware) + # Add scaling middleware to check for scaling state app.add_middleware(ScalingMiddleware) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 1026e0a1e3f7..c25b5ab47f28 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1357,6 +1357,7 @@ async def chat_completion_full_generator( ) request_metadata.final_usage_info = usage + request_metadata.finished_stats = final_res.finished_stats prompt_routed_experts = None if final_res.prompt_routed_experts is not None: diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 3ebdec3c67fa..8c5d2e2fb781 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -151,6 +151,11 @@ class BaseFrontendArgs: """ log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE """If set to True, log the stack trace of error responses""" + enable_request_stats_headers: bool = False + """If set to True, include per-request timing and compute stats as + x-vllm-* response headers on non-streaming 
responses. Headers reflect + the same intervals reported by Prometheus (e2e, queued, prefill, + decode, inference, mean time per output token).""" tokens_only: bool = False """ If set to True, only enable the Tokens In<>Out endpoint. diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index ee4ca9f3ada3..a8b895e3462c 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -570,6 +570,11 @@ def request_output_to_completion_response( ) request_metadata.final_usage_info = usage + if last_final_res is not None: + # Known limitation: for multi-prompt batch requests, timing + # headers reflect only the last prompt's metrics. Token counts + # in usage are correctly summed across all prompts. + request_metadata.finished_stats = last_final_res.finished_stats prompt_routed_experts = None if final_res_batch: kv_transfer_params = final_res_batch[0].kv_transfer_params diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index 890af0300efc..a599ccc98ac5 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -20,6 +20,7 @@ from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.utils.import_utils import resolve_obj_by_qualname +from vllm.v1.metrics.stats import FinishedRequestStats logger = init_logger(__name__) @@ -110,8 +111,11 @@ class UsageInfo(OpenAIBaseModel): class RequestResponseMetadata(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + request_id: str final_usage_info: UsageInfo | None = None + finished_stats: FinishedRequestStats | None = None class JsonSchemaResponseFormat(OpenAIBaseModel): @@ -277,3 +281,14 @@ class GenerationError(Exception): def __init__(self, message: str = "Internal server error"): super().__init__(message) self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR + + +# 
RequestResponseMetadata.finished_stats references FinishedRequestStats, which +# in turn has a forward-reference annotation `FinishReason`. Pydantic resolves +# forward refs in the namespace of the module where the dataclass was defined +# (vllm.v1.metrics.stats), so we must supply the real symbol via +# _types_namespace. Import here (not at module top) to keep this localized to +# the rebuild and avoid polluting the public import surface. +from vllm.v1.engine import FinishReason as _FinishReason # noqa: E402, F401 + +RequestResponseMetadata.model_rebuild(_types_namespace={"FinishReason": _FinishReason}) diff --git a/vllm/entrypoints/openai/request_stats_headers.py b/vllm/entrypoints/openai/request_stats_headers.py new file mode 100644 index 000000000000..bf5a217d23ea --- /dev/null +++ b/vllm/entrypoints/openai/request_stats_headers.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from starlette.requests import Request +from starlette.responses import Response + +if TYPE_CHECKING: + from collections.abc import Callable + + from vllm.v1.metrics.stats import FinishedRequestStats + + +def build_request_stats_headers(stats: FinishedRequestStats) -> dict[str, str]: + """Format computed request timings as x-vllm-* response headers. + + Times are in milliseconds, rounded to 2 decimal places. Values come + from FinishedRequestStats; only s-to-ms scaling and formatting happen here.
+ """ + return { + "x-vllm-total-time": f"{stats.e2e_latency * 1000:.2f}", + "x-vllm-queue-time": f"{stats.queued_time * 1000:.2f}", + "x-vllm-inference-time": f"{stats.inference_time * 1000:.2f}", + "x-vllm-prefill-time": f"{stats.prefill_time * 1000:.2f}", + "x-vllm-decode-time": f"{stats.decode_time * 1000:.2f}", + "x-vllm-prompt-tokens": str(stats.num_prompt_tokens), + "x-vllm-completion-tokens": str(stats.num_generation_tokens), + "x-vllm-cached-tokens": str(stats.num_cached_tokens), + "x-vllm-time-per-output-token": ( + f"{stats.mean_time_per_output_token * 1000:.2f}" + ), + } + + +async def request_stats_headers_middleware( + request: Request, + call_next: Callable, +) -> Response: + """FastAPI middleware that attaches x-vllm-* timing headers. + + Reads request.state.request_metadata (populated by the serving layer). + No-op if metadata or finished_stats is missing — covers streaming, + errors, and non-OpenAI routes. + """ + response = await call_next(request) + metadata = getattr(request.state, "request_metadata", None) + if metadata is None or metadata.finished_stats is None: + return response + for key, value in build_request_stats_headers(metadata.finished_stats).items(): + response.headers[key] = value + return response diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 9c4dc48589ff..9af2f999e626 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -890,6 +890,13 @@ async def responses_full_generator( kv_transfer_params=context.kv_transfer_params, ) + # Attach finished_stats for x-vllm-* response headers. + # Known limitation: for multi-turn tool-calling flows, timing + # breakdown reflects only the final turn. Total wall-clock time + # is still correct. 
+ if hasattr(context, "last_output") and context.last_output is not None: + request_metadata.finished_stats = context.last_output.finished_stats + if request.store: async with self.response_store_lock: stored_response = self.response_store.get(response.id) diff --git a/vllm/outputs.py b/vllm/outputs.py index aa6d12768ccc..bda70d4c728c 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -13,7 +13,7 @@ from vllm.logger import init_logger from vllm.logprobs import PromptLogprobs, SampleLogprobs from vllm.lora.request import LoRARequest -from vllm.v1.metrics.stats import RequestStateStats +from vllm.v1.metrics.stats import FinishedRequestStats, RequestStateStats logger = init_logger(__name__) @@ -104,6 +104,8 @@ class RequestOutput: None if decoder-only. num_cached_tokens: The number of tokens with prefix cache hit. kv_transfer_params: The params for remote K/V transfer. + finished_stats: Computed timing intervals for the finished request, + set on the terminal RequestOutput. Internal use only. """ def __init__( @@ -122,6 +124,7 @@ def __init__( *, kv_transfer_params: dict[str, Any] | None = None, prompt_routed_experts: np.ndarray | None = None, + finished_stats: FinishedRequestStats | None = None, # Forward compatibility, code that uses args added in new release can # still run with older versions of vLLM without breaking. 
**kwargs: Any, @@ -143,6 +146,7 @@ def __init__( self.num_cached_tokens = num_cached_tokens self.kv_transfer_params = kv_transfer_params self.prompt_routed_experts = prompt_routed_experts + self.finished_stats = finished_stats def add(self, next_output: "RequestOutput", aggregate: bool) -> None: """Merge subsequent RequestOutput into this one""" @@ -187,6 +191,7 @@ def __repr__(self) -> str: f"outputs={self.outputs}, " f"finished={self.finished}, " f"metrics={self.metrics}, " + f"finished_stats={self.finished_stats}, " f"lora_request={self.lora_request}, " f"num_cached_tokens={self.num_cached_tokens})" ) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index e0e53694c400..301fc6fd830e 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -35,6 +35,7 @@ from vllm.v1.engine.logprobs import LogprobsProcessor from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.metrics.stats import ( + FinishedRequestStats, IterationStats, LoRARequestStates, RequestStateStats, @@ -177,6 +178,7 @@ def __init__( self.num_cached_tokens = 0 self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None + self.finished_stats: FinishedRequestStats | None = None # Stream Interval self.stream_interval = stream_interval @@ -396,6 +398,7 @@ def _new_request_output( num_cached_tokens=self.num_cached_tokens, metrics=self.stats, prompt_routed_experts=prompt_routed_experts, + finished_stats=self.finished_stats, ) def _new_completion_output( @@ -703,6 +706,14 @@ def process_outputs( self._update_stats_from_finished( req_state, finish_reason, iteration_stats ) + # Attach finished_stats to the already-built RequestOutput so + # downstream serving layers can read it. We can't compute + # finished_stats earlier because update_from_finished_request + # depends on prior side effects in this iteration. 
+ if request_output is not None and isinstance( + request_output, RequestOutput + ): + request_output.finished_stats = req_state.finished_stats if self.tracing_enabled: self.do_tracing(engine_core_output, req_state, iteration_stats) @@ -822,7 +833,7 @@ def _update_stats_from_finished( assert finish_reason is not None assert req_state.stats is not None - iteration_stats.update_from_finished_request( + req_state.finished_stats = iteration_stats.update_from_finished_request( finish_reason=finish_reason, request_id=req_state.external_req_id, num_prompt_tokens=req_state.prompt_len, diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index a7a5fb7a2d2f..4a807639841d 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -433,7 +433,7 @@ def update_from_finished_request( max_tokens_param: int | None, req_stats: RequestStateStats, num_cached_tokens: int = 0, - ): + ) -> FinishedRequestStats: e2e_latency = self._time_since(req_stats.arrival_time) # Queued interval is from first QUEUED event to first SCHEDULED @@ -479,6 +479,8 @@ def update_from_finished_request( if req_stats.is_corrupted: self.num_corrupted_reqs += 1 + return finished_req + class LoRAStats: """Tracks waiting and running request IDs for a single LoRA.""" From 09ae3a25515e243c6dc9ee210dea7b463b66b518 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Sat, 9 May 2026 21:02:13 -0700 Subject: [PATCH 2/3] [Frontend] Extract FinishReason to leaf module to remove model_rebuild workaround The previous commit added a `RequestResponseMetadata.model_rebuild()` call at the bottom of `protocol.py` to repair a Pydantic forward-reference resolution failure. The root cause was structural: `FinishedRequestStats` lives in `vllm.v1.metrics.stats` and annotates `finish_reason: "FinishReason"`, but `FinishReason` was defined in `vllm.v1.engine.__init__` - which already imports `vllm.v1.metrics.stats` at runtime. 
To break that cycle, `stats.py` hid the import under `TYPE_CHECKING`, which left the forward-reference string unresolvable when Pydantic tried to introspect the dataclass. This change moves `FinishReason` and `FINISH_REASON_STRINGS` into a new leaf module `vllm/v1/finish_reason.py` that imports nothing else from `vllm.v1.*`. `vllm/v1/engine/__init__.py` re-exports the symbols so every existing `from vllm.v1.engine import FinishReason` keeps working unchanged. The class identity is preserved (re-export, not redefinition), so isinstance checks and enum value comparisons continue to work. `stats.py` can now import `FinishReason` at module top level without circularity, and the `model_rebuild` block in `protocol.py` is deleted - the forward reference resolves naturally. Net: -1 line, 4 files touched, no behavior change. Co-Authored-By: Claude Signed-off-by: Vinay Damodaran --- vllm/entrypoints/openai/engine/protocol.py | 11 ------- vllm/v1/engine/__init__.py | 34 ++++------------------ vllm/v1/finish_reason.py | 33 +++++++++++++++++++++ vllm/v1/metrics/stats.py | 7 +++-- 4 files changed, 42 insertions(+), 43 deletions(-) create mode 100644 vllm/v1/finish_reason.py diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index a599ccc98ac5..857b339be770 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -281,14 +281,3 @@ class GenerationError(Exception): def __init__(self, message: str = "Internal server error"): super().__init__(message) self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR - - -# RequestResponseMetadata.finished_stats references FinishedRequestStats, which -# in turn has a forward-reference annotation `FinishReason`. Pydantic resolves -# forward refs in the namespace of the module where the dataclass was defined -# (vllm.v1.metrics.stats), so we must supply the real symbol via -# _types_namespace. 
Import here (not at module top) to keep this localized to -# the rebuild and avoid polluting the public import surface. -from vllm.v1.engine import FinishReason as _FinishReason # noqa: E402, F401 - -RequestResponseMetadata.model_rebuild(_types_namespace={"FinishReason": _FinishReason}) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 8172ead08319..2994bd614be8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -15,6 +15,11 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams + +# FinishReason is re-exported for backwards compatibility. The canonical home +# is vllm.v1.finish_reason - a leaf module that vllm.v1.metrics can import at +# runtime without circularity. +from vllm.v1.finish_reason import FINISH_REASON_STRINGS, FinishReason # noqa: F401 from vllm.v1.metrics.stats import PrefillStats, SchedulerStats from vllm.v1.outputs import LogprobsLists, LogprobsTensors from vllm.v1.serial_utils import UtilityResult @@ -25,10 +30,6 @@ # - "keep": Freeze requests in queue; they resume on resume_generation(). PauseMode = Literal["abort", "wait", "keep"] -# These are possible values of RequestOutput.finish_reason, -# so form part of the external API. -FINISH_REASON_STRINGS = ("stop", "length", "abort", "error", "repetition") - EEP_NOTIFICATION_CALL_ID = -1 @@ -39,31 +40,6 @@ class EEPNotificationType(enum.Enum): SHUTDOWN_COMPLETE = "SHUTDOWN_COMPLETE" -class FinishReason(enum.IntEnum): - """ - Reason a request finished - stop, length, abort, error, or repetition. - - Int rather than Str for more compact serialization. - - stop - a stop string was emitted - length - max_tokens was consumed, or max_model_len was reached - abort - aborted by client - error - retryable request-level internal error (e.g., KV load failure). - Invariant: always converted to 500 Internal Server Error. 
- repetition - repetitive token pattern detected (hallucination) - - """ - - STOP = 0 - LENGTH = 1 - ABORT = 2 - ERROR = 3 - REPETITION = 4 - - def __str__(self): - return FINISH_REASON_STRINGS[self.value] - - @dataclass class EngineCoreReadyResponse: """Sent from EngineCore to each frontend at the end of engine startup. diff --git a/vllm/v1/finish_reason.py b/vllm/v1/finish_reason.py new file mode 100644 index 000000000000..01972f5f6d27 --- /dev/null +++ b/vllm/v1/finish_reason.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import enum + +# These are possible values of RequestOutput.finish_reason, +# so form part of the external API. +FINISH_REASON_STRINGS = ("stop", "length", "abort", "error", "repetition") + + +class FinishReason(enum.IntEnum): + """ + Reason a request finished - stop, length, abort, error, or repetition. + + Int rather than Str for more compact serialization. + + stop - a stop string was emitted + length - max_tokens was consumed, or max_model_len was reached + abort - aborted by client + error - retryable request-level internal error (e.g., KV load failure). + Invariant: always converted to 500 Internal Server Error. 
+ repetition - repetitive token pattern detected (hallucination) + + """ + + STOP = 0 + LENGTH = 1 + ABORT = 2 + ERROR = 3 + REPETITION = 4 + + def __str__(self): + return FINISH_REASON_STRINGS[self.value] diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 4a807639841d..46738b037b24 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -8,11 +8,12 @@ import vllm.envs as envs from vllm.compilation.cuda_graph import CUDAGraphStat +from vllm.v1.finish_reason import FinishReason from vllm.v1.metrics.perf import PerfStats from vllm.v1.spec_decode.metrics import SpecDecodingStats if TYPE_CHECKING: - from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason + from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput @dataclass @@ -224,7 +225,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" - finish_reason: "FinishReason" + finish_reason: FinishReason request_id: str | None = None e2e_latency: float = 0.0 num_prompt_tokens: int = 0 @@ -427,7 +428,7 @@ def update_from_events( def update_from_finished_request( self, - finish_reason: "FinishReason", + finish_reason: FinishReason, request_id: str, num_prompt_tokens: int, max_tokens_param: int | None, From 72507ce31d918da9a810a7f532c7464080dc9d04 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Sat, 9 May 2026 21:15:33 -0700 Subject: [PATCH 3/3] [Frontend] Address PR review: reorder stats finalization, add flag validation, skip multi-prompt headers Three changes addressing PR #42198 review feedback: 1. Reorder _update_stats_from_finished before make_request_output in the output processor. The previous version attached finished_stats to the already-built RequestOutput after the fact; in AsyncLLM that meant the queued output was momentarily stale. 
The reorder is safe because update_from_finished_request only reads req_state.stats, which is already finalized by _update_stats_from_output earlier in the same iteration. Removes the post-attach hack and the misleading comment claiming a side-effect ordering constraint. 2. Reject --enable-request-stats-headers + --disable-log-stats at startup. The two flags are silently incompatible: when log_stats is off, req_state.stats is None and finished_stats is never produced, so the middleware becomes a permanent no-op. Fail loudly instead. 3. Skip emitting x-vllm-* headers for multi-prompt batched /v1/completions requests. The previous code reported only the last prompt's stats with a comment flagging the limitation; that's misleading. Per-prompt FinishedRequestStats can't be meaningfully aggregated (queue/prefill/ decode intervals are per-prompt), so we skip headers entirely when len(final_res_batch) > 1. Single-prompt requests are unchanged. Co-Authored-By: Claude Signed-off-by: Vinay Damodaran --- vllm/entrypoints/openai/cli_args.py | 6 ++++++ vllm/entrypoints/openai/completion/serving.py | 10 +++++---- vllm/v1/engine/output_processor.py | 21 +++++++------------ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 8c5d2e2fb781..aa3a8e3b4b73 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -391,6 +391,12 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser") if args.enable_log_outputs and not args.enable_log_requests: raise TypeError("Error: --enable-log-outputs requires --enable-log-requests") + if args.enable_request_stats_headers and args.disable_log_stats: + raise TypeError( + "Error: --enable-request-stats-headers requires per-request stats " + "collection, which is disabled by --disable-log-stats. Drop one of " + "the two flags." 
+ ) def create_parser_for_docs() -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index a8b895e3462c..141cab072bd4 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -570,10 +570,12 @@ def request_output_to_completion_response( ) request_metadata.final_usage_info = usage - if last_final_res is not None: - # Known limitation: for multi-prompt batch requests, timing - # headers reflect only the last prompt's metrics. Token counts - # in usage are correctly summed across all prompts. + # x-vllm-* headers reflect a single request's timing. For multi-prompt + # batch requests the per-prompt FinishedRequestStats can't be + # meaningfully aggregated (queue/prefill/decode intervals are + # per-prompt), so we skip emitting headers entirely in that case. + # Token counts in usage are correctly summed across all prompts. + if last_final_res is not None and len(final_res_batch) == 1: request_metadata.finished_stats = last_final_res.finished_stats prompt_routed_experts = None if final_res_batch: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 301fc6fd830e..0964f21578c0 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -668,7 +668,14 @@ def process_outputs( # if required. req_state.logprobs_processor.update_from_output(engine_core_output) - # 4) Create and handle RequestOutput objects. + # 4) Finalize per-request stats before constructing RequestOutput + # so that finished_stats is populated on the output we hand off. + if finish_reason is not None and not req_state.streaming_input: + self._update_stats_from_finished( + req_state, finish_reason, iteration_stats + ) + + # 5) Create and handle RequestOutput objects. 
if request_output := req_state.make_request_output( new_token_ids, pooling_output, @@ -702,18 +709,6 @@ def process_outputs( # detected stop string, abort needed in EngineCore. reqs_to_abort.append(req_id) - # Track per-request stats - self._update_stats_from_finished( - req_state, finish_reason, iteration_stats - ) - # Attach finished_stats to the already-built RequestOutput so - # downstream serving layers can read it. We can't compute - # finished_stats earlier because update_from_finished_request - # depends on prior side effects in this iteration. - if request_output is not None and isinstance( - request_output, RequestOutput - ): - request_output.finished_stats = req_state.finished_stats if self.tracing_enabled: self.do_tracing(engine_core_output, req_state, iteration_stats)