From e725b27b4293ca964fe1987d0f890e214d1f0771 Mon Sep 17 00:00:00 2001
From: Vinay Damodaran <vrdn@hey.com>
Date: Sat, 9 May 2026 18:20:16 -0700
Subject: [PATCH 1/3] [Frontend] Add x-vllm-* response headers for per-request
 stats

Adds an opt-in --enable-request-stats-headers flag that attaches
per-request timing and token-count headers to non-streaming OpenAI
responses (chat, completion, responses).

The timing intervals (queue, prefill, decode, inference, e2e) and
mean time per output token are computed in exactly one place:
IterationStats.update_from_finished_request, which now returns the
FinishedRequestStats it builds. Both the Prometheus path and the
new headers middleware consume the same object - the middleware
derives no intervals itself; it only converts seconds to milliseconds
and formats the values.

Headers added (all opt-in via --enable-request-stats-headers; times are
in milliseconds with two decimals, token counts are plain integers):
  x-vllm-total-time, x-vllm-queue-time, x-vllm-prefill-time,
  x-vllm-decode-time, x-vllm-inference-time,
  x-vllm-prompt-tokens, x-vllm-completion-tokens,
  x-vllm-cached-tokens, x-vllm-time-per-output-token

Streaming responses are unchanged. Error responses are unchanged.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Vinay Damodaran <vrdn@hey.com>
---
 .../openai/test_request_stats_headers.py      | 115 ++++++++++++++++++
 tests/v1/metrics/test_stats.py                |  27 ++++
 vllm/entrypoints/openai/api_server.py         |   6 +
 .../openai/chat_completion/serving.py         |   1 +
 vllm/entrypoints/openai/cli_args.py           |   5 +
 vllm/entrypoints/openai/completion/serving.py |   5 +
 vllm/entrypoints/openai/engine/protocol.py    |  15 +++
 .../openai/request_stats_headers.py           |  54 ++++++++
 vllm/entrypoints/openai/responses/serving.py  |   7 ++
 vllm/outputs.py                               |   7 +-
 vllm/v1/engine/output_processor.py            |  13 +-
 vllm/v1/metrics/stats.py                      |   4 +-
 12 files changed, 256 insertions(+), 3 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_request_stats_headers.py
 create mode 100644 vllm/entrypoints/openai/request_stats_headers.py

diff --git a/tests/entrypoints/openai/test_request_stats_headers.py b/tests/entrypoints/openai/test_request_stats_headers.py
new file mode 100644
index 000000000000..b7ca0769b42d
--- /dev/null
+++ b/tests/entrypoints/openai/test_request_stats_headers.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import pytest
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+from httpx import ASGITransport, AsyncClient
+
+from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata
+from vllm.entrypoints.openai.request_stats_headers import (
+    build_request_stats_headers,
+    request_stats_headers_middleware,
+)
+from vllm.v1.engine import FinishReason
+from vllm.v1.metrics.stats import FinishedRequestStats
+
+
+def _stats(**overrides) -> FinishedRequestStats:
+    base = dict(
+        finish_reason=FinishReason.STOP,
+        request_id="req-1",
+        e2e_latency=1.0,
+        num_prompt_tokens=50,
+        num_generation_tokens=10,
+        max_tokens_param=None,
+        queued_time=0.05,
+        prefill_time=0.10,
+        inference_time=0.40,
+        decode_time=0.30,
+        mean_time_per_output_token=0.030,
+        is_corrupted=False,
+        num_cached_tokens=5,
+    )
+    base.update(overrides)
+    return FinishedRequestStats(**base)
+
+
+def test_build_headers_basic():
+    headers = build_request_stats_headers(_stats())
+
+    for key in headers:
+        assert key.startswith("x-vllm-"), f"{key} missing x-vllm- prefix"
+
+    assert headers["x-vllm-total-time"] == "1000.00"
+    assert headers["x-vllm-queue-time"] == "50.00"
+    assert headers["x-vllm-prefill-time"] == "100.00"
+    assert headers["x-vllm-inference-time"] == "400.00"
+    assert headers["x-vllm-decode-time"] == "300.00"
+    assert headers["x-vllm-prompt-tokens"] == "50"
+    assert headers["x-vllm-completion-tokens"] == "10"
+    assert headers["x-vllm-cached-tokens"] == "5"
+    assert headers["x-vllm-time-per-output-token"] == "30.00"
+
+
+def test_build_headers_zero_decode():
+    """Single-token completion: mean_time_per_output_token is 0."""
+    headers = build_request_stats_headers(
+        _stats(num_generation_tokens=1, decode_time=0.0, mean_time_per_output_token=0.0)
+    )
+    assert headers["x-vllm-time-per-output-token"] == "0.00"
+    assert headers["x-vllm-completion-tokens"] == "1"
+
+
+def _create_test_app() -> FastAPI:
+    app = FastAPI()
+    app.middleware("http")(request_stats_headers_middleware)
+
+    @app.get("/with-stats")
+    async def with_stats(request: Request) -> JSONResponse:
+        meta = RequestResponseMetadata(request_id="r")
+        meta.finished_stats = _stats()
+        request.state.request_metadata = meta
+        return JSONResponse({"ok": True})
+
+    @app.get("/no-stats")
+    async def no_stats(request: Request) -> JSONResponse:
+        meta = RequestResponseMetadata(request_id="r")
+        request.state.request_metadata = meta
+        return JSONResponse({"ok": True})
+
+    @app.get("/no-metadata")
+    async def no_metadata() -> JSONResponse:
+        return JSONResponse({"ok": True})
+
+    return app
+
+
+@pytest.mark.asyncio
+async def test_middleware_attaches_headers_when_stats_present():
+    app = _create_test_app()
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c:
+        resp = await c.get("/with-stats")
+    assert resp.status_code == 200
+    assert resp.headers["x-vllm-decode-time"] == "300.00"
+    assert resp.headers["x-vllm-prompt-tokens"] == "50"
+
+
+@pytest.mark.asyncio
+async def test_middleware_passes_through_when_finished_stats_missing():
+    app = _create_test_app()
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c:
+        resp = await c.get("/no-stats")
+    assert resp.status_code == 200
+    assert "x-vllm-decode-time" not in resp.headers
+
+
+@pytest.mark.asyncio
+async def test_middleware_passes_through_when_metadata_missing():
+    app = _create_test_app()
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c:
+        resp = await c.get("/no-metadata")
+    assert resp.status_code == 200
+    assert "x-vllm-decode-time" not in resp.headers
diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py
index 21f496ea4aea..3c0cbda15891 100644
--- a/tests/v1/metrics/test_stats.py
+++ b/tests/v1/metrics/test_stats.py
@@ -244,3 +244,30 @@ def test_prompt_token_stats_full_external_transfer_recompute():
     assert stats.external_kv_transfer == 999
     assert stats.cached_tokens == 999
     assert stats.total == 1000
+
+
+def test_update_from_finished_request_returns_finished_stats():
+    """update_from_finished_request returns the same FinishedRequestStats it appends."""
+    iteration_stats = IterationStats()
+    req_stats = RequestStateStats(
+        arrival_time=100.0,
+        queued_ts=100.05,
+        scheduled_ts=100.10,
+        first_token_ts=100.20,
+        last_token_ts=100.50,
+        num_generation_tokens=5,
+    )
+
+    returned = iteration_stats.update_from_finished_request(
+        finish_reason=FinishReason.STOP,
+        request_id="req-1",
+        num_prompt_tokens=10,
+        max_tokens_param=None,
+        req_stats=req_stats,
+        num_cached_tokens=3,
+    )
+
+    assert returned is not None
+    assert iteration_stats.finished_requests[-1] is returned
+    assert returned.request_id == "req-1"
+    assert returned.num_cached_tokens == 3
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index da2ec10284c5..7f7157b15c1b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -32,6 +32,9 @@
 from vllm.entrypoints.openai.engine.protocol import GenerationError
 from vllm.entrypoints.openai.models.protocol import BaseModelPath
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.openai.request_stats_headers import (
+    request_stats_headers_middleware,
+)
 from vllm.entrypoints.openai.server_utils import (
     engine_error_handler,
     exception_handler,
@@ -279,6 +282,9 @@ def build_app(
 
         app.add_middleware(XRequestIdMiddleware)
 
+    if args.enable_request_stats_headers:
+        app.middleware("http")(request_stats_headers_middleware)
+
     # Add scaling middleware to check for scaling state
     app.add_middleware(ScalingMiddleware)
 
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 1026e0a1e3f7..c25b5ab47f28 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -1357,6 +1357,7 @@ async def chat_completion_full_generator(
             )
 
         request_metadata.final_usage_info = usage
+        request_metadata.finished_stats = final_res.finished_stats
 
         prompt_routed_experts = None
         if final_res.prompt_routed_experts is not None:
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 3ebdec3c67fa..8c5d2e2fb781 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -151,6 +151,11 @@ class BaseFrontendArgs:
     """
     log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
     """If set to True, log the stack trace of error responses"""
+    enable_request_stats_headers: bool = False
+    """If set to True, include per-request timing and compute stats as
+    x-vllm-* response headers on non-streaming responses. Headers reflect
+    the same intervals reported by Prometheus (e2e, queued, prefill,
+    decode, inference, mean time per output token)."""
     tokens_only: bool = False
     """
     If set to True, only enable the Tokens In<>Out endpoint.
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index ee4ca9f3ada3..a8b895e3462c 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -570,6 +570,11 @@ def request_output_to_completion_response(
             )
 
         request_metadata.final_usage_info = usage
+        if last_final_res is not None:
+            # Known limitation: for multi-prompt batch requests, timing
+            # headers reflect only the last prompt's metrics. Token counts
+            # in usage are correctly summed across all prompts.
+            request_metadata.finished_stats = last_final_res.finished_stats
         prompt_routed_experts = None
         if final_res_batch:
             kv_transfer_params = final_res_batch[0].kv_transfer_params
diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py
index 890af0300efc..a599ccc98ac5 100644
--- a/vllm/entrypoints/openai/engine/protocol.py
+++ b/vllm/entrypoints/openai/engine/protocol.py
@@ -20,6 +20,7 @@
 from vllm.logger import init_logger
 from vllm.utils import random_uuid
 from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.v1.metrics.stats import FinishedRequestStats
 
 logger = init_logger(__name__)
 
@@ -110,8 +111,11 @@ class UsageInfo(OpenAIBaseModel):
 
 
 class RequestResponseMetadata(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
     request_id: str
     final_usage_info: UsageInfo | None = None
+    finished_stats: FinishedRequestStats | None = None
 
 
 class JsonSchemaResponseFormat(OpenAIBaseModel):
@@ -277,3 +281,14 @@ class GenerationError(Exception):
     def __init__(self, message: str = "Internal server error"):
         super().__init__(message)
         self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+# RequestResponseMetadata.finished_stats references FinishedRequestStats, which
+# in turn has a forward-reference annotation `FinishReason`. Pydantic resolves
+# forward refs in the namespace of the module where the dataclass was defined
+# (vllm.v1.metrics.stats), so we must supply the real symbol via
+# _types_namespace. Import here (not at module top) to keep this localized to
+# the rebuild and avoid polluting the public import surface.
+from vllm.v1.engine import FinishReason as _FinishReason  # noqa: E402, F401
+
+RequestResponseMetadata.model_rebuild(_types_namespace={"FinishReason": _FinishReason})
diff --git a/vllm/entrypoints/openai/request_stats_headers.py b/vllm/entrypoints/openai/request_stats_headers.py
new file mode 100644
index 000000000000..bf5a217d23ea
--- /dev/null
+++ b/vllm/entrypoints/openai/request_stats_headers.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from starlette.requests import Request
+from starlette.responses import Response
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from vllm.v1.metrics.stats import FinishedRequestStats
+
+
+def build_request_stats_headers(stats: FinishedRequestStats) -> dict[str, str]:
+    """Format computed request timings as x-vllm-* response headers.
+
+    Times are in milliseconds, rounded to 2 decimal places. Intervals come
+    from FinishedRequestStats; only the s-to-ms scaling happens here.
+    """
+    return {
+        "x-vllm-total-time": f"{stats.e2e_latency * 1000:.2f}",
+        "x-vllm-queue-time": f"{stats.queued_time * 1000:.2f}",
+        "x-vllm-inference-time": f"{stats.inference_time * 1000:.2f}",
+        "x-vllm-prefill-time": f"{stats.prefill_time * 1000:.2f}",
+        "x-vllm-decode-time": f"{stats.decode_time * 1000:.2f}",
+        "x-vllm-prompt-tokens": str(stats.num_prompt_tokens),
+        "x-vllm-completion-tokens": str(stats.num_generation_tokens),
+        "x-vllm-cached-tokens": str(stats.num_cached_tokens),
+        "x-vllm-time-per-output-token": (
+            f"{stats.mean_time_per_output_token * 1000:.2f}"
+        ),
+    }
+
+
+async def request_stats_headers_middleware(
+    request: Request,
+    call_next: Callable,
+) -> Response:
+    """FastAPI middleware that attaches x-vllm-* timing headers.
+
+    Reads request.state.request_metadata (populated by the serving layer).
+    No-op if metadata or finished_stats is missing — covers streaming,
+    errors, and non-OpenAI routes.
+    """
+    response = await call_next(request)
+    metadata = getattr(request.state, "request_metadata", None)
+    if metadata is None or metadata.finished_stats is None:
+        return response
+    for key, value in build_request_stats_headers(metadata.finished_stats).items():
+        response.headers[key] = value
+    return response
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index 9c4dc48589ff..9af2f999e626 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -890,6 +890,13 @@ async def responses_full_generator(
             kv_transfer_params=context.kv_transfer_params,
         )
 
+        # Attach finished_stats for x-vllm-* response headers.
+        # Known limitation: for multi-turn tool-calling flows, every header
+        # (timings and token counts) reflects only the final turn's stats.
+        # NOTE(review): confirm x-vllm-total-time still spans all turns.
+        if hasattr(context, "last_output") and context.last_output is not None:
+            request_metadata.finished_stats = context.last_output.finished_stats
+
         if request.store:
             async with self.response_store_lock:
                 stored_response = self.response_store.get(response.id)
diff --git a/vllm/outputs.py b/vllm/outputs.py
index aa6d12768ccc..bda70d4c728c 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -13,7 +13,7 @@
 from vllm.logger import init_logger
 from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
-from vllm.v1.metrics.stats import RequestStateStats
+from vllm.v1.metrics.stats import FinishedRequestStats, RequestStateStats
 
 logger = init_logger(__name__)
 
@@ -104,6 +104,8 @@ class RequestOutput:
                                   None if decoder-only.
         num_cached_tokens: The number of tokens with prefix cache hit.
         kv_transfer_params: The params for remote K/V transfer.
+        finished_stats: Computed timing intervals for the finished request,
+            set on the terminal RequestOutput. Internal use only.
     """
 
     def __init__(
@@ -122,6 +124,7 @@ def __init__(
         *,
         kv_transfer_params: dict[str, Any] | None = None,
         prompt_routed_experts: np.ndarray | None = None,
+        finished_stats: FinishedRequestStats | None = None,
         # Forward compatibility, code that uses args added in new release can
         # still run with older versions of vLLM without breaking.
         **kwargs: Any,
@@ -143,6 +146,7 @@ def __init__(
         self.num_cached_tokens = num_cached_tokens
         self.kv_transfer_params = kv_transfer_params
         self.prompt_routed_experts = prompt_routed_experts
+        self.finished_stats = finished_stats
 
     def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
         """Merge subsequent RequestOutput into this one"""
@@ -187,6 +191,7 @@ def __repr__(self) -> str:
             f"outputs={self.outputs}, "
             f"finished={self.finished}, "
             f"metrics={self.metrics}, "
+            f"finished_stats={self.finished_stats}, "
             f"lora_request={self.lora_request}, "
             f"num_cached_tokens={self.num_cached_tokens})"
         )
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index e0e53694c400..301fc6fd830e 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -35,6 +35,7 @@
 from vllm.v1.engine.logprobs import LogprobsProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.metrics.stats import (
+    FinishedRequestStats,
     IterationStats,
     LoRARequestStates,
     RequestStateStats,
@@ -177,6 +178,7 @@ def __init__(
         self.num_cached_tokens = 0
 
         self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None
+        self.finished_stats: FinishedRequestStats | None = None
 
         # Stream Interval
         self.stream_interval = stream_interval
@@ -396,6 +398,7 @@ def _new_request_output(
             num_cached_tokens=self.num_cached_tokens,
             metrics=self.stats,
             prompt_routed_experts=prompt_routed_experts,
+            finished_stats=self.finished_stats,
         )
 
     def _new_completion_output(
@@ -703,6 +706,14 @@ def process_outputs(
                     self._update_stats_from_finished(
                         req_state, finish_reason, iteration_stats
                     )
+                    # Attach finished_stats to the already-built RequestOutput so
+                    # downstream serving layers can read it. We can't compute
+                    # finished_stats earlier because update_from_finished_request
+                    # depends on prior side effects in this iteration.
+                    if request_output is not None and isinstance(
+                        request_output, RequestOutput
+                    ):
+                        request_output.finished_stats = req_state.finished_stats
                     if self.tracing_enabled:
                         self.do_tracing(engine_core_output, req_state, iteration_stats)
 
@@ -822,7 +833,7 @@ def _update_stats_from_finished(
 
         assert finish_reason is not None
         assert req_state.stats is not None
-        iteration_stats.update_from_finished_request(
+        req_state.finished_stats = iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
             request_id=req_state.external_req_id,
             num_prompt_tokens=req_state.prompt_len,
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index a7a5fb7a2d2f..4a807639841d 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -433,7 +433,7 @@ def update_from_finished_request(
         max_tokens_param: int | None,
         req_stats: RequestStateStats,
         num_cached_tokens: int = 0,
-    ):
+    ) -> FinishedRequestStats:
         e2e_latency = self._time_since(req_stats.arrival_time)
 
         # Queued interval is from first QUEUED event to first SCHEDULED
@@ -479,6 +479,8 @@ def update_from_finished_request(
         if req_stats.is_corrupted:
             self.num_corrupted_reqs += 1
 
+        return finished_req
+
 
 class LoRAStats:
     """Tracks waiting and running request IDs for a single LoRA."""

From 09ae3a25515e243c6dc9ee210dea7b463b66b518 Mon Sep 17 00:00:00 2001
From: Vinay Damodaran <vrdn@hey.com>
Date: Sat, 9 May 2026 21:02:13 -0700
Subject: [PATCH 2/3] [Frontend] Extract FinishReason to leaf module to remove
 model_rebuild workaround

The previous commit added a `RequestResponseMetadata.model_rebuild()` call at
the bottom of `protocol.py` to repair a Pydantic forward-reference resolution
failure. The root cause was structural: `FinishedRequestStats` lives in
`vllm.v1.metrics.stats` and annotates `finish_reason: "FinishReason"`, but
`FinishReason` was defined in `vllm.v1.engine.__init__` - which already
imports `vllm.v1.metrics.stats` at runtime. To break that cycle, `stats.py`
hid the import under `TYPE_CHECKING`, which left the forward-reference string
unresolvable when Pydantic tried to introspect the dataclass.

This change moves `FinishReason` and `FINISH_REASON_STRINGS` into a new leaf
module `vllm/v1/finish_reason.py` that imports nothing else from `vllm.v1.*`.
`vllm/v1/engine/__init__.py` re-exports the symbols so every existing
`from vllm.v1.engine import FinishReason` keeps working unchanged. The class
identity is preserved (re-export, not redefinition), so isinstance checks and
enum value comparisons continue to work.

`stats.py` can now import `FinishReason` at module top level without
circularity, and the `model_rebuild` block in `protocol.py` is deleted - the
forward reference resolves naturally.

Net: -1 line, 4 files touched, no behavior change.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Vinay Damodaran <vrdn@hey.com>
---
 vllm/entrypoints/openai/engine/protocol.py | 11 -------
 vllm/v1/engine/__init__.py                 | 34 ++++------------------
 vllm/v1/finish_reason.py                   | 33 +++++++++++++++++++++
 vllm/v1/metrics/stats.py                   |  7 +++--
 4 files changed, 42 insertions(+), 43 deletions(-)
 create mode 100644 vllm/v1/finish_reason.py

diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py
index a599ccc98ac5..857b339be770 100644
--- a/vllm/entrypoints/openai/engine/protocol.py
+++ b/vllm/entrypoints/openai/engine/protocol.py
@@ -281,14 +281,3 @@ class GenerationError(Exception):
     def __init__(self, message: str = "Internal server error"):
         super().__init__(message)
         self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
-
-
-# RequestResponseMetadata.finished_stats references FinishedRequestStats, which
-# in turn has a forward-reference annotation `FinishReason`. Pydantic resolves
-# forward refs in the namespace of the module where the dataclass was defined
-# (vllm.v1.metrics.stats), so we must supply the real symbol via
-# _types_namespace. Import here (not at module top) to keep this localized to
-# the rebuild and avoid polluting the public import surface.
-from vllm.v1.engine import FinishReason as _FinishReason  # noqa: E402, F401
-
-RequestResponseMetadata.model_rebuild(_types_namespace={"FinishReason": _FinishReason})
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 8172ead08319..2994bd614be8 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -15,6 +15,11 @@
 from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+
+# FinishReason is re-exported for backwards compatibility. The canonical home
+# is vllm.v1.finish_reason - a leaf module that vllm.v1.metrics can import at
+# runtime without circularity.
+from vllm.v1.finish_reason import FINISH_REASON_STRINGS, FinishReason  # noqa: F401
 from vllm.v1.metrics.stats import PrefillStats, SchedulerStats
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors
 from vllm.v1.serial_utils import UtilityResult
@@ -25,10 +30,6 @@
 # - "keep": Freeze requests in queue; they resume on resume_generation().
 PauseMode = Literal["abort", "wait", "keep"]
 
-# These are possible values of RequestOutput.finish_reason,
-# so form part of the external API.
-FINISH_REASON_STRINGS = ("stop", "length", "abort", "error", "repetition")
-
 EEP_NOTIFICATION_CALL_ID = -1
 
 
@@ -39,31 +40,6 @@ class EEPNotificationType(enum.Enum):
     SHUTDOWN_COMPLETE = "SHUTDOWN_COMPLETE"
 
 
-class FinishReason(enum.IntEnum):
-    """
-    Reason a request finished - stop, length, abort, error, or repetition.
-
-    Int rather than Str for more compact serialization.
-
-    stop - a stop string was emitted
-    length - max_tokens was consumed, or max_model_len was reached
-    abort - aborted by client
-    error - retryable request-level internal error (e.g., KV load failure).
-            Invariant: always converted to 500 Internal Server Error.
-    repetition - repetitive token pattern detected (hallucination)
-
-    """
-
-    STOP = 0
-    LENGTH = 1
-    ABORT = 2
-    ERROR = 3
-    REPETITION = 4
-
-    def __str__(self):
-        return FINISH_REASON_STRINGS[self.value]
-
-
 @dataclass
 class EngineCoreReadyResponse:
     """Sent from EngineCore to each frontend at the end of engine startup.
diff --git a/vllm/v1/finish_reason.py b/vllm/v1/finish_reason.py
new file mode 100644
index 000000000000..01972f5f6d27
--- /dev/null
+++ b/vllm/v1/finish_reason.py
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import enum
+
+# These are possible values of RequestOutput.finish_reason,
+# so form part of the external API.
+FINISH_REASON_STRINGS = ("stop", "length", "abort", "error", "repetition")
+
+
+class FinishReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, abort, error, or repetition.
+
+    Int rather than Str for more compact serialization.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted by client
+    error - retryable request-level internal error (e.g., KV load failure).
+            Invariant: always converted to 500 Internal Server Error.
+    repetition - repetitive token pattern detected (hallucination)
+
+    """
+
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+    ERROR = 3
+    REPETITION = 4
+
+    def __str__(self):
+        return FINISH_REASON_STRINGS[self.value]
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 4a807639841d..46738b037b24 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -8,11 +8,12 @@
 
 import vllm.envs as envs
 from vllm.compilation.cuda_graph import CUDAGraphStat
+from vllm.v1.finish_reason import FinishReason
 from vllm.v1.metrics.perf import PerfStats
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
 if TYPE_CHECKING:
-    from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason
+    from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput
 
 
 @dataclass
@@ -224,7 +225,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
-    finish_reason: "FinishReason"
+    finish_reason: FinishReason
     request_id: str | None = None
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
@@ -427,7 +428,7 @@ def update_from_events(
 
     def update_from_finished_request(
         self,
-        finish_reason: "FinishReason",
+        finish_reason: FinishReason,
         request_id: str,
         num_prompt_tokens: int,
         max_tokens_param: int | None,

From 72507ce31d918da9a810a7f532c7464080dc9d04 Mon Sep 17 00:00:00 2001
From: Vinay Damodaran <vrdn@hey.com>
Date: Sat, 9 May 2026 21:15:33 -0700
Subject: [PATCH 3/3] [Frontend] Address PR review: reorder stats finalization,
 add flag validation, skip multi-prompt headers

Three changes addressing PR #42198 review feedback:

1. Reorder _update_stats_from_finished before make_request_output in the
   output processor. The previous version attached finished_stats to the
   already-built RequestOutput after the fact; in AsyncLLM that meant the
   queued output was momentarily stale. The reorder is safe because
   update_from_finished_request only reads req_state.stats, which is
   already finalized by _update_stats_from_output earlier in the same
   iteration. Removes the post-attach hack and the misleading comment
   claiming a side-effect ordering constraint.

2. Reject --enable-request-stats-headers + --disable-log-stats at startup.
   The two flags are silently incompatible: when log_stats is off,
   req_state.stats is None and finished_stats is never produced, so the
   middleware becomes a permanent no-op. Fail loudly instead.

3. Skip emitting x-vllm-* headers for multi-prompt batched /v1/completions
   requests. The previous code reported only the last prompt's stats with
   a comment flagging the limitation; that's misleading. Per-prompt
   FinishedRequestStats can't be meaningfully aggregated (queue/prefill/
   decode intervals are per-prompt), so we skip headers entirely when
   len(final_res_batch) > 1. Single-prompt requests are unchanged.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Vinay Damodaran <vrdn@hey.com>
---
 vllm/entrypoints/openai/cli_args.py           |  6 ++++++
 vllm/entrypoints/openai/completion/serving.py | 10 +++++----
 vllm/v1/engine/output_processor.py            | 21 +++++++------------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 8c5d2e2fb781..aa3a8e3b4b73 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -391,6 +391,12 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser")
     if args.enable_log_outputs and not args.enable_log_requests:
         raise TypeError("Error: --enable-log-outputs requires --enable-log-requests")
+    if args.enable_request_stats_headers and args.disable_log_stats:
+        raise TypeError(
+            "Error: --enable-request-stats-headers requires per-request stats "
+            "collection, which is disabled by --disable-log-stats. Drop one of "
+            "the two flags."
+        )
 
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index a8b895e3462c..141cab072bd4 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -570,10 +570,12 @@ def request_output_to_completion_response(
             )
 
         request_metadata.final_usage_info = usage
-        if last_final_res is not None:
-            # Known limitation: for multi-prompt batch requests, timing
-            # headers reflect only the last prompt's metrics. Token counts
-            # in usage are correctly summed across all prompts.
+        # x-vllm-* headers reflect a single request's timing. For multi-prompt
+        # batch requests the per-prompt FinishedRequestStats can't be
+        # meaningfully aggregated (queue/prefill/decode intervals are
+        # per-prompt), so we skip emitting headers entirely in that case.
+        # Token counts in usage are correctly summed across all prompts.
+        if last_final_res is not None and len(final_res_batch) == 1:
             request_metadata.finished_stats = last_final_res.finished_stats
         prompt_routed_experts = None
         if final_res_batch:
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 301fc6fd830e..0964f21578c0 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -668,7 +668,14 @@ def process_outputs(
                 # if required.
                 req_state.logprobs_processor.update_from_output(engine_core_output)
 
-            # 4) Create and handle RequestOutput objects.
+            # 4) Finalize per-request stats before constructing RequestOutput
+            # so that finished_stats is populated on the output we hand off.
+            if finish_reason is not None and not req_state.streaming_input:
+                self._update_stats_from_finished(
+                    req_state, finish_reason, iteration_stats
+                )
+
+            # 5) Create and handle RequestOutput objects.
             if request_output := req_state.make_request_output(
                 new_token_ids,
                 pooling_output,
@@ -702,18 +709,6 @@ def process_outputs(
                         # detected stop string, abort needed in EngineCore.
                         reqs_to_abort.append(req_id)
 
-                    # Track per-request stats
-                    self._update_stats_from_finished(
-                        req_state, finish_reason, iteration_stats
-                    )
-                    # Attach finished_stats to the already-built RequestOutput so
-                    # downstream serving layers can read it. We can't compute
-                    # finished_stats earlier because update_from_finished_request
-                    # depends on prior side effects in this iteration.
-                    if request_output is not None and isinstance(
-                        request_output, RequestOutput
-                    ):
-                        request_output.finished_stats = req_state.finished_stats
                     if self.tracing_enabled:
                         self.do_tracing(engine_core_output, req_state, iteration_stats)