Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions tests/entrypoints/openai/test_request_stats_headers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import pytest
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from httpx import ASGITransport, AsyncClient

from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata
from vllm.entrypoints.openai.request_stats_headers import (
build_request_stats_headers,
request_stats_headers_middleware,
)
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.stats import FinishedRequestStats


def _stats(**overrides) -> FinishedRequestStats:
base = dict(
finish_reason=FinishReason.STOP,
request_id="req-1",
e2e_latency=1.0,
num_prompt_tokens=50,
num_generation_tokens=10,
max_tokens_param=None,
queued_time=0.05,
prefill_time=0.10,
inference_time=0.40,
decode_time=0.30,
mean_time_per_output_token=0.030,
is_corrupted=False,
num_cached_tokens=5,
)
base.update(overrides)
return FinishedRequestStats(**base)


def test_build_headers_basic():
headers = build_request_stats_headers(_stats())

for key in headers:
assert key.startswith("x-vllm-"), f"{key} missing x-vllm- prefix"

assert headers["x-vllm-total-time"] == "1000.00"
assert headers["x-vllm-queue-time"] == "50.00"
assert headers["x-vllm-prefill-time"] == "100.00"
assert headers["x-vllm-inference-time"] == "400.00"
assert headers["x-vllm-decode-time"] == "300.00"
assert headers["x-vllm-prompt-tokens"] == "50"
assert headers["x-vllm-completion-tokens"] == "10"
assert headers["x-vllm-cached-tokens"] == "5"
assert headers["x-vllm-time-per-output-token"] == "30.00"


def test_build_headers_zero_decode():
"""Single-token completion: mean_time_per_output_token is 0."""
headers = build_request_stats_headers(
_stats(num_generation_tokens=1, decode_time=0.0, mean_time_per_output_token=0.0)
)
assert headers["x-vllm-time-per-output-token"] == "0.00"
assert headers["x-vllm-completion-tokens"] == "1"


def _create_test_app() -> FastAPI:
app = FastAPI()
app.middleware("http")(request_stats_headers_middleware)

@app.get("/with-stats")
async def with_stats(request: Request) -> JSONResponse:
meta = RequestResponseMetadata(request_id="r")
meta.finished_stats = _stats()
request.state.request_metadata = meta
return JSONResponse({"ok": True})

@app.get("/no-stats")
async def no_stats(request: Request) -> JSONResponse:
meta = RequestResponseMetadata(request_id="r")
request.state.request_metadata = meta
return JSONResponse({"ok": True})

@app.get("/no-metadata")
async def no_metadata() -> JSONResponse:
return JSONResponse({"ok": True})

return app


@pytest.mark.asyncio
async def test_middleware_attaches_headers_when_stats_present():
app = _create_test_app()
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c:
resp = await c.get("/with-stats")
assert resp.status_code == 200
assert resp.headers["x-vllm-decode-time"] == "300.00"
assert resp.headers["x-vllm-prompt-tokens"] == "50"


@pytest.mark.asyncio
async def test_middleware_passes_through_when_finished_stats_missing():
app = _create_test_app()
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c:
resp = await c.get("/no-stats")
assert resp.status_code == 200
assert "x-vllm-decode-time" not in resp.headers


@pytest.mark.asyncio
async def test_middleware_passes_through_when_metadata_missing():
app = _create_test_app()
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://t") as c:
resp = await c.get("/no-metadata")
assert resp.status_code == 200
assert "x-vllm-decode-time" not in resp.headers
27 changes: 27 additions & 0 deletions tests/v1/metrics/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,30 @@ def test_prompt_token_stats_full_external_transfer_recompute():
assert stats.external_kv_transfer == 999
assert stats.cached_tokens == 999
assert stats.total == 1000


def test_update_from_finished_request_returns_finished_stats():
"""update_from_finished_request returns the same FinishedRequestStats it appends."""
iteration_stats = IterationStats()
req_stats = RequestStateStats(
arrival_time=100.0,
queued_ts=100.05,
scheduled_ts=100.10,
first_token_ts=100.20,
last_token_ts=100.50,
num_generation_tokens=5,
)

returned = iteration_stats.update_from_finished_request(
finish_reason=FinishReason.STOP,
request_id="req-1",
num_prompt_tokens=10,
max_tokens_param=None,
req_stats=req_stats,
num_cached_tokens=3,
)

assert returned is not None
assert iteration_stats.finished_requests[-1] is returned
assert returned.request_id == "req-1"
assert returned.num_cached_tokens == 3
6 changes: 6 additions & 0 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.request_stats_headers import (
request_stats_headers_middleware,
)
from vllm.entrypoints.openai.server_utils import (
engine_error_handler,
exception_handler,
Expand Down Expand Up @@ -272,6 +275,9 @@ def build_app(

app.add_middleware(XRequestIdMiddleware)

if args.enable_request_stats_headers:
app.middleware("http")(request_stats_headers_middleware)
Comment thread
vrdn-23 marked this conversation as resolved.

# Add scaling middleware to check for scaling state
app.add_middleware(ScalingMiddleware)

Expand Down
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/chat_completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -1379,6 +1379,7 @@ async def chat_completion_full_generator(
)

request_metadata.final_usage_info = usage
request_metadata.finished_stats = final_res.finished_stats

# ``final_res.prompt`` is the rendered chat-templated prompt text
prompt_text = final_res.prompt if request.return_prompt_text else None
Expand Down
11 changes: 11 additions & 0 deletions vllm/entrypoints/openai/cli_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ class BaseFrontendArgs:
"""
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
"""If set to True, log the stack trace of error responses"""
enable_request_stats_headers: bool = False
"""If set to True, include per-request timing and compute stats as
x-vllm-* response headers on non-streaming responses. Headers reflect
the same intervals reported by Prometheus (e2e, queued, prefill,
decode, inference, mean time per output token)."""
tokens_only: bool = False
"""
If set to True, only enable the Tokens In<>Out endpoint.
Expand Down Expand Up @@ -397,6 +402,12 @@ def validate_parsed_serve_args(args: argparse.Namespace):
raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser")
if args.enable_log_outputs and not args.enable_log_requests:
raise TypeError("Error: --enable-log-outputs requires --enable-log-requests")
if args.enable_request_stats_headers and args.disable_log_stats:
raise TypeError(
"Error: --enable-request-stats-headers requires per-request stats "
"collection, which is disabled by --disable-log-stats. Drop one of "
"the two flags."
)

if args.data_parallel_multi_port_external_lb:
from vllm.entrypoints.openai.dp_supervisor import (
Expand Down
7 changes: 7 additions & 0 deletions vllm/entrypoints/openai/completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,13 @@ def request_output_to_completion_response(
)

request_metadata.final_usage_info = usage
# x-vllm-* headers reflect a single request's timing. For multi-prompt
# batch requests the per-prompt FinishedRequestStats can't be
# meaningfully aggregated (queue/prefill/decode intervals are
# per-prompt), so we skip emitting headers entirely in that case.
# Token counts in usage are correctly summed across all prompts.
if last_final_res is not None and len(final_res_batch) == 1:
request_metadata.finished_stats = last_final_res.finished_stats
if final_res_batch:
kv_transfer_params = final_res_batch[0].kv_transfer_params
return CompletionResponse(
Expand Down
4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/engine/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from vllm.logger import init_logger
from vllm.utils import random_uuid
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.v1.metrics.stats import FinishedRequestStats

logger = init_logger(__name__)

Expand Down Expand Up @@ -110,8 +111,11 @@ class UsageInfo(OpenAIBaseModel):


class RequestResponseMetadata(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

request_id: str
final_usage_info: UsageInfo | None = None
finished_stats: FinishedRequestStats | None = None


class JsonSchemaResponseFormat(OpenAIBaseModel):
Expand Down
54 changes: 54 additions & 0 deletions vllm/entrypoints/openai/request_stats_headers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

from typing import TYPE_CHECKING

from starlette.requests import Request
from starlette.responses import Response

if TYPE_CHECKING:
from collections.abc import Callable

from vllm.v1.metrics.stats import FinishedRequestStats


def build_request_stats_headers(stats: FinishedRequestStats) -> dict[str, str]:
"""Format computed request timings as x-vllm-* response headers.

Times are in milliseconds, rounded to 2 decimal places. Values come
directly from FinishedRequestStats; no arithmetic happens here.
"""
return {
"x-vllm-total-time": f"{stats.e2e_latency * 1000:.2f}",
"x-vllm-queue-time": f"{stats.queued_time * 1000:.2f}",
"x-vllm-inference-time": f"{stats.inference_time * 1000:.2f}",
"x-vllm-prefill-time": f"{stats.prefill_time * 1000:.2f}",
"x-vllm-decode-time": f"{stats.decode_time * 1000:.2f}",
"x-vllm-prompt-tokens": str(stats.num_prompt_tokens),
"x-vllm-completion-tokens": str(stats.num_generation_tokens),
"x-vllm-cached-tokens": str(stats.num_cached_tokens),
"x-vllm-time-per-output-token": (
f"{stats.mean_time_per_output_token * 1000:.2f}"
),
}


async def request_stats_headers_middleware(
request: Request,
call_next: Callable,
) -> Response:
"""FastAPI middleware that attaches x-vllm-* timing headers.

Reads request.state.request_metadata (populated by the serving layer).
No-op if metadata or finished_stats is missing — covers streaming,
errors, and non-OpenAI routes.
"""
response = await call_next(request)
metadata = getattr(request.state, "request_metadata", None)
if metadata is None or metadata.finished_stats is None:
return response
for key, value in build_request_stats_headers(metadata.finished_stats).items():
response.headers[key] = value
return response
7 changes: 7 additions & 0 deletions vllm/entrypoints/openai/responses/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,13 @@ async def responses_full_generator(
kv_transfer_params=context.kv_transfer_params,
)

# Attach finished_stats for x-vllm-* response headers.
# Known limitation: for multi-turn tool-calling flows, timing
# breakdown reflects only the final turn. Total wall-clock time
# is still correct.
if hasattr(context, "last_output") and context.last_output is not None:
request_metadata.finished_stats = context.last_output.finished_stats

if request.store:
async with self.response_store_lock:
stored_response = self.response_store.get(response.id)
Expand Down
7 changes: 6 additions & 1 deletion vllm/outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from vllm.logger import init_logger
from vllm.logprobs import PromptLogprobs, SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.v1.metrics.stats import RequestStateStats
from vllm.v1.metrics.stats import FinishedRequestStats, RequestStateStats

logger = init_logger(__name__)

Expand Down Expand Up @@ -104,6 +104,8 @@ class RequestOutput:
None if decoder-only.
num_cached_tokens: The number of tokens with prefix cache hit.
kv_transfer_params: The params for remote K/V transfer.
finished_stats: Computed timing intervals for the finished request,
set on the terminal RequestOutput. Internal use only.
"""

def __init__(
Expand All @@ -121,6 +123,7 @@ def __init__(
num_cached_tokens: int | None = None,
*,
kv_transfer_params: dict[str, Any] | None = None,
finished_stats: FinishedRequestStats | None = None,
# Forward compatibility, code that uses args added in new release can
# still run with older versions of vLLM without breaking.
**kwargs: Any,
Expand All @@ -141,6 +144,7 @@ def __init__(
self.encoder_prompt_token_ids = encoder_prompt_token_ids
self.num_cached_tokens = num_cached_tokens
self.kv_transfer_params = kv_transfer_params
self.finished_stats = finished_stats

def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
"""Merge subsequent RequestOutput into this one"""
Expand Down Expand Up @@ -183,6 +187,7 @@ def __repr__(self) -> str:
f"outputs={self.outputs}, "
f"finished={self.finished}, "
f"metrics={self.metrics}, "
f"finished_stats={self.finished_stats}, "
f"lora_request={self.lora_request}, "
f"num_cached_tokens={self.num_cached_tokens})"
)
Expand Down
34 changes: 5 additions & 29 deletions vllm/v1/engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams

# FinishReason is re-exported for backwards compatibility. The canonical home
# is vllm.v1.finish_reason - a leaf module that vllm.v1.metrics can import at
# runtime without circularity.
from vllm.v1.finish_reason import FINISH_REASON_STRINGS, FinishReason # noqa: F401
from vllm.v1.metrics.stats import PrefillStats, SchedulerStats
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
from vllm.v1.serial_utils import UtilityResult
Expand All @@ -25,10 +30,6 @@
# - "keep": Freeze requests in queue; they resume on resume_generation().
PauseMode = Literal["abort", "wait", "keep"]

# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort", "error", "repetition")

EEP_NOTIFICATION_CALL_ID = -1


Expand All @@ -39,31 +40,6 @@ class EEPNotificationType(enum.Enum):
SHUTDOWN_COMPLETE = "SHUTDOWN_COMPLETE"


class FinishReason(enum.IntEnum):
"""
Reason a request finished - stop, length, abort, error, or repetition.

Int rather than Str for more compact serialization.

stop - a stop string was emitted
length - max_tokens was consumed, or max_model_len was reached
abort - aborted by client
error - retryable request-level internal error (e.g., KV load failure).
Invariant: always converted to 500 Internal Server Error.
repetition - repetitive token pattern detected (hallucination)

"""

STOP = 0
LENGTH = 1
ABORT = 2
ERROR = 3
REPETITION = 4

def __str__(self):
return FINISH_REASON_STRINGS[self.value]


@dataclass
class EngineCoreReadyResponse:
"""Sent from EngineCore to each frontend at the end of engine startup.
Expand Down
Loading
Loading