Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
073649e
Initial commit
vrdn-23 Mar 17, 2026
266dc16
Working POC for headers
vrdn-23 Mar 29, 2026
ef92ec4
Changed to middleware approach
vrdn-23 Mar 30, 2026
665f305
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 30, 2026
98e05bd
add usage info mapping for responses
vrdn-23 Mar 30, 2026
e7f24d5
Merge remote-tracking branch 'refs/remotes/origin/vrdn-23/add-reponse…
vrdn-23 Mar 30, 2026
6bceab5
clean up and address codex comments
vrdn-23 Mar 30, 2026
09f1bd1
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 30, 2026
0c86666
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 31, 2026
fdd8e3b
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 31, 2026
2572019
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 31, 2026
d93a33b
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 31, 2026
649a51c
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 31, 2026
cfd7fde
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Mar 31, 2026
4663ef0
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 1, 2026
a5358c6
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 1, 2026
aaf1ab4
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 1, 2026
656a88b
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 1, 2026
90239e5
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 2, 2026
ca889a7
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 3, 2026
20695dc
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 4, 2026
69797c3
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 6, 2026
bcf45fe
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 6, 2026
9e4c724
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 6, 2026
9031314
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 7, 2026
0a48d22
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 7, 2026
cb486b3
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 9, 2026
d131c63
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 9, 2026
2bf8dad
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 10, 2026
909789c
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 10, 2026
a2e3735
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 12, 2026
09bdead
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 13, 2026
69e1b5c
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 13, 2026
a4d8a11
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 13, 2026
db95e1f
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 14, 2026
956f619
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 14, 2026
75ccf8d
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 15, 2026
d07950e
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 15, 2026
465df57
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 15, 2026
72a579a
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 16, 2026
34299e6
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 17, 2026
42f24bf
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 17, 2026
a30bcf8
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 17, 2026
fbbf55c
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 20, 2026
4375250
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 21, 2026
c47e88b
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 21, 2026
fab0ad9
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 22, 2026
7e1324b
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 22, 2026
376437a
Merge branch 'main' into vrdn-23/add-reponse-headers
vrdn-23 Apr 23, 2026
5bc7b86
fix merge conficts
vrdn-23 May 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
253 changes: 253 additions & 0 deletions tests/entrypoints/openai/test_request_stats_headers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import time

import pytest
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from httpx import ASGITransport, AsyncClient

from vllm.entrypoints.openai.engine.protocol import (
RequestResponseMetadata,
UsageInfo,
)
from vllm.entrypoints.openai.request_stats_headers import (
build_request_stats_headers,
request_stats_headers_middleware,
)
from vllm.v1.metrics.stats import RequestStateStats


def test_build_request_stats_headers_basic():
    """Every header is derived correctly from a fully populated stats object.

    Timing headers are compared in milliseconds rounded to two decimals,
    token-count headers as plain strings, and total time as a loose range
    because it depends on the wall clock.
    """
    # Fixed timestamps (seconds) so every interval has a known length.
    queued, scheduled, first_token, last_token = 100.0, 100.05, 100.15, 100.45

    stats = RequestStateStats(
        arrival_time=time.time() - 1.0,
        queued_ts=queued,
        scheduled_ts=scheduled,
        first_token_ts=first_token,
        last_token_ts=last_token,
        num_generation_tokens=10,
    )
    usage = UsageInfo(prompt_tokens=50, completion_tokens=10, total_tokens=60)
    headers = build_request_stats_headers(
        metrics=stats, usage=usage, num_cached_tokens=5
    )

    # All headers use x-vllm- prefix
    for key in headers:
        assert key.startswith("x-vllm-"), f"Header {key} missing x-vllm- prefix"

    # Interval headers: seconds -> milliseconds, rounded to two decimals.
    expected_ms = {
        "x-vllm-queue-time": (scheduled - queued) * 1000,
        "x-vllm-prefill-time": (first_token - scheduled) * 1000,
        "x-vllm-decode-time": (last_token - first_token) * 1000,
        "x-vllm-inference-time": (last_token - scheduled) * 1000,
    }
    for name, millis in expected_ms.items():
        assert float(headers[name]) == round(millis, 2)

    # Token-count headers are emitted as integer strings.
    expected_counts = {
        "x-vllm-prompt-tokens": "50",
        "x-vllm-completion-tokens": "10",
        "x-vllm-cached-tokens": "5",
    }
    for name, text in expected_counts.items():
        assert headers[name] == text

    # tokens-per-second: 10 tokens / 0.3s decode = 33.33
    decode_seconds = last_token - first_token
    assert float(headers["x-vllm-tokens-per-second"]) == round(
        10 / decode_seconds, 2
    )

    # arrival_time was ~1s ago, so total time should be roughly 1000 ms.
    assert 900 < float(headers["x-vllm-total-time"]) < 1500


def test_build_request_stats_headers_zero_timestamps():
    """Unset (0.0) engine timestamps produce "0.00" in every timing header."""
    unset = 0.0
    headers = build_request_stats_headers(
        metrics=RequestStateStats(
            arrival_time=time.time(),
            queued_ts=unset,
            scheduled_ts=unset,
            first_token_ts=unset,
            last_token_ts=unset,
        ),
        usage=UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15),
        num_cached_tokens=0,
    )

    # The raw header strings are compared, so the two-decimal formatting is
    # pinned as well as the value.
    zeroed_headers = (
        "x-vllm-queue-time",
        "x-vllm-prefill-time",
        "x-vllm-decode-time",
        "x-vllm-inference-time",
        "x-vllm-tokens-per-second",
    )
    for name in zeroed_headers:
        assert headers[name] == "0.00"


def test_build_request_stats_headers_partial_timestamps():
    """A request scheduled but cancelled before any token reports queue time only.

    Both token timestamps are left at 0.0, so the prefill, decode, inference
    and tokens-per-second headers must be zero, while queue time is still the
    scheduled-minus-queued interval.
    """
    headers = build_request_stats_headers(
        metrics=RequestStateStats(
            arrival_time=time.time() - 0.5,
            queued_ts=100.0,
            scheduled_ts=100.05,
            first_token_ts=0.0,
            last_token_ts=0.0,
        ),
        usage=UsageInfo(prompt_tokens=20, completion_tokens=0, total_tokens=20),
        num_cached_tokens=0,
    )

    expected = {
        "x-vllm-prefill-time": 0.0,
        "x-vllm-decode-time": 0.0,
        "x-vllm-inference-time": 0.0,
        "x-vllm-tokens-per-second": 0.0,
        # 100.05 - 100.0 seconds spent queued, reported in milliseconds.
        "x-vllm-queue-time": 50.0,
    }
    for name, value in expected.items():
        assert float(headers[name]) == value


def test_build_request_stats_headers_zero_decode_time_with_tokens():
    """Division-by-zero guard: tokens were generated but decode time is 0.

    ``first_token_ts`` equals ``last_token_ts``, so the decode interval is
    zero even though tokens exist. The builder must report "0.00" for both
    decode time and tokens-per-second instead of dividing by zero.
    """
    stats = RequestStateStats(
        arrival_time=time.time() - 0.1,
        queued_ts=100.0,
        scheduled_ts=100.05,
        first_token_ts=100.10,
        last_token_ts=100.10,  # same as first_token_ts
        # Keep the stats-side token count in step with ``usage`` below so the
        # throughput numerator is non-zero whichever count the builder reads.
        # With the default count the guard could go unexercised and this test
        # would pass vacuously.
        num_generation_tokens=5,
    )
    usage = UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15)
    headers = build_request_stats_headers(
        metrics=stats, usage=usage, num_cached_tokens=0
    )

    assert headers["x-vllm-decode-time"] == "0.00"
    assert headers["x-vllm-tokens-per-second"] == "0.00"


def _create_test_app(enable_headers: bool) -> FastAPI:
    """Build a small FastAPI app whose routes cover each metadata scenario.

    Args:
        enable_headers: When True, register
            ``request_stats_headers_middleware`` on the app; when False the
            app has no stats middleware at all.

    Returns:
        An app exposing four GET routes that respond with ``{"ok": True}``
        and differ only in what they store on
        ``request.state.request_metadata``:

        * ``/test-with-stats`` - usage, request stats and cached-token count.
        * ``/test-no-stats`` - nothing stored.
        * ``/test-partial-stats`` - request stats only (no usage).
        * ``/test-missing-request-stats`` - usage only (no request stats).
    """
    app = FastAPI()

    # Mirror production: only register middleware when flag is enabled
    if enable_headers:
        app.middleware("http")(request_stats_headers_middleware)

    def _ok() -> JSONResponse:
        # Every route returns the same trivial payload; only headers matter.
        return JSONResponse(content={"ok": True})

    @app.get("/test-with-stats")
    async def test_with_stats(request: Request):
        full_usage = UsageInfo(
            prompt_tokens=50, completion_tokens=10, total_tokens=60
        )
        full_stats = RequestStateStats(
            arrival_time=time.time() - 1.0,
            queued_ts=100.0,
            scheduled_ts=100.05,
            first_token_ts=100.15,
            last_token_ts=100.45,
            num_generation_tokens=10,
        )
        metadata = RequestResponseMetadata(request_id="test-123")
        metadata.final_usage_info = full_usage
        metadata.request_stats = full_stats
        metadata.num_cached_tokens = 5
        request.state.request_metadata = metadata
        return _ok()

    @app.get("/test-no-stats")
    async def test_no_stats(request: Request):
        # Deliberately leaves request.state untouched.
        return _ok()

    @app.get("/test-partial-stats")
    async def test_partial_stats(request: Request):
        metadata = RequestResponseMetadata(request_id="test-456")
        # final_usage_info stays None; only the stats are filled in.
        metadata.request_stats = RequestStateStats(
            arrival_time=time.time(),
            queued_ts=100.0,
            scheduled_ts=100.05,
        )
        request.state.request_metadata = metadata
        return _ok()

    @app.get("/test-missing-request-stats")
    async def test_missing_request_stats(request: Request):
        metadata = RequestResponseMetadata(request_id="test-789")
        # request_stats stays None; only the usage is filled in.
        metadata.final_usage_info = UsageInfo(
            prompt_tokens=10, completion_tokens=5, total_tokens=15
        )
        request.state.request_metadata = metadata
        return _ok()

    return app


@pytest.mark.asyncio
async def test_middleware_flag_disabled():
    """No stats headers are emitted when the middleware is not registered.

    The route stores complete metadata, so any ``x-vllm-`` header in the
    response would mean stats are being attached despite the disabled flag.
    """
    app = _create_test_app(enable_headers=False)
    async with AsyncClient(
        transport=ASGITransport(app=app), base_url="http://test"
    ) as client:
        resp = await client.get("/test-with-stats")
    assert resp.status_code == 200
    # Check the whole x-vllm- family rather than one representative header,
    # so a partially emitted header set cannot slip through.
    leaked = [name for name in resp.headers if name.startswith("x-vllm-")]
    assert not leaked, f"Unexpected stats headers: {leaked}"


@pytest.mark.asyncio
async def test_middleware_no_metadata():
    """No stats headers when the handler never sets ``request_metadata``.

    The middleware is registered, but the route leaves ``request.state``
    untouched, so the response must pass through without ``x-vllm-`` headers.
    """
    app = _create_test_app(enable_headers=True)
    async with AsyncClient(
        transport=ASGITransport(app=app), base_url="http://test"
    ) as client:
        resp = await client.get("/test-no-stats")
    assert resp.status_code == 200
    # Check the whole x-vllm- family rather than one representative header,
    # so a partially emitted header set cannot slip through.
    leaked = [name for name in resp.headers if name.startswith("x-vllm-")]
    assert not leaked, f"Unexpected stats headers: {leaked}"


@pytest.mark.asyncio
async def test_middleware_missing_usage():
    """No stats headers when ``final_usage_info`` is None.

    The route stores request stats but no usage info, so the response must
    not carry any ``x-vllm-`` header.
    """
    app = _create_test_app(enable_headers=True)
    async with AsyncClient(
        transport=ASGITransport(app=app), base_url="http://test"
    ) as client:
        resp = await client.get("/test-partial-stats")
    assert resp.status_code == 200
    # Check the whole x-vllm- family rather than one representative header,
    # so a partially emitted header set cannot slip through.
    leaked = [name for name in resp.headers if name.startswith("x-vllm-")]
    assert not leaked, f"Unexpected stats headers: {leaked}"


@pytest.mark.asyncio
async def test_middleware_full_stats():
    """With the flag on and full metadata stored, every stats header is set."""
    transport = ASGITransport(app=_create_test_app(enable_headers=True))
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        resp = await client.get("/test-with-stats")

    assert resp.status_code == 200

    # Presence check for the complete header family.
    required_headers = (
        "x-vllm-total-time",
        "x-vllm-queue-time",
        "x-vllm-inference-time",
        "x-vllm-prefill-time",
        "x-vllm-decode-time",
        "x-vllm-prompt-tokens",
        "x-vllm-completion-tokens",
        "x-vllm-cached-tokens",
        "x-vllm-tokens-per-second",
    )
    for name in required_headers:
        assert name in resp.headers

    # Token counts must match what the /test-with-stats route stored.
    expected_counts = {
        "x-vllm-prompt-tokens": "50",
        "x-vllm-completion-tokens": "10",
        "x-vllm-cached-tokens": "5",
    }
    for name, text in expected_counts.items():
        assert resp.headers[name] == text

    # 10 tokens / 0.3s decode = 33.33 tok/s
    throughput = float(resp.headers["x-vllm-tokens-per-second"])
    assert throughput == round(10 / 0.3, 2)


@pytest.mark.asyncio
async def test_middleware_missing_request_stats():
    """No stats headers when ``request_stats`` is None but usage is present.

    The route stores usage info without engine stats, so the response must
    not carry any ``x-vllm-`` header.
    """
    app = _create_test_app(enable_headers=True)
    async with AsyncClient(
        transport=ASGITransport(app=app), base_url="http://test"
    ) as client:
        resp = await client.get("/test-missing-request-stats")
    assert resp.status_code == 200
    # Check the whole x-vllm- family rather than one representative header,
    # so a partially emitted header set cannot slip through.
    leaked = [name for name in resp.headers if name.startswith("x-vllm-")]
    assert not leaked, f"Unexpected stats headers: {leaked}"
6 changes: 6 additions & 0 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.request_stats_headers import (
request_stats_headers_middleware,
)
from vllm.entrypoints.openai.server_utils import (
engine_error_handler,
exception_handler,
Expand Down Expand Up @@ -279,6 +282,9 @@ def build_app(

app.add_middleware(XRequestIdMiddleware)

if args.enable_request_stats_headers:
app.middleware("http")(request_stats_headers_middleware)

# Add scaling middleware to check for scaling state
app.add_middleware(ScalingMiddleware)

Expand Down
2 changes: 2 additions & 0 deletions vllm/entrypoints/openai/chat_completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,8 @@ async def chat_completion_full_generator(
)

request_metadata.final_usage_info = usage
request_metadata.request_stats = final_res.metrics
request_metadata.num_cached_tokens = final_res.num_cached_tokens or 0

prompt_routed_experts = None
if final_res.prompt_routed_experts is not None:
Expand Down
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/cli_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ class BaseFrontendArgs:
If set to True, only enable the Tokens In<>Out endpoint.
This is intended for use in a Disaggregated Everything setup.
"""
enable_request_stats_headers: bool = False
"""If set to True, include per-request timing and compute stats as
``x-vllm-*`` response headers on non-streaming completion responses."""
fingerprint_mode: Literal["full", "hash", "custom", "none"] = "full"
"""Controls the ``system_fingerprint`` field on responses.

Expand Down
6 changes: 6 additions & 0 deletions vllm/entrypoints/openai/completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,12 @@ def request_output_to_completion_response(
)

request_metadata.final_usage_info = usage
if last_final_res is not None:
# Known limitation: for multi-prompt batch requests, timing
# headers reflect only the last prompt's metrics, not the
# aggregate. Token counts in usage are correctly summed.
request_metadata.request_stats = last_final_res.metrics
request_metadata.num_cached_tokens = last_final_res.num_cached_tokens or 0
prompt_routed_experts = None
if final_res_batch:
kv_transfer_params = final_res_batch[0].kv_transfer_params
Expand Down
5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/engine/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from vllm.logger import init_logger
from vllm.utils import random_uuid
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.v1.metrics.stats import RequestStateStats

logger = init_logger(__name__)

Expand Down Expand Up @@ -110,8 +111,12 @@ class UsageInfo(OpenAIBaseModel):


class RequestResponseMetadata(BaseModel):
    # Per-request bookkeeping that the serving code fills in while it builds
    # a response.

    # Needed so the non-pydantic-typed ``request_stats`` field is accepted;
    # presumably RequestStateStats is not itself a pydantic model - confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Identifier of the request this metadata belongs to.
    request_id: str
    # Token usage for the finished request; None until it has been set.
    final_usage_info: UsageInfo | None = None
    # Engine metrics object for the request; None until it has been set.
    request_stats: RequestStateStats | None = None
    # Number of cached tokens reported for the request; defaults to 0.
    num_cached_tokens: int = 0


class JsonSchemaResponseFormat(OpenAIBaseModel):
Expand Down
Loading
Loading