From 4aaba00f92d4fc481aeff10db72465e26463a632 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Fri, 29 May 2026 20:07:16 +0200 Subject: [PATCH 01/11] [EPLB] Make async EPLB default (#43219) Signed-off-by: Markov Ilya Co-authored-by: Markov Ilya Co-authored-by: Tyler Michael Smith --- .../deepseek_v2_lite_ep_eplb.sh | 1 + .../qwen30b_a3b_fp8_block_ep_eplb.sh | 2 +- .buildkite/test-amd.yaml | 6 +++--- .buildkite/test_areas/e2e_integration.yaml | 12 ++++++------ docs/serving/expert_parallel_deployment.md | 2 +- tests/distributed/test_eplb_spec_decode.py | 5 ++--- vllm/config/parallel.py | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index e26273bba39a..2ff0b2e1251c 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -49,6 +49,7 @@ for BACK in "${BACKENDS[@]}"; do --data-parallel-size 2 \ --enable-expert-parallel \ --enable-eplb \ + --eplb-config '{"use_async": false}' \ --trust-remote-code \ --max-model-len 2048 \ --all2all-backend "$BACK" \ diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh index 729a0fb7f688..871685531909 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh @@ -48,7 +48,7 @@ for BACK in "${BACKENDS[@]}"; do --enforce-eager \ --enable-eplb \ --all2all-backend "$BACK" \ - --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ + --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true, "use_async":false}' \ --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ --data-parallel-size "${DATA_PARALLEL_SIZE}" \ --enable-expert-parallel \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c7ef61b719aa..80033fefb407 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1484,7 +1484,7 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt -- label: DeepSeek V2-Lite Accuracy (4xH100-4xMI300) # TBD +- label: DeepSeek V2-Lite Sync EPLB Accuracy (4xH100-4xMI300) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_4 @@ -1526,7 +1526,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: Qwen3-30B-A3B-FP8-block Accuracy (4xH100-4xMI300) # TBD +- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (4xH100-4xMI300) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_4 @@ -2895,7 +2895,7 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD +- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200-MI355) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index bb8aa14eac18..88039a339607 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -2,8 +2,8 @@ group: E2E Integration depends_on: - image-build steps: -- label: DeepSeek V2-Lite Accuracy - key: deepseek-v2-lite-accuracy +- label: DeepSeek V2-Lite Sync EPLB Accuracy + key: deepseek-v2-lite-sync-eplb-accuracy timeout_in_minutes: 60 device: h100 optional: true @@ -12,8 +12,8 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy - key: qwen3-30b-a3b-fp8-block-accuracy +- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy + key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy timeout_in_minutes: 60 device: h100 optional: true @@ -22,8 +22,8 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - key: qwen3-30b-a3b-fp8-block-accuracy-b200 +- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200) + key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy-b200 timeout_in_minutes: 60 device: b200-k8s optional: true diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index fef4df770fa3..b7c2ee873750 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -151,7 +151,7 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T | `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 | | `log_balancedness` | Log balancedness metrics (avg tokens per expert รท max tokens per expert) | `false` | | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` | -| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` | +| `use_async` | Use non-blocking EPLB for reduced latency overhead | `true` | | `policy` | The policy type for expert parallel load balancing | `"default"` | | `communicator` | Backend for expert weight transfers: `"torch_nccl"`, `"torch_gloo"`, `"pynccl"`, `"nixl"`, or `null` (auto) | `null` | diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index 22977ce94404..b4211c74c171 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -15,7 +15,7 @@ def get_model_args( spec_method: str, tp_size: int, model_max_len: int, - use_async: bool = False, + use_async: bool = True, ) -> dict: speculative_config = { "method": spec_method, @@ -28,9 +28,8 @@ def get_model_args( "window_size": 128, "step_interval": 1024, "log_balancedness": False, + "use_async": use_async, } - if use_async: - eplb_config["use_async"] = True model_args = { "pretrained": model_name, "dtype": "auto", diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 66f0000716f8..f32ecef14821 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -78,7 +78,7 @@ class EPLBConfig: """ Interval for logging the balancedness. """ - use_async: bool = False + use_async: bool = True """ Whether to use non-blocking EPLB. """ From d07ad0693bc7f5c993bfb473f02a71df8f8c7109 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Fri, 29 May 2026 11:14:25 -0700 Subject: [PATCH 02/11] [Bugfix] Use storage_block_size in KV cache reshape for compressed specs (DeepSeek V4) (#43988) Signed-off-by: zixi-qi Co-authored-by: Claude Opus 4.8 (1M context) --- vllm/v1/worker/gpu/attn_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index c2ba12437f89..6fc55ee32030 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -199,7 +199,12 @@ def _reshape_kv_cache( if isinstance(kv_cache_spec, AttentionSpec): has_attn = True - num_blocks_per_kv_block = kv_cache_spec.block_size // kernel_block_size + # Use storage_block_size: it equals block_size for uncompressed + # specs but is smaller for compressed ones (DeepSeek V4), which + # store block_size tokens in block_size // compress_ratio slots. + num_blocks_per_kv_block = ( + kv_cache_spec.storage_block_size // kernel_block_size + ) kernel_num_blocks = num_blocks * num_blocks_per_kv_block kv_cache_shape = group.backend.get_kv_cache_shape( kernel_num_blocks, From 8b9deeec4b7ad889c62f31e2eec2d2537c780685 Mon Sep 17 00:00:00 2001 From: czhu-cohere Date: Fri, 29 May 2026 11:51:05 -0700 Subject: [PATCH 03/11] [Bugfix] Fix Ray placement group allocation with grouped nodes (#43998) Signed-off-by: Signed-off-by: root --- vllm/v1/engine/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 554e8d6f0056..8a7269a7707a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -575,7 +575,9 @@ def create_dp_placement_groups( node_ip_keys = [ key for key in node_resources - if key != "node:__internal_head__" and key.startswith("node:") + if key != "node:__internal_head__" + and key.startswith("node:") + and "_group_" not in key ] assert len(node_ip_keys) == 1, ( f"Zero or multiple node IP keys found in node resources: {node_ip_keys}" @@ -654,6 +656,9 @@ def create_dp_placement_groups( if len(placement_groups) == dp_size: break + if len(placement_groups) == dp_size: + break + if len(placement_groups) < dp_size: raise ValueError( f"Not enough resources to allocate {dp_size} " From 739096a028214991cff4af23a558795905ad35a1 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 29 May 2026 14:55:00 -0400 Subject: [PATCH 04/11] [Bug] Fix torch device issue for MOE permute (#44005) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index df430689436e..ad9fb509a1f7 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -74,6 +74,9 @@ def __post_init__(self) -> None: self.sort_workspace = torch.empty( sorter_size, dtype=torch.int8, device=self.device ) + # torch.device("cuda") in config, after initialized, + # will be changed to cuda:{index}, so we need to refresh here. + self.device = self.token_expert_indices.device def validate(self, hidden_states: torch.Tensor, topk_ids: torch.Tensor) -> None: n_token, n_hidden = hidden_states.shape From 6aabe221a56052965e6bb0a95e9ec682d046a6e7 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Fri, 29 May 2026 11:58:25 -0700 Subject: [PATCH 05/11] [CI] Make Model Executor test hangs fail fast with a traceback (#43971) Signed-off-by: khluu Co-authored-by: Claude --- .buildkite/test_areas/model_executor.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml index c41ef8a7110d..e34b7eadfacf 100644 --- a/.buildkite/test_areas/model_executor.yaml +++ b/.buildkite/test_areas/model_executor.yaml @@ -14,5 +14,12 @@ steps: commands: - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor -m '(not slow_test)' - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py + # Dump tracebacks of all threads if a test hangs, so a wedged GPU/CUDA + # init surfaces a stack instead of silently stalling. + - export PYTHONFAULTHANDLER=1 + # Per-test watchdog: a single hung test (e.g. stuck during engine/CUDA + # init) fails fast with a traceback instead of running until the global + # build timeout. The `thread` method also handles hangs inside C/CUDA + # calls that the signal method cannot interrupt. + - pytest -v -s model_executor -m '(not slow_test)' --timeout=900 --timeout-method=thread + - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py --timeout=900 --timeout-method=thread From 6de08e8b46e082172bc146cf8031b29d9fdcab92 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Fri, 29 May 2026 15:23:56 -0400 Subject: [PATCH 06/11] [CI] Remove redundant test_chat_with_tool_reasoning.py (#44011) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- .buildkite/test-amd.yaml | 4 +- .buildkite/test_areas/entrypoints.yaml | 2 +- .buildkite/test_areas/rust_frontend.yaml | 4 +- .../test_chat_with_tool_reasoning.py | 141 ------------------ 4 files changed, 5 insertions(+), 146 deletions(-) delete mode 100644 tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 80033fefb407..50c13e4a89a6 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1261,7 +1261,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py + - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - label: Entrypoints Integration (API Server openai - Part 2) # TBD timeout_in_minutes: 180 @@ -2768,7 +2768,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py + - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - label: Entrypoints Integration (API Server openai - Part 2) # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 57bde22194a8..1ae8c79fab7f 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -43,7 +43,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py + - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py mirror: amd: device: mi325_1 diff --git a/.buildkite/test_areas/rust_frontend.yaml b/.buildkite/test_areas/rust_frontend.yaml index f750d58be586..df37022725f5 100644 --- a/.buildkite/test_areas/rust_frontend.yaml +++ b/.buildkite/test_areas/rust_frontend.yaml @@ -16,7 +16,7 @@ steps: - tests/benchmarks/test_serve_cli.py - tests/entrypoints/openai/chat_completion/test_chat_completion.py # - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py - # - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py + # - tests/entrypoints/openai/completion/test_prompt_validation.py - tests/entrypoints/openai/completion/test_shutdown.py # - tests/entrypoints/openai/test_return_token_ids.py @@ -28,7 +28,7 @@ steps: - pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)" - pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py # - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid" - # - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py + # - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds" - pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly" # - pytest -v -s entrypoints/openai/test_return_token_ids.py diff --git a/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py deleted file mode 100644 index 295b55889412..000000000000 --- a/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py +++ /dev/null @@ -1,141 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import openai # use the official client for correctness check -import pytest -import pytest_asyncio - -from tests.utils import RemoteOpenAIServer - -# a reasoning and tool calling model -MODEL_NAME = "Qwen/QwQ-32B" - - -@pytest.fixture(scope="module") -def server(): - args = [ - "--max-model-len", - "8192", - "--enforce-eager", - "--reasoning-parser", - "deepseek_r1", - "--enable-auto-tool-choice", - "--tool-call-parser", - "hermes", - ] - - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -TOOLS = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": "The city to find the weather for, e.g. " - "'San Francisco'", - }, - "state": { - "type": "string", - "description": "the two-letter abbreviation for the state that " - "the city is in, e.g. 'CA' which would mean 'California'", - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["city", "state", "unit"], - }, - }, - } -] - -MESSAGES = [ - {"role": "user", "content": "Hi! How are you doing today?"}, - {"role": "assistant", "content": "I'm doing well! How can I help you?"}, - { - "role": "user", - "content": "Can you tell me what the temperate will be in Dallas, " - "in fahrenheit?", - }, -] - -FUNC_NAME = "get_current_weather" -FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}""" - - -def extract_reasoning_and_calls(chunks: list): - reasoning = "" - tool_call_idx = -1 - arguments = [] - function_names = [] - for chunk in chunks: - if chunk.choices[0].delta.tool_calls: - tool_call = chunk.choices[0].delta.tool_calls[0] - if tool_call.index != tool_call_idx: - tool_call_idx = chunk.choices[0].delta.tool_calls[0].index - arguments.append("") - function_names.append("") - - if tool_call.function: - if tool_call.function.name: - function_names[tool_call_idx] = tool_call.function.name - - if tool_call.function.arguments: - arguments[tool_call_idx] += tool_call.function.arguments - else: - if hasattr(chunk.choices[0].delta, "reasoning"): - reasoning += chunk.choices[0].delta.reasoning - return reasoning, arguments, function_names - - -# test streaming -@pytest.mark.asyncio -async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI): - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=MESSAGES, - tools=TOOLS, - temperature=0.0, - stream=True, - ) - - chunks = [] - async for chunk in stream: - chunks.append(chunk) - - reasoning, arguments, function_names = extract_reasoning_and_calls(chunks) - assert len(reasoning) > 0 - assert len(function_names) > 0 and function_names[0] == FUNC_NAME - assert len(arguments) > 0 and arguments[0] == FUNC_ARGS - - -# test full generate -@pytest.mark.asyncio -async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI): - tool_calls = await client.chat.completions.create( - model=MODEL_NAME, - messages=MESSAGES, - tools=TOOLS, - temperature=0.0, - stream=False, - ) - - assert len(tool_calls.choices[0].message.reasoning) > 0 - assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME - assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS From acbc203340bf9bffaf22121b8748cce384f8e013 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Fri, 29 May 2026 12:24:29 -0700 Subject: [PATCH 07/11] Add @khluu to CODEOWNERS (#44019) Signed-off-by: Kevin H. Luu --- .github/CODEOWNERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 540540e51328..bd5deff1b821 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -72,8 +72,10 @@ /vllm/v1/worker/gpu/kv_connector.py @orozery # CI & building -/.buildkite @Harry-Chen -/docker/Dockerfile @Harry-Chen +/.buildkite @Harry-Chen @khluu +/docker/Dockerfile @Harry-Chen @khluu +/pyproject.toml @khluu +/setup.py @khluu # Test ownership /.buildkite/lm-eval-harness @mgoin From 5dbf1605a0e2e4927533ac4b279edaf753580173 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 29 May 2026 15:28:12 -0400 Subject: [PATCH 08/11] [Feature] SSL support for dp supervisor (#43688) Signed-off-by: yewentao256 --- .../entrypoints/openai/test_dp_supervisor.py | 143 +++++++++++++++--- vllm/entrypoints/openai/dp_supervisor.py | 22 ++- 2 files changed, 143 insertions(+), 22 deletions(-) diff --git a/tests/entrypoints/openai/test_dp_supervisor.py b/tests/entrypoints/openai/test_dp_supervisor.py index 0b678b226254..9967e6d86d0a 100644 --- a/tests/entrypoints/openai/test_dp_supervisor.py +++ b/tests/entrypoints/openai/test_dp_supervisor.py @@ -21,7 +21,10 @@ import contextlib import os import signal +import subprocess +import tempfile import time +from pathlib import Path from types import SimpleNamespace import aiohttp @@ -35,6 +38,7 @@ DPSupervisor, _build_vllm_dp_server_args, infer_multi_port_external_lb_start_rank, + validate_multi_port_external_lb_args, ) from vllm.logger import init_logger @@ -75,6 +79,8 @@ def _make_unit_args(**overrides) -> argparse.Namespace: "ssl_keyfile": None, "ssl_certfile": None, "ssl_ca_certs": None, + "ssl_cert_reqs": 0, + "ssl_ciphers": None, "node_rank": 1, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, @@ -108,6 +114,8 @@ def _make_args(**overrides) -> argparse.Namespace: ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, + ssl_cert_reqs=0, + ssl_ciphers=None, node_rank=0, tensor_parallel_size=1, pipeline_parallel_size=1, @@ -118,6 +126,33 @@ def _make_args(**overrides) -> argparse.Namespace: return argparse.Namespace(**base) +def _generate_self_signed_cert(cert_dir: Path) -> tuple[Path, Path]: + """Generate a self-signed certificate for HTTPS lifecycle tests.""" + cert_file = cert_dir / "cert.pem" + key_file = cert_dir / "key.pem" + subprocess.run( + [ + "openssl", + "req", + "-x509", + "-newkey", + "rsa:2048", + "-keyout", + str(key_file), + "-out", + str(cert_file), + "-days", + "1", + "-nodes", + "-subj", + "/CN=localhost", + ], + check=True, + capture_output=True, + ) + return cert_file, key_file + + # --------------------------------------------------------------------------- # Unit tests # --------------------------------------------------------------------------- @@ -141,6 +176,15 @@ def test_build_multi_port_external_lb_child_args_sets_external_rank_server(): assert child_args.api_server_count == 1 +def test_validate_multi_port_external_lb_args_allows_ssl(): + args = _make_unit_args( + ssl_keyfile="/tmp/server.key", + ssl_certfile="/tmp/server.crt", + ssl_ca_certs="/tmp/ca.crt", + ) + validate_multi_port_external_lb_args(args) + + def test_aggregates_health(): supervisor = DPSupervisor(_make_unit_args()) supervisor._is_ready = True @@ -236,10 +280,18 @@ class MockVLLMServer: Health state is toggled by the test via set_healthy(). """ - def __init__(self, port: int, drain_seconds: float = 0.0) -> None: + def __init__( + self, + port: int, + drain_seconds: float = 0.0, + ssl_keyfile: str | None = None, + ssl_certfile: str | None = None, + ) -> None: self.port = port self._healthy = False self._drain_seconds = drain_seconds + self._ssl_keyfile = ssl_keyfile + self._ssl_certfile = ssl_certfile self._server: uvicorn.Server | None = None self._serve_task: asyncio.Task | None = None @@ -274,6 +326,8 @@ async def kill() -> Response: port=self.port, log_level="warning", lifespan="off", + ssl_keyfile=self._ssl_keyfile, + ssl_certfile=self._ssl_certfile, ) self._server = uvicorn.Server(config) @@ -312,7 +366,11 @@ def _custom_handle_exit(sig: int, frame: object) -> None: def launch_mock_vllm(child_args: argparse.Namespace, env_updates: dict[str, str]): logger.info("Launching mock vLLM on port %s", child_args.port) - mock_vllm = MockVLLMServer(port=child_args.port) + mock_vllm = MockVLLMServer( + port=child_args.port, + ssl_keyfile=child_args.ssl_keyfile, + ssl_certfile=child_args.ssl_certfile, + ) asyncio.run(mock_vllm.start()) @@ -320,7 +378,12 @@ def launch_mock_vllm_with_drain( child_args: argparse.Namespace, env_updates: dict[str, str] ): logger.info("Launching mock vLLM with 15s drain on port %s", child_args.port) - mock_vllm = MockVLLMServer(port=child_args.port, drain_seconds=10.0) + mock_vllm = MockVLLMServer( + port=child_args.port, + drain_seconds=10.0, + ssl_keyfile=child_args.ssl_keyfile, + ssl_certfile=child_args.ssl_certfile, + ) asyncio.run(mock_vllm.start()) @@ -329,15 +392,16 @@ def launch_mock_vllm_with_drain( # --------------------------------------------------------------------------- -async def _poll_supervisor_health(expected_status: int) -> bool: +async def _poll_supervisor_health(expected_status: int, use_ssl: bool = False) -> bool: """ Poll GET /health on the supervisor until expected_status is seen. A connection error is treated as 503-equivalent when expected_status != 200. """ - url = f"http://127.0.0.1:{_SUPERVISOR_PORT}/health" + scheme = "https" if use_ssl else "http" + url = f"{scheme}://127.0.0.1:{_SUPERVISOR_PORT}/health" async with aiohttp.ClientSession() as session: try: - async with session.get(url) as resp: + async with session.get(url, ssl=False if use_ssl else None) as resp: if resp.status != expected_status: print(f"expected: {expected_status=}, got: {resp.status=}") return False @@ -349,12 +413,15 @@ async def _poll_supervisor_health(expected_status: int) -> bool: return True -async def _poll_until_api_server_running(port: int, retries: int = 10) -> None: - url = f"http://127.0.0.1:{port}/health" +async def _poll_until_api_server_running( + port: int, retries: int = 10, use_ssl: bool = False +) -> None: + scheme = "https" if use_ssl else "http" + url = f"{scheme}://127.0.0.1:{port}/health" async with aiohttp.ClientSession() as session: for _ in range(retries): try: - async with session.get(url) as resp: + async with session.get(url, ssl=False if use_ssl else None) as resp: if resp.status != 200: return await asyncio.sleep(1.0) @@ -363,22 +430,34 @@ async def _poll_until_api_server_running(port: int, retries: int = 10) -> None: await asyncio.sleep(1.0) -async def _set_healthy(port: int) -> None: - url = f"http://127.0.0.1:{port}/set_healthy" - async with aiohttp.ClientSession() as session, session.get(url) as resp: +async def _set_healthy(port: int, use_ssl: bool = False) -> None: + scheme = "https" if use_ssl else "http" + url = f"{scheme}://127.0.0.1:{port}/set_healthy" + async with ( + aiohttp.ClientSession() as session, + session.get(url, ssl=False if use_ssl else None) as resp, + ): assert resp.status == 200 -async def _set_unhealthy(port: int) -> None: - url = f"http://127.0.0.1:{port}/set_unhealthy" - async with aiohttp.ClientSession() as session, session.get(url) as resp: +async def _set_unhealthy(port: int, use_ssl: bool = False) -> None: + scheme = "https" if use_ssl else "http" + url = f"{scheme}://127.0.0.1:{port}/set_unhealthy" + async with ( + aiohttp.ClientSession() as session, + session.get(url, ssl=False if use_ssl else None) as resp, + ): assert resp.status == 200 -async def _kill_server(port: int) -> None: - url = f"http://127.0.0.1:{port}/kill" +async def _kill_server(port: int, use_ssl: bool = False) -> None: + scheme = "https" if use_ssl else "http" + url = f"{scheme}://127.0.0.1:{port}/kill" try: - async with aiohttp.ClientSession() as session, session.get(url) as resp: + async with ( + aiohttp.ClientSession() as session, + session.get(url, ssl=False if use_ssl else None) as resp, + ): assert resp.status != 200 except Exception as e: assert isinstance(e, aiohttp.ClientConnectorError) @@ -455,6 +534,34 @@ async def test_basic_lifecycle(monkeypatch): print("everything was cleaned up!") +@pytest.mark.asyncio +async def test_basic_lifecycle_with_ssl(monkeypatch): + with tempfile.TemporaryDirectory() as cert_dir: + cert_file, key_file = _generate_self_signed_cert(Path(cert_dir)) + args = _make_args( + ssl_keyfile=str(key_file), + ssl_certfile=str(cert_file), + ) + + vllm_server_ports = [_CHILD_PORT_BASE + i for i in range(_N_CHILDREN)] + + async with _run_supervisor(args, monkeypatch) as (supervisor, _task): + assert await _poll_supervisor_health(503, use_ssl=True) + assert not supervisor.is_ready + + for port in vllm_server_ports: + assert await _poll_supervisor_health(503, use_ssl=True) + assert not supervisor.is_ready + await _poll_until_api_server_running(port, use_ssl=True) + + for port in vllm_server_ports: + await _set_healthy(port, use_ssl=True) + await asyncio.sleep(1.0) + + assert await _poll_supervisor_health(200, use_ssl=True) + assert supervisor.is_ready + + @pytest.mark.asyncio async def test_failed_startup(monkeypatch): """ diff --git a/vllm/entrypoints/openai/dp_supervisor.py b/vllm/entrypoints/openai/dp_supervisor.py index 2dff91fa7942..13444015ecc0 100644 --- a/vllm/entrypoints/openai/dp_supervisor.py +++ b/vllm/entrypoints/openai/dp_supervisor.py @@ -55,9 +55,9 @@ def validate_multi_port_external_lb_args(args: argparse.Namespace) -> None: raise ValueError( "Error: --data-parallel-multi-port-external-lb does not support --uds" ) - if any((args.ssl_keyfile, args.ssl_certfile, args.ssl_ca_certs)): + if bool(args.ssl_keyfile) != bool(args.ssl_certfile): raise ValueError( - "Error: --data-parallel-multi-port-external-lb does not support HTTPS yet" + "Error: --ssl-keyfile and --ssl-certfile must be provided together" ) if args.api_server_count not in (None, 1): raise ValueError( @@ -151,7 +151,8 @@ def _child_base_url(args: argparse.Namespace, port: int) -> str: host = "127.0.0.1" elif host == "::": host = "::1" - return f"http://{host}:{port}" + scheme = "https" if args.ssl_keyfile and args.ssl_certfile else "http" + return f"{scheme}://{host}:{port}" def _join_processes_with_timeout(processes: list[BaseProcess], timeout: float) -> None: @@ -178,7 +179,15 @@ async def _probe_endpoint( """ for iteration in range(conn_err_failure_threshold): try: - async with session.get(_child_base_url(args, port) + path) as response: + probe_ssl = None + if args.ssl_keyfile and args.ssl_certfile: + # Probes target node-local child servers over loopback, so skip + # certificate verification to avoid SAN/hostname mismatches for + # localhost/127.0.0.1 deployments. + probe_ssl = False + async with session.get( + _child_base_url(args, port) + path, ssl=probe_ssl + ) as response: # vLLM returns 503 on EngineDeadError, so we should return # immediately if vLLM responds with a non-200 status code. return response.status == HTTPStatus.OK @@ -272,6 +281,11 @@ async def run(self) -> None: host=host, port=self.supervisor_port, log_level=self.args.uvicorn_log_level, + ssl_keyfile=self.args.ssl_keyfile, + ssl_certfile=self.args.ssl_certfile, + ssl_ca_certs=self.args.ssl_ca_certs, + ssl_cert_reqs=self.args.ssl_cert_reqs, + ssl_ciphers=self.args.ssl_ciphers, ) supervisor_server = uvicorn.Server(config) supervisor_server_task = asyncio.create_task( From 38b864d81d8bc42d6d7d892a0931f4c4c2517735 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 29 May 2026 15:56:44 -0400 Subject: [PATCH 09/11] [Metrics] Exclude KV transfer tokens from iteration_tokens_total (#43346) Signed-off-by: Tyler Michael Smith Co-authored-by: Claude Opus 4.6 (1M context) --- vllm/v1/metrics/loggers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6855efd9f54c..0052a35366a3 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1161,7 +1161,8 @@ def record( iteration_stats.num_generation_tokens ) self.histogram_iteration_tokens[engine_idx].observe( - iteration_stats.num_prompt_tokens + iteration_stats.num_generation_tokens + iteration_stats.prompt_token_stats.computed + + iteration_stats.num_generation_tokens ) for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter: From f3eeaa2df27123d26a18a16bf7a60afbfc11b4a2 Mon Sep 17 00:00:00 2001 From: Terrencezzj Date: Fri, 29 May 2026 19:37:28 +0000 Subject: [PATCH 10/11] fix router Signed-off-by: Terrencezzj --- vllm/model_executor/layers/fused_moe/config.py | 4 +++- .../layers/fused_moe/experts/trtllm_fp8_moe.py | 2 ++ .../layers/fused_moe/experts/trtllm_nvfp4_moe.py | 2 +- .../layers/fused_moe/router/custom_routing_router.py | 6 ++++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index b87b87f136dd..7bf7ae516553 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -117,8 +117,10 @@ class RoutingMethodType(IntEnum): SigmoidRenorm = (6,) # MiniMax2: Sigmoid + Bias -> TopK -> ScaledSumNormalize MiniMax2 = (7,) + # Sigmoid: Sigmoid -> TopK + Sigmoid = (8,) # Unspecified - Unspecified = (8,) + Unspecified = (9,) # other routing types (not passed to FlashInfer kernels) # Deepseek V4 -> sqrtsoftplus + Bias + Normalize DeepseekV4 = (100,) diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py index b98b84cdc623..45680eba3cfc 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py @@ -282,6 +282,7 @@ def _supports_routing_method( RoutingMethodType.Renormalize, RoutingMethodType.RenormalizeNaive, RoutingMethodType.SigmoidRenorm, + RoutingMethodType.Sigmoid, RoutingMethodType.MiniMax2, RoutingMethodType.Simulated, ] @@ -293,6 +294,7 @@ def _supports_routing_method( RoutingMethodType.Renormalize, RoutingMethodType.RenormalizeNaive, RoutingMethodType.SigmoidRenorm, + RoutingMethodType.Sigmoid, RoutingMethodType.MiniMax2, RoutingMethodType.Simulated, ] diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py index e4f292b7b1e4..e675c04177f1 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py @@ -353,9 +353,9 @@ def _supports_routing_method( RoutingMethodType.RenormalizeNaive, RoutingMethodType.Llama4, RoutingMethodType.SigmoidRenorm, + RoutingMethodType.Sigmoid, RoutingMethodType.MiniMax2, RoutingMethodType.Simulated, - RoutingMethodType.SigmoidRenorm, ] @staticmethod diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index 731afffd15f8..8ea4e4925b42 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -38,9 +38,11 @@ def routing_method_type(self) -> RoutingMethodType: # NOTE: FLASHINFER_TRTLLM support the Llama4 router. if self.custom_routing_function == Llama4MoE.custom_routing_function: return RoutingMethodType.Llama4 - # Cohere MoE uses a sigmoid -> top-k -> renormalize routing function. + # Cohere MoE uses sigmoid -> top-k, optionally followed by renormalize. if self.custom_routing_function == token_choice_with_bias: - return RoutingMethodType.SigmoidRenorm + if self.renormalize: + return RoutingMethodType.SigmoidRenorm + return RoutingMethodType.Sigmoid return RoutingMethodType.Custom def _compute_routing( From 315ad25e4b47a2afad8a250cd0ac8f3c60a49ca9 Mon Sep 17 00:00:00 2001 From: Terrencezzj Date: Fri, 29 May 2026 19:47:54 +0000 Subject: [PATCH 11/11] router fix Signed-off-by: Terrencezzj --- vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py index 02b7450a5c93..a1f6ecfb0700 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py @@ -80,6 +80,8 @@ def _supports_routing_method( RoutingMethodType.Llama4, RoutingMethodType.Renormalize, RoutingMethodType.RenormalizeNaive, + RoutingMethodType.SigmoidRenorm, + RoutingMethodType.Sigmoid, ] @staticmethod