From 4aaba00f92d4fc481aeff10db72465e26463a632 Mon Sep 17 00:00:00 2001
From: Ilya Markov <markovilya197@gmail.com>
Date: Fri, 29 May 2026 20:07:16 +0200
Subject: [PATCH 01/11] [EPLB] Make async EPLB default (#43219)

Signed-off-by: Markov Ilya <markovilya19@gmail.com>
Co-authored-by: Markov Ilya <markovilya19@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .../deepseek_v2_lite_ep_eplb.sh                      |  1 +
 .../qwen30b_a3b_fp8_block_ep_eplb.sh                 |  2 +-
 .buildkite/test-amd.yaml                             |  6 +++---
 .buildkite/test_areas/e2e_integration.yaml           | 12 ++++++------
 docs/serving/expert_parallel_deployment.md           |  2 +-
 tests/distributed/test_eplb_spec_decode.py           |  5 ++---
 vllm/config/parallel.py                              |  2 +-
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
index e26273bba39a..2ff0b2e1251c 100644
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -49,6 +49,7 @@ for BACK in "${BACKENDS[@]}"; do
     --data-parallel-size 2 \
     --enable-expert-parallel \
     --enable-eplb \
+    --eplb-config '{"use_async": false}' \
     --trust-remote-code \
     --max-model-len 2048 \
     --all2all-backend "$BACK" \
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
index 729a0fb7f688..871685531909 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -48,7 +48,7 @@ for BACK in "${BACKENDS[@]}"; do
     --enforce-eager \
     --enable-eplb \
     --all2all-backend "$BACK" \
-    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true, "use_async":false}' \
     --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
     --data-parallel-size "${DATA_PARALLEL_SIZE}" \
     --enable-expert-parallel \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index c7ef61b719aa..80033fefb407 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1484,7 +1484,7 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt
 
-- label: DeepSeek V2-Lite Accuracy (4xH100-4xMI300) # TBD
+- label: DeepSeek V2-Lite Sync EPLB Accuracy (4xH100-4xMI300) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_4
@@ -1526,7 +1526,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (4xH100-4xMI300) # TBD
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (4xH100-4xMI300) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_4
@@ -2895,7 +2895,7 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200-MI355) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
   agent_pool: mi355_2
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index bb8aa14eac18..88039a339607 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -2,8 +2,8 @@ group: E2E Integration
 depends_on: 
   - image-build
 steps:
-- label: DeepSeek V2-Lite Accuracy
-  key: deepseek-v2-lite-accuracy
+- label: DeepSeek V2-Lite Sync EPLB Accuracy
+  key: deepseek-v2-lite-sync-eplb-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -12,8 +12,8 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy
-  key: qwen3-30b-a3b-fp8-block-accuracy
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy
+  key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -22,8 +22,8 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-  key: qwen3-30b-a3b-fp8-block-accuracy-b200
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200)
+  key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy-b200
   timeout_in_minutes: 60
   device: b200-k8s
   optional: true
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index fef4df770fa3..b7c2ee873750 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -151,7 +151,7 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
 | `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 |
 | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
 | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
-| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
+| `use_async` | Use non-blocking EPLB for reduced latency overhead | `true` |
 | `policy` | The policy type for expert parallel load balancing | `"default"` |
 | `communicator` | Backend for expert weight transfers: `"torch_nccl"`, `"torch_gloo"`, `"pynccl"`, `"nixl"`,  or `null` (auto) | `null` |
 
diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index 22977ce94404..b4211c74c171 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -15,7 +15,7 @@ def get_model_args(
     spec_method: str,
     tp_size: int,
     model_max_len: int,
-    use_async: bool = False,
+    use_async: bool = True,
 ) -> dict:
     speculative_config = {
         "method": spec_method,
@@ -28,9 +28,8 @@ def get_model_args(
         "window_size": 128,
         "step_interval": 1024,
         "log_balancedness": False,
+        "use_async": use_async,
     }
-    if use_async:
-        eplb_config["use_async"] = True
     model_args = {
         "pretrained": model_name,
         "dtype": "auto",
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 66f0000716f8..f32ecef14821 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -78,7 +78,7 @@ class EPLBConfig:
     """
     Interval for logging the balancedness.
     """
-    use_async: bool = False
+    use_async: bool = True
     """
     Whether to use non-blocking EPLB.
     """

From d07ad0693bc7f5c993bfb473f02a71df8f8c7109 Mon Sep 17 00:00:00 2001
From: qizixi <22851944+zixi-qi@users.noreply.github.com>
Date: Fri, 29 May 2026 11:14:25 -0700
Subject: [PATCH 02/11] [Bugfix] Use storage_block_size in KV cache reshape for
 compressed specs (DeepSeek V4) (#43988)

Signed-off-by: zixi-qi <zixi@inferact.ai>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 vllm/v1/worker/gpu/attn_utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
index c2ba12437f89..6fc55ee32030 100644
--- a/vllm/v1/worker/gpu/attn_utils.py
+++ b/vllm/v1/worker/gpu/attn_utils.py
@@ -199,7 +199,12 @@ def _reshape_kv_cache(
 
             if isinstance(kv_cache_spec, AttentionSpec):
                 has_attn = True
-                num_blocks_per_kv_block = kv_cache_spec.block_size // kernel_block_size
+                # Use storage_block_size: it equals block_size for uncompressed
+                # specs but is smaller for compressed ones (DeepSeek V4), which
+                # store block_size tokens in block_size // compress_ratio slots.
+                num_blocks_per_kv_block = (
+                    kv_cache_spec.storage_block_size // kernel_block_size
+                )
                 kernel_num_blocks = num_blocks * num_blocks_per_kv_block
                 kv_cache_shape = group.backend.get_kv_cache_shape(
                     kernel_num_blocks,

From 8b9deeec4b7ad889c62f31e2eec2d2537c780685 Mon Sep 17 00:00:00 2001
From: czhu-cohere <conway.zhu@cohere.com>
Date: Fri, 29 May 2026 11:51:05 -0700
Subject: [PATCH 03/11] [Bugfix] Fix Ray placement group allocation with
 grouped nodes (#43998)

Signed-off-by: <conway.zhu@cohere.com>
Signed-off-by: root <conway.zhu@cohere.com>
---
 vllm/v1/engine/utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 554e8d6f0056..8a7269a7707a 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -575,7 +575,9 @@ def create_dp_placement_groups(
             node_ip_keys = [
                 key
                 for key in node_resources
-                if key != "node:__internal_head__" and key.startswith("node:")
+                if key != "node:__internal_head__"
+                and key.startswith("node:")
+                and "_group_" not in key
             ]
             assert len(node_ip_keys) == 1, (
                 f"Zero or multiple node IP keys found in node resources: {node_ip_keys}"
@@ -654,6 +656,9 @@ def create_dp_placement_groups(
                 if len(placement_groups) == dp_size:
                     break
 
+            if len(placement_groups) == dp_size:
+                break
+
         if len(placement_groups) < dp_size:
             raise ValueError(
                 f"Not enough resources to allocate {dp_size} "

From 739096a028214991cff4af23a558795905ad35a1 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 29 May 2026 14:55:00 -0400
Subject: [PATCH 04/11] [Bug] Fix torch device issue for MOE permute (#44005)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index df430689436e..ad9fb509a1f7 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -74,6 +74,9 @@ def __post_init__(self) -> None:
         self.sort_workspace = torch.empty(
             sorter_size, dtype=torch.int8, device=self.device
         )
+        # torch.device("cuda") in config, after initialized,
+        # will be changed to cuda:{index}, so we need to refresh here.
+        self.device = self.token_expert_indices.device
 
     def validate(self, hidden_states: torch.Tensor, topk_ids: torch.Tensor) -> None:
         n_token, n_hidden = hidden_states.shape

From 6aabe221a56052965e6bb0a95e9ec682d046a6e7 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <khluu000@gmail.com>
Date: Fri, 29 May 2026 11:58:25 -0700
Subject: [PATCH 05/11] [CI] Make Model Executor test hangs fail fast with a
 traceback (#43971)

Signed-off-by: khluu <khluu000@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
---
 .buildkite/test_areas/model_executor.yaml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
index c41ef8a7110d..e34b7eadfacf 100644
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -14,5 +14,12 @@ steps:
   commands:
     - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s model_executor -m '(not slow_test)'
-    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
+    # Dump tracebacks of all threads if a test hangs, so a wedged GPU/CUDA
+    # init surfaces a stack instead of silently stalling.
+    - export PYTHONFAULTHANDLER=1
+    # Per-test watchdog: a single hung test (e.g. stuck during engine/CUDA
+    # init) fails fast with a traceback instead of running until the global
+    # build timeout. The `thread` method also handles hangs inside C/CUDA
+    # calls that the signal method cannot interrupt.
+    - pytest -v -s model_executor -m '(not slow_test)' --timeout=900 --timeout-method=thread
+    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py --timeout=900 --timeout-method=thread

From 6de08e8b46e082172bc146cf8031b29d9fdcab92 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Fri, 29 May 2026 15:23:56 -0400
Subject: [PATCH 06/11] [CI] Remove redundant test_chat_with_tool_reasoning.py
 (#44011)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 .buildkite/test-amd.yaml                      |   4 +-
 .buildkite/test_areas/entrypoints.yaml        |   2 +-
 .buildkite/test_areas/rust_frontend.yaml      |   4 +-
 .../test_chat_with_tool_reasoning.py          | 141 ------------------
 4 files changed, 5 insertions(+), 146 deletions(-)
 delete mode 100644 tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 80033fefb407..50c13e4a89a6 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1261,7 +1261,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
 
 - label: Entrypoints Integration (API Server openai - Part 2) # TBD
   timeout_in_minutes: 180
@@ -2768,7 +2768,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
 
 - label: Entrypoints Integration (API Server openai - Part 2) # TBD
   timeout_in_minutes: 180
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index 57bde22194a8..1ae8c79fab7f 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -43,7 +43,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
   mirror:
     amd:
       device: mi325_1
diff --git a/.buildkite/test_areas/rust_frontend.yaml b/.buildkite/test_areas/rust_frontend.yaml
index f750d58be586..df37022725f5 100644
--- a/.buildkite/test_areas/rust_frontend.yaml
+++ b/.buildkite/test_areas/rust_frontend.yaml
@@ -16,7 +16,7 @@ steps:
   - tests/benchmarks/test_serve_cli.py
   - tests/entrypoints/openai/chat_completion/test_chat_completion.py
   # - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
-  # - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+
   # - tests/entrypoints/openai/completion/test_prompt_validation.py
   - tests/entrypoints/openai/completion/test_shutdown.py
   # - tests/entrypoints/openai/test_return_token_ids.py
@@ -28,7 +28,7 @@ steps:
   - pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
   - pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
   # - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
-  # - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+
   # - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
   - pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly"
   # - pytest -v -s entrypoints/openai/test_return_token_ids.py
diff --git a/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
deleted file mode 100644
index 295b55889412..000000000000
--- a/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-
-from tests.utils import RemoteOpenAIServer
-
-# a reasoning and tool calling model
-MODEL_NAME = "Qwen/QwQ-32B"
-
-
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--reasoning-parser",
-        "deepseek_r1",
-        "--enable-auto-tool-choice",
-        "--tool-call-parser",
-        "hermes",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
-TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "get_current_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "city": {
-                        "type": "string",
-                        "description": "The city to find the weather for, e.g. "
-                        "'San Francisco'",
-                    },
-                    "state": {
-                        "type": "string",
-                        "description": "the two-letter abbreviation for the state that "
-                        "the city is in, e.g. 'CA' which would mean 'California'",
-                    },
-                    "unit": {
-                        "type": "string",
-                        "description": "The unit to fetch the temperature in",
-                        "enum": ["celsius", "fahrenheit"],
-                    },
-                },
-                "required": ["city", "state", "unit"],
-            },
-        },
-    }
-]
-
-MESSAGES = [
-    {"role": "user", "content": "Hi! How are you doing today?"},
-    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
-    {
-        "role": "user",
-        "content": "Can you tell me what the temperate will be in Dallas, "
-        "in fahrenheit?",
-    },
-]
-
-FUNC_NAME = "get_current_weather"
-FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
-
-
-def extract_reasoning_and_calls(chunks: list):
-    reasoning = ""
-    tool_call_idx = -1
-    arguments = []
-    function_names = []
-    for chunk in chunks:
-        if chunk.choices[0].delta.tool_calls:
-            tool_call = chunk.choices[0].delta.tool_calls[0]
-            if tool_call.index != tool_call_idx:
-                tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
-                arguments.append("")
-                function_names.append("")
-
-            if tool_call.function:
-                if tool_call.function.name:
-                    function_names[tool_call_idx] = tool_call.function.name
-
-                if tool_call.function.arguments:
-                    arguments[tool_call_idx] += tool_call.function.arguments
-        else:
-            if hasattr(chunk.choices[0].delta, "reasoning"):
-                reasoning += chunk.choices[0].delta.reasoning
-    return reasoning, arguments, function_names
-
-
-# test streaming
-@pytest.mark.asyncio
-async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
-    stream = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=MESSAGES,
-        tools=TOOLS,
-        temperature=0.0,
-        stream=True,
-    )
-
-    chunks = []
-    async for chunk in stream:
-        chunks.append(chunk)
-
-    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
-    assert len(reasoning) > 0
-    assert len(function_names) > 0 and function_names[0] == FUNC_NAME
-    assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
-
-
-# test full generate
-@pytest.mark.asyncio
-async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
-    tool_calls = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=MESSAGES,
-        tools=TOOLS,
-        temperature=0.0,
-        stream=False,
-    )
-
-    assert len(tool_calls.choices[0].message.reasoning) > 0
-    assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
-    assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS

From acbc203340bf9bffaf22121b8748cce384f8e013 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <khluu000@gmail.com>
Date: Fri, 29 May 2026 12:24:29 -0700
Subject: [PATCH 07/11] Add @khluu to CODEOWNERS (#44019)

Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
---
 .github/CODEOWNERS | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 540540e51328..bd5deff1b821 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -72,8 +72,10 @@
 /vllm/v1/worker/gpu/kv_connector.py @orozery
 
 # CI & building
-/.buildkite @Harry-Chen
-/docker/Dockerfile @Harry-Chen
+/.buildkite @Harry-Chen @khluu
+/docker/Dockerfile @Harry-Chen @khluu
+/pyproject.toml @khluu
+/setup.py @khluu
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 

From 5dbf1605a0e2e4927533ac4b279edaf753580173 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 29 May 2026 15:28:12 -0400
Subject: [PATCH 08/11] [Feature] SSL support for dp supervisor (#43688)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../entrypoints/openai/test_dp_supervisor.py  | 143 +++++++++++++++---
 vllm/entrypoints/openai/dp_supervisor.py      |  22 ++-
 2 files changed, 143 insertions(+), 22 deletions(-)

diff --git a/tests/entrypoints/openai/test_dp_supervisor.py b/tests/entrypoints/openai/test_dp_supervisor.py
index 0b678b226254..9967e6d86d0a 100644
--- a/tests/entrypoints/openai/test_dp_supervisor.py
+++ b/tests/entrypoints/openai/test_dp_supervisor.py
@@ -21,7 +21,10 @@
 import contextlib
 import os
 import signal
+import subprocess
+import tempfile
 import time
+from pathlib import Path
 from types import SimpleNamespace
 
 import aiohttp
@@ -35,6 +38,7 @@
     DPSupervisor,
     _build_vllm_dp_server_args,
     infer_multi_port_external_lb_start_rank,
+    validate_multi_port_external_lb_args,
 )
 from vllm.logger import init_logger
 
@@ -75,6 +79,8 @@ def _make_unit_args(**overrides) -> argparse.Namespace:
         "ssl_keyfile": None,
         "ssl_certfile": None,
         "ssl_ca_certs": None,
+        "ssl_cert_reqs": 0,
+        "ssl_ciphers": None,
         "node_rank": 1,
         "tensor_parallel_size": 1,
         "pipeline_parallel_size": 1,
@@ -108,6 +114,8 @@ def _make_args(**overrides) -> argparse.Namespace:
         ssl_keyfile=None,
         ssl_certfile=None,
         ssl_ca_certs=None,
+        ssl_cert_reqs=0,
+        ssl_ciphers=None,
         node_rank=0,
         tensor_parallel_size=1,
         pipeline_parallel_size=1,
@@ -118,6 +126,33 @@ def _make_args(**overrides) -> argparse.Namespace:
     return argparse.Namespace(**base)
 
 
+def _generate_self_signed_cert(cert_dir: Path) -> tuple[Path, Path]:
+    """Generate a self-signed certificate for HTTPS lifecycle tests."""
+    cert_file = cert_dir / "cert.pem"
+    key_file = cert_dir / "key.pem"
+    subprocess.run(
+        [
+            "openssl",
+            "req",
+            "-x509",
+            "-newkey",
+            "rsa:2048",
+            "-keyout",
+            str(key_file),
+            "-out",
+            str(cert_file),
+            "-days",
+            "1",
+            "-nodes",
+            "-subj",
+            "/CN=localhost",
+        ],
+        check=True,
+        capture_output=True,
+    )
+    return cert_file, key_file
+
+
 # ---------------------------------------------------------------------------
 # Unit tests
 # ---------------------------------------------------------------------------
@@ -141,6 +176,15 @@ def test_build_multi_port_external_lb_child_args_sets_external_rank_server():
     assert child_args.api_server_count == 1
 
 
+def test_validate_multi_port_external_lb_args_allows_ssl():
+    args = _make_unit_args(
+        ssl_keyfile="/tmp/server.key",
+        ssl_certfile="/tmp/server.crt",
+        ssl_ca_certs="/tmp/ca.crt",
+    )
+    validate_multi_port_external_lb_args(args)
+
+
 def test_aggregates_health():
     supervisor = DPSupervisor(_make_unit_args())
     supervisor._is_ready = True
@@ -236,10 +280,18 @@ class MockVLLMServer:
     Health state is toggled by the test via set_healthy().
     """
 
-    def __init__(self, port: int, drain_seconds: float = 0.0) -> None:
+    def __init__(
+        self,
+        port: int,
+        drain_seconds: float = 0.0,
+        ssl_keyfile: str | None = None,
+        ssl_certfile: str | None = None,
+    ) -> None:
         self.port = port
         self._healthy = False
         self._drain_seconds = drain_seconds
+        self._ssl_keyfile = ssl_keyfile
+        self._ssl_certfile = ssl_certfile
         self._server: uvicorn.Server | None = None
         self._serve_task: asyncio.Task | None = None
 
@@ -274,6 +326,8 @@ async def kill() -> Response:
             port=self.port,
             log_level="warning",
             lifespan="off",
+            ssl_keyfile=self._ssl_keyfile,
+            ssl_certfile=self._ssl_certfile,
         )
         self._server = uvicorn.Server(config)
 
@@ -312,7 +366,11 @@ def _custom_handle_exit(sig: int, frame: object) -> None:
 
 def launch_mock_vllm(child_args: argparse.Namespace, env_updates: dict[str, str]):
     logger.info("Launching mock vLLM on port %s", child_args.port)
-    mock_vllm = MockVLLMServer(port=child_args.port)
+    mock_vllm = MockVLLMServer(
+        port=child_args.port,
+        ssl_keyfile=child_args.ssl_keyfile,
+        ssl_certfile=child_args.ssl_certfile,
+    )
     asyncio.run(mock_vllm.start())
 
 
@@ -320,7 +378,12 @@ def launch_mock_vllm_with_drain(
     child_args: argparse.Namespace, env_updates: dict[str, str]
 ):
     logger.info("Launching mock vLLM with 15s drain on port %s", child_args.port)
-    mock_vllm = MockVLLMServer(port=child_args.port, drain_seconds=10.0)
+    mock_vllm = MockVLLMServer(
+        port=child_args.port,
+        drain_seconds=10.0,
+        ssl_keyfile=child_args.ssl_keyfile,
+        ssl_certfile=child_args.ssl_certfile,
+    )
     asyncio.run(mock_vllm.start())
 
 
@@ -329,15 +392,16 @@ def launch_mock_vllm_with_drain(
 # ---------------------------------------------------------------------------
 
 
-async def _poll_supervisor_health(expected_status: int) -> bool:
+async def _poll_supervisor_health(expected_status: int, use_ssl: bool = False) -> bool:
     """
     Poll GET /health on the supervisor until expected_status is seen.
     A connection error is treated as 503-equivalent when expected_status != 200.
     """
-    url = f"http://127.0.0.1:{_SUPERVISOR_PORT}/health"
+    scheme = "https" if use_ssl else "http"
+    url = f"{scheme}://127.0.0.1:{_SUPERVISOR_PORT}/health"
     async with aiohttp.ClientSession() as session:
         try:
-            async with session.get(url) as resp:
+            async with session.get(url, ssl=False if use_ssl else None) as resp:
                 if resp.status != expected_status:
                     print(f"expected: {expected_status=}, got: {resp.status=}")
                     return False
@@ -349,12 +413,15 @@ async def _poll_supervisor_health(expected_status: int) -> bool:
             return True
 
 
-async def _poll_until_api_server_running(port: int, retries: int = 10) -> None:
-    url = f"http://127.0.0.1:{port}/health"
+async def _poll_until_api_server_running(
+    port: int, retries: int = 10, use_ssl: bool = False
+) -> None:
+    scheme = "https" if use_ssl else "http"
+    url = f"{scheme}://127.0.0.1:{port}/health"
     async with aiohttp.ClientSession() as session:
         for _ in range(retries):
             try:
-                async with session.get(url) as resp:
+                async with session.get(url, ssl=False if use_ssl else None) as resp:
                     if resp.status != 200:
                         return
                 await asyncio.sleep(1.0)
@@ -363,22 +430,34 @@ async def _poll_until_api_server_running(port: int, retries: int = 10) -> None:
                 await asyncio.sleep(1.0)
 
 
-async def _set_healthy(port: int) -> None:
-    url = f"http://127.0.0.1:{port}/set_healthy"
-    async with aiohttp.ClientSession() as session, session.get(url) as resp:
+async def _set_healthy(port: int, use_ssl: bool = False) -> None:
+    scheme = "https" if use_ssl else "http"
+    url = f"{scheme}://127.0.0.1:{port}/set_healthy"
+    async with (
+        aiohttp.ClientSession() as session,
+        session.get(url, ssl=False if use_ssl else None) as resp,
+    ):
         assert resp.status == 200
 
 
-async def _set_unhealthy(port: int) -> None:
-    url = f"http://127.0.0.1:{port}/set_unhealthy"
-    async with aiohttp.ClientSession() as session, session.get(url) as resp:
+async def _set_unhealthy(port: int, use_ssl: bool = False) -> None:
+    scheme = "https" if use_ssl else "http"
+    url = f"{scheme}://127.0.0.1:{port}/set_unhealthy"
+    async with (
+        aiohttp.ClientSession() as session,
+        session.get(url, ssl=False if use_ssl else None) as resp,
+    ):
         assert resp.status == 200
 
 
-async def _kill_server(port: int) -> None:
-    url = f"http://127.0.0.1:{port}/kill"
+async def _kill_server(port: int, use_ssl: bool = False) -> None:
+    scheme = "https" if use_ssl else "http"
+    url = f"{scheme}://127.0.0.1:{port}/kill"
     try:
-        async with aiohttp.ClientSession() as session, session.get(url) as resp:
+        async with (
+            aiohttp.ClientSession() as session,
+            session.get(url, ssl=False if use_ssl else None) as resp,
+        ):
             assert resp.status != 200
     except Exception as e:
         assert isinstance(e, aiohttp.ClientConnectorError)
@@ -455,6 +534,34 @@ async def test_basic_lifecycle(monkeypatch):
         print("everything was cleaned up!")
 
 
+@pytest.mark.asyncio
+async def test_basic_lifecycle_with_ssl(monkeypatch):
+    with tempfile.TemporaryDirectory() as cert_dir:
+        cert_file, key_file = _generate_self_signed_cert(Path(cert_dir))
+        args = _make_args(
+            ssl_keyfile=str(key_file),
+            ssl_certfile=str(cert_file),
+        )
+
+        vllm_server_ports = [_CHILD_PORT_BASE + i for i in range(_N_CHILDREN)]
+
+        async with _run_supervisor(args, monkeypatch) as (supervisor, _task):
+            assert await _poll_supervisor_health(503, use_ssl=True)
+            assert not supervisor.is_ready
+
+            for port in vllm_server_ports:
+                assert await _poll_supervisor_health(503, use_ssl=True)
+                assert not supervisor.is_ready
+                await _poll_until_api_server_running(port, use_ssl=True)
+
+            for port in vllm_server_ports:
+                await _set_healthy(port, use_ssl=True)
+            await asyncio.sleep(1.0)
+
+            assert await _poll_supervisor_health(200, use_ssl=True)
+            assert supervisor.is_ready
+
+
 @pytest.mark.asyncio
 async def test_failed_startup(monkeypatch):
     """
diff --git a/vllm/entrypoints/openai/dp_supervisor.py b/vllm/entrypoints/openai/dp_supervisor.py
index 2dff91fa7942..13444015ecc0 100644
--- a/vllm/entrypoints/openai/dp_supervisor.py
+++ b/vllm/entrypoints/openai/dp_supervisor.py
@@ -55,9 +55,9 @@ def validate_multi_port_external_lb_args(args: argparse.Namespace) -> None:
         raise ValueError(
             "Error: --data-parallel-multi-port-external-lb does not support --uds"
         )
-    if any((args.ssl_keyfile, args.ssl_certfile, args.ssl_ca_certs)):
+    if bool(args.ssl_keyfile) != bool(args.ssl_certfile):
         raise ValueError(
-            "Error: --data-parallel-multi-port-external-lb does not support HTTPS yet"
+            "Error: --ssl-keyfile and --ssl-certfile must be provided together"
         )
     if args.api_server_count not in (None, 1):
         raise ValueError(
@@ -151,7 +151,8 @@ def _child_base_url(args: argparse.Namespace, port: int) -> str:
         host = "127.0.0.1"
     elif host == "::":
         host = "::1"
-    return f"http://{host}:{port}"
+    scheme = "https" if args.ssl_keyfile and args.ssl_certfile else "http"
+    return f"{scheme}://{host}:{port}"
 
 
 def _join_processes_with_timeout(processes: list[BaseProcess], timeout: float) -> None:
@@ -178,7 +179,15 @@ async def _probe_endpoint(
     """
     for iteration in range(conn_err_failure_threshold):
         try:
-            async with session.get(_child_base_url(args, port) + path) as response:
+            probe_ssl = None
+            if args.ssl_keyfile and args.ssl_certfile:
+                # Probes target node-local child servers over loopback, so skip
+                # certificate verification to avoid SAN/hostname mismatches for
+                # localhost/127.0.0.1 deployments.
+                probe_ssl = False
+            async with session.get(
+                _child_base_url(args, port) + path, ssl=probe_ssl
+            ) as response:
                 # vLLM returns 503 on EngineDeadError, so we should return
                 # immediately if vLLM responds with a non-200 status code.
                 return response.status == HTTPStatus.OK
@@ -272,6 +281,11 @@ async def run(self) -> None:
             host=host,
             port=self.supervisor_port,
             log_level=self.args.uvicorn_log_level,
+            ssl_keyfile=self.args.ssl_keyfile,
+            ssl_certfile=self.args.ssl_certfile,
+            ssl_ca_certs=self.args.ssl_ca_certs,
+            ssl_cert_reqs=self.args.ssl_cert_reqs,
+            ssl_ciphers=self.args.ssl_ciphers,
         )
         supervisor_server = uvicorn.Server(config)
         supervisor_server_task = asyncio.create_task(

From 38b864d81d8bc42d6d7d892a0931f4c4c2517735 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Date: Fri, 29 May 2026 15:56:44 -0400
Subject: [PATCH 09/11] [Metrics] Exclude KV transfer tokens from
 iteration_tokens_total (#43346)

Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 vllm/v1/metrics/loggers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 6855efd9f54c..0052a35366a3 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1161,7 +1161,8 @@ def record(
             iteration_stats.num_generation_tokens
         )
         self.histogram_iteration_tokens[engine_idx].observe(
-            iteration_stats.num_prompt_tokens + iteration_stats.num_generation_tokens
+            iteration_stats.prompt_token_stats.computed
+            + iteration_stats.num_generation_tokens
         )
 
         for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter:

From f3eeaa2df27123d26a18a16bf7a60afbfc11b4a2 Mon Sep 17 00:00:00 2001
From: Terrencezzj <terrence@cohere.ai>
Date: Fri, 29 May 2026 19:37:28 +0000
Subject: [PATCH 10/11] fix router

Signed-off-by: Terrencezzj <terrence@cohere.ai>
---
 vllm/model_executor/layers/fused_moe/config.py              | 4 +++-
 .../layers/fused_moe/experts/trtllm_fp8_moe.py              | 2 ++
 .../layers/fused_moe/experts/trtllm_nvfp4_moe.py            | 2 +-
 .../layers/fused_moe/router/custom_routing_router.py        | 6 ++++--
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index b87b87f136dd..7bf7ae516553 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -117,8 +117,10 @@ class RoutingMethodType(IntEnum):
     SigmoidRenorm = (6,)
     # MiniMax2: Sigmoid + Bias -> TopK -> ScaledSumNormalize
     MiniMax2 = (7,)
+    # Sigmoid: Sigmoid -> TopK
+    Sigmoid = (8,)
     # Unspecified
-    Unspecified = (8,)
+    Unspecified = (9,)
     # other routing types (not passed to FlashInfer kernels)
     # Deepseek V4 -> sqrtsoftplus + Bias + Normalize
     DeepseekV4 = (100,)
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index b98b84cdc623..45680eba3cfc 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -282,6 +282,7 @@ def _supports_routing_method(
                 RoutingMethodType.Renormalize,
                 RoutingMethodType.RenormalizeNaive,
                 RoutingMethodType.SigmoidRenorm,
+                RoutingMethodType.Sigmoid,
                 RoutingMethodType.MiniMax2,
                 RoutingMethodType.Simulated,
             ]
@@ -293,6 +294,7 @@ def _supports_routing_method(
                 RoutingMethodType.Renormalize,
                 RoutingMethodType.RenormalizeNaive,
                 RoutingMethodType.SigmoidRenorm,
+                RoutingMethodType.Sigmoid,
                 RoutingMethodType.MiniMax2,
                 RoutingMethodType.Simulated,
             ]
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
index e4f292b7b1e4..e675c04177f1 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
@@ -353,9 +353,9 @@ def _supports_routing_method(
             RoutingMethodType.RenormalizeNaive,
             RoutingMethodType.Llama4,
             RoutingMethodType.SigmoidRenorm,
+            RoutingMethodType.Sigmoid,
             RoutingMethodType.MiniMax2,
             RoutingMethodType.Simulated,
-            RoutingMethodType.SigmoidRenorm,
         ]
 
     @staticmethod
diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
index 731afffd15f8..8ea4e4925b42 100644
--- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
@@ -38,9 +38,11 @@ def routing_method_type(self) -> RoutingMethodType:
         # NOTE: FLASHINFER_TRTLLM support the Llama4 router.
         if self.custom_routing_function == Llama4MoE.custom_routing_function:
             return RoutingMethodType.Llama4
-        # Cohere MoE uses a sigmoid -> top-k -> renormalize routing function.
+        # Cohere MoE uses sigmoid -> top-k, optionally followed by renormalize.
         if self.custom_routing_function == token_choice_with_bias:
-            return RoutingMethodType.SigmoidRenorm
+            if self.renormalize:
+                return RoutingMethodType.SigmoidRenorm
+            return RoutingMethodType.Sigmoid
         return RoutingMethodType.Custom
 
     def _compute_routing(

From 315ad25e4b47a2afad8a250cd0ac8f3c60a49ca9 Mon Sep 17 00:00:00 2001
From: Terrencezzj <terrence@cohere.ai>
Date: Fri, 29 May 2026 19:47:54 +0000
Subject: [PATCH 11/11] router fix

Signed-off-by: Terrencezzj <terrence@cohere.ai>
---
 vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py
index 02b7450a5c93..a1f6ecfb0700 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_bf16_moe.py
@@ -80,6 +80,8 @@ def _supports_routing_method(
             RoutingMethodType.Llama4,
             RoutingMethodType.Renormalize,
             RoutingMethodType.RenormalizeNaive,
+            RoutingMethodType.SigmoidRenorm,
+            RoutingMethodType.Sigmoid,
         ]
 
     @staticmethod