Terrencezzj · Terrencezzj · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -49,6 +49,7 @@ for BACK in "${BACKENDS[@]}"; do
     --data-parallel-size 2 \
     --enable-expert-parallel \
     --enable-eplb \
+    --eplb-config '{"use_async": false}' \
     --trust-remote-code \
     --max-model-len 2048 \
     --all2all-backend "$BACK" \

diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -48,7 +48,7 @@ for BACK in "${BACKENDS[@]}"; do
     --enforce-eager \
     --enable-eplb \
     --all2all-backend "$BACK" \
-    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true, "use_async":false}' \
     --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
     --data-parallel-size "${DATA_PARALLEL_SIZE}" \
     --enable-expert-parallel \

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
@@ -1261,7 +1261,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
 
 - label: Entrypoints Integration (API Server openai - Part 2) # TBD
   timeout_in_minutes: 180
@@ -1484,7 +1484,7 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt
 
-- label: DeepSeek V2-Lite Accuracy (4xH100-4xMI300) # TBD
+- label: DeepSeek V2-Lite Sync EPLB Accuracy (4xH100-4xMI300) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_4
@@ -1526,7 +1526,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (4xH100-4xMI300) # TBD
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (4xH100-4xMI300) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
   agent_pool: mi300_4
@@ -2768,7 +2768,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
 
 - label: Entrypoints Integration (API Server openai - Part 2) # TBD
   timeout_in_minutes: 180
@@ -2895,7 +2895,7 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200-MI355) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
   agent_pool: mi355_2

diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
@@ -2,8 +2,8 @@ group: E2E Integration
 depends_on: 
   - image-build
 steps:
-- label: DeepSeek V2-Lite Accuracy
-  key: deepseek-v2-lite-accuracy
+- label: DeepSeek V2-Lite Sync EPLB Accuracy
+  key: deepseek-v2-lite-sync-eplb-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -12,8 +12,8 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy
-  key: qwen3-30b-a3b-fp8-block-accuracy
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy
+  key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy
   timeout_in_minutes: 60
   device: h100
   optional: true
@@ -22,8 +22,8 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
 
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-  key: qwen3-30b-a3b-fp8-block-accuracy-b200
+- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200)
+  key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy-b200
   timeout_in_minutes: 60
   device: b200-k8s
   optional: true

diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
@@ -43,7 +43,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
   mirror:
     amd:
       device: mi325_1

diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
@@ -14,5 +14,12 @@ steps:
   commands:
     - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s model_executor -m '(not slow_test)'
-    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
+    # Enable faulthandler (inherited by child processes): a fatal signal such
+    # as SIGSEGV/SIGABRT dumps thread tracebacks; hangs rely on --timeout.
+    - export PYTHONFAULTHANDLER=1
+    # Per-test watchdog: a single hung test (e.g. stuck during engine/CUDA
+    # init) is cut off after 900s instead of running until the global build
+    # timeout. The `thread` method dumps all thread stacks and then aborts
+    # the whole pytest process, so it also works for hangs inside C/CUDA calls.
+    - pytest -v -s model_executor -m '(not slow_test)' --timeout=900 --timeout-method=thread
+    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py --timeout=900 --timeout-method=thread
diff --git a/.buildkite/test_areas/rust_frontend.yaml b/.buildkite/test_areas/rust_frontend.yaml
@@ -16,7 +16,7 @@ steps:
   - tests/benchmarks/test_serve_cli.py
   - tests/entrypoints/openai/chat_completion/test_chat_completion.py
   # - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
-  # - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+
   # - tests/entrypoints/openai/completion/test_prompt_validation.py
   - tests/entrypoints/openai/completion/test_shutdown.py
   # - tests/entrypoints/openai/test_return_token_ids.py
@@ -28,7 +28,7 @@ steps:
   - pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
   - pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
   # - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
-  # - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+
   # - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
   - pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly"
   # - pytest -v -s entrypoints/openai/test_return_token_ids.py

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -72,8 +72,10 @@
 /vllm/v1/worker/gpu/kv_connector.py @orozery
 
 # CI & building
-/.buildkite @Harry-Chen
-/docker/Dockerfile @Harry-Chen
+/.buildkite @Harry-Chen @khluu
+/docker/Dockerfile @Harry-Chen @khluu
+/pyproject.toml @khluu
+/setup.py @khluu
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 

diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
@@ -151,7 +151,7 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
 | `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 |
 | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
 | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
-| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
+| `use_async` | Use non-blocking EPLB for reduced latency overhead | `true` |
 | `policy` | The policy type for expert parallel load balancing | `"default"` |
 | `communicator` | Backend for expert weight transfers: `"torch_nccl"`, `"torch_gloo"`, `"pynccl"`, `"nixl"`,  or `null` (auto) | `null` |
 

diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
@@ -15,7 +15,7 @@ def get_model_args(
     spec_method: str,
     tp_size: int,
     model_max_len: int,
-    use_async: bool = False,
+    use_async: bool = True,
 ) -> dict:
     speculative_config = {
         "method": spec_method,
@@ -28,9 +28,8 @@ def get_model_args(
         "window_size": 128,
         "step_interval": 1024,
         "log_balancedness": False,
+        "use_async": use_async,
     }
-    if use_async:
-        eplb_config["use_async"] = True
     model_args = {
         "pretrained": model_name,
         "dtype": "auto",

diff --git a/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py