Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"use_async": false}' \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend "$BACK" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ for BACK in "${BACKENDS[@]}"; do
--enforce-eager \
--enable-eplb \
--all2all-backend "$BACK" \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true, "use_async":false}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
--enable-expert-parallel \
Expand Down
10 changes: 5 additions & 5 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1261,7 +1261,7 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py

- label: Entrypoints Integration (API Server openai - Part 2) # TBD
timeout_in_minutes: 180
Expand Down Expand Up @@ -1484,7 +1484,7 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt

- label: DeepSeek V2-Lite Accuracy (4xH100-4xMI300) # TBD
- label: DeepSeek V2-Lite Sync EPLB Accuracy (4xH100-4xMI300) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
agent_pool: mi300_4
Expand Down Expand Up @@ -1526,7 +1526,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

- label: Qwen3-30B-A3B-FP8-block Accuracy (4xH100-4xMI300) # TBD
- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (4xH100-4xMI300) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300]
agent_pool: mi300_4
Expand Down Expand Up @@ -2768,7 +2768,7 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py

- label: Entrypoints Integration (API Server openai - Part 2) # TBD
timeout_in_minutes: 180
Expand Down Expand Up @@ -2895,7 +2895,7 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD
- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200-MI355) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
agent_pool: mi355_2
Expand Down
12 changes: 6 additions & 6 deletions .buildkite/test_areas/e2e_integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ group: E2E Integration
depends_on:
- image-build
steps:
- label: DeepSeek V2-Lite Accuracy
key: deepseek-v2-lite-accuracy
- label: DeepSeek V2-Lite Sync EPLB Accuracy
key: deepseek-v2-lite-sync-eplb-accuracy
timeout_in_minutes: 60
device: h100
optional: true
Expand All @@ -12,8 +12,8 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy
key: qwen3-30b-a3b-fp8-block-accuracy
- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy
key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy
timeout_in_minutes: 60
device: h100
optional: true
Expand All @@ -22,8 +22,8 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
key: qwen3-30b-a3b-fp8-block-accuracy-b200
- label: Qwen3-30B-A3B-FP8-block Sync EPLB Accuracy (B200)
key: qwen3-30b-a3b-fp8-block-sync-eplb-accuracy-b200
timeout_in_minutes: 60
device: b200-k8s
optional: true
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/test_areas/entrypoints.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
- pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_oot_registration.py
mirror:
amd:
device: mi325_1
Expand Down
11 changes: 9 additions & 2 deletions .buildkite/test_areas/model_executor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,12 @@ steps:
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor -m '(not slow_test)'
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
# Dump tracebacks of all threads if a test hangs, so a wedged GPU/CUDA
# init surfaces a stack instead of silently stalling.
- export PYTHONFAULTHANDLER=1
# Per-test watchdog: a single hung test (e.g. stuck during engine/CUDA
# init) fails fast with a traceback instead of running until the global
# build timeout. The `thread` method also handles hangs inside C/CUDA
# calls that the signal method cannot interrupt.
- pytest -v -s model_executor -m '(not slow_test)' --timeout=900 --timeout-method=thread
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py --timeout=900 --timeout-method=thread
4 changes: 2 additions & 2 deletions .buildkite/test_areas/rust_frontend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ steps:
- tests/benchmarks/test_serve_cli.py
- tests/entrypoints/openai/chat_completion/test_chat_completion.py
# - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
# - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py

# - tests/entrypoints/openai/completion/test_prompt_validation.py
- tests/entrypoints/openai/completion/test_shutdown.py
# - tests/entrypoints/openai/test_return_token_ids.py
Expand All @@ -28,7 +28,7 @@ steps:
- pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
- pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py

# - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
- pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly"
# - pytest -v -s entrypoints/openai/test_return_token_ids.py
Expand Down
6 changes: 4 additions & 2 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@
/vllm/v1/worker/gpu/kv_connector.py @orozery

# CI & building
/.buildkite @Harry-Chen
/docker/Dockerfile @Harry-Chen
/.buildkite @Harry-Chen @khluu
/docker/Dockerfile @Harry-Chen @khluu
/pyproject.toml @khluu
/setup.py @khluu

# Test ownership
/.buildkite/lm-eval-harness @mgoin
Expand Down
2 changes: 1 addition & 1 deletion docs/serving/expert_parallel_deployment.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
| `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 |
| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
| `use_async` | Use non-blocking EPLB for reduced latency overhead | `true` |
| `policy` | The policy type for expert parallel load balancing | `"default"` |
| `communicator` | Backend for expert weight transfers: `"torch_nccl"`, `"torch_gloo"`, `"pynccl"`, `"nixl"`, or `null` (auto) | `null` |

Expand Down
5 changes: 2 additions & 3 deletions tests/distributed/test_eplb_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def get_model_args(
spec_method: str,
tp_size: int,
model_max_len: int,
use_async: bool = False,
use_async: bool = True,
) -> dict:
speculative_config = {
"method": spec_method,
Expand All @@ -28,9 +28,8 @@ def get_model_args(
"window_size": 128,
"step_interval": 1024,
"log_balancedness": False,
"use_async": use_async,
}
if use_async:
eplb_config["use_async"] = True
model_args = {
"pretrained": model_name,
"dtype": "auto",
Expand Down

This file was deleted.

Loading
Loading