diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 0000000000..199c33159f
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+ - ".buildkite/test_areas"
+ - ".buildkite/image_build"
+run_all_patterns:
+ - "docker/Dockerfile"
+ - "CMakeLists.txt"
+ - "requirements/common.txt"
+ - "requirements/cuda.txt"
+ - "requirements/build.txt"
+ - "requirements/test.txt"
+ - "setup.py"
+ - "csrc/"
+ - "cmake/"
+run_all_exclude_patterns:
+ - "docker/Dockerfile."
+ - "csrc/cpu/"
+ - "csrc/rocm/"
+ - "cmake/hipify.py"
+ - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+ main: "vllm-ci-postmerge-repo"
+ premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe8..0000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """
-
-
- Links for vLLM
- {x86_wheel}
- {arm_wheel}
-
-
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
- print(f"Generated index.html for {args.wheel}")
- # sync the abi tag with .buildkite/scripts/upload-wheels.sh
- if "x86_64" in filename:
- x86_wheel = filename
- arm_wheel = filename.replace("x86_64", "aarch64").replace(
- "manylinux1", "manylinux2014"
- )
- elif "aarch64" in filename:
- x86_wheel = filename.replace("aarch64", "x86_64").replace(
- "manylinux2014", "manylinux1"
- )
- arm_wheel = filename
- else:
- raise ValueError(f"Unsupported wheel: {filename}")
- # cloudfront requires escaping the '+' character
- f.write(
- template.format(
- x86_wheel=x86_wheel,
- x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
- arm_wheel=arm_wheel,
- arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
- )
- )
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 0000000000..9a2384e524
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+ merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+ merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+ --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+ --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+ ${merge_base_commit_build_args} \
+ --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+ --cache-to type=registry,ref=${CACHE_TO},mode=max \
+ --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+ $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+ --push \
+ --target test \
+ --progress plain .
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 0000000000..d01c71dd9b
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+ - label: ":docker: Build image"
+ key: image-build
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU image"
+ key: image-build-cpu
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build HPU image"
+ soft_fail: true
+ depends_on: []
+ key: image-build-hpu
+ commands:
+ - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU arm64 image"
+ key: cpu-arm64-image-build
+ depends_on: []
+ optional: true
+ commands:
+ - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 0000000000..a69732f430
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg VLLM_CPU_AVX512BF16=true \
+ --build-arg VLLM_CPU_AVX512VNNI=true \
+ --build-arg VLLM_CPU_AMXBF16=true \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 0000000000..615298b655
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 0000000000..192447ef45
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build \
+ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+ --progress plain \
+ https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 46f1a9fbf6..6c0b5540cb 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -8,3 +8,4 @@ tasks:
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
+rtol: 0.05
diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
new file mode 100644
index 0000000000..4fb0b84bc4
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
index c8db951381..0745da8dc4 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.9
+# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index 897f84d1e3..5c17a06245 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 792f355c47..1b617ff17c 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
index d85a1721db..12336d7f85 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 3627b760ed..a22abe73e3 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,11 +9,40 @@
--tp-size=1
"""
+import os
+from contextlib import contextmanager
+
import lm_eval
import numpy as np
import yaml
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+ if not new_env:
+ # Fast path: nothing to do
+ yield
+ return
+
+ old_values = {}
+ new_keys = []
+
+ try:
+ for key, value in new_env.items():
+ if key in os.environ:
+ old_values[key] = os.environ[key]
+ else:
+ new_keys.append(key)
+ os.environ[key] = str(value)
+ yield
+ finally:
+ # Restore / clean up
+ for key, value in old_values.items():
+ os.environ[key] = value
+ for key in new_keys:
+ os.environ.pop(key, None)
def launch_lm_eval(eval_config, tp_size):
@@ -31,24 +60,28 @@ def launch_lm_eval(eval_config, tp_size):
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len},"
+ "allow_deprecated_quantization=True,"
)
- results = lm_eval.simple_evaluate(
- model=backend,
- model_args=model_args,
- tasks=[task["name"] for task in eval_config["tasks"]],
- num_fewshot=eval_config["num_fewshot"],
- limit=eval_config["limit"],
- # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
- # text models. however, this is regressing measured strict-match for
- # existing text models in CI, so only apply it for mm, or explicitly set
- apply_chat_template=eval_config.get(
- "apply_chat_template", backend == "vllm-vlm"
- ),
- fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
- # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
- gen_kwargs=eval_config.get("gen_kwargs"),
- batch_size=batch_size,
- )
+
+ env_vars = eval_config.get("env_vars", None)
+ with scoped_env_vars(env_vars):
+ results = lm_eval.simple_evaluate(
+ model=backend,
+ model_args=model_args,
+ tasks=[task["name"] for task in eval_config["tasks"]],
+ num_fewshot=eval_config["num_fewshot"],
+ limit=eval_config["limit"],
+ # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+ # text models. however, this is regressing measured strict-match for
+ # existing text models in CI, so only apply it for mm, or explicitly set
+ apply_chat_template=eval_config.get(
+ "apply_chat_template", backend == "vllm-vlm"
+ ),
+ fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+ # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+ gen_kwargs=eval_config.get("gen_kwargs"),
+ batch_size=batch_size,
+ )
return results
@@ -57,6 +90,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size)
+ rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
@@ -64,8 +99,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]]
print(
f"{task['name']} | {metric['name']}: "
- f"ground_truth={ground_truth} | measured={measured_value}"
+ f"ground_truth={ground_truth:.3f} | "
+ f"measured={measured_value:.3f} | rtol={rtol}"
)
- success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+ success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 6d494f64f1..289877e504 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
## Performance benchmark quick overview
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
**Benchmarking Duration**: about 1hr.
@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Runtime environment variables:
-- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
@@ -34,8 +34,9 @@ Runtime environment variables:
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
->
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
+
### Latency test
Here is an example of one test inside `latency-tests.json`:
@@ -108,6 +109,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+
+ An Example of default parameters field
+
+```json
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "server_environment_variables": {
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+ },
+ "server_parameters": {
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "block_size": 128,
+ "disable_log_stats": "",
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "num_prompts": 200,
+ "ignore-eos": ""
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama3B_tp2_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 2,
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ }
+ },
+ {
+ "test_name": "serving_qwen3_tp4_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-14B",
+ "tensor_parallel_size": 4,
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-14B",
+ }
+ },
+ ]
+}
+```
+
+
+
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
@@ -116,19 +176,6 @@ If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
-The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
-When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-
-| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
-| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
+#### Performance Results Comparison
-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-
+Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
index c8bf7b0453..b3d0a2d3bb 100644
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -1,8 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
import argparse
+import html as _html
import json
import os
+from dataclasses import dataclass
from importlib import util
import pandas as pd
@@ -10,27 +15,49 @@
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
-
+DEFAULT_INFO_COLS = [
+ "Model",
+ "Dataset Name",
+ "Input Len",
+ "Output Len",
+ # "TP Size",
+ # "PP Size",
+ "# of max concurrency.",
+ "qps",
+]
+
+# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
+pd.set_option("display.precision", 2)
+pd.set_option("display.float_format", lambda x: f"{x:.2f}")
+
+
+# -----------------------------
+# Core data compare
+# -----------------------------
def compare_data_columns(
- files, name_column, data_column, info_cols, drop_column, debug=False
+ files: list[str],
+ name_column: str,
+ data_column: str,
+ info_cols: list[str],
+ drop_column: str,
+ debug: bool = False,
):
"""
Align concatenation by keys derived from info_cols instead of row order.
- Pick one canonical key list: subset of info_cols present in ALL files.
- For each file: set index to those keys, aggregate duplicates
- - (mean for metric, first for names).
+ (mean for metric, first for names).
- Concat along axis=1 (indexes align), then reset_index so callers can
- - group by columns.
+ group by columns.
- If --debug, add a _name column per file.
"""
print("\ncompare_data_column:", data_column)
frames = []
- raw_data_cols = []
+ raw_data_cols: list[str] = []
compare_frames = []
- # 1) choose a canonical key list from info_cols that exists in ALL files
- cols_per_file = []
+ cols_per_file: list[set] = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
@@ -40,24 +67,20 @@ def compare_data_columns(
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
if not key_cols:
- # soft fallback: use any info_cols present in the first file
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
if not key_cols:
raise ValueError(
"No common key columns found from info_cols across the input files."
)
- # 2) build a single "meta" block (keys as columns) once, aligned by the key index
meta_added = False
for file in files:
df = pd.read_json(file, orient="records")
- # Keep rows that actually have the compared metric (same as original behavior)
if drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
- # Stabilize numeric key columns (harmless if missing)
for c in (
"Input Len",
"Output Len",
@@ -69,32 +92,26 @@ def compare_data_columns(
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors="coerce")
- # Ensure all key columns exist
for c in key_cols:
if c not in df.columns:
df[c] = pd.NA
- # Set index = key_cols and aggregate duplicates → unique MultiIndex
df_idx = df.set_index(key_cols, drop=False)
- # meta (key columns), unique per key
meta = df_idx[key_cols]
if not meta.index.is_unique:
meta = meta.groupby(level=key_cols, dropna=False).first()
- # metric series for this file, aggregated to one row per key
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
- s.name = file_label # column label like original
+ s.name = file_label
- # add meta once (from first file) so keys are the leftmost columns
if not meta_added:
frames.append(meta)
meta_added = True
- # (NEW) debug: aligned test-name column per file
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
@@ -106,26 +123,19 @@ def compare_data_columns(
raw_data_cols.append(file_label)
compare_frames.append(s)
- # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
if "P99" in data_column or "Median" in data_column:
- ratio = base / current # for latency
+ ratio = base / current
else:
ratio = current / base
- ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
+ ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
- # 4) concat on columns with aligned MultiIndex;
- # then reset_index to return keys as columns
- concat_df = pd.concat(frames, axis=1)
- concat_df = concat_df.reset_index(drop=True).reset_index()
- if "index" in concat_df.columns:
- concat_df = concat_df.drop(columns=["index"])
+ concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
- # Ensure key/info columns appear first (in your info_cols order)
front = [c for c in info_cols if c in concat_df.columns]
rest = [c for c in concat_df.columns if c not in front]
concat_df = concat_df[front + rest]
@@ -134,20 +144,15 @@ def compare_data_columns(
return concat_df, raw_data_cols
+# -----------------------------
+# Split helper
+# -----------------------------
def split_json_by_tp_pp(
input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
- """
- Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
- Creates: /tp{TP}_pp{PP}/benchmark_results.json
- Returns: list of file paths written.
- """
- # Load JSON data into DataFrame
with open(input_file, encoding="utf-8") as f:
data = json.load(f)
- # If the JSON is a dict with a list under common keys, use that list
if isinstance(data, dict):
for key in ("results", "serving_results", "benchmarks", "data"):
if isinstance(data.get(key), list):
@@ -156,7 +161,6 @@ def split_json_by_tp_pp(
df = pd.DataFrame(data)
- # Keep only "serving" tests
name_col = next(
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
)
@@ -165,7 +169,6 @@ def split_json_by_tp_pp(
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
].copy()
- # Handle alias column names
rename_map = {
"tp_size": "TP Size",
"tensor_parallel_size": "TP Size",
@@ -176,21 +179,14 @@ def split_json_by_tp_pp(
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
)
- # Ensure TP/PP columns exist (default to 1 if missing)
if "TP Size" not in df.columns:
df["TP Size"] = 1
if "PP Size" not in df.columns:
df["PP Size"] = 1
- # make sure TP/PP are numeric ints with no NaN
- df["TP Size"] = (
- pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
- )
- df["PP Size"] = (
- pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
- )
+ df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
+ df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
- # Split into separate folders
saved_paths: list[str] = []
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
@@ -203,32 +199,9 @@ def split_json_by_tp_pp(
return saved_paths
-def _add_limit_line(fig, y_value, label):
- # Visible dashed line + annotation
- fig.add_hline(
- y=y_value,
- line_dash="dash",
- line_color="red" if "ttft" in label.lower() else "blue",
- annotation_text=f"{label}: {y_value} ms",
- annotation_position="top left",
- )
- # Optional: add a legend item (as a transparent helper trace)
- if plot and plotly_found:
- import plotly.graph_objects as go
-
- fig.add_trace(
- go.Scatter(
- x=[None],
- y=[None],
- mode="lines",
- line=dict(
- dash="dash", color="red" if "ttft" in label.lower() else "blue"
- ),
- name=f"{label}",
- )
- )
-
-
+# -----------------------------
+# Styling helpers
+# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
@@ -239,7 +212,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
]:
if c in df.columns:
return c
- # Fallback: guess an integer-like column (harmless if unused)
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
@@ -248,8 +220,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
def _highlight_threshold(
df: pd.DataFrame, threshold: float
-) -> "pd.io.formats.style.Styler":
- """Highlight numeric per-configuration columns with value <= threshold."""
+) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
c
@@ -260,6 +231,7 @@ def _highlight_threshold(
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
@@ -268,7 +240,264 @@ def _highlight_threshold(
)
-if __name__ == "__main__":
+def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
+ ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
+ if not ratio_cols:
+ return styler
+
+ styler = styler.apply(
+ lambda _: ["background-color: #fff3b0"] * len(styler.data),
+ subset=ratio_cols,
+ axis=0,
+ )
+
+ styler = styler.set_table_styles(
+ [
+ {
+ "selector": f"th.col_heading.level0.col{i}",
+ "props": [("background-color", "#fff3b0")],
+ }
+ for i, col in enumerate(styler.data.columns)
+ if col in ratio_cols
+ ],
+ overwrite=False,
+ )
+ return styler
+
+
+def _apply_two_decimals(
+ styler: pd.io.formats.style.Styler,
+) -> pd.io.formats.style.Styler:
+ df = styler.data
+ num_cols = df.select_dtypes("number").columns
+ if len(num_cols) == 0:
+ return styler
+ return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
+
+
+# -----------------------------
+# Valid max concurrency summary helpers
+# -----------------------------
+def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
+ key_cols = [
+ c
+ for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
+ if c in df.columns
+ ]
+ exclude = set(key_cols + [conc_col, "qps", "QPS"])
+
+ cols: list[str] = []
+ for c in df.columns:
+ if c in exclude:
+ continue
+ lc = str(c).lower()
+ if lc.startswith("ratio"):
+ continue
+ if lc.endswith("_name") or lc == "test name" or lc == "test_name":
+ continue
+ if pd.api.types.is_numeric_dtype(df[c]):
+ cols.append(c)
+ return cols
+
+
+def _max_concurrency_ok(
+ df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+):
+ if df is None or conc_col not in df.columns or cfg_col not in df.columns:
+ return pd.NA
+
+ d = df[[conc_col, cfg_col]].copy()
+ d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+ d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+ d = d.dropna(subset=[conc_col, cfg_col])
+
+ if d.empty:
+ return pd.NA
+
+ ok = d[d[cfg_col] <= threshold]
+ if ok.empty:
+ return pd.NA
+
+ return ok[conc_col].max()
+
+
+def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
+ if (
+ df is None
+ or conc_col not in df.columns
+ or cfg_col not in df.columns
+ or pd.isna(conc_value)
+ ):
+ return pd.NA
+
+ d = df[[conc_col, cfg_col]].copy()
+ d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+ d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+
+ conc_value = pd.to_numeric(conc_value, errors="coerce")
+ if pd.isna(conc_value):
+ return pd.NA
+
+ hit = d[d[conc_col] == conc_value]
+ if hit.empty:
+ return pd.NA
+ return hit[cfg_col].iloc[0]
+
+
+def build_valid_max_concurrency_summary_html(
+ tput_group_df: pd.DataFrame | None,
+ ttft_group_df: pd.DataFrame | None,
+ tpot_group_df: pd.DataFrame | None,
+ conc_col: str,
+ args,
+) -> str:
+ if ttft_group_df is None and tpot_group_df is None:
+ return ""
+
+ ttft_cols = (
+ _config_value_columns(ttft_group_df, conc_col)
+ if ttft_group_df is not None
+ else []
+ )
+ tpot_cols = (
+ _config_value_columns(tpot_group_df, conc_col)
+ if tpot_group_df is not None
+ else []
+ )
+ tput_cols = (
+ _config_value_columns(tput_group_df, conc_col)
+ if tput_group_df is not None
+ else []
+ )
+
+ if ttft_group_df is not None and tpot_group_df is not None:
+ cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+ if tput_group_df is not None:
+ cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
+ else:
+ cfg_cols = ttft_cols or tpot_cols
+
+ if not cfg_cols:
+ cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+ rows = []
+ for cfg in cfg_cols:
+ ttft_max = (
+ _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+ if ttft_group_df is not None
+ else pd.NA
+ )
+ tpot_max = (
+ _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+ if tpot_group_df is not None
+ else pd.NA
+ )
+ both = (
+ pd.NA
+ if (pd.isna(ttft_max) or pd.isna(tpot_max))
+ else min(ttft_max, tpot_max)
+ )
+
+ tput_at_both = (
+ _value_at_concurrency(tput_group_df, conc_col, cfg, both)
+ if tput_group_df is not None
+ else pd.NA
+ )
+ ttft_at_both = (
+ _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
+ if ttft_group_df is not None
+ else pd.NA
+ )
+ tpot_at_both = (
+ _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
+ if tpot_group_df is not None
+ else pd.NA
+ )
+
+ rows.append(
+ {
+ "Configuration": cfg,
+ f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+ f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+ f"Max {conc_col} (Both)": both,
+ "Output Tput @ Both (tok/s)": tput_at_both,
+ "TTFT @ Both (ms)": ttft_at_both,
+ "TPOT @ Both (ms)": tpot_at_both,
+ }
+ )
+
+ summary_df = pd.DataFrame(rows)
+
+ # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
+ for c in summary_df.columns:
+ if c == "Configuration":
+ continue
+ summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
+
+ both_col = f"Max {conc_col} (Both)"
+
+ # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
+ formatters = {}
+ for c in summary_df.columns:
+ if c == "Configuration":
+ continue
+ # default argument binds per-column formatter correctly
+ formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
+
+ styler = summary_df.style.format(formatters)
+
+ def _green(v):
+ return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
+
+ if both_col in summary_df.columns:
+ styler = styler.map(_green, subset=[both_col])
+
+ title = (
+ ''
+ "Valid Max Concurrency Summary"
+ "
\n"
+ )
+ return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+# -----------------------------
+# Plot helper
+# -----------------------------
+def _add_limit_line(fig, y_value: float, label: str):
+ fig.add_hline(
+ y=y_value,
+ line_dash="dash",
+ line_color="red" if "ttft" in label.lower() else "blue",
+ annotation_text=f"{label}: {y_value} ms",
+ annotation_position="top left",
+ )
+ if plotly_found:
+ import plotly.graph_objects as go
+
+ fig.add_trace(
+ go.Scatter(
+ x=[None],
+ y=[None],
+ mode="lines",
+ line=dict(
+ dash="dash",
+ color="red" if "ttft" in label.lower() else "blue",
+ ),
+ name=label,
+ )
+ )
+
+
+# -----------------------------
+# Refactored main + group-first report
+# -----------------------------
+@dataclass(frozen=True)
+class MetricPlan:
+ data_cols: list[str]
+ drop_column: str
+
+
+def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument(
"-f", "--file", action="append", type=str, help="input file name"
@@ -308,149 +537,289 @@ def _highlight_threshold(
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
+ return parser
- args = parser.parse_args()
+def choose_metrics(latency: str) -> MetricPlan:
+ latency = (latency or "").lower()
drop_column = "P99"
- name_column = "Test name"
- info_cols = [
- "Model",
- "Dataset Name",
- "Input Len",
- "Output Len",
- "TP Size",
- "PP Size",
- "# of max concurrency.",
- "qps",
- ]
- if "median" in args.latency:
- data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
- html_msgs_for_data_cols = [
- "Compare Output Tokens /n",
- "Median TTFT /n",
- "Median TPOT /n",
- ]
- drop_column = "P99"
- elif "p99" in args.latency:
- data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
- html_msgs_for_data_cols = [
- "Compare Output Tokens /n",
- "P99 TTFT /n",
- "P99 TPOT /n",
- ]
+ if "median" in latency:
+ return MetricPlan(
+ data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
+ drop_column=drop_column,
+ )
+
+ return MetricPlan(
+ data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
+ drop_column=drop_column,
+ )
+
+
+def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
+ if not args.file:
+ raise ValueError("No input files provided. Use -f/--file.")
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
else:
files = args.file
+
+ return files, info_cols
+
+
+def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
+ y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
+ return info_cols[y_axis_index]
+
+
+def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
+ filtered_info_cols = info_cols[:4]
+ group_cols = [c for c in filtered_info_cols if c in output_df.columns]
+ if not group_cols:
+ raise ValueError(
+ f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
+ f"but DataFrame has: {list(output_df.columns)}"
+ )
+ return group_cols
+
+
+def normalize_group_key(name):
+ return name if isinstance(name, tuple) else (name,)
+
+
+def group_filename(name, prefix: str = "perf_comparison_") -> str:
+ name_vals = normalize_group_key(name)
+ safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
+ return f"{prefix}{safe}.html"
+
+
+def build_group_suffix(group_cols: list[str], name) -> str:
+ name_vals = normalize_group_key(name)
+ return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
+
+
+def render_metric_table_html(
+ display_group: pd.DataFrame,
+ metric_label: str,
+ group_suffix: str,
+ args,
+) -> str:
+ title = (
+ f''
+ f"{_html.escape(metric_label)}"
+ f" — {_html.escape(group_suffix)}"
+ f"
\n"
+ )
+
+ metric_name = metric_label.lower()
+ if "ttft" in metric_name:
+ styler = _highlight_threshold(display_group, args.ttft_max_ms)
+ elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+ styler = _highlight_threshold(display_group, args.tpot_max_ms)
+ else:
+ styler = display_group.style
+
+ styler = _apply_two_decimals(styler)
+ styler = highlight_ratio_columns(styler)
+
+ return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+def maybe_write_plot(
+ main_fh,
+ sub_fh,
+ group_df: pd.DataFrame,
+ raw_data_cols: list[str],
+ metric_label: str,
+ y_axis_col: str,
+ args,
+):
+ if not (args.plot and plotly_found):
+ return
+
+ import plotly.express as px
+
+ df = group_df[raw_data_cols].sort_values(by=y_axis_col)
+ df_melted = df.melt(
+ id_vars=y_axis_col,
+ var_name="Configuration",
+ value_name=metric_label,
+ )
+
+ fig = px.line(
+ df_melted,
+ x=y_axis_col,
+ y=metric_label,
+ color="Configuration",
+ title=f"{metric_label} vs {y_axis_col}",
+ markers=True,
+ )
+
+ # Ensure plot hover + y tick labels are also 2 decimals.
+ fig.update_traces(hovertemplate="%{y:.2f}")
+ fig.update_yaxes(tickformat=".2f")
+
+ metric_name = metric_label.lower()
+ if "ttft" in metric_name:
+ _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+ elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+ _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+ html = fig.to_html(full_html=True, include_plotlyjs="cdn")
+ main_fh.write(html)
+ sub_fh.write(html)
+
+
+def build_group_keys(
+ df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
+):
+ if sort_cols:
+ df = df.sort_values(by=sort_cols)
+ gb = df.groupby(group_cols, dropna=False)
+ return [k for k, _ in gb]
+
+
+def write_report_group_first(
+ files: list[str], info_cols: list[str], plan: MetricPlan, args
+):
+ name_column = "Test name"
+ y_axis_col = get_y_axis_col(info_cols, args.xaxis)
+
print("comparing : " + ", ".join(files))
- debug = args.debug
- plot = args.plot
- # For Plot feature, assign y axis from one of info_cols
- y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
- with open("perf_comparison.html", "w") as text_file:
- for i in range(len(data_cols_to_compare)):
- output_df, raw_data_cols = compare_data_columns(
- files,
- name_column,
- data_cols_to_compare[i],
- info_cols,
- drop_column,
- debug=debug,
+
+ metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
+ group_cols_canonical: list[str] | None = None
+
+ for metric_label in plan.data_cols:
+ output_df, raw_data_cols = compare_data_columns(
+ files,
+ name_column,
+ metric_label,
+ info_cols,
+ plan.drop_column,
+ debug=args.debug,
+ )
+
+ raw_data_cols = list(raw_data_cols)
+ raw_data_cols.insert(0, y_axis_col)
+
+ group_cols = get_group_cols(output_df, info_cols)
+ if group_cols_canonical is None:
+ group_cols_canonical = group_cols
+ else:
+ group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
+
+ metric_cache[metric_label] = (
+ output_df.sort_values(by=args.xaxis),
+ raw_data_cols,
+ )
+
+ if not group_cols_canonical:
+ raise ValueError("No canonical group columns found across metrics.")
+
+ first_metric = plan.data_cols[0]
+ first_df_sorted, _ = metric_cache[first_metric]
+ group_keys = build_group_keys(
+ first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
+ )
+
+ metric_groupbys = {
+ metric_label: df.groupby(group_cols_canonical, dropna=False)
+ for metric_label, (df, _) in metric_cache.items()
+ }
+
+ with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+ main_fh.write('\n')
+ for gkey in group_keys:
+ gkey_tuple = normalize_group_key(gkey)
+ suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+ sub_path = group_filename(gkey_tuple)
+ group_header = (
+ ''
+ f"{_html.escape(suffix)}"
+ "
\n"
)
- # For Plot feature, insert y axis from one of info_cols
- raw_data_cols.insert(0, info_cols[y_axis_index])
-
- filtered_info_cols = info_cols[:-2]
- existing_group_cols = [
- c for c in filtered_info_cols if c in output_df.columns
- ]
- if not existing_group_cols:
- raise ValueError(
- f"No valid group-by columns "
- f"Expected subset: {filtered_info_cols}, "
- f"but DataFrame has: {list(output_df.columns)}"
- )
- # output_df_sorted = output_df.sort_values(by=existing_group_cols)
- output_df_sorted = output_df.sort_values(by=args.xaxis)
- output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
- for name, group in output_groups:
- group_name = (
- ",".join(map(str, name)).replace(",", "_").replace("/", "-")
- )
- group_html_name = "perf_comparison_" + group_name + ".html"
-
- metric_name = str(data_cols_to_compare[i]).lower()
- if "tok/s" in metric_name:
- html = group.to_html()
- elif "ttft" in metric_name:
- styler = _highlight_threshold(group, args.ttft_max_ms).format(
- {c: "{:.2f}" for c in group.select_dtypes("number").columns},
- na_rep="—",
- )
- html = styler.to_html(
- table_attributes='border="1" class="dataframe"'
+ main_fh.write(group_header)
+ with open(sub_path, "w", encoding="utf-8") as sub_fh:
+ sub_fh.write('\n')
+ sub_fh.write(group_header)
+ tput_group_df = None
+ ttft_group_df = None
+ tpot_group_df = None
+ conc_col = args.xaxis
+
+ for metric_label in plan.data_cols:
+ gb = metric_groupbys[metric_label]
+ df_sorted, raw_data_cols = metric_cache[metric_label]
+
+ try:
+ group_df = gb.get_group(gkey)
+ except KeyError:
+ missing = (
+ ''
+ f"{_html.escape(metric_label)} — missing for this group"
+ "
\n"
+ )
+
+ main_fh.write(missing)
+ sub_fh.write(missing)
+ continue
+
+ if conc_col not in group_df.columns:
+ conc_col = _find_concurrency_col(group_df)
+
+ mn = metric_label.lower().strip()
+ if "tok/s" in mn:
+ tput_group_df = group_df
+ elif "ttft" in mn:
+ ttft_group_df = group_df
+ elif mn in ("p99", "median") or "tpot" in mn:
+ tpot_group_df = group_df
+
+ display_group = group_df.drop(
+ columns=group_cols_canonical, errors="ignore"
)
- elif (
- "tpot" in metric_name
- or "median" in metric_name
- or "p99" in metric_name
- ):
- styler = _highlight_threshold(group, args.tpot_max_ms).format(
- {c: "{:.2f}" for c in group.select_dtypes("number").columns},
- na_rep="—",
+
+ html = render_metric_table_html(
+ display_group, metric_label, suffix, args
)
- html = styler.to_html(
- table_attributes='border="1" class="dataframe"'
+ main_fh.write(html)
+ sub_fh.write(html)
+
+ maybe_write_plot(
+ main_fh,
+ sub_fh,
+ group_df=group_df,
+ raw_data_cols=raw_data_cols,
+ metric_label=metric_label,
+ y_axis_col=y_axis_col,
+ args=args,
)
- text_file.write(html_msgs_for_data_cols[i])
- text_file.write(html)
- with open(group_html_name, "a+") as sub_text_file:
- sub_text_file.write(html_msgs_for_data_cols[i])
- sub_text_file.write(html)
-
- if plot and plotly_found:
- import plotly.express as px
-
- df = group[raw_data_cols]
- df_sorted = df.sort_values(by=info_cols[y_axis_index])
- # Melt DataFrame for plotting
- df_melted = df_sorted.melt(
- id_vars=info_cols[y_axis_index],
- var_name="Configuration",
- value_name=data_cols_to_compare[i],
- )
- title = (
- data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
- )
- # Create Plotly line chart
- fig = px.line(
- df_melted,
- x=info_cols[y_axis_index],
- y=data_cols_to_compare[i],
- color="Configuration",
- title=title,
- markers=True,
- )
+ summary_html = build_valid_max_concurrency_summary_html(
+ tput_group_df=tput_group_df,
+ ttft_group_df=ttft_group_df,
+ tpot_group_df=tpot_group_df,
+ conc_col=conc_col,
+ args=args,
+ )
+ if summary_html:
+ main_fh.write(summary_html)
+ sub_fh.write(summary_html)
- # ---- Add threshold lines based on metric name ----
- if "ttft" in metric_name:
- _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
- elif (
- "tpot" in metric_name
- or "median" in metric_name
- or "p99" in metric_name
- ):
- _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
- # Export to HTML
- text_file.write(
- fig.to_html(full_html=True, include_plotlyjs="cdn")
- )
- sub_text_file.write(
- fig.to_html(full_html=True, include_plotlyjs="cdn")
- )
+
+def main():
+ args = build_parser().parse_args()
+ info_cols = list(DEFAULT_INFO_COLS)
+ plan = choose_metrics(args.latency)
+ files, info_cols = prepare_input_files(args, info_cols)
+ write_report_group_first(files, info_cols, plan, args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
old mode 100644
new mode 100755
index 99a5a5e334..6b6a7e472b
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -49,7 +49,11 @@ check_cpus() {
echo "Need at least 1 NUMA to run benchmarking."
exit 1
fi
- declare -g gpu_type="cpu"
+ if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+ declare -g gpu_type="arm64-cpu"
+ else
+ declare -g gpu_type="cpu"
+ fi
echo "GPU type is $gpu_type"
}
@@ -110,7 +114,8 @@ json2envs() {
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
- timeout 1200 bash -c '
+ local timeout_val="1200"
+ timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
@@ -206,8 +211,8 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ]; then
- pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+ if [[ "$ON_CPU" == "1" ]]; then
+ pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -275,8 +280,8 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ]; then
- pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+ if [[ "$ON_CPU" == "1" ]]; then
+ pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -316,12 +321,44 @@ run_throughput_tests() {
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
+ #
+ # Supported JSON formats:
+ # 1) Plain format: top-level array
+ # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ #
+ # 2) Default parameters field + plain format tests
+ # {
+ # "defaults": { ... },
+ # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ # }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
- jq -c '.[]' "$serving_test_file" | while read -r params; do
+ jq -c '
+ if type == "array" then
+ # Plain format: test cases array
+ .[]
+ elif (type == "object" and has("tests")) then
+ # merge the default parameters into each test cases
+ . as $root
+ | ($root.defaults // {}) as $d
+ | ($root.tests // [])[]
+ # default qps / max_concurrency from defaults if missing
+ | .qps_list = (.qps_list // $d.qps_list)
+ | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+ # merge envs / params: test overrides defaults
+ | .server_environment_variables =
+ (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+ | .server_parameters =
+ (($d.server_parameters // {}) + (.server_parameters // {}))
+ | .client_parameters =
+ (($d.client_parameters // {}) + (.client_parameters // {}))
+ else
+ error("Unsupported serving test file format: must be array or object with .tests")
+ end
+ ' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,28 +372,33 @@ run_serving_tests() {
continue
fi
- # get client and server arguments
+ # get client and server arguments (after merged the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
+
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
+
+ # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+
+ # max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
- num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
- max_concurrency_list="[$num_prompts]"
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ]; then
- pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+ if [[ "$ON_CPU" == "1" ]]; then
+ pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -458,9 +500,9 @@ run_serving_tests() {
main() {
local ARCH
ARCH=''
- if [ "$ON_CPU" == "1" ];then
- check_cpus
- ARCH='-cpu'
+ if [[ "$ON_CPU" == "1" ]]; then
+ check_cpus
+ ARCH="-$gpu_type"
else
check_gpus
ARCH="$arch_suffix"
diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
new file mode 100644
index 0000000000..fba695041e
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
@@ -0,0 +1,26 @@
+[
+ {
+ "test_name": "latency_llama8B_tp1",
+ "environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "load_format": "dummy",
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "num_iters_warmup": 5,
+ "num_iters": 15
+ }
+ }
+]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
new file mode 100644
index 0000000000..63f1f8ab88
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -0,0 +1,130 @@
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [
+ 12,
+ 16,
+ 24,
+ 32,
+ 64,
+ 128,
+ 200
+ ],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
deleted file mode 100644
index f758097e09..0000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ /dev/null
@@ -1,610 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
deleted file mode 100644
index 0b1a42e790..0000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ /dev/null
@@ -1,1023 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index f792956f39..25ed7415ec 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,276 +1,283 @@
-[
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
{
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "test_name": "serving_llama3B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_granite2B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_qwen4B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_qwen8B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_glm9B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama8B_tp2_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "test_name": "serving_gemma7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-7b",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "google/gemma-7b",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
}
-]
+ ]
+}
diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
new file mode 100644
index 0000000000..da84dd4d0c
--- /dev/null
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
@@ -0,0 +1,27 @@
+[
+ {
+ "test_name": "throughput_llama8B_tp1",
+ "environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "load_format": "dummy",
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200,
+ "backend": "vllm"
+ }
+ }
+]
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1f..a9427a9366 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,6 +1,6 @@
steps:
# aarch64 + CUDA builds
- - label: "Build arm64 wheel - CUDA 12.9"
+ - label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
@@ -8,15 +8,30 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
+
+ - label: "Build wheel - aarch64 - CUDA 13.0"
+ depends_on: ~
+ id: build-wheel-arm64-cuda-13-0
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- - label: "Build arm64 CPU wheel"
+ - label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
@@ -25,52 +40,53 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- - label: "Build wheel - CUDA 12.8"
+ - label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
- id: build-wheel-cuda-12-8
+ id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- - label: "Build wheel - CUDA 12.9"
+ - label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
- id: build-wheel-cuda-12-9
+ id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- - label: "Build wheel - CUDA 13.0"
+ # x86 CPU wheel build
+ - label: "Build wheel - x86_64 - CPU"
depends_on: ~
- id: build-wheel-cuda-13-0
+ id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- # Build release images (12.9)
- - label: "Build release image (x86)"
+ # Build release images (CUDA 12.9)
+ - label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
@@ -83,7 +99,7 @@ steps:
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- - label: "Build release image (arm64)"
+ - label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
@@ -93,35 +109,93 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
- # Add job to create multi-arch manifest
- - label: "Create multi-arch manifest"
+ - label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
- queue: cpu_queue_postmerge
+ queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- - label: "Annotate release workflow"
+ - label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
- - build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
- queue: cpu_queue_postmerge
+ queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
+ - block: "Build CUDA 13.0 release images"
+ key: block-release-image-build-cuda-13-0
+ depends_on: ~
+
+ - label: "Build release image - x86_64 - CUDA 13.0"
+ depends_on: block-release-image-build-cuda-13-0
+ id: build-release-image-x86-cuda-13-0
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+ # re-tag to default image tag and push, just in case arm64 build fails
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+ - label: "Build release image - aarch64 - CUDA 13.0"
+ depends_on: block-release-image-build-cuda-13-0
+ id: build-release-image-arm64-cuda-13-0
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+
+ - label: "Create multi-arch manifest - CUDA 13.0"
+ depends_on:
+ - build-release-image-x86-cuda-13-0
+ - build-release-image-arm64-cuda-13-0
+ id: create-multi-arch-manifest-cuda-13-0
+ agents:
+ queue: small_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: release-version
+ - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+ key: block-upload-release-wheels
+ depends_on:
+ - input-release-version
+ - build-wheel-x86-cuda-12-9
+ - build-wheel-x86-cuda-13-0
+ - build-wheel-x86-cpu
+ - build-wheel-arm64-cuda-12-9
+ - build-wheel-arm64-cuda-13-0
+ - build-wheel-arm64-cpu
+
+ - label: "Upload release wheels to PyPI and GitHub"
+ depends_on:
+ - block-upload-release-wheels
+ id: upload-release-wheels
+ agents:
+ queue: small_cpu_queue_postmerge
+ commands:
+ - "bash .buildkite/scripts/upload-release-wheels.sh"
+
- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
@@ -154,24 +228,31 @@ steps:
env:
DOCKER_BUILDKIT: "1"
+ - block: "Build ROCm release image"
+ key: block-rocm-release-image-build
+ depends_on: ~
+
+ - label: "Build release image (ROCm)"
+ depends_on: block-rocm-release-image-build
+ id: build-release-image-rocm
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ # Build base image first
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
+ # Build vLLM ROCm image using the base
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
+
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
- queue: cpu_queue_postmerge
+ queue: small_cpu_queue_postmerge
commands:
- - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- - "docker push vllm/vllm-openai:nightly-x86_64"
- - "docker push vllm/vllm-openai:nightly-aarch64"
- - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- - "docker manifest push vllm/vllm-openai:nightly"
- - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
@@ -181,3 +262,384 @@ steps:
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
+
+ - label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
+ depends_on:
+ - create-multi-arch-manifest-cuda-13-0
+ if: build.env("NIGHTLY") == "1"
+ agents:
+ queue: small_cpu_queue_postmerge
+ commands:
+ - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+ # Clean up old nightly builds (keep only last 14)
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
+ env:
+ DOCKER_BUILDKIT: "1"
+ DOCKERHUB_USERNAME: "vllmbot"
+
+
+ # =============================================================================
+ # ROCm Release Pipeline (x86_64 only)
+ # =============================================================================
+ #
+ # vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
+ # To build a specific version, trigger the build from that branch/tag.
+ #
+ # Environment variables for ROCm builds (set via Buildkite UI or schedule):
+ # ROCM_PYTHON_VERSION: Python version (default: 3.12)
+ # PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
+ # ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
+ # ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
+ #
+ # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
+ # (currently rocm/dev-ubuntu-22.04:7.1-complete)
+ #
+ # =============================================================================
+
+ # ROCm Input Step - Collect build configuration (manual trigger only)
+ - input: "ROCm Wheel Release Build Configuration"
+ key: input-rocm-config
+ depends_on: ~
+ if: build.source == "ui"
+ fields:
+ - text: "Python Version"
+ key: "rocm-python-version"
+ default: "3.12"
+ hint: "Python version (e.g., 3.12)"
+ - text: "GPU Architectures"
+ key: "rocm-pytorch-rocm-arch"
+ default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
+ hint: "Semicolon-separated GPU architectures"
+ - select: "Upload Wheels to S3"
+ key: "rocm-upload-wheels"
+ default: "true"
+ options:
+ - label: "No - Build only (nightly/dev)"
+ value: "false"
+ - label: "Yes - Upload to S3 (release)"
+ value: "true"
+ - select: "Force Rebuild Base Wheels"
+ key: "rocm-force-rebuild"
+ default: "false"
+ hint: "Ignore S3 cache and rebuild base wheels from scratch"
+ options:
+ - label: "No - Use cached wheels if available"
+ value: "false"
+ - label: "Yes - Rebuild even if cache exists"
+ value: "true"
+
+ # ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
+ - label: ":rocm: Build ROCm Base Wheels"
+ id: build-rocm-base-wheels
+ depends_on:
+ - step: input-rocm-config
+ allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ # Set configuration and check cache
+ - |
+ set -euo pipefail
+
+ # Get values from meta-data (set by input step) or use defaults
+ PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
+ export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
+
+ PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+ export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+ # Check for force rebuild flag
+ ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
+ if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
+ ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
+ fi
+
+ echo "========================================"
+ echo "ROCm Base Wheels Build Configuration"
+ echo "========================================"
+ echo " PYTHON_VERSION: $${PYTHON_VERSION}"
+ echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+ echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
+ echo "========================================"
+
+ # Save resolved config for later jobs
+ buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
+ buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
+
+ # Check S3 cache for pre-built wheels
+ CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
+ CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
+ echo ""
+ echo "Cache key: $${CACHE_KEY}"
+ echo "Cache path: $${CACHE_PATH}"
+
+ # Save cache key for downstream jobs
+ buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
+
+ CACHE_STATUS="miss"
+ if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
+ CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
+ else
+ echo "Force rebuild requested, skipping cache check"
+ fi
+
+ if [ "$${CACHE_STATUS}" = "hit" ]; then
+ echo ""
+ echo "CACHE HIT! Downloading pre-built wheels..."
+ echo ""
+ .buildkite/scripts/cache-rocm-base-wheels.sh download
+
+ # Set the S3 path for the cached Docker image (for Job 2 to download)
+ S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+ buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+ # Mark that we used cache (for Docker image handling)
+ buildkite-agent meta-data set "rocm-used-cache" "true"
+
+ echo ""
+ echo "Cache download complete. Skipping Docker build."
+ echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+ else
+ echo ""
+ echo "CACHE MISS. Building from scratch..."
+ echo ""
+
+ # Build full base image (for later vLLM build)
+ DOCKER_BUILDKIT=1 docker buildx build \
+ --file docker/Dockerfile.rocm_base \
+ --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
+ --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+ --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+ --build-arg SCCACHE_REGION_NAME=us-west-2 \
+ --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+ --load \
+ .
+
+ # Build debs_wheel_release stage for wheel extraction
+ DOCKER_BUILDKIT=1 docker buildx build \
+ --file docker/Dockerfile.rocm_base \
+ --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
+ --target debs_wheel_release \
+ --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+ --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+ --build-arg SCCACHE_REGION_NAME=us-west-2 \
+ --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+ --load \
+ .
+
+ # Extract wheels from Docker image
+ mkdir -p artifacts/rocm-base-wheels
+ container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
+ docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
+ docker rm $${container_id}
+ echo "Extracted base wheels:"
+ ls -lh artifacts/rocm-base-wheels/
+
+ # Upload wheels to S3 cache for future builds
+ echo ""
+ echo "Uploading wheels to S3 cache..."
+ .buildkite/scripts/cache-rocm-base-wheels.sh upload
+
+ # Export base Docker image for reuse in vLLM build
+ mkdir -p artifacts/rocm-docker-image
+ docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
+ echo "Docker image size:"
+ ls -lh artifacts/rocm-docker-image/
+
+ # Upload large Docker image to S3 (also cached by cache key)
+ S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+ echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
+ aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+ # Save the S3 path for downstream jobs
+ buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+ # Mark that we did NOT use cache
+ buildkite-agent meta-data set "rocm-used-cache" "false"
+
+ echo ""
+ echo "Build complete. Wheels cached for future builds."
+ fi
+ artifact_paths:
+ - "artifacts/rocm-base-wheels/*.whl"
+ env:
+ DOCKER_BUILDKIT: "1"
+ S3_BUCKET: "vllm-wheels"
+
+ # ROCm Job 2: Build vLLM ROCm Wheel
+ - label: ":python: Build vLLM ROCm Wheel"
+ id: build-rocm-vllm-wheel
+ depends_on:
+ - step: build-rocm-base-wheels
+ allow_failure: false
+ agents:
+ queue: cpu_queue_postmerge
+ timeout_in_minutes: 180
+ commands:
+ # Download artifacts and prepare Docker image
+ - |
+ set -euo pipefail
+
+ # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
+ # This fixes version detection when tags are moved/force-pushed
+ echo "Fetching latest tags from origin..."
+ git fetch --tags --force origin
+
+ # Log tag information for debugging version detection
+ echo "========================================"
+ echo "Git Tag Verification"
+ echo "========================================"
+ echo "Current HEAD: $(git rev-parse HEAD)"
+ echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
+ echo ""
+ echo "Recent tags (pointing to commits near HEAD):"
+ git tag -l --sort=-creatordate | head -5
+ echo "setuptools_scm version detection:"
+ pip install -q setuptools_scm 2>/dev/null || true
+ python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
+ echo "========================================"
+
+ # Download wheel artifacts from current build
+ echo "Downloading wheel artifacts from current build"
+ buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+
+ # Download Docker image from S3 (too large for Buildkite artifacts)
+ DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+ if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+ echo "ERROR: rocm-docker-image-s3-path metadata not found"
+ echo "This should have been set by the build-rocm-base-wheels job"
+ exit 1
+ fi
+ echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
+ mkdir -p artifacts/rocm-docker-image
+ aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+ # Load base Docker image and capture the tag
+ echo "Loading base Docker image..."
+ LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+ echo "$${LOAD_OUTPUT}"
+ # Extract the actual loaded image tag from "Loaded image: " output
+ # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
+ BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+ if [ -z "$${BASE_IMAGE_TAG}" ]; then
+ echo "ERROR: Failed to extract image tag from docker load output"
+ echo "Load output was: $${LOAD_OUTPUT}"
+ exit 1
+ fi
+ echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+ # Prepare base wheels for Docker build context
+ mkdir -p docker/context/base-wheels
+ touch docker/context/base-wheels/.keep
+ cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
+ echo "Base wheels for vLLM build:"
+ ls -lh docker/context/base-wheels/
+
+ # Get GPU architectures from meta-data
+ PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+ PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+ echo "========================================"
+ echo "Building vLLM wheel with:"
+ echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
+ echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+ echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+ echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
+ echo "========================================"
+
+ # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
+ DOCKER_BUILDKIT=1 docker build \
+ --file docker/Dockerfile.rocm \
+ --target export_vllm_wheel_release \
+ --output type=local,dest=rocm-dist \
+ --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+ --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+ --build-arg REMOTE_VLLM=0 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+ --build-arg SCCACHE_REGION_NAME=us-west-2 \
+ --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+ .
+
+ echo "Built vLLM wheel:"
+ ls -lh rocm-dist/*.whl
+
+ # Copy wheel to artifacts directory
+ mkdir -p artifacts/rocm-vllm-wheel
+ cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
+ echo "Final vLLM wheel:"
+ ls -lh artifacts/rocm-vllm-wheel/
+ artifact_paths:
+ - "artifacts/rocm-vllm-wheel/*.whl"
+ env:
+ DOCKER_BUILDKIT: "1"
+ S3_BUCKET: "vllm-wheels"
+
+ # ROCm Job 3: Upload Wheels to S3
+ - label: ":s3: Upload ROCm Wheels to S3"
+ id: upload-rocm-wheels
+ depends_on:
+ - step: build-rocm-vllm-wheel
+ allow_failure: false
+ agents:
+ queue: cpu_queue_postmerge
+ timeout_in_minutes: 60
+ commands:
+ # Download all wheel artifacts and run upload
+ - |
+ set -euo pipefail
+
+ # Check if upload is enabled (from env var, meta-data, or release branch)
+ ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
+ if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
+ # Try to get from meta-data (input form)
+ ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
+ fi
+
+ echo "========================================"
+ echo "Upload check:"
+ echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
+ echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+ echo "========================================"
+
+ # Skip upload if not enabled
+ if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
+ echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
+ echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
+ exit 0
+ fi
+
+ echo "Upload enabled, proceeding..."
+
+ # Download artifacts from current build
+ echo "Downloading artifacts from current build"
+ buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+ buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
+
+ # Run upload script
+ bash .buildkite/scripts/upload-rocm-wheels.sh
+ env:
+ DOCKER_BUILDKIT: "1"
+ S3_BUCKET: "vllm-wheels"
+
+ # ROCm Job 4: Annotate ROCm Wheel Release
+ - label: ":memo: Annotate ROCm wheel release"
+ id: annotate-rocm-release
+ depends_on:
+ - step: upload-rocm-wheels
+ allow_failure: true
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "bash .buildkite/scripts/annotate-rocm-release.sh"
+ env:
+ S3_BUCKET: "vllm-wheels"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index df805e0850..d178fb8884 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -32,6 +32,7 @@ To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai:latest-rocm
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
diff --git a/.buildkite/scripts/annotate-rocm-release.sh b/.buildkite/scripts/annotate-rocm-release.sh
new file mode 100755
index 0000000000..fcc7c290ec
--- /dev/null
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Generate Buildkite annotation for ROCm wheel release
+
+set -ex
+
+# Get build configuration from meta-data
+# Extract ROCm version dynamically from Dockerfile.rocm_base
+# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
+ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
+PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
+PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+
+# S3 URLs
+S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
+S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
+S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
+ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
+
+buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
+## :rocm: ROCm Wheel Release
+
+### Build Configuration
+| Setting | Value |
+|---------|-------|
+| **ROCm Version** | ${ROCM_VERSION} |
+| **Python Version** | ${PYTHON_VERSION} |
+| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
+| **Branch** | \`${BUILDKITE_BRANCH}\` |
+| **Commit** | \`${BUILDKITE_COMMIT}\` |
+
+### :package: Installation
+
+**Install from this build (by commit):**
+\`\`\`bash
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
+
+# Example:
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
+\`\`\`
+
+**Install from nightly (if published):**
+\`\`\`bash
+uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
+\`\`\`
+
+### :floppy_disk: Download Wheels Directly
+
+\`\`\`bash
+# List all ROCm wheels
+aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
+
+# Download specific wheels
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
+\`\`\`
+
+### :gear: Included Packages
+- **vllm**: vLLM with ROCm support
+- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
+- **triton_rocm**: Triton built for ROCm
+- **torchvision**: TorchVision for ROCm PyTorch
+- **amdsmi**: AMD SMI Python bindings
+
+### :warning: Notes
+- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
+- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
+- Platform: Linux x86_64 only
+EOF
diff --git a/.buildkite/scripts/cache-rocm-base-wheels.sh b/.buildkite/scripts/cache-rocm-base-wheels.sh
new file mode 100755
index 0000000000..be24472502
--- /dev/null
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Cache helper for ROCm base wheels
+#
+# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
+# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
+#
+# Usage:
+# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
+# cache-rocm-base-wheels.sh upload - Upload wheels to cache
+# cache-rocm-base-wheels.sh download - Download wheels from cache
+# cache-rocm-base-wheels.sh key - Output the cache key
+#
+# Environment variables:
+# S3_BUCKET - S3 bucket name (default: vllm-wheels)
+# PYTHON_VERSION - Python version (affects cache key)
+# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
+#
+# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
+# so changes to ROCm version are captured by the Dockerfile hash.
+
+set -euo pipefail
+
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+DOCKERFILE="docker/Dockerfile.rocm_base"
+CACHE_PREFIX="rocm/cache"
+
+# Generate hash from Dockerfile content + build args
+generate_cache_key() {
+ # Include Dockerfile content
+ if [[ ! -f "$DOCKERFILE" ]]; then
+ echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
+ exit 1
+ fi
+ local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
+
+ # Include key build args that affect the output
+ # These should match the ARGs in Dockerfile.rocm_base that change the build output
+ # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
+ local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
+ local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
+
+ echo "${dockerfile_hash}-${args_hash}"
+}
+
+CACHE_KEY=$(generate_cache_key)
+CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
+
+case "${1:-}" in
+ check)
+ echo "Checking cache for key: ${CACHE_KEY}" >&2
+ echo "Cache path: ${CACHE_PATH}" >&2
+ echo "Variables used in cache key:" >&2
+ echo " PYTHON_VERSION: ${PYTHON_VERSION:-}" >&2
+ echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-}" >&2
+
+ # Check if cache exists by listing objects
+ # We look for at least one .whl file
+ echo "Running: aws s3 ls ${CACHE_PATH}" >&2
+ S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
+ echo "S3 ls output:" >&2
+ echo "$S3_OUTPUT" | head -5 >&2
+
+ if echo "$S3_OUTPUT" | grep -q "\.whl"; then
+ echo "hit"
+ else
+ echo "miss"
+ fi
+ ;;
+
+ upload)
+ echo "========================================"
+ echo "Uploading wheels to cache"
+ echo "========================================"
+ echo "Cache key: ${CACHE_KEY}"
+ echo "Cache path: ${CACHE_PATH}"
+ echo ""
+
+ if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
+ echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
+ exit 1
+ fi
+
+ WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+ if [[ "$WHEEL_COUNT" -eq 0 ]]; then
+ echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
+ exit 1
+ fi
+
+ echo "Uploading $WHEEL_COUNT wheels..."
+ aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
+
+ echo ""
+ echo "Cache upload complete!"
+ echo "========================================"
+ ;;
+
+ download)
+ echo "========================================"
+ echo "Downloading wheels from cache"
+ echo "========================================"
+ echo "Cache key: ${CACHE_KEY}"
+ echo "Cache path: ${CACHE_PATH}"
+ echo ""
+
+ mkdir -p artifacts/rocm-base-wheels
+ aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
+
+ echo ""
+ echo "Downloaded wheels:"
+ ls -lh artifacts/rocm-base-wheels/
+
+ WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+ echo ""
+ echo "Total: $WHEEL_COUNT wheels"
+ echo "========================================"
+ ;;
+
+ key)
+ echo "${CACHE_KEY}"
+ ;;
+
+ path)
+ echo "${CACHE_PATH}"
+ ;;
+
+ *)
+ echo "Usage: $0 {check|upload|download|key|path}" >&2
+ echo "" >&2
+ echo "Commands:" >&2
+ echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
+ echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
+ echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
+ echo " key - Output the cache key" >&2
+ echo " path - Output the full S3 cache path" >&2
+ exit 1
+ ;;
+esac
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
index f02a128c67..9e015e19f9 100755
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -3,7 +3,14 @@
set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+# This script uses DockerHub API to list and delete old tags with specified prefix
+# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
+# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
+
+# Get tag prefix from argument, default to "nightly-" if not provided
+TAG_PREFIX="${1:-nightly-}"
+
+echo "Cleaning up tags with prefix: $TAG_PREFIX"
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
set -x
# Get both last_updated timestamp and tag name, separated by |
- local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+ local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then
break
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
new file mode 100644
index 0000000000..2eb4211402
--- /dev/null
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+import regex as re
+
+
+def normalize_package_name(name: str) -> str:
+ """
+ Normalize package name according to PEP 503.
+ https://peps.python.org/pep-0503/#normalized-names
+
+ Replace runs of underscores, hyphens, and periods with a single hyphen,
+ and lowercase the result.
+ """
+ return re.sub(r"[-_.]+", "-", name).lower()
+
+
+if not sys.version_info >= (3, 12):
+ raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """
+
+
+
+
+{items}
+
+
+"""
+
+
+@dataclass
+class WheelFileInfo:
+ package_name: str
+ version: str
+ build_tag: str | None
+ python_tag: str
+ abi_tag: str
+ platform_tag: str
+ variant: str | None
+ filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+ """
+ Parse wheel file name to extract metadata.
+
+ The format of wheel names:
+ {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+ All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
+ Example:
+ vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+ vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+ """
+ wheel_file_re = re.compile(
+ r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$"
+ )
+ match = wheel_file_re.match(file)
+ if not match:
+ raise ValueError(f"Invalid wheel file name: {file}")
+
+ package_name = match.group("package_name")
+ version = match.group("version")
+ build_tag = match.group("build_tag")
+ python_tag = match.group("python_tag")
+ abi_tag = match.group("abi_tag")
+ platform_tag = match.group("platform_tag")
+
+ # extract variant from version
+ variant = None
+ if "dev" in version:
+ ver_after_dev = version.split("dev")[-1]
+ if "." in ver_after_dev:
+ variant = ver_after_dev.split(".")[-1]
+ version = version.removesuffix("." + variant)
+ else:
+ if "+" in version:
+ version_part, suffix = version.split("+", 1)
+ # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
+ # Git hashes and other suffixes are NOT variants
+ if suffix.startswith(("rocm", "cu", "cpu")):
+ variant = suffix
+ version = version_part
+ # Otherwise keep the full version string (variant stays None)
+
+ return WheelFileInfo(
+ package_name=package_name,
+ version=version,
+ build_tag=build_tag,
+ python_tag=python_tag,
+ abi_tag=abi_tag,
+ platform_tag=platform_tag,
+ variant=variant,
+ filename=file,
+ )
+
+
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
+ """
+ Generate project list HTML content linking to each project & variant sub-directory.
+ """
+ href_tags = []
+ for name in sorted(subdir_names):
+ name = name.strip("/").strip(".")
+ href_tags.append(f' {name}/
')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+
+
+def generate_package_index_and_metadata(
+ wheel_files: list[WheelFileInfo],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ comment: str = "",
+) -> tuple[str, str]:
+ """
+ Generate package index HTML content for a specific package, linking to actual wheel files.
+ """
+ href_tags = []
+ metadata = []
+ for file in sorted(wheel_files, key=lambda x: x.filename):
+ relative_path = (
+ wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+ )
+ # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
+ # NOTE: this is AWS S3 specific behavior!
+ file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+ href_tags.append(f' {file.filename}
')
+ file_meta = asdict(file)
+ file_meta["path"] = file_path_quoted
+ metadata.append(file_meta)
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+ metadata_str = json.dumps(metadata, indent=2)
+ return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+ whl_files: list[str],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ default_variant: str | None = None,
+ alias_to_default: str | None = None,
+ comment: str = "",
+):
+ """
+ Generate index for all wheel files.
+
+ Args:
+ whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+ wheel_base_dir (Path): Base directory for wheel files.
+ index_base_dir (Path): Base directory to store index files.
+ default_variant (str | None): The default variant name, if any.
+ alias_to_default (str | None): Alias variant name for the default variant, if any.
+ comment (str | None): Optional comment to include in the generated HTML files.
+
+ First, parse all wheel files to extract metadata.
+ We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+ The index for the default variant (if any) is generated in the root index directory.
+
+ If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+ is purely a copy of the corresponding variant index, with only the links adjusted.
+ Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+ If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+ as the default variant index, but the links are adjusted accordingly.
+
+ Index directory structure:
+ index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+ index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
+ vllm/
+ index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+ metadata.json # machine-readable metadata for all wheels in this package
+ cpu/ # cpu variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu129/ # cu129 is actually the alias to default variant
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu130/ # cu130 variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ ...
+
+ metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+ [
+ {
+ "package_name": "vllm",
+ "version": "0.10.2rc2",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux2014_aarch64",
+ "variant": "cu129",
+ "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+ "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+ },
+ ...
+ ]
+ """
+
+ parsed_files = [parse_from_filename(f) for f in whl_files]
+
+ if not parsed_files:
+ print("No wheel files found, skipping index generation.")
+ return
+
+ # For ROCm builds: inherit variant from vllm wheel
+ # All ROCm wheels should share the same variant as vllm
+ rocm_variant = None
+ for file in parsed_files:
+ if (
+ file.package_name == "vllm"
+ and file.variant
+ and file.variant.startswith("rocm")
+ ):
+ rocm_variant = file.variant
+ print(f"Detected ROCm variant from vllm: {rocm_variant}")
+ break
+
+ # Apply ROCm variant to all wheels without a variant
+ if rocm_variant:
+ for file in parsed_files:
+ if file.variant is None:
+ file.variant = rocm_variant
+ print(f"Inherited variant '{rocm_variant}' for {file.filename}")
+
+ # Group by variant
+ variant_to_files: dict[str, list[WheelFileInfo]] = {}
+ for file in parsed_files:
+ variant = file.variant or "default"
+ if variant not in variant_to_files:
+ variant_to_files[variant] = []
+ variant_to_files[variant].append(file)
+
+ print(f"Found variants: {list(variant_to_files.keys())}")
+
+ # sanity check for default variant
+ if default_variant:
+ if "default" in variant_to_files:
+ raise ValueError(
+ "All wheel files must have variant suffixes when `default_variant` is specified."
+ )
+ if default_variant not in variant_to_files:
+ raise ValueError(
+ f"Default variant '{default_variant}' not found among wheel files."
+ )
+
+ if alias_to_default:
+ if "default" not in variant_to_files:
+ # e.g. only some wheels are uploaded to S3 currently
+ print(
+ "[WARN] Alias to default variant specified, but no default variant found."
+ )
+ elif alias_to_default in variant_to_files:
+ raise ValueError(
+ f"Alias variant name '{alias_to_default}' already exists among wheel files."
+ )
+ else:
+ variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+ print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+ # Generate comment in HTML header
+ comment_str = f" ({comment})" if comment else ""
+ comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
+ # Generate index for each variant
+ subdir_names = set()
+ for variant, files in variant_to_files.items():
+ if variant == "default":
+ variant_dir = index_base_dir
+ else:
+ variant_dir = index_base_dir / variant
+ subdir_names.add(variant)
+
+ variant_dir.mkdir(parents=True, exist_ok=True)
+
+ # gather all package names in this variant (normalized per PEP 503)
+ packages = set(normalize_package_name(f.package_name) for f in files)
+ if variant == "default":
+ # these packages should also appear in the "project list"
+ # generate after all variants are processed
+ subdir_names = subdir_names.union(packages)
+ else:
+ # generate project list for this variant directly
+ project_list_str = generate_project_list(sorted(packages), comment_tmpl)
+ with open(variant_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+ for package in packages:
+ # filter files belonging to this package only (compare normalized names)
+ package_files = [
+ f for f in files if normalize_package_name(f.package_name) == package
+ ]
+ package_dir = variant_dir / package
+ package_dir.mkdir(parents=True, exist_ok=True)
+ index_str, metadata_str = generate_package_index_and_metadata(
+ package_files, wheel_base_dir, package_dir, comment
+ )
+ with open(package_dir / "index.html", "w") as f:
+ f.write(index_str)
+ with open(package_dir / "metadata.json", "w") as f:
+ f.write(metadata_str)
+
+ # Generate top-level project list index
+ project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
+ with open(index_base_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+
+if __name__ == "__main__":
+ """
+ Arguments:
+ --version : version string for the current build (e.g., commit hash)
+ --wheel-dir : directory containing wheel files (default to be same as `version`)
+ --current-objects : path to JSON file containing current S3 objects listing in this version directory
+ --output-dir : directory to store generated index files
+ --alias-to-default : (optional) alias variant name for the default variant
+ --comment : (optional) comment string to include in generated HTML files
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Process nightly build wheel files to generate indices."
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=True,
+ help="Version string for the current build (e.g., commit hash)",
+ )
+ parser.add_argument(
+ "--current-objects",
+ type=str,
+ required=True,
+ help="Path to JSON file containing current S3 objects listing in this version directory",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help="Directory to store generated index files",
+ )
+ parser.add_argument(
+ "--wheel-dir",
+ type=str,
+ default=None,
+ help="Directory containing wheel files (default to be same as `version`)",
+ )
+ parser.add_argument(
+ "--alias-to-default",
+ type=str,
+ default=None,
+ help="Alias variant name for the default variant",
+ )
+ parser.add_argument(
+ "--comment",
+ type=str,
+ default="",
+ help="Optional comment string to include in generated HTML files",
+ )
+
+ args = parser.parse_args()
+
+ version = args.version
+ # Allow rocm/ prefix, reject other slashes and all backslashes
+ if "\\" in version:
+ raise ValueError("Version string must not contain backslashes.")
+ if "/" in version and not version.startswith("rocm/"):
+ raise ValueError(
+ "Version string must not contain slashes (except for 'rocm/' prefix)."
+ )
+ current_objects_path = Path(args.current_objects)
+ output_dir = Path(args.output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Read current objects JSON
+ with open(current_objects_path) as f:
+ current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+ # current_objects looks like from list_objects_v2 S3 API:
+ """
+ "Contents": [
+ {
+ "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+ "LastModified": "2025-11-28T14:00:32+00:00",
+ "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+ "ChecksumAlgorithm": [
+ "CRC64NVME"
+ ],
+ "ChecksumType": "FULL_OBJECT",
+ "Size": 435649349,
+ "StorageClass": "STANDARD"
+ },
+ ...
+ ]
+ """
+
+ # Extract wheel file keys
+ wheel_files = []
+ for item in current_objects.get("Contents", []):
+ key: str = item["Key"]
+ if key.endswith(".whl"):
+ wheel_files.append(key.split("/")[-1]) # only the filename is used
+
+ print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+ # keep only "official" files for a non-nightly version (specified by cli args)
+ PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+ if PY_VERSION_RE.match(version):
+ # upload-wheels.sh ensures no "dev" is in args.version
+ wheel_files = list(
+ filter(lambda x: version in x and "dev" not in x, wheel_files)
+ )
+ print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+ else:
+ print("Nightly version detected, keeping all wheel files.")
+
+ # Generate index and metadata, assuming wheels and indices are stored as:
+ # s3://vllm-wheels/{wheel_dir}/
+ # s3://vllm-wheels//
+ #
+ # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
+ # - rocm/{commit}/ (same as wheels)
+ # - rocm/nightly/
+ # - rocm/{version}/
+ # All these are under the "rocm/" prefix, so relative paths should be
+ # relative to "rocm/", not the bucket root.
+ if args.wheel_dir:
+ # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
+ wheel_dir = args.wheel_dir.strip().rstrip("/")
+ elif version.startswith("rocm/"):
+ # For rocm/commit, wheel_base_dir should be just the commit part
+ # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
+ wheel_dir = version.split("/", 1)[1]
+ else:
+ wheel_dir = version
+ wheel_base_dir = Path(output_dir).parent / wheel_dir
+ index_base_dir = Path(output_dir)
+
+ generate_index_and_metadata(
+ whl_files=wheel_files,
+ wheel_base_dir=wheel_base_dir,
+ index_base_dir=index_base_dir,
+ default_variant=None,
+ alias_to_default=args.alias_to_default,
+ comment=args.comment.strip(),
+ )
+ print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 864eb470bb..484167f461 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
- --ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
@@ -210,12 +209,21 @@ if [[ $commands == *"--shard-id="* ]]; then
wait "${pid}"
STATUS+=($?)
done
+ at_least_one_shard_with_tests=0
for st in "${STATUS[@]}"; do
- if [[ ${st} -ne 0 ]]; then
+ if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
+ elif [[ ${st} -eq 5 ]]; then
+ echo "Shard exited with status 5 (no tests collected) - treating as success"
+ else # This means st is 0
+ at_least_one_shard_with_tests=1
fi
done
+ if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
+ echo "All shards reported no tests collected. Failing the build."
+ exit 1
+ fi
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index d0036f24c8..b6274d698d 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -7,53 +7,57 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-NUMA_NODE=${NUMA_NODE:-0}
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup
remove_docker_container() {
set -e;
- docker rm -f cpu-test-"$NUMA_NODE" || true;
+ docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
-# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() {
set -e
- export NUMA_NODE=$2
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
pip list"
# offline inference
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ # Run model tests
+ docker exec cpu-test bash -c "
+ set -e
+ pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
# Run kernel tests
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
- pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+ pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
- docker exec cpu-test-"$NUMA_NODE" bash -c '
+ docker exec cpu-test bash -c '
set -e
- VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
- --model meta-llama/Llama-3.2-3B-Instruct \
+ --model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
@@ -61,4 +65,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c cpu_tests
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 39ea180173..3728f73fa2 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,20 +25,22 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
+ export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
+ export TORCH_COMPILE_DISABLE=1
set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
- pip install sentence-transformers datamodel_code_generator
+ pip install sentence-transformers datamodel_code_generator tblib
# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+ pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+ pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+ pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 2267718f75..ee6510bf88 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -50,6 +50,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+ pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
@@ -83,7 +84,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
- tests/lora/test_qwen2vl.py"
+ tests/lora/test_qwenvl.py"
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
index 29c8f5ed5a..0db1abe37b 100644
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index cbb2527a4f..6959f81eab 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index f022fa3672..eafc82b984 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index d49f3e2f47..85b554e5e8 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -35,10 +35,11 @@ docker run \
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
- VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/push-nightly-builds.sh b/.buildkite/scripts/push-nightly-builds.sh
new file mode 100755
index 0000000000..98e80fd99e
--- /dev/null
+++ b/.buildkite/scripts/push-nightly-builds.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -ex
+
+# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
+# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
+# otherwise they will be cleaned up together with the main "nightly" tags.
+
+TAG_VARIANT="$1"
+if [ -n "$TAG_VARIANT" ]; then
+ ORIG_TAG_SUFFIX="-$TAG_VARIANT"
+ TAG_NAME="$TAG_VARIANT-nightly"
+else
+ ORIG_TAG_SUFFIX=""
+ TAG_NAME="nightly"
+fi
+
+ORIG_TAG_NAME="$BUILDKITE_COMMIT"
+
+echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
+
+# pull original arch-dependent images from AWS ECR Public
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
+# tag arch-dependent images
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
+# push arch-dependent images to DockerHub
+docker push vllm/vllm-openai:$TAG_NAME-x86_64
+docker push vllm/vllm-openai:$TAG_NAME-aarch64
+# push arch-independent manifest to DockerHub
+docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest push vllm/vllm-openai:$TAG_NAME
+docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
diff --git a/.buildkite/scripts/run-multi-node-test.sh b/.buildkite/scripts/run-multi-node-test.sh
index 49aebce786..c0911f17b6 100755
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@@ -2,6 +2,17 @@
set -euox pipefail
+# To detect ROCm
+# Check multiple indicators:
+if [ -e /dev/kfd ] || \
+ [ -d /opt/rocm ] || \
+ command -v rocm-smi &> /dev/null || \
+ [ -n "${ROCM_HOME:-}" ]; then
+ IS_ROCM=1
+else
+ IS_ROCM=0
+fi
+
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do
echo "$command"
done
+
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
- GPU_DEVICES='"device='
+ if [ "$IS_ROCM" -eq 1 ]; then
+ GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
+ else
+ GPU_DEVICES='--gpus "device='
+ fi
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
@@ -40,7 +56,9 @@ start_nodes() {
GPU_DEVICES+=','
fi
done
- GPU_DEVICES+='"'
+ if [ "$IS_ROCM" -eq 0 ]; then
+ GPU_DEVICES+='"'
+ fi
# start the container in detached mode
# things to note:
@@ -49,7 +67,7 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
- docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+ docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
/bin/bash -c "tail -f /dev/null"
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
index 5b25c358fc..3fb7c82c8d 100755
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+ echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+ exit 0
+fi
+
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
index 5302f524a0..8106f50f18 100644
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -17,7 +17,17 @@ wait_for_server() {
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+ # Disable MOE padding for ROCm since it is causing eplb to fail
+ export VLLM_ROCM_MOE_PADDING=0
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
new file mode 100644
index 0000000000..d0921c5699
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
+THRESHOLD=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+ local port=$1
+ timeout 600 bash -c '
+ until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+ sleep 1
+ done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+ # Disable MOE padding for ROCm since it is causing eplb to fail
+ export VLLM_ROCM_MOE_PADDING=0
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+ if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+ kill "${SERVER_PID}" 2>/dev/null || true
+ for _ in {1..20}; do
+ kill -0 "${SERVER_PID}" 2>/dev/null || break
+ sleep 0.5
+ done
+ kill -9 "${SERVER_PID}" 2>/dev/null || true
+ fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+ VLLM_DEEP_GEMM_WARMUP=skip \
+ vllm serve "$MODEL" \
+ --enforce-eager \
+ --enable-eplb \
+ --all2all-backend $BACK \
+ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+ --data-parallel-size ${DATA_PARALLEL_SIZE} \
+ --enable-expert-parallel \
+ --trust-remote-code \
+ --max-model-len 2048 \
+ --port $PORT &
+ SERVER_PID=$!
+ wait_for_server $PORT
+
+ TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+ OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+ python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+ python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+ cleanup
+ SERVER_PID=
+ sleep 1
+ PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
similarity index 64%
rename from .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
rename to .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
index a513529929..b3b65128e6 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -2,9 +2,9 @@
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.8}
+THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
-PORT=${3:-8020}
+PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
@@ -16,8 +16,18 @@ wait_for_server() {
done'
}
-MODEL="QWen/Qwen3-30B-A3B-FP8"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+ # Disable MOE padding for ROCm since it is causing eplb to fail
+ export VLLM_ROCM_MOE_PADDING=0
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -33,14 +43,17 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
- VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
- --tensor-parallel-size 2 \
- --data-parallel-size 2 \
+ --tensor-parallel-size 4 \
--enable-expert-parallel \
+ --enable-eplb \
+ --all2all-backend $BACK \
+ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
+ --gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
diff --git a/.buildkite/scripts/upload-nightly-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh
new file mode 100644
index 0000000000..1af7f476ae
--- /dev/null
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.10+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+ # use new python from docker
+ docker pull python:3-slim
+ PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+ echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+ exit 1
+fi
+wheel="${wheel_files[0]}"
+
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# we also accept params as manylinux tag
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="${1:-manylinux_2_31}"
+
+# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+ echo "Error: Wheel filename does not contain 'linux': $wheel"
+ exit 1
+fi
+new_wheel="${wheel/linux/$manylinux_version}"
+mv -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
+
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below
+
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# call script to generate indicies for all existing wheels
+# this indices have relative paths that could work as long as it is next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels//
+# and indices can be placed in //, or /nightly/, or //
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+ alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+else
+ alias_arg=""
+fi
+
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
+
+# copy indices to // unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+ echo "Uploading indices to overwrite /nightly/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
+fi
+
+# re-generate and copy to // only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+ echo "Re-generating indices for /$pure_version/"
+ rm -rf "$INDICES_OUTPUT_DIR/*"
+ mkdir -p "$INDICES_OUTPUT_DIR"
+ # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
+ $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
diff --git a/.buildkite/scripts/upload-release-wheels.sh b/.buildkite/scripts/upload-release-wheels.sh
new file mode 100644
index 0000000000..2d8fc6a40b
--- /dev/null
+++ b/.buildkite/scripts/upload-release-wheels.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+set -e
+
+BUCKET="vllm-wheels"
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
+echo "Release version from Buildkite: $RELEASE_VERSION"
+GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
+if [ -z "$GIT_VERSION" ]; then
+ echo "[FATAL] Not on a git tag, cannot create release."
+ exit 1
+else
+ echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
+fi
+# sanity check for version mismatch
+if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+ if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+ echo "[WARNING] Force release and ignore version mismatch"
+ else
+ echo "[FATAL] Release version from Buildkite does not match Git version."
+ exit 1
+ fi
+fi
+PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
+
+# check pypi token
+if [ -z "$PYPI_TOKEN" ]; then
+ echo "[FATAL] PYPI_TOKEN is not set."
+ exit 1
+else
+ export TWINE_USERNAME="__token__"
+ export TWINE_PASSWORD="$PYPI_TOKEN"
+fi
+
+# check github token
+if [ -z "$GITHUB_TOKEN" ]; then
+ echo "[FATAL] GITHUB_TOKEN is not set."
+ exit 1
+else
+ export GH_TOKEN="$GITHUB_TOKEN"
+fi
+
+set -x # avoid printing secrets above
+
+# download gh CLI from github
+# Get latest gh CLI version from GitHub API
+GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
+if [ -z "$GH_VERSION" ]; then
+ echo "[FATAL] Failed to get latest gh CLI version from GitHub"
+ exit 1
+fi
+echo "Downloading gh CLI version: $GH_VERSION"
+GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
+GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
+GH_INSTALL_DIR="/tmp/gh-install"
+mkdir -p "$GH_INSTALL_DIR"
+pushd "$GH_INSTALL_DIR"
+curl -L -o "$GH_TARBALL" "$GH_URL"
+tar -xzf "$GH_TARBALL"
+GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
+if [ -z "$GH_BIN" ]; then
+ echo "[FATAL] Failed to find gh CLI executable"
+ exit 1
+fi
+echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
+echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
+command "$GH_BIN" release list --limit 5
+popd
+
+# install twine from pypi
+python3 -m venv /tmp/vllm-release-env
+source /tmp/vllm-release-env/bin/activate
+pip install twine
+python3 -m twine --version
+
+# copy release wheels to local directory
+DIST_DIR=/tmp/vllm-release-dist
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+echo "Copying wheels to local directory"
+mkdir -p $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
+aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
+echo "Wheels copied to local directory"
+# generate source tarball
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+ls -la $DIST_DIR
+
+
+# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
+if [ -z "$PYPI_WHEEL_FILES" ]; then
+ echo "No default variant wheels found, quitting..."
+ exit 1
+fi
+python3 -m twine check $PYPI_WHEEL_FILES
+python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
+
+# create release on GitHub with the release version and all wheels
+command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
diff --git a/.buildkite/scripts/upload-rocm-wheels.sh b/.buildkite/scripts/upload-rocm-wheels.sh
new file mode 100755
index 0000000000..bb555bc842
--- /dev/null
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Upload ROCm wheels to S3 with proper index generation
+#
+# Required environment variables:
+# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
+# S3_BUCKET (default: vllm-wheels)
+#
+# S3 path structure:
+# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
+# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
+# s3://vllm-wheels/rocm/{version}/ - Index for release versions
+
+set -ex
+
+# ======== Configuration ========
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
+S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
+INDICES_OUTPUT_DIR="rocm-indices"
+PYTHON="${PYTHON_PROG:-python3}"
+
+# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
+MANYLINUX_VERSION="manylinux_2_35"
+
+echo "========================================"
+echo "ROCm Wheel Upload Configuration"
+echo "========================================"
+echo "S3 Bucket: $BUCKET"
+echo "S3 Path: $ROCM_SUBPATH"
+echo "Commit: $BUILDKITE_COMMIT"
+echo "Branch: $BUILDKITE_BRANCH"
+echo "========================================"
+
+# ======== Part 0: Setup Python ========
+
+# Detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
+if [[ "$has_new_python" -eq 0 ]]; then
+ # Use new python from docker
+ # Use --user to ensure files are created with correct ownership (not root)
+ docker pull python:3-slim
+ PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ======== Part 1: Collect and prepare wheels ========
+
+# Collect all wheels
+mkdir -p all-rocm-wheels
+cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
+cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
+
+WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
+echo "Total wheels to upload: $WHEEL_COUNT"
+
+if [ "$WHEEL_COUNT" -eq 0 ]; then
+ echo "ERROR: No wheels found to upload!"
+ exit 1
+fi
+
+# Rename linux to manylinux in wheel filenames
+for wheel in all-rocm-wheels/*.whl; do
+ if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
+ new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
+ mv -- "$wheel" "$new_wheel"
+ echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
+ fi
+done
+
+echo ""
+echo "Wheels to upload:"
+ls -lh all-rocm-wheels/
+
+# ======== Part 2: Upload wheels to S3 ========
+
+echo ""
+echo "Uploading wheels to $S3_COMMIT_PREFIX"
+for wheel in all-rocm-wheels/*.whl; do
+ aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+done
+
+# ======== Part 3: Generate and upload indices ========
+
+# List existing wheels in commit directory
+echo ""
+echo "Generating indices..."
+obj_json="rocm-objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
+
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# Use the existing generate-nightly-index.py
+# HACK: Replace regex module with stdlib re (same as CUDA script)
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+
+$PYTHON .buildkite/scripts/generate-nightly-index.py \
+ --version "$ROCM_SUBPATH" \
+ --current-objects "$obj_json" \
+ --output-dir "$INDICES_OUTPUT_DIR" \
+ --comment "ROCm commit $BUILDKITE_COMMIT"
+
+# Upload indices to commit directory
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# Update rocm/nightly/ if on main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
+ echo "Updating rocm/nightly/ index..."
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
+fi
+
+# Extract version from vLLM wheel and update version-specific index
+VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
+if [ -n "$VLLM_WHEEL" ]; then
+ VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+ echo "Version in wheel: $VERSION"
+ PURE_VERSION="${VERSION%%+*}"
+ PURE_VERSION="${PURE_VERSION%%.rocm}"
+ echo "Pure version: $PURE_VERSION"
+
+ if [[ "$VERSION" != *"dev"* ]]; then
+ echo "Updating rocm/$PURE_VERSION/ index..."
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
+ fi
+fi
+
+# ======== Part 4: Summary ========
+
+echo ""
+echo "========================================"
+echo "ROCm Wheel Upload Complete!"
+echo "========================================"
+echo ""
+echo "Wheels available at:"
+echo " s3://$BUCKET/$ROCM_SUBPATH/"
+echo ""
+echo "Install command (by commit):"
+echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
+echo ""
+if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
+ echo "Install command (nightly):"
+ echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
+fi
+echo ""
+echo "Wheel count: $WHEEL_COUNT"
+echo "========================================"
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
deleted file mode 100644
index 945c5e48c0..0000000000
--- a/.buildkite/scripts/upload-wheels.sh
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-# Assume wheels are in artifacts/dist/*.whl
-wheel_files=(artifacts/dist/*.whl)
-
-# Check that exactly one wheel is found
-if [[ ${#wheel_files[@]} -ne 1 ]]; then
- echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
- exit 1
-fi
-
-# Get the single wheel file
-wheel="${wheel_files[0]}"
-
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
- manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
- manylinux_version="manylinux2014"
-else
- echo "Warning: Unknown architecture $arch, using manylinux1 as default"
- manylinux_version="manylinux1"
-fi
-
-# Rename 'linux' to the appropriate manylinux version in the wheel filename
-new_wheel="${wheel/linux/$manylinux_version}"
-mv -- "$wheel" "$new_wheel"
-wheel="$new_wheel"
-
-# Extract the version from the wheel
-version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
-
-normal_wheel="$wheel" # Save the original wheel filename
-
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
- suffix="${version##*.}"
- if [[ $suffix == cu* ]]; then
- new_version="1.0.0.dev+${suffix}"
- else
- new_version="1.0.0.dev"
- fi
- new_wheel="${wheel/$version/$new_version}"
- # use cp to keep both files in the artifacts directory
- cp -- "$wheel" "$new_wheel"
- wheel="$new_wheel"
- version="$new_version"
-fi
-
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
-
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
- aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
-fi
-
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
-fi
-
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 4e2ff5c5a6..044a82c977 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/multimodal
@@ -61,17 +61,19 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -80,6 +82,8 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -113,9 +117,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -124,7 +128,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
@@ -144,7 +148,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
@@ -158,10 +162,28 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
+- label: Entrypoints Integration Test (API Server 2)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/sleep
+ - tests/entrypoints/rpc
+ - tests/tool_use
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/sleep
+ - pytest -v -s tool_use
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+
- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -177,6 +199,21 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
+- label: Entrypoints Integration Test (Responses API)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai/responses
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/openai/responses
+
- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -197,6 +234,9 @@ steps:
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
+ # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+ # TODO: Remove when the bug is fixed in a future ROCm release
+ - export TORCH_NCCL_BLOCKING_WAIT=1
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
@@ -212,6 +252,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -244,15 +285,16 @@ steps:
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
- # https://github.com/NVIDIA/nccl/issues/1838
- #- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
+ # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+ # TODO: Remove when the bug is fixed in a future ROCm release
+ - export TORCH_NCCL_BLOCKING_WAIT=1
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
@@ -308,28 +350,27 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
-- label: V1 Test e2e + engine # 30min
- timeout_in_minutes: 45
+- label: V1 Test e2e + engine # 65min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
- agent_pool: mi325_1
+ # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
+ # See discussion here: https://github.com/vllm-project/vllm/pull/31040
+ agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -342,9 +383,9 @@ steps:
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -392,6 +433,21 @@ steps:
commands:
- pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
@@ -402,9 +458,9 @@ steps:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -420,29 +476,34 @@ steps:
- label: Examples Test # 30min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- - python3 offline_inference/basic/classify.py
- - python3 offline_inference/basic/embed.py
- - python3 offline_inference/basic/score.py
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -470,8 +531,7 @@ steps:
- tests/samplers
- tests/conftest.py
commands:
- - pytest -v -s samplers
- - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+ - pytest -v -s -m 'not skip_v1' samplers
- label: LoRA Test %N # 20min each
timeout_in_minutes: 30
@@ -496,7 +556,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -513,7 +573,7 @@ steps:
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -569,7 +629,7 @@ steps:
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -596,7 +656,7 @@ steps:
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -623,6 +683,26 @@ steps:
commands:
- pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+# Not replicating for CUTLAS & CuTe
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
@@ -665,7 +745,7 @@ steps:
- label: Quantization Test # 70min
timeout_in_minutes: 90
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
@@ -680,19 +760,21 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- - uv pip install --system torchao==0.13.0
+ - uv pip install --system torchao==0.14.1
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-- label: LM Eval Small Models # 15min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: OpenAI API correctness # 10min
timeout_in_minutes: 15
@@ -703,33 +785,11 @@ steps:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
- commands: # LMEval
- # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+ - tools/
+ commands: # LMEval+Transcription WER check
+ - bash ../tools/install_torchcodec_rocm.sh || exit 1
- pytest -s entrypoints/openai/correctness/
-- label: OpenAI-Compatible Tool Use # 23 min
- timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_1
- # grade: Blocking
- fast_check: false
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_1
- # grade: Blocking
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
##### models test #####
@@ -754,6 +814,7 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
+ - vllm/transformers_utils/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
@@ -821,6 +882,7 @@ steps:
# Shard slow subset of standard language models tests. Only run when model
# source is modified, or when specified test files are modified
- pip freeze | grep -E 'torch'
+ - export TORCH_NCCL_BLOCKING_WAIT=1
- pytest -v -s models/language -m 'core_model and slow_test' \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -838,7 +900,7 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation \
@@ -859,7 +921,7 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
@@ -899,6 +961,18 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
@@ -912,7 +986,7 @@ steps:
- pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard) # 60min
- timeout_in_minutes: 80
+ timeout_in_minutes: 100
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -921,13 +995,16 @@ steps:
- vllm/
- tests/models/multimodal
commands:
+ - export MIOPEN_DEBUG_CONV_DIRECT=0
+ - export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
+ - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
- timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 5min
+ timeout_in_minutes: 10
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -937,9 +1014,12 @@ steps:
- vllm/inputs/
- vllm/v1/core/
commands:
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+ - export MIOPEN_DEBUG_CONV_DIRECT=0
+ - export MIOPEN_DEBUG_CONV_GEMM=0
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+ timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -948,10 +1028,13 @@ steps:
- vllm/
- tests/models/multimodal
commands:
+ - export MIOPEN_DEBUG_CONV_DIRECT=0
+ - export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
-- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 #60min
+ timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -960,10 +1043,13 @@ steps:
- vllm/
- tests/models/multimodal
commands:
+ - export MIOPEN_DEBUG_CONV_DIRECT=0
+ - export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -972,6 +1058,8 @@ steps:
- vllm/
- tests/models/multimodal
commands:
+ - export MIOPEN_DEBUG_CONV_DIRECT=0
+ - export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
@@ -1031,8 +1119,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@@ -1055,6 +1143,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
@@ -1064,11 +1153,18 @@ steps:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
@@ -1079,7 +1175,7 @@ steps:
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -1097,17 +1193,15 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- agent_pool: mi325_1
- mirror_hardwares: [amdexperimental, amdproduction]
+ gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1116,7 +1210,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1143,7 +1237,7 @@ steps:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
@@ -1183,13 +1277,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+ - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+ - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@@ -1215,7 +1309,11 @@ steps:
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
+ # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+ # TODO: Remove when the bug is fixed in a future ROCm release
+ - export TORCH_NCCL_BLOCKING_WAIT=1
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1251,7 +1349,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1319,12 +1417,15 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- - pytest -v -s -x lora/test_gptoss_tp.py
+
+ # Disabled for now because MXFP4 backend on non-cuda platform
+ # doesn't support LoRA yet
+ #- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1360,8 +1461,22 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+ - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+ - VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test #####
##### A100 test #####
@@ -1383,12 +1498,13 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
+
- label: LM Eval Large Models # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: a100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1400,11 +1516,11 @@ steps:
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: h100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1414,6 +1530,7 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
@@ -1424,14 +1541,14 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
@@ -1445,6 +1562,57 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
+
+- label: LM Eval Large Models (4 Card)
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: ROCm LM Eval Large Models (8 Card)
+ mirror_hardwares: [amdproduction]
+ agent_pool: mi325_8
+ num_gpus: 8
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ agent_pool: mi325_1
+ mirror_hardwares: [amdexperimental, amdproduction]
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
@@ -1460,8 +1628,9 @@ steps:
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
+##### EPLB Accuracy Tests #####
- label: DeepSeek V2-Lite Accuracy
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1472,8 +1641,8 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
- mirror_hardwares: [amdexperimental]
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1482,4 +1651,36 @@ steps:
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6169b279dc..1c7a5ca368 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -57,14 +57,16 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -73,6 +75,8 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -110,7 +114,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
@@ -128,7 +132,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
@@ -140,10 +144,26 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
+- label: Entrypoints Integration Test (API Server 2)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/sleep
+ - tests/entrypoints/rpc
+ - tests/tool_use
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/sleep
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+ - pytest -v -s tool_use
+
- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -157,6 +177,18 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
+- label: Entrypoints Integration Test (Responses API)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai/responses
+ commands:
+ - pytest -v -s entrypoints/openai/responses
+
- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -192,6 +224,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -275,21 +308,18 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
@@ -301,7 +331,10 @@ steps:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- - pytest -v -s v1/engine
+ # Run this test standalone for now;
+ # need to untangle use (implicit) use of spawn/fork across the tests.
+ - pytest -v -s v1/engine/test_preprocess_error_handling.py
+ - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
@@ -346,6 +379,19 @@ steps:
commands:
- pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
@@ -375,23 +421,28 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- - python3 offline_inference/basic/classify.py
- - python3 offline_inference/basic/embed.py
- - python3 offline_inference/basic/score.py
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -450,7 +501,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -464,7 +517,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -618,7 +673,8 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
@@ -629,7 +685,7 @@ steps:
- vllm/model_executor/layers/quantization
autorun_on_main: true
commands:
- - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: OpenAI API correctness # 22min
timeout_in_minutes: 30
@@ -641,25 +697,6 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: OpenAI-Compatible Tool Use # 23 min
- timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
- fast_check: false
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
-
##### models test #####
- label: Basic Models Tests (Initialization)
@@ -669,6 +706,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -679,7 +717,9 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
+ - vllm/transformers_utils/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -805,14 +845,24 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
-- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
@@ -889,11 +939,12 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
+ soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
+ - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- # - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@@ -904,7 +955,6 @@ steps:
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
- # optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
@@ -916,8 +966,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@@ -1034,7 +1084,7 @@ steps:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
@@ -1066,17 +1116,18 @@ steps:
- vllm/model_executor/models/
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
+ - .buildkite/scripts/run-multi-node-test.sh
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+ - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+ - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@@ -1103,6 +1154,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1192,6 +1244,8 @@ steps:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # Alot of these tests are on the edge of OOMing
+ - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
@@ -1225,8 +1279,8 @@ steps:
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-- label: NixlConnector PD accuracy tests (Distributed) # 30min
- timeout_in_minutes: 30
+- label: NixlConnector PD accuracy tests (Distributed) # 40min
+ timeout_in_minutes: 40
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -1234,7 +1288,18 @@ steps:
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+ - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test #####
@@ -1286,15 +1351,23 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
+- label: LM Eval Large Models (H200) # optional
+ timeout_in_minutes: 60
+ gpu: h200
+ optional: true
+ num_gpus: 8
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
+
##### B200 test #####
- label: Distributed Tests (B200) # optional
gpu: b200
@@ -1310,12 +1383,14 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
+ soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
+ - nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy
@@ -1327,11 +1402,43 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+##### MoE Refactor (Temporary) Tests #####
+
+- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional
+ gpu: h100
+ optional: true
+ num_gpus: 2
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+
+- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
new file mode 100644
index 0000000000..6e444eae14
--- /dev/null
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+ - image-build
+steps:
+- label: V1 attention (H100)
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+ timeout_in_minutes: 30
+ gpu: b200
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
new file mode 100644
index 0000000000..759d2b5358
--- /dev/null
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+ - image-build
+steps:
+- label: Basic Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
new file mode 100644
index 0000000000..574b642d40
--- /dev/null
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+ - image-build
+steps:
+- label: Benchmarks
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
new file mode 100644
index 0000000000..0ba00925a4
--- /dev/null
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+ - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
new file mode 100644
index 0000000000..50c0c338c2
--- /dev/null
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+ - image-build
+steps:
+- label: Platform Tests (CUDA)
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
new file mode 100644
index 0000000000..c88076bb52
--- /dev/null
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Comm Ops
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+ timeout_in_minutes: 90
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - cd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+ timeout_in_minutes: 10
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and dp=4 with ep
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test llama model here, it seems hf implementation is buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs))
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
new file mode 100644
index 0000000000..2e0857986c
--- /dev/null
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,42 @@
+group: E2E Integration
+depends_on:
+ - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+ timeout_in_minutes: 30
+ optional: true
+ soft_fail: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
new file mode 100644
index 0000000000..a028e0e4af
--- /dev/null
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+ - image-build
+steps:
+- label: Engine
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
new file mode 100644
index 0000000000..8e02d9f60b
--- /dev/null
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,90 @@
+group: Entrypoints
+depends_on:
+ - image-build
+steps:
+- label: Entrypoints Unit Tests
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server 1)
+ timeout_in_minutes: 130
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+- label: Entrypoints Integration (API Server 2)
+ timeout_in_minutes: 130
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ - tests/entrypoints/sleep
+ - tests/entrypoints/instrumentator
+ - tests/entrypoints/rpc
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+ - pytest -v -s entrypoints/instrumentator
+ - pytest -v -s entrypoints/sleep
+ - pytest -v -s tool_use
+
+- label: Entrypoints Integration (Pooling)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Entrypoints Integration (Responses API)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai/responses
+ commands:
+ - pytest -v -s entrypoints/openai/responses
+
+- label: Entrypoints V1
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
new file mode 100644
index 0000000000..feb8252148
--- /dev/null
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+ - image-build
+steps:
+- label: EPLB Algorithm
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
new file mode 100644
index 0000000000..cf4b646f34
--- /dev/null
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on:
+ - image-build
+steps:
+- label: Kernels Core Operation Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Quantization Test %N
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/v1/attention/selector.py
+ - vllm/platforms/cuda.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
new file mode 100644
index 0000000000..e2498512bd
--- /dev/null
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on:
+ - image-build
+steps:
+- label: LM Eval Small Models
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ autorun_on_main: true
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
new file mode 100644
index 0000000000..59ade40cc8
--- /dev/null
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,33 @@
+group: LoRA
+depends_on:
+ - image-build
+steps:
+- label: LoRA %N
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+ parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+ timeout_in_minutes: 30
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ # FIXIT: find out which code initialize cuda before running the test
+ # before the fix, we need to use spawn to test it
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # Alot of these tests are on the edge of OOMing
+ - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+ - pytest -v -s -x lora/test_olmoe_tp.py
+ - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
new file mode 100644
index 0000000000..252af1e56a
--- /dev/null
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,165 @@
+group: Miscellaneous
+depends_on:
+ - image-build
+steps:
+- label: V1 Others
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+ depends_on: ~
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - vllm/multimodal
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/chat.py # for basic
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+ timeout_in_minutes: 20
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+ depends_on: ~
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+ depends_on: ~
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
+ - tests/transformers_utils
+ - tests/config
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
+ - pytest -v -s transformers_utils
+ - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
new file mode 100644
index 0000000000..996c8bb8b7
--- /dev/null
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
+group: Model Executor
+depends_on:
+ - image-build
+steps:
+- label: Model Executor
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
new file mode 100644
index 0000000000..2a86596a6d
--- /dev/null
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,64 @@
+group: Models - Basic
+depends_on:
+ - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ - tests/models/registry.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ - tests/models/registry.py
+ commands:
+ # Only when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test.) Also run if model initialization test file is modified
+ - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+ working_dir: "/vllm-workspace/"
+ optional: true
+ soft_fail: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
new file mode 100644
index 0000000000..b6bfbf2dda
--- /dev/null
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
new file mode 100644
index 0000000000..f70192c4eb
--- /dev/null
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
+group: Models - Language
+depends_on:
+ - image-build
+steps:
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard slow subset of standard language models tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
new file mode 100644
index 0000000000..fc24068c20
--- /dev/null
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on:
+ - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+ timeout_in_minutes: 80
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
new file mode 100644
index 0000000000..60c179aa09
--- /dev/null
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
+group: Plugins
+depends_on:
+ - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
new file mode 100644
index 0000000000..332d5202d8
--- /dev/null
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,52 @@
+group: PyTorch
+depends_on:
+ - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which are usually heavier
+ # tests covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+
+- label: PyTorch Fullgraph Smoke Test
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+ timeout_in_minutes: 40
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+ # if this test fails, it means the nightly torch version is not compatible with some
+ # of the dependencies. Please check the error message and add the package to whitelist
+ # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
new file mode 100644
index 0000000000..6e89d6af3b
--- /dev/null
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
+group: Quantization
+depends_on:
+ - image-build
+steps:
+- label: Quantization
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+ # temporary install here since we need nightly, will move to requirements/test.in
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
new file mode 100644
index 0000000000..ad377148fd
--- /dev/null
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
+group: Samplers
+depends_on:
+ - image-build
+steps:
+- label: Samplers Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
new file mode 100644
index 0000000000..cfc5bb20fe
--- /dev/null
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on:
+ - image-build
+steps:
+- label: Weight Loading Multiple GPU # 33min
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 6e178bb690..c963be4cb8 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,17 +3,18 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
+/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
@@ -25,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -35,6 +37,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC
+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
# Test ownership
/.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao
@@ -56,6 +61,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256
# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
@@ -111,15 +117,15 @@ mkdocs.yaml @hmellor
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
+/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review
/vllm/**/*rocm* @tjtanaa
/docker/Dockerfile.rocm* @gshtras @tjtanaa
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
/csrc/rocm @gshtras @tjtanaa
/requirements/*rocm* @tjtanaa
@@ -141,12 +147,13 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche
# Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
-/vllm/model_executor/layers/pooler.py @noooop
+/vllm/model_executor/layers/pooler @noooop
# Security guide and policies
/docs/usage/security.md @russellb
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 997a40e18e..a496dd302d 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,52 @@ pull_request_rules:
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
+- name: comment-pre-commit-failure
+ description: Comment on PR when pre-commit check fails
+ conditions:
+ - status-failure=pre-commit
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+ ```bash
+ uv pip install pre-commit
+ pre-commit install
+ pre-commit run --all-files
+ ```
+
+ Then, commit the changes and push to your branch.
+
+ For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+ > [!TIP]
+ >
+ > Is mypy or markdownlint failing?
+ >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+ >
+ > ```bash
+ > # For mypy (substitute "3.10" with the failing version if needed)
+ > pre-commit run --hook-stage manual mypy-3.10
+ > # For markdownlint
+ > pre-commit run --hook-stage manual markdownlint
+ > ```
+ >
+
+- name: comment-dco-failure
+ description: Comment on PR when DCO check fails
+ conditions:
+ - status-failure=dco
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
- files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py
@@ -176,10 +222,10 @@ pull_request_rules:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- - files~=^vllm/attention/backends/rocm.*\.py
- - files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+ - files~=^vllm/v1/attention/backends/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+ - files~=^vllm/v1/attention/ops/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
@@ -189,6 +235,20 @@ pull_request_rules:
add:
- rocm
+- name: label-cpu
+ description: Automatically apply cpu label
+ conditions:
+ - label != stale
+ - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
+ actions:
+ label:
+ add:
+ - cpu
+ assign:
+ users:
+ - "fadara01"
+ - "aditew01"
+
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
@@ -289,6 +349,18 @@ pull_request_rules:
add:
- tool-calling
+- name: auto-rebase if approved, ready, and 40 commits behind main
+ conditions:
+ - base = main
+ - label=ready
+ - "#approved-reviews-by >= 1"
+ - "#commits-behind >= 40"
+ - -closed
+ - -draft
+ - -conflict
+ actions:
+ rebase: {}
+
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- label != stale
@@ -358,4 +430,4 @@ pull_request_rules:
actions:
label:
add:
- - kv-connector
\ No newline at end of file
+ - kv-connector
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index c3e132a536..df89108377 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,10 +13,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index 7d565ef9f2..629966b959 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -105,6 +105,31 @@ jobs:
}
],
},
+ cpu: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "CPU Backend",
+ searchIn: "title"
+ },
+ {
+ term: "x86",
+ searchIn: "title"
+ },
+ {
+ term: "ARM",
+ searchIn: "title"
+ },
+ {
+ term: "Apple Silicon",
+ searchIn: "title"
+ },
+ {
+ term: "IBM Z",
+ searchIn: "title"
+ },
+ ],
+ },
// Add more label configurations here as needed
// example: {
// keywords: [...],
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index a183033c9a..e80a5c0cc8 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index e21d13b816..1041653c2f 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,8 +16,8 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+ - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index dca3089f49..44bf71db5e 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:
jobs:
close-issues-and-pull-requests:
+ # Prevents triggering on forks or other repos
+ if: github.repository == 'vllm-project/vllm'
permissions:
issues: write
pull-requests: write
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+ - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 7cda864786..864542128c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -227,3 +227,8 @@ ep_kernels_workspace/
# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/
+
+# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
+vllm/grpc/vllm_engine_pb2.py
+vllm/grpc/vllm_engine_pb2_grpc.py
+vllm/grpc/vllm_engine_pb2.pyi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4cf51d17e..ec67ee8c3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
#
# Try to find python package with an executable that exactly matches
@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
- NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+ Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
@@ -282,6 +282,7 @@ endif()
set(VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/cache_kernels.cu"
+ "csrc/cache_kernels_fused.cu"
"csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
@@ -354,9 +355,22 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
- if (MARLIN_ARCHS)
+
+ # marlin arches for fp16 output
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # marlin has limited support for turing
+ cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
+ # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+ cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+ # marlin arches for other files
+ cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
+
+ if (MARLIN_OTHER_ARCHS)
#
# For the Marlin kernels we automatically generate sources for various
@@ -365,16 +379,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+ PYTHONPATH=$ENV{PYTHONPATH}
+ ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -387,40 +403,76 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
- set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
- CACHE STRING "Last run Marlin generate script hash" FORCE)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()
- file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
- set_gencode_flags_for_srcs(
- SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
- CUDA_ARCHS "${MARLIN_ARCHS}")
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
- set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
- PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ if (MARLIN_ARCHS)
+ file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
+ file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
endif()
- list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+ if (MARLIN_SM75_ARCHS)
+ file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
+ endif()
+
+ if (MARLIN_FP8_ARCHS)
+ file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+ endif()
set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
- CUDA_ARCHS "${MARLIN_ARCHS}")
+ CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
- set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ set_source_files_properties(${MARLIN_SRCS}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
- message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+ message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
" in CUDA target architectures")
@@ -604,12 +656,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
- "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+ "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+ "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+ "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -745,24 +800,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
- set_gencode_flags_for_srcs(
- SRCS "${SRCS}"
- CUDA_ARCHS "${SCALED_MM_ARCHS}")
- list(APPEND VLLM_EXT_SRC "${SRCS}")
- list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
- message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
- else()
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
- "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
- "if you intend on running FP8 quantized MoE models on Blackwell.")
- else()
- message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
- "in CUDA target architectures")
- endif()
- endif()
#
# Machete kernels
@@ -786,7 +823,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@@ -838,7 +875,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+ )
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -908,7 +948,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
- "csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -938,9 +977,20 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
- if (MARLIN_MOE_ARCHS)
+ # moe marlin arches
+ # note that we always set `use_atomic_add=False` for moe marlin now,
+ # so we don't need 9.0 for bf16 atomicAdd PTX
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin has limited support for turing
+ cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
+ # moe marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+ # moe marlin arches for other files
+ cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
+ if (MARLIN_MOE_OTHER_ARCHS)
#
# For the Marlin MOE kernels we automatically generate sources for various
@@ -949,16 +999,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+ PYTHONPATH=$ENV{PYTHONPATH}
+ ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -971,7 +1023,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
- set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
@@ -979,18 +1031,53 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
- file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+ if (MARLIN_MOE_ARCHS)
+ file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_MOE_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_MOE_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
+ endif()
+
+ if (MARLIN_MOE_SM75_ARCHS)
+ file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_MOE_SM75_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_MOE_SM75_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
+ endif()
+
+ if (MARLIN_MOE_FP8_ARCHS)
+ file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_MOE_FP8_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+ endif()
+
+ set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs(
- SRCS "${MOE_WNAA16_MARLIN_SRC}"
- CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+ SRCS "${MARLIN_MOE_OTHER_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
- set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+ set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
+ list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
- list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
-
- message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+ message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures")
diff --git a/README.md b/README.md
index 033e1035d8..705fbcb915 100644
--- a/README.md
+++ b/README.md
@@ -14,50 +14,8 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
----
-Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
-
----
-
-*Latest News* 🔥
-
-- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
-- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
-- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
-- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
-- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
-- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
-- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
-- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
-- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
-
-
-Previous News
-
-- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
-- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
-- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
-- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
-- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
-- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
-- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
-- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
-- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
-- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
-- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
-- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
-- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
-- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
-- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
-- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
-- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
-
-
+🔥 We have built a vllm website to help you get started with vllm. Please visit [vllm.ai](https://vllm.ai) to learn more.
+For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
---
@@ -117,47 +75,6 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
-## Sponsors
-
-vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
-
-
-
-Cash Donations:
-
-- a16z
-- Dropbox
-- Sequoia Capital
-- Skywork AI
-- ZhenFund
-
-Compute Resources:
-
-- Alibaba Cloud
-- AMD
-- Anyscale
-- AWS
-- Crusoe Cloud
-- Databricks
-- DeepInfra
-- Google Cloud
-- Intel
-- Lambda Lab
-- Nebius
-- Novita AI
-- NVIDIA
-- Replicate
-- Roblox
-- RunPod
-- Trainy
-- UC Berkeley
-- UC San Diego
-- Volcengine
-
-Slack Sponsor: Anyscale
-
-We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
-
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
@@ -178,7 +95,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
-- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+- For collaborations and partnerships, please contact us at [collaboration@vllm.ai](mailto:collaboration@vllm.ai)
## Media Kit
diff --git a/RELEASE.md b/RELEASE.md
index db0d51afc7..dfd4fa1ae0 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,47 +1,30 @@
# Releasing vLLM
-vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
+vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via [PyPI](https://pypi.org/project/vllm). These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
-## Release Versioning
+## Release Cadence and Versioning
-vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
+We aim to have a regular release every 2 weeks. Since v0.12.0, regular releases increment the minor version rather than patch version. The list of past releases can be found [here](https://vllm.ai/releases).
-* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
-* _minor_ major features
-* _patch_ features and backwards-compatible bug fixes
-* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
+Our version numbers are expressed in the form `vX.Y.Z`, where `X` is the major version, `Y` is the minor version, and `Z` is the patch version. They are incremented according to the following rules:
-## Release Cadence
+* _Major_ releases are reserved for architectural milestones involving sweeping API changes, similar to PyTorch 2.0.
+* _Minor_ releases correspond to regular releases, which include new features, bug fixes and other backwards-compatible changes.
+* _Patch_ releases correspond to special releases for new models, as well as emergency patches for critical performance, functionality and security issues.
-Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
-Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
+This versioning scheme is similar to [SemVer](https://semver.org/) for compatibility purposes, except that backwards compatibility is only guaranteed for a limited number of minor releases (see our [deprecation policy](https://docs.vllm.ai/en/latest/contributing/deprecation_policy) for details).
-| Release Date | Patch release versions | Post Release versions |
-| --- | --- | --- |
-| Jan 2025 | 0.7.0 | --- |
-| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
-| Mar 2025 | 0.7.4, 0.7.5 | --- |
-| Apr 2025 | 0.7.6, 0.7.7 | --- |
-| May 2025 | 0.7.8, 0.7.9 | --- |
-| Jun 2025 | 0.7.10, 0.7.11 | --- |
-| Jul 2025 | 0.7.12, 0.7.13 | --- |
-| Aug 2025 | 0.7.14, 0.7.15 | --- |
-| Sep 2025 | 0.7.16, 0.7.17 | --- |
-| Oct 2025 | 0.7.18, 0.7.19 | --- |
-| Nov 2025 | 0.7.20, 0.7.21 | --- |
-| Dec 2025 | 0.7.22, 0.7.23 | --- |
-
-## Release branch
+## Release Branch
Each release is built from a dedicated release branch.
-* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
-* For post releases, previously cut release branch is reused
-* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
-* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
-* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
+* For _major_ and _minor_ releases, the release branch cut is performed 1-2 days before release is live.
+* For _patch_ releases, previously cut release branch is reused.
+* Release builds are triggered via push to RC tag like `vX.Y.Z-rc1`. This enables us to build and test multiple RCs for each release.
+* Final tag: `vX.Y.Z` does not trigger the build but used for Release notes and assets.
+* After branch cut is created, we monitor the main branch for any reverts and apply these reverts to a release branch.
-## Release Cherry-Pick Criteria
+### Cherry-Pick Criteria
After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index d1bdb4c43f..9a9600e08d 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 56b721cbb4..a245e2022e 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+ echo "Error: Failed to determine hostname." >&2
+ exit 1
+fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
"$MODEL"
"--disable-log-requests"
"--port" "8004"
+ "--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -96,8 +102,9 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
- vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+ local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+ VLLM_SERVER_DEV_MODE=1 \
+ vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
@@ -112,7 +119,7 @@ start_server() {
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
- RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+ RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
@@ -172,6 +179,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -187,7 +195,7 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
- curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+ curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
@@ -203,6 +211,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4021fede72..831b76b66e 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.transformers_utils.tokenizer import MistralTokenizer
+ from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py
index b5c16c42de..7473a41e51 100755
--- a/benchmarks/benchmark_batch_invariance.py
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant(
random.seed(seed)
# Set environment variables
- os.environ["VLLM_ATTENTION_BACKEND"] = backend
if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1"
else:
@@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant(
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=tp_size,
+ attention_config={"backend": backend},
enable_prefix_caching=False,
)
init_time = time.perf_counter() - start_init
diff --git a/benchmarks/benchmark_hash.py b/benchmarks/benchmark_hash.py
new file mode 100644
index 0000000000..08cdc012d6
--- /dev/null
+++ b/benchmarks/benchmark_hash.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+ (32-byte bytes object, 32-int tuple)
+
+Usage:
+ python benchmarks/hash_micro_benchmark.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+ """Generate a deterministic test payload."""
+ random.seed(seed)
+ bytes_data = bytes(random.getrandbits(8) for _ in range(32))
+ int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
+ return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+ """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
+ times: list[float] = []
+
+ # Warm-up to avoid first-run noise.
+ for _ in range(200):
+ func(data)
+
+ for _ in range(iterations):
+ start = time.perf_counter()
+ func(data)
+ end = time.perf_counter()
+ times.append(end - start)
+
+ avg = statistics.mean(times)
+ std = statistics.stdev(times) if len(times) > 1 else 0.0
+ return avg, std
+
+
+def _run_benchmarks(
+ benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+ data: tuple,
+ iterations: int,
+):
+ """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
+ for name, func in benchmarks:
+ try:
+ avg, std = _benchmark_func(func, data, iterations)
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {name}: {exc}")
+ continue
+ yield name, avg, std
+
+
+def builtin_hash(data: tuple) -> int:
+ """Wrapper for Python's built-in hash()."""
+ return hash(data)
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--iterations",
+ type=int,
+ default=10_000,
+ help="Number of measured iterations per hash function.",
+ )
+ parser.add_argument(
+ "--seed", type=int, default=42, help="Random seed for test payload."
+ )
+ args = parser.parse_args()
+
+ data = _generate_test_data(args.seed)
+ benchmarks = (
+ ("SHA256 (pickle)", sha256),
+ ("xxHash (pickle)", xxhash),
+ ("built-in hash()", builtin_hash),
+ )
+
+ print("=" * 60)
+ print("HASH FUNCTION MICRO BENCHMARK")
+ print("=" * 60)
+ print("Test data: (32-byte bytes object, 32-int tuple)")
+ print(f"Iterations: {args.iterations:,}")
+ print("=" * 60)
+
+ results = list(_run_benchmarks(benchmarks, data, args.iterations))
+ builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+ print("\nResults:")
+ for name, avg, std in results:
+ print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+ if builtin_entry:
+ _, builtin_avg, _ = builtin_entry
+ print("\n" + "=" * 60)
+ print("SUMMARY (relative to built-in hash())")
+ print("=" * 60)
+ for name, avg, _ in results:
+ if name == "built-in hash()":
+ continue
+ speed_ratio = avg / builtin_avg
+ print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+ else:
+ print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index dedb564fff..57a6c1aef5 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config = ModelConfig(
model="facebook/opt-125m",
- task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
- seed=None,
+ seed=0,
trust_remote_code=False,
)
proposer = NgramProposer(
@@ -108,7 +107,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
@@ -133,7 +135,6 @@ def benchmark_batched_propose(args):
block_sizes=[16],
)
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
- dummy_input_batch.spec_decode_unsupported_reqs = ()
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
dummy_input_batch.token_ids_cpu = np.random.randint(
0, 20, (args.num_req, args.num_token)
@@ -149,10 +150,8 @@ def benchmark_batched_propose(args):
start = time.time()
runner.drafter.propose(
sampled_token_ids,
- dummy_input_batch.req_ids,
dummy_input_batch.num_tokens_no_spec,
dummy_input_batch.token_ids_cpu,
- dummy_input_batch.spec_decode_unsupported_reqs,
)
end = time.time()
print(f"Iteration time (s): {end - start}")
diff --git a/benchmarks/benchmark_prefix_block_hash.py b/benchmarks/benchmark_prefix_block_hash.py
new file mode 100644
index 0000000000..8bcd8af0d3
--- /dev/null
+++ b/benchmarks/benchmark_prefix_block_hash.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+ python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+ num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+ rng = random.Random(seed)
+ return [
+ [rng.randrange(vocab_size) for _ in range(block_size)]
+ for _ in range(num_blocks)
+ ]
+
+
+def _hash_all_blocks(
+ hash_fn: Callable[[object], bytes],
+ blocks: Iterable[Sequence[int]],
+) -> float:
+ parent_hash: BlockHash | None = None
+ start = time.perf_counter()
+ for block in blocks:
+ parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+ end = time.perf_counter()
+ return end - start
+
+
+def _benchmark(
+ hash_algo: str,
+ blocks: list[list[int]],
+ trials: int,
+) -> tuple[float, float, float] | None:
+ try:
+ hash_fn = get_hash_fn_by_name(hash_algo)
+ init_none_hash(hash_fn)
+ timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+ return None
+
+ avg = statistics.mean(timings)
+ best = min(timings)
+ # throughput: tokens / second
+ tokens_hashed = len(blocks) * len(blocks[0])
+ throughput = tokens_hashed / best
+ return avg, best, throughput
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+ parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+ parser.add_argument(
+ "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+ parser.add_argument(
+ "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+ )
+ parser.add_argument(
+ "--algorithms",
+ nargs="+",
+ default=SUPPORTED_ALGOS,
+ choices=SUPPORTED_ALGOS,
+ help="Hash algorithms to benchmark.",
+ )
+ args = parser.parse_args()
+
+ blocks = _generate_blocks(
+ args.num_blocks, args.block_size, args.vocab_size, args.seed
+ )
+ print(
+ f"Benchmarking {len(args.algorithms)} algorithms on "
+ f"{args.num_blocks} blocks (block size={args.block_size})."
+ )
+
+ for algo in args.algorithms:
+ result = _benchmark(algo, blocks, args.trials)
+ if result is None:
+ continue
+
+ avg, best, throughput = result
+ print(
+ f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
+ f"throughput: {throughput / 1e6:.2f}M tokens/s"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 28fc383a31..e6391134ff 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 55001cf372..33aca83188 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@
from transformers import PreTrainedTokenizerBase
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
@@ -574,7 +574,7 @@ async def limited_request_func(request_func_input, pbar):
)
print(
"{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
+ "Total token throughput (tok/s):", metrics.total_token_throughput
)
)
@@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use vLLM Profiling. --profiler-config must be provided on the server.",
)
parser.add_argument(
"--result-dir",
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 67fccdf4fd..7720f15e45 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -343,7 +343,9 @@ def bench(
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
- raise ValueError("unsupported type")
+ raise ValueError(
+ f"Unsupported dtype {dtype}: should be one of torch.int8, torch.float8_e4m3fn."
+ )
# runner
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index d809bf1db8..fb3329975c 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@
import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ per_token_group_quant_fp8,
+)
@dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
hidden_size: int
add_residual: bool
dtype: torch.dtype
+ group_size: list[int]
def description(self):
return (
@@ -29,6 +33,7 @@ def description(self):
f"x D {self.hidden_size} "
f"x R {self.add_residual} "
f"x DT {self.dtype}"
+ f"x GS {self.group_size}"
)
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
HIDDEN_SIZES = list(range(1024, 8129, 1024))
ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float]
+ GROUP_SIZES = [[1, 64], [1, 128]]
- combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
bench_params = list(
- map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+ map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
)
return bench_params
@@ -52,6 +58,7 @@ def unfused_int8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
torch_out, _ = ops.scaled_fp8_quant(torch_out)
+def unfused_groupwise_fp8_impl(
+ rms_norm_layer: RMSNorm,
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = per_token_group_quant_fp8(
+ torch_out, group_size=group_size[1], use_ue8m0=False
+ )
+
+
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
)
+def fused_groupwise_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ out, _ = ops.rms_norm_per_block_quant(
+ x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ group_size,
+ residual=residual,
+ is_scale_transposed=True,
+ )
+
+
# Bench functions
def bench_fn(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: torch.Tensor,
quant_dtype: torch.dtype,
+ group_size: list[int],
label: str,
sub_label: str,
fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
"x": x,
"residual": residual,
"quant_dtype": quant_dtype,
+ "group_size": group_size,
"fn": fn,
}
return TBenchmark.Timer(
- stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
globals=globals,
label=label,
sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
)
)
+ # unfused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_groupwise_fp8_impl,
+ "unfused_groupwise_fp8_impl",
+ )
+ )
+
+ # fused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ fused_groupwise_impl,
+ "fused_groupwise_fp8_impl",
+ )
+ )
+
print_timers(timers)
return timers
diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py
new file mode 100644
index 0000000000..7517376535
--- /dev/null
+++ b/benchmarks/kernels/bench_nvfp4_quant.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+
+import torch
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+from vllm.triton_utils import triton
+from vllm.utils.flashinfer import flashinfer_fp4_quantize
+
+if not current_platform.has_device_capability(100):
+ raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+PROVIDER_CFGS = {
+ "vllm": dict(backend="vllm", enabled=True),
+ "flashinfer": dict(backend="flashinfer", enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
+ """Compute global scale for FP4 quantization."""
+ amax = torch.abs(tensor).max().to(torch.float32)
+ return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
+ x_log=False,
+ line_arg="provider",
+ line_vals=_enabled,
+ line_names=_enabled,
+ ylabel="us (lower is better)",
+ plot_name="NVFP4 Input Quantization Latency (us)",
+ args={},
+ )
+)
+def benchmark(batch_size, provider, N, K):
+ M = batch_size
+ device = "cuda"
+ dtype = torch.bfloat16
+
+ # Create input tensor
+ a = torch.randn((M, K), device=device, dtype=dtype)
+
+ # Compute global scale for activation
+ a_global_scale = compute_global_scale(a)
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ cfg = PROVIDER_CFGS[provider]
+
+ if cfg["backend"] == "vllm":
+ # vLLM's FP4 quantization
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: ops.scaled_fp4_quant(a, a_global_scale),
+ quantiles=quantiles,
+ )
+ elif cfg["backend"] == "flashinfer":
+ # FlashInfer's FP4 quantization
+ # Use is_sf_swizzled_layout=True to match vLLM's output format
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: flashinfer_fp4_quantize(
+ a, a_global_scale, is_sf_swizzled_layout=True
+ ),
+ quantiles=quantiles,
+ )
+
+ # Convert ms to us for better readability at small batch sizes
+ to_us = lambda t_ms: t_ms * 1000
+ return to_us(ms), to_us(max_ms), to_us(min_ms)
+
+
+def prepare_shapes(args):
+ out = []
+ for model, tp_size in itertools.product(args.models, args.tp_sizes):
+ for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+ KN[tp_dim] //= tp_size
+ KN.append(model)
+ out.append(KN)
+ return out
+
+
+def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
+ """Test accuracy between vLLM and FlashInfer FP4 quantization."""
+ # Create input tensor
+ a = torch.randn((M, K), device=device, dtype=dtype)
+
+ # Compute global scale
+ a_global_scale = compute_global_scale(a)
+
+ # vLLM quantization
+ vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
+
+ # FlashInfer quantization (with swizzled layout to match vLLM's output)
+ flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
+ a, a_global_scale, is_sf_swizzled_layout=True
+ )
+ flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
+
+ # Compare outputs
+ torch.testing.assert_close(
+ vllm_fp4,
+ flashinfer_fp4,
+ )
+ print(f"M={M}, K={K}, dtype={dtype}: PASSED")
+
+
+def test_accuracy():
+ """Run accuracy tests across various shapes."""
+ print("\n" + "=" * 60)
+ print("Running accuracy tests: vLLM vs FlashInfer")
+ print("=" * 60)
+
+ device = "cuda"
+ dtype = torch.bfloat16
+
+ # Test various batch sizes and hidden dimensions
+ Ms = [1, 1024]
+ Ks = [4096]
+
+ for M in Ms:
+ for K in Ks:
+ _test_accuracy_once(M, K, dtype, device)
+
+ print("\nAll accuracy tests passed!")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
+ )
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=["meta-llama/Llama-3.1-8B-Instruct"],
+ choices=list(WEIGHT_SHAPES.keys()),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default=None,
+ help="Path to save benchmark results",
+ )
+ parser.add_argument(
+ "--accuracy",
+ action="store_true",
+ help="Run accuracy tests",
+ )
+ args = parser.parse_args()
+
+ if args.accuracy:
+ test_accuracy()
+
+ for K, N, model in prepare_shapes(args):
+ print(f"\n{model}, N={N} K={K}")
+ benchmark.run(
+ print_data=True,
+ save_path=args.save_path,
+ N=N,
+ K=K,
+ )
+
+ print("\nBenchmark finished!")
diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
new file mode 100644
index 0000000000..04921dafbd
--- /dev/null
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ _per_token_group_quant_fp8_colmajor,
+ silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+ print(
+ f"Note : The timings reported above is for {cuda_graph_nops} "
+ "consecutive invocations of the benchmarking functions. "
+ f"Please divide by {cuda_graph_nops} for single invocation "
+ "timings."
+ )
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+class ImplType(Enum):
+ SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+ REFERENCE = 2
+
+ def get_impl(self):
+ if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return silu_mul_per_token_group_quant_fp8_colmajor
+ elif self == ImplType.REFERENCE:
+ return reference
+ raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+ input: torch.Tensor
+ output: torch.Tensor
+
+ # Reference act output tensor
+ ref_act_out: torch.Tensor
+ ref_quant_out: torch.Tensor
+
+ @staticmethod
+ def make(T: int, N: int) -> "BenchmarkTensors":
+ assert T % GROUP_SIZE == 0
+ assert N % (GROUP_SIZE * 2) == 0
+
+ input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+ # silu_mul_per_token_group_quant_fp8_colmajor output.
+ output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
+ FLOAT8_T
+ )
+
+ # reference output.
+ ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+ ref_quant_out = torch.empty(
+ (T, N // 2), dtype=torch.bfloat16, device="cuda"
+ ).to(FLOAT8_T)
+
+ return BenchmarkTensors(
+ input=input,
+ output=output,
+ ref_act_out=ref_act_out,
+ ref_quant_out=ref_quant_out,
+ )
+
+ @property
+ def T(self):
+ return self.input.size(0)
+
+ @property
+ def N(self):
+ return self.input.size(1)
+
+ def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+ if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return {
+ "input": self.input,
+ "output": self.output,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ elif impl_type == ImplType.REFERENCE:
+ return {
+ "input": self.input,
+ "act_out": self.ref_act_out,
+ "quant_out": self.ref_quant_out,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+ """
+ Reference triton quant kernel from,
+ vllm.model_executor.layers.quantization.utils.fp8_utils
+ """
+ assert quant_out.size() == x.size()
+ # Allocate the scale tensor column-major format.
+ shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+ x_q = quant_out
+ x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+ M = x.numel() // GROUP_SIZE
+ N = GROUP_SIZE
+ BLOCK = triton.next_power_of_2(N)
+ # heuristics for number of warps
+ num_warps = min(max(BLOCK // 256, 1), 8)
+ num_stages = 1
+
+ finfo = torch.finfo(FLOAT8_T)
+ fp8_min = finfo.min
+ fp8_max = finfo.max
+
+ _per_token_group_quant_fp8_colmajor[(M,)](
+ x,
+ x_q,
+ x_s,
+ GROUP_SIZE,
+ x.shape[1],
+ x.stride(0),
+ x_s.stride(1),
+ eps=1e-10,
+ fp8_min=fp8_min,
+ fp8_max=fp8_max,
+ use_ue8m0=use_ue8m0,
+ BLOCK=BLOCK,
+ num_warps=num_warps,
+ num_stages=num_stages,
+ )
+ return x_q, x_s
+
+
+def reference(
+ input: torch.Tensor,
+ act_out: torch.Tensor,
+ quant_out: torch.Tensor,
+ use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch.ops._C.silu_and_mul(act_out, input)
+ return reference_quant(act_out, quant_out, use_ue8m0)
+
+
+def bench_impl(
+ bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+ T = bench_tensors[0].T
+ N = bench_tensors[0].N
+
+ arg_pool_size = len(bench_tensors)
+ kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+
+ # warmup
+ for kwargs in kwargs_list:
+ impl_type.get_impl()(**kwargs)
+ torch.cuda.synchronize()
+
+ # Merge into a single kwargs and qualify arguments as ArgPool
+ kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+ for _kwargs in kwargs_list:
+ for k, v in _kwargs.items():
+ kwargs[k].values.append(v)
+
+ cuda_graph_params = None
+ cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+ timer = None
+ with Bench(
+ cuda_graph_params,
+ "silu-mul-quant",
+ f"num_tokens={T}, N={N}",
+ impl_type.name,
+ impl_type.get_impl(),
+ **kwargs,
+ ) as bench:
+ timer = bench.run()
+ return timer
+
+
+def test_correctness(T: int, N: int):
+ print(f"Testing num_tokens={T}, N={N} ...")
+
+ bench_tensor = BenchmarkTensors.make(T, N)
+
+ def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
+ return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
+
+ # reference output
+ ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
+
+ # test ouptut
+ out_q, out_s = output_from_impl(
+ ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+
+ torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+ torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+ timers = []
+ for N, T in product(Ns, Ts):
+ test_correctness(T, N)
+
+ bench_tensors: list[BenchmarkTensors] = [
+ BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
+ ]
+
+ silu_mul_quant_timer = bench_impl(
+ bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+ timers.append(silu_mul_quant_timer)
+ reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
+ timers.append(reference_timer)
+
+ print_timers(
+ [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
+ )
+
+ print_timers(timers, cuda_graph_nops=arg_pool_size)
+
+ return timers
+
+
+if __name__ == "__main__":
+ T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+ N = [2048, 4096, 8192]
+
+ print(f"T = {T}, N = {N}")
+ run(T, N, arg_pool_size=8)
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
index 66268b71b3..fbe5f74414 100644
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -8,13 +8,12 @@
import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
-from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+batch_size_range = [1, 16, 128]
+seq_len_range = [1, 16, 64, 1024, 4096]
intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
@@ -30,7 +29,7 @@ def benchmark_activation(
device = "cuda"
num_tokens = batch_size * seq_len
dim = intermediate_size
- current_platform.seed_everything(42)
+ set_random_seed(42)
torch.set_default_device(device)
if func_name == "gelu_and_mul":
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index e07d6c776b..9c6edee7b2 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -6,15 +6,19 @@
but use different quantization strategies and backends.
"""
-import nvtx
import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+ MoEPrepareAndFinalizeNoEP,
+)
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.worker.workspace import init_workspace_manager
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
@@ -58,6 +62,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
+ init_workspace_manager(torch.cuda.current_device())
(m, k, n) = mkn
dtype = torch.half
@@ -120,85 +125,6 @@ def bench_run(
# Force per-tensor quantization for all cases
per_act_token = False
- # Create stride tensors for CUTLASS
- ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
- ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
- c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
- c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
-
- def run_triton_moe(
- a: torch.Tensor,
- w1: torch.Tensor,
- w2: torch.Tensor,
- topk_weights: torch.Tensor,
- topk_ids: torch.Tensor,
- w1_scale: torch.Tensor,
- w2_scale: torch.Tensor,
- a1_scale: torch.Tensor,
- a2_scale: torch.Tensor,
- num_repeats: int,
- ):
- quant_config = fp8_w8a8_moe_quant_config(
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- per_act_token_quant=per_act_token,
- per_out_ch_quant=per_out_ch,
- )
-
- for _ in range(num_repeats):
- fused_experts(
- a,
- w1,
- w2,
- topk_weights,
- topk_ids,
- quant_config=quant_config,
- )
-
- def run_cutlass_moe_fp8(
- a: torch.Tensor,
- w1: torch.Tensor,
- w2: torch.Tensor,
- topk_weights: torch.Tensor,
- topk_ids: torch.Tensor,
- ab_strides1: torch.Tensor,
- ab_strides2: torch.Tensor,
- c_strides1: torch.Tensor,
- c_strides2: torch.Tensor,
- w1_scale: torch.Tensor,
- w2_scale: torch.Tensor,
- a1_scale: torch.Tensor,
- a2_scale: torch.Tensor,
- num_repeats: int,
- ):
- quant_config = fp8_w8a8_moe_quant_config(
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- per_act_token_quant=per_act_token,
- per_out_ch_quant=per_out_ch,
- )
-
- for _ in range(num_repeats):
- with nvtx.annotate("cutlass_moe_fp8", color="blue"):
- cutlass_moe_fp8(
- a=a,
- w1_q=w1,
- w2_q=w2,
- topk_weights=topk_weights,
- topk_ids=topk_ids,
- ab_strides1=ab_strides1,
- ab_strides2=ab_strides2,
- c_strides1=c_strides1,
- c_strides2=c_strides2,
- quant_config=quant_config,
- activation="silu",
- global_num_experts=num_experts,
- )
-
# Pre-create quantization config to avoid creating it inside CUDA graph
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
@@ -209,23 +135,30 @@ def run_cutlass_moe_fp8(
per_out_ch_quant=per_out_ch,
)
+ fn = mk.FusedMoEModularKernel(
+ MoEPrepareAndFinalizeNoEP(),
+ CutlassExpertsFp8(
+ out_dtype=a.dtype,
+ e=num_experts,
+ n=n,
+ k=k,
+ quant_config=quant_config,
+ device=w1.device,
+ ),
+ )
+
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
- cutlass_moe_fp8(
- a=a,
- w1_q=w1_fp8q_cutlass,
- w2_q=w2_fp8q_cutlass,
- topk_weights=topk_weights,
- topk_ids=topk_ids,
- ab_strides1=ab_strides1,
- ab_strides2=ab_strides2,
- c_strides1=c_strides1,
- c_strides2=c_strides2,
- quant_config=quant_config,
+ fn(
+ a,
+ w1_fp8q_cutlass,
+ w2_fp8q_cutlass,
+ topk_weights,
+ topk_ids,
activation="silu",
global_num_experts=num_experts,
)
@@ -297,6 +230,10 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
def main(args):
+ # Initialize workspace manager (required for CUTLASS MoE kernels)
+ device = torch.device("cuda:0")
+ init_workspace_manager(device)
+
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
similarity index 92%
rename from benchmarks/kernels/benchmark_cutlass_fp4_moe.py
rename to benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
index 7982cbb142..10a3e3eab5 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -11,16 +11,23 @@
import torch
import torch.utils.benchmark as benchmark
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config,
)
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+ CutlassExpertsFp4,
+)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+ MoEPrepareAndFinalizeNoEP,
+)
from vllm.scalar_type import scalar_types
from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.worker.workspace import init_workspace_manager
WEIGHT_SHAPES_MOE = {
"nvidia/DeepSeek-R1-FP4": [
@@ -187,19 +194,24 @@ def run_cutlass_moe_fp4(
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
+
+ kernel = mk.FusedMoEModularKernel(
+ MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+ CutlassExpertsFp4(
+ out_dtype=dtype,
+ max_experts_per_worker=e,
+ quant_config=quant_config,
+ ),
+ )
+
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
- cutlass_moe_fp4(
- a=a,
- w1_fp4=w1_fp4,
- w2_fp4=w2_fp4,
+ kernel(
+ hidden_states=a,
+ w1=w1_fp4,
+ w2=w2_fp4,
topk_weights=topk_weights,
topk_ids=topk_ids,
- m=m,
- n=n,
- k=k,
- e=num_experts,
- quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -229,20 +241,24 @@ def run_cutlass_from_graph(
g2_alphas=w2_gs,
)
+ kernel = mk.FusedMoEModularKernel(
+ MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+ CutlassExpertsFp4(
+ out_dtype=dtype,
+ max_experts_per_worker=e,
+ quant_config=quant_config,
+ ),
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
- return cutlass_moe_fp4(
- a=a,
- w1_fp4=w1_fp4,
- w2_fp4=w2_fp4,
+ return kernel(
+ hidden_states=a,
+ w1=w1_fp4,
+ w2=w2_fp4,
topk_weights=topk_weights,
topk_ids=topk_ids,
- m=m,
- n=n,
- k=k,
- e=num_experts,
- quant_config=quant_config,
)
def run_triton_from_graph(
@@ -441,6 +457,10 @@ def replay_graph(graph, num_repeats):
def main(args):
+ # Initialize workspace manager (required for CUTLASS MoE kernels)
+ device = torch.device("cuda:0")
+ init_workspace_manager(device)
+
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
index b414efa6e3..7b453fe7b6 100644
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -293,7 +293,7 @@ def benchmark_allreduce_single(
graph = torch.cuda.CUDAGraph()
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
- with torch.cuda.graph(graph, pool=graph_pool):
+ with torch.cuda.graph(graph, pool=graph_pool, stream=stream):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 9b426d8d5f..b30a126387 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -5,15 +5,20 @@
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
fused_topk,
)
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+ MoEPrepareAndFinalizeNoEP,
+)
from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.worker.workspace import init_workspace_manager
DEFAULT_MODELS = [
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -44,6 +49,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
+ init_workspace_manager(torch.cuda.current_device())
label = "Quant Matmul"
sub_label = (
@@ -81,11 +87,6 @@ def bench_run(
a, score, topk, renormalize=False
)
- ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
- ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
- c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
- c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
@@ -119,10 +120,6 @@ def run_cutlass_moe(
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
- ab_strides1: torch.Tensor,
- ab_strides2: torch.Tensor,
- c_strides1: torch.Tensor,
- c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
per_act_token: bool,
@@ -134,31 +131,29 @@ def run_cutlass_moe(
per_act_token_quant=per_act_token,
)
- for _ in range(num_repeats):
- cutlass_moe_fp8(
- a,
- w1,
- w2,
- topk_weights,
- topk_ids,
- ab_strides1,
- ab_strides2,
- c_strides1,
- c_strides2,
+ fn = mk.FusedMoEModularKernel(
+ MoEPrepareAndFinalizeNoEP(),
+ CutlassExpertsFp8(
+ out_dtype=a.dtype,
+ # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+ e=w2.shape[0],
+ n=w2.shape[2],
+ k=w2.shape[1],
quant_config=quant_config,
- )
+ device=w1.device,
+ ),
+ )
+
+ for _ in range(num_repeats):
+ fn(a, w1, w2, topk_weights, topk_ids)
def run_cutlass_from_graph(
a: torch.Tensor,
a_scale: torch.Tensor,
- w1_q: torch.Tensor,
- w2_q: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
- ab_strides1: torch.Tensor,
- ab_strides2: torch.Tensor,
- c_strides1: torch.Tensor,
- c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
@@ -168,21 +163,23 @@ def run_cutlass_from_graph(
per_act_token_quant=per_act_token,
)
+ fn = mk.FusedMoEModularKernel(
+ MoEPrepareAndFinalizeNoEP(),
+ CutlassExpertsFp8(
+ out_dtype=a.dtype,
+ # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+ e=w2.shape[0],
+ n=w2.shape[2],
+ k=w2.shape[1],
+ quant_config=quant_config,
+ device=w1.device,
+ ),
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
- return cutlass_moe_fp8(
- a,
- w1_q,
- w2_q,
- topk_weights,
- topk_ids,
- ab_strides1,
- ab_strides2,
- c_strides1,
- c_strides2,
- quant_config=quant_config,
- )
+ return fn(a, w1, w2, topk_weights, topk_ids)
def run_triton_from_graph(
a: torch.Tensor,
@@ -226,10 +223,6 @@ def replay_graph(graph, num_repeats):
w2_q,
w1_scale,
w2_scale,
- ab_strides1,
- ab_strides2,
- c_strides1,
- c_strides2,
topk_weights,
topk_ids,
)
@@ -267,10 +260,6 @@ def replay_graph(graph, num_repeats):
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"per_act_token": per_act_token,
- "ab_strides1": ab_strides1,
- "ab_strides2": ab_strides2,
- "c_strides1": c_strides1,
- "c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@@ -329,10 +318,6 @@ def replay_graph(graph, num_repeats):
w2_q,
w1_scale,
w2_scale,
- ab_strides1,
- ab_strides2,
- c_strides1,
- c_strides2,
topk_weights,
topk_ids,
per_act_token,
@@ -341,7 +326,7 @@ def replay_graph(graph, num_repeats):
results.append(
benchmark.Timer(
- stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
+ stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -364,6 +349,10 @@ def replay_graph(graph, num_repeats):
def main(args):
+ # Initialize workspace manager (required for CUTLASS MoE kernels)
+ device = torch.device("cuda:0")
+ init_workspace_manager(device)
+
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 6fa5c24867..2292d2f872 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -6,9 +6,8 @@
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@torch.inference_mode()
@@ -22,7 +21,7 @@ def main(
num_warmup_iters: int = 5,
num_iters: int = 100,
) -> None:
- current_platform.seed_everything(seed)
+ set_random_seed(seed)
torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 8787724d77..ac78c019a5 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q,
b_bias=None,
b_scales=w_s,
+ a_scales=None,
global_scale=None,
b_zeros=w_zp,
g_idx=g_idx,
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 12ca9214b1..48d790aec9 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -263,7 +263,7 @@ def gen_allspark_params():
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -273,7 +273,7 @@ def gen_allspark_params():
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py
new file mode 100644
index 0000000000..fb3b6c8f12
--- /dev/null
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
+in MLA (Multi-head Latent Attention) prefill.
+
+This validates that the optimization from commit 8d4142bd is beneficial across
+various batch sizes, not just the originally tested batch size of 32768.
+"""
+
+import time
+from collections.abc import Callable
+
+import torch
+
+# DeepSeek-V3 MLA dimensions
+NUM_HEADS = 128
+QK_NOPE_HEAD_DIM = 128
+PE_DIM = 64
+
+
+def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Original torch.cat approach with expand."""
+ return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+
+def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Optimized direct copy approach (avoids expand + cat overhead)."""
+ k = torch.empty(
+ (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
+ dtype=k_nope.dtype,
+ device=k_nope.device,
+ )
+ k[..., : k_nope.shape[-1]] = k_nope
+ k[..., k_nope.shape[-1] :] = k_pe
+ return k
+
+
+def benchmark_method(
+ method: Callable,
+ k_nope: torch.Tensor,
+ k_pe: torch.Tensor,
+ num_warmup: int = 10,
+ num_iters: int = 100,
+) -> float:
+ """Benchmark a concatenation method and return mean latency in ms."""
+ # Warmup
+ for _ in range(num_warmup):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ start = time.perf_counter()
+ for _ in range(num_iters):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+ end = time.perf_counter()
+
+ return (end - start) / num_iters * 1000 # Convert to ms
+
+
+@torch.inference_mode()
+def run_benchmark(dtype: torch.dtype, dtype_name: str):
+ """Run benchmark for a specific dtype."""
+ torch.set_default_device("cuda")
+
+ # Batch sizes to test (powers of 2 from 32 to 65536)
+ batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
+
+ print("=" * 80)
+ print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
+ print("=" * 80)
+ print(
+ f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
+ f"k_pe=[B, 1, {PE_DIM}]"
+ )
+ print(f"dtype: {dtype_name}")
+ print()
+ print(
+ f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
+ f"{'Speedup':>8} | {'Reduction':>10}"
+ )
+ print("-" * 70)
+
+ results = []
+ for batch_size in batch_sizes:
+ # Create input tensors (generate in float32 then convert for FP8 compatibility)
+ k_nope = torch.randn(
+ batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+ k_pe = torch.randn(
+ batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+
+ # Benchmark both methods
+ cat_time = benchmark_method(cat_method, k_nope, k_pe)
+ direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
+
+ speedup = cat_time / direct_time
+ reduction = (1 - direct_time / cat_time) * 100
+
+ results.append((batch_size, cat_time, direct_time, speedup, reduction))
+
+ print(
+ f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
+ f"{speedup:>7.2f}x | {reduction:>9.1f}%"
+ )
+
+ print("=" * 80)
+
+ # Summary statistics
+ speedups = [r[3] for r in results]
+ print("\nSpeedup summary:")
+ print(f" Min: {min(speedups):.2f}x")
+ print(f" Max: {max(speedups):.2f}x")
+ print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
+
+ # Find crossover point
+ crossover_batch = None
+ for batch_size, _, _, speedup, _ in results:
+ if speedup >= 1.0:
+ crossover_batch = batch_size
+ break
+
+ print("\nConclusion:")
+ if crossover_batch:
+ print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
+ # Filter for large batches (>= 512 which is typical for prefill)
+ large_batch_speedups = [r[3] for r in results if r[0] >= 512]
+ if large_batch_speedups:
+ avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
+ print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
+ print(" - MLA prefill typically uses large batches, so optimization is effective")
+
+ return results
+
+
+@torch.inference_mode()
+def main():
+ # Test bfloat16
+ print("\n")
+ run_benchmark(torch.bfloat16, "bfloat16")
+
+ # Test float8_e4m3fn
+ print("\n")
+ run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index a1af0b8aec..b31cfd6116 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
+import gc
import json
import os
import time
@@ -23,9 +24,48 @@
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
FP8_DTYPE = current_platform.fp8_dtype()
+# Default interval for clearing Triton JIT cache during tuning
+# Set to 0 to disable automatic cache clearing
+_CACHE_CLEAR_INTERVAL_ENV = "VLLM_MOE_TUNE_CACHE_CLEAR_INTERVAL"
+TRITON_CACHE_CLEAR_INTERVAL = int(os.environ.get(_CACHE_CLEAR_INTERVAL_ENV, "50"))
+
+
+def clear_triton_cache():
+ """Clear Triton JIT compilation cache and Python/CUDA memory.
+
+ This helps prevent OOM during tuning with large models (many experts).
+ """
+ # Force Python garbage collection
+ gc.collect()
+
+ # Clear CUDA memory cache
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+ # Try to clear Triton's runtime cache
+ try:
+ if (
+ hasattr(triton, "runtime")
+ and hasattr(triton.runtime, "cache")
+ and hasattr(triton.runtime.cache, "clear")
+ ):
+ triton.runtime.cache.clear()
+ except ImportError:
+ # Triton not installed, skip cache clearing
+ pass
+ except AttributeError:
+ # Triton version doesn't have expected cache API
+ pass
+ except Exception as e:
+ print(f"Warning: Failed to clear Triton cache: {e}")
+
+ # Additional garbage collection after clearing caches
+ gc.collect()
+
def ensure_divisibility(numerator, denominator, text):
"""Ensure that numerator is divisible by the denominator."""
@@ -390,7 +430,7 @@ def merge_unique_dicts(list1, list2):
class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
- current_platform.seed_everything(seed)
+ set_random_seed(seed)
self.seed = seed
# Get the device ID to allocate tensors and kernels
# on the respective GPU. This is required for Ray to work
@@ -410,7 +450,7 @@ def benchmark(
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
- current_platform.seed_everything(self.seed)
+ set_random_seed(self.seed)
dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
@@ -483,7 +523,7 @@ def tune(
need_device_guard = True
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
- for config in tqdm(search_space):
+ for idx, config in enumerate(tqdm(search_space)):
try:
kernel_time = benchmark_config(
config,
@@ -506,6 +546,19 @@ def tune(
if kernel_time < best_time:
best_time = kernel_time
best_config = config
+
+ # Periodically clear Triton JIT cache to prevent OOM
+ # This is especially important for large models with many experts
+ if (
+ TRITON_CACHE_CLEAR_INTERVAL > 0
+ and idx > 0
+ and idx % TRITON_CACHE_CLEAR_INTERVAL == 0
+ ):
+ clear_triton_cache()
+
+ # Final cleanup after tuning completes
+ clear_triton_cache()
+
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index f540cff626..5f9a131f79 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+ itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
@triton.testing.perf_report(
triton.testing.Benchmark(
- x_names=["num_tokens", "num_experts", "topk"],
+ x_names=["num_tokens", "num_experts", "topk", "ep_size"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm"],
@@ -38,16 +41,26 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
args={},
)
)
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
"""Benchmark function for Triton."""
block_size = 256
+ torch.cuda.manual_seed_all(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+ e_map = None
+ if ep_size != 1:
+ local_e = num_experts // ep_size
+ e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+ e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+ e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+ lambda: moe_align_block_size(
+ topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+ ),
quantiles=quantiles,
)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index b8913a217c..77b77a15b5 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -18,6 +18,7 @@
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
FP8_DTYPE = current_platform.fp8_dtype()
@@ -261,7 +262,7 @@ def run(input: tuple):
class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
- current_platform.seed_everything(seed)
+ set_random_seed(seed)
self.seed = seed
# Get the device ID to allocate tensors and kernels
# on the respective GPU. This is required for Ray to work
@@ -279,7 +280,7 @@ def benchmark(
use_int8_w8a16: bool,
use_customized_permute: bool = False,
) -> tuple[dict[str, int], float]:
- current_platform.seed_everything(self.seed)
+ set_random_seed(self.seed)
permute_time = benchmark_permute(
num_tokens,
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 83bd919175..3e03651357 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -37,9 +37,9 @@
import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -94,12 +94,11 @@ def benchmark_mrope(
benchmark_iter: int = 100,
csv_writer=None,
):
- current_platform.seed_everything(seed)
+ set_random_seed(seed)
torch.set_default_device(device)
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
- rotary_dim=head_dim,
max_position=max_position,
is_neox_style=is_neox_style,
rope_parameters=rope_parameters,
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 46ab2a5fe5..be871d3d1a 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -13,6 +13,7 @@
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random,
+ set_random_seed,
)
logger = init_logger(__name__)
@@ -38,7 +39,7 @@ def main(
device: str = "cuda",
kv_cache_dtype: str | None = None,
) -> None:
- current_platform.seed_everything(seed)
+ set_random_seed(seed)
scale = float(1.0 / (head_size**0.5))
query = torch.empty(
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 3c2ac91289..9a21cfe94e 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -6,9 +6,8 @@
import torch
from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@torch.inference_mode()
@@ -23,7 +22,7 @@ def main(
num_warmup_iters: int = 5,
num_iters: int = 100,
) -> None:
- current_platform.seed_everything(seed)
+ set_random_seed(seed)
torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
index 0d3aef0c63..99067d8ac3 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -8,11 +8,11 @@
from vllm import _custom_ops as ops
from vllm.logger import init_logger
-from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random,
+ set_random_seed,
)
logger = init_logger(__name__)
@@ -36,7 +36,7 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
- current_platform.seed_everything(42)
+ set_random_seed(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index 12f17ea575..ef6be1f3c3 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -7,15 +7,15 @@
from tabulate import tabulate
from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
- triton_reshape_and_cache_flash,
-)
from vllm.logger import init_logger
-from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random_flash,
+ set_random_seed,
+)
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+ triton_reshape_and_cache_flash,
)
logger = init_logger(__name__)
@@ -49,7 +49,7 @@ def run_benchmark(
if implementation == "triton" and kv_cache_layout == "HND":
return float("nan") # Triton does not support HND layout yet.
- current_platform.seed_everything(42)
+ set_random_seed(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 074b7a440b..7a1bc050bb 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16
max_position = 8192
- base = 10000
- rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+ rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=device)
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index de01ff197e..da32bc30cb 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -23,9 +23,9 @@
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
-from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.torch_utils import set_random_seed
@triton.jit
@@ -207,7 +207,7 @@ def benchmark(
):
def generate_data(seed_offset=0):
"""Generate input data with given seed offset"""
- current_platform.seed_everything(42 + seed_offset)
+ set_random_seed(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "random_imbalanced":
diff --git a/benchmarks/kernels/cpu/benchmark_cpu_attn.py b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
new file mode 100644
index 0000000000..30b8603953
--- /dev/null
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+import time
+
+import numpy as np
+import torch
+
+from vllm._custom_ops import (
+ cpu_attention_with_kv_cache,
+ cpu_attn_get_scheduler_metadata,
+ cpu_attn_reshape_and_cache,
+)
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
+
+
+def get_attn_isa(
+ block_size: int | None = None,
+ dtype: torch.dtype | None = None,
+):
+ if block_size and dtype:
+ return _get_attn_isa(dtype, block_size)
+ else:
+ if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
+ return "neon"
+ elif torch._C._cpu._is_amx_tile_supported():
+ return "amx"
+ else:
+ return "vec"
+
+
+# rand number generation takes too much time, cache rand tensors
+@functools.lru_cache(maxsize=128, typed=False)
+def tensor_cache(
+ elem_num: int,
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ tensor = torch.randn(elem_num, dtype=dtype)
+ return tensor
+
+
+@torch.inference_mode()
+def main(
+ seq_lens: list[tuple[int, int]],
+ num_heads: tuple[int, int],
+ head_size: int,
+ sliding_window: int = None,
+ dtype: torch.dtype = torch.bfloat16,
+ block_size: int = 128,
+ num_blocks: int = 4096,
+ use_sink: bool = False,
+ enable_kv_split: bool = False,
+ isa: str | None = None,
+ seed: int = 0,
+ iters: int = 20,
+) -> None:
+ current_platform.seed_everything(seed)
+ num_seqs = len(seq_lens)
+ query_lens = [x[0] for x in seq_lens]
+ kv_lens = [x[1] for x in seq_lens]
+ num_query_heads = num_heads[0]
+ num_kv_heads = num_heads[1]
+ assert num_query_heads % num_kv_heads == 0
+ max_kv_len = max(kv_lens)
+ window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+ scale = head_size**-0.5
+ token_num = sum(query_lens)
+
+ if isa is None:
+ isa = get_attn_isa(block_size, dtype)
+
+ s_aux = (
+ 15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
+ )
+
+ query = tensor_cache(
+ elem_num=token_num * num_query_heads * head_size,
+ dtype=dtype,
+ )
+ query = query.view(
+ token_num,
+ num_query_heads,
+ head_size,
+ )
+
+ key_value = tensor_cache(
+ elem_num=2 * num_blocks * num_kv_heads * block_size * head_size,
+ dtype=dtype,
+ )
+ key_value = key_value.view(
+ 2,
+ num_blocks,
+ block_size,
+ num_kv_heads,
+ head_size,
+ )
+ key_cache, value_cache = key_value.unbind(0)
+
+ # KV cache for CPU attention
+ packed_key_cache = torch.empty(
+ num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+ )
+ packed_value_cache = torch.empty_like(packed_key_cache)
+
+ cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+ dim=0, dtype=torch.int32
+ )
+ kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
+ max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+ block_tables = torch.randint(
+ 0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+ )
+
+ # use reshape_and_cache to pack key_cache and value_cache
+ slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
+ cpu_attn_reshape_and_cache(
+ key=key_cache.view(-1, num_kv_heads, head_size),
+ value=value_cache.view(-1, num_kv_heads, head_size),
+ key_cache=packed_key_cache,
+ value_cache=packed_value_cache,
+ slot_mapping=slot_mapping,
+ isa=isa,
+ )
+
+ metadata = cpu_attn_get_scheduler_metadata(
+ num_reqs=num_seqs,
+ num_heads=num_query_heads,
+ num_kv_heads=num_kv_heads,
+ head_dim=head_size,
+ seq_lens=kv_lens_tensor,
+ dtype=dtype,
+ query_start_loc=cu_query_lens,
+ causal=True,
+ sliding_window_size=sliding_window if sliding_window is not None else -1,
+ isa=isa,
+ enable_kv_split=enable_kv_split,
+ )
+
+ out_with_split = torch.empty_like(query)
+
+ def run_benchmark(iters: int) -> list[float]:
+ times = []
+ for _ in range(iters):
+ start_time = time.perf_counter_ns()
+ cpu_attention_with_kv_cache(
+ query=query,
+ key_cache=packed_key_cache,
+ value_cache=packed_value_cache,
+ output=out_with_split,
+ query_start_loc=cu_query_lens,
+ seq_lens=kv_lens_tensor,
+ scale=scale,
+ causal=True,
+ alibi_slopes=None,
+ sliding_window=window_size,
+ block_table=block_tables,
+ softcap=0,
+ scheduler_metadata=metadata,
+ s_aux=s_aux,
+ )
+ end_time = time.perf_counter_ns()
+ times.append((end_time - start_time) / 1e6)
+ return times
+
+ # warmup
+ run_benchmark(5)
+ # benchmark
+ times = run_benchmark(iters)
+
+ time_min = min(times)
+ time_max = max(times)
+ time_mean = np.mean(times)
+ time_std = np.std(times)
+
+ print("\tmin (ms) = ", time_min)
+ print("\tmax (ms) = ", time_max)
+ print("\tmean (ms) = ", time_mean)
+ print("\tstd = ", time_std)
+ print("\tmedian (ms) = ", np.median(times))
+
+
+def generate_seq_lens(
+ batch_size: int,
+ q_len_min: int,
+ q_len_max: int,
+ kv_len_min: int,
+ kv_len_max: int,
+ seed: int = 0,
+) -> list[tuple[int, int]]:
+ assert 1 <= q_len_min <= q_len_max
+ assert 1 <= kv_len_min <= kv_len_max
+ assert kv_len_max >= q_len_min
+
+ g = torch.Generator(device="cpu").manual_seed(seed)
+
+ def rint(lo: int, hi: int) -> int:
+ return torch.randint(lo, hi + 1, (1,), generator=g).item()
+
+ seq_lens: list[tuple[int, int]] = []
+ for _ in range(batch_size):
+ # ensure q <= kv
+ kv = rint(max(kv_len_min, q_len_min), kv_len_max)
+ q = rint(q_len_min, min(q_len_max, kv))
+ seq_lens.append((q, kv))
+
+ return seq_lens
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.")
+ parser.add_argument("--batch-size", type=int, default=64)
+ parser.add_argument("--q-len-min", type=int, default=512)
+ parser.add_argument("--q-len-max", type=int, default=512)
+ parser.add_argument("--kv-len-min", type=int, default=512)
+ parser.add_argument("--kv-len-max", type=int, default=512)
+ parser.add_argument("--num-blocks", type=int, default=4096)
+
+ parser.add_argument("--sliding-window", type=int, default=None)
+ parser.add_argument("--num-query-heads", type=int, default=32)
+ parser.add_argument("--num-kv-heads", type=int, default=8)
+ parser.add_argument(
+ "--head-size",
+ type=int,
+ choices=CPUAttentionBackend.get_supported_head_sizes(),
+ default=128,
+ )
+ parser.add_argument("--enable-kv-split", action="store_true")
+ parser.add_argument("--block-size", type=int, choices=[32, 64, 128], default=128)
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ parser.add_argument("--use-sink", action="store_true")
+ parser.add_argument(
+ "--isa", type=str, choices=["vec", "neon", "amx", "vec16"], default=None
+ )
+ parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument("--iters", type=int, default=20)
+
+ args = parser.parse_args()
+ print(args)
+
+ seq_lens = generate_seq_lens(
+ args.batch_size,
+ args.q_len_min,
+ args.q_len_max,
+ args.kv_len_min,
+ args.kv_len_max,
+ args.seed,
+ )
+
+ print("batch (query len, kv len) = ", seq_lens)
+
+ main(
+ seq_lens=seq_lens,
+ num_heads=(args.num_query_heads, args.num_kv_heads),
+ head_size=args.head_size,
+ sliding_window=args.sliding_window,
+ dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+ block_size=args.block_size,
+ num_blocks=args.num_blocks,
+ use_sink=args.use_sink,
+ enable_kv_split=args.enable_kv_split,
+ isa=args.isa
+ if args.isa is not None
+ else get_attn_isa(args.block_size, STR_DTYPE_TO_TORCH_DTYPE[args.dtype]),
+ seed=args.seed,
+ iters=args.iters,
+ )
diff --git a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
new file mode 100644
index 0000000000..186b79ede0
--- /dev/null
+++ b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import sys
+import time
+
+import numpy as np
+import torch
+
+from vllm.platforms import current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# Check if CPU MoE operations are available
+try:
+ from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
+except (ImportError, AttributeError) as e:
+ print("ERROR: CPU fused MoE operations are not available on this platform.")
+ print("This benchmark requires x86 CPU with proper vLLM CPU extensions compiled.")
+ print(
+ "The cpu_fused_moe kernel is typically available on Linux x86_64 "
+ "with AVX2/AVX512."
+ )
+ print(f"Import error: {e}")
+ sys.exit(1)
+
+# ISA selection following test_cpu_fused_moe.py pattern
+ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+
+
+@torch.inference_mode()
+def main(
+ batch_size: int,
+ expert_num: int,
+ hidden_size: int,
+ intermediate_size: int,
+ topk_num: int,
+ use_bias: bool = False,
+ dtype: torch.dtype = torch.bfloat16,
+ activation: str = "silu",
+ isa: str = "vec",
+ seed: int = 0,
+ iters: int = 20,
+) -> None:
+ current_platform.seed_everything(seed)
+ # up_dim = 2 * intermediate_size for gate + up projection
+ up_dim = 2 * intermediate_size
+
+ input_tensor = torch.randn((batch_size, hidden_size), dtype=dtype) / (
+ 0.5 * hidden_size**0.5
+ )
+
+ w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
+ 0.5 * hidden_size**0.5
+ )
+ w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
+ 0.5 * intermediate_size**0.5
+ )
+
+ w13_bias = None
+ w2_bias = None
+ if use_bias:
+ w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
+ w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
+ 0.5 * hidden_size**0.5
+ )
+
+ router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
+ score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
+ topk_weights, topk_ids = torch.topk(score, topk_num)
+ topk_ids = topk_ids.to(torch.int32)
+
+ packed_w13 = cpu_prepack_moe_weight(w13, isa)
+ packed_w2 = cpu_prepack_moe_weight(w2, isa)
+
+ def run_benchmark(iters: int) -> list[float]:
+ times = []
+ for _ in range(iters):
+ start_time = time.perf_counter_ns()
+ _ = cpu_fused_moe(
+ input_tensor,
+ packed_w13,
+ packed_w2,
+ w13_bias,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ activation,
+ isa,
+ )
+ end_time = time.perf_counter_ns()
+ times.append((end_time - start_time) / 1e6)
+ return times
+
+ # warmup
+ run_benchmark(5)
+ # benchmark
+ times = run_benchmark(iters)
+
+ if not times:
+ print("No iterations to measure. Set --iters > 0.")
+ return
+
+ time_min = min(times)
+ time_max = max(times)
+ time_mean = np.mean(times)
+ time_std = np.std(times)
+
+ print("\tmin (ms) = ", time_min)
+ print("\tmax (ms) = ", time_max)
+ print("\tmean (ms) = ", time_mean)
+ print("\tstd = ", time_std)
+ print("\tmedian (ms) = ", np.median(times))
+
+ # Calculate throughput metrics
+ # FLOPs estimation: 2 * batch * topk * (hidden * up_dim + intermediate * hidden)
+ flops_per_token = (
+ 2 * topk_num * (hidden_size * up_dim + intermediate_size * hidden_size)
+ )
+ total_flops = batch_size * flops_per_token
+ tflops = total_flops / (time_mean * 1e-3) / 1e12
+ print(f"\tthroughput (TFLOP/s) = {tflops:.4f}")
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the CPU fused MoE kernel.")
+ parser.add_argument("--batch-size", type=int, default=64)
+ parser.add_argument("--expert-num", type=int, default=8)
+ parser.add_argument("--hidden-size", type=int, default=2880)
+ parser.add_argument("--intermediate-size", type=int, default=2880)
+ parser.add_argument(
+ "--topk-num",
+ type=int,
+ default=None,
+ help="Number of experts to route each token to (default: expert_num // 2)",
+ )
+ parser.add_argument("--use-bias", action="store_true")
+ parser.add_argument(
+ "--activation",
+ type=str,
+ choices=["silu", "swigluoai"],
+ default="silu",
+ help="Activation function",
+ )
+ parser.add_argument(
+ "--isa",
+ type=str,
+ choices=ISA_CHOICES,
+ default=ISA_CHOICES[0],
+ help=f"ISA to use (available: {ISA_CHOICES})",
+ )
+ parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument("--iters", type=int, default=20)
+
+ args = parser.parse_args()
+
+ # Default topk_num to expert_num // 2, minimum 1
+ topk_num = (
+ args.topk_num if args.topk_num is not None else max(args.expert_num // 2, 1)
+ )
+
+ print(args)
+
+ main(
+ batch_size=args.batch_size,
+ expert_num=args.expert_num,
+ hidden_size=args.hidden_size,
+ intermediate_size=args.intermediate_size,
+ topk_num=topk_num,
+ use_bias=args.use_bias,
+ dtype=torch.bfloat16, # Following test_cpu_fused_moe.py
+ activation=args.activation,
+ isa=args.isa,
+ seed=args.seed,
+ iters=args.iters,
+ )
diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md
index 41e68e047b..a28c6956be 100644
--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
-Currently this just includes dense GEMMs and only works on Hopper GPUs.
+Currently, this just includes dense GEMMs and only works on Hopper GPUs.
## Setup
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index fbbb03c5ed..0af87fd7f0 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
- set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
- set(CMAKE_BUILD_TYPE "Release")
- set(ARM_COMPUTE_ARCH "armv8.2-a")
- set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
- set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
- set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
- set(ARM_COMPUTE_ENABLE_OPENMP "ON")
- set(ARM_COMPUTE_ENABLE_WERROR "OFF")
- set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
- set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
@@ -341,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
PUBLIC ${oneDNN_BINARY_DIR}/include
PRIVATE ${oneDNN_SOURCE_DIR}/src
)
- target_link_libraries(dnnl_ext dnnl)
+ target_link_libraries(dnnl_ext dnnl torch)
target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
list(APPEND LIBS dnnl_ext)
set(USE_ONEDNN ON)
@@ -369,13 +358,13 @@ set(VLLM_EXT_SRC
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
"csrc/cpu/cpu_attn.cpp"
- "csrc/cpu/scratchpad_manager.cpp"
"csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
+ "csrc/cpu/cpu_fused_moe.cpp"
${VLLM_EXT_SRC})
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
set(VLLM_EXT_SRC
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 2cf3c1a755..0d4f9b7aa0 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# sm90a
set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
- list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+ list(APPEND SUPPORT_ARCHS "9.0a")
endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
- list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+ # CUDA 12.9 has introduced "Family-Specific Architecture Features"
+ # this supports all compute_10x family
+ list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ list(APPEND SUPPORT_ARCHS "10.0a")
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
+ message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
$<$:-UPy_LIMITED_API>
$<$:-UPy_LIMITED_API>)
else()
- # Create empty targets for setup.py when not targeting sm90a systems
+ message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+ # Create empty targets for setup.py on unsupported systems
add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif()
diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake
index 5a59a40999..84bb1b00c1 100644
--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@@ -31,10 +31,15 @@ if(NOT qutlass_SOURCE_DIR)
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
-cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}")
+else()
+ cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}")
+endif()
+
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS)
- if(QUTLASS_ARCHS MATCHES "10\\.0a")
+ if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 6cc5cda14c..b51934a3ab 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 71bb26f6295449be880344b93b51791cc009237d
+ GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index ca0062ba4f..bdb2ba74d9 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
run_python(_VLLM_TORCH_GOMP_PATH
"
import os, glob
-try:
- import torch
- torch_pkg = os.path.dirname(torch.__file__)
- site_root = os.path.dirname(torch_pkg)
- torch_libs = os.path.join(site_root, 'torch.libs')
- print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
-except:
- print('')
+import torch
+torch_pkg = os.path.dirname(torch.__file__)
+site_root = os.path.dirname(torch_pkg)
+
+# Search both torch.libs and torch/lib
+roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
+candidates = []
+for root in roots:
+ if not os.path.isdir(root):
+ continue
+ candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
+
+print(candidates[0] if candidates else '')
"
- "failed to probe torch.libs for libgomp")
+ "failed to probe for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return()
@@ -495,7 +500,13 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "")
endif()
- if (ARG_USE_SABI)
+ run_python(IS_FREETHREADED_PYTHON
+ "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+ "Failed to determine whether interpreter is free-threaded")
+
+ # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+ # so avoid using the stable ABI under free-threading only.
+ if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index a4a880f13c..8268065ef0 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
const scalar_t& y) {
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
}
-// Activation and gating kernel template.
+// Check if all pointers are 16-byte aligned for int4 vectorized access
+__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
+ return (reinterpret_cast(ptr) & 15) == 0;
+}
+
+// Activation and gating kernel template.
template
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
+ constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
- for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
- const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
- const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
- out[token_idx * d + idx] = compute(x, y);
+ const scalar_t* x_ptr = input + token_idx * 2 * d;
+ const scalar_t* y_ptr = x_ptr + d;
+ scalar_t* out_ptr = out + token_idx * d;
+
+ // Check alignment for 128-bit vectorized access.
+ // All three pointers must be 16-byte aligned for safe int4 operations.
+ const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+ is_16byte_aligned(out_ptr);
+
+ if (aligned && d >= VEC_SIZE) {
+ // Fast path: 128-bit vectorized loop
+ const int4* x_vec = reinterpret_cast(x_ptr);
+ const int4* y_vec = reinterpret_cast(y_ptr);
+ int4* out_vec = reinterpret_cast(out_ptr);
+ const int num_vecs = d / VEC_SIZE;
+ const int vec_end = num_vecs * VEC_SIZE;
+
+ for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+ int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+ auto* xp = reinterpret_cast(&x);
+ auto* yp = reinterpret_cast(&y);
+ auto* rp = reinterpret_cast(&r);
+#pragma unroll
+ for (int j = 0; j < VEC_SIZE; j++) {
+ rp[j] = compute(xp[j], yp[j]);
+ }
+ out_vec[i] = r;
+ }
+ // Scalar cleanup for remaining elements
+ for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+ out_ptr[i] = compute(VLLM_LDG(&x_ptr[i]),
+ VLLM_LDG(&y_ptr[i]));
+ }
+ } else {
+ // Scalar fallback for unaligned data or small d
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+ const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+ out_ptr[idx] = compute(x, y);
+ }
}
}
@@ -120,50 +162,115 @@ template
__global__ void act_and_mul_kernel_with_param(
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
const float param) {
+ constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
- for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
- const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
- const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
- out[token_idx * d + idx] = ACT_FN(x, param) * y;
+ const scalar_t* x_ptr = input + token_idx * 2 * d;
+ const scalar_t* y_ptr = x_ptr + d;
+ scalar_t* out_ptr = out + token_idx * d;
+
+ // Check alignment for 128-bit vectorized access
+ const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+ is_16byte_aligned(out_ptr);
+
+ if (aligned && d >= VEC_SIZE) {
+ // Fast path: 128-bit vectorized loop
+ const int4* x_vec = reinterpret_cast(x_ptr);
+ const int4* y_vec = reinterpret_cast(y_ptr);
+ int4* out_vec = reinterpret_cast(out_ptr);
+ const int num_vecs = d / VEC_SIZE;
+ const int vec_end = num_vecs * VEC_SIZE;
+
+ for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+ int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+ auto* xp = reinterpret_cast(&x);
+ auto* yp = reinterpret_cast(&y);
+ auto* rp = reinterpret_cast(&r);
+#pragma unroll
+ for (int j = 0; j < VEC_SIZE; j++) {
+ rp[j] = ACT_FN(xp[j], param) * yp[j];
+ }
+ out_vec[i] = r;
+ }
+ // Scalar cleanup for remaining elements
+ for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+ out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
+ }
+ } else {
+ // Scalar fallback for unaligned data or small d
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+ const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+ out_ptr[idx] = ACT_FN(x, param) * y;
+ }
}
}
template
__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
float alpha, float limit) {
- // clamp gate: min=None, max=limit
- const float gate_f = (float)gate;
- const float clamped_gate = gate_f > limit ? limit : gate_f;
-
- // clamp up: min=-limit, max=limit
- const float up_f = (float)up;
- const float clamped_up =
- up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
-
- // glu = gate * sigmoid(gate * alpha)
- const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
- const float glu = clamped_gate * sigmoid_val;
-
- // (up + 1) * glu
- return (T)((clamped_up + 1.0f) * glu);
+ // Clamp gate to (-inf, limit] and up to [-limit, limit]
+ const float g = fminf((float)gate, limit);
+ const float u = fmaxf(fminf((float)up, limit), -limit);
+ // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
+ return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
}
+// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
template
__global__ void swigluoai_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., 2, d]
+ const scalar_t* __restrict__ input, // [..., 2 * d] (interleaved)
const int d, const float alpha, const float limit) {
+ // For interleaved data: input has 2*d elements per token (gate/up pairs)
+ // output has d elements per token
+ constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
+ constexpr int PAIRS = VEC_SIZE / 2; // Number of gate/up pairs per int4 load
const int64_t token_idx = blockIdx.x;
- // TODO: Vectorize loads and stores.
- for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
- // gate = x[..., ::2] (even indices)
- const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
- // up = x[..., 1::2] (odd indices)
- const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
-
- out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
+ const scalar_t* in_ptr = input + token_idx * 2 * d;
+ scalar_t* out_ptr = out + token_idx * d;
+
+ // Check alignment for 128-bit vectorized access on input.
+ // For output we use int2 (64-bit) which has 8-byte alignment requirement.
+ const bool in_aligned = is_16byte_aligned(in_ptr);
+ const bool out_aligned =
+ (reinterpret_cast(out_ptr) & 7) == 0; // 8-byte for int2
+
+ if (in_aligned && out_aligned && d >= PAIRS) {
+ // Fast path: vectorized loop
+ // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
+ // Each int2 store writes PAIRS output elements
+ const int4* in_vec = reinterpret_cast(in_ptr);
+ int2* out_vec = reinterpret_cast(out_ptr);
+ const int num_vecs = d / PAIRS;
+ const int vec_end = num_vecs * PAIRS;
+
+ for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+ int4 v = VLLM_LDG(&in_vec[i]);
+ int2 r;
+ auto* vp = reinterpret_cast(&v);
+ auto* rp = reinterpret_cast(&r);
+#pragma unroll
+ for (int j = 0; j < PAIRS; j++) {
+ rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
+ }
+ out_vec[i] = r;
+ }
+ // Scalar cleanup for remaining elements
+ for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+ out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
+ VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
+ }
+ } else {
+ // Scalar fallback for unaligned data or small d
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ // gate = x[..., ::2] (even indices)
+ const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
+ // up = x[..., 1::2] (odd indices)
+ const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
+ out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
+ }
}
}
@@ -217,10 +324,41 @@ __global__ void activation_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., d]
const int d) {
+ constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
- for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
- const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
- out[token_idx * d + idx] = ACT_FN(x);
+ const scalar_t* in_ptr = input + token_idx * d;
+ scalar_t* out_ptr = out + token_idx * d;
+
+ // Check alignment for 128-bit vectorized access
+ const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
+
+ if (aligned && d >= VEC_SIZE) {
+ // Fast path: 128-bit vectorized loop
+ const int4* in_vec = reinterpret_cast(in_ptr);
+ int4* out_vec = reinterpret_cast(out_ptr);
+ const int num_vecs = d / VEC_SIZE;
+ const int vec_end = num_vecs * VEC_SIZE;
+
+ for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+ int4 v = VLLM_LDG(&in_vec[i]), r;
+ auto* vp = reinterpret_cast(&v);
+ auto* rp = reinterpret_cast(&r);
+#pragma unroll
+ for (int j = 0; j < VEC_SIZE; j++) {
+ rp[j] = ACT_FN(vp[j]);
+ }
+ out_vec[i] = r;
+ }
+ // Scalar cleanup for remaining elements
+ for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+ out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
+ }
+ } else {
+ // Scalar fallback for unaligned data or small d
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ const scalar_t x = VLLM_LDG(&in_ptr[idx]);
+ out_ptr[idx] = ACT_FN(x);
+ }
}
}
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 229d9862fb..27d1e990c6 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
- const uint head_size) {
+ const uint head_size, const uint prefix_head_stride,
+ const uint output_head_stride) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
- const uint head_offset =
- token_idx * num_heads * head_size + head_idx * head_size;
- const scalar_t* prefix_head_ptr = prefix_output + head_offset;
- const scalar_t* suffix_head_ptr = suffix_output + head_offset;
- scalar_t* output_head_ptr = output + head_offset;
+ const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+ head_idx * prefix_head_stride;
+ const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+ head_idx * output_head_stride;
+ const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+ const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+ scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast(prefix_lse.data_ptr()), \
reinterpret_cast(suffix_output.data_ptr()), \
reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \
- num_heads, head_size); \
+ num_heads, head_size, prefix_head_stride, output_head_stride); \
}
/*@brief Merges the attention states from prefix and suffix
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
+ const uint prefix_head_stride = prefix_output.stride(1);
+ const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
- TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
- "output heads must be contiguous in memory");
- TORCH_CHECK(
- prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
- "prefix_output heads must be contiguous in memory");
- TORCH_CHECK(
- suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
- "suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr();
diff --git a/csrc/cache.h b/csrc/cache.h
index b162a4a2bc..d14f46c349 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,6 +1,7 @@
#pragma once
#include
+#include
#include