diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
index b199e554a739..21ffa1b9b8d7 100644
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -8,8 +8,9 @@ run_all_patterns:
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/cuda.txt"
- - "requirements/build.txt"
- - "requirements/test.txt"
+ - "requirements/kv_connectors.txt"
+ - "requirements/build/cuda.txt"
+ - "requirements/test/cuda.txt"
- "setup.py"
- "csrc/"
- "cmake/"
diff --git a/.buildkite/ci_config_intel.yaml b/.buildkite/ci_config_intel.yaml
index 375be84a396a..a1c0091e0f10 100644
--- a/.buildkite/ci_config_intel.yaml
+++ b/.buildkite/ci_config_intel.yaml
@@ -6,8 +6,8 @@ run_all_patterns:
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/xpu.txt"
- - "requirements/build.txt"
- - "requirements/test.txt"
+ - "requirements/build/cuda.txt"
+ - "requirements/test/cuda.txt"
- "setup.py"
- "csrc/"
- "cmake/"
diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
index 23a23723ad93..0c514647dc2b 100644
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -20,11 +20,3 @@ steps:
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
- retry:
- automatic:
- - exit_status: -1 # Agent was lost
- limit: 1
- - exit_status: -10 # Agent was lost
- limit: 1
- - exit_status: 1 # Machine occasionally fail
- limit: 1
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
index acca2b368858..19716bab6de5 100644
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -12,13 +12,19 @@ steps:
- vllm/_custom_ops.py
- tests/kernels/attention/test_cpu_attn.py
- tests/kernels/moe/test_cpu_fused_moe.py
+ - tests/kernels/moe/test_cpu_quant_fused_moe.py
- tests/kernels/test_onednn.py
+ - tests/kernels/test_awq_int4_to_int8.py
+ - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
commands:
- |
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
- pytest -x -v -s tests/kernels/test_onednn.py"
+ pytest -x -v -s tests/kernels/moe/test_cpu_quant_fused_moe.py
+ pytest -x -v -s tests/kernels/test_onednn.py
+ pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py
+ pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py"
- label: CPU-Compatibility Tests
depends_on: []
@@ -44,10 +50,24 @@ steps:
- tests/models/language/pooling/
commands:
- |
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 40m "
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
+- label: CPU-ModelRunnerV2 Tests
+ depends_on: []
+ device: intel_cpu
+ no_plugin: true
+ soft_fail: true
+ source_file_dependencies:
+ - vllm/v1/worker/cpu/
+ - vllm/v1/worker/gpu/
+ commands:
+ - |
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+ uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d
+ VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model"
+
- label: CPU-Quantization Model Tests
depends_on: []
device: intel_cpu
@@ -55,23 +75,24 @@ steps:
source_file_dependencies:
- csrc/cpu/
- vllm/model_executor/layers/quantization/cpu_wna16.py
- - vllm/model_executor/layers/quantization/gptq_marlin.py
+ - vllm/model_executor/layers/quantization/auto_gptq.py
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+ - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py
- tests/quantization/test_compressed_tensors.py
- tests/quantization/test_cpu_wna16.py
commands:
- |
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
-- label: CPU-Distributed Tests
+- label: CPU-Distributed Tests (PP+TP)
depends_on: []
device: intel_cpu
no_plugin: true
- source_file_dependencies:
+ source_file_dependencies: &cpu_distributed_deps
- csrc/cpu/shm.cpp
- vllm/v1/worker/cpu_worker.py
- vllm/v1/worker/gpu_worker.py
@@ -80,10 +101,21 @@ steps:
- vllm/platforms/cpu.py
- vllm/distributed/parallel_state.py
- vllm/distributed/device_communicators/cpu_communicator.py
+ - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+ commands:
+ - |
+ bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+ bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp"
+
+- label: CPU-Distributed Tests (DP+TP)
+ depends_on: []
+ device: intel_cpu
+ no_plugin: true
+ source_file_dependencies: *cpu_distributed_deps
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
- bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+ bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp"
- label: CPU-Multi-Modal Model Tests %N
depends_on: []
@@ -97,7 +129,7 @@ steps:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
- parallelism: 2
+ parallelism: 3
- label: "Arm CPU Test"
depends_on: []
diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml
index ba0088b3af62..d70ce28428d4 100644
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -8,10 +8,3 @@ steps:
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- - label: "Intel GPU Test"
- depends_on: []
- soft_fail: true
- device: intel_gpu
- no_plugin: true
- commands:
- - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
index 9131dfc71a0a..10c03c3e1773 100755
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -92,8 +92,8 @@ check_and_skip_if_image_exists() {
}
ecr_login() {
- aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
- aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+ aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+ aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
}
prepare_cache_tags() {
@@ -192,6 +192,7 @@ export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
+export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}"
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
index 42eaed7ddaa0..e0ef7d592424 100644
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -6,6 +6,48 @@ steps:
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
+ # Non-root smoke 1: the default (root) image must still be importable
+ # under a non-root UID via `--user 2000:0`. Validates the `vllm` passwd
+ # entry + group-0-writable /home/vllm + uv path cleanup from #31959.
+ # Uses `import vllm` rather than `vllm serve --help` because the latter
+ # instantiates `VllmConfig` which requires a GPU attached to the
+ # container.
+ - docker run --rm --user 2000:0 --entrypoint python3 "$IMAGE_TAG" -c "import vllm; print(vllm.__version__)"
+ # Non-root smoke 2: assert the non-root enabling invariants are baked
+ # into the image. Runs as UID 2000:0 via a shell so we can verify
+ # filesystem perms + passwd/group file state + wrapper presence without
+ # triggering vLLM's GPU-requiring config-init path. The opt-in
+ # `vllm-openai-nonroot` target adds only `USER vllm`, `WORKDIR
+ # /home/vllm`, and an `ENTRYPOINT` override on top of these invariants;
+ # its build correctness is reviewed at the Dockerfile level. Wrapper
+ # logic is covered separately by the pre-commit hook
+ # `test-nonroot-entrypoint` (see .pre-commit-config.yaml).
+ - |
+ docker run --rm --user 2000:0 --entrypoint /bin/sh "$IMAGE_TAG" -ec '
+ if ! getent passwd 2000 | grep -q ^vllm:; then
+ echo FAIL: UID 2000 != vllm
+ exit 1
+ fi
+ if ! id -gn 2>/dev/null | grep -qx root; then
+ echo FAIL: GID 0 not root group
+ exit 1
+ fi
+ touch /home/vllm/.smoke && rm /home/vllm/.smoke
+ touch /opt/uv/cache/.smoke && rm /opt/uv/cache/.smoke
+ if ! test -x /usr/local/bin/vllm-nonroot-entrypoint.sh; then
+ echo FAIL: wrapper missing
+ exit 1
+ fi
+ if ! test -w /etc/passwd; then
+ echo FAIL: /etc/passwd not group-writable
+ exit 1
+ fi
+ if ! test -w /etc/group; then
+ echo FAIL: /etc/group not group-writable
+ exit 1
+ fi
+ echo non-root invariants OK
+ '
retry:
automatic:
- exit_status: -1 # Agent was lost
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
index ccfe155fa2b7..035f070ab891 100755
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
index ff3d11c8d599..b561e2c2e463 100755
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
index 60fa1789fa06..df900dc60342 100755
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
diff --git a/.buildkite/image_build/image_build_torch_nightly.sh b/.buildkite/image_build/image_build_torch_nightly.sh
new file mode 100755
index 000000000000..cbd08aa7bd0b
--- /dev/null
+++ b/.buildkite/image_build/image_build_torch_nightly.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+set -euo pipefail
+
+# Build a vLLM test image with PyTorch nightly installed.
+# Called by the pipeline generator's "vLLM Against PyTorch Nightly" group.
+
+if [[ $# -lt 5 ]]; then
+  echo "Usage: $0 <registry> <repo> <buildkite_commit> <branch> <image_tag>" >&2
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+IMAGE_TAG=$5
+
+# --- Arguments ---
+echo "--- :mag: Arguments"
+echo "REGISTRY: ${REGISTRY}"
+echo "REPO: ${REPO}"
+echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
+echo "BRANCH: ${BRANCH}"
+echo "IMAGE_TAG: ${IMAGE_TAG}"
+
+# --- ECR login (best-effort, matching the other image_build_*.sh scripts) ---
+echo "--- :key: ECR login"
+aws ecr-public get-login-password --region us-east-1 \
+  | docker login --username AWS --password-stdin "$REGISTRY" || true
+aws ecr get-login-password --region us-east-1 \
+  | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
+
+# --- Set up buildx ---
+echo "--- :docker: Setting up buildx"
+docker buildx create --name vllm-builder --driver docker-container --use || true
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# --- Skip if image already exists ---
+echo "--- :mag: Checking if image already exists"
+if docker manifest inspect "$IMAGE_TAG" >/dev/null 2>&1; then
+ echo "Image found: $IMAGE_TAG — skipping build"
+ exit 0
+fi
+echo "Image not found, proceeding with build..."
+
+# --- CUDA 13.0 for nightly builds ---
+# Nightly CI uses CUDA 13.0 while regular CI stays on CUDA 12.9
+NIGHTLY_CUDA_VERSION="13.0.2"
+NIGHTLY_BUILD_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-devel-ubuntu22.04"
+NIGHTLY_FINAL_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-base-ubuntu22.04"
+
+echo "--- :docker: Building torch nightly image (CUDA ${NIGHTLY_CUDA_VERSION})"
+docker buildx build --file docker/Dockerfile \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg PYTORCH_NIGHTLY=1 \
+ --build-arg CUDA_VERSION="${NIGHTLY_CUDA_VERSION}" \
+ --build-arg BUILD_BASE_IMAGE="${NIGHTLY_BUILD_BASE_IMAGE}" \
+ --build-arg FINAL_BASE_IMAGE="${NIGHTLY_FINAL_BASE_IMAGE}" \
+ --build-arg torch_cuda_arch_list="8.0 8.9 9.0 10.0 12.0" \
+ --tag "$IMAGE_TAG" \
+ --push \
+ --target test \
+ --progress plain .
+
+echo "--- :white_check_mark: Torch nightly image build complete: $IMAGE_TAG"
diff --git a/.buildkite/image_build/image_build_xpu.sh b/.buildkite/image_build/image_build_xpu.sh
index c3734dce13ca..45417b7339be 100755
--- a/.buildkite/image_build/image_build_xpu.sh
+++ b/.buildkite/image_build/image_build_xpu.sh
@@ -11,8 +11,8 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
# skip build if image already exists
if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
diff --git a/.buildkite/intel_jobs/engine_intel.yaml b/.buildkite/intel_jobs/engine_intel.yaml
new file mode 100644
index 000000000000..c66576d40991
--- /dev/null
+++ b/.buildkite/intel_jobs/engine_intel.yaml
@@ -0,0 +1,21 @@
+group: Engine Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: Engine (1 GPU)
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/v1/engine/
+ - tests/v1/engine/
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py'
diff --git a/.buildkite/intel_jobs/kernels_intel.yaml b/.buildkite/intel_jobs/kernels_intel.yaml
new file mode 100644
index 000000000000..66a8db25f02e
--- /dev/null
+++ b/.buildkite/intel_jobs/kernels_intel.yaml
@@ -0,0 +1,21 @@
+group: Kernels Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: vLLM IR Tests
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/ir
+ - vllm/kernels
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ pytest -v -s kernels/ir'
diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml
new file mode 100644
index 000000000000..32a56ef59b3f
--- /dev/null
+++ b/.buildkite/intel_jobs/lora_intel.yaml
@@ -0,0 +1,135 @@
+group: LoRA Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: LoRA Runtime + Utils
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_layers.py &&
+ pytest -v -s lora/test_lora_checkpoints.py &&
+ pytest -v -s lora/test_lora_functions.py &&
+ pytest -v -s lora/test_lora_huggingface.py &&
+ pytest -v -s lora/test_lora_manager.py &&
+ pytest -v -s lora/test_lora_utils.py &&
+ pytest -v -s lora/test_peft_helper.py &&
+ pytest -v -s lora/test_resolver.py &&
+ pytest -v -s lora/test_utils.py &&
+ pytest -v -s lora/test_add_lora.py &&
+ pytest -v -s lora/test_worker.py'
+
+- label: LoRA Fused/MoE Kernels
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_fused_moe_lora_kernel.py &&
+ pytest -v -s lora/test_moe_lora_align_sum.py --deselect="tests/lora/test_moe_lora_align_sum.py::test_moe_lora_align_block_size_mixed_base_and_lora[1]"'
+
+- label: LoRA Punica Kernels
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ set -o pipefail &&
+ pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"'
+
+- label: LoRA Punica FP8/XPU Ops
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_punica_ops_fp8.py &&
+ pytest -v -s lora/test_punica_xpu_ops.py'
+
+- label: LoRA Models
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) &&
+ pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" &&
+ pytest -v -s lora/test_transformers_model.py &&
+ pytest -v -s lora/test_chatglm3_tp.py &&
+ pytest -s -v lora/test_minicpmv_tp.py'
+
+- label: LoRA Multimodal
+ timeout_in_minutes: 45
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'cd tests &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ pytest -v -s lora/test_default_mm_loras.py &&
+ pytest -v -s lora/test_whisper.py'
diff --git a/.buildkite/intel_jobs/misc_intel.yaml b/.buildkite/intel_jobs/misc_intel.yaml
new file mode 100644
index 000000000000..864128bb5338
--- /dev/null
+++ b/.buildkite/intel_jobs/misc_intel.yaml
@@ -0,0 +1,55 @@
+group: Miscellaneous Intel
+depends_on:
+ - image-build-xpu
+steps:
+- label: V1 Core + KV + Metrics
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/core
+ - tests/v1/executor
+ - tests/v1/kv_offload
+ - tests/v1/worker
+ - tests/v1/kv_connector/unit
+ - tests/v1/metrics
+ - tests/entrypoints/openai/correctness/test_lmeval.py
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'pip install -r requirements/kv_connectors.txt &&
+ export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ cd tests &&
+ pytest -v -s v1/executor'
+
+- label: V1 Sample + Logits
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ working_dir: "."
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ VLLM_TEST_DEVICE: "xpu"
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/sample
+ - tests/v1/logits_processors
+ - tests/v1/test_oracle.py
+ - tests/v1/test_request.py
+ - tests/v1/test_outputs.py
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
+ cd tests &&
+ pytest -v -s v1/logits_processors --ignore=v1/logits_processors/test_custom_online.py --ignore=v1/logits_processors/test_custom_offline.py &&
+ pytest -v -s v1/test_oracle.py &&
+ pytest -v -s v1/test_request.py &&
+ pytest -v -s v1/test_outputs.py'
diff --git a/.buildkite/intel_jobs/test-intel.yaml b/.buildkite/intel_jobs/test-intel.yaml
index 3aa75f4754f9..805b7e54f120 100644
--- a/.buildkite/intel_jobs/test-intel.yaml
+++ b/.buildkite/intel_jobs/test-intel.yaml
@@ -35,9 +35,13 @@ steps:
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp &&
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN &&
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 &&
+ python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --kv-cache-dtype fp8 &&
+ python3 examples/basic/offline_inference/generate.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --block-size 64 --enforce-eager --quantization modelopt --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --max-model-len 4096 &&
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 &&
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 &&
- python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel'
+ python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel &&
+ python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --max-model-len 8192
+ '
- label: "XPU V1 test"
depends_on:
- image-build-xpu
@@ -56,9 +60,28 @@ steps:
'cd tests &&
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py &&
pytest -v -s v1/engine --ignore=v1/engine/test_output_processor.py &&
- pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py &&
+ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py -k "not test_topk_only and not test_topp_only and not test_topk_and_topp" &&
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py &&
pytest -v -s v1/structured_output &&
pytest -v -s v1/test_serial_utils.py &&
- pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py'
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py --ignore=v1/kv_connector/unit/test_offloading_connector.py'
+ - label: "XPU server test"
+ depends_on:
+ - image-build-xpu
+ timeout_in_minutes: 30
+ device: intel_gpu
+ no_plugin: true
+ env:
+ REGISTRY: "public.ecr.aws/q9t5s3a7"
+ REPO: "vllm-ci-test-repo"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/intel_jobs/test-intel.yaml
+ commands:
+ - >-
+ bash .buildkite/scripts/hardware_ci/run-intel-test.sh
+ 'pip install av &&
+ cd tests &&
+ pytest -v -s entrypoints/openai/chat_completion/test_audio_in_video.py &&
+ pytest -v -s benchmarks/test_serve_cli.py'
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 6c0b5540cbb6..9a5af8540118 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -1,6 +1,9 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+required_gpu_arch:
+ - gfx942
+ - gfx950
tasks:
- name: "mmlu_pro"
metrics:
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index aa4fb9fa03d6..ff43fa187b0e 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,6 +1,9 @@
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
+required_gpu_arch:
+ - gfx942
+ - gfx950
tasks:
- name: "gsm8k"
metrics:
diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
index 514c15d6098e..84e4f3fe3349 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -1,4 +1,7 @@
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+required_gpu_arch:
+ - gfx942
+ - gfx950
tasks:
- name: "mmlu_pro"
metrics:
diff --git a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
index a3bb95e19e24..36e0543879b3 100644
--- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
@@ -1,5 +1,6 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
index 518af9a66018..b495c0d123a6 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index f010ffe6752d..e430e6183b2d 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index fec4a94e63e4..f1a541ddbefc 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
index e3c6e16bd6b3..ba8da9fc3f55 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.12"
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index fad5f593be4f..d34e603b9e26 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,6 +13,7 @@
from contextlib import contextmanager
import lm_eval
+import pytest
import yaml
from vllm.platforms import current_platform
@@ -89,9 +90,40 @@ def launch_lm_eval(eval_config, tp_size):
return results
+def _check_rocm_gpu_arch_requirement(eval_config):
+ """Skip the test if the model requires a ROCm GPU arch not present.
+
+ Model YAML configs can specify::
+
+ required_gpu_arch:
+ - gfx942
+ - gfx950
+
+ Each entry is tested with ``in`` against the detected ``_GCN_ARCH``; a
+ missing or empty field never skips. Only enforced on ROCm: elsewhere
+ (e.g. CUDA) it is ignored so NVIDIA and AMD CI can share config files.
+ """
+ required_archs = eval_config.get("required_gpu_arch")
+ if not required_archs:
+ return
+
+ if not current_platform.is_rocm():
+ return
+
+ from vllm.platforms.rocm import _GCN_ARCH # noqa: E402
+
+ if not any(arch in _GCN_ARCH for arch in required_archs):
+ pytest.skip(
+ f"Model requires GPU arch {required_archs}, "
+ f"but detected arch is '{_GCN_ARCH}'"
+ )
+
+
def test_lm_eval_correctness_param(config_filename, tp_size):
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+ _check_rocm_gpu_arch_requirement(eval_config)
+
results = launch_lm_eval(eval_config, tp_size)
rtol = eval_config.get("rtol", DEFAULT_RTOL)
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
index 63f1f8ab887b..9f226ef2f819 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -36,6 +36,7 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
+ "temperature": 0,
"num_prompts": 200
}
},
@@ -127,4 +128,4 @@
}
}
]
-}
\ No newline at end of file
+}
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
index f0dc3d5ec067..30879b5e9dc5 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
@@ -22,6 +22,7 @@
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
+ "temperature": 0,
"num_prompts": 200
}
},
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
index 0411b04e1bd5..34c2cc82d395 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -26,34 +26,14 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
+ "temperature": 0,
"num_prompts": 200
}
},
"tests": [
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
- }
- },
- {
- "test_name": "serving_llama8B_tp2_sharegpt",
- "server_parameters": {
- "tensor_parallel_size": 2
- },
- "client_parameters": {
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
- }
- },
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
@@ -62,290 +42,244 @@
}
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 2
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
},
"client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_128",
+ "test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 4
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
},
"client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_2048",
+ "test_name": "serving_llama1B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 2
+ "model": "meta-llama/Llama-3.2-1B"
},
"client_parameters": {
+ "model": "meta-llama/Llama-3.2-1B",
"dataset_name": "random",
"random-input-len": 128,
- "random-output-len": 2048
+ "random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_2048",
+ "test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 4
+ "model": "meta-llama/Llama-3.2-3B-Instruct"
},
"client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp2_random_2048_128",
+ "test_name": "serving_llama70B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 2
+ "model": "meta-llama/Llama-3.3-70B-Instruct"
},
"client_parameters": {
+ "model": "meta-llama/Llama-3.3-70B-Instruct",
"dataset_name": "random",
- "random-input-len": 2048,
+ "random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp4_random_2048_128",
+ "test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
- "tensor_parallel_size": 4
+ "model": "ibm-granite/granite-3.2-2b-instruct"
},
"client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
- "random-input-len": 2048,
+ "random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_tp1_random_2048_2048",
- "server_parameters": {
- "tensor_parallel_size": 1
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_2048_2048",
- "server_parameters": {
- "tensor_parallel_size": 2
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_tp4_random_2048_2048",
- "server_parameters": {
- "tensor_parallel_size": 4
- },
- "client_parameters": {
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 2048
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 1
+ "model": "Qwen/Qwen3-1.7B"
},
"client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 2
+ "model": "Qwen/Qwen3-4B"
},
"client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "tensor_parallel_size": 4
+ "model": "Qwen/Qwen3-8B"
},
"client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
+ "test_name": "serving_qwen14B_tp1_random_128_128",
"server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1
+ "model": "Qwen/Qwen3-14B"
},
"client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "model": "Qwen/Qwen3-14B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "test_name": "serving_qwen30B_tp1_random_128_128",
"server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2
+ "model": "Qwen/Qwen3-30B-A3B"
},
"client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "model": "Qwen/Qwen3-30B-A3B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
+ "test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4
+ "model": "zai-org/glm-4-9b-hf"
},
"client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_llama3B_tp1_random_128_128",
+ "test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
- "model": "meta-llama/Llama-3.2-3B-Instruct",
- "tensor_parallel_size": 1
+ "model": "google/gemma-7b"
},
"client_parameters": {
- "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_granite2B_tp1_random_128_128",
+ "test_name": "serving_gemma3-4b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "ibm-granite/granite-3.2-2b-instruct",
- "tensor_parallel_size": 1
+ "model": "google/gemma-3-4b-it"
},
"client_parameters": {
- "model": "ibm-granite/granite-3.2-2b-instruct",
+ "model": "google/gemma-3-4b-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "test_name": "serving_gemma3-12b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "Qwen/Qwen3-1.7B",
- "tensor_parallel_size": 1
+ "model": "google/gemma-3-12b-it"
},
"client_parameters": {
- "model": "Qwen/Qwen3-1.7B",
+ "model": "google/gemma-3-12b-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_qwen4B_tp1_random_128_128",
+ "test_name": "serving_gemma4-4b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "Qwen/Qwen3-4B",
- "tensor_parallel_size": 1
+ "model": "google/gemma-4-E4B-it"
},
"client_parameters": {
- "model": "Qwen/Qwen3-4B",
+ "model": "google/gemma-4-E4B-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_qwen8B_tp1_random_128_128",
+ "test_name": "serving_gemma4-2b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0
+ },
"server_parameters": {
- "model": "Qwen/Qwen3-8B",
- "tensor_parallel_size": 1
+ "model": "google/gemma-4-E2B-it"
},
"client_parameters": {
- "model": "Qwen/Qwen3-8B",
+ "model": "google/gemma-4-E2B-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_glm9B_tp1_random_128_128",
+ "test_name": "serving_gemma4-26b_tp1_random_128_128",
+ "server_environment_variables": {
+ "VLLM_CPU_SGL_KERNEL": 0,
+ "VLLM_CPU_ATTN_SPLIT_KV": 0
+ },
"server_parameters": {
- "model": "zai-org/glm-4-9b-hf",
- "tensor_parallel_size": 1
+ "model": "google/gemma-4-26B-A4B-it"
},
"client_parameters": {
- "model": "zai-org/glm-4-9b-hf",
+ "model": "google/gemma-4-26B-A4B-it",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
- "test_name": "serving_gemma7B_tp1_random_128_128",
+ "test_name": "serving_phi4_tp1_random_128_128",
"server_parameters": {
- "model": "google/gemma-7b",
- "tensor_parallel_size": 1
+ "model": "microsoft/Phi-4-reasoning"
},
"client_parameters": {
- "model": "google/gemma-7b",
+ "model": "microsoft/Phi-4-reasoning",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index f66ef2af4bd6..c2d7768e2026 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -26,6 +26,7 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
+ "temperature": 0,
"num_prompts": 200
}
},
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
index 3929aa5fbbe0..d5ef981689dd 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -21,6 +21,7 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "temperature": 0,
"num_prompts": 200
}
},
@@ -47,6 +48,7 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "temperature": 0,
"num_prompts": 200
}
},
@@ -73,6 +75,7 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "temperature": 0,
"num_prompts": 200
}
},
@@ -100,6 +103,7 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "temperature": 0,
"num_prompts": 200
}
},
@@ -127,6 +131,7 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "temperature": 0,
"num_prompts": 200
}
},
@@ -151,6 +156,7 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "temperature": 0,
"num_prompts": 200
}
}
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json
index 66d52abc1206..2cbd472295e7 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -1,73 +1,107 @@
-[
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "temperature": 0,
+ "num_prompts": 200
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
{
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "disable_log_stats": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama70B_tp4_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
- "tensor_parallel_size": 4,
- "disable_log_stats": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
+ "test_name": "serving_llama8B_tp1_random_2048_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 2048
+ }
},
{
- "test_name": "serving_mixtral8x7B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "server_parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "tensor_parallel_size": 2,
- "disable_log_stats": "",
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
+ "test_name": "serving_llama70B_tp4_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.3-70B-Instruct",
+ "async_scheduling": "",
+ "no_enable_prefix_caching": "",
+ "max_num_batched_tokens": 8192
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.3-70B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
},
{
- "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
- "qps_list": [2],
- "server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
- "tensor_parallel_size": 4,
- "speculative_config": {
- "model": "turboderp/Qwama-0.5B-Instruct",
- "num_speculative_tokens": 4,
- "draft_tensor_parallel_size": 1
- }
- },
- "client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
+ "test_name": "serving_gemma4-e4b_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-4-E4B-it",
+ "enable_auto_tool_choice": "",
+ "tool_call_parser": "gemma4",
+ "chat_template": "examples/tool_chat_template_gemma4.jinja",
+ "reasoning_parser": "gemma4"
+ },
+ "client_parameters": {
+ "model": "google/gemma-4-E4B-it",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
}
-]
+ ]
+}
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 45b2996f7ead..df9b80f7f9a8 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,3 +1,16 @@
+# CUDA architecture lists — following PyTorch RELEASE.md
+# (https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
+# SM86 included for broader Ampere coverage; SM89 for marlin fp8 support
+env:
+ CUDA_ARCH_X86: "7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
+ # aarch64-only architectures: 8.7 for Orin, 11.0 for Thor (since CUDA 13)
+ CUDA_ARCH_AARCH64: "8.0 8.7 8.9 9.0 10.0 11.0 12.0+PTX"
+ CUDA_ARCH_X86_CU129: "7.5 8.0 8.6 8.9 9.0 10.0 12.0"
+ CUDA_ARCH_AARCH64_CU129: "8.0 8.7 8.9 9.0 10.0 12.0"
+ MOONCAKE_WHEEL_AARCH64_2_35: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_aarch64.whl"
+ MOONCAKE_WHEEL_AARCH64_2_39: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_39_aarch64.whl"
+ MOONCAKE_WHEEL_X86_64: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_x86_64.whl"
+
steps:
- input: "Provide Release version here"
id: input-release-version
@@ -14,12 +27,11 @@ steps:
agents:
queue: arm64_cpu_queue_release
commands:
- # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
- # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -29,12 +41,11 @@ steps:
agents:
queue: arm64_cpu_queue_release
commands:
- # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
- # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -47,7 +58,8 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -57,10 +69,11 @@ steps:
agents:
queue: cpu_queue_release
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -70,10 +83,11 @@ steps:
agents:
queue: cpu_queue_release
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -86,7 +100,8 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+ - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"'
env:
DOCKER_BUILDKIT: "1"
@@ -98,105 +113,236 @@ steps:
commands:
- "bash .buildkite/scripts/generate-and-upload-nightly-index.sh"
+ - block: "Unblock to build release Docker images"
+ depends_on: ~
+ key: block-build-release-images
+ if: build.env("NIGHTLY") != "1"
+
- group: "Build release Docker images"
key: "build-release-images"
+ depends_on: block-build-release-images
+ allow_dependency_failure: true
steps:
- - label: "Build release image - x86_64 - CUDA 12.9"
+ - label: "Build release image - x86_64 - CUDA 13.0"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"'
- - label: "Build release image - aarch64 - CUDA 12.9"
+ - label: "Build release image - aarch64 - CUDA 13.0"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"'
- - label: "Build release image - x86_64 - CUDA 13.0"
+ - label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
- id: build-release-image-x86-cuda-13-0
+ id: build-release-image-x86-cuda-12-9
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"
# re-tag to default image tag and push, just in case arm64 build fails
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"'
- - label: "Build release image - aarch64 - CUDA 13.0"
+ - label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
- id: build-release-image-arm64-cuda-13-0
+ id: build-release-image-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"'
- - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
+ - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-x86-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"'
- - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
+ - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-arm64-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=13.0.2 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"'
- - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
+ - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
- id: build-release-image-x86-cuda-13-0-ubuntu2404
+ id: build-release-image-x86-cuda-12-9-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"'
- - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
+ - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
- id: build-release-image-arm64-cuda-13-0-ubuntu2404
+ id: build-release-image-arm64-cuda-12-9-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
+ - |
+ DOCKER_BUILDKIT=1 docker build \
+ $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \
+ --build-arg max_jobs=16 \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg GIT_REPO_CHECK=1 \
+ --build-arg CUDA_VERSION=12.9.1 \
+ --build-arg UBUNTU_VERSION=24.04 \
+ --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
+ --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \
+ --build-arg INSTALL_KV_CONNECTORS=true \
+ --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \
+ --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \
+ --target vllm-openai \
+ --progress plain \
+ -f docker/Dockerfile .
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"'
- block: "Build release image for x86_64 CPU"
key: block-cpu-release-image-build
depends_on: ~
- label: "Build release image - x86_64 - CPU"
+ key: build-cpu-release-image-x86
depends_on:
- block-cpu-release-image-build
- input-release-version
@@ -207,6 +353,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"'
env:
DOCKER_BUILDKIT: "1"
@@ -215,7 +362,8 @@ steps:
depends_on: ~
- label: "Build release image - arm64 - CPU"
- depends_on:
+ key: build-cpu-release-image-arm64
+ depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
agents:
@@ -225,13 +373,14 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"'
env:
DOCKER_BUILDKIT: "1"
- group: "Publish release images"
key: "publish-release-images"
steps:
- - label: "Create multi-arch manifest - CUDA 12.9"
+ - label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86
- build-release-image-arm64
@@ -242,29 +391,22 @@ steps:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"'
- - label: "Annotate release workflow - CUDA 12.9"
- depends_on:
- - create-multi-arch-manifest
- id: annotate-release-workflow
- agents:
- queue: small_cpu_queue_release
- commands:
- - "bash .buildkite/scripts/annotate-release.sh"
-
- - label: "Create multi-arch manifest - CUDA 13.0"
+ - label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- - build-release-image-x86-cuda-13-0
- - build-release-image-arm64-cuda-13-0
- id: create-multi-arch-manifest-cuda-13-0
+ - build-release-image-x86-cuda-12-9
+ - build-release-image-arm64-cuda-12-9
+ id: create-multi-arch-manifest-cuda-12-9
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"'
- - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
+ - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
depends_on:
- build-release-image-x86-ubuntu2404
- build-release-image-arm64-ubuntu2404
@@ -275,18 +417,20 @@ steps:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"'
- - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
+ - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
depends_on:
- - build-release-image-x86-cuda-13-0-ubuntu2404
- - build-release-image-arm64-cuda-13-0-ubuntu2404
- id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
+ - build-release-image-x86-cuda-12-9-ubuntu2404
+ - build-release-image-arm64-cuda-12-9-ubuntu2404
+ id: create-multi-arch-manifest-cuda-12-9-ubuntu2404
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
- - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129-ubuntu2404 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"
+ - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"'
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
@@ -306,16 +450,16 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
+ - label: "Publish nightly multi-arch image to DockerHub - CUDA 12.9"
depends_on:
- - create-multi-arch-manifest-cuda-13-0
+ - create-multi-arch-manifest-cuda-12-9
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_release
commands:
- - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+ - "bash .buildkite/scripts/push-nightly-builds.sh cu129"
# Clean up old nightly builds (keep only last 14)
- - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu129-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
@@ -324,24 +468,6 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- - group: "Publish wheels"
- key: "publish-wheels"
- steps:
- - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
- key: block-upload-release-wheels
- depends_on:
- - input-release-version
- - build-wheels
-
- - label: "Upload release wheels to PyPI"
- depends_on:
- - block-upload-release-wheels
- id: upload-release-wheels
- agents:
- queue: small_cpu_queue_release
- commands:
- - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
-
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# =============================================================================
@@ -455,7 +581,7 @@ steps:
echo ""
echo " Build complete - Image and wheels cached"
fi
-
+
artifact_paths:
- "artifacts/rocm-base-wheels/*.whl"
env:
@@ -611,12 +737,14 @@ steps:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
- VARIANT: "rocm721"
+ VARIANT: "rocm722"
# ROCm Job 6: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
id: build-rocm-release-image
depends_on:
+ - step: block-build-release-images
+ allow_failure: true
- step: build-rocm-base-wheels
allow_failure: false
agents:
@@ -669,7 +797,7 @@ steps:
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
-
+
echo ""
echo " Successfully built and pushed ROCm release image"
echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
@@ -696,3 +824,60 @@ steps:
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
+
+ # =============================================================================
+ # Publish to DockerHub and PyPI (at the end so all builds complete first)
+ # =============================================================================
+
+ - block: "Publish release images to DockerHub"
+ key: block-publish-release-images
+ depends_on:
+ - create-multi-arch-manifest
+ - create-multi-arch-manifest-cuda-12-9
+ - create-multi-arch-manifest-ubuntu2404
+ - create-multi-arch-manifest-cuda-12-9-ubuntu2404
+ - build-rocm-release-image
+ - input-release-version
+ # Wait for CPU builds if their block steps were unblocked, so publish
+ # doesn't race the in-progress CPU build. allow_failure lets publish
+ # proceed when the operator deliberately never unblocks the CPU block
+ # steps (leaving them blocked) or the CPU build fails.
+ - step: build-cpu-release-image-x86
+ allow_failure: true
+ - step: build-cpu-release-image-arm64
+ allow_failure: true
+ if: build.env("NIGHTLY") != "1"
+
+ - label: "Publish release images to DockerHub"
+ depends_on:
+ - block-publish-release-images
+ key: publish-release-images-dockerhub
+ agents:
+ queue: small_cpu_queue_release
+ commands:
+ - "bash .buildkite/scripts/publish-release-images.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
+ env:
+ DOCKER_BUILDKIT: "1"
+ DOCKERHUB_USERNAME: "vllmbot"
+
+ - group: "Publish wheels"
+ key: "publish-wheels"
+ steps:
+ - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+ key: block-upload-release-wheels
+ depends_on:
+ - input-release-version
+ - build-wheels
+
+ - label: "Upload release wheels to PyPI"
+ depends_on:
+ - block-upload-release-wheels
+ id: upload-release-wheels
+ agents:
+ queue: small_cpu_queue_release
+ commands:
+ - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
diff --git a/.buildkite/scripts/annotate-build-artifact.sh b/.buildkite/scripts/annotate-build-artifact.sh
new file mode 100755
index 000000000000..67cdf7923658
--- /dev/null
+++ b/.buildkite/scripts/annotate-build-artifact.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Append a build artifact line to the Buildkite annotation.
+# Usage: annotate-build-artifact.sh LABEL ARTIFACT_REF
-🔥 We have built a vllm website to help you get started with vllm. Please visit [vllm.ai](https://vllm.ai) to learn more.
+🔥 We have built a vLLM website to help you get started with vLLM. Please visit [vllm.ai](https://vllm.ai) to learn more.
For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
---
@@ -23,47 +23,54 @@ For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
vLLM is a fast and easy-to-use library for LLM inference and serving.
-Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has grown into one of the most active open-source AI projects, built and maintained by a diverse community of over 2000 contributors from many dozens of academic institutions and companies.
vLLM is fast with:
- State-of-the-art serving throughput
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
-- Continuous batching of incoming requests
-- Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
-- Speculative decoding
-- Chunked prefill
+- Continuous batching of incoming requests, chunked prefill, prefix caching
+- Fast and flexible model execution with piecewise and full CUDA/HIP graphs
+- Quantization: FP8, MXFP8/MXFP4, NVFP4, INT8, INT4, GPTQ/AWQ, GGUF, compressed-tensors, ModelOpt, TorchAO, and [more](https://docs.vllm.ai/en/latest/features/quantization/index.html)
+- Optimized attention kernels including FlashAttention, FlashInfer, TRTLLM-GEN, FlashMLA, and Triton
+- Optimized GEMM/MoE kernels for various precisions using CUTLASS, TRTLLM-GEN, CuTeDSL
+- Speculative decoding including n-gram, suffix, EAGLE, DFlash
+- Automatic kernel generation and graph-level transformations using torch.compile
+- Disaggregated prefill, decode, and encode
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor, pipeline, data and expert parallelism support for distributed inference
+- Tensor, pipeline, data, expert, and context parallelism for distributed inference
- Streaming outputs
-- OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
-- Prefix caching support
-- Multi-LoRA support
+- Generation of structured outputs using xgrammar or guidance
+- Tool calling and reasoning parsers
+- OpenAI-compatible API server, plus Anthropic Messages API and gRPC support
+- Efficient multi-LoRA support for dense and MoE layers
+- Support for NVIDIA GPUs, AMD GPUs, and x86/ARM/PowerPC CPUs. Additionally, support for diverse hardware plugins such as Google TPUs, Intel Gaudi, IBM Spyre, Huawei Ascend, Rebellions NPU, Apple Silicon, MetaX GPU, and more.
-vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+vLLM seamlessly supports 200+ model architectures on Hugging Face, including:
-- Transformer-like LLMs (e.g., Llama)
-- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g., E5-Mistral)
-- Multi-modal LLMs (e.g., LLaVA)
+- Decoder-only LLMs (e.g., Llama, Qwen, Gemma)
+- Mixture-of-Experts LLMs (e.g., Mixtral, DeepSeek-V3, Qwen-MoE, GPT-OSS)
+- Hybrid attention and state-space models (e.g., Mamba, Qwen3.5)
+- Multi-modal models (e.g., LLaVA, Qwen-VL, Pixtral)
+- Embedding and retrieval models (e.g., E5-Mistral, GTE, ColBERT)
+- Reward and classification models (e.g., Qwen-Math)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
## Getting Started
-Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):
+Install vLLM with [`uv`](https://docs.astral.sh/uv/) (recommended) or `pip`:
```bash
-pip install vllm
+uv pip install vllm
```
+Or [build from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source) for development.
+
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
diff --git a/tests/entrypoints/openai/realtime/__init__.py b/benchmarks/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/realtime/__init__.py
rename to benchmarks/__init__.py
diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
index d758654dbe80..8f12ac723064 100644
--- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
@@ -53,6 +53,7 @@ backends:
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
+ - TOKENSPEED_MLA # Blackwell + R1 dims + FP8 KV (use --kv-cache-dtype fp8)
device: "cuda:0"
repeats: 100
diff --git a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
index 122dbd783c5b..1e1ab264bace 100644
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
@@ -3,6 +3,7 @@
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
+# CuTe DSL: tokenspeed (Blackwell + R1 dims, requires tokenspeed_mla)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
@@ -120,6 +121,7 @@ prefill_backends:
- flashinfer
- cudnn
- trtllm
+ - tokenspeed
device: "cuda:0"
repeats: 20
diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py
index f8bc7b4a10ed..abab1e2edbac 100644
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -29,6 +29,7 @@
VllmConfig,
set_current_vllm_config,
)
+from vllm.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
# ============================================================================
# VllmConfig Creation
@@ -79,8 +80,8 @@ def create_minimal_vllm_config(
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
- "cudnn", "trtllm"). Configures the attention config to
- force the specified prefill backend.
+ "trtllm"). Configures the attention config to force
+ the specified prefill backend.
Returns:
VllmConfig for benchmarking
@@ -179,19 +180,13 @@ def create_minimal_vllm_config(
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
+ vllm_config.attention_config.mla_prefill_backend = prefill_cfg[
+ "mla_prefill_backend"
+ ]
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
- vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
- "disable_flashinfer_prefill"
- ]
- vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
- "use_cudnn_prefill"
- ]
- vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
- "use_trtllm_ragged_deepseek_prefill"
- ]
return vllm_config
@@ -206,39 +201,27 @@ def create_minimal_vllm_config(
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
"fa2": {
"flash_attn_version": 2,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASH_ATTN,
},
"fa3": {
"flash_attn_version": 3,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASH_ATTN,
},
"fa4": {
"flash_attn_version": 4,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASH_ATTN,
},
"flashinfer": {
"flash_attn_version": None,
- "disable_flashinfer_prefill": False,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.FLASHINFER,
},
- "cudnn": {
+ "trtllm": {
"flash_attn_version": None,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": True,
- "use_trtllm_ragged_deepseek_prefill": False,
+ "mla_prefill_backend": MLAPrefillBackendEnum.TRTLLM_RAGGED,
},
- "trtllm": {
+ "tokenspeed": {
"flash_attn_version": None,
- "disable_flashinfer_prefill": True,
- "use_cudnn_prefill": False,
- "use_trtllm_ragged_deepseek_prefill": True,
+ "mla_prefill_backend": MLAPrefillBackendEnum.TOKENSPEED_MLA,
},
}
@@ -404,6 +387,7 @@ def _build_attention_metadata(
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
+ seq_lens_cpu_upper_bound=seq_lens_cpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
@@ -624,6 +608,21 @@ def _create_backend_impl(
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
+ # Attach a prefill backend (MLAAttention does this in __init__; the metadata
+ # builder reads layer.prefill_backend from static_forward_context).
+ from vllm.v1.attention.backends.mla.prefill import get_mla_prefill_backend
+
+ prefill_backend_cls = get_mla_prefill_backend(vllm_config)
+ layer.prefill_backend = prefill_backend_cls(
+ num_heads=mla_dims["num_q_heads"],
+ scale=(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) ** -0.5,
+ kv_lora_rank=mla_dims["kv_lora_rank"],
+ qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
+ qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
+ v_head_dim=mla_dims["v_head_dim"],
+ vllm_config=vllm_config,
+ )
+
# Create builder instance if needed
builder_instance = None
if builder_class:
@@ -960,19 +959,6 @@ def _run_mla_benchmark_batched(
results = []
with set_current_vllm_config(vllm_config):
- # Clear cached prefill backend detection functions so they re-evaluate
- # with the current VllmConfig. These are @functools.cache decorated and
- # would otherwise return stale results from a previous backend's config.
- from vllm.model_executor.layers.attention.mla_attention import (
- use_cudnn_prefill,
- use_flashinfer_prefill,
- use_trtllm_ragged_deepseek_prefill,
- )
-
- use_flashinfer_prefill.cache_clear()
- use_cudnn_prefill.cache_clear()
- use_trtllm_ragged_deepseek_prefill.cache_clear()
-
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
@@ -984,38 +970,36 @@ def _run_mla_benchmark_batched(
kv_cache_dtype=kv_cache_dtype,
)
- # Verify the actual prefill backend matches what was requested
+ # Verify the actual prefill backend matches what was requested. The
+ # selector + impl construction already raise on misuse; here we just
+ # check the resolved class against the requested name as a sanity guard.
if prefill_backend is not None:
- prefill_cfg = get_prefill_backend_config(prefill_backend)
- fa_version = prefill_cfg["flash_attn_version"]
-
- if fa_version is not None:
- # FA backend: verify the impl's FA version
- actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
+ expected_class = {
+ "fa2": "FlashAttnPrefillBackend",
+ "fa3": "FlashAttnPrefillBackend",
+ "fa4": "FlashAttnPrefillBackend",
+ "flashinfer": "FlashInferPrefillBackend",
+ "trtllm": "TrtllmRaggedPrefillBackend",
+ "tokenspeed": "TokenspeedMLAPrefillBackend",
+ }.get(prefill_backend)
+ actual_class = type(getattr(layer, "prefill_backend", None)).__name__
+ if expected_class and actual_class != expected_class:
+ raise RuntimeError(
+ f"Prefill backend '{prefill_backend}' requested "
+ f"{expected_class}, got {actual_class}. Check "
+ f"attention_config plumbing or installed deps."
+ )
+ if prefill_backend in {"fa2", "fa3", "fa4"}:
+ fa_version = int(prefill_backend[2:])
+ actual_fa_version = getattr(
+ layer.prefill_backend, "vllm_flash_attn_version", None
+ )
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
- f"version {fa_version}, but the impl is using FA "
- f"version {actual_fa_version}. Check "
- f"vllm/v1/attention/backends/fa_utils.py."
+ f"version {fa_version}, got "
+ f"{actual_fa_version} on {actual_class}."
)
- else:
- # Non-FA backend: verify the builder picked the right path
- expected_flags = {
- "flashinfer": "_use_fi_prefill",
- "cudnn": "_use_cudnn_prefill",
- "trtllm": "_use_trtllm_ragged_prefill",
- }
- flag_name = expected_flags.get(prefill_backend)
- if flag_name and not getattr(builder_instance, flag_name, False):
- raise RuntimeError(
- f"Prefill backend '{prefill_backend}' was requested "
- f"but the metadata builder did not enable it. This "
- f"usually means a dependency is missing (e.g., "
- f"flashinfer not installed) or the platform doesn't "
- f"support it."
- )
-
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
diff --git a/benchmarks/benchmark_hidden_state_extraction.py b/benchmarks/benchmark_hidden_state_extraction.py
new file mode 100644
index 000000000000..6056fcdd072c
--- /dev/null
+++ b/benchmarks/benchmark_hidden_state_extraction.py
@@ -0,0 +1,415 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark hidden state extraction throughput.
+
+Measures two modes:
+ 1. Baseline: bulk inference with max_tokens=1, no extraction.
+ 2. Extract: async hidden state extraction via ExampleHiddenStatesConnector
+ with N concurrent clients, each consuming hidden states as
+ soon as their request finishes (overlapping I/O with generation).
+
+Reports tokens/s and prompts/s for each mode.
+
+Usage:
+ python benchmarks/benchmark_hidden_state_extraction.py \
+ --model Qwen/Qwen3-0.6B \
+ --num-prompts 64 \
+ --num-clients 8 \
+ --prompt-len 8192 \
+ --layers 1 2 3 4
+"""
+
+import argparse
+import asyncio
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import torch
+from transformers import AutoConfig
+
+from vllm import LLM, SamplingParams
+from vllm.config.kv_transfer import KVTransferConfig
+from vllm.distributed.kv_transfer.kv_connector.v1 import (
+ example_hidden_states_connector,
+)
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.sampling_params import RequestOutputKind
+from vllm.v1.engine.async_llm import AsyncLLM
+
+
+def _make_profiler_config(profile_dir: str) -> dict:
+    """Return the ``profiler_config`` mapping that turns on torch profiling.
+
+    Args:
+        profile_dir: Directory passed through as ``torch_profiler_dir``.
+
+    Returns:
+        A dict selecting the ``"torch"`` profiler with stack recording enabled.
+    """
+    return dict(
+        profiler="torch",
+        torch_profiler_dir=profile_dir,
+        torch_profiler_with_stack=True,
+    )
+
+
+def make_random_prompts(
+    num_prompts: int, prompt_len: int, vocab_size: int, seed: int = 42
+) -> list[list[int]]:
+    """Build ``num_prompts`` prompts of ``prompt_len`` random token IDs each.
+
+    Token IDs are drawn with ``torch.randint`` from ``[0, vocab_size)`` after
+    seeding the global torch RNG with ``seed``, so the same arguments always
+    yield the same prompts.
+    """
+    torch.manual_seed(seed)
+    prompts: list[list[int]] = []
+    for _ in range(num_prompts):
+        token_ids = torch.randint(0, vocab_size, (prompt_len,))
+        prompts.append(token_ids.tolist())
+    return prompts
+
+
+def consume_hidden_states(path: str) -> float:
+    """Read one request's hidden states from disk and reduce them to a scalar.
+
+    The file is loaded through the connector's ``load_hidden_states`` helper,
+    the ``"hidden_states"`` entry is averaged over all of its elements, and the
+    path is then handed to ``cleanup_hidden_states``. Reducing the tensor makes
+    the benchmark actually read the data rather than just open the file.
+
+    NOTE(review): the original author documents ``load_hidden_states`` as
+    taking a shared flock and blocking (without polling) until the async
+    writer drops its exclusive lock; that is not visible from this file.
+
+    Args:
+        path: Location of the saved hidden states.
+
+    Returns:
+        The grand mean of all hidden state values as a Python float.
+    """
+    connector = example_hidden_states_connector
+    payload = connector.load_hidden_states(path)
+    grand_mean = payload["hidden_states"].mean().item()
+    connector.cleanup_hidden_states(path)
+    return grand_mean
+
+
+def run_baseline(
+    model: str,
+    prompts: list[list[int]],
+    extra_args: dict,
+    profile_dir: str | None = None,
+) -> dict:
+    """Time plain bulk inference (one generated token per prompt, no extraction).
+
+    Args:
+        model: Model name or path handed to ``LLM``.
+        prompts: Prompts as lists of token IDs.
+        extra_args: Additional keyword arguments forwarded to ``LLM``.
+        profile_dir: When set, a torch profiler config pointing at this
+            directory is added and profiling wraps the timed run.
+
+    Returns:
+        A dict with the mode name, wall time, prompt and token counts, and the
+        derived tokens/s and prompts/s rates.
+    """
+    llm_kwargs = dict(extra_args)
+    if profile_dir:
+        llm_kwargs["profiler_config"] = _make_profiler_config(profile_dir)
+    llm = LLM(
+        model=model,
+        enable_prefix_caching=False,
+        enable_chunked_prefill=False,
+        **llm_kwargs,
+    )
+    one_token = SamplingParams(max_tokens=1)
+    requests = [{"prompt_token_ids": token_ids} for token_ids in prompts]
+
+    # Untimed warmup on the first few prompts.
+    llm.generate(requests[:4], one_token, use_tqdm=False)
+
+    if profile_dir:
+        llm.start_profile()
+
+    started = time.perf_counter()
+    outputs = llm.generate(requests, one_token, use_tqdm=True)
+    elapsed = time.perf_counter() - started
+
+    if profile_dir:
+        llm.stop_profile()
+
+    num_prompts = len(outputs)
+    total_prompt_tokens = 0
+    for finished in outputs:
+        total_prompt_tokens += len(finished.prompt_token_ids)
+
+    # Drop the engine and release cached accelerator memory before returning.
+    del llm
+    torch.accelerator.empty_cache()
+
+    return {
+        "mode": "baseline",
+        "elapsed_s": elapsed,
+        "num_prompts": num_prompts,
+        "total_prompt_tokens": total_prompt_tokens,
+        "tokens_per_s": total_prompt_tokens / elapsed,
+        "prompts_per_s": num_prompts / elapsed,
+    }
+
+
+# ---- Async extraction benchmark ----
+
+
+async def _client_loop(
+    engine: AsyncLLM,
+    prompt_queue: asyncio.Queue,
+    consume_pool: ThreadPoolExecutor,
+    results: list[dict],
+    client_id: int,
+):
+    """Run one async client until it pulls a ``None`` sentinel from the queue.
+
+    For every ``(idx, token_ids)`` item the client submits a one-token request
+    to the engine, waits for the finished output, loads the hidden states on
+    ``consume_pool`` so disk I/O stays off the event loop, and appends a
+    summary record to ``results``.
+
+    Args:
+        engine: Engine whose ``generate`` async iterator is consumed.
+        prompt_queue: Queue of ``(idx, token_ids)`` items, ended by ``None``.
+        consume_pool: Executor that runs ``consume_hidden_states``.
+        results: Shared list that receives one dict per completed request.
+        client_id: Index of this client; currently unused.
+
+    Raises:
+        RuntimeError: If a request ends without a finished output or without a
+            ``hidden_states_path`` in its ``kv_transfer_params``.
+    """
+    loop = asyncio.get_running_loop()
+    while True:
+        item = await prompt_queue.get()
+        if item is None:
+            # Sentinel: no more work for this client.
+            prompt_queue.task_done()
+            break
+        idx, token_ids = item
+
+        request_id = f"req-{idx}"
+        sampling_params = SamplingParams(
+            max_tokens=1,
+            output_kind=RequestOutputKind.FINAL_ONLY,
+        )
+
+        final_output = None
+        async for output in engine.generate(
+            request_id=request_id,
+            prompt={"prompt_token_ids": token_ids},
+            sampling_params=sampling_params,
+        ):
+            if output.finished:
+                final_output = output
+
+        # Fail with a clear message instead of an AttributeError/TypeError on
+        # None when the engine produced no finished output or no file path.
+        transfer_params = getattr(final_output, "kv_transfer_params", None) or {}
+        path = transfer_params.get("hidden_states_path")
+        if path is None:
+            raise RuntimeError(
+                f"Request {request_id} finished without a hidden_states_path "
+                "in kv_transfer_params; cannot consume its hidden states."
+            )
+
+        # Consume hidden states on a thread (disk I/O)
+        mean_val = await loop.run_in_executor(consume_pool, consume_hidden_states, path)
+        num_tokens = len(final_output.prompt_token_ids)
+
+        results.append(
+            {
+                "request_id": request_id,
+                "num_prompt_tokens": num_tokens,
+                "mean_hidden_value": mean_val,
+            }
+        )
+        prompt_queue.task_done()
+
+
+async def _run_extraction_async(
+    model: str,
+    prompts: list[list[int]],
+    num_clients: int,
+    layers: list[int],
+    tmpdir: str,
+    extra_args: dict,
+    profile_dir: str | None = None,
+) -> dict:
+    """Time hidden state extraction with ``num_clients`` concurrent clients.
+
+    Builds an ``AsyncLLM`` configured with the ``extract_hidden_states``
+    speculative method and ``ExampleHiddenStatesConnector`` writing under
+    ``tmpdir``, warms it up on up to four prompts, then measures how long the
+    clients take to drain a queue holding every prompt. The engine is shut
+    down on exit, including on failure.
+
+    Args:
+        model: Model name or path.
+        prompts: Prompts as lists of token IDs.
+        num_clients: Number of concurrent client tasks and consumer threads.
+        layers: Layer IDs passed as ``eagle_aux_hidden_state_layer_ids``.
+        tmpdir: Directory used as the connector's ``shared_storage_path``.
+        extra_args: Extra ``AsyncEngineArgs`` keyword arguments; these override
+            the default ``max_num_batched_tokens`` / ``max_model_len`` of 40960.
+        profile_dir: When set, torch profiling wraps the timed section.
+
+    Returns:
+        A dict with the mode name, wall time, prompt and token counts, the
+        derived rates, and the average of the per-request hidden state means.
+    """
+    # The size limits are only defaults: merging them into one dict lets values
+    # from extra_args win instead of colliding with literal keyword arguments
+    # ("got multiple values for keyword argument").
+    engine_kwargs = {
+        "max_num_batched_tokens": 40960,
+        "max_model_len": 40960,
+        **extra_args,
+    }
+    if profile_dir:
+        engine_kwargs["profiler_config"] = _make_profiler_config(profile_dir)
+    engine_args = AsyncEngineArgs(
+        model=model,
+        enable_prefix_caching=False,
+        enable_chunked_prefill=False,
+        speculative_config={
+            "method": "extract_hidden_states",
+            "num_speculative_tokens": 1,
+            "draft_model_config": {
+                "hf_config": {
+                    "eagle_aux_hidden_state_layer_ids": layers,
+                },
+            },
+        },
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="ExampleHiddenStatesConnector",
+            kv_role="kv_producer",
+            kv_connector_extra_config={
+                "shared_storage_path": tmpdir,
+            },
+        ),
+        **engine_kwargs,
+    )
+    engine = AsyncLLM.from_engine_args(engine_args)
+
+    try:
+        # Warmup: run a few prompts sequentially, cleaning up generated files
+        for i in range(min(4, len(prompts))):
+            sp = SamplingParams(max_tokens=1, output_kind=RequestOutputKind.FINAL_ONLY)
+            final_output = None
+            async for output in engine.generate(
+                request_id=f"warmup-{i}",
+                prompt={"prompt_token_ids": prompts[i]},
+                sampling_params=sp,
+            ):
+                if output.finished:
+                    final_output = output
+            if final_output and final_output.kv_transfer_params:
+                path = final_output.kv_transfer_params.get("hidden_states_path")
+                if path:
+                    example_hidden_states_connector.cleanup_hidden_states(path)
+
+        if profile_dir:
+            await engine.start_profile()
+
+        # Fill prompt queue
+        prompt_queue: asyncio.Queue = asyncio.Queue()
+        for idx, token_ids in enumerate(prompts):
+            prompt_queue.put_nowait((idx, token_ids))
+        # Sentinel per client
+        for _ in range(num_clients):
+            prompt_queue.put_nowait(None)
+
+        results: list[dict] = []
+        # The context manager shuts the pool down even when a client raises.
+        with ThreadPoolExecutor(max_workers=num_clients) as consume_pool:
+            t0 = time.perf_counter()
+            tasks = [
+                asyncio.create_task(
+                    _client_loop(engine, prompt_queue, consume_pool, results, i)
+                )
+                for i in range(num_clients)
+            ]
+            await asyncio.gather(*tasks)
+            # Stop the clock before the pool's shutdown wait.
+            elapsed = time.perf_counter() - t0
+
+        if profile_dir:
+            await engine.stop_profile()
+
+        total_prompt_tokens = sum(r["num_prompt_tokens"] for r in results)
+        num_prompts = len(results)
+        # max(..., 1) avoids dividing by zero when no request completed.
+        mean_hidden = sum(r["mean_hidden_value"] for r in results) / max(
+            num_prompts, 1
+        )
+
+        return {
+            "mode": "extract",
+            "elapsed_s": elapsed,
+            "num_prompts": num_prompts,
+            "total_prompt_tokens": total_prompt_tokens,
+            "tokens_per_s": total_prompt_tokens / elapsed,
+            "prompts_per_s": num_prompts / elapsed,
+            "mean_hidden_value": mean_hidden,
+        }
+    finally:
+        engine.shutdown()
+
+
+def run_extraction(
+    model: str,
+    prompts: list[list[int]],
+    num_clients: int,
+    layers: list[int],
+    extra_args: dict,
+    profile_dir: str | None = None,
+) -> dict:
+    """Synchronous entry point for the extraction benchmark.
+
+    Runs ``_run_extraction_async`` to completion with ``asyncio.run``, using
+    ``/dev/shm`` as the storage directory, and returns its result dict.
+    """
+    storage_dir = "/dev/shm"
+    benchmark_coro = _run_extraction_async(
+        model,
+        prompts,
+        num_clients,
+        layers,
+        storage_dir,
+        extra_args,
+        profile_dir=profile_dir,
+    )
+    return asyncio.run(benchmark_coro)
+
+
+def print_results(results: dict):
+    """Print a framed summary of one benchmark run to stdout.
+
+    Args:
+        results: Result dict from ``run_baseline`` or ``run_extraction``. The
+            ``mean_hidden_value`` line is printed only when ``mode`` is
+            ``"extract"``.
+    """
+    mode = results["mode"]
+    rule = "=" * 60
+    report = [
+        f"\n{rule}",
+        f" {mode.upper()} RESULTS",
+        rule,
+        f" Prompts: {results['num_prompts']}",
+        f" Total prompt tokens: {results['total_prompt_tokens']:,}",
+        f" Wall time: {results['elapsed_s']:.2f}s",
+        f" Tokens/s: {results['tokens_per_s']:,.0f}",
+        f" Prompts/s: {results['prompts_per_s']:.2f}",
+    ]
+    if mode == "extract":
+        report.append(f" Mean hidden value: {results['mean_hidden_value']:.6f}")
+    report.append(f"{rule}\n")
+    for line in report:
+        print(line)
+
+
+def main():
+    """Parse CLI options, build random prompts and run the selected modes.
+
+    Runs the baseline and/or the extraction benchmark (each can be skipped
+    with its ``--skip-*`` flag), prints the results of each, and prints the
+    slowdown factor when both ran.
+
+    Raises:
+        ValueError: If both ``--max-num-batched-tokens`` and
+            ``--max-model-len`` are given and the former is the smaller one.
+    """
+    parser = argparse.ArgumentParser(
+        description="Benchmark hidden state extraction throughput"
+    )
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--num-prompts", type=int, default=64)
+    parser.add_argument("--num-clients", type=int, default=8)
+    parser.add_argument("--prompt-len", type=int, default=8192)
+    parser.add_argument("--layers", type=int, nargs="+", default=[1, 2, 3, 4])
+    parser.add_argument("--skip-baseline", action="store_true")
+    parser.add_argument("--skip-extract", action="store_true")
+    parser.add_argument("--gpu-memory-utilization", type=float, default=0.9)
+    parser.add_argument("--max-num-batched-tokens", type=int, default=None)
+    parser.add_argument("--max-cudagraph-capture-size", type=int, default=None)
+    parser.add_argument("--max-model-len", type=int, default=None)
+    parser.add_argument("--enforce-eager", action="store_true")
+    parser.add_argument("--load-format", type=str, default=None)
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Enable torch profiler for both baseline and extraction runs.",
+    )
+    parser.add_argument(
+        "--torch-profiler-dir",
+        type=str,
+        default="./vllm_profile",
+        help="Directory to save torch profiler traces (default: ./vllm_profile).",
+    )
+    parser.add_argument(
+        "--enable-flashinfer-autotune",
+        action="store_true",
+        default=False,
+        help="Enable FlashInfer autotuning (can be slow).",
+    )
+    args = parser.parse_args()
+
+    extra_args = {
+        "gpu_memory_utilization": args.gpu_memory_utilization,
+    }
+    if args.max_model_len is not None:
+        extra_args["max_model_len"] = args.max_model_len
+    if args.max_num_batched_tokens is not None:
+        extra_args["max_num_batched_tokens"] = args.max_num_batched_tokens
+        # Compare only when a token budget was supplied: the option defaults
+        # to None, and None < int raises TypeError.
+        if args.max_model_len and args.max_num_batched_tokens < args.max_model_len:
+            raise ValueError(
+                "max_num_batched_tokens must be >= max_model_len since chunked "
+                "prefill is not supported by hidden state extraction."
+            )
+    if args.enforce_eager:
+        extra_args["enforce_eager"] = True
+    if args.load_format is not None:
+        extra_args["load_format"] = args.load_format
+    if args.max_cudagraph_capture_size is not None:
+        extra_args["max_cudagraph_capture_size"] = args.max_cudagraph_capture_size
+    extra_args["enable_flashinfer_autotune"] = args.enable_flashinfer_autotune
+
+    # Get vocab size from HF config without loading the full model
+    hf_config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
+    vocab_size = hf_config.vocab_size
+    prompts = make_random_prompts(args.num_prompts, args.prompt_len, vocab_size)
+    print(
+        f"Generated {args.num_prompts} prompts, "
+        f"{args.prompt_len} tokens each (vocab {vocab_size})"
+    )
+
+    profile_dir = args.torch_profiler_dir if args.profile else None
+    if profile_dir:
+        print(f"Torch profiler enabled, traces will be saved to {profile_dir}/")
+
+    if not args.skip_baseline:
+        # Each mode writes its traces to its own subdirectory.
+        baseline_profile_dir = f"{profile_dir}/baseline" if profile_dir else None
+        baseline = run_baseline(
+            args.model, prompts, extra_args, profile_dir=baseline_profile_dir
+        )
+        print_results(baseline)
+
+    if not args.skip_extract:
+        extract_profile_dir = f"{profile_dir}/extract" if profile_dir else None
+        extract = run_extraction(
+            args.model,
+            prompts,
+            args.num_clients,
+            args.layers,
+            extra_args,
+            profile_dir=extract_profile_dir,
+        )
+        print_results(extract)
+
+    if not args.skip_baseline and not args.skip_extract:
+        slowdown = baseline["tokens_per_s"] / extract["tokens_per_s"]
+        print(f"Extraction slowdown factor: {slowdown:.2f}x")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 33aca831883a..664fa58dd49f 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -115,6 +115,39 @@ class SampleRequest:
def sample_requests(
tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
) -> list[SampleRequest]:
+    def _apply_random_prefix(
+        tokenizer: PreTrainedTokenizerBase,
+        requests: list[SampleRequest],
+        prefix_len: int,
+        seed: int,
+    ) -> list[SampleRequest]:
+        """Prepend one shared random token prefix to every request prompt.
+
+        ``prefix_len`` token IDs are sampled, with a generator seeded by
+        ``seed``, from ``range(tokenizer.vocab_size)`` minus the tokenizer's
+        special token IDs. The same prefix is placed in front of each prompt.
+
+        Returns ``requests`` unchanged when ``prefix_len`` is not positive or
+        when no non-special token ID is available. Otherwise returns new
+        ``SampleRequest`` objects whose ``prompt_len`` is recomputed by
+        re-tokenizing the decoded prompt; the other fields are copied over.
+        """
+        if prefix_len <= 0:
+            return requests
+        rng = np.random.default_rng(seed)
+        vocab_size = tokenizer.vocab_size
+        # A set makes each membership test O(1); scanning a list once per
+        # vocabulary ID is quadratic when there are many special tokens.
+        prohibited = set(getattr(tokenizer, "all_special_ids", None) or [])
+        allowed = np.array([i for i in range(vocab_size) if i not in prohibited])
+        if len(allowed) == 0:
+            return requests
+        prefix_ids = rng.integers(0, len(allowed), size=prefix_len)
+        prefix_token_ids = allowed[prefix_ids].tolist()
+        out = []
+        for req in requests:
+            prompt_ids = tokenizer(req.prompt, add_special_tokens=False).input_ids
+            full_ids = prefix_token_ids + prompt_ids
+            full_prompt = tokenizer.decode(full_ids, skip_special_tokens=False)
+            out.append(
+                SampleRequest(
+                    prompt=full_prompt,
+                    prompt_len=len(tokenizer(full_prompt).input_ids),
+                    expected_output_len=req.expected_output_len,
+                    schema=req.schema,
+                    structure_type=req.structure_type,
+                    completion=req.completion,
+                )
+            )
+        return out
+
if args.dataset == "json" or args.dataset == "json-unique":
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -261,6 +294,9 @@ def _filter_func(item):
)
)
+ requests = _apply_random_prefix(
+ tokenizer, requests, args.random_prefix_len, args.seed
+ )
return requests
@@ -945,6 +981,15 @@ def create_argument_parser():
"results in a more uniform arrival of requests.",
)
parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument(
+ "--random-prefix-len",
+ type=int,
+ default=0,
+ help=(
+ "Number of prefix tokens to prepend to every prompt. "
+ "The same prefix is used for all prompts to enable prefix caching."
+ ),
+ )
parser.add_argument(
"--trust-remote-code",
action="store_true",
diff --git a/benchmarks/fused_kernels/merge_attn_states_benchmarks.py b/benchmarks/fused_kernels/merge_attn_states_benchmarks.py
new file mode 100644
index 000000000000..26b04299b353
--- /dev/null
+++ b/benchmarks/fused_kernels/merge_attn_states_benchmarks.py
@@ -0,0 +1,264 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark: Fused FP8 output quantization in merge_attn_states
+
+Compares fused vs unfused approaches for producing FP8-quantized merged
+attention output:
+ 1. Fused CUDA -- single CUDA kernel (merge + FP8 quant)
+ 2. Fused Triton -- single Triton kernel (merge + FP8 quant)
+ 3. Unfused CUDA -- CUDA merge + torch.compiled FP8 quant
+ 4. Unfused Triton -- Triton merge + torch.compiled FP8 quant
+
+Usage:
+ python benchmarks/fused_kernels/merge_attn_states_benchmarks.py
+ python benchmarks/fused_kernels/merge_attn_states_benchmarks.py --tp 1 4 8
+ python benchmarks/fused_kernels/merge_attn_states_benchmarks.py --dtype bfloat16
+"""
+
+import argparse
+import itertools
+
+import torch
+
+from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
+from vllm.benchmarks.lib.utils import default_vllm_config
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.v1.attention.ops.triton_merge_attn_states import (
+ merge_attn_states as merge_attn_states_triton,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration defaults
+# ---------------------------------------------------------------------------
+
+# Token counts swept by default (override with --num-tokens).
+NUM_TOKENS_LIST = [1, 16, 64, 256, 1024, 4096]
+
+# (label, num_heads, head_size) — num_heads is for TP=1
+HEAD_CONFIGS = [
+    ("DeepSeek-V3 MLA", 128, 128),
+    ("Llama-70B", 64, 128),
+    ("Llama-8B", 32, 128),
+]
+
+# Simulated tensor-parallel sizes; build_configs divides num_heads by each.
+TP_SIZES = [1, 2, 4, 8]
+
+# Dtypes of the prefix/suffix attention outputs created by make_inputs.
+INPUT_DTYPES = [torch.float32, torch.float16, torch.bfloat16]
+
+# Quantiles requested from do_bench_cudagraph: the median first, then the 20th
+# and 80th percentiles (unpacked as ms, min_ms, max_ms in benchmark()).
+QUANTILES = [0.5, 0.2, 0.8]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def short_dtype(dtype: torch.dtype) -> str:
+    """Return the dtype's name without the leading ``torch.`` (e.g. ``float16``)."""
+    full_name = str(dtype)
+    return full_name.removeprefix("torch.")
+
+
+def make_inputs(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+):
+    """Create random prefix/suffix attention outputs and their LSEs on ``cuda``.
+
+    The outputs have shape ``(num_tokens, num_heads, head_size)`` in ``dtype``;
+    the LSEs have shape ``(num_heads, num_tokens)`` in float32, with roughly 5%
+    of the entries of each set to ``inf`` to exercise edge-case paths.
+
+    Returns:
+        ``(prefix_output, suffix_output, prefix_lse, suffix_lse)``.
+    """
+    out_shape = (num_tokens, num_heads, head_size)
+    lse_shape = (num_heads, num_tokens)
+
+    prefix_output = torch.randn(out_shape, dtype=dtype, device="cuda")
+    suffix_output = torch.randn(out_shape, dtype=dtype, device="cuda")
+    prefix_lse = torch.randn(lse_shape, dtype=torch.float32, device="cuda")
+    suffix_lse = torch.randn(lse_shape, dtype=torch.float32, device="cuda")
+
+    # Each LSE tensor gets its own independently drawn inf mask.
+    for lse in (prefix_lse, suffix_lse):
+        inf_mask = torch.rand(lse_shape, device="cuda") < 0.05
+        lse[inf_mask] = float("inf")
+
+    return prefix_output, suffix_output, prefix_lse, suffix_lse
+
+
+def build_configs(head_configs, num_tokens_list, input_dtypes, tp_sizes):
+    """Build the unique ``(num_tokens, num_heads, head_size, dtype_str)`` tuples.
+
+    ``num_heads`` is each head config's TP=1 head count integer-divided by the
+    TP size; combinations that leave fewer than one head are skipped. The
+    label is not part of the tuple, so different models and TP sizes can map to
+    the same shape (e.g. 128 heads at TP=2 and 64 heads at TP=1). Each distinct
+    tuple is returned once, in first-seen order, so that identical shapes are
+    not benchmarked repeatedly.
+    """
+    configs = []
+    seen = set()
+    for (_, nh, hs), nt, dtype, tp in itertools.product(
+        head_configs, num_tokens_list, input_dtypes, tp_sizes
+    ):
+        nh_tp = nh // tp
+        if nh_tp < 1:
+            continue
+        config = (nt, nh_tp, hs, short_dtype(dtype))
+        if config not in seen:
+            seen.add(config)
+            configs.append(config)
+    return configs
+
+
+def parse_args():
+    """Parse the optional ``--num-tokens``, ``--tp`` and ``--dtype`` overrides.
+
+    Every option accepts one or more values and is ``None`` when omitted.
+    """
+    parser = argparse.ArgumentParser(
+        description="Benchmark merge_attn_states fused FP8 quantization"
+    )
+    default_dtype_names = [short_dtype(d) for d in INPUT_DTYPES]
+    # (flag, element type, help text) for each multi-value option.
+    option_specs = [
+        (
+            "--num-tokens",
+            int,
+            f"Override token counts (default: {NUM_TOKENS_LIST})",
+        ),
+        (
+            "--tp",
+            int,
+            f"TP sizes to simulate (divides num_heads) (default: {TP_SIZES})",
+        ),
+        (
+            "--dtype",
+            str,
+            "Input dtypes (e.g. bfloat16 float16 float32). "
+            f"Default: {default_dtype_names}",
+        ),
+    ]
+    for flag, value_type, help_text in option_specs:
+        parser.add_argument(
+            flag, type=value_type, nargs="+", default=None, help=help_text
+        )
+    return parser.parse_args()
+
+
+# ---------------------------------------------------------------------------
+# Parse args and build configs before decorators
+# ---------------------------------------------------------------------------
+
+args = parse_args()
+
+# Fall back to the built-in sweeps for any option that was not given.
+num_tokens_list = args.num_tokens or NUM_TOKENS_LIST
+tp_sizes = args.tp or TP_SIZES
+
+if not args.dtype:
+    input_dtypes = INPUT_DTYPES
+else:
+    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
+    input_dtypes = [STR_DTYPE_TO_TORCH_DTYPE[name] for name in args.dtype]
+
+configs = build_configs(HEAD_CONFIGS, num_tokens_list, input_dtypes, tp_sizes)
+
+# NOTE(review): presumably raised so torch.compile can recompile for every
+# swept shape without hitting the dynamo limit — confirm.
+torch._dynamo.config.recompile_limit = 8888
+
+
+# ---------------------------------------------------------------------------
+# Benchmark function
+# ---------------------------------------------------------------------------
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens", "num_heads", "head_size", "dtype_str"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["fused_cuda", "fused_triton", "unfused_cuda", "unfused_triton"],
+        line_names=["Fused CUDA", "Fused Triton", "Unfused CUDA", "Unfused Triton"],
+        styles=[("blue", "-"), ("green", "-"), ("blue", "--"), ("green", "--")],
+        ylabel="us",
+        plot_name="merge_attn_states FP8 (fused vs unfused)",
+        args={},
+    )
+)
+@default_vllm_config()
+def benchmark(num_tokens, num_heads, head_size, dtype_str, provider):
+    """Time one provider on one input shape and report latency in microseconds.
+
+    ``fused_*`` providers call the merge kernel with an FP8 output tensor and
+    ``output_scale``. ``unfused_*`` providers merge into a buffer of the input
+    dtype and then run a ``torch.compile``d static per-tensor ``QuantFP8`` on
+    it. Any provider name other than the two fused ones and ``unfused_cuda``
+    is treated as ``unfused_triton``.
+
+    Returns:
+        ``(median, p20, p80)`` latency in microseconds, matching the
+        ``(y_mean, y_min, y_max)`` order that ``perf_report`` unpacks.
+    """
+    input_dtype = getattr(torch, dtype_str)
+    fp8_dtype = current_platform.fp8_dtype()
+    prefix_out, suffix_out, prefix_lse, suffix_lse = make_inputs(
+        num_tokens, num_heads, head_size, input_dtype
+    )
+    output_scale = torch.tensor([0.1], dtype=torch.float32, device="cuda")
+    out_shape = (num_tokens, num_heads, head_size)
+
+    uses_cuda = provider in ("fused_cuda", "unfused_cuda")
+    merge = merge_attn_states_cuda if uses_cuda else merge_attn_states_triton
+
+    if provider in ("fused_cuda", "fused_triton"):
+        output = torch.empty(out_shape, dtype=fp8_dtype, device="cuda")
+
+        def fn():
+            merge(
+                output,
+                prefix_out,
+                prefix_lse,
+                suffix_out,
+                suffix_lse,
+                output_scale=output_scale,
+            )
+
+    else:
+        merge_buf = torch.empty(out_shape, dtype=input_dtype, device="cuda")
+        quant_fp8 = QuantFP8(
+            static=True,
+            group_shape=GroupShape.PER_TENSOR,
+            column_major_scales=False,
+        )
+        # View of merge_buf, so the quant step reads what the merge just wrote.
+        quant_input = merge_buf.view(-1, head_size)
+        compiled_quant = torch.compile(
+            quant_fp8.forward_native, fullgraph=True, dynamic=False
+        )
+
+        def fn():
+            merge(merge_buf, prefix_out, prefix_lse, suffix_out, suffix_lse)
+            compiled_quant(quant_input, output_scale)
+
+    # QUANTILES is [0.5, 0.2, 0.8], so the results arrive as median, p20, p80.
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=QUANTILES)
+    # Latency is not inverted like a throughput metric: report min before max.
+    return 1000 * ms, 1000 * min_ms, 1000 * max_ms  # us
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main():
+    """Print the device and sweep settings, then run the benchmark sweep."""
+    dtype_names = [short_dtype(d) for d in input_dtypes]
+    head_summary = [(label, heads, size) for label, heads, size in HEAD_CONFIGS]
+
+    print(f"Device: {current_platform.get_device_name()}")
+    print(f"Token counts: {num_tokens_list}")
+    print(f"TP sizes: {tp_sizes}")
+    print(f"Input dtypes: {dtype_names}")
+    print(f"Head configs: {head_summary}")
+    benchmark.run(print_data=True)
+
+
+if __name__ == "__main__":
+    with torch.inference_mode():
+        main()
diff --git a/benchmarks/fused_kernels/silu_mul_block_quant_benchmark.py b/benchmarks/fused_kernels/silu_mul_block_quant_benchmark.py
new file mode 100644
index 000000000000..4e8d787bf9c7
--- /dev/null
+++ b/benchmarks/fused_kernels/silu_mul_block_quant_benchmark.py
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable, Iterable
+from dataclasses import dataclass
+from itertools import product
+
+import torch
+import torch.nn.functional as F
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from tqdm import tqdm
+
+import vllm._custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ per_token_group_quant_fp8,
+)
+
+
+@dataclass
+class bench_params_t:
+ num_tokens: int
+ hidden_size: int
+ dtype: torch.dtype
+ group_size: int # Changed from list[int] to int
+
+ def description(self):
+ return (
+ f"N {self.num_tokens} "
+ f"x D {self.hidden_size} "
+ f"x DT {self.dtype} "
+ f"x GS {self.group_size}"
+ )
+
+
+def get_bench_params() -> list[bench_params_t]:
+ """Test configurations covering common model sizes."""
+ NUM_TOKENS = [16, 128, 512, 2048]
+ HIDDEN_SIZES = [1024, 2048, 4096, 5120, 14336] # Common FFN sizes
+ DTYPES = [torch.float16, torch.bfloat16]
+ GROUP_SIZES = [64, 128] # Changed from [[1, 64], [1, 128]]
+
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, DTYPES, GROUP_SIZES)
+ bench_params = list(
+ map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+ )
+ return bench_params
+
+
+# Reference implementations
+def unfused_fp8_impl(
+ x: torch.Tensor,
+ quant_dtype: torch.dtype,
+ group_size: int, # Changed from list[int]
+):
+ """Unfused: SiLU+Mul then per-tensor quantize."""
+ hidden = x.shape[-1] // 2
+ gate, up = x.split(hidden, dim=-1)
+
+ # SiLU(gate) * up
+ silu_out = F.silu(gate) * up
+
+ # Per-tensor quantize (no group_size used here)
+ silu_out, _ = ops.scaled_fp8_quant(silu_out)
+
+
+def unfused_groupwise_fp8_impl(
+ x: torch.Tensor,
+ quant_dtype: torch.dtype,
+ group_size: int, # Changed from list[int]
+):
+ """Unfused: SiLU+Mul then group-wise quantize."""
+ hidden = x.shape[-1] // 2
+ gate, up = x.split(hidden, dim=-1)
+
+ # SiLU(gate) * up
+ silu_out = F.silu(gate) * up
+
+ # Group quantize - use group_size directly
+ silu_out, _ = per_token_group_quant_fp8(
+ silu_out, group_size=group_size, use_ue8m0=False
+ )
+
+
+def fused_impl(
+ x: torch.Tensor,
+ quant_dtype: torch.dtype,
+ group_size: int,
+):
+ """Fused: SiLU+Mul+Block Quantization in single kernel."""
+ out, _ = ops.silu_and_mul_per_block_quant(
+ x,
+ group_size=group_size,
+ quant_dtype=quant_dtype,
+ is_scale_transposed=False,
+ )
+
+
+# Bench functions
+def bench_fn(
+ x: torch.Tensor,
+ quant_dtype: torch.dtype,
+ group_size: int,
+ label: str,
+ sub_label: str,
+ fn: Callable,
+ description: str,
+) -> TMeasurement:
+ min_run_time = 1
+
+ globals = {
+ "x": x,
+ "quant_dtype": quant_dtype,
+ "group_size": group_size,
+ "fn": fn,
+ }
+ return TBenchmark.Timer(
+ stmt="fn(x, quant_dtype, group_size)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description=description,
+ ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]:
+ """Run benchmarks for all implementations."""
+ # Make inputs: [num_tokens, hidden_size * 2] for [gate || up]
+ scale = 1 / params.hidden_size
+ x = (
+ torch.randn(
+ params.num_tokens,
+ params.hidden_size * 2,
+ dtype=params.dtype,
+ device="cuda",
+ )
+ * scale
+ )
+
+ timers = []
+
+ # Unfused per-tensor FP8
+ timers.append(
+ bench_fn(
+ x,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_fp8_impl,
+ "unfused_fp8_impl",
+ )
+ )
+
+ # Unfused group-wise FP8
+ timers.append(
+ bench_fn(
+ x,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_groupwise_fp8_impl,
+ "unfused_groupwise_fp8_impl",
+ )
+ )
+
+ # Fused group-wise FP8
+ timers.append(
+ bench_fn(
+ x,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ fused_impl,
+ "fused_groupwise_fp8_impl",
+ )
+ )
+
+ return timers
+
+
+def print_timers(timers: Iterable[TMeasurement]):
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+def main():
+ torch.set_default_device("cuda")
+ bench_params = get_bench_params()
+
+ print(f"Running {len(bench_params)} benchmark configurations...")
+ print(
+ f"This will take approximately {len(bench_params) * 3} seconds (1s per variant)"
+ )
+ print()
+
+ timers = []
+ for bp in tqdm(bench_params):
+ result_timers = bench(bp, "silu-mul-block-quant", bp.description())
+ timers.extend(result_timers)
+
+ print("\n" + "=" * 80)
+ print("FINAL COMPARISON - ALL RESULTS")
+ print("=" * 80)
+ print_timers(timers)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/entrypoints/openai/speech_to_text/__init__.py b/benchmarks/kernels/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/speech_to_text/__init__.py
rename to benchmarks/kernels/__init__.py
diff --git a/benchmarks/kernels/benchmark_block_fp8_gemm.py b/benchmarks/kernels/benchmark_block_fp8_gemm.py
index 8d50c3828206..9eddc907b937 100644
--- a/benchmarks/kernels/benchmark_block_fp8_gemm.py
+++ b/benchmarks/kernels/benchmark_block_fp8_gemm.py
@@ -9,11 +9,12 @@
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- W8A8BlockFp8LinearOp,
+from vllm.model_executor.kernels.linear import (
+ init_fp8_linear_kernel,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
+ create_fp8_quant_key,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -70,11 +71,15 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
weight_group_shape = GroupShape(block_n, block_k)
act_quant_group_shape = GroupShape(1, block_k) # Per-token, per-group quantization
- linear_op = W8A8BlockFp8LinearOp(
- weight_group_shape=weight_group_shape,
- act_quant_group_shape=act_quant_group_shape,
- cutlass_block_fp8_supported=use_cutlass,
- use_aiter_and_is_supported=False,
+ linear_op = init_fp8_linear_kernel(
+ weight_quant_key=create_fp8_quant_key(
+ static=True, group_shape=weight_group_shape
+ ),
+ activation_quant_key=create_fp8_quant_key(
+ static=False, group_shape=act_quant_group_shape
+ ),
+ out_dtype=torch.get_default_dtype(),
+ module_name="build_w8a8_block_fp8_runner",
)
def run():
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index 3f80b024e108..03d7fb386f74 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -16,7 +16,7 @@
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
index 2d4afd38c097..7379bf858889 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -22,7 +22,7 @@
fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config,
)
-from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp4,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index dd4060bbdb94..04fc2960d1e4 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -13,7 +13,7 @@
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
fused_topk,
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 65bc38c6c755..4463a23772ee 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -27,10 +27,10 @@
RoutingMethodType,
_get_config_dtype_str,
)
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
+from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index 5f9a131f79b0..a340500379a0 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -9,6 +9,7 @@
moe_align_block_size,
)
from vllm.triton_utils import triton
+from vllm.utils.torch_utils import set_random_seed
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
@@ -44,7 +45,7 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
def benchmark(num_tokens, num_experts, topk, ep_size, provider):
"""Benchmark function for Triton."""
block_size = 256
- torch.cuda.manual_seed_all(0)
+ set_random_seed(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
e_map = None
diff --git a/benchmarks/kernels/benchmark_norm_router_gemm.py b/benchmarks/kernels/benchmark_norm_router_gemm.py
new file mode 100644
index 000000000000..cd50e9159961
--- /dev/null
+++ b/benchmarks/kernels/benchmark_norm_router_gemm.py
@@ -0,0 +1,183 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark and correctness check for ``ops.dsv4_norm_router_gemm``.
+
+Two implementations are compared:
+
+ 1. ``unfused`` — ``vllm_ops.rms_norm`` then ``ops.dsv3_router_gemm``,
+ i.e. the current vLLM hot path (two kernel launches).
+ 2. ``fused`` — ``ops.dsv4_norm_router_gemm``, the new single-kernel
+ fused path.
+
+Both produce ``(normed_x: bf16, router_logits: fp32)``. The correctness
+check verifies that ``fused`` and ``unfused`` agree to within ~1 bf16
+ULP — that is the precision floor for this op.
+"""
+
+import argparse
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+# The fused dsv4_norm_router_gemm kernel is templated only for DSV4-Pro
+# (hidden_size=7168, num_experts=384). Other shapes fall back to the
+# unfused path on the Python side (NormGatedLinear), so benchmark only
+# the configuration that the fused kernel actually targets.
+HIDDEN_SIZE = 7168
+NUM_EXPERTS_CHOICES = (384,)
+RMS_EPS = 1e-6
+
+
+def unfused_norm_router_gemm(
+ x: torch.Tensor,
+ norm_weight: torch.Tensor,
+ gate_weight: torch.Tensor,
+ eps: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ # Call ``_C::rms_norm`` directly (mirroring ``_dsv4_pro_norm_gate``'s
+ # fallback path) so the benchmarked baseline doesn't inherit any
+ # Python wrapper overhead or risk falling through to the native
+ # eager-primitive ``RMSNorm.forward_native`` path.
+ normed = torch.empty_like(x)
+ torch.ops._C.rms_norm(normed, x, norm_weight, eps)
+ logits = vllm_ops.dsv3_router_gemm(normed, gate_weight, torch.float32)
+ return normed, logits
+
+
+def fused_norm_router_gemm(
+ x: torch.Tensor,
+ norm_weight: torch.Tensor,
+ gate_weight: torch.Tensor,
+ eps: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ return vllm_ops.dsv4_norm_router_gemm(x, norm_weight, gate_weight, eps)
+
+
+def _make_inputs(num_tokens: int, num_experts: int, hidden_size: int, seed: int = 0):
+ torch.manual_seed(seed)
+ device = "cuda"
+ x = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16, device=device)
+ norm_w = torch.randn(hidden_size, dtype=torch.bfloat16, device=device)
+ gate_w = torch.randn(num_experts, hidden_size, dtype=torch.bfloat16, device=device)
+ # Down-scale gate_w so the GEMV output stays in a representable range.
+ gate_w = gate_w / float(hidden_size) ** 0.5
+ norm_w = (norm_w * 0.1) + 1.0
+ return x, norm_w, gate_w
+
+
+def calculate_diff(
+ num_tokens: int,
+ num_experts: int,
+ hidden_size: int = HIDDEN_SIZE,
+ normed_atol: float = 2e-3,
+ logits_atol: float = 1e-2,
+ rtol: float = 1e-2,
+) -> None:
+ x, norm_w, gate_w = _make_inputs(num_tokens, num_experts, hidden_size)
+
+ normed_unfused, logits_unfused = unfused_norm_router_gemm(
+ x.clone(), norm_w, gate_w, RMS_EPS
+ )
+ normed_fused, logits_fused = fused_norm_router_gemm(
+ x.clone(), norm_w, gate_w, RMS_EPS
+ )
+
+ def _max_abs(a, b):
+ return (a.float() - b.float()).abs().max().item()
+
+ print(f"\n=== M={num_tokens} E={num_experts} H={hidden_size} ===")
+ print(f"normed_x |fused - unfused| = {_max_abs(normed_fused, normed_unfused):.3e}")
+ print(f"logits |fused - unfused| = {_max_abs(logits_fused, logits_unfused):.3e}")
+
+ ok_normed = torch.allclose(
+ normed_fused.float(),
+ normed_unfused.float(),
+ atol=normed_atol,
+ rtol=rtol,
+ )
+ ok_logits = torch.allclose(
+ logits_fused.float(),
+ logits_unfused.float(),
+ atol=logits_atol,
+ rtol=rtol,
+ )
+ if ok_normed and ok_logits:
+ print(
+ f"OK fused vs unfused within "
+ f"normed_atol={normed_atol:.0e} logits_atol={logits_atol:.0e} "
+ f"rtol={rtol:.0e}"
+ )
+ else:
+ print(
+ f"FAIL normed_ok={ok_normed} logits_ok={ok_logits}; "
+ f"see max-abs values above"
+ )
+
+
+def get_benchmark():
+ # Only num_tokens varies (DSV4-Pro hard-codes E=384); single-axis
+ # sweep yields a clean line plot with M on the x-axis.
+ num_experts = NUM_EXPERTS_CHOICES[0]
+
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["num_tokens"],
+ x_vals=list(range(1, 17)),
+ line_arg="provider",
+ line_vals=["unfused", "fused"],
+ line_names=["unfused (rms+dsv3)", "fused (dsv4)"],
+ styles=[("green", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name=f"norm-router-gemm-E{num_experts}-H{HIDDEN_SIZE}",
+ args={},
+ )
+ )
+ def benchmark(num_tokens, provider):
+ x, norm_w, gate_w = _make_inputs(num_tokens, num_experts, HIDDEN_SIZE)
+
+ quantiles = [0.5, 0.2, 0.8]
+ if provider == "unfused":
+ fn = lambda: unfused_norm_router_gemm( # noqa: E731
+ x, norm_w, gate_w, RMS_EPS
+ )
+ else:
+ fn = lambda: fused_norm_router_gemm( # noqa: E731
+ x, norm_w, gate_w, RMS_EPS
+ )
+
+ ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/norm_router_gemm/",
+ )
+ parser.add_argument(
+ "--skip-bench",
+ action="store_true",
+ help="Run only the correctness check, not the perf sweep.",
+ )
+ args = parser.parse_args()
+
+ # Correctness sweep over the full fast-path range M=1..16.
+ for m in range(1, 17):
+ for e in NUM_EXPERTS_CHOICES:
+ calculate_diff(num_tokens=m, num_experts=e, hidden_size=HIDDEN_SIZE)
+
+ if args.skip_bench:
+ return
+
+ benchmark = get_benchmark()
+ benchmark.run(print_data=True, save_path=args.save_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_router_gemm.py b/benchmarks/kernels/benchmark_router_gemm.py
deleted file mode 100644
index cc63f8904c27..000000000000
--- a/benchmarks/kernels/benchmark_router_gemm.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-import torch.nn.functional as F
-
-from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
-from vllm.transformers_utils.config import get_config
-from vllm.triton_utils import triton
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-
-# Dimensions supported by the DSV3 specialized kernel
-DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
-DSV3_SUPPORTED_HIDDEN_SIZES = [7168]
-
-# Dimensions supported by the gpt-oss specialized kernel
-GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
-GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]
-
-
-def get_batch_size_range(max_batch_size):
- return [2**x for x in range(14) if 2**x <= max_batch_size]
-
-
-def get_model_params(config):
- if config.architectures[0] in (
- "DeepseekV2ForCausalLM",
- "DeepseekV3ForCausalLM",
- "DeepseekV32ForCausalLM",
- ):
- num_experts = config.n_routed_experts
- hidden_size = config.hidden_size
- elif config.architectures[0] in ("GptOssForCausalLM",):
- num_experts = config.num_local_experts
- hidden_size = config.hidden_size
- else:
- raise ValueError(f"Unsupported architecture: {config.architectures}")
- return num_experts, hidden_size
-
-
-def get_benchmark(model, max_batch_size, trust_remote_code):
- @triton.testing.perf_report(
- triton.testing.Benchmark(
- x_names=["batch_size"],
- x_vals=get_batch_size_range(max_batch_size),
- x_log=False,
- line_arg="provider",
- line_vals=[
- "torch",
- "vllm",
- ],
- line_names=["PyTorch", "vLLM"],
- styles=([("blue", "-"), ("red", "-")]),
- ylabel="TFLOPs",
- plot_name=f"{model} router gemm throughput",
- args={},
- )
- )
- def benchmark(batch_size, provider):
- config = get_config(model=model, trust_remote_code=trust_remote_code)
- num_experts, hidden_size = get_model_params(config)
-
- mat_a = torch.randn(
- (batch_size, hidden_size), dtype=torch.bfloat16, device="cuda"
- ).contiguous()
- mat_b = torch.randn(
- (num_experts, hidden_size), dtype=torch.bfloat16, device="cuda"
- ).contiguous()
- bias = torch.randn(
- num_experts, dtype=torch.bfloat16, device="cuda"
- ).contiguous()
-
- is_hopper_or_blackwell = current_platform.is_device_capability(
- 90
- ) or current_platform.is_device_capability_family(100)
- allow_dsv3_router_gemm = (
- is_hopper_or_blackwell
- and num_experts in DSV3_SUPPORTED_NUM_EXPERTS
- and hidden_size in DSV3_SUPPORTED_HIDDEN_SIZES
- )
- allow_gpt_oss_router_gemm = (
- is_hopper_or_blackwell
- and num_experts in GPT_OSS_SUPPORTED_NUM_EXPERTS
- and hidden_size in GPT_OSS_SUPPORTED_HIDDEN_SIZES
- )
-
- has_bias = False
- if allow_gpt_oss_router_gemm:
- has_bias = True
-
- quantiles = [0.5, 0.2, 0.8]
-
- if provider == "torch":
-
- def runner():
- if has_bias:
- F.linear(mat_a, mat_b, bias)
- else:
- F.linear(mat_a, mat_b)
- elif provider == "vllm":
-
- def runner():
- if allow_dsv3_router_gemm:
- ops.dsv3_router_gemm(mat_a, mat_b, torch.bfloat16)
- elif allow_gpt_oss_router_gemm:
- ops.gpt_oss_router_gemm(mat_a, mat_b, bias)
- else:
- raise ValueError("Unsupported router gemm")
-
- ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
- runner, quantiles=quantiles
- )
-
- def tflops(t_ms):
- flops = 2 * batch_size * hidden_size * num_experts
- return flops / (t_ms * 1e-3) / 1e12
-
- return tflops(ms), tflops(max_ms), tflops(min_ms)
-
- return benchmark
-
-
-if __name__ == "__main__":
- parser = FlexibleArgumentParser()
- parser.add_argument("--model", type=str, default="openai/gpt-oss-20b")
- parser.add_argument("--max-batch-size", default=16, type=int)
- parser.add_argument("--trust-remote-code", action="store_true")
- args = parser.parse_args()
-
- # Get the benchmark function
- benchmark = get_benchmark(args.model, args.max_batch_size, args.trust_remote_code)
- # Run performance benchmark
- benchmark.run(print_data=True)
diff --git a/benchmarks/kernels/benchmark_selective_state_update.py b/benchmarks/kernels/benchmark_selective_state_update.py
new file mode 100644
index 000000000000..a8b73da2aa9a
--- /dev/null
+++ b/benchmarks/kernels/benchmark_selective_state_update.py
@@ -0,0 +1,774 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark and tuning script for the Mamba selective_state_update kernel.
+
+Mirrors the fused MoE tuning workflow: sweeps (BLOCK_SIZE_M, num_warps) across
+an effective_batch grid for a given (headdim, dstate, ngroups, cache_dtype) and
+saves the best config per effective_batch to JSON. Generated configs are picked
+up by selective_state_update at runtime.
+
+Usage:
+ python -m benchmarks.kernels.benchmark_selective_state_update \
+ --all-dstates --save-configs --compare
+"""
+
+import argparse
+import json
+import os
+import sys
+from io import StringIO
+from itertools import product
+from typing import Any
+
+import torch
+
+from tests.kernels.mamba.utils import selective_state_update_ref
+from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
+ _CONFIGS_DIR,
+ _canonical_cache_dtype,
+ _get_default_ssm_launch_config,
+ get_ssm_config_file_name,
+ get_ssm_device_name,
+ override_ssm_config,
+ selective_state_update,
+)
+from vllm.triton_utils import triton
+
+# bf16 shares configs with fp16 - same bit width.
+_SSM_CACHE_DTYPE_MAP: dict[str, torch.dtype] = {
+ "float32": torch.float32,
+ "float16": torch.float16,
+ "bfloat16": torch.float16,
+}
+
+_RESULTS_DIR = os.path.dirname(os.path.realpath(__file__))
+
+# ---------------------------------------------------------------------------
+# Tuning search space
+# ---------------------------------------------------------------------------
+
+_BSM_CHOICES_ALL = [4, 8, 16, 32, 64, 128, 256]
+
+NUM_WARPS_CHOICES = [1, 2, 4, 8]
+
+
+def _block_size_m_choices(headdim: int) -> list[int]:
+ """BLOCK_SIZE_M candidates worth sweeping for a given headdim.
+
+ BLOCK_SIZE_M > next_pow2(headdim) wastes >=50% of each tile via masking
+ (offs_m >= dim rows are zeroed out), so we cap the sweep there.
+ """
+ ceiling = 1
+ while ceiling < headdim:
+ ceiling <<= 1
+ return [b for b in _BSM_CHOICES_ALL if b <= ceiling]
+
+
+# Default deployment shapes. effective_batch = batch * nheads scales the
+# kernel grid, so configs transfer across (model, TP) combos sharing
+# (headdim, dstate, cache_dtype).
+DEFAULT_BATCH_SIZES = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
+DEFAULT_NHEADS = [128, 256]
+
+ALL_DSTATES = [16, 32, 64, 128, 256]
+
+# Default tuning shape — matches Nemotron-3-Super and Nemotron-3-Nano Mamba layers.
+# Override with CLI flags for other architectures.
+DEFAULT_HEADDIM = 64
+DEFAULT_NGROUPS = 8
+
+
+# ---------------------------------------------------------------------------
+# Benchmark helper
+# ---------------------------------------------------------------------------
+
+
+def _make_inputs(
+ batch: int,
+ nheads: int,
+ dim: int,
+ dstate: int,
+ ngroups: int,
+ dtype: torch.dtype,
+ state_dtype: torch.dtype | None = None,
+ device: str = "cuda",
+):
+ if state_dtype is None:
+ state_dtype = dtype
+ state = torch.randn(batch, nheads, dim, dstate, dtype=state_dtype, device=device)
+ x = torch.randn(batch, nheads, dim, dtype=dtype, device=device)
+ dt = torch.randn(batch, nheads, dim, dtype=dtype, device=device)
+ A = -torch.rand(nheads, dim, dstate, dtype=torch.float32, device=device)
+ B = torch.randn(batch, ngroups, dstate, dtype=dtype, device=device)
+ C = torch.randn(batch, ngroups, dstate, dtype=dtype, device=device)
+ D = torch.randn(nheads, dim, dtype=dtype, device=device)
+ dt_bias = torch.randn(nheads, dim, dtype=dtype, device=device)
+ out = torch.zeros(batch, nheads, dim, dtype=dtype, device=device)
+ return state, x, dt, A, B, C, D, dt_bias, out
+
+
+def benchmark_config(
+ batch: int,
+ nheads: int,
+ dim: int,
+ dstate: int,
+ ngroups: int,
+ block_size_m: int,
+ num_warps_val: int,
+ dtype: torch.dtype,
+ state_dtype: torch.dtype | None = None,
+ num_iters: int = 100,
+ num_warmup: int = 20,
+ graph_batch_size: int = 10,
+) -> float | None:
+ """
+ Time one (BLOCK_SIZE_M, num_warps) config for selective_state_update.
+ Returns elapsed time in microseconds, or None on error.
+
+ Uses CUDA graph capture-and-replay to isolate kernel time from Python
+ eager-mode dispatch / kwarg-resolution overhead, mirroring the timing
+ methodology in benchmarks/kernels/benchmark_moe.py.
+ """
+ state, x, dt, A, B, C, D, dt_bias, out = _make_inputs(
+ batch, nheads, dim, dstate, ngroups, dtype, state_dtype=state_dtype
+ )
+
+ def _call_kernel() -> None:
+ selective_state_update(
+ state,
+ x,
+ dt,
+ A,
+ B,
+ C,
+ D=D,
+ z=None,
+ dt_bias=dt_bias,
+ dt_softplus=True,
+ out=out,
+ )
+
+ try:
+ with override_ssm_config((block_size_m, num_warps_val)):
+ # Eager-mode warmup: triggers Triton autotune / JIT, primes caches.
+ for _ in range(num_warmup):
+ _call_kernel()
+ torch.accelerator.synchronize()
+
+ # Capture graph_batch_size invocations into a CUDA graph so the
+ # timed region runs without Python dispatch overhead per call.
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph):
+ for _ in range(graph_batch_size):
+ _call_kernel()
+ torch.accelerator.synchronize()
+
+ # Warmup graph replays (let the runtime stabilize).
+ for _ in range(5):
+ graph.replay()
+ torch.accelerator.synchronize()
+
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ latencies: list[float] = []
+ for _ in range(num_iters):
+ start.record()
+ graph.replay()
+ end.record()
+ end.synchronize()
+ latencies.append(start.elapsed_time(end))
+ graph.reset()
+ # elapsed_time returns ms; each replay runs graph_batch_size kernels,
+ # so divide by (num_iters * graph_batch_size) and convert ms -> us.
+ return sum(latencies) / (num_iters * graph_batch_size) * 1000
+ except Exception as e:
+ if "OutOfResources" not in str(e):
+ print(
+ f" Warning: config M={block_size_m},w={num_warps_val} "
+ f"raised {type(e).__name__}: {e}"
+ )
+ return None
+
+
+# ---------------------------------------------------------------------------
+# Tuning loop
+# ---------------------------------------------------------------------------
+
+
+# CUDA grid Y/Z dim limit — both `batch` and `nheads` must fit individually.
+_CUDA_MAX_GRID_DIM = 65535
+
+# Above this, kernel state-offset arithmetic (batch * nheads * headdim * dstate)
+# overflows int32 and the launch raises cudaErrorIllegalAddress.
+# 262144 covers Nemotron Super TP1 BS=2048.
+_MAX_EFFECTIVE_BATCH = 262144
+
+
+def expand_batch_x_nheads(
+ batch_sizes: list[int],
+ nheads_list: list[int],
+ ngroups: int,
+) -> list[tuple[int, int, int]]:
+ """Cross-product batch_sizes × nheads_list → sorted [(effective_batch,
+ batch, nheads)], deduped by effective_batch. Filters pairs that exceed
+ the CUDA grid dim limit, the effective_batch ceiling, or where nheads is
+ not a positive multiple of ngroups.
+ """
+ seen: dict[int, tuple[int, int]] = {}
+ skipped_grid: list[tuple[int, int]] = []
+ skipped_ngroups: list[tuple[int, int]] = []
+ skipped_eb: list[tuple[int, int]] = []
+ for b, n in product(batch_sizes, nheads_list):
+ if b <= 0 or n <= 0:
+ continue
+ if b > _CUDA_MAX_GRID_DIM or n > _CUDA_MAX_GRID_DIM:
+ skipped_grid.append((b, n))
+ continue
+ if n % ngroups != 0:
+ skipped_ngroups.append((b, n))
+ continue
+ if b * n > _MAX_EFFECTIVE_BATCH:
+ skipped_eb.append((b, n))
+ continue
+ seen.setdefault(b * n, (b, n))
+ if skipped_grid:
+ print(
+ f" Note: skipping (batch, nheads) pairs exceeding CUDA grid dim "
+ f"{_CUDA_MAX_GRID_DIM}: {skipped_grid}"
+ )
+ if skipped_ngroups:
+ print(
+ f" Note: skipping (batch, nheads) pairs where nheads % ngroups != 0 "
+ f"for ngroups={ngroups}: {skipped_ngroups}"
+ )
+ if skipped_eb:
+ print(
+ f" Note: skipping (batch, nheads) pairs whose effective_batch "
+ f"exceeds {_MAX_EFFECTIVE_BATCH}: {skipped_eb}"
+ )
+ return sorted((eb, b, n) for eb, (b, n) in seen.items())
+
+
+def tune_dstate(
+ dstate: int,
+ headdim: int,
+ ngroups: int,
+ dtype: torch.dtype,
+ num_iters: int,
+ verbose: bool,
+ active: list[tuple[int, int, int]],
+ state_dtype: torch.dtype | None = None,
+) -> tuple[dict[int, dict], dict[int, dict[tuple[int, int], float]]]:
+ """For each (effective_batch, batch, nheads) in *active*, sweep
+ (BLOCK_SIZE_M, num_warps) and return
+ ({effective_batch: best_config}, {effective_batch: {(bsm, nw): us}}).
+ The second map is the full timing grid, used downstream so we don't
+ re-measure the same config in the comparison phase.
+ """
+ best_per_eb: dict[int, dict] = {}
+ timings: dict[int, dict[tuple[int, int], float]] = {}
+
+ print(f"\n{'=' * 74}")
+ effective_state_dtype = state_dtype if state_dtype is not None else dtype
+ print(
+ f"Tuning headdim={headdim} dstate={dstate} ngroups={ngroups} "
+ f"dtype={dtype} ssm_cache_dtype={effective_state_dtype}"
+ )
+ print(f"{'=' * 74}")
+
+ bsm_choices = _block_size_m_choices(headdim)
+ print(f"BSM candidates (capped at next_pow2(headdim={headdim})): {bsm_choices}")
+
+ hdr = f"{'EffBatch':>8} | {'BLOCK_M':>7} | {'warps':>5} | {'us':>10} | note"
+ print(hdr)
+ print("-" * 52)
+
+ for eb, batch, nheads in active:
+ best_time = float("inf")
+ best_cfg: dict = {}
+ eb_timings: dict[tuple[int, int], float] = {}
+
+ for bsm, nw in product(bsm_choices, NUM_WARPS_CHOICES):
+ t = benchmark_config(
+ batch=batch,
+ nheads=nheads,
+ dim=headdim,
+ dstate=dstate,
+ ngroups=ngroups,
+ block_size_m=bsm,
+ num_warps_val=nw,
+ dtype=dtype,
+ state_dtype=state_dtype,
+ num_iters=num_iters,
+ )
+ if t is None:
+ continue
+ eb_timings[(bsm, nw)] = t
+ is_best = t < best_time
+ if is_best:
+ best_time = t
+ best_cfg = {"BLOCK_SIZE_M": bsm, "num_warps": nw}
+ if verbose:
+ marker = " <-- best" if is_best else ""
+ print(f"{eb:>8} | {bsm:>7} | {nw:>5} | {t:>10.2f} |{marker}")
+
+ timings[eb] = eb_timings
+
+ if not best_cfg:
+ print(
+ f"{eb:>8} | {'-':>7} | {'-':>5} | {'-':>10} | "
+ f"no working config (skipped)"
+ )
+ continue
+
+ if not verbose:
+ print(
+ f"{eb:>8} | {best_cfg['BLOCK_SIZE_M']:>7} | "
+ f"{best_cfg['num_warps']:>5} | {best_time:>10.2f} | best"
+ )
+
+ best_per_eb[eb] = best_cfg
+
+ return best_per_eb, timings
+
+
+# ---------------------------------------------------------------------------
+# Correctness validation
+# ---------------------------------------------------------------------------
+
+
+def validate_configs(
+    dstate: int,
+    headdim: int,
+    ngroups: int,
+    tuned: dict[int, dict],
+    active: list[tuple[int, int, int]],
+    dtype: torch.dtype,
+    atol: float = 1e-2,
+    rtol: float = 1e-2,
+    state_dtype: torch.dtype | None = None,
+) -> dict[int, bool]:
+    """
+    Check each tuned launch config against the reference implementation.
+    Entries of *active* whose effective batch has no tuned config are skipped;
+    the result maps effective_batch -> whether the outputs were close enough.
+    """
+    # Force full-precision fp32 matmuls (no TF32) so the reference agrees with
+    # the Triton kernel's fp32 accumulation instead of drifting at large ebs.
+    torch.set_float32_matmul_precision("highest")
+
+    shown_state_dtype = dtype if state_dtype is None else state_dtype
+    print(f"\n{'=' * 74}")
+    print(
+        f"Validation headdim={headdim} dstate={dstate} ngroups={ngroups} "
+        f"dtype={dtype} ssm_cache_dtype={shown_state_dtype} atol={atol}"
+    )
+    print(f"{'=' * 74}")
+    print(f"{'EffBatch':>8} | {'MaxAbsErr':>12} | {'Status':>8}")
+    print("-" * 36)
+
+    verdicts: dict[int, bool] = {}
+
+    for eb, batch, nheads in active:
+        chosen = tuned.get(eb)
+        if chosen is None:
+            continue
+        state, x, dt, A, B, C, D, dt_bias, out = _make_inputs(
+            batch=batch,
+            nheads=nheads,
+            dim=headdim,
+            dstate=dstate,
+            ngroups=ngroups,
+            dtype=dtype,
+            state_dtype=state_dtype,
+        )
+        # The kernel updates `state` in place, so snapshot it for the reference
+        pristine_state = state.clone()
+
+        with override_ssm_config((chosen["BLOCK_SIZE_M"], chosen["num_warps"])):
+            selective_state_update(
+                state,
+                x,
+                dt,
+                A,
+                B,
+                C,
+                D=D,
+                z=None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+                out=out,
+            )
+        torch.accelerator.synchronize()
+        got = out.detach().cpu()
+
+        # Feed the reference the untouched state snapshot, with every tensor
+        # upcast to fp32 so its reduction over `dstate` runs in fp32 like the
+        # Triton kernel (a bf16 reduction would inflate the error).
+        want = (
+            selective_state_update_ref(
+                pristine_state.float(),
+                x.float(),
+                dt.float(),
+                A.float(),
+                B.float(),
+                C.float(),
+                D=D.float(),
+                dt_bias=dt_bias.float(),
+                dt_softplus=True,
+            )
+            .to(out.dtype)
+            .cpu()
+        )
+
+        ok = torch.allclose(got.float(), want.float(), atol=atol, rtol=rtol)
+        worst_abs_err = (got.float() - want.float()).abs().max().item()
+        verdicts[eb] = ok
+        label = "PASS" if ok else "FAIL"
+        print(f"{eb:>8} | {worst_abs_err:>12.6f} | {label:>8}")
+
+    total = len(verdicts)
+    passing = sum(1 for ok in verdicts.values() if ok)
+    print(f"\n {passing}/{total} configs passed validation for dstate={dstate}")
+    return verdicts
+
+
+# ---------------------------------------------------------------------------
+# Save configs
+# ---------------------------------------------------------------------------
+
+
+def save_configs(
+    headdim: int,
+    dstate: int,
+    cache_dtype: str,
+    configs: dict[int, dict],
+    save_dir: str | None = None,
+) -> str:
+    """Write the tuned configs to a JSON file and return that file's path."""
+    # fp16 and bf16 share one config file, so normalise the dtype name first
+    dtype_key = _canonical_cache_dtype(cache_dtype)
+
+    out_dir = save_dir or _CONFIGS_DIR
+    os.makedirs(out_dir, exist_ok=True)
+    file_name = get_ssm_config_file_name(
+        headdim, dstate, dtype_key, get_ssm_device_name()
+    )
+    target = os.path.join(out_dir, file_name)
+    # The loader does not read triton_version; it is recorded for reference
+    payload: dict[str, Any] = {"triton_version": triton.__version__}
+    for eff_batch in sorted(configs):
+        payload[str(eff_batch)] = configs[eff_batch]
+    with open(target, "w") as f:
+        json.dump(payload, f, indent=4)
+    return target
+
+
+# ---------------------------------------------------------------------------
+# Comparison table
+# ---------------------------------------------------------------------------
+
+
+def current_heuristic(dstate: int, is_blackwell: bool = False) -> dict:
+    """Look up the default BLOCK_SIZE_M / num_warps launch config for dstate."""
+    block_size_m, num_warps = _get_default_ssm_launch_config(dstate, is_blackwell)
+    return dict(BLOCK_SIZE_M=block_size_m, num_warps=num_warps)
+
+
+def compare_heuristic_vs_tuned(
+    dstate: int,
+    headdim: int,
+    ngroups: int,
+    tuned: dict[int, dict],
+    timings: dict[int, dict[tuple[int, int], float]],
+    active: list[tuple[int, int, int]],
+    dtype: torch.dtype,
+    num_iters: int,
+    is_blackwell: bool,
+    state_dtype: torch.dtype | None = None,
+):
+    """Print a per-effective-batch table of heuristic vs tuned kernel timings."""
+    heur_cfg = current_heuristic(dstate, is_blackwell)
+    heur_key = (heur_cfg["BLOCK_SIZE_M"], heur_cfg["num_warps"])
+    print(f"\n{'=' * 74}")
+    print(
+        f"Comparison headdim={headdim} dstate={dstate} "
+        f"ngroups={ngroups} — heuristic vs tuned"
+    )
+    print(
+        f"Heuristic: BLOCK_SIZE_M={heur_cfg['BLOCK_SIZE_M']}, "
+        f"num_warps={heur_cfg['num_warps']}"
+    )
+    print(f"{'=' * 74}")
+    hdr = (
+        f"{'EffBatch':>8} | {'Heur(us)':>10} | {'Tuned(us)':>10} | "
+        f"{'Speedup':>8} | Best config"
+    )
+    print(hdr)
+    print("-" * len(hdr))
+
+    for eb, batch, nheads in active:
+        eb_timings = timings.get(eb, {})
+
+        # Heuristic timing: reuse the tuning measurement if the heuristic
+        # config was in the swept grid; otherwise measure it once.
+        t_h = eb_timings.get(heur_key)
+        if t_h is None:
+            t_h = benchmark_config(
+                batch=batch,
+                nheads=nheads,
+                dim=headdim,
+                dstate=dstate,
+                ngroups=ngroups,
+                block_size_m=heur_cfg["BLOCK_SIZE_M"],
+                num_warps_val=heur_cfg["num_warps"],
+                dtype=dtype,
+                state_dtype=state_dtype,
+                num_iters=num_iters,
+            )
+
+        # `tuned[eb]` may be missing if all configs failed in tune_dstate; fall
+        # back to the heuristic and reuse `t_h`, which may have been measured above.
+        best = tuned.get(eb) or heur_cfg
+        best_key = (best["BLOCK_SIZE_M"], best["num_warps"])
+        t_t = t_h if best_key == heur_key else eb_timings.get(best_key)
+        if t_h is None or t_t is None:
+            print(f"{eb:>8} | {'N/A':>10} | {'N/A':>10} | {'N/A':>8} |")
+            continue
+        speedup = t_h / t_t
+        marker = " <--" if speedup > 1.05 else ""
+        print(
+            f"{eb:>8} | {t_h:>10.2f} | {t_t:>10.2f} | "
+            f"{speedup:>7.2f}x | "
+            f"M={best['BLOCK_SIZE_M']},w={best['num_warps']}{marker}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def save_results(device_name: str, output: str, results_file: str | None = None) -> str:
+    """Write *output* to a text file (creating its directory) and return the path."""
+    if results_file is None:
+        default_name = f"ssm_benchmark_results_{device_name}.txt"
+        results_file = os.path.join(_RESULTS_DIR, default_name)
+    os.makedirs(os.path.dirname(results_file) or ".", exist_ok=True)
+    with open(results_file, "w") as f:
+        f.write(output)
+    return results_file
+
+
+def main():
+    """Parse CLI options, tune each requested dstate, and tee output to a file."""
+    parser = argparse.ArgumentParser(
+        description="Tune selective_state_update kernel for Mamba SSM"
+    )
+    parser.add_argument(
+        "--dstate",
+        type=int,
+        default=128,
+        help="SSM state size to tune for (default: 128)",
+    )
+    parser.add_argument(
+        "--all-dstates",
+        action="store_true",
+        help="Tune all common dstate values: " + str(ALL_DSTATES),
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="bfloat16",
+        choices=["float16", "bfloat16"],
+        help="Activation / input data type (default: bfloat16)",
+    )
+    parser.add_argument(
+        "--mamba-ssm-cache-dtype",
+        type=str,
+        default="float32",
+        choices=list(_SSM_CACHE_DTYPE_MAP.keys()),
+        help="SSM state cache dtype (default: float32)",
+    )
+    parser.add_argument(
+        "--num-iters",
+        type=int,
+        default=100,
+        help="Number of timing iterations (default: 100)",
+    )
+    parser.add_argument(
+        "--save-configs",
+        action="store_true",
+        help=f"Save best configs to JSON in {_CONFIGS_DIR}",
+    )
+    parser.add_argument(
+        "--compare",
+        action="store_true",
+        help="Show comparison table: heuristic vs tuned",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print every (BLOCK_SIZE_M, num_warps) result, not just best",
+    )
+    parser.add_argument(
+        "--results-file",
+        type=str,
+        default=None,
+        help="Path to save the benchmark results text file "
+        "(default: ssm_benchmark_results_<device>.txt alongside this script)",
+    )
+    parser.add_argument(
+        "--save-dir",
+        type=str,
+        default=None,
+        help=f"Directory to save JSON configs (default: {_CONFIGS_DIR})",
+    )
+    parser.add_argument(
+        "--headdim",
+        type=int,
+        default=DEFAULT_HEADDIM,
+        help=f"Per-head feature dim (default: {DEFAULT_HEADDIM})",
+    )
+    parser.add_argument(
+        "--ngroups",
+        type=int,
+        default=DEFAULT_NGROUPS,
+        help=f"Number of B/C groups (default: {DEFAULT_NGROUPS})",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=DEFAULT_BATCH_SIZES,
+        metavar="B",
+        help=f"Decoder batch sizes to sweep (default: {DEFAULT_BATCH_SIZES})",
+    )
+    parser.add_argument(
+        "--nheads",
+        type=int,
+        nargs="+",
+        default=DEFAULT_NHEADS,
+        metavar="N",
+        help=f"Number of heads per rank to sweep (default: {DEFAULT_NHEADS}). "
+        "effective_batch = batch * nheads; cross-product is deduped by eb.",
+    )
+    parser.add_argument(
+        "--validate",
+        action="store_true",
+        help="After tuning, verify each best config against a CPU reference "
+        "implementation. Configs that fail are flagged in the output.",
+    )
+    parser.add_argument(
+        "--atol",
+        type=float,
+        default=1e-2,
+        help="Absolute tolerance for --validate (default: 1e-2)",
+    )
+    args = parser.parse_args()
+
+    dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16
+    state_dtype = _SSM_CACHE_DTYPE_MAP[args.mamba_ssm_cache_dtype]
+    device_name = get_ssm_device_name()
+    cap = torch.cuda.get_device_capability()
+    is_blackwell = cap[0] >= 10
+
+    # Mirror all output to a results file (like Unix tee).
+    buf = StringIO()
+
+    class _Tee:
+        """Writes to both the original stdout and an in-memory buffer."""
+
+        def write(self, s):
+            buf.write(s)
+            sys.__stdout__.write(s)
+
+        def flush(self):
+            sys.__stdout__.flush()
+
+    sys.stdout = _Tee()  # type: ignore[assignment]
+    try:
+        print(f"Device : {device_name} (sm_{cap[0]}{cap[1]})")
+        print(f"Blackwell: {is_blackwell}")
+        print(f"dtype : {args.dtype}")
+        print(f"ssm_cache_dtype: {args.mamba_ssm_cache_dtype}")
+        print(f"headdim: {args.headdim}")
+        print(f"ngroups: {args.ngroups}")
+        print(f"triton : {triton.__version__}")
+
+        dstates = ALL_DSTATES if args.all_dstates else [args.dstate]
+        active = expand_batch_x_nheads(args.batch_sizes, args.nheads, args.ngroups)
+
+        for dstate in dstates:
+            tuned, timings = tune_dstate(
+                dstate=dstate,
+                headdim=args.headdim,
+                ngroups=args.ngroups,
+                dtype=dtype,
+                num_iters=args.num_iters,
+                verbose=args.verbose,
+                active=active,
+                state_dtype=state_dtype,
+            )
+
+            if args.compare:
+                compare_heuristic_vs_tuned(
+                    dstate=dstate,
+                    headdim=args.headdim,
+                    ngroups=args.ngroups,
+                    tuned=tuned,
+                    timings=timings,
+                    active=active,
+                    dtype=dtype,
+                    num_iters=args.num_iters,
+                    is_blackwell=is_blackwell,
+                    state_dtype=state_dtype,
+                )
+
+            if args.validate:
+                validity = validate_configs(
+                    dstate=dstate,
+                    headdim=args.headdim,
+                    ngroups=args.ngroups,
+                    tuned=tuned,
+                    active=active,
+                    dtype=dtype,
+                    atol=args.atol,
+                    state_dtype=state_dtype,
+                )
+                # Drop configs that failed validation so they are never saved
+                failed = [eb for eb, ok in validity.items() if not ok]
+                if failed:
+                    print(
+                        f"\n WARNING: {len(failed)} config(s) failed validation "
+                        f"for dstate={dstate}: effective_batches {failed}"
+                    )
+                    print(" These will NOT be saved even with --save-configs.")
+                    tuned = {
+                        eb: cfg for eb, cfg in tuned.items() if validity.get(eb, True)
+                    }
+
+            if args.save_configs:
+                path = save_configs(
+                    headdim=args.headdim,
+                    dstate=dstate,
+                    cache_dtype=args.mamba_ssm_cache_dtype,
+                    configs=tuned,
+                    save_dir=args.save_dir,
+                )
+                print(f"\nSaved: {path}")
+            else:
+                print(f"\nBest configs for dstate={dstate}:")
+                for eb, cfg in sorted(tuned.items()):
+                    print(f" effective_batch={eb:>6}: {cfg}")
+                print("\n(Re-run with --save-configs to persist to JSON)")
+    finally:
+        sys.stdout = sys.__stdout__
+        results_path = save_results(device_name, buf.getvalue(), args.results_file)
+        print(f"\nResults saved to: {results_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index 13b97b7696b3..9fcf278f2ef3 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -20,7 +20,7 @@
import numpy as np
import torch
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
from vllm.triton_utils import tl, triton
diff --git a/benchmarks/kernels/benchmark_vit_bilinear_pos_embed.py b/benchmarks/kernels/benchmark_vit_bilinear_pos_embed.py
new file mode 100644
index 000000000000..65171a1b2e10
--- /dev/null
+++ b/benchmarks/kernels/benchmark_vit_bilinear_pos_embed.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Benchmarks the fused Triton bilinear position-embedding kernel against
+# the pure-PyTorch (native) implementation used in Qwen3-VL ViT models.
+#
+# == Usage Examples ==
+#
+# Default benchmark:
+# python3 benchmark_vit_bilinear_pos_embed.py
+#
+# Custom parameters:
+# python3 benchmark_vit_bilinear_pos_embed.py --hidden-dim 1152 \
+# --num-grid-per-side 48 --save-path ./configs/vit_pos_embed/
+
+import itertools
+
+import torch
+
+from vllm.model_executor.models.qwen3_vl import (
+ pos_embed_interpolate_native,
+ triton_pos_embed_interpolate,
+)
+from vllm.triton_utils import HAS_TRITON, triton
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# (h, w) shapes to benchmark: five square sizes and two non-square ones
+h_w_configs = [
+ (16, 16),
+ (32, 32),
+ (48, 48),
+ (64, 64),
+ (128, 128),
+ (32, 48),
+ (60, 80),
+]
+
+# Temporal dimensions to sweep (only t=1 here)
+t_range = [1]
+# Every (t, (h, w)) combination; get_benchmark feeds these to x_vals
+configs = list(itertools.product(t_range, h_w_configs))
+
+
+def get_benchmark(
+    num_grid_per_side: int,
+    spatial_merge_size: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    """Build a triton perf_report benchmark comparing the native and Triton paths."""
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["t", "h_w"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["native", "triton"],
+            line_names=["Native (PyTorch)", "Triton"],
+            styles=[("blue", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=(
+                f"vit-bilinear-pos-embed-"
+                f"grid{num_grid_per_side}-"
+                f"dim{hidden_dim}-"
+                f"{dtype}"
+            ),
+            args={},
+        )
+    )
+    def benchmark(t, h_w, provider):
+        h, w = h_w
+        torch.manual_seed(42)
+        embed_weight = (
+            torch.randn(
+                num_grid_per_side * num_grid_per_side,
+                hidden_dim,
+                device=device,
+                dtype=dtype,
+            )
+            * 0.25
+        )
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "native":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: pos_embed_interpolate_native(
+                    embed_weight,
+                    t,
+                    h,
+                    w,
+                    num_grid_per_side,
+                    spatial_merge_size,
+                    dtype,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            assert HAS_TRITON, "Triton not available"
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: triton_pos_embed_interpolate(
+                    embed_weight,
+                    t,
+                    h,
+                    w,
+                    num_grid_per_side,
+                    spatial_merge_size,
+                    dtype,
+                ),
+                quantiles=quantiles,
+            )
+        # perf_report unpacks (y, y_min, y_max); convert ms -> us in that order.
+        return 1000 * ms, 1000 * min_ms, 1000 * max_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    cli = FlexibleArgumentParser(
+        description="Benchmark bilinear position embedding interpolation."
+    )
+    cli.add_argument(
+        "--num-grid-per-side",
+        type=int,
+        default=48,
+        help="Position embedding grid size (default: 48 for Qwen3-VL)",
+    )
+    cli.add_argument(
+        "--spatial-merge-size",
+        type=int,
+        default=2,
+        help="Spatial merge size (default: 2)",
+    )
+    cli.add_argument(
+        "--hidden-dim",
+        type=int,
+        default=1152,
+        help="Embedding hidden dimension (default: 1152 for Qwen3-VL)",
+    )
+    cli.add_argument(
+        "--device",
+        type=str,
+        choices=["cuda:0", "cuda:1"],
+        default="cuda:0",
+    )
+    cli.add_argument(
+        "--save-path",
+        type=str,
+        default="./vit_pos_embed/",
+    )
+    opts = cli.parse_args()
+    # The embedding table is always created in bfloat16 for this benchmark.
+    bench_dtype = torch.bfloat16
+
+    report = get_benchmark(
+        opts.num_grid_per_side,
+        opts.spatial_merge_size,
+        opts.hidden_dim,
+        bench_dtype,
+        opts.device,
+    )
+    report.run(print_data=True, save_path=opts.save_path)
diff --git a/benchmarks/kernels/benchmark_vit_fp8_attn.py b/benchmarks/kernels/benchmark_vit_fp8_attn.py
new file mode 100644
index 000000000000..7d7a067dde9d
--- /dev/null
+++ b/benchmarks/kernels/benchmark_vit_fp8_attn.py
@@ -0,0 +1,324 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Benchmarks FP8 vs BF16 ViT attention via FlashInfer cuDNN backend.
+#
+# == Usage Examples ==
+#
+# Benchmark mode (default, FlashInfer CUDAGraph Bench)
+# python3 benchmark_vit_fp8_attn.py
+#
+# Profile mode (PyTorch profiler, saves TensorBoard traces):
+# python3 benchmark_vit_fp8_attn.py --profile
+# python3 benchmark_vit_fp8_attn.py --profile --profile-output-dir ./profile_traces
+#
+# Custom seq_lens:
+# python3 benchmark_vit_fp8_attn.py --seq-lens 4096 8192 16384
+
+from functools import partial
+
+import numpy as np
+import torch
+from torch.profiler import ProfilerActivity, profile, record_function
+
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# Qwen3-VL defaults
+NUM_HEADS = 16
+HEAD_DIM = 72
+DEFAULT_SEQ_LENS = [2304, 4096, 8192, 16384]
+
+
+def _setup_fp8_attention(num_heads: int, head_dim: int) -> tuple:
+    """Build the FP8 and BF16 attention layers and fetch the workspace buffer."""
+    from types import SimpleNamespace
+    from unittest.mock import patch
+
+    from vllm.config import VllmConfig, set_current_vllm_config
+    from vllm.config.multimodal import MultiModalConfig
+    from vllm.model_executor.layers.attention.mm_encoder_attention import (
+        MMEncoderAttention,
+        _get_flashinfer_workspace_buffer,
+    )
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+    # Layers are constructed with bf16 as the default dtype; restored below.
+    saved_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(torch.bfloat16)
+
+    force_flashinfer = patch(
+        "vllm.model_executor.layers.attention.mm_encoder_attention"
+        ".get_vit_attn_backend",
+        return_value=AttentionBackendEnum.FLASHINFER,
+    )
+
+    def build_layer(vllm_config):
+        with set_current_vllm_config(vllm_config), force_flashinfer:
+            layer = MMEncoderAttention(
+                num_heads=num_heads,
+                head_size=head_dim,
+                prefix="visual.blocks.0.attn",
+            )
+            return layer.to("cuda")
+
+    # FP8 variant: the multimodal config requests an fp8 encoder-attention dtype
+    fp8_mm_config = MultiModalConfig(mm_encoder_attn_dtype="fp8")
+    fp8_vllm_config = VllmConfig()
+    fp8_vllm_config.model_config = SimpleNamespace(multimodal_config=fp8_mm_config)
+    attn_fp8 = build_layer(fp8_vllm_config)
+
+    # BF16 variant: a default VllmConfig, i.e. no FP8
+    attn_bf16 = build_layer(VllmConfig())
+
+    torch.set_default_dtype(saved_dtype)
+
+    workspace = _get_flashinfer_workspace_buffer()
+    return attn_fp8, attn_bf16, workspace
+
+
+def _build_meta(
+    seq_len: int,
+    num_heads: int,
+    head_dim: int,
+    fp8: bool,
+):
+    """Return (cu_seqlens, max_seqlen, sequence_lengths) for one sequence."""
+    from vllm.model_executor.layers.attention.mm_encoder_attention import (
+        MMEncoderAttention,
+    )
+    from vllm.utils.math_utils import round_up
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+    backend = AttentionBackendEnum.FLASHINFER
+    cuda = torch.device("cuda")
+    # A single sequence: the boundaries are just [0, seq_len]
+    bounds = np.array([0, seq_len], dtype=np.int32)
+    # Only the FP8 path supplies a padded hidden size: round_up(head_dim, 16)
+    padded_hidden = num_heads * round_up(head_dim, 16) if fp8 else None
+
+    seq_lengths = MMEncoderAttention.maybe_compute_seq_lens(backend, bounds, cuda)
+    longest = MMEncoderAttention.compute_max_seqlen(backend, bounds)
+    max_seqlen = torch.tensor(longest, dtype=torch.int32)
+    cu_seqlens = MMEncoderAttention.maybe_recompute_cu_seqlens(
+        backend,
+        bounds,
+        num_heads * head_dim,
+        1,
+        cuda,
+        fp8_padded_hidden_size=padded_hidden,
+    )
+    return cu_seqlens, max_seqlen, seq_lengths
+
+
+def run_benchmark(
+    seq_lens: list[int],
+    num_heads: int,
+    head_dim: int,
+    method: str,
+):
+    """Time BF16 and FP8 attention for each sequence length and print a table.
+
+    Timing comes from FlashInfer's GPU-side benchmark helpers, so the numbers
+    reflect kernel time rather than CPU launch overhead.
+    """
+    if method == "cupti":
+        from flashinfer.testing import bench_gpu_time_with_cupti as raw_timer
+
+        timer = partial(raw_timer, use_cuda_graph=True, cold_l2_cache=False)
+    elif method == "cudagraph":
+        from flashinfer.testing import (
+            bench_gpu_time_with_cudagraph as raw_timer,
+        )
+
+        timer = partial(raw_timer, cold_l2_cache=False)
+    else:
+        raise ValueError(f"Invalid method: {method}")
+
+    attn_fp8, attn_bf16, _workspace = _setup_fp8_attention(num_heads, head_dim)
+
+    print(f"Timing method: {method}")
+    print(f"{'seq_len':>8} {'BF16 (us)':>12} {'FP8 (us)':>12} {'Speedup':>10}")
+    print("-" * 46)
+
+    for seq_len in seq_lens:
+        torch.manual_seed(42)
+
+        q = torch.randn(
+            seq_len,
+            num_heads,
+            head_dim,
+            device="cuda",
+            dtype=torch.bfloat16,
+        )
+        k = torch.randn_like(q)
+        v = torch.randn_like(q)
+
+        fp8_cu, longest, lengths = _build_meta(seq_len, num_heads, head_dim, fp8=True)
+        # q, k and v are contiguous, so BF16 can use a copy of the FP8 cu_seqlens
+        bf16_cu = fp8_cu.clone()
+
+        def run_bf16(q=q, k=k, v=v, cu=bf16_cu, ms=longest, sl=lengths):
+            attn_bf16._forward_flashinfer(q, k, v, cu, ms, sl)
+
+        def run_fp8(q=q, k=k, v=v, cu=fp8_cu, ms=longest, sl=lengths):
+            attn_fp8._forward_flashinfer(q, k, v, cu, ms, sl)
+
+        # Each timer call yields per-iteration times in milliseconds
+        bf16_ms = timer(run_bf16)
+        fp8_ms = timer(run_fp8)
+
+        bf16_us = np.median(bf16_ms) * 1e3  # ms -> us
+        fp8_us = np.median(fp8_ms) * 1e3
+        speedup = bf16_us / fp8_us if fp8_us > 0 else float("inf")
+
+        print(f"{seq_len:>8} {bf16_us:>12.1f} {fp8_us:>12.1f} {speedup:>9.2f}x")
+
+
+def _make_trace_handler(output_dir: str, worker_name: str, label: str):
+    """Return a profiler callback that writes a TensorBoard trace, then a table."""
+    rule = "=" * 80
+    def on_trace_ready(prof):
+        torch.profiler.tensorboard_trace_handler(output_dir, worker_name)(prof)
+        print(f"\n{rule}")
+        print(label)
+        print(rule)
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
+
+    return on_trace_ready
+
+
+def run_profile(
+    seq_len: int,
+    num_heads: int,
+    head_dim: int,
+    warmup: int,
+    output_dir: str,
+):
+    """Run the PyTorch profiler over the BF16 and then the FP8 attention path."""
+    attn_fp8, attn_bf16, _workspace = _setup_fp8_attention(num_heads, head_dim)
+
+    torch.manual_seed(42)
+    q = torch.randn(
+        seq_len,
+        num_heads,
+        head_dim,
+        device="cuda",
+        dtype=torch.bfloat16,
+    )
+    k = torch.randn_like(q)
+    v = torch.randn_like(q)
+
+    fp8_cu, longest, lengths = _build_meta(seq_len, num_heads, head_dim, fp8=True)
+    # q, k and v are contiguous, so BF16 can use a copy of the FP8 cu_seqlens
+    bf16_cu = fp8_cu.clone()
+
+    schedule = torch.profiler.schedule(wait=0, warmup=warmup, active=1)
+
+    # BF16 pass: `warmup` warm-up steps then one recorded step, per the schedule
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        schedule=schedule,
+        on_trace_ready=_make_trace_handler(
+            output_dir,
+            f"bf16_h{head_dim}_s{seq_len}",
+            f"BF16 Attention (seq_len={seq_len}, heads={num_heads}, "
+            f"head_dim={head_dim})",
+        ),
+    ) as bf16_prof:
+        for _step in range(warmup + 1):
+            with record_function("bf16_attention"):
+                attn_bf16._forward_flashinfer(
+                    q.clone(), k.clone(), v.clone(), bf16_cu, longest, lengths
+                )
+            torch.accelerator.synchronize()
+            bf16_prof.step()
+
+    # FP8 pass: same schedule, separate trace
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        schedule=schedule,
+        on_trace_ready=_make_trace_handler(
+            output_dir,
+            f"fp8_h{head_dim}_s{seq_len}",
+            f"FP8 Attention (seq_len={seq_len}, heads={num_heads}, "
+            f"head_dim={head_dim})",
+        ),
+    ) as fp8_prof:
+        for _step in range(warmup + 1):
+            with record_function("fp8_attention"):
+                attn_fp8._forward_flashinfer(
+                    q.clone(), k.clone(), v.clone(), fp8_cu, longest, lengths
+                )
+            torch.accelerator.synchronize()
+            fp8_prof.step()
+
+    print(f"\nTensorBoard traces saved to: {output_dir}")
+    print(f"View with: tensorboard --logdir={output_dir}")
+
+
+if __name__ == "__main__":
+    cli = FlexibleArgumentParser(description="Benchmark FP8 vs BF16 ViT attention.")
+    cli.add_argument(
+        "--seq-lens",
+        type=int,
+        nargs="+",
+        default=DEFAULT_SEQ_LENS,
+        help="Sequence lengths to benchmark",
+    )
+    cli.add_argument(
+        "--num-heads",
+        type=int,
+        default=NUM_HEADS,
+    )
+    cli.add_argument(
+        "--head-dim",
+        type=int,
+        default=HEAD_DIM,
+    )
+    cli.add_argument(
+        "--method",
+        choices=["cupti", "cudagraph"],
+        default="cudagraph",
+        help="GPU timing method: cupti (CUPTI kernel timing) or "
+        "cudagraph (CUDA graph capture/replay). Default: cudagraph",
+    )
+    cli.add_argument(
+        "--warmup",
+        type=int,
+        default=10,
+        help="Warmup iterations (profile mode only)",
+    )
+    cli.add_argument(
+        "--profile",
+        action="store_true",
+        help="Run PyTorch profiler instead of benchmark",
+    )
+    cli.add_argument(
+        "--profile-seq-len",
+        type=int,
+        default=8192,
+        help="Sequence length for profiling (default: 8192)",
+    )
+    cli.add_argument(
+        "--profile-output-dir",
+        type=str,
+        default="./profile_traces",
+        help="Output directory for TensorBoard traces (default: ./profile_traces)",
+    )
+    opts = cli.parse_args()
+
+    if not opts.profile:
+        run_benchmark(
+            opts.seq_lens,
+            opts.num_heads,
+            opts.head_dim,
+            opts.method,
+        )
+    else:
+        run_profile(
+            opts.profile_seq_len,
+            opts.num_heads,
+            opts.head_dim,
+            opts.warmup,
+            opts.profile_output_dir,
+        )
diff --git a/benchmarks/kernels/cpu/benchmark_cpu_attn.py b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
index 63d034278c7e..08afd693c333 100644
--- a/benchmarks/kernels/cpu/benchmark_cpu_attn.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
@@ -12,7 +12,6 @@
cpu_attn_get_scheduler_metadata,
cpu_attn_reshape_and_cache,
)
-from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
@@ -22,15 +21,14 @@ def get_attn_isa(
block_size: int | None = None,
dtype: torch.dtype | None = None,
):
- if block_size and dtype:
- return _get_attn_isa(dtype, block_size)
- else:
- if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
- return "neon"
- elif torch.cpu._is_amx_tile_supported():
- return "amx"
- else:
- return "vec"
+ # Delegate to _get_attn_isa so the fallback path applies the same arch
+ # gating (e.g. RISC-V RVV is only chosen when the build's hardcoded
+ # VLEN=128 kernel is actually present; on VLEN=256 / scalar hosts it
+ # correctly falls through to vec/vec16).
+ return _get_attn_isa(
+ dtype if dtype is not None else torch.bfloat16,
+ block_size if block_size else 32,
+ )
# rand number generation takes too much time, cache rand tensors
@@ -235,7 +233,7 @@ def rint(lo: int, hi: int) -> int:
)
parser.add_argument("--use-sink", action="store_true")
parser.add_argument(
- "--isa", type=str, choices=["vec", "neon", "amx", "vec16"], default=None
+ "--isa", type=str, choices=["vec", "neon", "amx", "vec16", "rvv"], default=None
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--iters", type=int, default=20)
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index 4384d3e56828..c9aaef284d70 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -16,6 +16,7 @@
fp8_gemm_nt,
per_block_cast_to_fp8,
)
+from vllm.utils.torch_utils import set_random_seed
def benchmark_shape(
@@ -235,9 +236,7 @@ def run_benchmarks(verbose: bool = False):
torch.backends.cudnn.allow_tf32 = True
# Set seeds for reproducibility
- torch.manual_seed(42)
- torch.cuda.manual_seed(42)
-
+ set_random_seed(42)
# Define benchmark shapes (m, n, k)
shapes = [
(8, 4096, 7168),
diff --git a/tests/entrypoints/pooling/pooling/__init__.py b/benchmarks/kernels/ir/__init__.py
similarity index 100%
rename from tests/entrypoints/pooling/pooling/__init__.py
rename to benchmarks/kernels/ir/__init__.py
diff --git a/benchmarks/kernels/ir/bench_ir_ops.py b/benchmarks/kernels/ir/bench_ir_ops.py
new file mode 100644
index 000000000000..b23c4e8ae327
--- /dev/null
+++ b/benchmarks/kernels/ir/bench_ir_ops.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Generic benchmark harness for vLLM IR ops.
+
+Usage:
+ python benchmarks/kernels/ir/bench_ir_ops.py
+ python benchmarks/kernels/ir/bench_ir_ops.py --ops rms_norm
+ python benchmarks/kernels/ir/bench_ir_ops.py --ops rms_norm,silu_mul
+ python benchmarks/kernels/ir/bench_ir_ops.py --no-cuda-graph
+ python benchmarks/kernels/ir/bench_ir_ops.py --ops rms_norm --save-path ./results/
+"""
+
+import argparse
+import contextlib
+import csv
+import dataclasses
+import datetime
+import math
+import os
+import subprocess
+import sys
+import tempfile
+
+# Ensure repo root is on sys.path so `benchmarks` is importable as a package.
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
+if _REPO_ROOT not in sys.path:
+ sys.path.insert(0, _REPO_ROOT)
+
+# Suppress noisy C++ warnings from vllm kernel registration (written to fd 2
+# directly by the dynamic linker, so Python-level sys.stderr redirect won't
+# catch them).
+_saved_fd = os.dup(2)
+try:
+ with open(os.devnull, "w") as _devnull:
+ os.dup2(_devnull.fileno(), 2)
+ import torch
+
+ import vllm.kernels # noqa: E402, F401
+finally:
+ os.dup2(_saved_fd, 2)
+ os.close(_saved_fd)
+
+from tqdm import tqdm # noqa: E402
+
+from benchmarks.kernels.ir.shapes import SHAPE_CONFIGS # noqa: E402 # isort: skip
+from vllm.ir.op import IrOp # noqa: E402
+from vllm.platforms import current_platform # noqa: E402
+from vllm.triton_utils import triton # noqa: E402
+
+
+@dataclasses.dataclass(frozen=True)
+class BenchConfig:
+ use_cuda_graph: bool = True  # time via do_bench_cudagraph instead of do_bench
+ warmup: int = 25  # passed as do_bench warmup; unused on the CUDA-graph path
+ rep: int = 100  # passed as `rep` to whichever triton bench function runs
+
+
+def _pkg_version(name: str) -> str:  # installed version, or "not installed"
+    from importlib.metadata import PackageNotFoundError, version
+    try:
+        return version(name)
+    except PackageNotFoundError:
+        return "not installed"
+
+
+_METADATA_LABELS = {
+ "timestamp": "Timestamp",
+ "git_commit": "Git commit",
+ "vllm": "vLLM",
+ "pytorch": "PyTorch",
+ "cuda_runtime": "CUDA runtime",
+ "triton": "Triton",
+ "cutlass": "CUTLASS",
+ "helion": "Helion",
+ "device": "Device",
+ "bench_mode": "Bench mode",
+ "warmup": "Warmup",
+ "rep": "Repetitions",
+}
+
+
+def collect_env_metadata(cfg: BenchConfig) -> dict[str, str]:
+    """Gather version, device and benchmark-mode details for the report header."""
+    from vllm.collect_env import get_env_info
+
+    env = get_env_info()
+    graph_mode = cfg.use_cuda_graph
+
+    # Short HEAD hash; stays "unknown" if git is missing or the command fails
+    commit = "unknown"
+    with contextlib.suppress(subprocess.CalledProcessError, FileNotFoundError):
+        raw_sha = subprocess.check_output(
+            ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
+        )
+        commit = raw_sha.decode().strip()
+
+    device_name = current_platform.get_device_name()
+    # With CUDA graphs the warmup value is reported as ignored
+    warmup_suffix = " ms (ignored)" if graph_mode else " ms"
+    rep_suffix = " replays" if graph_mode else " ms"
+
+    info: dict[str, str] = {}
+    info["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    info["git_commit"] = commit
+    info["vllm"] = str(env.vllm_version)
+    info["pytorch"] = str(env.torch_version)
+    info["cuda_runtime"] = str(env.cuda_runtime_version)
+    info["triton"] = triton.__version__
+    info["cutlass"] = _pkg_version("nvidia-cutlass-dsl")
+    info["helion"] = _pkg_version("helion")
+    info["device"] = device_name
+    info["bench_mode"] = "cuda_graph" if graph_mode else "eager"
+    info["warmup"] = f"{cfg.warmup}{warmup_suffix}"
+    info["rep"] = f"{cfg.rep}{rep_suffix}"
+    return info
+
+
+def print_metadata(metadata: dict[str, str]):
+    rule = "=" * 60
+    labels = [_METADATA_LABELS.get(key, key) + ":" for key in metadata]
+    body = [f"{label:<16}{metadata[key]}" for label, key in zip(labels, metadata)]
+    print("\n".join([rule, *body, rule]))
+
+
+def _clone_args(args: tuple) -> tuple:
+    return tuple(map(lambda a: a.clone() if isinstance(a, torch.Tensor) else a, args))
+
+
+# TODO(gmagogsfm): When the `maybe_inplace` PR lands, ops marked as
+# inplace=True will mutate bench_args across iterations. Both CUDA graph
+# and eager modes will accumulate drift from repeated in-place mutation.
+# We need to re-clone inputs per iteration for inplace ops.
+def _bench_one(fn, args, cfg: BenchConfig) -> float:
+    """Time ``fn(*args)`` on cloned inputs; result is the 0.5 quantile times 1000."""
+    cloned = _clone_args(args)
+    def run():
+        return fn(*cloned)
+
+    if not cfg.use_cuda_graph:
+        return 1000 * triton.testing.do_bench(
+            run, warmup=cfg.warmup, rep=cfg.rep, quantiles=[0.5]
+        )
+    return 1000 * triton.testing.do_bench_cudagraph(run, rep=cfg.rep, quantiles=[0.5])
+
+
+# TODO(gmagogsfm): Once compiled native implementation lands (#38775),
+# the benchmark baseline should be the compiled native (what vLLM runs by
+# default) rather than the uncompiled native implementation.
+def collect_timings(
+    op: IrOp, shape_configs: list[dict], cfg: BenchConfig
+) -> tuple[list[str], list[str], dict[str, dict[str, float]]]:
+    """Benchmark every supported impl of *op* on every shape config."""
+    def label(value) -> str:
+        text = str(value)
+        return text.split(".")[-1] if isinstance(value, torch.dtype) else text
+
+    case_names = []
+    for shape_kw in shape_configs:
+        case_names.append("_".join(f"{k}={label(v)}" for k, v in shape_kw.items()))
+    providers = [name for name in op.impls if op.impls[name].supported]
+    results: dict[str, dict[str, float]] = {case: {} for case in case_names}
+    for provider in providers:
+        impl = op.impls[provider]
+        progress = tqdm(
+            zip(case_names, shape_configs),
+            desc=f"{op.name} / {provider}",
+            total=len(case_names),
+            unit=" cases",
+        )
+        for case_name, shape_kw in progress:
+            inputs = op.generate_inputs(**shape_kw)
+            runnable = impl.supports_args(*inputs)
+            timing = _bench_one(impl.impl_fn, inputs, cfg) if runnable else float("nan")
+            results[case_name][provider] = timing
+
+    return case_names, providers, results
+
+
+def analyze_results(
+    op_name: str,
+    case_names: list[str],
+    providers: list[str],
+    results: dict[str, dict[str, float]],
+) -> tuple[list[dict[str, str]], list[dict[str, str]], list[str]]:
+    baseline = "native"
+    challengers = [name for name in providers if name != baseline]
+    nan = float("nan")
+    header_cols = (
+        ["case"]
+        + [f"{name} (us)" for name in providers]
+        + [f"{name} speedup" for name in challengers]
+    )
+
+    detail_rows: list[dict[str, str]] = []
+    ratios: dict[str, list[tuple[float, str]]] = {name: [] for name in challengers}
+
+    for case_name in case_names:
+        measured = results[case_name]
+        row: dict[str, str] = {"case": case_name}
+
+        for name in providers:
+            us = measured.get(name, nan)
+            row[f"{name} (us)"] = "n/a" if math.isnan(us) else f"{us:.2f}"
+
+        base_us = measured.get(baseline, nan)
+        for name in challengers:
+            other_us = measured.get(name, nan)
+            if math.isnan(base_us) or math.isnan(other_us) or other_us <= 0:
+                row[f"{name} speedup"] = "n/a"
+            else:
+                ratio = base_us / other_us
+                row[f"{name} speedup"] = f"{ratio:.2f}x"
+                ratios[name].append((ratio, case_name))
+
+        # One finished row per benchmark case
+        detail_rows.append(row)
+
+    summary_rows: list[dict[str, str]] = []
+    for name in challengers:
+        pairs = ratios[name]
+        if len(pairs) == 0:
+            continue
+        values = [ratio for ratio, _ in pairs]
+        total = len(values)
+        geomean = math.exp(sum(math.log(v) for v in values) / total)
+        best_val, best_case = max(pairs)
+        worst_val, worst_case = min(pairs)
+        wins = sum(v > 1.0 for v in values)
+        losses = sum(v < 1.0 for v in values)
+
+        print(f"\n{name} vs native ({wins}/{total} faster, {losses}/{total} slower):")
+        print(f" geomean speedup: {geomean:.2f}x")
+        print(f" best: {best_val:.2f}x ({best_case})")
+        print(f" worst: {worst_val:.2f}x ({worst_case})")
+
+        summary_rows.append(
+            {
+                "op": op_name,
+                "provider": name,
+                "geomean_speedup": f"{geomean:.2f}",
+                "best_speedup": f"{best_val:.2f}",
+                "best_case": best_case,
+                "worst_speedup": f"{worst_val:.2f}",
+                "worst_case": worst_case,
+                "wins": str(wins),
+                "losses": str(losses),
+                "total": str(total),
+            }
+        )
+
+    return detail_rows, summary_rows, header_cols
+
+
+def write_csv(path: str, rows: list[dict[str, str]], fieldnames: list[str]):
+ with open(path, "w", newline="") as f:
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
+ writer.writeheader()
+ writer.writerows(rows)
+
+
+def save_results(
+ save_dir: str,
+ op_name: str,
+ detail_rows: list[dict[str, str]],
+ header_cols: list[str],
+ all_summary_rows: list[dict[str, str]],
+ metadata: dict[str, str],
+):
+ write_csv(
+ os.path.join(save_dir, f"{op_name}_detail.csv"),
+ detail_rows,
+ header_cols,
+ )
+ if all_summary_rows:
+ write_csv(
+ os.path.join(save_dir, "summary.csv"),
+ all_summary_rows,
+ list(all_summary_rows[0].keys()),
+ )
+ write_csv(
+ os.path.join(save_dir, "metadata.csv"),
+ [metadata],
+ list(metadata.keys()),
+ )
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Benchmark vLLM IR ops")
+ parser.add_argument(
+ "--ops",
+ type=str,
+ default=None,
+ help="Comma-separated list of op names to benchmark (substring match)",
+ )
+ parser.add_argument(
+ "--no-cuda-graph",
+ action="store_true",
+ help="Disable CUDA graph; use do_bench with L2 cache flushing instead",
+ )
+ parser.add_argument(
+ "--warmup",
+ type=int,
+ default=25,
+ help="Warmup time in ms (do_bench) or ignored with CUDA graph (default: 25)",
+ )
+ parser.add_argument(
+ "--rep",
+ type=int,
+ default=100,
+ help="Repetition time in ms (do_bench) or number of graph replays "
+ "(do_bench_cudagraph) (default: 100)",
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default=None,
+ help="Directory to save results (default: auto-created temp dir)",
+ )
+ return parser.parse_args()
+
+
+def main():
+ args = parse_args()
+ cfg = BenchConfig(
+ use_cuda_graph=not args.no_cuda_graph,
+ warmup=args.warmup,
+ rep=args.rep,
+ )
+
+ torch.set_default_device(current_platform.device_type)
+
+ metadata = collect_env_metadata(cfg)
+ print_metadata(metadata)
+
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+ save_dir = args.save_path or os.path.join(
+ tempfile.gettempdir(), f"vllm_ir_bench_{timestamp}"
+ )
+ os.makedirs(save_dir, exist_ok=True)
+
+ op_filters = [f.strip() for f in args.ops.split(",")] if args.ops else None
+ all_summary_rows: list[dict[str, str]] = []
+
+ for op in IrOp.registry.values():
+ if op_filters and not any(f in op.name for f in op_filters):
+ continue
+ if not op.has_input_generator:
+ print(f"Skipping op '{op.name}': no input generator registered")
+ continue
+ if op.name not in SHAPE_CONFIGS:
+ raise RuntimeError(
+ f"No benchmark shape config for op '{op.name}'. "
+ f"Add it to benchmarks/kernels/ir/shapes.py"
+ )
+
+ case_names, providers, results = collect_timings(
+ op, SHAPE_CONFIGS[op.name], cfg
+ )
+ detail_rows, summary_rows, header_cols = analyze_results(
+ op.name, case_names, providers, results
+ )
+ all_summary_rows.extend(summary_rows)
+
+ save_results(
+ save_dir,
+ op.name,
+ detail_rows,
+ header_cols,
+ all_summary_rows,
+ metadata,
+ )
+
+ print(f"\nResults saved to: {save_dir}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/ir/shapes.py b/benchmarks/kernels/ir/shapes.py
new file mode 100644
index 000000000000..6cc44cf6cec1
--- /dev/null
+++ b/benchmarks/kernels/ir/shapes.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Shape configurations for IR op benchmarks.
+"""
+
+import torch
+
+# Token counts swept for each op (passed as the "num_tokens" kwarg below).
+NUM_TOKENS = [1, 2, 4, 16, 64, 256, 1024, 4096, 16384]
+# Hidden sizes swept for each op (passed as the "hidden_size" kwarg below).
+# The trailing comments list the models each size is associated with.
+COMMON_HIDDEN_SIZES = [
+ 2048, # Llama 3.2 1B, Qwen 3 MoE 30B-A3B, Gemma 3n
+ 3072, # Gemma 7B/9B
+ 4096, # Llama 3 8B, Qwen 3 8B, Mistral 7B
+ 5120, # Llama 4 Scout 17B-16E
+ 7168, # DeepSeek V3
+ 8192, # Llama 3 70B
+ 16384, # Llama 3 405B
+]
+
+# Each entry maps an op name to a list of kwarg dicts that will be passed
+# to that op's registered input generator via op.generate_inputs(**kwargs).
+SHAPE_CONFIGS: dict[str, list[dict]] = {
+ # Full cross product of dtype x hidden size x token count; the token count
+ # varies fastest, then the hidden size, then the dtype.
+ "rms_norm": [
+ {"num_tokens": n, "hidden_size": d, "dtype": dtype}
+ for dtype in [torch.float16, torch.bfloat16, torch.float32]
+ for d in COMMON_HIDDEN_SIZES
+ for n in NUM_TOKENS
+ ],
+}
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index e23f6b923f1b..2f56099c66fd 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -217,6 +217,7 @@ async def send_request(
min_tokens: int | None = None,
max_tokens: int | None = None,
timeout_sec: int = 120,
+ conversation_id: str | None = None,
) -> ServerResponse:
payload = {
"model": model,
@@ -225,6 +226,9 @@ async def send_request(
"temperature": 0.0,
}
+ if conversation_id is not None:
+ payload["conversation_id"] = conversation_id
+
if stream:
payload["stream"] = True
payload["stream_options"] = {"include_usage": False}
@@ -419,6 +423,7 @@ async def send_turn(
min_tokens,
max_tokens,
req_args.timeout_sec,
+ conversation_id=conv_id,
)
if response.valid is False:
@@ -1439,6 +1444,12 @@ async def main() -> None:
action="store_true",
help="Export summary to Excel file (optional)",
)
+ parser.add_argument(
+ "--stats-json-output",
+ type=str,
+ default=None,
+ help="Export per-request stats (ttft_ms, tpot_ms, etc.) to a JSON file",
+ )
parser.add_argument(
"-v",
"--verbose",
@@ -1462,6 +1473,12 @@ async def main() -> None:
"(for example: --warmup-percentages=0%%,50%%)",
)
+ parser.add_argument(
+ "--trust-remote-code",
+ action="store_true",
+ help="Trust remote code when loading the tokenizer.",
+ )
+
args = parser.parse_args()
logger.info(args)
@@ -1504,7 +1521,9 @@ async def main() -> None:
np.random.seed(args.seed)
logger.info("Loading tokenizer")
- tokenizer = AutoTokenizer.from_pretrained(args.model)
+ tokenizer = AutoTokenizer.from_pretrained(
+ args.model, trust_remote_code=args.trust_remote_code
+ )
await get_server_info(args.url)
@@ -1651,6 +1670,19 @@ async def main() -> None:
warmup_runtime_sec=warmup_runtime_sec,
)
+ if args.stats_json_output is not None:
+ # Export per-request metrics as a JSON array for downstream analysis.
+ stats_data = [s._asdict() for s in client_metrics]
+ logger.info(
+ f"{Color.GREEN}Writing per-request stats JSON: "
+ f"{args.stats_json_output}{Color.RESET}"
+ )
+ os.makedirs(
+ os.path.dirname(os.path.abspath(args.stats_json_output)), exist_ok=True
+ )
+ with open(args.stats_json_output, "w") as f:
+ json.dump(stats_data, f, indent=2)
+
if args.output_file is not None:
# Write a JSON file with the updated conversations
# The "assistant" content will contain the answers from the tested LLM
diff --git a/build_rust.sh b/build_rust.sh
new file mode 100755
index 000000000000..98871ec8abcf
--- /dev/null
+++ b/build_rust.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Build the vllm-rs Rust frontend binary and install it into the vllm package.
+# Usage: ./build_rust.sh [--debug]
+#
+# By default builds in release mode. Pass --debug for faster compile times
+# during development.
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"
+RUST_DIR="$REPO_ROOT/rust"
+TARGET_PATH="${VLLM_RS_TARGET_PATH:-$REPO_ROOT/vllm/vllm-rs}"
+
+# Read the required toolchain from rust-toolchain.toml.
+TOOLCHAIN=$(grep '^channel' "$REPO_ROOT/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/')
+
+# Ensure rustup and the required toolchain are available.
+if ! command -v rustup &>/dev/null; then
+ echo "rustup not found, installing..."
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none
+ source "$HOME/.cargo/env"
+fi
+
+if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then
+ echo "Installing Rust toolchain: $TOOLCHAIN"
+ rustup toolchain install "$TOOLCHAIN"
+fi
+
+if [[ "${1:-}" == "--debug" ]]; then
+ PROFILE_ARGS=()
+ PROFILE_DIR="debug"
+else
+ PROFILE_ARGS=(--release)
+ PROFILE_DIR="release"
+fi
+
+cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \
+ --manifest-path "$RUST_DIR/Cargo.toml" \
+ --bin vllm-rs \
+ --features native-tls-vendored
+
+mkdir -p "$(dirname "$TARGET_PATH")"
+cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH"
+echo "Installed vllm-rs to $TARGET_PATH"
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 8d74d6d5d96c..ffab4015f495 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,7 +1,7 @@
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -30,6 +30,26 @@ else()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
+
+ # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+ # and create a local shim dir with it. When PyTorch is built from source or packaged
+ # by a distro (common on RISC-V, s390x, Fedora/RHEL aarch64), no vendored libgomp
+ # exists and the shim dir is empty; fall back to the system libgomp in that case.
+ vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+ if(VLLM_TORCH_GOMP_SHIM_DIR)
+ find_library(OPEN_MP
+ NAMES gomp
+ PATHS "${VLLM_TORCH_GOMP_SHIM_DIR}"
+ NO_DEFAULT_PATH
+ REQUIRED
+ )
+ # Use the same libgomp as PyTorch at runtime
+ set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+ else()
+ # Fall back to system / toolchain libgomp
+ find_library(OPEN_MP NAMES gomp REQUIRED)
+ endif()
endif()
if (NOT MACOSX_FOUND)
@@ -146,16 +166,51 @@ elseif (S390_FOUND)
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
message(STATUS "RISC-V detected")
- if(RVV_BF16_FOUND)
- message(STATUS "BF16 extension detected")
- set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
- add_compile_definitions(RISCV_BF16_SUPPORT)
- elseif (RVV_FP16_FOUND)
- message(WARNING "BF16 functionality is not available")
- set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
+ # VLLM_RVV_VLEN selects the target VLEN. Auto-detected from /proc/cpuinfo
+ # by default; override with -DVLLM_RVV_VLEN=128 or -DVLLM_RVV_VLEN=256.
+ if(NOT DEFINED VLLM_RVV_VLEN)
+ # Auto-detect: find the largest zvlb in /proc/cpuinfo isa line.
+ if(EXISTS /proc/cpuinfo)
+ file(READ /proc/cpuinfo _cpuinfo)
+ set(_best 0)
+ foreach(_n IN ITEMS 128 256 512 1024)
+ if(_cpuinfo MATCHES "zvl${_n}b")
+ set(_best ${_n})
+ endif()
+ endforeach()
+ if(_best GREATER 0)
+ set(VLLM_RVV_VLEN ${_best})
+ endif()
+ endif()
+ # If auto-detect failed (no /proc/cpuinfo or no zvlb reported)
+ # but the compiler supports RVV, require explicit specification.
+ if(NOT DEFINED VLLM_RVV_VLEN AND (RVV_FP16_FOUND OR RVV_BF16_FOUND))
+ message(FATAL_ERROR
+ "RISC-V RVV is available but VLEN could not be auto-detected. "
+ "Please specify VLEN explicitly:\n"
+ " -DVLLM_RVV_VLEN=128 (for VLEN=128 hardware)\n"
+ " -DVLLM_RVV_VLEN=256 (for VLEN=256 hardware, e.g. Spacemit X100)\n"
+ " -DVLLM_RVV_VLEN=0 (force scalar, no RVV)")
+ endif()
+ endif()
+ if(VLLM_RVV_VLEN AND VLLM_RVV_VLEN GREATER 0)
+ message(STATUS "RISC-V RVV VLEN=${VLLM_RVV_VLEN}")
+ # Sources gate FP16/BF16 paths on the compiler-provided
+ # __riscv_zvfh / __riscv_zvfbfmin macros, which GCC and clang
+ # define automatically when those extensions appear in -march.
+ if(RVV_BF16_FOUND)
+ message(STATUS "BF16 extension detected")
+ set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl${VLLM_RVV_VLEN}b -mrvv-vector-bits=zvl -mabi=lp64d)
+ elseif(RVV_FP16_FOUND)
+ message(WARNING "BF16 functionality is not available")
+ set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl${VLLM_RVV_VLEN}b -mrvv-vector-bits=zvl -mabi=lp64d)
+ else()
+ message(STATUS "compile riscv with scalar (no FP16/BF16)")
+ set(MARCH_FLAGS -march=rv64gc)
+ endif()
else()
message(STATUS "compile riscv with scalar")
- list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
+ set(MARCH_FLAGS -march=rv64gc)
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
else()
@@ -175,20 +230,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
if(NOT NPROC)
set(NPROC 4)
endif()
- # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
- # and create a local shim dir with it
- vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-
- find_library(OPEN_MP
- NAMES gomp
- PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
- NO_DEFAULT_PATH
- REQUIRED
- )
- # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
- if (OPEN_MP)
- set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
- endif()
# Fetch and populate ACL
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
@@ -287,14 +328,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
set(ONEDNN_VERBOSE "ON")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
- # TODO: Refactor this
- if (ENABLE_X86_ISA)
- # Note: only enable oneDNN for AVX512
- list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
- else()
- list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
- endif()
-
set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
FetchContent_MakeAvailable(oneDNN)
@@ -307,8 +340,14 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
PRIVATE ${oneDNN_SOURCE_DIR}/src
)
target_link_libraries(dnnl_ext dnnl torch)
- target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
+ if (ENABLE_X86_ISA)
+ target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS_AVX2} -fPIC)
+ else()
+ target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+ endif()
list(APPEND LIBS dnnl_ext)
+
+
set(USE_ONEDNN ON)
else()
set(USE_ONEDNN OFF)
@@ -349,6 +388,7 @@ endif()
set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
"csrc/cpu/utils.cpp"
+ "csrc/cpu/spec_decode_utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
@@ -359,6 +399,7 @@ set(VLLM_EXT_SRC
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
+ "csrc/cpu/activation_lut_bf16.cpp"
${VLLM_EXT_SRC})
endif()
@@ -370,11 +411,15 @@ endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_SGL
+ "csrc/cpu/sgl-kernels/fla.cpp"
+ "csrc/cpu/sgl-kernels/conv.cpp"
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
+ "csrc/cpu/sgl-kernels/gemm_int4.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
+ "csrc/cpu/sgl-kernels/moe_int4.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
set(VLLM_EXT_SRC_AVX512
@@ -382,6 +427,7 @@ if (ENABLE_X86_ISA)
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
"csrc/cpu/utils.cpp"
+ "csrc/cpu/spec_decode_utils.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/dnnl_kernels.cpp"
"csrc/cpu/torch_bindings.cpp"
@@ -392,9 +438,11 @@ if (ENABLE_X86_ISA)
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
- set(VLLM_EXT_SRC_AVX2
+ set(VLLM_EXT_SRC_AVX2
"csrc/cpu/utils.cpp"
+ "csrc/cpu/spec_decode_utils.cpp"
"csrc/cpu/cpu_attn.cpp"
+ "csrc/cpu/dnnl_kernels.cpp"
"csrc/cpu/torch_bindings.cpp"
# TODO: Remove these files
"csrc/cpu/activation.cpp"
@@ -409,7 +457,7 @@ if (ENABLE_X86_ISA)
set(_C_LIBS numa dnnl_ext)
set(_C_AVX512_LIBS numa dnnl_ext)
- set(_C_AVX2_LIBS numa)
+ set(_C_AVX2_LIBS numa dnnl_ext)
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
define_extension_target(
diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake
new file mode 100644
index 000000000000..183c42dc7953
--- /dev/null
+++ b/cmake/external_projects/deepgemm.cmake
@@ -0,0 +1,187 @@
+include(FetchContent)
+
+# If DEEPGEMM_SRC_DIR is set, DeepGEMM is built from that directory
+# instead of downloading.
+# It can be set as an environment variable or passed as a cmake argument.
+# The environment variable takes precedence.
+if (DEFINED ENV{DEEPGEMM_SRC_DIR})
+ set(DEEPGEMM_SRC_DIR $ENV{DEEPGEMM_SRC_DIR})
+endif()
+
+if(DEEPGEMM_SRC_DIR)
+ FetchContent_Declare(
+ deepgemm
+ SOURCE_DIR ${DEEPGEMM_SRC_DIR}
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ )
+else()
+ # This ref should be kept in sync with tools/install_deepgemm.sh
+ FetchContent_Declare(
+ deepgemm
+ GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
+ GIT_TAG 891d57b4db1071624b5c8fa0d1e51cb317fa709f
+ GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
+ GIT_PROGRESS TRUE
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ )
+endif()
+
+# Use FetchContent_Populate (not MakeAvailable) to avoid processing
+# DeepGEMM's own CMakeLists.txt which has incompatible find_package calls.
+FetchContent_GetProperties(deepgemm)
+if(NOT deepgemm_POPULATED)
+ FetchContent_Populate(deepgemm)
+endif()
+message(STATUS "DeepGEMM is available at ${deepgemm_SOURCE_DIR}")
+
+# DeepGEMM requires CUDA 12.3+ for SM90 (9.0a). For SM100, CUDA 12.8 enables
+# 10.0a and CUDA 12.9+ enables 10.0f instead.
+set(DEEPGEMM_SUPPORT_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+ list(APPEND DEEPGEMM_SUPPORT_ARCHS "9.0a")
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+ list(APPEND DEEPGEMM_SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ list(APPEND DEEPGEMM_SUPPORT_ARCHS "10.0a")
+endif()
+
+cuda_archs_loose_intersection(DEEPGEMM_ARCHS
+ "${DEEPGEMM_SUPPORT_ARCHS}" "${CUDA_ARCHS}")
+
+if(DEEPGEMM_ARCHS)
+ message(STATUS "DeepGEMM CUDA architectures: ${DEEPGEMM_ARCHS}")
+
+ #
+ # DeepGEMM integration notes
+ # --------------------------
+ # We vendor DeepGEMM into vllm/third_party/deep_gemm/ and bundle a
+ # `_C.cpython-X.Y-*.so` for every CPython in `requires-python`. The
+ # per-Python build is delegated to tools/build_deepgemm_C.py.
+ #
+ # Why per-Python: DeepGEMM's binding uses PYBIND11_MODULE, which links
+ # private CPython symbols — a single `_C.abi3.so` is not viable today
+ # (see #41476 / #41512 for the failed attempt).
+ #
+ # TODOs (tracked in vllm-project/vllm#42431):
+ # - Replace DeepGEMM's pybind11 binding with a TORCH_LIBRARY + shim
+ # binding (cf. vllm-flash-attention/csrc/common/pytorch_shim.h) to
+ # collapse to one `_C.abi3.so`. Needs either an upstream change or
+ # a maintained binding fork in vLLM.
+ # - AOT-compile DeepGEMM's CUDA kernels instead of runtime JIT to drop
+ # the vendored CUTLASS/CCCL headers and the CUDA-toolkit-at-runtime
+ # requirement.
+ #
+
+ # DEEPGEMM_PYTHON_INTERPRETERS: ":"-separated target Python paths.
+ # Empty/unset → fall back to the build interpreter (editable installs).
+ # (Empty-but-set env vars test as DEFINED in cmake — treat as unset.)
+ if(NOT "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}" STREQUAL "")
+ string(REPLACE ":" ";" _dg_pythons "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}")
+ else()
+ set(_dg_pythons "${Python_EXECUTABLE}")
+ endif()
+ message(STATUS "DeepGEMM _C will be built for: ${_dg_pythons}")
+
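+ # add_custom_command does no implicit header scanning; glob explicitly so
+ # header-only edits under DeepGEMM's csrc/ and deep_gemm/include/ re-trigger
+ # the rebuild. Headers in the third-party/ submodules (cutlass, fmt) are not
+ # covered by this glob, so edits there do not.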
+ file(GLOB_RECURSE _dg_headers
+ "${deepgemm_SOURCE_DIR}/csrc/*.h"
+ "${deepgemm_SOURCE_DIR}/csrc/*.hpp"
+ "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.h"
+ "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.hpp"
+ "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.cuh")
+
+ set(_dg_markers)
+ set(_dg_seen_soabis)
+ foreach(_pybin IN LISTS _dg_pythons)
+ execute_process(
+ COMMAND "${_pybin}" -c
+ "import sysconfig; print(sysconfig.get_config_var('SOABI'))"
+ OUTPUT_VARIABLE _dg_soabi
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ COMMAND_ERROR_IS_FATAL ANY)
+ # Dedup interpreters that resolve to the same CPython.
+ if(_dg_soabi IN_LIST _dg_seen_soabis)
+ continue()
+ endif()
+ list(APPEND _dg_seen_soabis "${_dg_soabi}")
+ set(_dg_dir "${CMAKE_CURRENT_BINARY_DIR}/deepgemm_C_${_dg_soabi}")
+ set(_dg_marker "${_dg_dir}/.built")
+ add_custom_command(
+ OUTPUT "${_dg_marker}"
+ COMMAND "${Python_EXECUTABLE}"
+ "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
+ "${deepgemm_SOURCE_DIR}" "${_dg_dir}" "${_pybin}"
+ COMMAND "${CMAKE_COMMAND}" -E touch "${_dg_marker}"
+ DEPENDS "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
+ "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp"
+ ${_dg_headers}
+ COMMENT "Building DeepGEMM _C for ${_pybin}"
+ VERBATIM)
+ list(APPEND _dg_markers "${_dg_marker}")
+ install(DIRECTORY "${_dg_dir}/"
+ DESTINATION vllm/third_party/deep_gemm
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "_C.cpython-*.so")
+ endforeach()
+ add_custom_target(_deep_gemm_C ALL DEPENDS ${_dg_markers})
+
+ #
+ # Vendor DeepGEMM Python package files
+ #
+ install(FILES
+ "${deepgemm_SOURCE_DIR}/deep_gemm/__init__.py"
+ DESTINATION vllm/third_party/deep_gemm
+ COMPONENT _deep_gemm_C)
+
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/utils/"
+ DESTINATION vllm/third_party/deep_gemm/utils
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "*.py")
+
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/testing/"
+ DESTINATION vllm/third_party/deep_gemm/testing
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "*.py")
+
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/legacy/"
+ DESTINATION vllm/third_party/deep_gemm/legacy
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "*.py")
+
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/mega/"
+ DESTINATION vllm/third_party/deep_gemm/mega
+ COMPONENT _deep_gemm_C
+ FILES_MATCHING PATTERN "*.py")
+
+ # Generate envs.py (normally generated by DeepGEMM's setup.py build step)
+ file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
+ "# Pre-installed environment variables\npersistent_envs = dict()\n")
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
+ DESTINATION vllm/third_party/deep_gemm
+ RENAME envs.py
+ COMPONENT _deep_gemm_C)
+
+ #
+ # Install include files needed for JIT compilation at runtime.
+ # The JIT compiler finds these relative to the package directory.
+ #
+
+ # DeepGEMM's own CUDA headers
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/include/"
+ DESTINATION vllm/third_party/deep_gemm/include
+ COMPONENT _deep_gemm_C)
+
+ # CUTLASS and CuTe headers (vendored for JIT, separate from vLLM's CUTLASS)
+ install(DIRECTORY "${deepgemm_SOURCE_DIR}/third-party/cutlass/include/"
+ DESTINATION vllm/third_party/deep_gemm/include
+ COMPONENT _deep_gemm_C)
+
+else()
+ message(STATUS "DeepGEMM will not compile: "
+ "unsupported CUDA architecture ${CUDA_ARCHS}")
+ # Create empty target so setup.py doesn't fail on unsupported systems
+ add_custom_target(_deep_gemm_C)
+endif()
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 0f16b9161fa3..65986df55012 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
- GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
+ GIT_TAG a6ec2ba7bd0a7dff98b3f4d3e6b52b159c48d78b
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake
index 1d8b9779c8f7..2966c78030bd 100644
--- a/cmake/external_projects/triton_kernels.cmake
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -1,6 +1,6 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
-set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
+set(DEFAULT_TRITON_KERNELS_TAG "v3.5.1")
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 443d41d5a21a..b38917a7b0b5 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 29210221863736a08f71a866459e368ad1ac4a95
+ GIT_TAG bce29425653ec0fbc579d329883030e832d15ada
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -87,18 +87,30 @@ endforeach()
#
add_custom_target(_vllm_fa4_cutedsl_C)
-# Copy flash_attn/cute directory (needed for FA4) and transform imports
-# The cute directory uses flash_attn.cute imports internally, which we replace
-# with vllm.vllm_flash_attn.cute to match our package structure.
-install(CODE "
- file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
- foreach(SRC_FILE \${CUTE_PY_FILES})
- file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
- set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\")
- get_filename_component(DST_DIR \${DST_FILE} DIRECTORY)
- file(MAKE_DIRECTORY \${DST_DIR})
- file(READ \${SRC_FILE} FILE_CONTENTS)
- string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
- file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
- endforeach()
-" COMPONENT _vllm_fa4_cutedsl_C)
+# Install flash_attn/cute directory (needed for FA4).
+# When using a local source dir (VLLM_FLASH_ATTN_SRC_DIR), create a symlink
+# so edits to cute-dsl Python files take effect immediately without rebuilding.
+# Otherwise, copy files and transform flash_attn.cute imports to
+# vllm.vllm_flash_attn.cute to match our package structure.
+if(VLLM_FLASH_ATTN_SRC_DIR)
+ install(CODE "
+ set(LINK_TARGET \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\")
+ set(LINK_NAME \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute\")
+ file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")
+ file(REMOVE_RECURSE \"\${LINK_NAME}\")
+ file(CREATE_LINK \"\${LINK_TARGET}\" \"\${LINK_NAME}\" SYMBOLIC)
+ " COMPONENT _vllm_fa4_cutedsl_C)
+else()
+ install(CODE "
+ file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
+ foreach(SRC_FILE \${CUTE_PY_FILES})
+ file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
+ set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\")
+ get_filename_component(DST_DIR \${DST_FILE} DIRECTORY)
+ file(MAKE_DIRECTORY \${DST_DIR})
+ file(READ \${SRC_FILE} FILE_CONTENTS)
+ string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
+ file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
+ endforeach()
+ " COMPONENT _vllm_fa4_cutedsl_C)
+endif()
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index e95333457b57..f10ba93f7c65 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -47,12 +47,17 @@ macro (append_cmake_prefix_path PKG EXPR)
list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
endmacro()
-#
-# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
-# of CUDA source files. The names of the corresponding "hipified" sources are
-# stored in `OUT_SRCS`.
-#
+# Resolve hipified output paths for `NAME` into `OUT_SRCS` and register the
+# `.cu` sources with the shared `hipify_all` target. Per-extension hipify
+# targets are unsafe to run in parallel against a shared csrc/ output dir, so
+# accumulation here is paired with a single finalize step.
function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
+ if (TARGET hipify_all)
+ message(FATAL_ERROR
+ "hipify_sources_target(${NAME}) called after vllm_finalize_hipify_target. "
+ "Add the new HIP extension before the finalizer call in CMakeLists.txt.")
+ endif()
+
#
# Split into C++ and non-C++ (i.e. CUDA) sources.
#
@@ -73,19 +78,41 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
endforeach()
- set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
- add_custom_target(
- hipify${NAME}
- COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
- DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
- BYPRODUCTS ${HIP_SRCS}
- COMMENT "Running hipify on ${NAME} extension source files.")
+ set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_SRCS ${SRCS})
+ set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_BYPRODUCTS ${HIP_SRCS})
# Swap out original extension sources with hipified sources.
list(APPEND HIP_SRCS ${CXX_SRCS})
set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
endfunction()
+# Define the single shared `hipify_all` custom target that runs hipify once
+# on the union of every HIP extension's sources. Call after the last HIP
+# `define_extension_target`.
+#
+# The sources and expected outputs are read from the global properties
+# VLLM_HIPIFY_ALL_SRCS and VLLM_HIPIFY_ALL_BYPRODUCTS. Calling this again once
+# `hipify_all` exists is a no-op, and no target is created when no sources
+# were registered.
+function (vllm_finalize_hipify_target)
+ # Already finalized: nothing to do.
+ if (TARGET hipify_all)
+ return()
+ endif()
+
+ get_property(ALL_SRCS GLOBAL PROPERTY VLLM_HIPIFY_ALL_SRCS)
+ get_property(ALL_BYPRODUCTS GLOBAL PROPERTY VLLM_HIPIFY_ALL_BYPRODUCTS)
+
+ # Nothing to hipify, so leave `hipify_all` undefined.
+ if (NOT ALL_SRCS)
+ return()
+ endif()
+
+ # The same entry can be appended to the properties more than once; list
+ # each source and byproduct only once in the target.
+ list(REMOVE_DUPLICATES ALL_SRCS)
+ list(REMOVE_DUPLICATES ALL_BYPRODUCTS)
+
+ set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
+ add_custom_target(
+ hipify_all
+ COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${ALL_SRCS}
+ DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${ALL_SRCS}
+ BYPRODUCTS ${ALL_BYPRODUCTS}
+ COMMENT "Running hipify on all extension source files.")
+endfunction()
+
#
# Get additional GPU compiler flags from torch.
#
@@ -449,6 +476,16 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()
+
+#
+# Store in `OUT_CUDA_ARCHS` the result of `cuda_archs_loose_intersection`
+# between a fixed list of SM90-and-newer architectures and `TGT_CUDA_ARCHS`.
+# The fixed list is "9.0a;10.0f;11.0f" when the CUDA compiler is 13.0 or
+# newer, and "9.0a;10.0a;10.1a;10.3a" otherwise.
+#
+function(cuda_archs_sm90plus OUT_CUDA_ARCHS TGT_CUDA_ARCHS)
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(_archs "9.0a;10.0f;11.0f" "${TGT_CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(_archs "9.0a;10.0a;10.1a;10.3a" "${TGT_CUDA_ARCHS}")
+ endif()
+ set(${OUT_CUDA_ARCHS} ${_archs} PARENT_SCOPE)
+endfunction()
+
#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
@@ -551,7 +588,7 @@ function (define_extension_target MOD_NAME)
if (ARG_LANGUAGE STREQUAL "HIP")
# Make this target dependent on the hipify preprocessor step.
- add_dependencies(${MOD_NAME} hipify${MOD_NAME})
+ add_dependencies(${MOD_NAME} hipify_all)
# Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
${ARG_INCLUDE_DIRECTORIES})
diff --git a/csrc/async_util.cuh b/csrc/async_util.cuh
new file mode 100644
index 000000000000..392d78c53fdb
--- /dev/null
+++ b/csrc/async_util.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace vllm {
+namespace cuda_async {
+
+// Copies one 16-byte chunk from global memory (glob_ptr) into shared memory
+// (smem_ptr).
+//  - USE_ROCM, or CUDA device code below sm_80: plain synchronous 16-byte
+//    load/store.
+//  - CUDA device code for sm_80+: emits `cp.async.cg.shared.global`; the copy
+//    is asynchronous, so pair it with cp_async_commit_group() and
+//    cp_async_wait_group<n>() before reading the shared-memory data.
+//  - Neither USE_ROCM nor __CUDA_ARCH__ defined: no-op.
+// NOTE(review): both pointers are presumably expected to be 16-byte aligned
+// (the fallback dereferences them as uint4) -- confirm at call sites.
+__device__ __forceinline__ void cp_async_shared_global_16_cg(
+    void* smem_ptr, const void* glob_ptr) {
+#if defined(USE_ROCM)
+  *reinterpret_cast<uint4*>(smem_ptr) =
+      *reinterpret_cast<const uint4*>(glob_ptr);
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  // cp.async takes the destination as a 32-bit shared-window address.
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n"
+               :
+               : "r"(smem), "l"(glob_ptr));
+#elif defined(__CUDA_ARCH__)
+  *reinterpret_cast<uint4*>(smem_ptr) =
+      *reinterpret_cast<const uint4*>(glob_ptr);
+#else
+  (void)smem_ptr;
+  (void)glob_ptr;
+#endif
+}
+
+// Copies `size_bytes` bytes from global memory (glob_ptr) into shared memory
+// (smem_ptr). Only 4, 8 and 16 are meaningful sizes: 4 and 8 are handled
+// explicitly and every other value is treated as 16.
+//  - USE_ROCM, or CUDA device code below sm_80: plain synchronous load/store
+//    of the matching width.
+//  - CUDA device code for sm_80+: emits `cp.async.ca.shared.global`; the copy
+//    is asynchronous, so pair it with cp_async_commit_group() and
+//    cp_async_wait_group<n>() before reading the shared-memory data.
+//  - Neither USE_ROCM nor __CUDA_ARCH__ defined: no-op.
+// NOTE(review): pointers are presumably expected to be aligned to the copy
+// size (the fallback dereferences them at that width) -- confirm at call
+// sites.
+__device__ __forceinline__ void cp_async_shared_global_ca(void* smem_ptr,
+                                                          const void* glob_ptr,
+                                                          int size_bytes) {
+#if defined(USE_ROCM)
+  if (size_bytes == 4) {
+    *reinterpret_cast<uint32_t*>(smem_ptr) =
+        *reinterpret_cast<const uint32_t*>(glob_ptr);
+  } else if (size_bytes == 8) {
+    *reinterpret_cast<uint64_t*>(smem_ptr) =
+        *reinterpret_cast<const uint64_t*>(glob_ptr);
+  } else {
+    *reinterpret_cast<uint4*>(smem_ptr) =
+        *reinterpret_cast<const uint4*>(glob_ptr);
+  }
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  // cp.async takes the destination as a 32-bit shared-window address. The
+  // copy size must be an immediate, hence one asm statement per size.
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  if (size_bytes == 4) {
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n"
+                 :
+                 : "r"(smem), "l"(glob_ptr));
+  } else if (size_bytes == 8) {
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 8;\n"
+                 :
+                 : "r"(smem), "l"(glob_ptr));
+  } else {
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 16;\n"
+                 :
+                 : "r"(smem), "l"(glob_ptr));
+  }
+#elif defined(__CUDA_ARCH__)
+  if (size_bytes == 4) {
+    *reinterpret_cast<uint32_t*>(smem_ptr) =
+        *reinterpret_cast<const uint32_t*>(glob_ptr);
+  } else if (size_bytes == 8) {
+    *reinterpret_cast<uint64_t*>(smem_ptr) =
+        *reinterpret_cast<const uint64_t*>(glob_ptr);
+  } else {
+    *reinterpret_cast<uint4*>(smem_ptr) =
+        *reinterpret_cast<const uint4*>(glob_ptr);
+  }
+#else
+  (void)smem_ptr;
+  (void)glob_ptr;
+  (void)size_bytes;
+#endif
+}
+
+// Emits `cp.async.commit_group` when compiling CUDA device code for sm_80+.
+// On ROCm, on older CUDA architectures and when __CUDA_ARCH__ is not defined
+// the function body is empty.
+__device__ __forceinline__ void cp_async_commit_group() {
+#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.commit_group;\n" ::);
+#endif
+}
+
+// Emits `cp.async.wait_group n` when compiling CUDA device code for sm_80+.
+// `n` is passed through the "n" (immediate) asm constraint, so it has to be a
+// compile-time constant and is therefore a template parameter. On ROCm, on
+// older CUDA architectures and when __CUDA_ARCH__ is not defined the function
+// body is empty.
+template <int n>
+__device__ __forceinline__ void cp_async_wait_group() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && !defined(USE_ROCM)
+  asm volatile("cp.async.wait_group %0;\n" : : "n"(n));
+#endif
+}
+
+} // namespace cuda_async
+} // namespace vllm
diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh
index e714e321b0be..3d56859d8fc5 100644
--- a/csrc/attention/dtype_fp8.cuh
+++ b/csrc/attention/dtype_fp8.cuh
@@ -1,6 +1,7 @@
#pragma once
#include "attention_generic.cuh"
+#include "torch_utils.h"
#include
#ifdef ENABLE_FP8
@@ -17,6 +18,22 @@ enum class Fp8KVCacheDataType {
kFp8E5M2 = 2,
};
+// Translates a KV-cache dtype string into the matching Fp8KVCacheDataType.
+// dtype_str refers to CacheDType at vllm.config.cache.CacheDType.
+//   "auto", "float16", "bfloat16"     -> kAuto (unquantized kv cache)
+//   "fp8", "fp8_ds_mla", "fp8_e4m3"   -> kFp8E4M3
+//   "fp8_e5m2"                        -> kFp8E5M2
+// Any other string fails the TORCH_UTILS_CHECK below.
+inline Fp8KVCacheDataType get_fp8_kv_cache_data_type(
+    const std::string& dtype_str) {
+  const bool is_unquantized = dtype_str == "auto" ||
+                              dtype_str == "float16" ||
+                              dtype_str == "bfloat16";
+  if (is_unquantized) {
+    return Fp8KVCacheDataType::kAuto;
+  }
+
+  const bool is_e4m3 = dtype_str == "fp8" || dtype_str == "fp8_ds_mla" ||
+                       dtype_str == "fp8_e4m3";
+  if (is_e4m3) {
+    return Fp8KVCacheDataType::kFp8E4M3;
+  }
+
+  if (dtype_str == "fp8_e5m2") {
+    return Fp8KVCacheDataType::kFp8E5M2;
+  }
+
+  TORCH_UTILS_CHECK(false, "Unsupported fp8 kv cache data type: ", dtype_str);
+}
+
// fp8 vector types for quantization of kv cache
template <>
struct Vec {
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 27d1e990c611..75f066e80915 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -3,22 +3,33 @@
#include
#include
#include
+#include
#include "attention_dtypes.h"
#include "attention_utils.cuh"
+#include "../quantization/w8a8/fp8/common.cuh"
+#include "../dispatch_utils.h"
namespace vllm {
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
-template
+template
__global__ void merge_attn_states_kernel(
- scalar_t* output, float* output_lse, const scalar_t* prefix_output,
+ output_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size, const uint prefix_head_stride,
- const uint output_head_stride) {
- using pack_128b_t = uint4;
+ const uint output_head_stride, const uint prefix_num_tokens,
+ const float* output_scale) {
+ // Inputs always load 128-bit packs (pack_size elements of scalar_t).
+ // Outputs store pack_size elements of output_t, which is smaller for FP8.
+ using input_pack_t = uint4;
+ using output_pack_t =
+ std::conditional_t,
+ uint4>;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
@@ -41,8 +52,45 @@ __global__ void merge_attn_states_kernel(
head_idx * output_head_stride;
const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
- scalar_t* output_head_ptr = output + dst_head_offset;
+ output_t* output_head_ptr = output + dst_head_offset;
+ // Pre-invert scale: multiplication is faster than division
+ float fp8_scale_inv = 1.0f;
+ if constexpr (USE_FP8_OUTPUT) {
+ fp8_scale_inv = 1.0f / *output_scale;
+ }
+
+ // If token_idx >= prefix_num_tokens, just copy from suffix
+ if (token_idx >= prefix_num_tokens) {
+ if (pack_offset < head_size) {
+ input_pack_t s_out_pack = reinterpret_cast(
+ suffix_head_ptr)[pack_offset / pack_size];
+
+ if constexpr (USE_FP8_OUTPUT) {
+ output_t o_out_pack[pack_size];
+#pragma unroll
+ for (uint i = 0; i < pack_size; ++i) {
+ const float val =
+ vllm::to_float(reinterpret_cast(&s_out_pack)[i]);
+ o_out_pack[i] =
+ vllm::scaled_fp8_conversion(val, fp8_scale_inv);
+ }
+ reinterpret_cast(
+ output_head_ptr)[pack_offset / pack_size] =
+ *reinterpret_cast(o_out_pack);
+ } else {
+ reinterpret_cast(
+ output_head_ptr)[pack_offset / pack_size] = s_out_pack;
+ }
+ }
+ if (output_lse != nullptr && pack_idx == 0) {
+ float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
+ output_lse[head_idx * num_tokens + token_idx] = s_lse;
+ }
+ return;
+ }
+
+ // For tokens within prefix range, merge prefix and suffix
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
p_lse = std::isinf(p_lse) ? -std::numeric_limits::infinity() : p_lse;
@@ -53,20 +101,34 @@ __global__ void merge_attn_states_kernel(
/* In certain edge cases, MLA can produce p_lse = s_lse = -inf;
continuing the pipeline then yields NaN. Root cause: with chunked prefill
a batch may be split into two chunks; if a request in that batch has no
- prefix hit, every LSE entry for that request’s position is -inf, and at
+ prefix hit, every LSE entry for that request's position is -inf, and at
this moment we merge cross-attention at first. For now we simply emit
prefix_output (expected to be all zeros) and prefix_lse (-inf) to fix
this problem.
*/
if (std::isinf(max_lse)) {
if (pack_offset < head_size) {
- // Pack 128b load
- pack_128b_t p_out_pack = reinterpret_cast(
+ input_pack_t p_out_pack = reinterpret_cast(
prefix_head_ptr)[pack_offset / pack_size];
- // Pack 128b storage
- reinterpret_cast(output_head_ptr)[pack_offset / pack_size] =
- p_out_pack;
+ if constexpr (USE_FP8_OUTPUT) {
+ // Convert prefix values to FP8 (since -inf means no data,
+ // prefix_output is expected to be zeros)
+ output_t o_out_pack[pack_size];
+#pragma unroll
+ for (uint i = 0; i < pack_size; ++i) {
+ const float val =
+ vllm::to_float(reinterpret_cast(&p_out_pack)[i]);
+ o_out_pack[i] =
+ vllm::scaled_fp8_conversion(val, fp8_scale_inv);
+ }
+ reinterpret_cast(
+ output_head_ptr)[pack_offset / pack_size] =
+ *reinterpret_cast(o_out_pack);
+ } else {
+ reinterpret_cast(
+ output_head_ptr)[pack_offset / pack_size] = p_out_pack;
+ }
}
// We only need to write to output_lse once per head.
if (output_lse != nullptr && pack_idx == 0) {
@@ -84,30 +146,43 @@ __global__ void merge_attn_states_kernel(
const float s_scale = s_se / out_se;
if (pack_offset < head_size) {
- // Pack 128b load
- pack_128b_t p_out_pack = reinterpret_cast(
+ input_pack_t p_out_pack = reinterpret_cast(
prefix_head_ptr)[pack_offset / pack_size];
- pack_128b_t s_out_pack = reinterpret_cast(
+ input_pack_t s_out_pack = reinterpret_cast(
suffix_head_ptr)[pack_offset / pack_size];
- pack_128b_t o_out_pack;
+ // Compute merged values in float32
+ float o_out_f[pack_size];
#pragma unroll
for (uint i = 0; i < pack_size; ++i) {
- // Always use float for FMA to keep high precision.
- // half(uint16_t), bfloat16, float -> float.
const float p_out_f =
vllm::to_float(reinterpret_cast(&p_out_pack)[i]);
const float s_out_f =
vllm::to_float(reinterpret_cast(&s_out_pack)[i]);
- // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
- const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
- // float -> half(uint16_t), bfloat16, float.
- vllm::from_float(reinterpret_cast(&o_out_pack)[i], o_out_f);
+ o_out_f[i] = p_out_f * p_scale + (s_out_f * s_scale);
}
- // Pack 128b storage
- reinterpret_cast(output_head_ptr)[pack_offset / pack_size] =
- o_out_pack;
+ // Convert and store
+ if constexpr (USE_FP8_OUTPUT) {
+ output_t o_out_pack[pack_size];
+#pragma unroll
+ for (uint i = 0; i < pack_size; ++i) {
+ o_out_pack[i] = vllm::scaled_fp8_conversion(
+ o_out_f[i], fp8_scale_inv);
+ }
+ reinterpret_cast(
+ output_head_ptr)[pack_offset / pack_size] =
+ *reinterpret_cast(o_out_pack);
+ } else {
+ output_pack_t o_out_pack;
+#pragma unroll
+ for (uint i = 0; i < pack_size; ++i) {
+ vllm::from_float(reinterpret_cast(&o_out_pack)[i],
+ o_out_f[i]);
+ }
+ reinterpret_cast(
+ output_head_ptr)[pack_offset / pack_size] = o_out_pack;
+ }
}
// We only need to write to output_lse once per head.
if (output_lse != nullptr && pack_idx == 0) {
@@ -134,50 +209,73 @@ __global__ void merge_attn_states_kernel(
} \
}
-#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \
+#define LAUNCH_MERGE_ATTN_STATES(scalar_t, output_t, NUM_THREADS, \
+ USE_FP8_OUTPUT) \
{ \
- vllm::merge_attn_states_kernel \
+ vllm::merge_attn_states_kernel \
<<>>( \
- reinterpret_cast(output.data_ptr()), output_lse_ptr, \
+ reinterpret_cast(output.data_ptr()), output_lse_ptr, \
reinterpret_cast(prefix_output.data_ptr()), \
reinterpret_cast(prefix_lse.data_ptr()), \
reinterpret_cast(suffix_output.data_ptr()), \
reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \
- num_heads, head_size, prefix_head_stride, output_head_stride); \
+ num_heads, head_size, prefix_head_stride, output_head_stride, \
+ prefix_num_tokens, output_scale_ptr); \
}
/*@brief Merges the attention states from prefix and suffix
* into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
*
* @param output [n,h,d] The output tensor to store the merged attention states.
- * @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
+ * @param output_lse [h,n] Optional tensor to store the log-sum-exp values.
* @param prefix_output [n,h,d] The prefix attention states.
* @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
* states.
* @param suffix_output [n,h,d] The suffix attention states.
* @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
* states.
+ * @param prefill_tokens_with_context Optional number of prefill tokens with
+ * context, p. For the first p tokens (0 <= token_idx < p), output is computed
+ * by merging prefix_output and suffix_output. For the remaining tokens
+ * (p <= token_idx < n), output is copied directly from suffix_output. When
+ * not provided, p defaults to n, so every token is merged.
+ * @param output_scale Optional scalar tensor for FP8 static quantization.
+ * When provided, output must be FP8 dtype.
*/
template
-void merge_attn_states_launcher(torch::Tensor& output,
- std::optional output_lse,
- const torch::Tensor& prefix_output,
- const torch::Tensor& prefix_lse,
- const torch::Tensor& suffix_output,
- const torch::Tensor& suffix_lse) {
+void merge_attn_states_launcher(
+ torch::Tensor& output, std::optional output_lse,
+ const torch::Tensor& prefix_output, const torch::Tensor& prefix_lse,
+ const torch::Tensor& suffix_output, const torch::Tensor& suffix_lse,
+ const std::optional prefill_tokens_with_context,
+ const std::optional& output_scale) {
constexpr uint NUM_THREADS = 128;
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
const uint prefix_head_stride = prefix_output.stride(1);
const uint output_head_stride = output.stride(1);
+ // Thread mapping is based on the input (scalar_t) pack_size, not output_t
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
+
+ const uint prefix_num_tokens =
+ prefill_tokens_with_context.has_value()
+ ? static_cast(prefill_tokens_with_context.value())
+ : num_tokens;
+ TORCH_CHECK(prefix_num_tokens <= num_tokens,
+ "prefix_num_tokens must be <= num_tokens");
+
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr();
}
+ float* output_scale_ptr = nullptr;
+ if (output_scale.has_value()) {
+ output_scale_ptr = output_scale.value().data_ptr();
+ }
// Process one pack elements per thread. for float, the
// pack_size is 4 for half/bf16, the pack_size is 8.
const uint threads_per_head = head_size / pack_size;
@@ -189,14 +287,22 @@ void merge_attn_states_launcher(torch::Tensor& output,
const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
auto stream = at::cuda::getCurrentCUDAStream();
- LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
+ if (output_scale.has_value()) {
+ // FP8 output path - dispatch on output FP8 type
+ VLLM_DISPATCH_FP8_TYPES(output.scalar_type(), "merge_attn_states_fp8", [&] {
+ LAUNCH_MERGE_ATTN_STATES(scalar_t, fp8_t, NUM_THREADS, true);
+ });
+ } else {
+ // Original BF16/FP16/FP32 output path
+ LAUNCH_MERGE_ATTN_STATES(scalar_t, scalar_t, NUM_THREADS, false);
+ }
}
-#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
- { \
- merge_attn_states_launcher(output, output_lse, prefix_output, \
- prefix_lse, suffix_output, \
- suffix_lse); \
+#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
+ { \
+ merge_attn_states_launcher( \
+ output, output_lse, prefix_output, prefix_lse, suffix_output, \
+ suffix_lse, prefill_tokens_with_context, output_scale); \
}
void merge_attn_states(torch::Tensor& output,
@@ -204,6 +310,21 @@ void merge_attn_states(torch::Tensor& output,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
- const torch::Tensor& suffix_lse) {
- DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
+ const torch::Tensor& suffix_lse,
+ std::optional prefill_tokens_with_context,
+ const std::optional& output_scale) {
+ if (output_scale.has_value()) {
+ TORCH_CHECK(output.scalar_type() == at::ScalarType::Float8_e4m3fn ||
+ output.scalar_type() == at::ScalarType::Float8_e4m3fnuz,
+ "output must be FP8 when output_scale is provided, got: ",
+ output.scalar_type());
+ } else {
+ TORCH_CHECK(output.scalar_type() == prefix_output.scalar_type(),
+ "output dtype (", output.scalar_type(),
+ ") must match prefix_output dtype (",
+ prefix_output.scalar_type(), ") when output_scale is not set");
+ }
+ // Always dispatch on prefix_output (input) dtype
+ DISPATCH_BY_SCALAR_DTYPE(prefix_output.dtype(),
+ CALL_MERGE_ATTN_STATES_LAUNCHER);
}
diff --git a/csrc/attention/vertical_slash_index.cu b/csrc/attention/vertical_slash_index.cu
deleted file mode 100644
index c1b45b143f4e..000000000000
--- a/csrc/attention/vertical_slash_index.cu
+++ /dev/null
@@ -1,401 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include
-
-#include
-
-#include
-
-__device__ int64_t save_blocks(int* block_offset, int64_t range_start,
- int64_t range_end, int64_t block_size,
- int64_t input_block_count, int64_t kv_seqlen) {
- if (range_start >= kv_seqlen) {
- return input_block_count;
- }
- if (range_end > kv_seqlen) {
- range_end = kv_seqlen;
- }
- int64_t current_block_count = input_block_count;
- for (int idx = range_start; idx < range_end; idx += block_size) {
- block_offset[current_block_count++] = idx;
- }
- return current_block_count;
-}
-
-__global__ void convert_vertical_slash_indexes_kernel(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
- int64_t NNZ_V, int64_t NNZ_S,
- bool causal // True for intra, False for succ
-) {
- const int batch_idx = blockIdx.y;
- const int head_idx = blockIdx.x;
- const int group_idx = blockIdx.z;
-
- int64_t q_seqlen = q_seqlens[batch_idx];
- int64_t kv_seqlen = kv_seqlens[batch_idx];
- int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x;
- int64_t start_m = block_idx_m * BLOCK_SIZE_M;
- if (start_m >= q_seqlen) {
- return;
- }
- int64_t end_m = start_m + BLOCK_SIZE_M;
- vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
- slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
- int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
- block_count += row_offset;
- block_offset += row_offset * NNZ_S;
- column_count += row_offset;
- column_index += row_offset * NNZ_V;
-
- bool has_slash = true;
- int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0;
- int64_t s = 0, v = 0;
- int64_t v_idx = vertical_indexes[v++];
- int64_t s_idx = slash_indexes[s++];
- if (causal) {
- while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false;
- s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M);
- } else {
- while (s_idx >= end_m + kv_seqlen && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + kv_seqlen) has_slash = false;
- s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M);
- }
-
- int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
- if (!has_slash) {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- }
-
- bool slash_finished = false;
- while (1) {
- if (v_idx < range_end) {
- if (v_idx < range_start) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- if (v < NNZ_V) {
- v_idx = vertical_indexes[v++];
- } else {
- if (causal)
- v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen);
- else
- v_idx = end_m + BLOCK_SIZE_N + kv_seqlen;
- }
- } else {
- if ((s < NNZ_S && causal) ||
- (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) {
- if (causal)
- s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++],
- BLOCK_SIZE_M);
- else
- s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M);
- } else {
- if (v == NNZ_V || (v_idx > range_start && causal)) {
- // add the last vertical if no more slash
- if (v == NNZ_V && !causal && v_idx < kv_seqlen) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- break;
- } else {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- // if slash_finished but there are vertical left, save current
- // blocks
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- slash_finished = true;
- }
- }
- if (!slash_finished) {
- if (s_idx > range_end + BLOCK_SIZE_M) {
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = s_idx - BLOCK_SIZE_M;
- range_end = s_idx;
- } else if (s_idx > range_end) {
- range_end += BLOCK_SIZE_M;
- }
- }
- }
- }
-
- block_count[0] = tmp_blk_cnt;
- column_count[0] = tmp_col_cnt;
-}
-
-void convert_vertical_slash_indexes_64x64(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t BATCH_SIZE, int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M,
- int64_t BLOCK_SIZE_N, int64_t NNZ_V, int64_t NNZ_S, bool causal) {
- const int N_THREADS = 64;
- const dim3 dimBlock(N_THREADS);
- const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
- convert_vertical_slash_indexes_kernel<<>>(
- q_seqlens, kv_seqlens, vertical_indexes, slash_indexes, block_count,
- block_offset, column_count, column_index, N_HEADS, N_ROWS, BLOCK_SIZE_M,
- BLOCK_SIZE_N, NNZ_V, NNZ_S, causal);
-}
-
-/**
- * Implements the Algorithm 4 in paper https://arxiv.org/abs/2407.02490.
- *
- * This function builds the index of each row of blocks from vertical indices
- * and slash indices. The vertical indices are treated as points, while the
- * slash indices are converted as ranges. The output consists of the merged
- * ranges and separate column indices, where the ranges are represented by
- * block indices.
- *
- * The implementation is referenced from the original MInference repo:
- * https://github.com/microsoft/MInference/blob/main/csrc/vertical_slash_index.cu.
- */
-void convert_vertical_slash_indexes(
- torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
- torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
- torch::Tensor q_seqlens, // [BATCH, ]
- torch::Tensor kv_seqlens, // [BATCH, ]
- torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int64_t context_size, int64_t block_size_M, int64_t block_size_N,
- bool causal) {
- cudaSetDevice(q_seqlens.get_device());
-
- int batch_size = slash_indexes.size(0);
- int num_heads = slash_indexes.size(1);
- int nnz_slash = slash_indexes.size(2);
- int nnz_vertical = vertical_indexes.size(2);
- int num_rows = (context_size + block_size_M - 1) / block_size_M;
-
- convert_vertical_slash_indexes_64x64(
- q_seqlens.data_ptr(), kv_seqlens.data_ptr(),
- vertical_indexes.data_ptr(), slash_indexes.data_ptr(),
- block_count.data_ptr(), block_offset.data_ptr(),
- column_count.data_ptr(), column_index.data_ptr(), batch_size,
- num_heads, num_rows, block_size_M, block_size_N, nnz_vertical, nnz_slash,
- causal);
-}
-
-__global__ void convert_vertical_slash_indexes_kernel_mergehead(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- const int* per_head_vertical_topkv, const int* per_head_slash_topkv,
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
- int64_t NNZ_V, int64_t NNZ_S,
- bool causal // True for intra, False for succ
-) {
- const int batch_idx = blockIdx.y;
- const int head_idx = blockIdx.x;
- const int group_idx = blockIdx.z;
-
- int64_t q_seqlen = q_seqlens[batch_idx];
- int64_t kv_seqlen = kv_seqlens[batch_idx];
- int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x;
- int64_t start_m = block_idx_m * BLOCK_SIZE_M;
- if (start_m >= q_seqlen) {
- return;
- }
- int64_t end_m = start_m + BLOCK_SIZE_M;
- vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
- slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
- int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
- block_count += row_offset;
- block_offset += row_offset * NNZ_S;
- column_count += row_offset;
- column_index += row_offset * NNZ_V;
-
- // MergeHead: each head has it's unique max topk NNZ_V,NNZ_S. (NNZ_V,NNZ_S
- // above is buffer size, use to compute offset)
- NNZ_S = per_head_slash_topkv[head_idx];
- NNZ_V = per_head_vertical_topkv[head_idx];
-
- bool has_slash = true;
- int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0;
- int64_t s = 0, v = 0;
- int64_t v_idx = vertical_indexes[v++];
- int64_t s_idx = slash_indexes[s++];
- if (causal) {
- while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false;
- s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M);
- } else {
- while (s_idx >= end_m + kv_seqlen && s < NNZ_S) {
- s_idx = slash_indexes[s++];
- }
- if (s_idx > end_m + kv_seqlen) has_slash = false;
- s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M);
- }
-
- int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
- if (!has_slash) {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- }
-
- bool slash_finished = false;
- while (1) {
- if (v_idx < range_end) {
- if (v_idx < range_start) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- if (v < NNZ_V) {
- v_idx = vertical_indexes[v++];
- } else {
- if (causal)
- v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen);
- else
- v_idx = end_m + BLOCK_SIZE_N + kv_seqlen;
- }
- } else {
- if ((s < NNZ_S && causal) ||
- (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) {
- if (causal)
- s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++],
- BLOCK_SIZE_M);
- else
- s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M);
- } else {
- if (v == NNZ_V || (v_idx > range_start && causal)) {
- // add the last vertical if no more slash
- if (v == NNZ_V && !causal && v_idx < kv_seqlen) {
- column_index[tmp_col_cnt++] = v_idx;
- }
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- break;
- } else {
- if (causal) {
- range_start = (kv_seqlen - q_seqlen) + end_m;
- range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
- } else {
- // if slash_finished but there are vertical left, save current
- // blocks
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = kv_seqlen;
- range_end = kv_seqlen + BLOCK_SIZE_N;
- }
- slash_finished = true;
- }
- }
- if (!slash_finished) {
- if (s_idx > range_end + BLOCK_SIZE_M) {
- tmp_blk_cnt = save_blocks(block_offset, range_start, range_end,
- BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
- range_start = s_idx - BLOCK_SIZE_M;
- range_end = s_idx;
- } else if (s_idx > range_end) {
- range_end += BLOCK_SIZE_M;
- }
- }
- }
- }
-
- block_count[0] = tmp_blk_cnt;
- column_count[0] = tmp_col_cnt;
-}
-
-void convert_vertical_slash_indexes_64x64_mergehead(
- const int* q_seqlens, // [BATCH, ]
- const int* kv_seqlens, // [BATCH, ]
- const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- int* per_head_vertical_topkv, int* per_head_slash_topkv,
- int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
- int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
- int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
- int64_t BATCH_SIZE, int64_t N_HEADS, int64_t N_ROWS, int64_t BLOCK_SIZE_M,
- int64_t BLOCK_SIZE_N, int64_t NNZ_V, int64_t NNZ_S, bool causal) {
- const int N_THREADS = 64;
- const dim3 dimBlock(N_THREADS);
- const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
- convert_vertical_slash_indexes_kernel_mergehead<<>>(
- q_seqlens, kv_seqlens, vertical_indexes, slash_indexes,
- per_head_vertical_topkv, per_head_slash_topkv, block_count, block_offset,
- column_count, column_index, N_HEADS, N_ROWS, BLOCK_SIZE_M, BLOCK_SIZE_N,
- NNZ_V, NNZ_S, causal);
-}
-
-/**
- * Implements the Algorithm 4 in paper https://arxiv.org/abs/2407.02490.
- *
- * Like the above convert_vertical_slash_indexes, but with
- * pre-computed vertical and slash counts.
- */
-void convert_vertical_slash_indexes_mergehead(
- torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
- torch::Tensor& column_count, // [BATCH, N_HEADS, NUM_ROWS]
- torch::Tensor& column_index, // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
- torch::Tensor q_seqlens, // [BATCH, ]
- torch::Tensor kv_seqlens, // [BATCH, ]
- torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
- torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S]
- torch::Tensor vertical_indices_count, // [N_HEADS, ]
- torch::Tensor slash_indices_count, // [N_HEADS, ]
- int64_t context_size, int64_t block_size_M, int64_t block_size_N,
- bool causal) {
- cudaSetDevice(q_seqlens.get_device());
-
- int batch_size = slash_indexes.size(0);
- int num_heads = slash_indexes.size(1);
- int nnz_slash = slash_indexes.size(2);
- int nnz_vertical = vertical_indexes.size(2);
- int num_rows = (context_size + block_size_M - 1) / block_size_M;
-
- convert_vertical_slash_indexes_64x64_mergehead(
- q_seqlens.data_ptr(), kv_seqlens.data_ptr(),
- vertical_indexes.data_ptr(), slash_indexes.data_ptr(),
- vertical_indices_count.data_ptr(),
- slash_indices_count.data_ptr(), block_count.data_ptr(),
- block_offset.data_ptr(), column_count.data_ptr(),
- column_index.data_ptr(), batch_size, num_heads, num_rows,
- block_size_M, block_size_N, nnz_vertical, nnz_slash, causal);
-}
diff --git a/csrc/cache.h b/csrc/cache.h
index 0188a568edc7..a9e74b0dc2df 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -10,6 +10,11 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes,
const torch::Tensor& block_mapping);
+void swap_blocks_batch(const torch::Tensor& src_ptrs,  // int64, CPU, len n
+ const torch::Tensor& dst_ptrs,  // int64, CPU, len n
+ const torch::Tensor& sizes,  // int64, CPU, len n
+ bool is_src_access_order_any);
+
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 2b3906df9ec5..9130dd2ccae7 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -24,6 +24,8 @@
#ifdef USE_ROCM
#include
typedef __hip_bfloat16 __nv_bfloat16;
+#else
+ #include
#endif
#if defined(__gfx942__)
@@ -73,6 +75,104 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
}
}
+// Enqueues n independent memory copies on the current CUDA/HIP stream.
+//
+// Args:
+//   src_ptrs, dst_ptrs, sizes: contiguous 1-D int64 CPU tensors of equal
+//     length n. Element i holds the raw source address, raw destination
+//     address and byte count of copy i.
+//   is_src_access_order_any: on the cuMemcpyBatchAsync path, selects
+//     CU_MEMCPY_SRC_ACCESS_ORDER_ANY instead of ..._STREAM. It is not
+//     consulted on the ROCm and per-copy fallback paths.
+//
+// The copies are submitted through the *Async APIs; this function does not
+// synchronize the stream. Any submission failure raises via TORCH_CHECK.
+void swap_blocks_batch(const torch::Tensor& src_ptrs,
+                       const torch::Tensor& dst_ptrs,
+                       const torch::Tensor& sizes,
+                       bool is_src_access_order_any) {
+  TORCH_CHECK(src_ptrs.device().is_cpu(), "src_ptrs must be on CPU");
+  TORCH_CHECK(dst_ptrs.device().is_cpu(), "dst_ptrs must be on CPU");
+  TORCH_CHECK(sizes.device().is_cpu(), "sizes must be on CPU");
+  TORCH_CHECK(src_ptrs.dtype() == torch::kInt64, "src_ptrs must be int64");
+  TORCH_CHECK(dst_ptrs.dtype() == torch::kInt64, "dst_ptrs must be int64");
+  TORCH_CHECK(sizes.dtype() == torch::kInt64, "sizes must be int64");
+  // The raw data pointers are walked linearly below, so each tensor has to be
+  // a dense 1-D array.
+  TORCH_CHECK(src_ptrs.dim() == 1 && src_ptrs.is_contiguous(),
+              "src_ptrs must be a contiguous 1-D tensor");
+  TORCH_CHECK(dst_ptrs.dim() == 1 && dst_ptrs.is_contiguous(),
+              "dst_ptrs must be a contiguous 1-D tensor");
+  TORCH_CHECK(sizes.dim() == 1 && sizes.is_contiguous(),
+              "sizes must be a contiguous 1-D tensor");
+
+  const int64_t n = src_ptrs.size(0);
+  TORCH_CHECK(dst_ptrs.size(0) == n, "dst_ptrs length must match src_ptrs");
+  TORCH_CHECK(sizes.size(0) == n, "sizes length must match src_ptrs");
+
+  if (n == 0) return;
+
+  // The batch driver entry points take non-const arrays, hence the mutable
+  // accessors on const tensors.
+  int64_t* src_data = src_ptrs.mutable_data_ptr<int64_t>();
+  int64_t* dst_data = dst_ptrs.mutable_data_ptr<int64_t>();
+  int64_t* size_data = sizes.mutable_data_ptr<int64_t>();
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // Use cuMemcpyBatchAsync / hipMemcpyBatchAsync to submit all copies in a
+  // single driver call, amortizing per-copy submission overhead. int64_t
+  // and CUdeviceptr/void*/size_t are all 8 bytes on 64-bit platforms, so we
+  // reinterpret_cast the tensor data directly to avoid copies.
+  static_assert(sizeof(size_t) == sizeof(int64_t));
+#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
+  // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
+  // binaries compiled with CUDA 12.8+ still work on older drivers, and
+  // we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
+  // The function pointer is cached after the first call.
+  using BatchFn =
+      CUresult (*)(CUdeviceptr*, CUdeviceptr*, size_t*, size_t,
+                   CUmemcpyAttributes*, size_t*, size_t, size_t*, CUstream);
+  static BatchFn batch_fn = []() -> BatchFn {
+    CUdriverProcAddressQueryResult sym_status;
+    void* fn_ptr = nullptr;
+    CUresult res = cuGetProcAddress("cuMemcpyBatchAsync", &fn_ptr, 12080,
+                                    CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+    if (res != CUDA_SUCCESS || fn_ptr == nullptr) {
+      return nullptr;
+    }
+    return reinterpret_cast<BatchFn>(fn_ptr);
+  }();
+
+  if (batch_fn != nullptr) {
+    CUmemcpyAttributes attr = {};
+    // ANY lets the DMA engine prefetch source bytes out of stream order,
+    // which is only safe when no GPU stream is concurrently writing the
+    // source.
+    attr.srcAccessOrder = is_src_access_order_any
+                              ? CU_MEMCPY_SRC_ACCESS_ORDER_ANY
+                              : CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
+    // A single attribute entry starting at copy 0 applies to every copy.
+    size_t attrs_idx = 0;
+    size_t fail_idx = 0;
+    CUresult result = batch_fn(reinterpret_cast<CUdeviceptr*>(dst_data),
+                               reinterpret_cast<CUdeviceptr*>(src_data),
+                               reinterpret_cast<size_t*>(size_data),
+                               static_cast<size_t>(n), &attr, &attrs_idx, 1,
+                               &fail_idx, static_cast<CUstream>(stream));
+    TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
+                fail_idx, " with error ", result);
+    return;
+  }
+#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000
+  // ROCm 7.1+ exposes hipMemcpyBatchAsync. The 7.2.1 implementation early-
+  // returns hipErrorNotSupported whenever numAttrs > 0 (see ROCm/clr @
+  // rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so call with
+  // numAttrs=0.
+  {
+    hipMemcpyAttributes attr = {};
+    size_t attrs_idx = 0;
+    size_t fail_idx = 0;
+    hipError_t result = hipMemcpyBatchAsync(
+        reinterpret_cast<void**>(dst_data), reinterpret_cast<void**>(src_data),
+        reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
+        &attrs_idx, 0, &fail_idx, static_cast<hipStream_t>(stream));
+    TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ",
+                fail_idx, " with error ", result);
+    return;
+  }
+#endif
+  {
+    // Fallback for CUDA < 12.8, older CUDA drivers, and ROCm < 7.1:
+    // individual async copies. cudaMemcpyDefault lets the driver infer
+    // direction from pointer types.
+    for (int64_t i = 0; i < n; i++) {
+      const cudaError_t err = cudaMemcpyAsync(
+          reinterpret_cast<void*>(dst_data[i]),
+          reinterpret_cast<const void*>(src_data[i]),
+          static_cast<size_t>(size_data[i]), cudaMemcpyDefault, stream);
+      // Surface submission failures instead of silently dropping a copy,
+      // mirroring the checks on the batched paths above.
+      TORCH_CHECK(err == cudaSuccess, "cudaMemcpyAsync failed at index ", i,
+                  " with error ", cudaGetErrorString(err));
+    }
+  }
+}
+
namespace vllm {
// Grid: (num_layers, num_pairs)
@@ -523,6 +623,11 @@ __global__ void cp_gather_indexer_k_quant_cache_kernel(
const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
// Find batch index within a block
__shared__ int batch_idx[BLOCK_Y_SIZE];
+ if (threadIdx.x == 0) {
+ batch_idx[threadIdx.y] = -1;
+ }
+ __syncthreads();
+
for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
iter++) {
int tid = iter * blockDim.x + threadIdx.x;
@@ -535,16 +640,18 @@ __global__ void cp_gather_indexer_k_quant_cache_kernel(
}
}
-#ifndef USE_ROCM
- __syncwarp();
-#endif
+ __syncthreads();
- if (head_idx >= head_dim || token_idx >= num_tokens) {
+ // num_tokens may be an allocation upper bound when Python avoids a D2H sync.
+ // Only tokens covered by the exact device-side cu_seq_lens are valid to
+ // gather.
+ const int batch = batch_idx[threadIdx.y];
+ if (head_idx >= head_dim || token_idx >= num_tokens || batch < 0) {
return;
}
- const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
- const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
- inbatch_seq_idx / cache_block_size];
+ const int inbatch_seq_idx = token_idx - cu_seq_lens[batch];
+ const int block_idx =
+ block_table[batch * num_blocks + inbatch_seq_idx / cache_block_size];
const int64_t src_block_offset = block_idx * block_stride;
const int64_t cache_inblock_offset =
(inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
@@ -648,6 +755,28 @@ void reshape_and_cache_flash(
int num_tokens = slot_mapping.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
+
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ if (kv_cache_dtype == "nvfp4") {
+#if defined(ENABLE_NVFP4_SM100) || defined(ENABLE_NVFP4_SM120)
+ // NVFP4 dispatch is compiled separately for SM100+.
+ extern void reshape_and_cache_nvfp4_dispatch(
+ torch::Tensor & key, torch::Tensor & value, torch::Tensor & key_cache,
+ torch::Tensor & value_cache, torch::Tensor & slot_mapping,
+ torch::Tensor & k_scale, torch::Tensor & v_scale);
+ reshape_and_cache_nvfp4_dispatch(key, value, key_cache, value_cache,
+ slot_mapping, k_scale, v_scale);
+ return;
+#else
+ TORCH_CHECK(false,
+ "NVFP4 KV cache requires SM100+ (Blackwell). "
+ "Please rebuild vllm with a Blackwell-compatible CUDA target.");
+#endif
+ }
+
+ // Original FP8/auto path.
int block_size = key_cache.size(1);
int64_t key_stride = key.stride(0);
@@ -665,8 +794,6 @@ void reshape_and_cache_flash(
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512));
- const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
CALL_RESHAPE_AND_CACHE_FLASH);
@@ -1394,6 +1521,9 @@ void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
+ TORCH_CHECK(ql_nope.scalar_type() == at::ScalarType::Half ||
+ ql_nope.scalar_type() == at::ScalarType::BFloat16,
+ "ql_nope must be float16 or bfloat16 dtype");
if (num_tokens == 0) return;
@@ -1405,7 +1535,7 @@ void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
+ VLLM_DISPATCH_HALF_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
vllm::ConcatMLAQKernel<<>>(
q_out.data_ptr(), ql_nope.data_ptr(),
q_pe.data_ptr(), num_tokens, num_heads, q_out.stride(0),
diff --git a/csrc/cache_kernels_fused.cu b/csrc/cache_kernels_fused.cu
index be037b2fdec2..8687ebe1f14c 100644
--- a/csrc/cache_kernels_fused.cu
+++ b/csrc/cache_kernels_fused.cu
@@ -21,28 +21,33 @@ namespace vllm {
// NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's
// using u16 as the backing type.
-template
+template
__global__ void concat_and_cache_mla_rope_fused_kernel(
const int64_t* __restrict__ positions, // [num_tokens]
qk_t* __restrict__ q_pe, // [num_tokens, num_q_heads, rot_dim]
qk_t* __restrict__ k_pe, // [num_tokens, rot_dim]
const qk_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
- const qk_t* __restrict__ rope_cos_sin_cache, // [max_position, 2,
- // rot_dim // 2]
+ const cos_sin_t* __restrict__ rope_cos_sin_cache, // [max_position, 2,
+ // rot_dim // 2]
const int rot_dim, const int64_t q_pe_stride_token,
const int64_t q_pe_stride_head, const int64_t k_pe_stride,
const int64_t kv_c_stride, const int num_q_heads,
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// rot_dim)]
- const int64_t* __restrict__ kv_cache_slot_mapping, // [num_tokens]
+ const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, const int entry_stride, const int kv_lora_rank,
const int block_size, const float* kv_cache_quant_scale) {
// Each thread block is responsible for one token.
const int64_t token_idx = blockIdx.x;
+ const int64_t slot_idx = slot_mapping[token_idx];
+ // NOTE: slot_idx can be -1 if the token is padded
+ if (slot_idx < 0) {
+ return;
+ }
const int64_t pos = positions[token_idx];
- const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
+ const cos_sin_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
const int embed_dim = rot_dim / 2;
@@ -54,8 +59,8 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
// NOTE: Would be nice to have interleaved sin/cos so we could just load
// both at the same time.
- qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
- qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+ qk_t cos = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx));
+ qk_t sin = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim));
qk_t* q_pe_head_ptr =
q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head;
@@ -81,21 +86,15 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
q_pe_head_ptr[pair_idx_y] = y_dst;
}
- const int64_t slot_idx = kv_cache_slot_mapping[token_idx];
const int64_t block_idx = slot_idx / block_size;
const int64_t entry_idx = slot_idx % block_size;
- // NOTE: slot_idx can be -1 if the token is padded
- if (slot_idx < 0) {
- return;
- }
-
// K with 1 HEAD
for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) {
int pair_idx = i;
- qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
- qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+ qk_t cos = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx));
+ qk_t sin = static_cast(VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim));
qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride;
@@ -165,36 +164,43 @@ __global__ void concat_and_cache_mla_rope_fused_kernel(
} // namespace vllm
-#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE) \
- do { \
- VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \
- using qk_t = scalar_t; \
- if (rope_is_neox) { \
- vllm::concat_and_cache_mla_rope_fused_kernel \
- <<>>( \
- positions.data_ptr(), q_pe.data_ptr(), \
- k_pe.data_ptr(), kv_c.data_ptr(), \
- rope_cos_sin_cache.data_ptr(), rot_dim, \
- q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
- num_q_heads, reinterpret_cast(kv_cache.data_ptr()), \
- kv_cache_slot_mapping.data_ptr(), block_stride, \
- entry_stride, kv_lora_rank, block_size, \
- kv_cache_quant_scale.data_ptr()); \
- } else { \
- vllm::concat_and_cache_mla_rope_fused_kernel \
- <<>>( \
- positions.data_ptr(), q_pe.data_ptr(), \
- k_pe.data_ptr(), kv_c.data_ptr(), \
- rope_cos_sin_cache.data_ptr(), rot_dim, \
- q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
- num_q_heads, reinterpret_cast(kv_cache.data_ptr()), \
- kv_cache_slot_mapping.data_ptr(), block_stride, \
- entry_stride, kv_lora_rank, block_size, \
- kv_cache_quant_scale.data_ptr()); \
- } \
- }); \
+#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE) \
+ do { \
+ VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \
+ using qk_t = scalar_t; /* scalar_t from the dispatch on q_pe's dtype */ \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ rope_cos_sin_cache.scalar_type(), "rope_cos_sin_cache_scalar_type", \
+ [&] { \
+ using cos_sin_t = scalar_t; /* inner dispatch: cos/sin cache dtype */ \
+ if (rope_is_neox) { /* runtime flag picks the bool template arg */ \
+ vllm::concat_and_cache_mla_rope_fused_kernel< \
+ qk_t, cos_sin_t, true, RAW_KV_T, CACHE_T, KV_DTYPE> \
+ <<>>( \
+ positions.data_ptr(), q_pe.data_ptr(), \
+ k_pe.data_ptr(), kv_c.data_ptr(), \
+ rope_cos_sin_cache.data_ptr(), rot_dim, \
+ q_pe_stride_token, q_pe_stride_head, k_pe_stride, \
+ kv_c_stride, num_q_heads, \
+ reinterpret_cast(kv_cache.data_ptr()), \
+ slot_mapping.data_ptr(), block_stride, \
+ entry_stride, kv_lora_rank, block_size, \
+ kv_cache_quant_scale.data_ptr()); \
+ } else { \
+ vllm::concat_and_cache_mla_rope_fused_kernel< \
+ qk_t, cos_sin_t, false, RAW_KV_T, CACHE_T, KV_DTYPE> \
+ <<>>( \
+ positions.data_ptr(), q_pe.data_ptr(), \
+ k_pe.data_ptr(), kv_c.data_ptr(), \
+ rope_cos_sin_cache.data_ptr(), rot_dim, \
+ q_pe_stride_token, q_pe_stride_head, k_pe_stride, \
+ kv_c_stride, num_q_heads, \
+ reinterpret_cast(kv_cache.data_ptr()), \
+ slot_mapping.data_ptr(), block_stride, \
+ entry_stride, kv_lora_rank, block_size, \
+ kv_cache_quant_scale.data_ptr()); \
+ } \
+ }); \
+ }); \
} while (false)
// Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache.
@@ -208,43 +214,52 @@ void concat_and_cache_mla_rope_fused(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& rope_cos_sin_cache, // [max_position, rot_dim]
bool rope_is_neox,
- torch::Tensor&
- kv_cache_slot_mapping, // [num_tokens] or [num_actual_tokens]
+ torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
torch::Tensor&
kv_cache, // [num_blocks, block_size, (kv_lora_rank + rot_dim)]
const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) {
- const int64_t num_tokens = q_pe.size(0);
+ // NOTE(woosuk): In vLLM V1, query/key/position.size(0) can be different from
+ // slot_mapping.size(0) because of padding for CUDA graphs.
+ // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+ // both include padding.
+ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+ // since key includes padding for CUDA graphs, while slot_mapping does not.
+ // In this case, slot_mapping.size(0) represents the actual number of tokens
+ // before padding.
+ // For compatibility with both cases, we use slot_mapping.size(0) as the
+ // number of tokens.
+ int num_tokens = slot_mapping.size(0);
+ int num_padded_tokens = q_pe.size(0);
+ TORCH_CHECK_GE(num_padded_tokens, num_tokens);
const int num_q_heads = q_pe.size(1);
const int rot_dim = q_pe.size(2);
const int kv_lora_rank = kv_c.size(1);
- TORCH_CHECK(positions.size(0) >=
- num_tokens); // CUDA Graphs might pad this for us
+ TORCH_CHECK_EQ(positions.size(0), num_padded_tokens);
TORCH_CHECK_EQ(positions.dim(), 1);
TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long);
- TORCH_CHECK_EQ(q_pe.size(0), num_tokens);
+ TORCH_CHECK_EQ(q_pe.dim(), 3);
+ TORCH_CHECK_EQ(q_pe.size(0), num_padded_tokens);
TORCH_CHECK_EQ(q_pe.size(1), num_q_heads);
TORCH_CHECK_EQ(q_pe.size(2), rot_dim);
- TORCH_CHECK_EQ(q_pe.dim(), 3);
- TORCH_CHECK_EQ(k_pe.size(0), num_tokens);
- TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
TORCH_CHECK_EQ(k_pe.dim(), 2);
+ TORCH_CHECK_EQ(k_pe.size(0), num_padded_tokens);
+ TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type());
- TORCH_CHECK_EQ(kv_c.size(0), num_tokens);
- TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
TORCH_CHECK_EQ(kv_c.dim(), 2);
+ TORCH_CHECK_EQ(kv_c.size(0), num_padded_tokens);
+ TORCH_CHECK_EQ(kv_c.size(1), kv_lora_rank);
TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type());
TORCH_CHECK_EQ(kv_c.dtype(), q_pe.dtype());
TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim);
- TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type());
- TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens);
- TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long);
+ TORCH_CHECK_EQ(slot_mapping.size(0), num_tokens);
+ TORCH_CHECK_EQ(slot_mapping.scalar_type(), c10::ScalarType::Long);
TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim);
TORCH_CHECK_EQ(kv_cache.dim(), 3);
diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp
index fffe96b86857..8273bc74b1ef 100644
--- a/csrc/core/batch_invariant.hpp
+++ b/csrc/core/batch_invariant.hpp
@@ -1,7 +1,6 @@
#pragma once
#include
#include
-#include
namespace vllm {
diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
index 68a8750f583b..b6f39ed795f3 100644
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@@ -1,7 +1,13 @@
#pragma once
-// For TORCH_CHECK
-#include
+#include
+#include
+#include
+#include
+#include
+
+// For STD_TORCH_CHECK
+#include
namespace vllm {
@@ -45,7 +51,7 @@ class ScalarType {
// IEEE 754 compliant floating point type
static constexpr ScalarType float_IEEE754(uint8_t exponent,
uint8_t mantissa) {
- TORCH_CHECK(mantissa > 0 && exponent > 0);
+ STD_TORCH_CHECK(mantissa > 0 && exponent > 0);
return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754);
}
@@ -53,11 +59,12 @@ class ScalarType {
static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa,
bool finite_values_only,
NanRepr nan_repr) {
- TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
- TORCH_CHECK(mantissa > 0 && exponent > 0);
- TORCH_CHECK(nan_repr != NAN_IEEE_754,
- "use `float_IEEE754` constructor for floating point types that "
- "follow IEEE 754 conventions");
+ STD_TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
+ STD_TORCH_CHECK(mantissa > 0 && exponent > 0);
+ STD_TORCH_CHECK(
+ nan_repr != NAN_IEEE_754,
+ "use `float_IEEE754` constructor for floating point types that "
+ "follow IEEE 754 conventions");
return ScalarType(exponent, mantissa, true, 0, finite_values_only,
nan_repr);
}
@@ -176,8 +183,8 @@ class ScalarType {
private:
double _floating_point_max() const {
- TORCH_CHECK(mantissa <= 52 && exponent <= 11,
- "Cannot represent max/min as a double for type ", str());
+ STD_TORCH_CHECK(mantissa <= 52 && exponent <= 11,
+ "Cannot represent max/min as a double for type ", str());
uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1;
if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) {
@@ -186,8 +193,8 @@ class ScalarType {
uint64_t max_exponent = (uint64_t(1) << exponent) - 2;
if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) {
- TORCH_CHECK(exponent < 11,
- "Cannot represent max/min as a double for type ", str());
+ STD_TORCH_CHECK(exponent < 11,
+ "Cannot represent max/min as a double for type ", str());
max_exponent += 1;
}
@@ -216,16 +223,17 @@ class ScalarType {
if (is_floating_point()) {
return {_floating_point_max()};
} else {
- TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
- "Cannot represent max as a int64_t");
+ STD_TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
+ "Cannot represent max as a int64_t");
return {(int64_t(1) << mantissa) - 1};
}
}
constexpr std::variant _raw_min() const {
if (is_floating_point()) {
- TORCH_CHECK(is_signed(),
- "We currently assume all floating point types are signed");
+ STD_TORCH_CHECK(
+ is_signed(),
+ "We currently assume all floating point types are signed");
constexpr uint64_t sign_bit_double = (uint64_t(1) << 63);
double max = _floating_point_max();
@@ -233,8 +241,8 @@ class ScalarType {
uint64_t min_raw = max_raw | sign_bit_double;
return {*reinterpret_cast(&min_raw)};
} else {
- TORCH_CHECK(!is_signed() || size_bits() <= 64,
- "Cannot represent min as a int64_t");
+ STD_TORCH_CHECK(!is_signed() || size_bits() <= 64,
+ "Cannot represent min as a int64_t");
if (is_signed()) {
// set the top bit to 1 (i.e. INT64_MIN) and the rest to 0
// then perform an arithmetic shift right to set all the bits above
diff --git a/csrc/cpu/activation_lut_bf16.cpp b/csrc/cpu/activation_lut_bf16.cpp
new file mode 100644
index 000000000000..0ff2567e1ee8
--- /dev/null
+++ b/csrc/cpu/activation_lut_bf16.cpp
@@ -0,0 +1,71 @@
+#include "cpu_types.hpp"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+constexpr uint32_t ActivationLutSize = 1u << 16;
+
+// Reference GELU used to populate the lookup table: forwards to at::gelu
+// with the "none" approximation mode.
+at::Tensor gelu_reference(const at::Tensor& x) {
+  const char* const approximate_mode = "none";
+  return at::gelu(x, approximate_mode);
+}
+
+// Fills `lut` exactly once (guarded by `once`) with one entry per 16-bit
+// pattern: entry i is the bf16-rounded bit pattern of activation(v), where v
+// is the fp32 value obtained from bit pattern i.
+//
+// lut        - destination with room for ActivationLutSize uint16_t entries.
+// once       - flag that makes later calls no-ops.
+// activation - reference implementation, evaluated on a CPU float tensor.
+void maybe_init_activation_lut_bf16(
+    uint16_t* lut, std::once_flag& once,
+    at::Tensor (*activation)(const at::Tensor&)) {
+  std::call_once(once, [&]() {
+    auto lut_input =
+        at::empty({static_cast<int64_t>(ActivationLutSize)},
+                  at::TensorOptions().device(at::kCPU).dtype(at::kFloat));
+    auto* lut_input_ptr = lut_input.data_ptr<float>();
+#pragma omp parallel for
+    for (uint32_t i = 0; i < ActivationLutSize; ++i) {
+      // The loop index is the raw 16-bit pattern being tabulated.
+      lut_input_ptr[i] = c10::detail::f32_from_bits(static_cast<uint16_t>(i));
+    }
+
+    // The result is read through a raw pointer below, so it must be a dense
+    // buffer covering the whole table.
+    auto lut_output = activation(lut_input).contiguous();
+    TORCH_CHECK(lut_output.numel() == static_cast<int64_t>(ActivationLutSize),
+                "activation must return one value per LUT entry");
+    const auto* lut_output_ptr = lut_output.data_ptr<float>();
+#pragma omp parallel for
+    for (uint32_t i = 0; i < ActivationLutSize; ++i) {
+      lut[i] = c10::detail::round_to_nearest_even(lut_output_ptr[i]);
+    }
+  });
+}
+
+// Applies a precomputed lookup table element-wise:
+// out[i] = lut[bit pattern of input[i]].
+//
+// out, input - contiguous bfloat16 tensors with the same number of elements.
+// lut        - table indexed by the 16-bit pattern of each input element.
+// op_name    - prefix used in error messages.
+void activation_lut_bf16(torch::Tensor& out, torch::Tensor& input,
+                         const uint16_t* lut, const char* op_name) {
+  TORCH_CHECK(input.scalar_type() == at::kBFloat16, op_name,
+              ": input must be bfloat16");
+  TORCH_CHECK(out.scalar_type() == at::kBFloat16, op_name,
+              ": out must be bfloat16");
+  TORCH_CHECK(input.is_contiguous(), op_name, ": input must be contiguous");
+  TORCH_CHECK(out.is_contiguous(), op_name, ": out must be contiguous");
+  // The loop writes input.numel() elements into out; a smaller out would be
+  // overrun.
+  TORCH_CHECK(out.numel() == input.numel(), op_name,
+              ": out and input must have the same number of elements");
+
+  // Reinterpret the bf16 storage as raw 16-bit patterns for table indexing.
+  const auto* src =
+      reinterpret_cast<const uint16_t*>(input.data_ptr());
+  auto* dst = reinterpret_cast<uint16_t*>(out.data_ptr());
+  const int64_t n = input.numel();
+
+  CPU_KERNEL_GUARD_IN(activation_lut_bf16_impl)
+#pragma omp parallel for
+  for (int64_t i = 0; i < n; ++i) {
+    dst[i] = lut[src[i]];
+  }
+  CPU_KERNEL_GUARD_OUT(activation_lut_bf16_impl)
+}
+
+// Applies the named activation to `input` through a lookup table and writes
+// the result to `out`. Only "gelu" is handled; any other name raises via
+// TORCH_CHECK.
+void activation_lut_bf16(torch::Tensor& out, torch::Tensor& input,
+                         const std::string& activation) {
+  if (activation == "gelu") {
+    // Function-local statics: the table is built on first use (guarded by
+    // `once`) and reused by later calls.
+    static std::array<uint16_t, ActivationLutSize> lut{};
+    static std::once_flag once;
+    maybe_init_activation_lut_bf16(lut.data(), once, gelu_reference);
+    activation_lut_bf16(out, input, lut.data(), "gelu_lut");
+    return;
+  }
+
+  TORCH_CHECK(false, "Unsupported activation: ", activation);
+}
diff --git a/csrc/cpu/cpu_arch_macros.h b/csrc/cpu/cpu_arch_macros.h
index c73b62ecdec9..53ae70497c0f 100644
--- a/csrc/cpu/cpu_arch_macros.h
+++ b/csrc/cpu/cpu_arch_macros.h
@@ -61,8 +61,23 @@
#endif
#ifdef __aarch64__
- // Implementation copied from Arm Optimized Routines (expf AdvSIMD)
+ // Implementation of neon_expf copied from Arm Optimized Routines (expf
+ // AdvSIMD)
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+ //
+ // Additional fast exponential intended for cases where outputs will be
+ // downcast to FP16 / BF16 (e.g. attention softmax). Accurate to within
+ // 1 ULP for both FP16 and BF16 for inputs in [-87.683, 88.376]; results for
+ // inputs outside this range are clamped to 0 / inf. Implementation is
+ // similar to exp_u20, but:
+ // - uses a third degree polynomial approximation for exp(r) instead of a
+ // fifth degree one, with coefficients re-tuned.
+ // - does not split ln2 into high / low parts
+ // - clamps exp(x) to 0 for x < -87.683113f and inf for x > 88.3762589f
+ // The computation is:
+ // exp(x) = 2^n * exp(r)
+ // n = round(x / ln2), r = x - n*ln2
+ // exp(r) ~ poly(r) = 1 + r + r^2 * (c3 + c2 * r)
#include
#define DEFINE_FAST_EXP \
const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \
@@ -106,8 +121,55 @@
result.val[2] = neon_expf(vec.reg.val[2]); \
result.val[3] = neon_expf(vec.reg.val[3]); \
return vec_op::FP32Vec16(result); \
- };
+ }; \
+ const float32x4_t lower_bound = vdupq_n_f32(-0x1.5ebb82p+6f); \
+ const float32x4_t upper_bound = vdupq_n_f32(0x1.61814ap+6f); \
+ constexpr float ln2 = 0x1.62e43p-1f; \
+ constexpr float f_c2 = 0x1.5592ecp-3f; \
+ const float32x4_t f_c3 = vdupq_n_f32(0x1.017d34p-1f); \
+ auto neon_expf_f16 = [&](float32x4_t values) __attribute__(( \
+ always_inline)) { \
+ const uint32x4_t lt_lower = vcltq_f32(values, lower_bound); \
+ const uint32x4_t gt_upper = vcgtq_f32(values, upper_bound); \
+ float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \
+ float32x4_t r = vfmsq_n_f32(values, n, ln2); \
+ uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
+ float32x4_t r2 = vmulq_f32(r, r); \
+ float32x4_t q = vfmaq_n_f32(f_c3, r, f_c2); \
+ float32x4_t s = vaddq_f32(vdupq_n_f32(1.0f), r); \
+ float32x4_t p = vfmaq_f32(s, q, r2); \
+ float32x4_t y = \
+ vreinterpretq_f32_u32(vaddq_u32(vreinterpretq_u32_f32(p), e)); \
+ y = vbslq_f32(lt_lower, vdupq_n_f32(0.0f), y); \
+ y = vbslq_f32(gt_upper, vdupq_n_f32(INFINITY), y); \
+ return y; \
+ }; \
+ auto fast_exp_f16 = [&](const vec_op::FP32Vec16& vec) \
+ __attribute__((always_inline)) { \
+ float32x4x4_t result; \
+ result.val[0] = neon_expf_f16(vec.reg.val[0]); \
+ result.val[1] = neon_expf_f16(vec.reg.val[1]); \
+ result.val[2] = neon_expf_f16(vec.reg.val[2]); \
+ result.val[3] = neon_expf_f16(vec.reg.val[3]); \
+ return vec_op::FP32Vec16(result); \
+ };
#endif // __aarch64__
+// RISC-V RVV: FAST_SPINNING (only with Zihintpause) and the fast_exp helper
+#ifdef __riscv_v
+ #include
+
+ #ifdef __riscv_zihintpause
+ #define FAST_SPINNING __riscv_pause();
+ #endif  // __riscv_zihintpause
+
+ // FP32Vec16::exp() in cpu_types_riscv.hpp already implements the full
+ // polynomial approximation for RVV, so we simply delegate to it.
+ #define DEFINE_FAST_EXP \
+ auto fast_exp = [&](const vec_op::FP32Vec16& vec) \
+ __attribute__((always_inline)) { return vec.exp(); };
+
+#endif // __riscv_v
+
#endif
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index a582b4b4d7cc..26b881f4f143 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -1,5 +1,16 @@
#include "cpu_attn_dispatch_generated.h"
+// Translates the kv_cache_dtype option string into the matching
+// cpu_attention::Fp8KVCacheDataType value:
+//   "fp8" or "fp8_e4m3"              -> kFp8E4M3
+//   "fp8_e5m2"                       -> kFp8E5M2
+//   anything else (including "auto") -> kAuto
+static inline cpu_attention::Fp8KVCacheDataType parse_fp8_kv_dtype(
+    const std::string& kv_cache_dtype) {
+  using KvType = cpu_attention::Fp8KVCacheDataType;
+  const bool wants_e4m3 =
+      (kv_cache_dtype == "fp8") || (kv_cache_dtype == "fp8_e4m3");
+  if (wants_e4m3) {
+    return KvType::kFp8E4M3;
+  }
+  return (kv_cache_dtype == "fp8_e5m2") ? KvType::kFp8E5M2 : KvType::kAuto;
+}
+
torch::Tensor get_scheduler_metadata(
const int64_t num_req, const int64_t num_heads_q,
const int64_t num_heads_kv, const int64_t head_dim,
@@ -18,6 +29,10 @@ torch::Tensor get_scheduler_metadata(
isa = cpu_attention::ISA::NEON;
} else if (isa_hint == "vxe") {
isa = cpu_attention::ISA::VXE;
+ } else if (isa_hint == "rvv") {
+ isa = cpu_attention::ISA::RVV;
+ } else if (isa_hint == "vsx") {
+ isa = cpu_attention::ISA::VSX;
} else {
TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
}
@@ -49,7 +64,7 @@ torch::Tensor get_scheduler_metadata(
input.enable_kv_split = enable_kv_split;
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
- CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
+ CPU_ATTN_DISPATCH(head_dim, isa, 0, [&]() {
input.elem_size = sizeof(scalar_t);
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
@@ -72,7 +87,9 @@ void cpu_attn_reshape_and_cache(
key_cache, // [num_blocks, num_kv_heads, block_size, head_size]
torch::Tensor&
value_cache, // [num_blocks, num_kv_heads, block_size, head_size]
- const torch::Tensor& slot_mapping, const std::string& isa) {
+ const torch::Tensor& slot_mapping, const std::string& isa,
+ const double k_scale = 1.0, const double v_scale = 1.0,
+ const std::string& kv_cache_dtype = "auto") {
TORCH_CHECK_EQ(key.dim(), 3);
TORCH_CHECK_EQ(value.dim(), 3);
TORCH_CHECK_EQ(key_cache.dim(), 4);
@@ -80,18 +97,30 @@ void cpu_attn_reshape_and_cache(
TORCH_CHECK_EQ(key.stride(2), 1);
TORCH_CHECK_EQ(value.stride(2), 1);
+ const int64_t kv_cache_idx =
+ static_cast(parse_fp8_kv_dtype(kv_cache_dtype));
+ const bool is_fp8 = (kv_cache_idx != 0);
+
+ if (is_fp8) {
+ TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+ "key_cache must be uint8 for FP8 path");
+ TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+ "value_cache must be uint8 for FP8 path");
+ TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+ TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+ }
+
+ const float k_inv = is_fp8 ? 1.0f / static_cast(k_scale) : 0.0f;
+ const float v_inv = is_fp8 ? 1.0f / static_cast(v_scale) : 0.0f;
+
const int64_t token_num = key.size(0);
- const int64_t key_token_num_stride = key.stride(0);
- const int64_t value_token_num_stride = value.stride(0);
- const int64_t head_num = value.size(1);
- const int64_t key_head_num_stride = key.stride(1);
- const int64_t value_head_num_stride = value.stride(1);
+ const int64_t head_num = key.size(1);
+ const int64_t head_dim = key.size(2);
const int64_t num_blocks = key_cache.size(0);
const int64_t num_blocks_stride = key_cache.stride(0);
const int64_t cache_head_num_stride = key_cache.stride(1);
const int64_t block_size = key_cache.size(2);
const int64_t block_size_stride = key_cache.stride(2);
- const int64_t head_dim = key.size(-1);
cpu_attention::ISA isa_tag = [&]() {
if (isa == "amx") {
@@ -104,21 +133,33 @@ void cpu_attn_reshape_and_cache(
return cpu_attention::ISA::NEON;
} else if (isa == "vxe") {
return cpu_attention::ISA::VXE;
+ } else if (isa == "rvv") {
+ return cpu_attention::ISA::RVV;
+ } else if (isa == "vsx") {
+ return cpu_attention::ISA::VSX;
} else {
TORCH_CHECK(false, "Invalid ISA type: " + isa);
}
}();
+ if (is_fp8) {
+ TORCH_CHECK(isa_tag == cpu_attention::ISA::AMX ||
+ isa_tag == cpu_attention::ISA::VEC,
+ "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+ }
+
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
- CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
+ CPU_ATTN_DISPATCH(head_dim, isa_tag, kv_cache_idx, [&]() {
+ using kv_t = typename attn_impl::kv_cache_t;
attn_impl::reshape_and_cache(
key.data_ptr(), value.data_ptr(),
- key_cache.data_ptr(), value_cache.data_ptr(),
- slot_mapping.data_ptr(), token_num, key_token_num_stride,
- value_token_num_stride, head_num, key_head_num_stride,
- value_head_num_stride, num_blocks, num_blocks_stride,
- cache_head_num_stride, block_size, block_size_stride);
+ reinterpret_cast(key_cache.data_ptr()),
+ reinterpret_cast(value_cache.data_ptr()),
+ slot_mapping.data_ptr(), token_num, key.stride(0),
+ value.stride(0), head_num, key.stride(1), value.stride(1),
+ num_blocks, num_blocks_stride, cache_head_num_stride, block_size,
+ block_size_stride, k_inv, v_inv);
});
});
}
@@ -137,13 +178,26 @@ void cpu_attention_with_kv_cache(
const int64_t sliding_window_left, const int64_t sliding_window_right,
const torch::Tensor& block_table, // [num_tokens, max_block_num]
const double softcap, const torch::Tensor& scheduler_metadata,
- const std::optional& s_aux // [num_heads]
-) {
+ const std::optional& s_aux, // [num_heads]
+ const double k_scale = 1.0, const double v_scale = 1.0,
+ const std::string& kv_cache_dtype = "auto") {
TORCH_CHECK_EQ(query.dim(), 3);
TORCH_CHECK_EQ(query.stride(2), 1);
TORCH_CHECK_EQ(key_cache.dim(), 4);
TORCH_CHECK_EQ(value_cache.dim(), 4);
+ const int64_t kv_cache_idx =
+ static_cast(parse_fp8_kv_dtype(kv_cache_dtype));
+ const bool is_fp8 = (kv_cache_idx != 0);
+ if (is_fp8) {
+ TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+ "key_cache must be uint8 for FP8 path");
+ TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+ "value_cache must be uint8 for FP8 path");
+ TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+ TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+ }
+
cpu_attention::AttentionInput input;
input.metadata = reinterpret_cast(
scheduler_metadata.data_ptr());
@@ -165,25 +219,32 @@ void cpu_attention_with_kv_cache(
input.block_table = block_table.data_ptr();
input.alibi_slopes =
alibi_slopes.has_value() ? alibi_slopes->data_ptr() : nullptr;
- // For now sink must be bf16
input.s_aux = s_aux.has_value() ? s_aux->data_ptr() : nullptr;
input.scale = scale;
input.causal = causal;
input.sliding_window_left = sliding_window_left;
input.sliding_window_right = sliding_window_right;
if (input.causal) {
- // to make boundary calculation easier
input.sliding_window_right = 0;
}
- float softcap_fp32 = softcap;
- input.softcap = softcap_fp32;
+ input.softcap = static_cast(softcap);
+
+ if (is_fp8) {
+ input.k_scale_fp8 = static_cast(k_scale);
+ input.v_scale_fp8 = static_cast(v_scale);
+ TORCH_CHECK(input.metadata->isa == cpu_attention::ISA::AMX ||
+ input.metadata->isa == cpu_attention::ISA::VEC,
+ "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+ }
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
- CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
- TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
- cpu_attention::AttentionMainLoop mainloop;
- mainloop(&input);
- });
+ CPU_ATTN_DISPATCH(
+ query.size(2), input.metadata->isa, kv_cache_idx, [&]() {
+ TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment,
+ 0);
+ cpu_attention::AttentionMainLoop mainloop;
+ mainloop(&input);
+ });
});
}
diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp
index 1c8644d52329..6a0341085dce 100644
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -1,6 +1,7 @@
#ifndef CPU_ATTN_AMX_HPP
#define CPU_ATTN_AMX_HPP
+#include "cpu_attn_fp8.hpp"
#include "cpu_attn_impl.hpp"
namespace cpu_attention {
@@ -21,9 +22,10 @@ typedef struct __tile_config {
// 2-2-4 pattern, for 16 < m <= 32
// TILE 0, 1: load A matrix, row num should be 16, m - 16
// TILE 2, 3: load B matrix, row num should be 16
-// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
-// - 16
-template
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16,
+// m - 16, m - 16
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template
class TileGemm224 {
public:
template
@@ -42,13 +44,56 @@ class TileGemm224 {
}
};
-template <>
-class TileGemm224 {
+// Dequantize one e4m3/e5m2 FP8 tile (AMX_TILE_ROW_NUM rows x 32 cols) to BF16.
+template
+FORCE_INLINE void deq_tile_amx(const uint8_t* src, c10::BFloat16* dst) {
+ for (int r = 0; r < AMX_TILE_ROW_NUM; ++r) {
+ if constexpr (std::is_same_v) {
+ vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e4m3_tag{})
+ .save(dst + r * 32);
+ } else {
+ vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e5m2_tag{})
+ .save(dst + r * 32);
+ }
+ }
+}
+
+// For FP8: dequantize src into scratch and return a pointer to scratch.
+// For BF16: return src cast to a BF16 pointer; scratch is left untouched.
+template
+FORCE_INLINE const c10::BFloat16* prepare_b_tile(const kv_cache_t* src,
+ c10::BFloat16* scratch) {
+ if constexpr (std::is_same_v ||
+ std::is_same_v) {
+ deq_tile_amx(reinterpret_cast(src), scratch);
+ return scratch;
+ } else {
+ return reinterpret_cast(src);
+ }
+}
+
+// Handles both BF16 and FP8 KV cache (2-2-4 pattern).
+template
+class TileGemm224 {
+ static_assert(std::is_same_v ||
+ std::is_same_v ||
+ std::is_same_v,
+ "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+ static constexpr bool fp8_kv =
+ std::is_same_v ||
+ std::is_same_v;
+
+ static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ // BF16 path: scratch_elems=1 so the scratch array is eliminated by the
+ // compiler.
+ static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
public:
template
FORCE_INLINE static void gemm(const int32_t m_size,
c10::BFloat16* __restrict__ a_tile,
- c10::BFloat16* __restrict__ b_tile,
+ kv_cache_t* __restrict__ b_tile,
float* __restrict__ c_tile, const int64_t lda,
const int64_t ldb, const int64_t ldc,
const int32_t block_size,
@@ -56,6 +101,7 @@ class TileGemm224 {
const bool accum_c) {
const int32_t k_times =
dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+
c10::BFloat16* __restrict__ a_tile_0 = a_tile;
c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
const int64_t a_tile_stride = [&]() {
@@ -70,8 +116,8 @@ class TileGemm224 {
}
}();
- c10::BFloat16* __restrict__ b_tile_2 = b_tile;
- c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+ kv_cache_t* __restrict__ b_tile_2 = b_tile;
+ kv_cache_t* __restrict__ b_tile_3 = [&]() {
if constexpr (phase == AttentionGemmPhase::QK) {
// k_cache is prepacked
return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
@@ -106,11 +152,16 @@ class TileGemm224 {
_tile_zero(7);
}
+ alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+ alignas(64) c10::BFloat16 scratch_3[scratch_elems];
for (int32_t k = 0; k < k_times; ++k) {
+ const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+ const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
_tile_loadd(0, a_tile_0, a_tile_stride);
- _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+ _tile_stream_loadd(2, const_cast(load_2), b_tile_stride);
_tile_dpbf16ps(4, 0, 2);
- _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+ _tile_stream_loadd(3, const_cast(load_3), b_tile_stride);
_tile_dpbf16ps(5, 0, 3);
_tile_loadd(1, a_tile_1, a_tile_stride);
_tile_dpbf16ps(6, 1, 2);
@@ -154,13 +205,13 @@ class TileGemm224 {
};
// 1-2-2 pattern, for 0 < m <= 16
-// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
-// m, m
-// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
-// num should be 16
-// TILE 6, 7, (6, 7): store results C matrix, row num should be
-// m
-template
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should
+// be m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row num
+// should be 16
+// TILE 6, 7: store results C matrix, row num should be m
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template
class TileGemm122 {
public:
template
@@ -179,13 +230,26 @@ class TileGemm122 {
}
};
-template <>
-class TileGemm122