diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index b199e554a739..21ffa1b9b8d7 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -8,8 +8,9 @@ run_all_patterns: - "CMakeLists.txt" - "requirements/common.txt" - "requirements/cuda.txt" - - "requirements/build.txt" - - "requirements/test.txt" + - "requirements/kv_connectors.txt" + - "requirements/build/cuda.txt" + - "requirements/test/cuda.txt" - "setup.py" - "csrc/" - "cmake/" diff --git a/.buildkite/ci_config_intel.yaml b/.buildkite/ci_config_intel.yaml index 375be84a396a..a1c0091e0f10 100644 --- a/.buildkite/ci_config_intel.yaml +++ b/.buildkite/ci_config_intel.yaml @@ -6,8 +6,8 @@ run_all_patterns: - "CMakeLists.txt" - "requirements/common.txt" - "requirements/xpu.txt" - - "requirements/build.txt" - - "requirements/test.txt" + - "requirements/build/cuda.txt" + - "requirements/test/cuda.txt" - "setup.py" - "csrc/" - "cmake/" diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 23a23723ad93..0c514647dc2b 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -20,11 +20,3 @@ steps: - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}" env: DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 1 - - exit_status: -10 # Agent was lost - limit: 1 - - exit_status: 1 # Machine occasionally fail - limit: 1 diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index acca2b368858..19716bab6de5 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -12,13 +12,19 @@ steps: - vllm/_custom_ops.py - tests/kernels/attention/test_cpu_attn.py - tests/kernels/moe/test_cpu_fused_moe.py + - tests/kernels/moe/test_cpu_quant_fused_moe.py - tests/kernels/test_onednn.py + - tests/kernels/test_awq_int4_to_int8.py + - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py - pytest -x -v -s tests/kernels/test_onednn.py" + pytest -x -v -s tests/kernels/moe/test_cpu_quant_fused_moe.py + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py + pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py" - label: CPU-Compatibility Tests depends_on: [] @@ -44,10 +50,24 @@ steps: - tests/models/language/pooling/ commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 40m " pytest -x -v -s tests/models/language/generation -m cpu_model pytest -x -v -s tests/models/language/pooling -m cpu_model" +- label: CPU-ModelRunnerV2 Tests + depends_on: [] + device: intel_cpu + no_plugin: true + soft_fail: true + source_file_dependencies: + - vllm/v1/worker/cpu/ + - vllm/v1/worker/gpu/ + commands: + - | + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " + uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d + VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model" + - label: CPU-Quantization Model Tests depends_on: [] device: intel_cpu @@ -55,23 +75,24 @@ steps: source_file_dependencies: - csrc/cpu/ - vllm/model_executor/layers/quantization/cpu_wna16.py - - vllm/model_executor/layers/quantization/gptq_marlin.py + - vllm/model_executor/layers/quantization/auto_gptq.py - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py + - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py - tests/quantization/test_compressed_tensors.py - tests/quantization/test_cpu_wna16.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs pytest -x -v -s tests/quantization/test_cpu_wna16.py" -- label: CPU-Distributed Tests +- label: CPU-Distributed Tests (PP+TP) depends_on: [] device: intel_cpu no_plugin: true - source_file_dependencies: + source_file_dependencies: &cpu_distributed_deps - csrc/cpu/shm.cpp - vllm/v1/worker/cpu_worker.py - vllm/v1/worker/gpu_worker.py @@ -80,10 +101,21 @@ steps: - vllm/platforms/cpu.py - vllm/distributed/parallel_state.py - vllm/distributed/device_communicators/cpu_communicator.py + - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh + commands: + - | + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m " + bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp" + +- label: CPU-Distributed Tests (DP+TP) + depends_on: [] + device: intel_cpu + no_plugin: true + source_file_dependencies: *cpu_distributed_deps commands: - | bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m " - bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh" + bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp" - label: CPU-Multi-Modal Model Tests %N depends_on: [] @@ -97,7 +129,7 @@ steps: - | bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m " pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB" - parallelism: 2 + parallelism: 3 - label: "Arm CPU Test" depends_on: [] diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml index ba0088b3af62..d70ce28428d4 100644 --- a/.buildkite/hardware_tests/intel.yaml +++ b/.buildkite/hardware_tests/intel.yaml @@ -8,10 +8,3 @@ steps: commands: - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh - - label: "Intel GPU Test" - depends_on: [] - soft_fail: true - device: intel_gpu - no_plugin: true - commands: - - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 9131dfc71a0a..10c03c3e1773 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -92,8 +92,8 @@ check_and_skip_if_image_exists() { } ecr_login() { - aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" - aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com + aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true + aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true } prepare_cache_tags() { @@ -192,6 +192,7 @@ export BUILDKITE_COMMIT export PARENT_COMMIT export IMAGE_TAG export IMAGE_TAG_LATEST +export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}" export CACHE_FROM export CACHE_FROM_BASE_BRANCH export CACHE_FROM_MAIN diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 42eaed7ddaa0..e0ef7d592424 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -6,6 +6,48 @@ steps: timeout_in_minutes: 600 commands: - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi + # Non-root smoke 1: the default (root) image must still be importable + # under a non-root UID via `--user 2000:0`. Validates the `vllm` passwd + # entry + group-0-writable /home/vllm + uv path cleanup from #31959. + # Uses `import vllm` rather than `vllm serve --help` because the latter + # instantiates `VllmConfig` which requires a GPU attached to the + # container. + - docker run --rm --user 2000:0 --entrypoint python3 "$IMAGE_TAG" -c "import vllm; print(vllm.__version__)" + # Non-root smoke 2: assert the non-root enabling invariants are baked + # into the image. Runs as UID 2000:0 via a shell so we can verify + # filesystem perms + passwd/group file state + wrapper presence without + # triggering vLLM's GPU-requiring config-init path. The opt-in + # `vllm-openai-nonroot` target adds only `USER vllm`, `WORKDIR + # /home/vllm`, and an `ENTRYPOINT` override on top of these invariants; + # its build correctness is reviewed at the Dockerfile level. Wrapper + # logic is covered separately by the pre-commit hook + # `test-nonroot-entrypoint` (see .pre-commit-config.yaml). + - | + docker run --rm --user 2000:0 --entrypoint /bin/sh "$IMAGE_TAG" -ec ' + if ! getent passwd 2000 | grep -q ^vllm:; then + echo FAIL: UID 2000 != vllm + exit 1 + fi + if ! id -gn 2>/dev/null | grep -qx root; then + echo FAIL: GID 0 not root group + exit 1 + fi + touch /home/vllm/.smoke && rm /home/vllm/.smoke + touch /opt/uv/cache/.smoke && rm /opt/uv/cache/.smoke + if ! test -x /usr/local/bin/vllm-nonroot-entrypoint.sh; then + echo FAIL: wrapper missing + exit 1 + fi + if ! test -w /etc/passwd; then + echo FAIL: /etc/passwd not group-writable + exit 1 + fi + if ! test -w /etc/group; then + echo FAIL: /etc/group not group-writable + exit 1 + fi + echo non-root invariants OK + ' retry: automatic: - exit_status: -1 # Agent was lost diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh index ccfe155fa2b7..035f070ab891 100755 --- a/.buildkite/image_build/image_build_cpu.sh +++ b/.buildkite/image_build/image_build_cpu.sh @@ -11,7 +11,7 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true # skip build if image already exists if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh index ff3d11c8d599..b561e2c2e463 100755 --- a/.buildkite/image_build/image_build_cpu_arm64.sh +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -11,7 +11,7 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true # skip build if image already exists if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh index 60fa1789fa06..df900dc60342 100755 --- a/.buildkite/image_build/image_build_hpu.sh +++ b/.buildkite/image_build/image_build_hpu.sh @@ -11,7 +11,7 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true # skip build if image already exists if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then diff --git a/.buildkite/image_build/image_build_torch_nightly.sh b/.buildkite/image_build/image_build_torch_nightly.sh new file mode 100755 index 000000000000..cbd08aa7bd0b --- /dev/null +++ b/.buildkite/image_build/image_build_torch_nightly.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -euo pipefail + +# Build a vLLM test image with PyTorch nightly installed. +# Called by the pipeline generator's "vLLM Against PyTorch Nightly" group. + +if [[ $# -lt 5 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 +BRANCH=$4 +IMAGE_TAG=$5 + +# --- Arguments --- +echo "--- :mag: Arguments" +echo "REGISTRY: ${REGISTRY}" +echo "REPO: ${REPO}" +echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}" +echo "BRANCH: ${BRANCH}" +echo "IMAGE_TAG: ${IMAGE_TAG}" + +# --- ECR login --- +echo "--- :key: ECR login" +aws ecr-public get-login-password --region us-east-1 \ + | docker login --username AWS --password-stdin "$REGISTRY" +aws ecr get-login-password --region us-east-1 \ + | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com + +# --- Set up buildx --- +echo "--- :docker: Setting up buildx" +docker buildx create --name vllm-builder --driver docker-container --use || true +docker buildx inspect --bootstrap +docker buildx ls + +# --- Skip if image already exists --- +echo "--- :mag: Checking if image already exists" +if docker manifest inspect "$IMAGE_TAG" >/dev/null 2>&1; then + echo "Image found: $IMAGE_TAG — skipping build" + exit 0 +fi +echo "Image not found, proceeding with build..." + +# --- CUDA 13.0 for nightly builds --- +# Nightly CI uses CUDA 13.0 while regular CI stays on CUDA 12.9 +NIGHTLY_CUDA_VERSION="13.0.2" +NIGHTLY_BUILD_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-devel-ubuntu22.04" +NIGHTLY_FINAL_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-base-ubuntu22.04" + +echo "--- :docker: Building torch nightly image (CUDA ${NIGHTLY_CUDA_VERSION})" +docker buildx build --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit="$BUILDKITE_COMMIT" \ + --build-arg USE_SCCACHE=1 \ + --build-arg PYTORCH_NIGHTLY=1 \ + --build-arg CUDA_VERSION="${NIGHTLY_CUDA_VERSION}" \ + --build-arg BUILD_BASE_IMAGE="${NIGHTLY_BUILD_BASE_IMAGE}" \ + --build-arg FINAL_BASE_IMAGE="${NIGHTLY_FINAL_BASE_IMAGE}" \ + --build-arg torch_cuda_arch_list="8.0 8.9 9.0 10.0 12.0" \ + --tag "$IMAGE_TAG" \ + --push \ + --target test \ + --progress plain . + +echo "--- :white_check_mark: Torch nightly image build complete: $IMAGE_TAG" diff --git a/.buildkite/image_build/image_build_xpu.sh b/.buildkite/image_build/image_build_xpu.sh index c3734dce13ca..45417b7339be 100755 --- a/.buildkite/image_build/image_build_xpu.sh +++ b/.buildkite/image_build/image_build_xpu.sh @@ -11,8 +11,8 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" -aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true # skip build if image already exists if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then diff --git a/.buildkite/intel_jobs/engine_intel.yaml b/.buildkite/intel_jobs/engine_intel.yaml new file mode 100644 index 000000000000..c66576d40991 --- /dev/null +++ b/.buildkite/intel_jobs/engine_intel.yaml @@ -0,0 +1,21 @@ +group: Engine Intel +depends_on: + - image-build-xpu +steps: +- label: Engine (1 GPU) + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/v1/engine/ + - tests/v1/engine/ + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py' diff --git a/.buildkite/intel_jobs/kernels_intel.yaml b/.buildkite/intel_jobs/kernels_intel.yaml new file mode 100644 index 000000000000..66a8db25f02e --- /dev/null +++ b/.buildkite/intel_jobs/kernels_intel.yaml @@ -0,0 +1,21 @@ +group: Kernels Intel +depends_on: + - image-build-xpu +steps: +- label: vLLM IR Tests + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/ir + - vllm/kernels + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + pytest -v -s kernels/ir' diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml new file mode 100644 index 000000000000..32a56ef59b3f --- /dev/null +++ b/.buildkite/intel_jobs/lora_intel.yaml @@ -0,0 +1,135 @@ +group: LoRA Intel +depends_on: + - image-build-xpu +steps: +- label: LoRA Runtime + Utils + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_layers.py && + pytest -v -s lora/test_lora_checkpoints.py && + pytest -v -s lora/test_lora_functions.py && + pytest -v -s lora/test_lora_huggingface.py && + pytest -v -s lora/test_lora_manager.py && + pytest -v -s lora/test_lora_utils.py && + pytest -v -s lora/test_peft_helper.py && + pytest -v -s lora/test_resolver.py && + pytest -v -s lora/test_utils.py && + pytest -v -s lora/test_add_lora.py && + pytest -v -s lora/test_worker.py' + +- label: LoRA Fused/MoE Kernels + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_fused_moe_lora_kernel.py && + pytest -v -s lora/test_moe_lora_align_sum.py --deselect="tests/lora/test_moe_lora_align_sum.py::test_moe_lora_align_block_size_mixed_base_and_lora[1]"' + +- label: LoRA Punica Kernels + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + set -o pipefail && + pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"' + +- label: LoRA Punica FP8/XPU Ops + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_punica_ops_fp8.py && + pytest -v -s lora/test_punica_xpu_ops.py' + +- label: LoRA Models + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) && + pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" && + pytest -v -s lora/test_transformers_model.py && + pytest -v -s lora/test_chatglm3_tp.py && + pytest -s -v lora/test_minicpmv_tp.py' + +- label: LoRA Multimodal + timeout_in_minutes: 45 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + pytest -v -s lora/test_default_mm_loras.py && + pytest -v -s lora/test_whisper.py' diff --git a/.buildkite/intel_jobs/misc_intel.yaml b/.buildkite/intel_jobs/misc_intel.yaml new file mode 100644 index 000000000000..864128bb5338 --- /dev/null +++ b/.buildkite/intel_jobs/misc_intel.yaml @@ -0,0 +1,55 @@ +group: Miscellaneous Intel +depends_on: + - image-build-xpu +steps: +- label: V1 Core + KV + Metrics + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'pip install -r requirements/kv_connectors.txt && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && + cd tests && + pytest -v -s v1/executor' + +- label: V1 Sample + Logits + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + working_dir: "." + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + VLLM_TEST_DEVICE: "xpu" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'export VLLM_WORKER_MULTIPROC_METHOD=spawn && + cd tests && + pytest -v -s v1/logits_processors --ignore=v1/logits_processors/test_custom_online.py --ignore=v1/logits_processors/test_custom_offline.py && + pytest -v -s v1/test_oracle.py && + pytest -v -s v1/test_request.py && + pytest -v -s v1/test_outputs.py' diff --git a/.buildkite/intel_jobs/test-intel.yaml b/.buildkite/intel_jobs/test-intel.yaml index 3aa75f4754f9..805b7e54f120 100644 --- a/.buildkite/intel_jobs/test-intel.yaml +++ b/.buildkite/intel_jobs/test-intel.yaml @@ -35,9 +35,13 @@ steps: python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp && python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN && python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 && + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --kv-cache-dtype fp8 && + python3 examples/basic/offline_inference/generate.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --block-size 64 --enforce-eager --quantization modelopt --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --max-model-len 4096 && python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192 && python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 && - python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel' + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel && + python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --max-model-len 8192 + ' - label: "XPU V1 test" depends_on: - image-build-xpu @@ -56,9 +60,28 @@ steps: 'cd tests && pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py && pytest -v -s v1/engine --ignore=v1/engine/test_output_processor.py && - pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py && + pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py -k "not test_topk_only and not test_topp_only and not test_topk_and_topp" && pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py && pytest -v -s v1/structured_output && pytest -v -s v1/test_serial_utils.py && - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py && - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py' + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py && + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py --ignore=v1/kv_connector/unit/test_offloading_connector.py' + - label: "XPU server test" + depends_on: + - image-build-xpu + timeout_in_minutes: 30 + device: intel_gpu + no_plugin: true + env: + REGISTRY: "public.ecr.aws/q9t5s3a7" + REPO: "vllm-ci-test-repo" + source_file_dependencies: + - vllm/ + - .buildkite/intel_jobs/test-intel.yaml + commands: + - >- + bash .buildkite/scripts/hardware_ci/run-intel-test.sh + 'pip install av && + cd tests && + pytest -v -s entrypoints/openai/chat_completion/test_audio_in_video.py && + pytest -v -s benchmarks/test_serve_cli.py' diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml index 6c0b5540cbb6..9a5af8540118 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -1,6 +1,9 @@ # For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +required_gpu_arch: + - gfx942 + - gfx950 tasks: - name: "mmlu_pro" metrics: diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index aa4fb9fa03d6..ff43fa187b0e 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -1,6 +1,9 @@ # For vllm script, with -t option (tensor parallel size) # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" +required_gpu_arch: + - gfx942 + - gfx950 tasks: - name: "gsm8k" metrics: diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml index 514c15d6098e..84e4f3fe3349 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml @@ -1,4 +1,7 @@ model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8" +required_gpu_arch: + - gfx942 + - gfx950 tasks: - name: "mmlu_pro" metrics: diff --git a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt index a3bb95e19e24..36e0543879b3 100644 --- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt +++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt @@ -1,5 +1,6 @@ Qwen2.5-1.5B-Instruct.yaml Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Qwen1.5-MoE-W4A16-compressed-tensors.yaml diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index 518af9a66018..b495c0d123a6 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index f010ffe6752d..e430e6183b2d 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index fec4a94e63e4..f1a541ddbefc 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index e3c6e16bd6b3..ba8da9fc3f55 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.11" +# pip install "lm-eval[api]>=0.4.12" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index fad5f593be4f..d34e603b9e26 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -13,6 +13,7 @@ from contextlib import contextmanager import lm_eval +import pytest import yaml from vllm.platforms import current_platform @@ -89,9 +90,40 @@ def launch_lm_eval(eval_config, tp_size): return results +def _check_rocm_gpu_arch_requirement(eval_config): + """Skip the test if the model requires a ROCm GPU arch not present. + + Model YAML configs can specify:: + + required_gpu_arch: + - gfx942 + - gfx950 + + The check only applies on ROCm. On other platforms (e.g. CUDA) the + field is ignored so that shared config files work for both NVIDIA and + AMD CI pipelines. + """ + required_archs = eval_config.get("required_gpu_arch") + if not required_archs: + return + + if not current_platform.is_rocm(): + return + + from vllm.platforms.rocm import _GCN_ARCH # noqa: E402 + + if not any(arch in _GCN_ARCH for arch in required_archs): + pytest.skip( + f"Model requires GPU arch {required_archs}, " + f"but detected arch is '{_GCN_ARCH}'" + ) + + def test_lm_eval_correctness_param(config_filename, tp_size): eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) + _check_rocm_gpu_arch_requirement(eval_config) + results = launch_lm_eval(eval_config, tp_size) rtol = eval_config.get("rtol", DEFAULT_RTOL) diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json index 63f1f8ab887b..9f226ef2f819 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json @@ -36,6 +36,7 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "ignore-eos": "", + "temperature": 0, "num_prompts": 200 } }, @@ -127,4 +128,4 @@ } } ] -} \ No newline at end of file +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json index f0dc3d5ec067..30879b5e9dc5 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json @@ -22,6 +22,7 @@ "hf_split": "test", "no_stream": "", "no_oversample": "", + "temperature": 0, "num_prompts": 200 } }, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json index 0411b04e1bd5..34c2cc82d395 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json @@ -26,34 +26,14 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "ignore-eos": "", + "temperature": 0, "num_prompts": 200 } }, "tests": [ - { - "test_name": "serving_llama8B_tp1_sharegpt", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" - } - }, - { - "test_name": "serving_llama8B_tp2_sharegpt", - "server_parameters": { - "tensor_parallel_size": 2 - }, - "client_parameters": { - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" - } - }, { "test_name": "serving_llama8B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 1 }, "client_parameters": { "dataset_name": "random", @@ -62,290 +42,244 @@ } }, { - "test_name": "serving_llama8B_tp2_random_128_128", + "test_name": "serving_llama8B_int4_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4" }, "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp4_random_128_128", + "test_name": "serving_llama8B_int8_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8" }, "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp1_random_128_2048", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp2_random_128_2048", + "test_name": "serving_llama1B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "model": "meta-llama/Llama-3.2-1B" }, "client_parameters": { + "model": "meta-llama/Llama-3.2-1B", "dataset_name": "random", "random-input-len": 128, - "random-output-len": 2048 + "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp4_random_128_2048", + "test_name": "serving_llama3B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "model": "meta-llama/Llama-3.2-3B-Instruct" }, "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", "dataset_name": "random", "random-input-len": 128, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp1_random_2048_128", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp2_random_2048_128", + "test_name": "serving_llama70B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "model": "meta-llama/Llama-3.3-70B-Instruct" }, "client_parameters": { + "model": "meta-llama/Llama-3.3-70B-Instruct", "dataset_name": "random", - "random-input-len": 2048, + "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp4_random_2048_128", + "test_name": "serving_granite2B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "model": "ibm-granite/granite-3.2-2b-instruct" }, "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", "dataset_name": "random", - "random-input-len": 2048, + "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_tp1_random_2048_2048", - "server_parameters": { - "tensor_parallel_size": 1 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp2_random_2048_2048", - "server_parameters": { - "tensor_parallel_size": 2 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_tp4_random_2048_2048", - "server_parameters": { - "tensor_parallel_size": 4 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 2048 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", + "test_name": "serving_qwen1.7B_tp1_random_128_128", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 1 + "model": "Qwen/Qwen3-1.7B" }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "model": "Qwen/Qwen3-1.7B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int4_tp2_random_128_128", + "test_name": "serving_qwen4B_tp1_random_128_128", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 2 + "model": "Qwen/Qwen3-4B" }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "model": "Qwen/Qwen3-4B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int4_tp4_random_128_128", + "test_name": "serving_qwen8B_tp1_random_128_128", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 4 + "model": "Qwen/Qwen3-8B" }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "model": "Qwen/Qwen3-8B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int8_tp1_random_128_128", + "test_name": "serving_qwen14B_tp1_random_128_128", "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 1 + "model": "Qwen/Qwen3-14B" }, "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model": "Qwen/Qwen3-14B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int8_tp2_random_128_128", + "test_name": "serving_qwen30B_tp1_random_128_128", "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2 + "model": "Qwen/Qwen3-30B-A3B" }, "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model": "Qwen/Qwen3-30B-A3B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama8B_int8_tp4_random_128_128", + "test_name": "serving_glm9B_tp1_random_128_128", "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4 + "model": "zai-org/glm-4-9b-hf" }, "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model": "zai-org/glm-4-9b-hf", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_llama3B_tp1_random_128_128", + "test_name": "serving_gemma7B_tp1_random_128_128", "server_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 + "model": "google/gemma-7b" }, "client_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", + "model": "google/gemma-7b", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_granite2B_tp1_random_128_128", + "test_name": "serving_gemma3-4b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 + "model": "google/gemma-3-4b-it" }, "client_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", + "model": "google/gemma-3-4b-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_qwen1.7B_tp1_random_128_128", + "test_name": "serving_gemma3-12b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 + "model": "google/gemma-3-12b-it" }, "client_parameters": { - "model": "Qwen/Qwen3-1.7B", + "model": "google/gemma-3-12b-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_qwen4B_tp1_random_128_128", + "test_name": "serving_gemma4-4b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 + "model": "google/gemma-4-E4B-it" }, "client_parameters": { - "model": "Qwen/Qwen3-4B", + "model": "google/gemma-4-E4B-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_qwen8B_tp1_random_128_128", + "test_name": "serving_gemma4-2b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0 + }, "server_parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 + "model": "google/gemma-4-E2B-it" }, "client_parameters": { - "model": "Qwen/Qwen3-8B", + "model": "google/gemma-4-E2B-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_glm9B_tp1_random_128_128", + "test_name": "serving_gemma4-26b_tp1_random_128_128", + "server_environment_variables": { + "VLLM_CPU_SGL_KERNEL": 0, + "VLLM_CPU_ATTN_SPLIT_KV": 0 + }, "server_parameters": { - "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 + "model": "google/gemma-4-26B-A4B-it" }, "client_parameters": { - "model": "zai-org/glm-4-9b-hf", + "model": "google/gemma-4-26B-A4B-it", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 } }, { - "test_name": "serving_gemma7B_tp1_random_128_128", + "test_name": "serving_phi4_tp1_random_128_128", "server_parameters": { - "model": "google/gemma-7b", - "tensor_parallel_size": 1 + "model": "microsoft/Phi-4-reasoning" }, "client_parameters": { - "model": "google/gemma-7b", + "model": "microsoft/Phi-4-reasoning", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128 diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index f66ef2af4bd6..c2d7768e2026 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -26,6 +26,7 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "ignore-eos": "", + "temperature": 0, "num_prompts": 200 } }, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json index 3929aa5fbbe0..d5ef981689dd 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json @@ -21,6 +21,7 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "temperature": 0, "num_prompts": 200 } }, @@ -47,6 +48,7 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "temperature": 0, "num_prompts": 200 } }, @@ -73,6 +75,7 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "temperature": 0, "num_prompts": 200 } }, @@ -100,6 +103,7 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "temperature": 0, "num_prompts": 200 } }, @@ -127,6 +131,7 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "temperature": 0, "num_prompts": 200 } }, @@ -151,6 +156,7 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "temperature": 0, "num_prompts": 200 } } diff --git a/.buildkite/performance-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json index 66d52abc1206..2cbd472295e7 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests.json @@ -1,73 +1,112 @@ -[ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "temperature": 0, + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, { - "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "disable_log_stats": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } }, { - "test_name": "serving_llama70B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "disable_log_stats": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } + "test_name": "serving_llama8B_tp1_random_2048_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 2048 + } }, { - "test_name": "serving_mixtral8x7B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tensor_parallel_size": 2, - "disable_log_stats": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } + "test_name": "serving_llama70B_tp4_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "async_scheduling": "", + "no_enable_prefix_caching": "", + "max_num_batched_tokens": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } }, { - "test_name": "serving_llama70B_tp4_sharegpt_specdecode", - "qps_list": [2], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "speculative_config": { - "model": "turboderp/Qwama-0.5B-Instruct", - "num_speculative_tokens": 4, - "draft_tensor_parallel_size": 1 - } - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } + "test_name": "serving_gemma4-e4b_tp1_random_128_128", + "server_parameters": { + "model": "google/gemma-4-E4B-it", + "enable_auto_tool_choice": "", + "tool_call_parser": "gemma4", + "chat_template": "examples/tool_chat_template_gemma4.jinja", + "reasoning_parser": "gemma4" + }, + "client_parameters": { + "model": "google/gemma-4-E4B-it", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } } -] + ] +} diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 45b2996f7ead..df9b80f7f9a8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,3 +1,16 @@ +# CUDA architecture lists — following PyTorch RELEASE.md +# (https://github.com/pytorch/pytorch/blob/main/RELEASE.md) +# SM86 included for broader Ampere coverage; SM89 for marlin fp8 support +env: + CUDA_ARCH_X86: "7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX" + # aarch64 only architectures: 8.7 for Orin, 11.0 for Thor (since CUDA 13) + CUDA_ARCH_AARCH64: "8.0 8.7 8.9 9.0 10.0 11.0 12.0+PTX" + CUDA_ARCH_X86_CU129: "7.5 8.0 8.6 8.9 9.0 10.0 12.0" + CUDA_ARCH_AARCH64_CU129: "8.0 8.7 8.9 9.0 10.0 12.0" + MOONCAKE_WHEEL_AARCH64_2_35: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_aarch64.whl" + MOONCAKE_WHEEL_AARCH64_2_39: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_39_aarch64.whl" + MOONCAKE_WHEEL_X86_64: "https://vllm-wheels.s3.amazonaws.com/mooncake/mooncake_transfer_engine-0.3.10.post2-0da9dfea3-cp312-cp312-manylinux_2_35_x86_64.whl" + steps: - input: "Provide Release version here" id: input-release-version @@ -14,12 +27,11 @@ steps: agents: queue: arm64_cpu_queue_release commands: - # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: - # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -29,12 +41,11 @@ steps: agents: queue: arm64_cpu_queue_release commands: - # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: - # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -47,7 +58,8 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -57,10 +69,11 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda12.9 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -70,10 +83,11 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -86,7 +100,8 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "s3://vllm-wheels/$$BUILDKITE_COMMIT/$(cd artifacts/dist && echo *.whl)"' env: DOCKER_BUILDKIT: "1" @@ -98,105 +113,236 @@ steps: commands: - "bash .buildkite/scripts/generate-and-upload-nightly-index.sh" + - block: "Unblock to build release Docker images" + depends_on: ~ + key: block-build-release-images + if: build.env("NIGHTLY") != "1" + - group: "Build release Docker images" key: "build-release-images" + depends_on: block-build-release-images + allow_dependency_failure: true steps: - - label: "Build release image - x86_64 - CUDA 12.9" + - label: "Build release image - x86_64 - CUDA 13.0" depends_on: ~ id: build-release-image-x86 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # re-tag to default image tag and push, just in case arm64 build fails - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"' - - label: "Build release image - aarch64 - CUDA 12.9" + - label: "Build release image - aarch64 - CUDA 13.0" depends_on: ~ id: build-release-image-arm64 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"' - - label: "Build release image - x86_64 - CUDA 13.0" + - label: "Build release image - x86_64 - CUDA 12.9" depends_on: ~ - id: build-release-image-x86-cuda-13-0 + id: build-release-image-x86-cuda-12-9 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129" # re-tag to default image tag and push, just in case arm64 build fails - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"' - - label: "Build release image - aarch64 - CUDA 13.0" + - label: "Build release image - aarch64 - CUDA 12.9" depends_on: ~ - id: build-release-image-arm64-cuda-13-0 + id: build-release-image-arm64-cuda-12-9 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_35}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129"' - - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04" + - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04" depends_on: ~ id: build-release-image-x86-ubuntu2404 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"' - - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04" + - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04" depends_on: ~ id: build-release-image-arm64-ubuntu2404 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"' - - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04" + - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04" depends_on: ~ - id: build-release-image-x86-cuda-13-0-ubuntu2404 + id: build-release-image-x86-cuda-12-9-ubuntu2404 agents: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404" - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404" - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"' - - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04" + - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04" depends_on: ~ - id: build-release-image-arm64-cuda-13-0-ubuntu2404 + id: build-release-image-arm64-cuda-12-9-ubuntu2404 agents: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404" + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg MOONCAKE_WHEEL_AARCH64="${MOONCAKE_WHEEL_AARCH64_2_39}" \ + --build-arg MOONCAKE_WHEEL_X86_64="${MOONCAKE_WHEEL_X86_64}" \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404"' - block: "Build release image for x86_64 CPU" key: block-cpu-release-image-build depends_on: ~ - label: "Build release image - x86_64 - CPU" + key: build-cpu-release-image-x86 depends_on: - block-cpu-release-image-build - input-release-version @@ -207,6 +353,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"' env: DOCKER_BUILDKIT: "1" @@ -215,7 +362,8 @@ steps: depends_on: ~ - label: "Build release image - arm64 - CPU" - depends_on: + key: build-cpu-release-image-arm64 + depends_on: - block-arm64-cpu-release-image-build - input-release-version agents: @@ -225,13 +373,14 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "$$BUILDKITE_LABEL" "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"' env: DOCKER_BUILDKIT: "1" - group: "Publish release images" key: "publish-release-images" steps: - - label: "Create multi-arch manifest - CUDA 12.9" + - label: "Create multi-arch manifest - CUDA 13.0" depends_on: - build-release-image-x86 - build-release-image-arm64 @@ -242,29 +391,22 @@ steps: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"' - - label: "Annotate release workflow - CUDA 12.9" - depends_on: - - create-multi-arch-manifest - id: annotate-release-workflow - agents: - queue: small_cpu_queue_release - commands: - - "bash .buildkite/scripts/annotate-release.sh" - - - label: "Create multi-arch manifest - CUDA 13.0" + - label: "Create multi-arch manifest - CUDA 12.9" depends_on: - - build-release-image-x86-cuda-13-0 - - build-release-image-arm64-cuda-13-0 - id: create-multi-arch-manifest-cuda-13-0 + - build-release-image-x86-cuda-12-9 + - build-release-image-arm64-cuda-12-9 + id: create-multi-arch-manifest-cuda-12-9 agents: queue: small_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend" - - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129"' - - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04" + - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04" depends_on: - build-release-image-x86-ubuntu2404 - build-release-image-arm64-ubuntu2404 @@ -275,18 +417,20 @@ steps: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend" - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 13.0 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"' - - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04" + - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04" depends_on: - - build-release-image-x86-cuda-13-0-ubuntu2404 - - build-release-image-arm64-cuda-13-0-ubuntu2404 - id: create-multi-arch-manifest-cuda-13-0-ubuntu2404 + - build-release-image-x86-cuda-12-9-ubuntu2404 + - build-release-image-arm64-cuda-12-9-ubuntu2404 + id: create-multi-arch-manifest-cuda-12-9-ubuntu2404 agents: queue: small_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend" - - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu129-ubuntu2404 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" + - 'bash .buildkite/scripts/annotate-build-artifact.sh "Manifest: CUDA 12.9 Ubuntu 24.04" "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404"' - label: "Publish nightly multi-arch image to DockerHub" depends_on: @@ -306,16 +450,16 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" - - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0" + - label: "Publish nightly multi-arch image to DockerHub - CUDA 12.9" depends_on: - - create-multi-arch-manifest-cuda-13-0 + - create-multi-arch-manifest-cuda-12-9 if: build.env("NIGHTLY") == "1" agents: queue: small_cpu_queue_release commands: - - "bash .buildkite/scripts/push-nightly-builds.sh cu130" + - "bash .buildkite/scripts/push-nightly-builds.sh cu129" # Clean up old nightly builds (keep only last 14) - - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-" + - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu129-nightly-" plugins: - docker-login#v3.0.0: username: vllmbot @@ -324,24 +468,6 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" - - group: "Publish wheels" - key: "publish-wheels" - steps: - - block: "Confirm update release wheels to PyPI (experimental, use with caution)?" - key: block-upload-release-wheels - depends_on: - - input-release-version - - build-wheels - - - label: "Upload release wheels to PyPI" - depends_on: - - block-upload-release-wheels - id: upload-release-wheels - agents: - queue: small_cpu_queue_release - commands: - - "bash .buildkite/scripts/upload-release-wheels-pypi.sh" - # ============================================================================= # ROCm Release Pipeline (x86_64 only) # ============================================================================= @@ -455,7 +581,7 @@ steps: echo "" echo " Build complete - Image and wheels cached" fi - + artifact_paths: - "artifacts/rocm-base-wheels/*.whl" env: @@ -611,12 +737,14 @@ steps: - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh" env: S3_BUCKET: "vllm-wheels" - VARIANT: "rocm721" + VARIANT: "rocm722" # ROCm Job 6: Build ROCm Release Docker Image - label: ":docker: Build release image - x86_64 - ROCm" id: build-rocm-release-image depends_on: + - step: block-build-release-images + allow_failure: true - step: build-rocm-base-wheels allow_failure: false agents: @@ -669,7 +797,7 @@ steps: # Push to ECR docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm - + echo "" echo " Successfully built and pushed ROCm release image" echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm" @@ -696,3 +824,60 @@ steps: env: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" + + # ============================================================================= + # Publish to DockerHub and PyPI (at the end so all builds complete first) + # ============================================================================= + + - block: "Publish release images to DockerHub" + key: block-publish-release-images + depends_on: + - create-multi-arch-manifest + - create-multi-arch-manifest-cuda-12-9 + - create-multi-arch-manifest-ubuntu2404 + - create-multi-arch-manifest-cuda-12-9-ubuntu2404 + - build-rocm-release-image + - input-release-version + # Wait for CPU builds if their block steps were unblocked, so publish + # doesn't race the in-progress CPU build. allow_failure lets publish + # proceed when the operator legitimately leaves the CPU block steps + # unblocked or the CPU build fails. + - step: build-cpu-release-image-x86 + allow_failure: true + - step: build-cpu-release-image-arm64 + allow_failure: true + if: build.env("NIGHTLY") != "1" + + - label: "Publish release images to DockerHub" + depends_on: + - block-publish-release-images + key: publish-release-images-dockerhub + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/publish-release-images.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" + + - group: "Publish wheels" + key: "publish-wheels" + steps: + - block: "Confirm update release wheels to PyPI (experimental, use with caution)?" + key: block-upload-release-wheels + depends_on: + - input-release-version + - build-wheels + + - label: "Upload release wheels to PyPI" + depends_on: + - block-upload-release-wheels + id: upload-release-wheels + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/upload-release-wheels-pypi.sh" diff --git a/.buildkite/scripts/annotate-build-artifact.sh b/.buildkite/scripts/annotate-build-artifact.sh new file mode 100755 index 000000000000..67cdf7923658 --- /dev/null +++ b/.buildkite/scripts/annotate-build-artifact.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Append a build artifact line to the Buildkite annotation. +# Usage: annotate-build-artifact.sh