From 9511a3f8eec6992d8834ad85af855683bd74ba29 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 25 Feb 2026 21:01:10 -0500 Subject: [PATCH 001/154] [Bugfix] Fix AttributeError in SMControlContextManager (#35338) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_ubatch_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index 45ba1bef9f2a..754f2981c9f2 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -74,7 +74,7 @@ def __init__( "SM control is currently only supported on CUDA" ) - total_sms = num_compute_units(torch.cuda.current_device().index) + total_sms = num_compute_units(torch.cuda.current_device()) assert comm_sms < total_sms self.total_sms = total_sms From 160424a937d373101818876103227cc986887b55 Mon Sep 17 00:00:00 2001 From: Seungmin Kim <8457324+ehfd@users.noreply.github.com> Date: Thu, 26 Feb 2026 11:15:51 +0900 Subject: [PATCH 002/154] [Bugfix] Fix CUDA compatibility path setting for both datacenter and consumer NVIDIA GPUs (#33992) Signed-off-by: Seungmin Kim <8457324+ehfd@users.noreply.github.com> Signed-off-by: Andrew Mello <19512127+88plug@users.noreply.github.com> Co-authored-by: 88plug <19512127+88plug@users.noreply.github.com> Co-authored-by: Michael Goin --- docker/Dockerfile | 12 +- .../installation/gpu.cuda.inc.md | 17 ++ docs/usage/troubleshooting.md | 27 ++- tests/cuda/test_cuda_compatibility_path.py | 187 ++++++++++++++++++ vllm/env_override.py | 82 ++++++++ vllm/envs.py | 14 ++ 6 files changed, 334 insertions(+), 5 deletions(-) create mode 100644 tests/cuda/test_cuda_compatibility_path.py diff --git a/docker/Dockerfile b/docker/Dockerfile index cc2ccc11cdcb..717f27b6b232 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -132,8 +132,10 @@ ENV UV_LINK_MODE=copy # Verify GCC version RUN gcc --version -# Ensure CUDA compatibility library is loaded -RUN echo 
"/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/cuda-compat.conf && ldconfig +# Enable CUDA forward compatibility by setting '-e VLLM_ENABLE_CUDA_COMPATIBILITY=1' +# Only needed for datacenter/professional GPUs with older drivers. +# See: https://docs.nvidia.com/deploy/cuda-compatibility/ +ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0 # ============================================================ # SLOW-CHANGING DEPENDENCIES BELOW @@ -560,8 +562,10 @@ ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE=copy -# Ensure CUDA compatibility library is loaded -RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/cuda-compat.conf && ldconfig +# Enable CUDA forward compatibility by setting '-e VLLM_ENABLE_CUDA_COMPATIBILITY=1' +# Only needed for datacenter/professional GPUs with older drivers. +# See: https://docs.nvidia.com/deploy/cuda-compatibility/ +ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0 # ============================================================ # SLOW-CHANGING DEPENDENCIES BELOW diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md index 661e0934eefd..da8b7d3fa1dd 100644 --- a/docs/getting_started/installation/gpu.cuda.inc.md +++ b/docs/getting_started/installation/gpu.cuda.inc.md @@ -297,6 +297,23 @@ You can add any other [engine-args](https://docs.vllm.ai/en/latest/configuration RUN uv pip install --system git+https://github.com/huggingface/transformers.git ``` +#### Running on Systems with Older CUDA Drivers + +vLLM's Docker image comes with [CUDA compatibility libraries](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) pre-installed. This allows you to run vLLM on systems with NVIDIA drivers that are older than the CUDA Toolkit version used in the image, but only supports select professional and datacenter NVIDIA GPUs. 
+ +To enable this feature, set the `VLLM_ENABLE_CUDA_COMPATIBILITY` environment variable to `1` or `true` when running the container: + +```bash +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --env "HF_TOKEN=" \ + --env "VLLM_ENABLE_CUDA_COMPATIBILITY=1" \ + vllm/vllm-openai +``` + +This will automatically configure `LD_LIBRARY_PATH` to point to the compatibility libraries before loading PyTorch and other dependencies. + # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index 128c36b784d8..814b03c1e38b 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -318,7 +318,32 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to ## CUDA error: the provided PTX was compiled with an unsupported toolchain -If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. The released vLLM wheels have to be compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [cuda compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. The solution is to install `cuda-compat` package from your package manager. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. When successfully installed, you should see that the output of `nvidia-smi` will show `CUDA Version: 12.9`. Note that we use CUDA 12.9 as an example here, you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher. 
+If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. This section also applies if you get the error `RuntimeError: The NVIDIA driver on your system is too old`. + +The released vLLM wheels are compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [CUDA compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. **This is only supported on select professional and datacenter NVIDIA GPUs.** + +If you are using the vLLM official Docker image, you can solve this by adding `-e VLLM_ENABLE_CUDA_COMPATIBILITY=1` to your `docker run` command. This will enable the pre-installed CUDA forward compatibility libraries. + +If you are running vLLM outside of Docker, the solution is to install the `cuda-compat` package from your package manager with the [CUDA repository](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) enabled. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then set `export VLLM_ENABLE_CUDA_COMPATIBILITY=1` and `export VLLM_CUDA_COMPATIBILITY_PATH="/usr/local/cuda-12.9/compat"`. + +On Conda, you can install the `conda-forge::cuda-compat` package (e.g., `conda install -c conda-forge cuda-compat=12.9`), then after activating the environment, set `export VLLM_ENABLE_CUDA_COMPATIBILITY=1` and `export VLLM_CUDA_COMPATIBILITY_PATH="${CONDA_PREFIX}/cuda-compat"`. 
+ +You can verify the configuration works by running a minimal Python script that initializes CUDA via vLLM: + +```bash +export VLLM_ENABLE_CUDA_COMPATIBILITY=1 +export VLLM_CUDA_COMPATIBILITY_PATH="/usr/local/cuda-12.9/compat" + +python3 - << 'EOF' +import vllm +import torch + +print(f"CUDA available: {torch.cuda.is_available()}") +print(f"CUDA device count: {torch.cuda.device_count()}") +EOF +``` + +Note that we use CUDA 12.9 as an example here, and you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher. ## ptxas fatal: Value 'sm_110a' is not defined for option 'gpu-name' diff --git a/tests/cuda/test_cuda_compatibility_path.py b/tests/cuda/test_cuda_compatibility_path.py new file mode 100644 index 000000000000..837d2c49cfb6 --- /dev/null +++ b/tests/cuda/test_cuda_compatibility_path.py @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for CUDA forward compatibility path logic in env_override.py. + +Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs, +including env var parsing, path detection, and deduplication. +""" + +import os +from unittest.mock import patch + +import pytest + +# Import the functions directly (they're module-level in env_override) +# We must import them without triggering the module-level side effects, +# so we import the functions by name after the module is already loaded. 
+from vllm.env_override import ( + _get_torch_cuda_version, + _maybe_set_cuda_compatibility_path, +) + + +class TestCudaCompatibilityEnvParsing: + """Test VLLM_ENABLE_CUDA_COMPATIBILITY env var parsing.""" + + def test_disabled_by_default(self, monkeypatch): + """Compat path is NOT set when env var is absent.""" + monkeypatch.delenv("VLLM_ENABLE_CUDA_COMPATIBILITY", raising=False) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + _maybe_set_cuda_compatibility_path() + assert ( + "LD_LIBRARY_PATH" not in os.environ + or os.environ.get("LD_LIBRARY_PATH", "") == "" + ) + + @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""]) + def test_disabled_values(self, monkeypatch, value): + """Various falsy values should not activate compat path.""" + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + _maybe_set_cuda_compatibility_path() + # LD_LIBRARY_PATH should not be set (or remain empty) + ld_path = os.environ.get("LD_LIBRARY_PATH", "") + assert "compat" not in ld_path + + @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "]) + def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value): + """Truthy values activate compat path when a valid path exists.""" + compat_dir = tmp_path / "compat" + compat_dir.mkdir() + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value) + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir)) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + _maybe_set_cuda_compatibility_path() + ld_path = os.environ.get("LD_LIBRARY_PATH", "") + assert str(compat_dir) in ld_path + + +class TestCudaCompatibilityPathDetection: + """Test path detection: custom override, conda, default.""" + + def test_custom_path_override(self, monkeypatch, tmp_path): + """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority.""" + custom_dir = tmp_path / "my-compat" + custom_dir.mkdir() + 
monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir)) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + _maybe_set_cuda_compatibility_path() + ld_path = os.environ.get("LD_LIBRARY_PATH", "") + assert ld_path.startswith(str(custom_dir)) + + def test_conda_prefix_fallback(self, monkeypatch, tmp_path): + """Falls back to $CONDA_PREFIX/cuda-compat if custom not set.""" + conda_dir = tmp_path / "conda-env" + compat_dir = conda_dir / "cuda-compat" + compat_dir.mkdir(parents=True) + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False) + monkeypatch.setenv("CONDA_PREFIX", str(conda_dir)) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + _maybe_set_cuda_compatibility_path() + ld_path = os.environ.get("LD_LIBRARY_PATH", "") + assert str(compat_dir) in ld_path + + def test_no_valid_path_does_nothing(self, monkeypatch): + """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged.""" + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path") + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + with patch("vllm.env_override._get_torch_cuda_version", return_value=None): + _maybe_set_cuda_compatibility_path() + assert os.environ.get("LD_LIBRARY_PATH", "") == "" + + def test_default_cuda_path_fallback(self, monkeypatch, tmp_path): + """Falls back to /usr/local/cuda-{ver}/compat via torch version.""" + fake_cuda = tmp_path / "cuda-12.8" / "compat" + fake_cuda.mkdir(parents=True) + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False) + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + with ( + patch("vllm.env_override._get_torch_cuda_version", 
return_value="12.8"), + patch( + "vllm.env_override.os.path.isdir", + side_effect=lambda p: p == "/usr/local/cuda-12.8/compat" + or os.path.isdir(p), + ), + ): + _maybe_set_cuda_compatibility_path() + ld_path = os.environ.get("LD_LIBRARY_PATH", "") + assert "/usr/local/cuda-12.8/compat" in ld_path + + +class TestCudaCompatibilityLdPathManipulation: + """Test LD_LIBRARY_PATH prepend and deduplication logic.""" + + def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path): + """Compat path is set when LD_LIBRARY_PATH is empty.""" + compat_dir = tmp_path / "compat" + compat_dir.mkdir() + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir)) + monkeypatch.delenv("LD_LIBRARY_PATH", raising=False) + _maybe_set_cuda_compatibility_path() + assert os.environ["LD_LIBRARY_PATH"] == str(compat_dir) + + def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path): + """Compat path is prepended before existing entries.""" + compat_dir = tmp_path / "compat" + compat_dir.mkdir() + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir)) + monkeypatch.setenv("LD_LIBRARY_PATH", "/usr/lib:/other/lib") + _maybe_set_cuda_compatibility_path() + ld_path = os.environ["LD_LIBRARY_PATH"] + parts = ld_path.split(os.pathsep) + assert parts[0] == str(compat_dir) + assert "/usr/lib" in parts + assert "/other/lib" in parts + + def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path): + """If compat path already in LD_LIBRARY_PATH, move to front.""" + compat_dir = tmp_path / "compat" + compat_dir.mkdir() + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir)) + monkeypatch.setenv( + "LD_LIBRARY_PATH", + f"/usr/lib:{compat_dir}:/other/lib", + ) + _maybe_set_cuda_compatibility_path() + ld_path = os.environ["LD_LIBRARY_PATH"] + parts = 
ld_path.split(os.pathsep) + assert parts[0] == str(compat_dir) + assert parts.count(str(compat_dir)) == 1 + + def test_already_at_front_is_noop(self, monkeypatch, tmp_path): + """If compat path is already first, don't modify LD_LIBRARY_PATH.""" + compat_dir = tmp_path / "compat" + compat_dir.mkdir() + original = f"{compat_dir}:/usr/lib" + monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1") + monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir)) + monkeypatch.setenv("LD_LIBRARY_PATH", original) + _maybe_set_cuda_compatibility_path() + assert os.environ["LD_LIBRARY_PATH"] == original + + +class TestGetTorchCudaVersion: + """Test _get_torch_cuda_version() helper.""" + + def test_returns_string_when_torch_available(self): + """Should return a CUDA version string like '12.8'.""" + version = _get_torch_cuda_version() + # torch is installed in vllm's environment + assert version is None or isinstance(version, str) + + def test_returns_none_when_torch_missing(self): + """Should return None when torch is not importable.""" + with patch( + "vllm.env_override.importlib.util.find_spec", + return_value=None, + ): + assert _get_torch_cuda_version() is None diff --git a/vllm/env_override.py b/vllm/env_override.py index e5a40dc3cd8f..181d000a68a7 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -1,7 +1,89 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E402 +import importlib.util import os + +def _get_torch_cuda_version(): + """Peripheral function to _maybe_set_cuda_compatibility_path(). + PyTorch version must not be determined by importing directly + because it will trigger the CUDA initialization, losing the + chance to set the LD_LIBRARY_PATH beforehand. 
+ """ + try: + spec = importlib.util.find_spec("torch") + if not spec: + return None + if spec.origin: + torch_root = os.path.dirname(spec.origin) + elif spec.submodule_search_locations: + torch_root = spec.submodule_search_locations[0] + else: + return None + version_path = os.path.join(torch_root, "version.py") + if not os.path.exists(version_path): + return None + # Load the version module without importing torch + ver_spec = importlib.util.spec_from_file_location("torch.version", version_path) + if not ver_spec or not ver_spec.loader: + return None + module = importlib.util.module_from_spec(ver_spec) + # Avoid registering in sys.modules to not confuse future imports + ver_spec.loader.exec_module(module) + return getattr(module, "cuda", None) + except Exception: + return None + + +def _maybe_set_cuda_compatibility_path(): + """Set LD_LIBRARY_PATH for CUDA forward compatibility if enabled. + + Must run before 'import torch' since torch loads CUDA shared libraries + at import time and the dynamic linker only consults LD_LIBRARY_PATH when + a library is first loaded. + + CUDA forward compatibility is only supported on select professional and + datacenter NVIDIA GPUs. Consumer GPUs (GeForce, RTX) do not support it + and will get Error 803 if compat libs are loaded. 
+ """ + enable = os.environ.get("VLLM_ENABLE_CUDA_COMPATIBILITY", "0").strip().lower() in ( + "1", + "true", + ) + if not enable: + return + + cuda_compat_path = os.environ.get("VLLM_CUDA_COMPATIBILITY_PATH", "") + if not cuda_compat_path or not os.path.isdir(cuda_compat_path): + conda_prefix = os.environ.get("CONDA_PREFIX", "") + conda_compat = os.path.join(conda_prefix, "cuda-compat") + if conda_prefix and os.path.isdir(conda_compat): + cuda_compat_path = conda_compat + if not cuda_compat_path or not os.path.isdir(cuda_compat_path): + torch_cuda_version = _get_torch_cuda_version() + if torch_cuda_version: + default_path = f"/usr/local/cuda-{torch_cuda_version}/compat" + if os.path.isdir(default_path): + cuda_compat_path = default_path + if not cuda_compat_path or not os.path.isdir(cuda_compat_path): + return + + norm_path = os.path.normpath(cuda_compat_path) + existing = os.environ.get("LD_LIBRARY_PATH", "") + ld_paths = existing.split(os.pathsep) if existing else [] + + if ld_paths and ld_paths[0] and os.path.normpath(ld_paths[0]) == norm_path: + return # Already at the front + + new_paths = [norm_path] + [ + p for p in ld_paths if not p or os.path.normpath(p) != norm_path + ] + os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(new_paths) + + +_maybe_set_cuda_compatibility_path() + import torch from vllm.logger import init_logger diff --git a/vllm/envs.py b/vllm/envs.py index 0d8cf021e4f2..d62438d5735b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -239,6 +239,8 @@ VLLM_WEIGHT_OFFLOADING_DISABLE_UVA: bool = False VLLM_DISABLE_LOG_LOGO: bool = False VLLM_LORA_DISABLE_PDL: bool = False + VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False + VLLM_CUDA_COMPATIBILITY_PATH: str | None = None def get_default_cache_root(): @@ -1591,6 +1593,16 @@ def _get_or_set_default() -> str: # Disable PDL for LoRA, as enabling PDL with LoRA on SM100 causes # Triton compilation to fail. 
"VLLM_LORA_DISABLE_PDL": lambda: bool(int(os.getenv("VLLM_LORA_DISABLE_PDL", "0"))), + # Enable CUDA compatibility mode for datacenter GPUs with older + # driver versions than the CUDA toolkit major version of vLLM. + "VLLM_ENABLE_CUDA_COMPATIBILITY": lambda: ( + os.environ.get("VLLM_ENABLE_CUDA_COMPATIBILITY", "0").strip().lower() + in ("1", "true") + ), + # Path to the CUDA compatibility libraries when CUDA compatibility is enabled. + "VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get( + "VLLM_CUDA_COMPATIBILITY_PATH", None + ), } @@ -1731,6 +1743,8 @@ def compile_factors() -> dict[str, object]: "VLLM_CPU_MOE_PREPACK", "VLLM_CPU_SGL_KERNEL", "VLLM_TEST_FORCE_LOAD_FORMAT", + "VLLM_ENABLE_CUDA_COMPATIBILITY", + "VLLM_CUDA_COMPATIBILITY_PATH", "LOCAL_RANK", "CUDA_VISIBLE_DEVICES", "NO_COLOR", From 86c3b5a808506e325fd7e59d86d83170fc98c93c Mon Sep 17 00:00:00 2001 From: "Roberto L. Castro" <38211239+LopezCastroRoberto@users.noreply.github.com> Date: Thu, 26 Feb 2026 03:32:50 +0100 Subject: [PATCH 003/154] [BugFix] Fix fp4 quant kernel on CUDA 12.8 (#35210) Signed-off-by: LopezCastroRoberto --- .../fp4/activation_nvfp4_quant_fusion_kernels.cu | 6 ++++-- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 12 +++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index d0264c4d154c..8583b79fd58f 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -107,7 +107,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo); reinterpret_cast(out)[outOffset >> 1] = packed64; } else { - out[inOffset] = out_val; + int64_t outOffset = + rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; + out[outOffset] = out_val; } } } @@ -140,7 +142,7 @@ void 
silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] int const numBlocksPerSM = vllm_runtime_blocks_per_sm(static_cast(block.x)); - int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE); + int sf_n_unpadded = int(n / CVT_FP4_ELTS_PER_THREAD); int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast(block.x)); int grid_x = std::min( diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index c27fb69d44be..b521b4707a4d 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -109,7 +109,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) template __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols, - int32_t sf_n_unpadded, Type const* __restrict__ in, + int32_t sf_n_unpadded, int32_t num_packed_cols, + Type const* __restrict__ in, float const* __restrict__ SFScale, uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) { @@ -131,7 +132,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) // Iterate over all rows and cols including padded ones - // ensures we visit every single scale factor address to initialize it. 
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { - if (colIdx < sf_n_unpadded) { + if (colIdx < num_packed_cols) { PackedVec in_vec; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; @@ -222,7 +223,8 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, reinterpret_cast(sf_out)); }); } else { - int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast(block.x)); + int num_packed_cols = n / CVT_FP4_ELTS_PER_THREAD; + int grid_y = vllm::div_round_up(num_packed_cols, static_cast(block.x)); int grid_x = std::min( m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y)); dim3 grid(grid_x, grid_y); @@ -232,8 +234,8 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, auto input_ptr = static_cast(input.data_ptr()); // NOTE: We don't support e8m0 scales at this moment. vllm::cvt_fp16_to_fp4_sf_major - <<>>(m, n, sf_n_unpadded, input_ptr, - input_sf_ptr, + <<>>(m, n, sf_n_unpadded, num_packed_cols, + input_ptr, input_sf_ptr, reinterpret_cast(output_ptr), reinterpret_cast(sf_out)); }); From 2aa414040243bc24447aa5a4f244f4104064d539 Mon Sep 17 00:00:00 2001 From: hujiaxin0 <524446785@qq.com> Date: Thu, 26 Feb 2026 11:08:09 +0800 Subject: [PATCH 004/154] openpangu-vl support video input (#34134) Signed-off-by: hujiaxin <524446785@qq.com> Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Co-authored-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Co-authored-by: Isotr0py --- vllm/multimodal/video.py | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index f123799ca901..fb4e19fa6745 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -747,3 +747,90 @@ def load_bytes( **kwargs, ) return out + + +@VIDEO_LOADER_REGISTRY.register("openpangu") +class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend): + @classmethod + def load_bytes( + cls, + data: bytes, + 
num_frames: int = 32, + fps: int = 1, + max_duration: int = 300, + frame_recovery: bool = False, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + """ + Load video frames with dynamic sampling based on duration. + Assume that total_frames_num = 10 and original_fps = 1. + The timestamp of frame 0 is 0.0. + The timestamp of frame 1 is 1.0, and so on. + The timestamp of frame 9 (the last frame) should be 9.0, that is, + (total_frames_num - 1) / original_fps. + + Args: + data: Raw video bytes + num_frames: Maximum number of frames to sample + fps: Target FPS for sampling; -1 disables the FPS limit (default: 1) + + Returns: + Tuple of (frames_array, metadata_dict) + """ + import cv2 + + backend = cls().get_cv2_video_api() + cap = cv2.VideoCapture(BytesIO(data), backend, []) + if not cap.isOpened(): + raise ValueError("Could not open video stream") + + total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + original_fps = float(cap.get(cv2.CAP_PROP_FPS)) + # The timestamp of the rightmost frame, cannot be used to calculate frame 0. + if total_frames_num >= 1 and original_fps > 0: + total_duration = (total_frames_num - 1) / original_fps + else: + total_duration = 0 + + # `fps` is the FPS parameter passed in for sampling, + # -1 indicates that sampling can be performed directly without FPS limitation. + if fps > 0: + # Num_frames is the maximum number of frames to sample. + # If fewer frames are sampled at this sample_fps, the update duration will be longer. # noqa: E501 + if num_frames >= int(total_duration * fps) + 1: + num_frames = int(total_duration * fps) + 1 + # Under the new maximum frame rate, the video duration of the rightmost frame, # noqa: E501 + # cannot be calculated for frame 0. 
+ total_duration = min(total_duration, (num_frames - 1) / fps) + elif fps != -1: + raise ValueError( + f"requires dataset fps is -1 or greater than 0 but got {fps}" + ) + + sample_frame_timestamps = np.linspace( + 0, total_duration, num_frames, dtype=float + ) + frames_indices = [ + min(total_frames_num - 1, round(t * original_fps)) + for t in sample_frame_timestamps + ] + + frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery( + cap, frames_indices, total_frames_num + ) + + if recovered_map: + logger.info( + "Frame recovery: %d frames recovered using forward scan.", + len(recovered_map), + ) + + metadata = { + "total_num_frames": total_frames_num, + "fps": original_fps, + "duration": total_duration, + "video_backend": "opencv_dynamic_openpangu", + "frames_indices": valid_frame_indices, + "do_sample_frames": False, + } + return frames, metadata From 71dfce6aa6cc14d016154b4e3fd8cc40c05415f9 Mon Sep 17 00:00:00 2001 From: Hanjie Qiu <50634613+hjjq@users.noreply.github.com> Date: Wed, 25 Feb 2026 19:17:20 -0800 Subject: [PATCH 005/154] [Kernel] Refactor FlashInfer allreduce for mnnvl backend (#34109) Signed-off-by: hjjq <50634613+hjjq@users.noreply.github.com> Signed-off-by: wzhao18 Co-authored-by: wzhao18 Co-authored-by: Wei Zhao <51183510+wzhao18@users.noreply.github.com> --- .../kernels/benchmark_device_communicators.py | 113 ++++++-- .../kernels/benchmark_fused_collective.py | 210 ++++++++------- .../distributed/test_fusion_all_reduce.py | 10 +- .../passes/fusion/allreduce_rms_fusion.py | 144 ++++++---- .../device_communicators/cuda_communicator.py | 28 +- .../flashinfer_all_reduce.py | 252 ++++++++++++++++++ vllm/envs.py | 14 + 7 files changed, 592 insertions(+), 179 deletions(-) create mode 100644 vllm/distributed/device_communicators/flashinfer_all_reduce.py diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 7b453fe7b680..d1005461ab93 100644 --- 
a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -30,6 +30,9 @@ from torch.distributed import ProcessGroup from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce +from vllm.distributed.device_communicators.flashinfer_all_reduce import ( + FlashInferAllReduce, +) from vllm.distributed.device_communicators.pynccl import ( PyNcclCommunicator, register_nccl_symmetric_ops, @@ -44,7 +47,7 @@ logger = init_logger(__name__) # Default sequence lengths to benchmark -DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] +DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192] # Fixed hidden size and dtype for all benchmarks HIDDEN_SIZE = 8192 @@ -81,6 +84,7 @@ def __init__( self.symm_mem_comm = None self.symm_mem_comm_multimem = None self.symm_mem_comm_two_shot = None + self.fi_ar_comm = None self._init_communicators() @@ -161,6 +165,22 @@ def _init_communicators(self): ) self.symm_mem_comm_two_shot = None + try: + self.fi_ar_comm = FlashInferAllReduce( + group=self.cpu_group, + device=self.device, + ) + if not self.fi_ar_comm.disabled: + logger.info("Rank %s: FlashInferAllReduce initialized", self.rank) + else: + logger.info("Rank %s: FlashInferAllReduce disabled", self.rank) + self.fi_ar_comm = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e + ) + self.fi_ar_comm = None + def benchmark_allreduce( self, sequence_length: int, num_warmup: int, num_trials: int ) -> dict[str, float]: @@ -180,7 +200,8 @@ def benchmark_allreduce( lambda t, c=comm: c.custom_all_reduce(t), lambda t, c=comm: c.should_custom_ar(t), comm.capture(), - "1stage", # env variable value + {"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"}, + None, # no destroy function ) ) # CustomAllreduce two-shot @@ -190,7 +211,8 @@ def benchmark_allreduce( lambda t, c=comm: c.custom_all_reduce(t), lambda t, c=comm: c.should_custom_ar(t), 
comm.capture(), - "2stage", # env variable value + {"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"}, + None, # no destroy function ) ) @@ -202,7 +224,8 @@ def benchmark_allreduce( lambda t, c=comm: c.all_reduce(t), lambda t: True, # Always available if initialized nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function ) ) communicators.append( @@ -211,7 +234,8 @@ def benchmark_allreduce( lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), lambda t: True, # Always available if initialized nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function ) ) @@ -223,7 +247,8 @@ def benchmark_allreduce( lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.should_use_symm_mem(t), nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function ) ) @@ -235,29 +260,67 @@ def benchmark_allreduce( lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.should_use_symm_mem(t), nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function needed ) ) - # Benchmark each communicator - for name, allreduce_fn, should_use_fn, context, env_var in communicators: - # Set environment variable if needed - if env_var is not None: - os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var - else: - # Clear the environment variable to avoid interference - os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) - - latency = self.benchmark_allreduce_single( - sequence_length, - allreduce_fn, - should_use_fn, - context, - num_warmup, - num_trials, + if self.fi_ar_comm is not None: + comm = self.fi_ar_comm + communicators.append( + ( + "flashinfer_trtllm", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_fi_ar(t), + nullcontext(), + {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"}, + lambda c=comm: c.destroy(), + ) ) - if latency is not None: - results[name] = latency + communicators.append( + ( 
+ "flashinfer_mnnvl", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_fi_ar(t), + nullcontext(), + {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"}, + lambda c=comm: c.destroy(), + ) + ) + + # Benchmark each communicator + for ( + name, + allreduce_fn, + should_use_fn, + context, + env_dict, + destroy_fn, + ) in communicators: + # Save original values and apply new environment variables + saved_env = {key: os.environ.get(key) for key in env_dict} + for key, value in env_dict.items(): + os.environ[key] = value + try: + latency = self.benchmark_allreduce_single( + sequence_length, + allreduce_fn, + should_use_fn, + context, + num_warmup, + num_trials, + ) + if latency is not None: + results[name] = latency + finally: + if destroy_fn is not None: + destroy_fn() + # Restore environment variables to their original state + for key, original_value in saved_env.items(): + if original_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = original_value return results diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py index 633529edf16d..e18f6a7580fb 100644 --- a/benchmarks/kernels/benchmark_fused_collective.py +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -5,8 +5,11 @@ Benchmark for FlashInfer fused collective operations vs standard operations. This benchmark compares: -1. FlashInfer's allreduce_fusion (fused allreduce + rmsnorm + optional quant) -2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations +1. FlashInfer's allreduce_fusion with trtllm backend + (fused allreduce + rmsnorm + optional FP8/FP4 quant) +2. FlashInfer's allreduce_fusion with mnnvl backend + (fused allreduce + rmsnorm only, no quantization support) +3. 
Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations Usage with torchrun: torchrun --nproc_per_node=2 benchmark_fused_collective.py @@ -48,8 +51,12 @@ logger = init_logger(__name__) # Try to import FlashInfer +TorchDistBackend = None try: import flashinfer.comm as flashinfer_comm # type: ignore + from flashinfer.comm.mnnvl import ( # type: ignore + TorchDistBackend, + ) if not ( hasattr(flashinfer_comm, "allreduce_fusion") @@ -74,11 +81,15 @@ 8: 64 * MiB, # 64MB } -# Global workspace tensor for FlashInfer -_FI_WORKSPACE = None +# Global workspace tensors for FlashInfer (keyed by backend name) +_FI_WORKSPACES: dict = {} + +# Backends to benchmark +FLASHINFER_BACKENDS = ["trtllm", "mnnvl"] def setup_flashinfer_workspace( + backend: str, world_size: int, rank: int, hidden_dim: int, @@ -86,41 +97,54 @@ def setup_flashinfer_workspace( dtype: torch.dtype, ): """Setup FlashInfer workspace for fused allreduce operations.""" - global _FI_WORKSPACE + global FI_WORKSPACES if flashinfer_comm is None: - return None, None + return None if world_size not in _FI_MAX_SIZES: logger.warning("FlashInfer not supported for world size %s", world_size) - return None, None + return None try: + kwargs = {} + if TorchDistBackend is not None: + kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD) + workspace = flashinfer_comm.create_allreduce_fusion_workspace( - backend="trtllm", + backend=backend, world_size=world_size, rank=rank, max_token_num=max_token_num, hidden_dim=hidden_dim, dtype=dtype, + **kwargs, ) - _FI_WORKSPACE = workspace + _FI_WORKSPACES[backend] = workspace return workspace except Exception as e: - logger.error("Failed to setup FlashInfer workspace: %s", e) + logger.error( + "Failed to setup FlashInfer workspace (backend=%s): %s", backend, e + ) return None -def cleanup_flashinfer_workspace(workspace): - """Cleanup FlashInfer workspace.""" - if flashinfer_comm is None or workspace is None: +def cleanup_flashinfer_workspaces(): + 
"""Cleanup all FlashInfer workspaces.""" + if flashinfer_comm is None: return - try: - workspace.destroy() - except Exception as e: - logger.error("Failed to cleanup FlashInfer workspace: %s", e) + for backend, workspace in _FI_WORKSPACES.items(): + try: + workspace.destroy() + except Exception as e: + logger.error( + "Failed to cleanup FlashInfer workspace (backend=%s): %s", + backend, + e, + ) + _FI_WORKSPACES.clear() class FlashInferFusedAllReduceParams: @@ -134,7 +158,7 @@ def __init__( self.fp32_acc = True self.max_token_num = max_token_num - def get_trtllm_fused_allreduce_kwargs(self): + def get_flashinfer_fused_allreduce_kwargs(self): return { "launch_with_pdl": self.launch_with_pdl, "fp32_acc": self.fp32_acc, @@ -147,11 +171,12 @@ def flashinfer_fused_allreduce_rmsnorm( rms_gamma: torch.Tensor, rms_eps: float, allreduce_params: "FlashInferFusedAllReduceParams", + workspace: object, use_oneshot: bool, norm_out: torch.Tensor | None = None, ): """FlashInfer fused allreduce + rmsnorm operation.""" - if flashinfer_comm is None or _FI_WORKSPACE is None: + if flashinfer_comm is None or workspace is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -160,9 +185,13 @@ def flashinfer_fused_allreduce_rmsnorm( else: residual_out = input_tensor + layout_code = None + if workspace.backend == "trtllm": + layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4 + flashinfer_comm.allreduce_fusion( input=input_tensor, - workspace=_FI_WORKSPACE, + workspace=workspace, pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm, residual_in=residual, residual_out=residual_out, @@ -171,10 +200,10 @@ def flashinfer_fused_allreduce_rmsnorm( rms_eps=rms_eps, quant_out=None, scale_out=None, - layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, + layout_code=layout_code, scale_factor=None, use_oneshot=use_oneshot, - **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + 
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(), ) @@ -185,12 +214,16 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( rms_eps: float, scale_factor: torch.Tensor, allreduce_params: FlashInferFusedAllReduceParams, + workspace: object, use_oneshot: bool = True, norm_out: torch.Tensor | None = None, quant_out: torch.Tensor | None = None, ): - """FlashInfer fused allreduce + rmsnorm + FP8 quantization.""" - if flashinfer_comm is None or _FI_WORKSPACE is None: + """FlashInfer fused allreduce + rmsnorm + FP8 quantization. + + Note: Only supported by the trtllm backend. + """ + if flashinfer_comm is None or workspace is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -201,7 +234,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( flashinfer_comm.allreduce_fusion( input=input_tensor, - workspace=_FI_WORKSPACE, + workspace=workspace, pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, residual_in=residual, residual_out=residual_out, @@ -213,7 +246,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, scale_factor=scale_factor, use_oneshot=use_oneshot, - **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + **allreduce_params.get_flashinfer_fused_allreduce_kwargs(), ) @@ -224,13 +257,17 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( rms_eps: float, input_global_scale: torch.Tensor, allreduce_params: FlashInferFusedAllReduceParams, + workspace: object, quant_out: torch.Tensor, use_oneshot: bool, output_scale: torch.Tensor, norm_out: torch.Tensor | None = None, ): - """FlashInfer fused allreduce + rmsnorm + FP4 quantization.""" - if flashinfer_comm is None or _FI_WORKSPACE is None: + """FlashInfer fused allreduce + rmsnorm + FP4 quantization. + + Note: Only supported by the trtllm backend. 
+ """ + if flashinfer_comm is None or workspace is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -241,7 +278,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( flashinfer_comm.allreduce_fusion( input=input_tensor, - workspace=_FI_WORKSPACE, + workspace=workspace, pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, residual_in=residual, residual_out=residual_out, @@ -253,7 +290,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, scale_factor=input_global_scale, use_oneshot=use_oneshot, - **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + **allreduce_params.get_flashinfer_fused_allreduce_kwargs(), ) @@ -386,13 +423,16 @@ def run_benchmarks( dtype: torch.dtype, use_residual: bool, allreduce_params: FlashInferFusedAllReduceParams | None, + workspaces: dict, quant_modes: set[str], no_oneshot: bool, ): """Run all benchmarks for given configuration. Args: - quant_mode: "none", "fp8_only", "fp4_only", or "all" + allreduce_params: Shared parameters for FlashInfer fused allreduce. + workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace. + quant_modes: Set of quantization modes: "none", "fp8", "fp4". 
""" ( input_tensor, @@ -454,10 +494,11 @@ def run_benchmarks( logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e) results["standard_allreduce_rmsnorm_native_compiled"] = float("inf") - # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot - if flashinfer_comm is not None and allreduce_params is not None: + # FlashInfer Fused AllReduce + RMSNorm (all backends) + for backend, workspace in workspaces.items(): for use_oneshot in use_oneshot_options: suffix = "_oneshot" if use_oneshot else "_twoshot" + key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}" try: time_ms = benchmark_operation( flashinfer_fused_allreduce_rmsnorm, @@ -467,14 +508,17 @@ def run_benchmarks( rms_gamma=rms_gamma, rms_eps=rms_eps, allreduce_params=allreduce_params, + workspace=workspace, use_oneshot=use_oneshot, ) - results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms + results[key] = time_ms except Exception as e: - logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e) - results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float( - "inf" + logger.error( + "FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s", + backend, + e, ) + results[key] = float("inf") if "fp8" in quant_modes: # Standard AllReduce + RMSNorm + FP8 Quant @@ -540,10 +584,12 @@ def run_benchmarks( "inf" ) - # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot - if flashinfer_comm is not None and allreduce_params is not None: + # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only) + if "trtllm" in workspaces: + trtllm_ws = workspaces["trtllm"] for use_oneshot in use_oneshot_options: suffix = "_oneshot" if use_oneshot else "_twoshot" + key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}" try: time_ms = benchmark_operation( flashinfer_fused_allreduce_rmsnorm_fp8_quant, @@ -555,19 +601,16 @@ def run_benchmarks( scale_factor=scale_fp8, quant_out=quant_out_fp8, allreduce_params=allreduce_params, + workspace=trtllm_ws, use_oneshot=use_oneshot, ) - 
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( - time_ms - ) + results[key] = time_ms except Exception as e: logger.error( - "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s", + "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s", e, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( - float("inf") - ) + results[key] = float("inf") if "fp4" in quant_modes and current_platform.has_device_capability(100): # Standard AllReduce + RMSNorm + FP4 Quant @@ -627,10 +670,12 @@ def run_benchmarks( "inf" ) - # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot - if flashinfer_comm is not None and allreduce_params is not None: + # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only) + if "trtllm" in workspaces: + trtllm_ws = workspaces["trtllm"] for use_oneshot in use_oneshot_options: suffix = "_oneshot" if use_oneshot else "_twoshot" + key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}" try: time_ms = benchmark_operation( flashinfer_fused_allreduce_rmsnorm_fp4_quant, @@ -641,49 +686,18 @@ def run_benchmarks( rms_eps=rms_eps, input_global_scale=scale_fp4, allreduce_params=allreduce_params, + workspace=trtllm_ws, quant_out=fp4_quant_out, output_scale=fp4_output_scale, use_oneshot=use_oneshot, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( - time_ms - ) + results[key] = time_ms except Exception as e: logger.error( - "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s", + "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s", e, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( - float("inf") - ) - - # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot - if flashinfer_comm is not None and allreduce_params is not None: - try: - time_ms = benchmark_operation( - flashinfer_fused_allreduce_rmsnorm_fp4_quant, - input_tensor, - residual=residual, - norm_out=norm_out, - rms_gamma=rms_gamma, - rms_eps=rms_eps, - 
input_global_scale=scale_fp4, - allreduce_params=allreduce_params, - quant_out=fp4_quant_out, - output_scale=fp4_output_scale, - use_oneshot=False, - ) - results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = ( - time_ms - ) - except Exception as e: - logger.error( - "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s", - e, - ) - results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float( - "inf" - ) + results[key] = float("inf") return results @@ -1021,8 +1035,7 @@ def main(): configs = list(itertools.product(args.num_tokens, dtypes, residual_options)) - # Setup FlashInfer workspace if available - workspace = None + # Setup FlashInfer workspaces for all backends allreduce_params = None if flashinfer_comm is not None: @@ -1037,15 +1050,17 @@ def main(): args.hidden_dim * max_element_size ) - workspace = setup_flashinfer_workspace( - world_size, - rank, - args.hidden_dim, - max_num_token, - dtype=workspace_dtype, - ) + for backend in FLASHINFER_BACKENDS: + setup_flashinfer_workspace( + backend=backend, + world_size=world_size, + rank=rank, + hidden_dim=args.hidden_dim, + max_token_num=max_num_token, + dtype=workspace_dtype, + ) - if workspace is not None: + if _FI_WORKSPACES: allreduce_params = FlashInferFusedAllReduceParams( max_token_num=max_num_token, ) @@ -1071,6 +1086,7 @@ def main(): dtype, use_residual, allreduce_params, + workspaces=_FI_WORKSPACES, quant_modes=quant_modes, no_oneshot=args.no_oneshot, ) @@ -1109,11 +1125,13 @@ def main(): finally: # Cleanup - if workspace is not None: - cleanup_flashinfer_workspace(workspace) + cleanup_flashinfer_workspaces() dist.barrier() if __name__ == "__main__": - main() + from vllm.config import VllmConfig, set_current_vllm_config + + with set_current_vllm_config(VllmConfig()): + main() diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py index d48f22970313..6d5113b1e84b 100644 --- 
a/tests/compile/passes/distributed/test_fusion_all_reduce.py +++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py @@ -142,7 +142,6 @@ def __init__(self, hidden_size=16, token_num=16, eps=1e-6): *(scaled_fp4_quant(w, wg) for w, wg in zip(self.w, wgscale)) ) self.wq, self.wscale = list(wq_gen), list(wscale_gen) - print(f"{self.wq=}, {self.wscale=}") def forward(self, hidden_states): # avoid having graph input be an arg to a pattern directly @@ -199,6 +198,7 @@ def ops_in_model_before(self): @pytest.mark.parametrize("hidden_size", [64]) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False]) +@pytest.mark.parametrize("flashinfer_allreduce_backend", ["trtllm", "mnnvl"]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") @pytest.mark.skipif( not find_spec("flashinfer") @@ -215,6 +215,7 @@ def test_all_reduce_fusion_pass_replace( dtype: torch.dtype, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, + flashinfer_allreduce_backend, ): num_processes = 2 if ( @@ -238,6 +239,7 @@ def run_torch_spawn(fn, nprocs): dtype, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, + flashinfer_allreduce_backend, ), nprocs=nprocs, ) @@ -255,6 +257,7 @@ def all_reduce_fusion_pass_on_test_model( dtype: torch.dtype, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, + flashinfer_allreduce_backend, ): set_random_seed(0) @@ -270,6 +273,7 @@ def all_reduce_fusion_pass_on_test_model( "WORLD_SIZE": str(world_size), "MASTER_ADDR": "localhost", "MASTER_PORT": "12345", + "VLLM_FLASHINFER_ALLREDUCE_BACKEND": flashinfer_allreduce_backend, } ) @@ -317,6 +321,10 @@ def all_reduce_fusion_pass_on_test_model( compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states) + results_unfused = model(hidden_states) + results_fused = compiled_model(hidden_states) + torch.testing.assert_close(results_unfused, results_fused, atol=1e-2, rtol=1e-2) + assert 
all_reduce_fusion_pass.matched_count == 4, ( f"{all_reduce_fusion_pass.matched_count=}" ) diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py index b6a1314af9ef..44dc3d67bb98 100644 --- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py @@ -22,7 +22,9 @@ kFp8StaticTensorSym, ) from vllm.platforms import current_platform -from vllm.utils.torch_utils import direct_register_custom_op +from vllm.utils.torch_utils import ( + direct_register_custom_op, +) from ..inductor_pass import enable_fake_mode from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass @@ -44,8 +46,6 @@ except ImportError: pass -logger = init_logger(__name__) - if hasattr(torch.ops._C, "scaled_fp4_quant"): STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default @@ -82,7 +82,16 @@ if flashinfer_comm is not None: - _FI_WORKSPACE = None + from vllm.distributed.device_communicators.flashinfer_all_reduce import ( + destroy_fi_ar_workspace, + get_fi_ar_quant_workspace, + get_fi_ar_workspace, + initialize_fi_ar_quant_workspace, + initialize_fi_ar_workspace, + ) + + ar_fusion_patterns = flashinfer_comm.AllReduceFusionPattern + MiB = 1024 * 1024 def call_trtllm_fused_allreduce_norm( @@ -122,9 +131,19 @@ def call_trtllm_fused_allreduce_norm( max_one_shot_size is None or current_tensor_size <= max_one_shot_size * MiB ) - assert _FI_WORKSPACE is not None, ( - "Flashinfer must be enabled when using flashinfer" + # Select workspace based on pattern: quant patterns use the + # trtllm quant workspace, non-quant patterns use the primary workspace. 
+ if pattern_code in ( + ar_fusion_patterns.kARResidualRMSNormFP8Quant, + ar_fusion_patterns.kARResidualRMSNormFP4Quant, + ): + workspace = get_fi_ar_quant_workspace() + else: + workspace = get_fi_ar_workspace() + assert workspace is not None, ( + "Flashinfer workspace must be initialized when using flashinfer" ) + assert flashinfer_comm is not None if norm_out is None: norm_out = allreduce_in residual_out = residual @@ -133,25 +152,30 @@ def call_trtllm_fused_allreduce_norm( # as flashinfer does not support rms_norm # and allreduce_out together residual_out = allreduce_in - # For the sizes that are smaller than the max size, - # we only use flashinfer one shot allreduce + + layout_code = None + # layout_code only supported by trtllm backend + if workspace.backend == "trtllm": + # in vllm we only support swizzled layout + layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4 + flashinfer_comm.allreduce_fusion( input=allreduce_in, - workspace=_FI_WORKSPACE, + workspace=workspace, pattern=pattern_code, - residual_in=residual, + launch_with_pdl=launch_with_pdl, + output=None, residual_out=residual_out, norm_out=norm_out, + quant_out=quant_out, + scale_out=scale_out, + residual_in=residual, rms_gamma=rms_gamma, rms_eps=rms_eps, - launch_with_pdl=launch_with_pdl, + scale_factor=scale_factor, + layout_code=layout_code, use_oneshot=use_oneshot, fp32_acc=fp32_acc, - quant_out=quant_out, - scale_out=scale_out, - # in vllm we only support swizzled layout - layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, - scale_factor=scale_factor, ) def call_trtllm_fused_allreduce_norm_fake( @@ -729,29 +753,36 @@ def __init__(self, config: VllmConfig) -> None: scope="global", ) - try: - self.workspace = flashinfer_comm.create_allreduce_fusion_workspace( - backend="trtllm", - world_size=self.tp_size, - rank=rank, - max_token_num=self.max_token_num, - hidden_dim=self.hidden_dim, - dtype=self.model_dtype, - ) - except RuntimeError as e: - if "multicast" not in 
str(e).lower(): - raise - logger.warning_once( - "AllReduce fusion pass is disabled: flashinfer workspace " - "creation failed: %s. This is expected on GPUs without " - "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). " - "Falling back to non-fused allreduce.", - str(e), - ) - return + for workspace_init_fn in [ + initialize_fi_ar_workspace, + initialize_fi_ar_quant_workspace, + ]: + try: + workspace_init_fn( + world_size=self.tp_size, + rank=rank, + max_token_num=self.max_token_num, + hidden_dim=self.hidden_dim, + dtype=self.model_dtype, + group=self.group, + ) + except Exception as e: + if "multicast" in str(e).lower(): + logger.warning( + "AllReduce fusion pass is disabled: flashinfer workspace " + "creation failed: %s. This is expected on GPUs without " + "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). " + "Falling back to non-fused allreduce.", + str(e), + ) + else: + logger.warning( + "Failed to initialize FlashInfer All Reduce workspace: %s. " + "AllReduce fusion pass will be disabled.", + e, + ) + return - global _FI_WORKSPACE - _FI_WORKSPACE = self.workspace self.allreduce_params = FlashInferFusedAllReduceParams( world_size=self.tp_size, max_token_num=self.max_token_num, @@ -762,32 +793,34 @@ def __init__(self, config: VllmConfig) -> None: @enable_fake_mode def register_patterns(self) -> None: + supports_quantization = get_fi_ar_quant_workspace() is not None for epsilon in [1e-5, 1e-6]: - AllReduceFusedRMSNormStaticQuantFP8Pattern( - epsilon, - self.model_dtype, - self.device, - self.allreduce_params, - ).register(self.patterns) - AllReduceFusedAddRMSNormStaticQuantFP8Pattern( - epsilon, - self.model_dtype, - self.device, - self.allreduce_params, - ).register(self.patterns) - if current_platform.has_device_capability(100): - AllReduceFusedRMSNormStaticQuantNVFP4Pattern( + if supports_quantization: + AllReduceFusedRMSNormStaticQuantFP8Pattern( epsilon, self.model_dtype, self.device, self.allreduce_params, ).register(self.patterns) - 
AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern( + AllReduceFusedAddRMSNormStaticQuantFP8Pattern( epsilon, self.model_dtype, self.device, self.allreduce_params, ).register(self.patterns) + if current_platform.has_device_capability(100): + AllReduceFusedRMSNormStaticQuantNVFP4Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) AllReduceRMSNormPattern( epsilon, self.model_dtype, @@ -825,6 +858,5 @@ def __call__(self, graph: fx.Graph) -> None: def __del__(self) -> None: if getattr(self, "disabled", True): return - if getattr(self, "workspace", None) is not None: - with contextlib.suppress(Exception): - self.workspace.destroy() + with contextlib.suppress(Exception): + destroy_fi_ar_workspace() diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 4c78871e1fda..62e2b90377fd 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -34,19 +34,25 @@ def __init__( # custom allreduce or torch symm mem can be used only by tp use_custom_allreduce = False use_torch_symm_mem = False + use_flashinfer_allreduce = False else: from vllm.distributed.parallel_state import _ENABLE_CUSTOM_ALL_REDUCE use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM + use_flashinfer_allreduce = envs.VLLM_ALLREDUCE_USE_FLASHINFER self.use_custom_allreduce = use_custom_allreduce self.use_torch_symm_mem = use_torch_symm_mem + self.use_flashinfer_allreduce = use_flashinfer_allreduce # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( CustomAllreduce, ) + from vllm.distributed.device_communicators.flashinfer_all_reduce 
import ( + FlashInferAllReduce, + ) from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.device_communicators.quick_all_reduce import ( QuickAllReduce, @@ -65,12 +71,20 @@ def __init__( self.ca_comm: CustomAllreduce | None = None self.qr_comm: QuickAllReduce | None = None self.symm_mem_comm: SymmMemCommunicator | None = None + self.fi_ar_comm: FlashInferAllReduce | None = None + if use_torch_symm_mem and current_platform.is_cuda(): self.symm_mem_comm = SymmMemCommunicator( group=self.cpu_group, device=self.device, ) + if self.use_flashinfer_allreduce and self.world_size > 1: + self.fi_ar_comm = FlashInferAllReduce( + group=self.cpu_group, + device=self.device, + ) + if use_custom_allreduce and self.world_size > 1: # Initialize a custom fast all-reduce implementation. self.ca_comm = CustomAllreduce( @@ -136,7 +150,7 @@ def all_reduce(self, input_): out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_) if out is not None: return out - # always try quick reduce first, then custom allreduce, + # always try quick reduce first, then flashinfer, then custom allreduce, # and then pynccl. 
(quick reduce just for ROCM MI3*) qr_comm = self.qr_comm if ( @@ -147,6 +161,15 @@ def all_reduce(self, input_): out = qr_comm.quick_all_reduce(input_) assert out is not None return out + fi_ar_comm = self.fi_ar_comm + if ( + fi_ar_comm is not None + and not fi_ar_comm.disabled + and fi_ar_comm.should_use_fi_ar(input_) + ): + out = fi_ar_comm.all_reduce(input_) + assert out is not None + return out ca_comm = self.ca_comm if ( ca_comm is not None @@ -270,6 +293,9 @@ def destroy(self): self.pynccl_comm = None if self.ca_comm is not None: self.ca_comm = None + if self.fi_ar_comm is not None: + self.fi_ar_comm.destroy() + self.fi_ar_comm = None if self.all2all_manager is not None: self.all2all_manager.destroy() self.all2all_manager = None diff --git a/vllm/distributed/device_communicators/flashinfer_all_reduce.py b/vllm/distributed/device_communicators/flashinfer_all_reduce.py new file mode 100644 index 000000000000..ea16c93763cb --- /dev/null +++ b/vllm/distributed/device_communicators/flashinfer_all_reduce.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm.config.compilation import PassConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +fi_ar_available = False +try: + import flashinfer.comm as flashinfer_comm # type: ignore[no-redef] + from flashinfer.comm.mnnvl import ( + TorchDistBackend, # type: ignore[import-not-found, no-redef] + ) + + fi_ar_available = hasattr(flashinfer_comm, "allreduce_fusion") +except ImportError: + pass + +# Global workspace for standalone allreduce and non-quant ar+rms fusion +_fi_ar_workspace = None +# Extra workspace for quant fusion patterns (only supported by trtllm backend) +# Only created if primary workspace is not already trtllm 
+_fi_ar_quant_workspace = None + + +def get_fi_ar_workspace(): + return _fi_ar_workspace + + +def get_fi_ar_quant_workspace(): + return _fi_ar_quant_workspace + + +def initialize_fi_ar_workspace( + world_size: int, + rank: int, + max_token_num: int, + hidden_dim: int, + dtype: torch.dtype, + group: ProcessGroup, +) -> None: + """ + Initialize the workspace if not already initialized. + + Currently, this function is called by either the AllReduceFusionPass + or the FlashInferAllReduce backend for standalone allreduce. + If the fusion pass is enabled via + --compilation-config.pass_config.fuse_allreduce_rms=true, + it will create the workspace first, and the standalone backend + will reuse the workspace. Otherwise, the standalone backend will + create the workspace. + """ + global _fi_ar_workspace + if _fi_ar_workspace is not None: + return + + backend = envs.VLLM_FLASHINFER_ALLREDUCE_BACKEND + comm_backend = TorchDistBackend(group=group) + _fi_ar_workspace = flashinfer_comm.create_allreduce_fusion_workspace( + backend=backend, + world_size=world_size, + rank=rank, + max_token_num=max_token_num, + hidden_dim=hidden_dim, + dtype=dtype, + comm_backend=comm_backend, + ) + assert _fi_ar_workspace is not None + logger.debug( + "Initialized FlashInfer All Reduce workspace: backend=%s, " + "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s", + backend, + world_size, + rank, + max_token_num, + hidden_dim, + dtype, + ) + + +def initialize_fi_ar_quant_workspace( + world_size: int, + rank: int, + max_token_num: int, + hidden_dim: int, + dtype: torch.dtype, + group: ProcessGroup, +) -> None: + """ + Initialize the workspace used by quantization fusion patterns. + + Currently this always creates a workspace for trtllm backend as only it + supports quantization fusion (FP8/FP4). If the primary workspace + is already trtllm, the quant workspace aliases to it. 
+ """ + global _fi_ar_quant_workspace + if _fi_ar_quant_workspace is not None: + return + + # If primary workspace is already trtllm, reuse it + if _fi_ar_workspace is not None and _fi_ar_workspace.backend == "trtllm": + _fi_ar_quant_workspace = _fi_ar_workspace + return + + comm_backend = TorchDistBackend(group=group) + _fi_ar_quant_workspace = flashinfer_comm.create_allreduce_fusion_workspace( + backend="trtllm", + world_size=world_size, + rank=rank, + max_token_num=max_token_num, + hidden_dim=hidden_dim, + dtype=dtype, + comm_backend=comm_backend, + ) + assert _fi_ar_quant_workspace is not None + logger.debug( + "Initialized FlashInfer All Reduce workspace: backend=trtllm, " + "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s", + world_size, + rank, + max_token_num, + hidden_dim, + dtype, + ) + + +def destroy_fi_ar_workspace(): + global _fi_ar_workspace + global _fi_ar_quant_workspace + if ( + _fi_ar_quant_workspace is not None + and _fi_ar_quant_workspace is not _fi_ar_workspace + ): + _fi_ar_quant_workspace.destroy() + _fi_ar_quant_workspace = None + if _fi_ar_workspace is not None: + _fi_ar_workspace.destroy() + _fi_ar_workspace = None + + +class FlashInferAllReduce: + def __init__( + self, + group: ProcessGroup, + device: int | str | torch.device, + ): + self.disabled = True + + if not fi_ar_available: + logger.info( + "FlashInfer All Reduce is disabled because flashinfer is not available" + ) + return + + if not current_platform.is_cuda(): + logger.info( + "FlashInfer All Reduce is disabled because it requires CUDA platform" + ) + return + + self.group = group + self.world_size = dist.get_world_size(self.group) + self.rank = dist.get_rank(self.group) + self.device = device + if self.world_size == 1: + return + + # Use the same threshold as the allreduce-rms fusion pass + # TODO: tune the threshold + MiB = 1024 * 1024 + max_workspace_size = PassConfig.default_fi_allreduce_fusion_max_size_mb().get( + self.world_size, None + ) + if not 
max_workspace_size: + logger.warning( + "FlashInfer All Reduce is disabled because it " + "is not supported for world_size=%d.", + self.world_size, + ) + return + self.max_workspace_size = max_workspace_size * MiB + self.max_num_tokens = 0 + self.disabled = False + + def _ensure_workspace(self, hidden_dim: int, dtype: torch.dtype) -> bool: + """Ensure the all reduce workspace is initialized.""" + if get_fi_ar_workspace() is not None: + return True + if self.max_num_tokens == 0: + element_size = torch.tensor([], dtype=dtype, device="cpu").element_size() + self.max_num_tokens = self.max_workspace_size // (hidden_dim * element_size) + try: + initialize_fi_ar_workspace( + world_size=self.world_size, + rank=self.rank, + max_token_num=self.max_num_tokens, + hidden_dim=hidden_dim, + dtype=dtype, + group=self.group, + ) + return True + except Exception as e: + logger.warning( + "Failed to initialize FlashInfer All Reduce workspace: %s. " + "FlashInfer All Reduce will be disabled.", + e, + ) + self.disabled = True + return False + + def should_use_fi_ar(self, input_tensor: torch.Tensor) -> bool: + if self.disabled: + return False + + if not input_tensor.is_cuda: + return False + + if not input_tensor.is_contiguous(): + return False + + if len(input_tensor.shape) != 2: + return False + + num_tokens, hidden_dim = input_tensor.shape + if not self.max_num_tokens: + element_size = torch.tensor([], dtype=input_tensor.dtype).element_size() + self.max_num_tokens = self.max_workspace_size // (hidden_dim * element_size) + + if num_tokens > self.max_num_tokens: + return False + + return self._ensure_workspace(hidden_dim, input_tensor.dtype) + + def all_reduce(self, input_tensor: torch.Tensor) -> torch.Tensor: + workspace = get_fi_ar_workspace() + return flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=workspace, + pattern=flashinfer_comm.AllReduceFusionPattern.kAllReduce, + ) + + def destroy(self): + if not self.disabled: + destroy_fi_ar_workspace() diff --git 
a/vllm/envs.py b/vllm/envs.py index d62438d5735b..d560cfc7753c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -168,6 +168,7 @@ VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = ( "latency" ) + VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto" VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024 VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 @@ -206,6 +207,7 @@ VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True + VLLM_ALLREDUCE_USE_FLASHINFER: bool = False VLLM_TUNED_CONFIG_FOLDER: str | None = None VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set() VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: bool = False @@ -1290,6 +1292,14 @@ def _get_or_set_default() -> str: "latency", ["throughput", "latency", "masked_gemm"], ), + # Flashinfer fused allreduce backend. + # "auto" will default to "mnnvl", which performs mostly same/better than "trtllm". + # But "mnnvl" backend does not support fuse with quantization. + "VLLM_FLASHINFER_ALLREDUCE_BACKEND": env_with_choices( + "VLLM_FLASHINFER_ALLREDUCE_BACKEND", + "auto", + ["auto", "trtllm", "mnnvl"], + ), # Control the workspace buffer size for the FlashInfer backend. 
"VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int( os.getenv("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", str(394 * 1024 * 1024)) @@ -1448,6 +1458,10 @@ def _get_or_set_default() -> str: "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool( int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1")) ), + # Whether to use FlashInfer allreduce + "VLLM_ALLREDUCE_USE_FLASHINFER": lambda: bool( + int(os.getenv("VLLM_ALLREDUCE_USE_FLASHINFER", "0")) + ), # Experimental: use this to enable MCP tool calling for non harmony models "VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": lambda: bool( int(os.getenv("VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT", "0")) From 13025e71e888330aa3277948120051ebbc2674c7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 25 Feb 2026 20:42:40 -0800 Subject: [PATCH 006/154] [Model Runner V2] Add coding style guide (#35325) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index ccab6cec8c78..9e0cae6feef1 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -1,5 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +NOTE: Coding style guide for this file: +This model runner is shared by all models: text and multimodal, generative +and embedding, public and private. As a result, this file must only contain +code that is common to every model. Model-specific behavior belongs in the +appropriate model-specific files. + +In other words: +* Be paranoid about changing this file. It should remain stable. +* Be even more paranoid about adding new lines. It should remain minimal. + +Even for shared features (for example, different parallelism modes), keep the +complexity out of this path. The less common the feature, the more it should be +hidden. 
Prefer utility functions defined elsewhere and call them from here, +instead of embedding feature-specific logic directly. +""" + import functools import gc import time From 4171ff6dd9ce18f452c4e9267f5bf090c0989b04 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Thu, 26 Feb 2026 05:00:10 +0000 Subject: [PATCH 007/154] [CPU][Feat] Enable KleidiAI INT8_W4A8 for all input dtypes (#34890) Signed-off-by: Fadi Arafeh Co-authored-by: Tyler Michael Smith --- .../linear/mixed_precision/dynamic_4bit.py | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py b/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py index 3dfe06f1b130..d0515027628e 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py @@ -42,12 +42,13 @@ def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: not in [ torch.float32, torch.bfloat16, + torch.float16, ] ): return ( False, "Dynamic4bitLinearKernel on Arm requires Float32 or" - " BFloat16 activations", + " BFloat16 or Float16 activations", ) if c.full_weight_shape[0] % c.group_size != 0: return ( @@ -118,8 +119,30 @@ def apply_weights( x: torch.Tensor, bias: torch.Tensor | None = None, ) -> torch.Tensor: + # PyTorch / KleidiAI kernels natively support the following configs: + # - channelwise with bfloat16 / float32 activations + # - groupwise with float32 activations + # To support: + # - groupwise with bfloat16/float16 activations: we need to upcast + # activations to float32 before matmul and downcast back to bfloat16/float16 + # - channelwise with float16 activations, we need to upcast activations to + # float32 before matmul and downcast back to float16 + # Note: these activations will be dynamically quantized to int8 by the kernel. 
+ c = self.config + is_groupwise = c.group_size != c.partition_weight_shape[0] + # dtype of activations before they get dynamically quantized to int8 + original_pre_quant_act_dtype = x.dtype + pre_quant_act_dtype = original_pre_quant_act_dtype + if ( + is_groupwise and pre_quant_act_dtype == torch.bfloat16 + ) or pre_quant_act_dtype == torch.float16: + pre_quant_act_dtype = torch.float32 + x_2d = x.reshape(-1, x.shape[-1]) + if pre_quant_act_dtype != original_pre_quant_act_dtype: + x_2d = x_2d.to(pre_quant_act_dtype) + out_shape = x.shape[:-1] + (c.partition_weight_shape[1],) w_q = getattr(layer, self.w_q_name) @@ -129,5 +152,8 @@ def apply_weights( c.group_size, c.partition_weight_shape[0], c.partition_weight_shape[1], - ) - return output.reshape(out_shape) + ).reshape(out_shape) + + if pre_quant_act_dtype != original_pre_quant_act_dtype: + output = output.to(original_pre_quant_act_dtype) + return output From 9d379410179b649f4e7651940debc35c4ac7c0a5 Mon Sep 17 00:00:00 2001 From: Jason Li Date: Wed, 25 Feb 2026 21:00:12 -0800 Subject: [PATCH 008/154] [torch.compile] Sequence Parallelism threshold compile ranges (#28672) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jasonlizhengjian Signed-off-by: Jason Li Co-authored-by: Claude Opus 4.6 Co-authored-by: Luka Govedič --- tests/compile/conftest.py | 34 +++++ tests/compile/fusions_e2e/conftest.py | 89 ++++++++++-- .../compile/fusions_e2e/test_tp2_async_tp.py | 133 ++++++++++++++++++ tests/compile/test_config.py | 1 + .../test_sequence_parallelism_threshold.py | 110 +++++++++++++++ .../passes/fusion/sequence_parallelism.py | 123 +++++++++++++--- vllm/config/compilation.py | 9 +- vllm/config/vllm.py | 57 +++++++- 8 files changed, 524 insertions(+), 32 deletions(-) create mode 100644 tests/compile/conftest.py create mode 100644 tests/compile/test_sequence_parallelism_threshold.py diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py new file mode 
100644 index 000000000000..6aafac7bcad3 --- /dev/null +++ b/tests/compile/conftest.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager +from unittest.mock import MagicMock, patch + +import pytest + +from vllm.platforms.interface import DeviceCapability + + +@pytest.fixture +def mock_cuda_platform(): + """ + Fixture that returns a factory for creating mocked CUDA platforms. + + Usage: + def test_something(mock_cuda_platform): + with mock_cuda_platform(is_cuda=True, capability=(9, 0)): + # test code + """ + + @contextmanager + def _mock_platform(is_cuda: bool = True, capability: tuple[int, int] | None = None): + mock_platform = MagicMock() + mock_platform.is_cuda.return_value = is_cuda + if capability is not None: + mock_platform.get_device_capability.return_value = DeviceCapability( + *capability + ) + with patch("vllm.platforms.current_platform", mock_platform): + yield mock_platform + + return _mock_platform diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py index 1d9f6cda9fd6..40b4de57f66f 100644 --- a/tests/compile/fusions_e2e/conftest.py +++ b/tests/compile/fusions_e2e/conftest.py @@ -94,7 +94,7 @@ def run( run_model(full_compilation_config, model_name, **model_kwargs) num_compile_ranges = len(full_compilation_config.get_compile_ranges()) - assert num_compile_ranges in [1, 2] + assert num_compile_ranges in [1, 2, 3] print(f"Compile ranges: {full_compilation_config.get_compile_ranges()}") print("Fusion results:") @@ -107,12 +107,33 @@ def run( # Now check the matches for match_name in matches_check: - num_ranges_activated = ( - 1 if match_name == "ar_rms_fusion" else num_compile_ranges - ) - n_expected = tp_size * num_ranges_activated - log_matches = list(int(ms) for ms in log_matches_dict[match_name]) + + # AR+RMS skips the largest range; SP skips the smallest. 
+ # When both are enabled, AR+RMS activation count is + # model-dependent (hidden_size affects threshold), so derive + # from log data. + if ( + match_name == "ar_rms_fusion" + and "sequence_parallel" in matches_check + and num_compile_ranges >= 2 + ): + assert ( + len(log_matches) >= tp_size and len(log_matches) % tp_size == 0 + ), ( + f"Expected multiple of {tp_size} ar_rms log entries, " + f"found {len(log_matches)}" + ) + num_ranges_activated = len(log_matches) // tp_size + elif ( + match_name in ("ar_rms_fusion", "sequence_parallel") + and num_compile_ranges >= 2 + ): + num_ranges_activated = num_compile_ranges - 1 + else: + num_ranges_activated = num_compile_ranges + + n_expected = tp_size * num_ranges_activated assert len(log_matches) == n_expected, ( f"Could not find {n_expected} {match_name} " f"(found {len(log_matches)}) in:\n {log_holder.text}" @@ -122,8 +143,8 @@ def run( if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check: # AR+rms+quant takes precedence over rms+quant if activated. - # That means we get full matching where ar+rms+quant was not activated, - # and less where it was + # That means we get full matching where ar+rms+quant was not + # activated, and less where it was (only the smallest range). assert sum(m == expected_matches for m in log_matches) == tp_size * ( num_ranges_activated - 1 ), "Expecting full rms+quant fusion where ar+rms+quant not activated" @@ -135,6 +156,43 @@ def run( f"Expecting at least {expected_matches - matches.ar_rms_fusion} " f"where ar+rms+quant was activated" ) + elif ( + match_name == "async_tp" + and "sequence_parallel" in matches_check + and num_compile_ranges >= 2 + ): + # AsyncTP only finds patterns on ranges where SP ran. 
+ n_sp_ranges = num_compile_ranges - 1 + assert ( + sum(m == expected_matches for m in log_matches) + == tp_size * n_sp_ranges + ), ( + f"Expecting {expected_matches} async_tp on " + f"{tp_size * n_sp_ranges} SP-range entries, " + f"found: {log_matches}" + ) + assert sum(m == 0 for m in log_matches) == tp_size, ( + f"Expecting 0 async_tp on {tp_size} small-range entries " + f"(no SP), found: {log_matches}" + ) + elif ( + match_name == "ar_rms_fusion" + and "sequence_parallel" in matches_check + and num_compile_ranges >= 2 + ): + # SP consumes allreduce patterns first, so AR+RMS finds + # full matches only on the smallest range (no SP). + assert sum(m == expected_matches for m in log_matches) == tp_size, ( + f"Expecting {expected_matches} ar_rms on " + f"{tp_size} small-range entries, found: {log_matches}" + ) + assert sum(m == 0 for m in log_matches) == tp_size * ( + num_ranges_activated - 1 + ), ( + f"Expecting 0 ar_rms on " + f"{tp_size * (num_ranges_activated - 1)} large-range " + f"entries (SP took precedence), found: {log_matches}" + ) else: expected_matches_list = [expected_matches] * n_expected assert sorted(log_matches) == expected_matches_list, ( @@ -142,7 +200,7 @@ def run( f"found: {sorted(log_matches)}" ) - if match_name == "ar_rms_fusion": + if match_name == "ar_rms_fusion" and num_compile_ranges >= 2: log_matches = re.findall( r"pass_manager.py:\d+] Skipping " r".*AllReduceFusionPass.* with compile range", @@ -155,4 +213,17 @@ def run( f"(found {len(log_matches)}) in:\n {log_holder.text}" ) + if match_name == "sequence_parallel" and num_compile_ranges >= 2: + log_matches = re.findall( + r"pass_manager.py:\d+] Skipping " + r".*SequenceParallelismPass.* with compile range", + log_holder.text, + ) + + n_expected = tp_size * (num_compile_ranges - num_ranges_activated) + assert len(log_matches) == n_expected, ( + f'Could not find {n_expected} "Skipping SequenceParallelismPass" ' + f"(found {len(log_matches)}) in:\n {log_holder.text}" + ) + return run diff 
--git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index 4769ca1e0b63..921839ea0692 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -66,6 +66,9 @@ def test_tp2_async_tp_fp8_fusions( enable_qk_norm_rope_fusion=True, enable_sp=True, fuse_gemm_comms=True, + fuse_allreduce_rms=False, + # Override threshold for testing (models have small hidden_size) + sp_min_token_num=512, ), ) @@ -123,6 +126,9 @@ def test_tp2_async_tp_fusions( enable_qk_norm_rope_fusion=True, enable_sp=True, fuse_gemm_comms=True, + fuse_allreduce_rms=False, + # Override threshold for testing (models have small hidden_size) + sp_min_token_num=512, ), ) @@ -141,3 +147,130 @@ def test_tp2_async_tp_fusions( matches_check, tp_size=2, ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b_fp8, llama4_scout_fp8], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp2_sp_ar_rms_fp8_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, + monkeypatch, +): + matches = matches_fn(n_layers) + + if is_blackwell(): + # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns + monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel") + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + 
compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + fuse_norm_quant=True, + fuse_act_quant=True, + fuse_attn_quant=True, + enable_qk_norm_rope_fusion=True, + enable_sp=True, + fuse_gemm_comms=True, + fuse_allreduce_rms=True, + # Override threshold for testing (models have small hidden_size) + sp_min_token_num=512, + ), + ) + + matches_check = [ + "rms_quant_fusion", + "act_quant_fusion", + "norm_rope_fusion", + "attn_quant_fusion", + "ar_rms_fusion", + "sequence_parallel", + "async_tp", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b, qwen3_a3b], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp2_sp_ar_rms_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, +): + matches = matches_fn(n_layers) + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + enable_qk_norm_rope_fusion=True, + enable_sp=True, + fuse_gemm_comms=True, + fuse_allreduce_rms=True, + # Override threshold for testing (models have small hidden_size) + sp_min_token_num=512, + ), + ) + + 
matches_check = [ + "norm_rope_fusion", + "ar_rms_fusion", + "sequence_parallel", + "async_tp", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index eb2f0669ed5f..3ba70b6aad38 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init( fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True, + sp_min_token_num=512 if enable_sp else None, ), cudagraph_mode=cudagraph_mode, ) diff --git a/tests/compile/test_sequence_parallelism_threshold.py b/tests/compile/test_sequence_parallelism_threshold.py new file mode 100644 index 000000000000..42e374cd95d7 --- /dev/null +++ b/tests/compile/test_sequence_parallelism_threshold.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.compilation.passes.fusion.sequence_parallelism import ( + SP_MIN_HIDDEN_SIZE, + SP_MIN_PER_GPU_SIZE_MB, + get_sequence_parallelism_threshold, +) + + +class TestGetSequenceParallelismThreshold: + """Tests for get_sequence_parallelism_threshold function.""" + + def test_non_cuda_returns_none(self, mock_cuda_platform): + """Non-CUDA platforms should return None.""" + with mock_cuda_platform(is_cuda=False): + result = get_sequence_parallelism_threshold( + hidden_size=8192, tp_size=2, element_size=2 + ) + assert result is None + + def test_unsupported_device_capability_returns_none(self, mock_cuda_platform): + """Unsupported device capabilities (e.g., sm80) should return None.""" + with mock_cuda_platform(capability=(8, 0)): + result = get_sequence_parallelism_threshold( + hidden_size=8192, tp_size=2, element_size=2 + ) + assert result is None + + def test_small_hidden_size_returns_none(self, mock_cuda_platform): + """H100 with hidden_size below threshold 
should return None.""" + with mock_cuda_platform(capability=(9, 0)): + result = get_sequence_parallelism_threshold( + hidden_size=4096, + tp_size=2, + element_size=2, # 4096 < 8192 + ) + assert result is None + + def test_h100_large_model_returns_threshold(self, mock_cuda_platform): + """H100 with large enough hidden_size should return calculated threshold.""" + with mock_cuda_platform(capability=(9, 0)): + hidden_size = 8192 + tp_size = 2 + element_size = 2 # float16/bfloat16 + + result = get_sequence_parallelism_threshold( + hidden_size=hidden_size, + tp_size=tp_size, + element_size=element_size, + ) + + # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024 + MiB = 1024 * 1024 + expected = int( + (SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * MiB) + // (hidden_size * element_size) + ) + assert result == expected + assert result == 1024 + + @pytest.mark.parametrize( + "hidden_size,tp_size,element_size,expected", + [ + # Boundary: exactly at min hidden size threshold, tp_size=1 + # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512 + (8192, 1, 2, 512), + # Larger hidden size reduces token threshold + # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256 + (16384, 1, 2, 256), + # Larger tp_size increases token threshold + # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048 + (8192, 4, 2, 2048), + # Larger element_size (fp32) reduces token threshold + # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512 + (8192, 2, 4, 512), + ], + ) + def test_threshold_calculation_variations( + self, mock_cuda_platform, hidden_size, tp_size, element_size, expected + ): + """Test threshold calculation with various parameter combinations.""" + with mock_cuda_platform(capability=(9, 0)): + result = get_sequence_parallelism_threshold( + hidden_size=hidden_size, + tp_size=tp_size, + element_size=element_size, + ) + assert result == expected + + def test_hidden_size_boundary(self, mock_cuda_platform): + """Test behavior at the exact hidden_size boundary.""" + with mock_cuda_platform(capability=(9, 0)): + # Just below 
threshold + result = get_sequence_parallelism_threshold( + hidden_size=SP_MIN_HIDDEN_SIZE[90] - 1, + tp_size=2, + element_size=2, + ) + assert result is None + + # Exactly at threshold + result = get_sequence_parallelism_threshold( + hidden_size=SP_MIN_HIDDEN_SIZE[90], + tp_size=2, + element_size=2, + ) + assert result is not None diff --git a/vllm/compilation/passes/fusion/sequence_parallelism.py b/vllm/compilation/passes/fusion/sequence_parallelism.py index 5fb932d7284b..63de85932cb7 100644 --- a/vllm/compilation/passes/fusion/sequence_parallelism.py +++ b/vllm/compilation/passes/fusion/sequence_parallelism.py @@ -27,6 +27,63 @@ logger = init_logger(__name__) +# Min hidden size per device capability for sequence parallelism +# Only apply sequence parallelism for models with hidden_size >= threshold +SP_MIN_HIDDEN_SIZE: dict[int, int] = { + 90: 8192, # H100: only for models with hidden_size >= 8192 +} + +# Min size per GPU per device capability for sequence parallelism +# Total min size = min_per_gpu_size * tp_size +# This ensures the threshold scales appropriately with tensor parallelism +SP_MIN_PER_GPU_SIZE_MB: dict[int, float] = { + 90: 8, # 8MB per GPU for H100 +} + + +def get_sequence_parallelism_threshold( + hidden_size: int, + tp_size: int, + element_size: int, +) -> int | None: + """ + Calculate the minimum token threshold for applying sequence parallelism. + + Returns None if sequence parallelism should not be applied based on model size. 
+ + Branching logic based on device capability: + - Check if hidden_size >= SP_MIN_HIDDEN_SIZE[device_capability] + - If not, returns None (SP disabled for small models on this device) + - If yes, calculates threshold based on per-GPU size + + Formula: min_token_num = (min_per_gpu_size_mb * tp_size * MiB) // + (hidden_size * element_size) + """ + from vllm.platforms import current_platform + + if not current_platform.is_cuda(): + return None + + capability = current_platform.get_device_capability() + if capability is None: + return None + device_capability = capability.to_int() + + # Check if device has configured thresholds + min_hidden_size = SP_MIN_HIDDEN_SIZE.get(device_capability) + min_per_gpu_size_mb = SP_MIN_PER_GPU_SIZE_MB.get(device_capability) + + if min_hidden_size is None or min_per_gpu_size_mb is None: + return None + + # Only apply sequence parallelism for models meeting the size threshold + if hidden_size < min_hidden_size: + return None + + MiB = 1024 * 1024 + min_size = min_per_gpu_size_mb * MiB * tp_size + return int(min_size // (hidden_size * element_size)) + def get_first_out_wrapper( fn: Callable[..., Sequence[torch.Tensor]], @@ -309,6 +366,23 @@ class SequenceParallelismPass(VllmPatternMatcherPass): def __init__(self, config: VllmConfig) -> None: super().__init__(config) + # Get min_token_num threshold + # Read min_token_num from config (calculated during config init) + self.min_token_num = None + if config.model_config is not None: + pass_config = config.compilation_config.pass_config + self.min_token_num = pass_config.sp_min_token_num + + if self.min_token_num is not None: + # Take the min to avoid exceeding max_num_batched_tokens + max_batched = config.scheduler_config.max_num_batched_tokens + if max_batched is not None: + self.min_token_num = min(self.min_token_num, max_batched) + logger.debug_once( + f"Sequence parallelism min token threshold: {self.min_token_num}", + scope="global", + ) + # Used to clean up redundant views created 
temporarily # to circumvent residual shape change issues self.noop_cleanup = NoOpEliminationPass(config) @@ -339,29 +413,36 @@ def __init__(self, config: VllmConfig) -> None: self.dump_patterns(config, self.patterns) def is_applicable_for_range(self, compile_range: Range) -> bool: - # When sequence parallelism is enabled, the residual tensor from RMSNorm - # needs to be split along the sequence dimension. However, this dimension - # is symbolic during piecewise compilation, and splitting symbolic shapes - # is not supported. - # - # This pass is therefore only applied when the sequence dimension is - # concrete: - # 1. In full-graph compilation mode (no Dynamo splitting ops are used). - # For this case we always pad num_tokens to be a multiple of - # tensor_parallel_size, so there's no need to check shape % tp_size == 0. - # 2. For specific shape provided during compilation (e.g., from - # `compile_sizes`), which must be divisible by the tensor-parallel - # size. + """ + Determines if sequence parallelism should be applied for the given + compile range. + + SP is only beneficial for larger batch sizes where the communication + overhead is amortized. For small batches, the overhead of splitting + and gathering tensors across TP ranks outweighs the benefits. 
+ + Returns False (SP disabled) when: + - Using piecewise compilation with non-concrete or TP-indivisible sizes + - min_token_num is None (SP disabled for this device/config) + - The compile range starts below the minimum token threshold + """ + # For piecewise compilation (not using inductor graph partition), + # we need concrete sizes that are divisible by TP for correct splitting if ( - not self.compilation_config.splitting_ops - or self.compilation_config.use_inductor_graph_partition + not self.compilation_config.use_inductor_graph_partition + and self.compilation_config.splitting_ops ): - return True - tp_size = get_tensor_model_parallel_world_size() - result: bool = (compile_range.is_single_size()) and ( - compile_range.end % tp_size == 0 - ) - return result + tp_size = get_tensor_model_parallel_world_size() + if not compile_range.is_single_size() or compile_range.end % tp_size != 0: + return False + + # min_token_num is None when SP is disabled for this device/config + # (e.g., non-CUDA platform, unsupported GPU, or small hidden_size) + if self.min_token_num is None: + return False + + # Only apply SP when batch size meets the minimum threshold + return compile_range.start >= self.min_token_num @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph) -> None: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index ab6f3da06cdf..d22e9a96e0f3 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -118,7 +118,9 @@ class PassConfig: eliminate_noops: bool = Field(default=True) """Eliminate no-op ops.""" enable_sp: bool = Field(default=None) - """Enable sequence parallelism.""" + """Enable sequence parallelism. Requires TP>1. 
Automatically disabled + if the model's hidden_size is too small for SP to be beneficial + (threshold is device-capability dependent).""" fuse_gemm_comms: bool = Field(default=None) """Enable async TP.""" fuse_allreduce_rms: bool = Field(default=None) @@ -155,6 +157,11 @@ class PassConfig: 8: 1, # 1MB }, }, where key is the device capability""" + sp_min_token_num: int | None = None + """The minimum number of tokens above which vllm should use + sequence parallelism. Specified as an integer token count. + Unspecified will fallback to default values which are compute + capability and world size dependent.""" # TODO(luka) better pass enabling system. diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index ef71a05d3937..fba3c64a9af0 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -853,8 +853,33 @@ def has_blocked_weights(): logger.warning("Sequence Parallelism requires TP>1, disabling") self.compilation_config.pass_config.enable_sp = False self.compilation_config.pass_config.fuse_gemm_comms = False + else: + # Compute SP threshold early; disable if None (model too + # small) before +rms_norm gets forced into custom_ops. + pass_config = self.compilation_config.pass_config + if pass_config.sp_min_token_num is None: + from vllm.compilation.passes.fusion.sequence_parallelism import ( + get_sequence_parallelism_threshold, + ) + + tp_size = self.parallel_config.tensor_parallel_size + hidden_size = self.model_config.get_hidden_size() + element_size = self.model_config.dtype.itemsize + pass_config.sp_min_token_num = get_sequence_parallelism_threshold( + hidden_size, tp_size, element_size + ) - elif "-rms_norm" in self.compilation_config.custom_ops: + if pass_config.sp_min_token_num is None: + logger.warning( + "Model hidden_size too small for the SP " + "threshold heuristic, disabling. To force SP, " + "set pass_config.sp_min_token_num manually." 
+ ) + self.compilation_config.pass_config.enable_sp = False + self.compilation_config.pass_config.fuse_gemm_comms = False + + if self.compilation_config.pass_config.enable_sp: + if "-rms_norm" in self.compilation_config.custom_ops: logger.warning( "RMS norm force disabled, sequence parallelism might break" ) @@ -1456,6 +1481,36 @@ def _set_compile_ranges(self): "allreduce-rms fusion will be enabled for all num_tokens." ) + # Add the compile ranges for sequence parallelism + if compilation_config.pass_config.enable_sp: + pass_config = compilation_config.pass_config + + # Calculate min_token_num if not explicitly provided + # User override works regardless of hidden_size + if pass_config.sp_min_token_num is None: + from vllm.compilation.passes.fusion.sequence_parallelism import ( + get_sequence_parallelism_threshold, + ) + + tp_size = self.parallel_config.tensor_parallel_size + hidden_size = self.model_config.get_hidden_size() + element_size = self.model_config.dtype.itemsize + pass_config.sp_min_token_num = get_sequence_parallelism_threshold( + hidden_size, tp_size, element_size + ) + + min_token_num = pass_config.sp_min_token_num + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + if min_token_num is not None and ( + max_num_batched_tokens is not None + and min_token_num < max_num_batched_tokens + and min_token_num > 1 + ): + # Add split point at min_token_num - 1 to ensure SP applies + # starting from min_token_num + # This creates ranges: [1, min-1] (no SP), [min, max] (SP applies) + computed_compile_ranges_split_points.append(min_token_num - 1) + if compilation_config.pass_config.fuse_rope_kvcache: max_token_num = ( compilation_config.pass_config.rope_kvcache_fusion_max_token_num From 4a9c07a0a2b8308a045476b48be29e37c349274b Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Thu, 26 Feb 2026 06:39:48 +0100 Subject: [PATCH 009/154] [BugFix] anthropic/serving_messages: fix tool call arguments streaming 
(#34887) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò Co-authored-by: Nicolò Lucchesi --- vllm/entrypoints/anthropic/serving.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py index 8fb347aabed3..dc037313de33 100644 --- a/vllm/entrypoints/anthropic/serving.py +++ b/vllm/entrypoints/anthropic/serving.py @@ -432,6 +432,19 @@ async def message_stream_converter( data = chunk.model_dump_json(exclude_unset=True) yield wrap_data_with_event(data, "content_block_start") content_block_started = True + if tool_call.function and tool_call.function.arguments: + chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_delta", + delta=AnthropicDelta( + type="input_json_delta", + partial_json=tool_call.function.arguments, + ), + ) + data = chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event( + data, "content_block_delta" + ) else: chunk = AnthropicStreamEvent( From 186ea22efefd2c6f4f9b7fcb657bd00f50cb465a Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Thu, 26 Feb 2026 01:35:16 -0500 Subject: [PATCH 010/154] [Misc][Harmony] Move Responses API only harmony utils to responses/harmony.py (#35339) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- .../openai/parser/test_harmony_utils.py | 467 +-------------- .../openai/responses/test_harmony_utils.py | 463 +++++++++++++++ .../openai/responses/test_mcp_tools.py | 10 +- .../openai/parser/harmony_utils.py | 518 +--------------- vllm/entrypoints/openai/responses/harmony.py | 552 ++++++++++++++++++ vllm/entrypoints/openai/responses/serving.py | 20 +- 6 files changed, 1040 insertions(+), 990 deletions(-) create mode 100644 tests/entrypoints/openai/responses/test_harmony_utils.py create mode 100644 vllm/entrypoints/openai/responses/harmony.py diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py 
b/tests/entrypoints/openai/parser/test_harmony_utils.py index b73a0b0745c7..7842a1fcd757 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -2,13 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from openai.types.responses import ( - ResponseFunctionToolCall, - ResponseOutputMessage, - ResponseReasoningItem, -) -from openai.types.responses.response_output_item import McpCall -from openai_harmony import Author, Message, Role, TextContent +from openai_harmony import Message, Role from tests.entrypoints.openai.utils import verify_harmony_messages from vllm.entrypoints.openai.parser.harmony_utils import ( @@ -18,20 +12,21 @@ has_custom_tools, parse_chat_input_to_harmony_message, parse_chat_output, - parse_input_to_harmony_message, - parse_output_message, +) +from vllm.entrypoints.openai.responses.harmony import ( + response_previous_input_to_harmony, ) class TestCommonParseInputToHarmonyMessage: """ Tests for scenarios that are common to both Chat Completion - parse_chat_input_to_harmony_message and Responsees API - parse_input_to_harmony_message functions. + parse_chat_input_to_harmony_message and Responses API + response_previous_input_to_harmony functions. """ @pytest.fixture( - params=[parse_chat_input_to_harmony_message, parse_input_to_harmony_message] + params=[parse_chat_input_to_harmony_message, response_previous_input_to_harmony] ) def parse_function(self, request): return request.param @@ -216,81 +211,6 @@ def test_array_content_with_missing_text(self, parse_function): assert messages[0].content[1].text == "actual text" -class TestParseInputToHarmonyMessage: - """ - Tests for scenarios that are specific to the Responses API - parse_input_to_harmony_message function. 
- """ - - def test_message_with_empty_content(self): - """Test parsing message with empty string content.""" - chat_msg = { - "role": "user", - "content": "", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].content[0].text == "" - - def test_tool_message_with_string_content(self): - """Test parsing tool message with string content.""" - chat_msg = { - "role": "tool", - "name": "get_weather", - "content": "The weather in San Francisco is sunny, 72°F", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.TOOL - assert messages[0].author.name == "functions.get_weather" - assert ( - messages[0].content[0].text == "The weather in San Francisco is sunny, 72°F" - ) - assert messages[0].channel == "commentary" - - def test_tool_message_with_array_content(self): - """Test parsing tool message with array content.""" - chat_msg = { - "role": "tool", - "name": "search_results", - "content": [ - {"type": "text", "text": "Result 1: "}, - {"type": "text", "text": "Result 2: "}, - { - "type": "image", - "url": "http://example.com/img.png", - }, # Should be ignored - {"type": "text", "text": "Result 3"}, - ], - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.TOOL - assert messages[0].author.name == "functions.search_results" - assert messages[0].content[0].text == "Result 1: Result 2: Result 3" - - def test_tool_message_with_empty_content(self): - """Test parsing tool message with None content.""" - chat_msg = { - "role": "tool", - "name": "empty_tool", - "content": None, - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.TOOL - assert messages[0].author.name == "functions.empty_tool" - assert messages[0].content[0].text == "" - - class TestParseChatInputToHarmonyMessage: """ Tests for 
scenarios that are specific to the Chat Completion API @@ -888,200 +808,6 @@ def test_parse_chat_output_preamble_then_final(self) -> None: assert final_content == "Let me look that up.\nThe answer is 42." -class TestParseOutputMessage: - """Tests for parse_output_message function.""" - - def test_commentary_with_no_recipient_creates_message(self): - """Test that commentary with recipient=None (preambles) creates message items. - - Per Harmony format, preambles are intended to be shown to end-users, - unlike analysis channel content which is hidden reasoning. - See: https://cookbook.openai.com/articles/openai-harmony - """ - message = Message.from_role_and_content( - Role.ASSISTANT, "I will now search for the weather information." - ) - message = message.with_channel("commentary") - # recipient is None by default, representing a preamble - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseOutputMessage) - assert output_items[0].type == "message" - assert output_items[0].role == "assistant" - assert output_items[0].status == "completed" - assert len(output_items[0].content) == 1 - assert output_items[0].content[0].type == "output_text" - assert ( - output_items[0].content[0].text - == "I will now search for the weather information." 
- ) - - def test_commentary_with_function_recipient_creates_function_call(self): - """Test commentary with recipient='functions.X' creates function calls.""" - message = Message.from_role_and_content( - Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}' - ) - message = message.with_channel("commentary") - message = message.with_recipient("functions.get_weather") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseFunctionToolCall) - assert output_items[0].type == "function_call" - assert output_items[0].name == "get_weather" - assert ( - output_items[0].arguments - == '{"location": "San Francisco", "units": "celsius"}' - ) - assert output_items[0].call_id.startswith("call_") - assert output_items[0].id.startswith("fc_") - - def test_commentary_with_python_recipient_creates_reasoning(self): - """Test that commentary with recipient='python' creates reasoning items.""" - message = Message.from_role_and_content( - Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))" - ) - message = message.with_channel("commentary") - message = message.with_recipient("python") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseReasoningItem) - assert output_items[0].type == "reasoning" - assert ( - output_items[0].content[0].text - == "import numpy as np\nprint(np.array([1, 2, 3]))" - ) - - def test_commentary_with_browser_recipient_creates_reasoning(self): - """Test that commentary with recipient='browser' creates reasoning items.""" - message = Message.from_role_and_content( - Role.ASSISTANT, "Navigating to the specified URL" - ) - message = message.with_channel("commentary") - message = message.with_recipient("browser") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseReasoningItem) - assert output_items[0].type == "reasoning" 
- assert output_items[0].content[0].text == "Navigating to the specified URL" - - def test_commentary_with_container_recipient_creates_reasoning(self): - """Test that commentary with recipient='container' creates reasoning items.""" - message = Message.from_role_and_content( - Role.ASSISTANT, "Running command in container" - ) - message = message.with_channel("commentary") - message = message.with_recipient("container") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseReasoningItem) - assert output_items[0].type == "reasoning" - assert output_items[0].content[0].text == "Running command in container" - - def test_commentary_with_empty_content_and_no_recipient(self): - """Test edge case: empty commentary with recipient=None.""" - message = Message.from_role_and_content(Role.ASSISTANT, "") - message = message.with_channel("commentary") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseOutputMessage) - assert output_items[0].content[0].text == "" - - def test_commentary_with_multiple_contents_and_no_recipient(self): - """Test multiple content items in commentary with no recipient.""" - contents = [ - TextContent(text="Step 1: Analyze the request"), - TextContent(text="Step 2: Prepare to call functions"), - ] - message = Message.from_role_and_contents(Role.ASSISTANT, contents) - message = message.with_channel("commentary") - - output_items = parse_output_message(message) - - # _parse_final_message returns single ResponseOutputMessage with - # multiple contents - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseOutputMessage) - assert len(output_items[0].content) == 2 - assert output_items[0].content[0].text == "Step 1: Analyze the request" - assert output_items[0].content[1].text == "Step 2: Prepare to call functions" - - def test_commentary_with_multiple_function_calls(self): - """Test 
multiple function calls in commentary channel.""" - contents = [ - TextContent(text='{"location": "San Francisco"}'), - TextContent(text='{"location": "New York"}'), - ] - message = Message.from_role_and_contents(Role.ASSISTANT, contents) - message = message.with_channel("commentary") - message = message.with_recipient("functions.get_weather") - - output_items = parse_output_message(message) - - assert len(output_items) == 2 - assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items) - assert output_items[0].name == "get_weather" - assert output_items[1].name == "get_weather" - assert output_items[0].arguments == '{"location": "San Francisco"}' - assert output_items[1].arguments == '{"location": "New York"}' - - def test_commentary_with_unknown_recipient_creates_mcp_call(self): - """Test that commentary with unknown recipient creates MCP call.""" - message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') - message = message.with_channel("commentary") - message = message.with_recipient("custom_tool") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], McpCall) - assert output_items[0].type == "mcp_call" - assert output_items[0].name == "custom_tool" - assert output_items[0].server_label == "custom_tool" - - def test_analysis_channel_creates_reasoning(self): - """Test that analysis channel creates reasoning items.""" - message = Message.from_role_and_content( - Role.ASSISTANT, "Analyzing the problem step by step..." - ) - message = message.with_channel("analysis") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseReasoningItem) - assert output_items[0].type == "reasoning" - assert ( - output_items[0].content[0].text == "Analyzing the problem step by step..." - ) - - def test_non_assistant_message_returns_empty(self): - """Test that non-assistant messages return empty list. 
- - Per the implementation, tool messages to assistant (e.g., search results) - are not included in final output to align with OpenAI behavior. - """ - message = Message.from_author_and_content( - Author.new(Role.TOOL, "functions.get_weather"), - "The weather is sunny, 72°F", - ) - - output_items = parse_output_message(message) - - assert len(output_items) == 0 - - def test_has_custom_tools() -> None: assert not has_custom_tools(set()) assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"}) @@ -1091,185 +817,6 @@ def test_has_custom_tools() -> None: ) -def test_parse_mcp_call_basic() -> None: - """Test that MCP calls are parsed with correct type and server_label.""" - message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}') - message = message.with_recipient("filesystem") - message = message.with_channel("commentary") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], McpCall) - assert output_items[0].type == "mcp_call" - assert output_items[0].name == "filesystem" - assert output_items[0].server_label == "filesystem" - assert output_items[0].arguments == '{"path": "/tmp"}' - assert output_items[0].status == "completed" - - -def test_parse_mcp_call_dotted_recipient() -> None: - """Test that dotted recipients extract the tool name correctly.""" - message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}') - message = message.with_recipient("repo_browser.list") - message = message.with_channel("commentary") - - output_items = parse_output_message(message) - - assert len(output_items) == 1 - assert isinstance(output_items[0], McpCall) - assert output_items[0].name == "list" - assert output_items[0].server_label == "repo_browser" - - -def test_mcp_vs_function_call() -> None: - """Test that function calls are not parsed as MCP calls.""" - func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') - func_message = 
func_message.with_recipient("functions.my_tool") - func_message = func_message.with_channel("commentary") - - func_items = parse_output_message(func_message) - - assert len(func_items) == 1 - assert not isinstance(func_items[0], McpCall) - assert func_items[0].type == "function_call" - - -def test_mcp_vs_builtin_tools() -> None: - """Test that built-in tools (python, container) are not parsed as MCP calls.""" - # Test python (built-in tool) - should be reasoning, not MCP - python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')") - python_message = python_message.with_recipient("python") - python_message = python_message.with_channel("commentary") - - python_items = parse_output_message(python_message) - - assert len(python_items) == 1 - assert not isinstance(python_items[0], McpCall) - assert python_items[0].type == "reasoning" - - -def test_parse_remaining_state_commentary_channel() -> None: - """Test parse_remaining_state with commentary channel and various recipients.""" - from unittest.mock import Mock - - from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state - - # Test 1: functions.* recipient → should return function tool call - parser_func = Mock() - parser_func.current_content = '{"arg": "value"}' - parser_func.current_role = Role.ASSISTANT - parser_func.current_channel = "commentary" - parser_func.current_recipient = "functions.my_tool" - - func_items = parse_remaining_state(parser_func) - - assert len(func_items) == 1 - assert not isinstance(func_items[0], McpCall) - assert func_items[0].type == "function_call" - assert func_items[0].name == "my_tool" - assert func_items[0].status == "in_progress" - - # Test 2: MCP tool (not builtin) → should return MCP call - parser_mcp = Mock() - parser_mcp.current_content = '{"path": "/tmp"}' - parser_mcp.current_role = Role.ASSISTANT - parser_mcp.current_channel = "commentary" - parser_mcp.current_recipient = "filesystem" - - mcp_items = 
parse_remaining_state(parser_mcp) - - assert len(mcp_items) == 1 - assert isinstance(mcp_items[0], McpCall) - assert mcp_items[0].type == "mcp_call" - assert mcp_items[0].name == "filesystem" - assert mcp_items[0].server_label == "filesystem" - assert mcp_items[0].status == "in_progress" - - # Test 3: Built-in tool (python) - # should NOT return MCP call, returns reasoning (internal tool interaction) - parser_builtin = Mock() - parser_builtin.current_content = "print('hello')" - parser_builtin.current_role = Role.ASSISTANT - parser_builtin.current_channel = "commentary" - parser_builtin.current_recipient = "python" - - builtin_items = parse_remaining_state(parser_builtin) - - # Built-in tools explicitly return reasoning - assert len(builtin_items) == 1 - assert not isinstance(builtin_items[0], McpCall) - assert builtin_items[0].type == "reasoning" - - # Test 4: No recipient (preamble) → should return message, not reasoning - parser_preamble = Mock() - parser_preamble.current_content = "I'll search for that information now." - parser_preamble.current_role = Role.ASSISTANT - parser_preamble.current_channel = "commentary" - parser_preamble.current_recipient = None - - preamble_items = parse_remaining_state(parser_preamble) - - assert len(preamble_items) == 1 - assert isinstance(preamble_items[0], ResponseOutputMessage) - assert preamble_items[0].type == "message" - assert preamble_items[0].content[0].text == "I'll search for that information now." 
- assert preamble_items[0].status == "incomplete" # streaming - - -def test_parse_remaining_state_analysis_channel() -> None: - """Test parse_remaining_state with analysis channel and various recipients.""" - from unittest.mock import Mock - - from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state - - # Test 1: functions.* recipient → should return function tool call - parser_func = Mock() - parser_func.current_content = '{"arg": "value"}' - parser_func.current_role = Role.ASSISTANT - parser_func.current_channel = "analysis" - parser_func.current_recipient = "functions.my_tool" - - func_items = parse_remaining_state(parser_func) - - assert len(func_items) == 1 - assert not isinstance(func_items[0], McpCall) - assert func_items[0].type == "function_call" - assert func_items[0].name == "my_tool" - assert func_items[0].status == "in_progress" - - # Test 2: MCP tool (not builtin) → should return MCP call - parser_mcp = Mock() - parser_mcp.current_content = '{"query": "test"}' - parser_mcp.current_role = Role.ASSISTANT - parser_mcp.current_channel = "analysis" - parser_mcp.current_recipient = "database" - - mcp_items = parse_remaining_state(parser_mcp) - - assert len(mcp_items) == 1 - assert isinstance(mcp_items[0], McpCall) - assert mcp_items[0].type == "mcp_call" - assert mcp_items[0].name == "database" - assert mcp_items[0].server_label == "database" - assert mcp_items[0].status == "in_progress" - - # Test 3: Built-in tool (container) - # should NOT return MCP call, falls through to reasoning - parser_builtin = Mock() - parser_builtin.current_content = "docker run" - parser_builtin.current_role = Role.ASSISTANT - parser_builtin.current_channel = "analysis" - parser_builtin.current_recipient = "container" - - builtin_items = parse_remaining_state(parser_builtin) - - # Should fall through to reasoning logic - assert len(builtin_items) == 1 - assert not isinstance(builtin_items[0], McpCall) - assert builtin_items[0].type == "reasoning" - - class 
TestGetSystemMessage: """Tests for get_system_message channel configuration.""" diff --git a/tests/entrypoints/openai/responses/test_harmony_utils.py b/tests/entrypoints/openai/responses/test_harmony_utils.py new file mode 100644 index 000000000000..e51538298ff9 --- /dev/null +++ b/tests/entrypoints/openai/responses/test_harmony_utils.py @@ -0,0 +1,463 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for vllm.entrypoints.openai.responses.harmony.""" + +from openai.types.responses import ( + ResponseFunctionToolCall, + ResponseOutputMessage, + ResponseReasoningItem, +) +from openai.types.responses.response_output_item import McpCall +from openai_harmony import Author, Message, Role, TextContent + +from vllm.entrypoints.openai.responses.harmony import ( + harmony_to_response_output, + parser_state_to_response_output, + response_previous_input_to_harmony, +) + + +class TestResponsePreviousInputToHarmony: + """ + Tests for scenarios that are specific to the Responses API + response_previous_input_to_harmony function. 
+ """ + + def test_message_with_empty_content(self): + """Test parsing message with empty string content.""" + chat_msg = { + "role": "user", + "content": "", + } + + messages = response_previous_input_to_harmony(chat_msg) + + assert len(messages) == 1 + assert messages[0].content[0].text == "" + + def test_tool_message_with_string_content(self): + """Test parsing tool message with string content.""" + chat_msg = { + "role": "tool", + "name": "get_weather", + "content": "The weather in San Francisco is sunny, 72°F", + } + + messages = response_previous_input_to_harmony(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.get_weather" + assert ( + messages[0].content[0].text == "The weather in San Francisco is sunny, 72°F" + ) + assert messages[0].channel == "commentary" + + def test_tool_message_with_array_content(self): + """Test parsing tool message with array content.""" + chat_msg = { + "role": "tool", + "name": "search_results", + "content": [ + {"type": "text", "text": "Result 1: "}, + {"type": "text", "text": "Result 2: "}, + { + "type": "image", + "url": "http://example.com/img.png", + }, # Should be ignored + {"type": "text", "text": "Result 3"}, + ], + } + + messages = response_previous_input_to_harmony(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.search_results" + assert messages[0].content[0].text == "Result 1: Result 2: Result 3" + + def test_tool_message_with_empty_content(self): + """Test parsing tool message with None content.""" + chat_msg = { + "role": "tool", + "name": "empty_tool", + "content": None, + } + + messages = response_previous_input_to_harmony(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.empty_tool" + assert messages[0].content[0].text == "" + + +class TestHarmonyToResponseOutput: + 
"""Tests for harmony_to_response_output function.""" + + def test_commentary_with_no_recipient_creates_message(self): + """Test that commentary with recipient=None (preambles) creates message items. + + Per Harmony format, preambles are intended to be shown to end-users, + unlike analysis channel content which is hidden reasoning. + See: https://cookbook.openai.com/articles/openai-harmony + """ + message = Message.from_role_and_content( + Role.ASSISTANT, "I will now search for the weather information." + ) + message = message.with_channel("commentary") + # recipient is None by default, representing a preamble + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseOutputMessage) + assert output_items[0].type == "message" + assert output_items[0].role == "assistant" + assert output_items[0].status == "completed" + assert len(output_items[0].content) == 1 + assert output_items[0].content[0].type == "output_text" + assert ( + output_items[0].content[0].text + == "I will now search for the weather information." 
+ ) + + def test_commentary_with_function_recipient_creates_function_call(self): + """Test commentary with recipient='functions.X' creates function calls.""" + message = Message.from_role_and_content( + Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}' + ) + message = message.with_channel("commentary") + message = message.with_recipient("functions.get_weather") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseFunctionToolCall) + assert output_items[0].type == "function_call" + assert output_items[0].name == "get_weather" + assert ( + output_items[0].arguments + == '{"location": "San Francisco", "units": "celsius"}' + ) + assert output_items[0].call_id.startswith("call_") + assert output_items[0].id.startswith("fc_") + + def test_commentary_with_python_recipient_creates_reasoning(self): + """Test that commentary with recipient='python' creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))" + ) + message = message.with_channel("commentary") + message = message.with_recipient("python") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert ( + output_items[0].content[0].text + == "import numpy as np\nprint(np.array([1, 2, 3]))" + ) + + def test_commentary_with_browser_recipient_creates_reasoning(self): + """Test that commentary with recipient='browser' creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "Navigating to the specified URL" + ) + message = message.with_channel("commentary") + message = message.with_recipient("browser") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert 
output_items[0].type == "reasoning" + assert output_items[0].content[0].text == "Navigating to the specified URL" + + def test_commentary_with_container_recipient_creates_reasoning(self): + """Test that commentary with recipient='container' creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "Running command in container" + ) + message = message.with_channel("commentary") + message = message.with_recipient("container") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert output_items[0].content[0].text == "Running command in container" + + def test_commentary_with_empty_content_and_no_recipient(self): + """Test edge case: empty commentary with recipient=None.""" + message = Message.from_role_and_content(Role.ASSISTANT, "") + message = message.with_channel("commentary") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseOutputMessage) + assert output_items[0].content[0].text == "" + + def test_commentary_with_multiple_contents_and_no_recipient(self): + """Test multiple content items in commentary with no recipient.""" + contents = [ + TextContent(text="Step 1: Analyze the request"), + TextContent(text="Step 2: Prepare to call functions"), + ] + message = Message.from_role_and_contents(Role.ASSISTANT, contents) + message = message.with_channel("commentary") + + output_items = harmony_to_response_output(message) + + # _parse_final_message returns single ResponseOutputMessage with + # multiple contents + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseOutputMessage) + assert len(output_items[0].content) == 2 + assert output_items[0].content[0].text == "Step 1: Analyze the request" + assert output_items[0].content[1].text == "Step 2: Prepare to call functions" + + def 
test_commentary_with_multiple_function_calls(self): + """Test multiple function calls in commentary channel.""" + contents = [ + TextContent(text='{"location": "San Francisco"}'), + TextContent(text='{"location": "New York"}'), + ] + message = Message.from_role_and_contents(Role.ASSISTANT, contents) + message = message.with_channel("commentary") + message = message.with_recipient("functions.get_weather") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 2 + assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items) + assert output_items[0].name == "get_weather" + assert output_items[1].name == "get_weather" + assert output_items[0].arguments == '{"location": "San Francisco"}' + assert output_items[1].arguments == '{"location": "New York"}' + + def test_commentary_with_unknown_recipient_creates_mcp_call(self): + """Test that commentary with unknown recipient creates MCP call.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') + message = message.with_channel("commentary") + message = message.with_recipient("custom_tool") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].type == "mcp_call" + assert output_items[0].name == "custom_tool" + assert output_items[0].server_label == "custom_tool" + + def test_analysis_channel_creates_reasoning(self): + """Test that analysis channel creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "Analyzing the problem step by step..." + ) + message = message.with_channel("analysis") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert ( + output_items[0].content[0].text == "Analyzing the problem step by step..." 
+ ) + + def test_non_assistant_message_returns_empty(self): + """Test that non-assistant messages return empty list. + + Per the implementation, tool messages to assistant (e.g., search results) + are not included in final output to align with OpenAI behavior. + """ + message = Message.from_author_and_content( + Author.new(Role.TOOL, "functions.get_weather"), + "The weather is sunny, 72°F", + ) + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 0 + + +def test_parse_mcp_call_basic() -> None: + """Test that MCP calls are parsed with correct type and server_label.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}') + message = message.with_recipient("filesystem") + message = message.with_channel("commentary") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].type == "mcp_call" + assert output_items[0].name == "filesystem" + assert output_items[0].server_label == "filesystem" + assert output_items[0].arguments == '{"path": "/tmp"}' + assert output_items[0].status == "completed" + + +def test_parse_mcp_call_dotted_recipient() -> None: + """Test that dotted recipients extract the tool name correctly.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}') + message = message.with_recipient("repo_browser.list") + message = message.with_channel("commentary") + + output_items = harmony_to_response_output(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].name == "list" + assert output_items[0].server_label == "repo_browser" + + +def test_mcp_vs_function_call() -> None: + """Test that function calls are not parsed as MCP calls.""" + func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') + func_message = func_message.with_recipient("functions.my_tool") + func_message = 
func_message.with_channel("commentary") + + func_items = harmony_to_response_output(func_message) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + + +def test_mcp_vs_builtin_tools() -> None: + """Test that built-in tools (python, container) are not parsed as MCP calls.""" + # Test python (built-in tool) - should be reasoning, not MCP + python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')") + python_message = python_message.with_recipient("python") + python_message = python_message.with_channel("commentary") + + python_items = harmony_to_response_output(python_message) + + assert len(python_items) == 1 + assert not isinstance(python_items[0], McpCall) + assert python_items[0].type == "reasoning" + + +def test_parser_state_to_response_output_commentary_channel() -> None: + """Test parser_state_to_response_output with commentary + channel and various recipients.""" + from unittest.mock import Mock + + # Test 1: functions.* recipient -> should return function tool call + parser_func = Mock() + parser_func.current_content = '{"arg": "value"}' + parser_func.current_role = Role.ASSISTANT + parser_func.current_channel = "commentary" + parser_func.current_recipient = "functions.my_tool" + + func_items = parser_state_to_response_output(parser_func) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + assert func_items[0].name == "my_tool" + assert func_items[0].status == "in_progress" + + # Test 2: MCP tool (not builtin) -> should return MCP call + parser_mcp = Mock() + parser_mcp.current_content = '{"path": "/tmp"}' + parser_mcp.current_role = Role.ASSISTANT + parser_mcp.current_channel = "commentary" + parser_mcp.current_recipient = "filesystem" + + mcp_items = parser_state_to_response_output(parser_mcp) + + assert len(mcp_items) == 1 + assert isinstance(mcp_items[0], McpCall) + assert 
mcp_items[0].type == "mcp_call" + assert mcp_items[0].name == "filesystem" + assert mcp_items[0].server_label == "filesystem" + assert mcp_items[0].status == "in_progress" + + # Test 3: Built-in tool (python) + # should NOT return MCP call, returns reasoning (internal tool interaction) + parser_builtin = Mock() + parser_builtin.current_content = "print('hello')" + parser_builtin.current_role = Role.ASSISTANT + parser_builtin.current_channel = "commentary" + parser_builtin.current_recipient = "python" + + builtin_items = parser_state_to_response_output(parser_builtin) + + # Built-in tools explicitly return reasoning + assert len(builtin_items) == 1 + assert not isinstance(builtin_items[0], McpCall) + assert builtin_items[0].type == "reasoning" + + # Test 4: No recipient (preamble) → should return message, not reasoning + parser_preamble = Mock() + parser_preamble.current_content = "I'll search for that information now." + parser_preamble.current_role = Role.ASSISTANT + parser_preamble.current_channel = "commentary" + parser_preamble.current_recipient = None + + preamble_items = parser_state_to_response_output(parser_preamble) + + assert len(preamble_items) == 1 + assert isinstance(preamble_items[0], ResponseOutputMessage) + assert preamble_items[0].type == "message" + assert preamble_items[0].content[0].text == "I'll search for that information now." 
+ assert preamble_items[0].status == "incomplete" # streaming + + +def test_parser_state_to_response_output_analysis_channel() -> None: + """Test parser_state_to_response_output with analysis + channel and various recipients.""" + from unittest.mock import Mock + + # Test 1: functions.* recipient -> should return function tool call + parser_func = Mock() + parser_func.current_content = '{"arg": "value"}' + parser_func.current_role = Role.ASSISTANT + parser_func.current_channel = "analysis" + parser_func.current_recipient = "functions.my_tool" + + func_items = parser_state_to_response_output(parser_func) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + assert func_items[0].name == "my_tool" + assert func_items[0].status == "in_progress" + + # Test 2: MCP tool (not builtin) -> should return MCP call + parser_mcp = Mock() + parser_mcp.current_content = '{"query": "test"}' + parser_mcp.current_role = Role.ASSISTANT + parser_mcp.current_channel = "analysis" + parser_mcp.current_recipient = "database" + + mcp_items = parser_state_to_response_output(parser_mcp) + + assert len(mcp_items) == 1 + assert isinstance(mcp_items[0], McpCall) + assert mcp_items[0].type == "mcp_call" + assert mcp_items[0].name == "database" + assert mcp_items[0].server_label == "database" + assert mcp_items[0].status == "in_progress" + + # Test 3: Built-in tool (container) + # should NOT return MCP call, falls through to reasoning + parser_builtin = Mock() + parser_builtin.current_content = "docker run" + parser_builtin.current_role = Role.ASSISTANT + parser_builtin.current_channel = "analysis" + parser_builtin.current_recipient = "container" + + builtin_items = parser_state_to_response_output(parser_builtin) + + # Should fall through to reasoning logic + assert len(builtin_items) == 1 + assert not isinstance(builtin_items[0], McpCall) + assert builtin_items[0].type == "reasoning" diff --git 
a/tests/entrypoints/openai/responses/test_mcp_tools.py b/tests/entrypoints/openai/responses/test_mcp_tools.py index 310af4308c63..55445f1889b8 100644 --- a/tests/entrypoints/openai/responses/test_mcp_tools.py +++ b/tests/entrypoints/openai/responses/test_mcp_tools.py @@ -97,16 +97,16 @@ def test_get_tool_description(self): assert server.get_tool_description("test_server", allowed_tools=[]) is None def test_builtin_tools_consistency(self): - """MCP_BUILTIN_TOOLS must match _BUILTIN_TOOL_TO_MCP_SERVER_LABEL values.""" + """MCP_BUILTIN_TOOLS must match BUILTIN_TOOL_TO_MCP_SERVER_LABEL values.""" from vllm.entrypoints.openai.parser.harmony_utils import ( - _BUILTIN_TOOL_TO_MCP_SERVER_LABEL, + BUILTIN_TOOL_TO_MCP_SERVER_LABEL, MCP_BUILTIN_TOOLS, ) - assert set(_BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()) == MCP_BUILTIN_TOOLS, ( + assert set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()) == MCP_BUILTIN_TOOLS, ( f"MCP_BUILTIN_TOOLS {MCP_BUILTIN_TOOLS} does not match " - f"_BUILTIN_TOOL_TO_MCP_SERVER_LABEL values " - f"{set(_BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())}" + f"BUILTIN_TOOL_TO_MCP_SERVER_LABEL values " + f"{set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())}" ) diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 9dfd5f518f77..9b4264456c51 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -2,27 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime -import json from collections.abc import Iterable, Sequence from typing import Literal -from openai.types.responses import ( - ResponseFunctionToolCall, - ResponseOutputItem, - ResponseOutputMessage, - ResponseOutputText, - ResponseReasoningItem, -) -from openai.types.responses.response_function_web_search import ( - ActionFind, - ActionOpenPage, - ActionSearch, - ResponseFunctionWebSearch, -) -from openai.types.responses.response_output_item import McpCall 
-from openai.types.responses.response_reasoning_item import ( - Content as ResponseReasoningTextContent, -) from openai.types.responses.tool import Tool from openai_harmony import ( Author, @@ -38,17 +20,10 @@ ToolDescription, load_harmony_encoding, ) -from openai_harmony import Message as OpenAIHarmonyMessage -from openai_harmony import Role as OpenAIHarmonyRole from vllm import envs from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam -from vllm.entrypoints.openai.responses.protocol import ( - ResponseInputOutputItem, - ResponsesRequest, -) from vllm.logger import init_logger -from vllm.utils import random_uuid logger = init_logger(__name__) @@ -64,14 +39,14 @@ # they are available and requested by the user. # Tool args are provided by MCP tool descriptions. Output # of the tools are stringified. -_BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = { +BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = { "python": "code_interpreter", "browser": "web_search_preview", "container": "container", } # Derive MCP_BUILTIN_TOOLS from the canonical mapping -MCP_BUILTIN_TOOLS: set[str] = set(_BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()) +MCP_BUILTIN_TOOLS: set[str] = set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()) def has_custom_tools(tool_types: set[str]) -> bool: @@ -179,55 +154,6 @@ def get_user_message(content: str) -> Message: return Message.from_role_and_content(Role.USER, content) -def parse_response_input( - response_msg: ResponseInputOutputItem, - prev_responses: list[ResponseOutputItem | ResponseReasoningItem], -) -> Message: - if not isinstance(response_msg, dict): - response_msg = response_msg.model_dump() - if "type" not in response_msg or response_msg["type"] == "message": - role = response_msg["role"] - content = response_msg["content"] - # Add prefix for developer messages. 
- # <|start|>developer<|message|># Instructions {instructions}<|end|> - text_prefix = "Instructions:\n" if role == "developer" else "" - if isinstance(content, str): - msg = Message.from_role_and_content(role, text_prefix + content) - else: - contents = [TextContent(text=text_prefix + c["text"]) for c in content] - msg = Message.from_role_and_contents(role, contents) - if role == "assistant": - msg = msg.with_channel("final") - elif response_msg["type"] == "function_call_output": - call_id = response_msg["call_id"] - call_response: ResponseFunctionToolCall | None = None - for prev_response in reversed(prev_responses): - if ( - isinstance(prev_response, ResponseFunctionToolCall) - and prev_response.call_id == call_id - ): - call_response = prev_response - break - if call_response is None: - raise ValueError(f"No call message found for {call_id}") - msg = Message.from_author_and_content( - Author.new(Role.TOOL, f"functions.{call_response.name}"), - response_msg["output"], - ) - elif response_msg["type"] == "reasoning": - content = response_msg["content"] - assert len(content) == 1 - msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"]) - elif response_msg["type"] == "function_call": - msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"]) - msg = msg.with_channel("commentary") - msg = msg.with_recipient(f"functions.{response_msg['name']}") - msg = msg.with_content_type("json") - else: - raise ValueError(f"Unknown input type: {response_msg['type']}") - return msg - - def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]: """ Parse a list of messages from request.messages in the Chat Completion API to @@ -390,139 +316,6 @@ def parse_chat_input_to_harmony_message( return msgs -def parse_input_to_harmony_message(chat_msg) -> list[Message]: - """Parse a message from request.previous_input_messages - into Harmony messages. 
- - Supports both OpenAI chat format ({"role": "..."}) and - Harmony format ({"author": {"role": "..."}}). - """ - if not isinstance(chat_msg, dict): - chat_msg = chat_msg.model_dump(exclude_none=True) - - if "author" in chat_msg and isinstance(chat_msg.get("author"), dict): - return [_parse_harmony_format_message(chat_msg)] - - return _parse_chat_format_message(chat_msg) - - -def _parse_harmony_format_message(chat_msg: dict) -> Message: - """Reconstruct a Message from Harmony-format dict, - preserving channel, recipient, and content_type.""" - author_dict = chat_msg["author"] - role = author_dict.get("role") - name = author_dict.get("name") - - raw_content = chat_msg.get("content", "") - if isinstance(raw_content, list): - # TODO: Support refusal and non-text content types. - contents = [TextContent(text=c.get("text", "")) for c in raw_content] - elif isinstance(raw_content, str): - contents = [TextContent(text=raw_content)] - else: - contents = [TextContent(text="")] - - if name: - msg = Message.from_author_and_contents(Author.new(Role(role), name), contents) - else: - msg = Message.from_role_and_contents(Role(role), contents) - - channel = chat_msg.get("channel") - if channel: - msg = msg.with_channel(channel) - recipient = chat_msg.get("recipient") - if recipient: - msg = msg.with_recipient(recipient) - content_type = chat_msg.get("content_type") - if content_type: - msg = msg.with_content_type(content_type) - - return msg - - -def _parse_chat_format_message(chat_msg: dict) -> list[Message]: - """Parse an OpenAI chat-format dict into Harmony messages.""" - role = chat_msg.get("role") - if role is None: - raise ValueError(f"Message has no 'role' key: {chat_msg}") - - # Assistant message with tool calls - tool_calls = chat_msg.get("tool_calls") - if role == "assistant" and tool_calls: - msgs: list[Message] = [] - for call in tool_calls: - func = call.get("function", {}) - name = func.get("name", "") - arguments = func.get("arguments", "") or "" - msg = 
Message.from_role_and_content(Role.ASSISTANT, arguments) - msg = msg.with_channel("commentary") - msg = msg.with_recipient(f"functions.{name}") - msg = msg.with_content_type("json") - msgs.append(msg) - return msgs - - # Tool role message (tool output) - if role == "tool": - name = chat_msg.get("name", "") - if name and not name.startswith("functions."): - name = f"functions.{name}" - content = chat_msg.get("content", "") or "" - content = flatten_chat_text_content(content) - # NOTE: .with_recipient("assistant") is required on tool messages - # to match parse_chat_input_to_harmony_message behavior and ensure - # proper routing in the Harmony protocol. - msg = ( - Message.from_author_and_content(Author.new(Role.TOOL, name), content) - .with_channel("commentary") - .with_recipient("assistant") - ) - return [msg] - - # Default: user/assistant/system messages - content = chat_msg.get("content", "") - if isinstance(content, str): - contents = [TextContent(text=content)] - else: - # TODO: Support refusal. 
- contents = [TextContent(text=c.get("text", "")) for c in content] - msg = Message.from_role_and_contents(role, contents) - return [msg] - - -def construct_harmony_previous_input_messages( - request: ResponsesRequest, -) -> list[OpenAIHarmonyMessage]: - messages: list[OpenAIHarmonyMessage] = [] - if request.previous_input_messages: - for message in request.previous_input_messages: - # Handle both OpenAIHarmonyMessage objects and dictionary inputs - if isinstance(message, OpenAIHarmonyMessage): - message_role = message.author.role - # To match OpenAI, instructions, reasoning and tools are - # always taken from the most recent Responses API request - # not carried over from previous requests - if ( - message_role == OpenAIHarmonyRole.SYSTEM - or message_role == OpenAIHarmonyRole.DEVELOPER - ): - continue - messages.append(message) - else: - harmony_messages = parse_input_to_harmony_message(message) - for harmony_msg in harmony_messages: - message_role = harmony_msg.author.role - # To match OpenAI, instructions, reasoning and tools are - # always taken from the most recent Responses API request - # not carried over from previous requests - if ( - message_role == OpenAIHarmonyRole.SYSTEM - or message_role == OpenAIHarmonyRole.DEVELOPER - ): - continue - messages.append(harmony_msg) - return messages - - def render_for_completion(messages: list[Message]) -> list[int]: conversation = Conversation.from_messages(messages) token_ids = get_encoding().render_conversation_for_completion( @@ -531,313 +324,6 @@ def render_for_completion(messages: list[Message]) -> list[int]: return token_ids -def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem: - """Parse browser tool calls (search, open, find) into web search items.""" - if len(message.content) != 1: - raise ValueError("Invalid number of contents in browser message") - content = message.content[0] - - # Parse JSON args (with retry detection) - try: - browser_call = json.loads(content.text) - 
except json.JSONDecodeError: - logger.warning( - "Invalid JSON in browser tool call, using error placeholder: %s", - content.text, - ) - json_retry_output_message = ( - f"Invalid JSON args, caught and retried: {content.text}" - ) - browser_call = { - "query": json_retry_output_message, - "url": json_retry_output_message, - "pattern": json_retry_output_message, - } - - # Create appropriate action based on recipient - if recipient == "browser.search": - action = ActionSearch( - query=f"cursor:{browser_call.get('query', '')}", type="search" - ) - elif recipient == "browser.open": - action = ActionOpenPage( - url=f"cursor:{browser_call.get('url', '')}", type="open_page" - ) - elif recipient == "browser.find": - action = ActionFind( - pattern=browser_call.get("pattern", ""), - url=f"cursor:{browser_call.get('url', '')}", - type="find", - ) - else: - raise ValueError(f"Unknown browser action: {recipient}") - - return ResponseFunctionWebSearch( - id=f"ws_{random_uuid()}", - action=action, - status="completed", - type="web_search_call", - ) - - -def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]: - """Parse function calls into function tool call items.""" - function_name = recipient.split(".")[-1] - output_items = [] - for content in message.content: - random_id = random_uuid() - response_item = ResponseFunctionToolCall( - arguments=content.text, - call_id=f"call_{random_id}", - type="function_call", - name=function_name, - id=f"fc_{random_id}", - ) - output_items.append(response_item) - return output_items - - -def _parse_reasoning(message: Message) -> list[ResponseOutputItem]: - """Parse reasoning/analysis content into reasoning items.""" - output_items = [] - for content in message.content: - reasoning_item = ResponseReasoningItem( - id=f"rs_{random_uuid()}", - summary=[], - type="reasoning", - content=[ - ResponseReasoningTextContent(text=content.text, type="reasoning_text") - ], - status=None, - ) - 
output_items.append(reasoning_item) - return output_items - - -def _parse_final_message(message: Message) -> ResponseOutputItem: - """Parse final channel messages into output message items.""" - contents = [] - for content in message.content: - output_text = ResponseOutputText( - text=content.text, - annotations=[], # TODO - type="output_text", - logprobs=None, # TODO - ) - contents.append(output_text) - return ResponseOutputMessage( - id=f"msg_{random_uuid()}", - content=contents, - role=message.author.role, - status="completed", - type="message", - ) - - -def _parse_mcp_recipient(recipient: str) -> tuple[str, str]: - """ - Parse MCP recipient into (server_label, tool_name). - - For dotted recipients like "repo_browser.list": - - server_label: "repo_browser" (namespace/server) - - tool_name: "list" (specific tool) - - For simple recipients like "filesystem": - - server_label: "filesystem" - - tool_name: "filesystem" - """ - if "." in recipient: - server_label = recipient.split(".")[0] - tool_name = recipient.split(".")[-1] - else: - server_label = recipient - tool_name = recipient - return server_label, tool_name - - -def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]: - """Parse MCP calls into MCP call items.""" - # Handle built-in tools that need server_label mapping - if recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL: - server_label = _BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient] - tool_name = recipient - else: - server_label, tool_name = _parse_mcp_recipient(recipient) - - output_items = [] - for content in message.content: - response_item = McpCall( - arguments=content.text, - type="mcp_call", - name=tool_name, - server_label=server_label, - id=f"mcp_{random_uuid()}", - status="completed", - ) - output_items.append(response_item) - return output_items - - -def _parse_message_no_recipient( - message: Message, -) -> list[ResponseOutputItem]: - """Parse a Harmony message with no recipient based on its channel.""" - if 
message.channel == "analysis": - return _parse_reasoning(message) - - if message.channel in ("commentary", "final"): - # Per Harmony format, preambles (commentary with no recipient) and - # final channel content are both intended to be shown to end-users. - # See: https://cookbook.openai.com/articles/openai-harmony - return [_parse_final_message(message)] - - raise ValueError(f"Unknown channel: {message.channel}") - - -def parse_output_message(message: Message) -> list[ResponseOutputItem]: - """ - Parse a Harmony message into a list of output response items. - """ - if message.author.role != "assistant": - # This is a message from a tool to the assistant (e.g., search result). - # Don't include it in the final output for now. This aligns with - # OpenAI's behavior on models like o4-mini. - return [] - - output_items: list[ResponseOutputItem] = [] - recipient = message.recipient - - if recipient is not None: - # Browser tool calls (browser.search, browser.open, browser.find) - if recipient.startswith("browser."): - output_items.append(_parse_browser_tool_call(message, recipient)) - - # Function calls (should only happen on commentary channel) - elif message.channel == "commentary" and recipient.startswith("functions."): - output_items.extend(_parse_function_call(message, recipient)) - - # Built-in MCP tools (python, browser, container) - elif recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL: - output_items.extend(_parse_reasoning(message)) - - # All other recipients are MCP calls - else: - output_items.extend(_parse_mcp_call(message, recipient)) - - # No recipient - handle based on channel for non-tool messages - else: - output_items.extend(_parse_message_no_recipient(message)) - - return output_items - - -def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]: - if not parser.current_content: - return [] - if parser.current_role != Role.ASSISTANT: - return [] - current_recipient = parser.current_recipient - if current_recipient is not None 
and current_recipient.startswith("browser."): - return [] - - if current_recipient and parser.current_channel in ("commentary", "analysis"): - if current_recipient.startswith("functions."): - rid = random_uuid() - return [ - ResponseFunctionToolCall( - arguments=parser.current_content, - call_id=f"call_{rid}", - type="function_call", - name=current_recipient.split(".")[-1], - id=f"fc_{rid}", - status="in_progress", - ) - ] - # Built-in MCP tools (python, browser, container) - elif current_recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL: - return [ - ResponseReasoningItem( - id=f"rs_{random_uuid()}", - summary=[], - type="reasoning", - content=[ - ResponseReasoningTextContent( - text=parser.current_content, type="reasoning_text" - ) - ], - status=None, - ) - ] - # All other recipients are MCP calls - else: - rid = random_uuid() - server_label, tool_name = _parse_mcp_recipient(current_recipient) - return [ - McpCall( - arguments=parser.current_content, - type="mcp_call", - name=tool_name, - server_label=server_label, - id=f"mcp_{rid}", - status="in_progress", - ) - ] - - if parser.current_channel == "commentary": - # Per Harmony format, preambles (commentary with no recipient) are - # intended to be shown to end-users, unlike analysis channel content. 
- output_text = ResponseOutputText( - text=parser.current_content, - annotations=[], - type="output_text", - logprobs=None, - ) - return [ - ResponseOutputMessage( - id=f"msg_{random_uuid()}", - content=[output_text], - role="assistant", - status="incomplete", - type="message", - ) - ] - - if parser.current_channel == "analysis": - return [ - ResponseReasoningItem( - id=f"rs_{random_uuid()}", - summary=[], - type="reasoning", - content=[ - ResponseReasoningTextContent( - text=parser.current_content, type="reasoning_text" - ) - ], - status=None, - ) - ] - - if parser.current_channel == "final": - output_text = ResponseOutputText( - text=parser.current_content, - annotations=[], # TODO - type="output_text", - logprobs=None, # TODO - ) - text_item = ResponseOutputMessage( - id=f"msg_{random_uuid()}", - content=[output_text], - role="assistant", - # if the parser still has messages (ie if the generator got cut - # abruptly), this should be incomplete - status="incomplete", - type="message", - ) - return [text_item] - - return [] - - def get_stop_tokens_for_assistant_actions() -> list[int]: return get_encoding().stop_tokens_for_assistant_actions() diff --git a/vllm/entrypoints/openai/responses/harmony.py b/vllm/entrypoints/openai/responses/harmony.py new file mode 100644 index 000000000000..460f310926ad --- /dev/null +++ b/vllm/entrypoints/openai/responses/harmony.py @@ -0,0 +1,552 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Harmony ↔ Responses API conversion utilities. + +Handles two directions: + 1. Response Input → Harmony Messages (input parsing) + 2. 
Harmony Messages → Response Output Items (output parsing) +""" + +import json + +from openai.types.responses import ( + ResponseFunctionToolCall, + ResponseOutputItem, + ResponseOutputMessage, + ResponseOutputText, + ResponseReasoningItem, +) +from openai.types.responses.response_function_web_search import ( + ActionFind, + ActionOpenPage, + ActionSearch, + ResponseFunctionWebSearch, +) +from openai.types.responses.response_output_item import McpCall +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent, +) +from openai_harmony import Author, Message, Role, StreamableParser, TextContent + +from vllm.entrypoints.openai.parser.harmony_utils import ( + BUILTIN_TOOL_TO_MCP_SERVER_LABEL, + flatten_chat_text_content, +) +from vllm.entrypoints.openai.responses.protocol import ( + ResponseInputOutputItem, + ResponsesRequest, +) +from vllm.logger import init_logger +from vllm.utils import random_uuid + +logger = init_logger(__name__) + +# --------------------------------------------------------------------------- +# 1. Private helpers for input parsing +# --------------------------------------------------------------------------- + + +def _parse_harmony_format_message(chat_msg: dict) -> Message: + """Reconstruct a Message from Harmony-format dict, + preserving channel, recipient, and content_type.""" + author_dict = chat_msg["author"] + role = author_dict.get("role") + name = author_dict.get("name") + + raw_content = chat_msg.get("content", "") + if isinstance(raw_content, list): + # TODO: Support refusal and non-text content types. 
+ contents = [TextContent(text=c.get("text", "")) for c in raw_content] + elif isinstance(raw_content, str): + contents = [TextContent(text=raw_content)] + else: + contents = [TextContent(text="")] + + if name: + msg = Message.from_author_and_contents(Author.new(Role(role), name), contents) + else: + msg = Message.from_role_and_contents(Role(role), contents) + + channel = chat_msg.get("channel") + if channel: + msg = msg.with_channel(channel) + recipient = chat_msg.get("recipient") + if recipient: + msg = msg.with_recipient(recipient) + content_type = chat_msg.get("content_type") + if content_type: + msg = msg.with_content_type(content_type) + + return msg + + +def _parse_chat_format_message(chat_msg: dict) -> list[Message]: + """Parse an OpenAI chat-format dict into Harmony messages.""" + role = chat_msg.get("role") + if role is None: + raise ValueError(f"Message has no 'role' key: {chat_msg}") + + # Assistant message with tool calls + tool_calls = chat_msg.get("tool_calls") + if role == "assistant" and tool_calls: + msgs: list[Message] = [] + for call in tool_calls: + func = call.get("function", {}) + name = func.get("name", "") + arguments = func.get("arguments", "") or "" + msg = Message.from_role_and_content(Role.ASSISTANT, arguments) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{name}") + msg = msg.with_content_type("json") + msgs.append(msg) + return msgs + + # Tool role message (tool output) + if role == "tool": + name = chat_msg.get("name", "") + if name and not name.startswith("functions."): + name = f"functions.{name}" + content = chat_msg.get("content", "") or "" + content = flatten_chat_text_content(content) + # NOTE: .with_recipient("assistant") is required on tool messages + # to match parse_chat_input_to_harmony_message behavior and ensure + # proper routing in the Harmony protocol. 
+ msg = ( + Message.from_author_and_content(Author.new(Role.TOOL, name), content) + .with_channel("commentary") + .with_recipient("assistant") + ) + return [msg] + + # Default: user/assistant/system messages + content = chat_msg.get("content", "") + if isinstance(content, str): + contents = [TextContent(text=content)] + else: + # TODO: Support refusal. + contents = [TextContent(text=c.get("text", "")) for c in content] + msg = Message.from_role_and_contents(role, contents) + return [msg] + + +# --------------------------------------------------------------------------- +# 2. Public input parsing functions +# --------------------------------------------------------------------------- + + +def response_input_to_harmony( + response_msg: ResponseInputOutputItem, + prev_responses: list[ResponseOutputItem | ResponseReasoningItem], +) -> Message: + """Convert a single ResponseInputOutputItem into a Harmony Message.""" + if not isinstance(response_msg, dict): + response_msg = response_msg.model_dump() + if "type" not in response_msg or response_msg["type"] == "message": + role = response_msg["role"] + content = response_msg["content"] + # Add prefix for developer messages. 
+ # <|start|>developer<|message|># Instructions {instructions}<|end|> + text_prefix = "Instructions:\n" if role == "developer" else "" + if isinstance(content, str): + msg = Message.from_role_and_content(role, text_prefix + content) + else: + contents = [TextContent(text=text_prefix + c["text"]) for c in content] + msg = Message.from_role_and_contents(role, contents) + if role == "assistant": + msg = msg.with_channel("final") + elif response_msg["type"] == "function_call_output": + call_id = response_msg["call_id"] + call_response: ResponseFunctionToolCall | None = None + for prev_response in reversed(prev_responses): + if ( + isinstance(prev_response, ResponseFunctionToolCall) + and prev_response.call_id == call_id + ): + call_response = prev_response + break + if call_response is None: + raise ValueError(f"No call message found for {call_id}") + msg = Message.from_author_and_content( + Author.new(Role.TOOL, f"functions.{call_response.name}"), + response_msg["output"], + ) + elif response_msg["type"] == "reasoning": + content = response_msg["content"] + assert len(content) == 1 + msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"]) + elif response_msg["type"] == "function_call": + msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"]) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{response_msg['name']}") + msg = msg.with_content_type("json") + else: + raise ValueError(f"Unknown input type: {response_msg['type']}") + return msg + + +def response_previous_input_to_harmony(chat_msg) -> list[Message]: + """Parse a message from request.previous_input_messages + into Harmony messages. + + Supports both OpenAI chat format ({"role": "..."}) and + Harmony format ({"author": {"role": "..."}}). 
+ """ + if not isinstance(chat_msg, dict): + chat_msg = chat_msg.model_dump(exclude_none=True) + + if "author" in chat_msg and isinstance(chat_msg.get("author"), dict): + return [_parse_harmony_format_message(chat_msg)] + + return _parse_chat_format_message(chat_msg) + + +def construct_harmony_previous_input_messages( + request: ResponsesRequest, +) -> list[Message]: + """Build a Harmony message list from request.previous_input_messages. + + Filters out system/developer messages to match OpenAI behavior where + instructions are always taken from the most recent Responses API request. + """ + messages: list[Message] = [] + if request.previous_input_messages: + for message in request.previous_input_messages: + # Handle both Message objects and dictionary inputs + if isinstance(message, Message): + message_role = message.author.role + if message_role == Role.SYSTEM or message_role == Role.DEVELOPER: + continue + messages.append(message) + else: + harmony_messages = response_previous_input_to_harmony(message) + for harmony_msg in harmony_messages: + message_role = harmony_msg.author.role + if message_role == Role.SYSTEM or message_role == Role.DEVELOPER: + continue + messages.append(harmony_msg) + return messages + + +# --------------------------------------------------------------------------- +# 3. 
Private helpers for output parsing +# --------------------------------------------------------------------------- + + +def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem: + """Parse browser tool calls (search, open, find) into web search items.""" + if len(message.content) != 1: + raise ValueError("Invalid number of contents in browser message") + content = message.content[0] + + # Parse JSON args (with retry detection) + try: + browser_call = json.loads(content.text) + except json.JSONDecodeError: + logger.warning( + "Invalid JSON in browser tool call, using error placeholder: %s", + content.text, + ) + json_retry_output_message = ( + f"Invalid JSON args, caught and retried: {content.text}" + ) + browser_call = { + "query": json_retry_output_message, + "url": json_retry_output_message, + "pattern": json_retry_output_message, + } + + # Create appropriate action based on recipient + if recipient == "browser.search": + action = ActionSearch( + query=f"cursor:{browser_call.get('query', '')}", type="search" + ) + elif recipient == "browser.open": + action = ActionOpenPage( + url=f"cursor:{browser_call.get('url', '')}", type="open_page" + ) + elif recipient == "browser.find": + action = ActionFind( + pattern=browser_call.get("pattern", ""), + url=f"cursor:{browser_call.get('url', '')}", + type="find", + ) + else: + raise ValueError(f"Unknown browser action: {recipient}") + + return ResponseFunctionWebSearch( + id=f"ws_{random_uuid()}", + action=action, + status="completed", + type="web_search_call", + ) + + +def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]: + """Parse function calls into function tool call items.""" + function_name = recipient.split(".")[-1] + output_items = [] + for content in message.content: + random_id = random_uuid() + response_item = ResponseFunctionToolCall( + arguments=content.text, + call_id=f"call_{random_id}", + type="function_call", + name=function_name, + 
id=f"fc_{random_id}", + ) + output_items.append(response_item) + return output_items + + +def _parse_reasoning(message: Message) -> list[ResponseOutputItem]: + """Parse reasoning/analysis content into reasoning items.""" + output_items = [] + for content in message.content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=content.text, type="reasoning_text") + ], + status=None, + ) + output_items.append(reasoning_item) + return output_items + + +def _parse_final_message(message: Message) -> ResponseOutputItem: + """Parse final channel messages into output message items.""" + contents = [] + for content in message.content: + output_text = ResponseOutputText( + text=content.text, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + contents.append(output_text) + return ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=contents, + role=message.author.role, + status="completed", + type="message", + ) + + +def _parse_mcp_recipient(recipient: str) -> tuple[str, str]: + """Parse MCP recipient into (server_label, tool_name). + + For dotted recipients like "repo_browser.list": + - server_label: "repo_browser" (namespace/server) + - tool_name: "list" (specific tool) + + For simple recipients like "filesystem": + - server_label: "filesystem" + - tool_name: "filesystem" + """ + if "." 
in recipient: + server_label = recipient.split(".")[0] + tool_name = recipient.split(".")[-1] + else: + server_label = recipient + tool_name = recipient + return server_label, tool_name + + +def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]: + """Parse MCP calls into MCP call items.""" + # Handle built-in tools that need server_label mapping + if recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL: + server_label = BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient] + tool_name = recipient + else: + server_label, tool_name = _parse_mcp_recipient(recipient) + + output_items = [] + for content in message.content: + response_item = McpCall( + arguments=content.text, + type="mcp_call", + name=tool_name, + server_label=server_label, + id=f"mcp_{random_uuid()}", + status="completed", + ) + output_items.append(response_item) + return output_items + + +def _parse_message_no_recipient( + message: Message, +) -> list[ResponseOutputItem]: + """Parse a Harmony message with no recipient based on its channel.""" + if message.channel == "analysis": + return _parse_reasoning(message) + + if message.channel in ("commentary", "final"): + # Per Harmony format, preambles (commentary with no recipient) and + # final channel content are both intended to be shown to end-users. + # See: https://cookbook.openai.com/articles/openai-harmony + return [_parse_final_message(message)] + + raise ValueError(f"Unknown channel: {message.channel}") + + +# --------------------------------------------------------------------------- +# 4. Public output parsing functions +# --------------------------------------------------------------------------- + + +def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]: + """Parse a Harmony message into a list of output response items. + + This is the main dispatcher that routes based on channel and recipient. 
+ """ + if message.author.role != "assistant": + # This is a message from a tool to the assistant (e.g., search result). + # Don't include it in the final output for now. This aligns with + # OpenAI's behavior on models like o4-mini. + return [] + + output_items: list[ResponseOutputItem] = [] + recipient = message.recipient + + if recipient is not None: + # Browser tool calls (browser.search, browser.open, browser.find) + if recipient.startswith("browser."): + output_items.append(_parse_browser_tool_call(message, recipient)) + + # Function calls (should only happen on commentary channel) + elif message.channel == "commentary" and recipient.startswith("functions."): + output_items.extend(_parse_function_call(message, recipient)) + + # Built-in MCP tools (python, browser, container) + elif recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL: + output_items.extend(_parse_reasoning(message)) + + # All other recipients are MCP calls + else: + output_items.extend(_parse_mcp_call(message, recipient)) + + # No recipient - handle based on channel for non-tool messages + else: + output_items.extend(_parse_message_no_recipient(message)) + + return output_items + + +def parser_state_to_response_output( + parser: StreamableParser, +) -> list[ResponseOutputItem]: + """Extract in-progress response items from incomplete parser state. + + Called when the parser has buffered content that hasn't formed a + complete message yet (e.g., generation was cut short). 
+ """ + if not parser.current_content: + return [] + if parser.current_role != Role.ASSISTANT: + return [] + current_recipient = parser.current_recipient + if current_recipient is not None and current_recipient.startswith("browser."): + return [] + + if current_recipient and parser.current_channel in ("commentary", "analysis"): + if current_recipient.startswith("functions."): + rid = random_uuid() + return [ + ResponseFunctionToolCall( + arguments=parser.current_content, + call_id=f"call_{rid}", + type="function_call", + name=current_recipient.split(".")[-1], + id=f"fc_{rid}", + status="in_progress", + ) + ] + # Built-in MCP tools (python, browser, container) + elif current_recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL: + return [ + ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=parser.current_content, type="reasoning_text" + ) + ], + status=None, + ) + ] + # All other recipients are MCP calls + else: + rid = random_uuid() + server_label, tool_name = _parse_mcp_recipient(current_recipient) + return [ + McpCall( + arguments=parser.current_content, + type="mcp_call", + name=tool_name, + server_label=server_label, + id=f"mcp_{rid}", + status="in_progress", + ) + ] + + if parser.current_channel == "commentary": + # Per Harmony format, preambles (commentary with no recipient) are + # intended to be shown to end-users, unlike analysis channel content. 
+ output_text = ResponseOutputText( + text=parser.current_content, + annotations=[], + type="output_text", + logprobs=None, + ) + return [ + ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + status="incomplete", + type="message", + ) + ] + + if parser.current_channel == "analysis": + return [ + ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=parser.current_content, type="reasoning_text" + ) + ], + status=None, + ) + ] + + if parser.current_channel == "final": + output_text = ResponseOutputText( + text=parser.current_content, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + text_item = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + # if the parser still has messages (ie if the generator got cut + # abruptly), this should be incomplete + status="incomplete", + type="message", + ) + return [text_item] + + return [] diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index c0ca87a98521..b9d526e25def 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -58,15 +58,11 @@ ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.parser.harmony_utils import ( - construct_harmony_previous_input_messages, get_developer_message, get_stop_tokens_for_assistant_actions, get_system_message, get_user_message, has_custom_tools, - parse_output_message, - parse_remaining_state, - parse_response_input, render_for_completion, ) from vllm.entrypoints.openai.responses.context import ( @@ -76,6 +72,12 @@ SimpleContext, StreamingHarmonyContext, ) +from vllm.entrypoints.openai.responses.harmony import ( + construct_harmony_previous_input_messages, + harmony_to_response_output, + parser_state_to_response_output, + 
response_input_to_harmony, +) from vllm.entrypoints.openai.responses.protocol import ( InputTokensDetails, OutputTokensDetails, @@ -954,9 +956,9 @@ def _make_response_output_items_with_harmony( output_items: list[ResponseOutputItem] = [] num_init_messages = context.num_init_messages for msg in context.messages[num_init_messages:]: - output_items.extend(parse_output_message(msg)) + output_items.extend(harmony_to_response_output(msg)) # Handle the generation stopped in the middle (if any). - last_items = parse_remaining_state(context.parser) + last_items = parser_state_to_response_output(context.parser) if last_items: output_items.extend(last_items) return output_items @@ -1103,13 +1105,13 @@ def _construct_input_messages_with_harmony( else: prev_outputs = [] for response_msg in request.input: - new_msg = parse_response_input(response_msg, prev_outputs) + new_msg = response_input_to_harmony(response_msg, prev_outputs) if new_msg.author.role != "system": messages.append(new_msg) # User passes in a tool call request and its output. We need - # to add the tool call request to prev_outputs so that the - # parse_response_input can find the tool call request when + # to add the tool call request to prev_outputs so that + # response_input_to_harmony can find the tool call request when # parsing the tool call output. 
if isinstance(response_msg, ResponseFunctionToolCall): prev_outputs.append(response_msg) From d3a51da92a031f6c1758771a2b13976ace2eece2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 26 Feb 2026 14:35:41 +0800 Subject: [PATCH 011/154] [Benchmark] Simplify SLA scan (#35306) Signed-off-by: DarkLight1337 --- docs/benchmarking/cli.md | 5 + docs/benchmarking/sweeps.md | 88 ++--- tests/benchmarks/sweep/test_serve_sla.py | 298 ---------------- vllm/benchmarks/sweep/plot.py | 2 +- vllm/benchmarks/sweep/serve.py | 87 +++-- vllm/benchmarks/sweep/serve_sla.py | 433 ++++++++--------------- vllm/benchmarks/sweep/sla_sweep.py | 138 -------- vllm/benchmarks/sweep/startup.py | 3 +- 8 files changed, 254 insertions(+), 800 deletions(-) delete mode 100644 tests/benchmarks/sweep/test_serve_sla.py delete mode 100644 vllm/benchmarks/sweep/sla_sweep.py diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md index 7bb91239c58e..8bbd9b0c0e3e 100644 --- a/docs/benchmarking/cli.md +++ b/docs/benchmarking/cli.md @@ -4,6 +4,11 @@ This section guides you through running benchmark tests with the extensive datas It's a living document, updated as new features and datasets become available. +!!! tip + The benchmarks described on this page are mainly for evaluating specific vLLM features as well as regression testing. + + For benchmarking production vLLM servers, we recommend [GuideLLM](https://github.com/vllm-project/guidellm), an established performance benchmarking framework with live progress updates and automatic report generation. It is also more flexible than `vllm bench serve` in terms of dataset loading, request formatting, and workload patterns. + ## Dataset Overview