From 9511a3f8eec6992d8834ad85af855683bd74ba29 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Wed, 25 Feb 2026 21:01:10 -0500
Subject: [PATCH 001/154] [Bugfix] Fix AttributeError in
 SMControlContextManager (#35338)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/v1/worker/gpu_ubatch_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index 45ba1bef9f2a..754f2981c9f2 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -74,7 +74,7 @@ def __init__(
             "SM control is currently only supported on CUDA"
         )
 
-        total_sms = num_compute_units(torch.cuda.current_device().index)
+        total_sms = num_compute_units(torch.cuda.current_device())
 
         assert comm_sms < total_sms
         self.total_sms = total_sms

From 160424a937d373101818876103227cc986887b55 Mon Sep 17 00:00:00 2001
From: Seungmin Kim <8457324+ehfd@users.noreply.github.com>
Date: Thu, 26 Feb 2026 11:15:51 +0900
Subject: [PATCH 002/154] [Bugfix] Fix CUDA compatibility path setting for both
 datacenter and consumer NVIDIA GPUs (#33992)

Signed-off-by: Seungmin Kim <8457324+ehfd@users.noreply.github.com>
Signed-off-by: Andrew Mello <19512127+88plug@users.noreply.github.com>
Co-authored-by: 88plug <19512127+88plug@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 docker/Dockerfile                             |  12 +-
 .../installation/gpu.cuda.inc.md              |  17 ++
 docs/usage/troubleshooting.md                 |  27 ++-
 tests/cuda/test_cuda_compatibility_path.py    | 187 ++++++++++++++++++
 vllm/env_override.py                          |  82 ++++++++
 vllm/envs.py                                  |  14 ++
 6 files changed, 334 insertions(+), 5 deletions(-)
 create mode 100644 tests/cuda/test_cuda_compatibility_path.py

diff --git a/docker/Dockerfile b/docker/Dockerfile
index cc2ccc11cdcb..717f27b6b232 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -132,8 +132,10 @@ ENV UV_LINK_MODE=copy
 # Verify GCC version
 RUN gcc --version
 
-# Ensure CUDA compatibility library is loaded
-RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/cuda-compat.conf && ldconfig
+# Enable CUDA forward compatibility by setting '-e VLLM_ENABLE_CUDA_COMPATIBILITY=1'
+# Only needed for datacenter/professional GPUs with older drivers.
+# See: https://docs.nvidia.com/deploy/cuda-compatibility/
+ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0
 
 # ============================================================
 # SLOW-CHANGING DEPENDENCIES BELOW
@@ -560,8 +562,10 @@ ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy
 
-# Ensure CUDA compatibility library is loaded
-RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/cuda-compat.conf && ldconfig
+# Enable CUDA forward compatibility by setting '-e VLLM_ENABLE_CUDA_COMPATIBILITY=1'
+# Only needed for datacenter/professional GPUs with older drivers.
+# See: https://docs.nvidia.com/deploy/cuda-compatibility/
+ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0
 
 # ============================================================
 # SLOW-CHANGING DEPENDENCIES BELOW
diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
index 661e0934eefd..da8b7d3fa1dd 100644
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -297,6 +297,23 @@ You can add any other [engine-args](https://docs.vllm.ai/en/latest/configuration
     RUN uv pip install --system git+https://github.com/huggingface/transformers.git
     ```
 
+#### Running on Systems with Older CUDA Drivers
+
+vLLM's Docker image comes with [CUDA compatibility libraries](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) pre-installed. This allows you to run vLLM on systems with NVIDIA drivers that are older than the CUDA Toolkit version used in the image, but only supports select professional and datacenter NVIDIA GPUs.
+
+To enable this feature, set the `VLLM_ENABLE_CUDA_COMPATIBILITY` environment variable to `1` or `true` when running the container:
+
+```bash
+docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HF_TOKEN=<secret>" \
+    --env "VLLM_ENABLE_CUDA_COMPATIBILITY=1" \
+    vllm/vllm-openai <args...>
+```
+
+This will automatically configure `LD_LIBRARY_PATH` to point to the compatibility libraries before loading PyTorch and other dependencies.
+
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index 128c36b784d8..814b03c1e38b 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -318,7 +318,32 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to
 
 ## CUDA error: the provided PTX was compiled with an unsupported toolchain
 
-If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. The released vLLM wheels have to be compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [cuda compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. The solution is to install `cuda-compat` package from your package manager. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. When successfully installed, you should see that the output of `nvidia-smi` will show `CUDA Version: 12.9`. Note that we use CUDA 12.9 as an example here, you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
+If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain`, it means that the CUDA PTX in vLLM's wheels was compiled with a toolchain unsupported by your system. This section also applies if you get the error `RuntimeError: The NVIDIA driver on your system is too old`.
+
+The released vLLM wheels are compiled with a specific version of CUDA toolkit, and the compiled code might fail to run on lower versions of CUDA drivers. Read [CUDA compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for more details. **This is only supported on select professional and datacenter NVIDIA GPUs.**
+
+If you are using the vLLM official Docker image, you can solve this by adding `-e VLLM_ENABLE_CUDA_COMPATIBILITY=1` to your `docker run` command. This will enable the pre-installed CUDA forward compatibility libraries.
+
+If you are running vLLM outside of Docker, the solution is to install the `cuda-compat` package from your package manager with the [CUDA repository](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) enabled. For example, on Ubuntu, you can run `sudo apt-get install cuda-compat-12-9`, and then set `export VLLM_ENABLE_CUDA_COMPATIBILITY=1` and `export VLLM_CUDA_COMPATIBILITY_PATH="/usr/local/cuda-12.9/compat"`.
+
+On Conda, you can install the `conda-forge::cuda-compat` package (e.g., `conda install -c conda-forge cuda-compat=12.9`), then after activating the environment, set `export VLLM_ENABLE_CUDA_COMPATIBILITY=1` and `export VLLM_CUDA_COMPATIBILITY_PATH="${CONDA_PREFIX}/cuda-compat"`.
+
+You can verify the configuration works by running a minimal Python script that initializes CUDA via vLLM:
+
+```bash
+export VLLM_ENABLE_CUDA_COMPATIBILITY=1
+export VLLM_CUDA_COMPATIBILITY_PATH="/usr/local/cuda-12.9/compat"
+
+python3 - << 'EOF'
+import vllm
+import torch
+
+print(f"CUDA available: {torch.cuda.is_available()}")
+print(f"CUDA device count: {torch.cuda.device_count()}")
+EOF
+```
+
+Note that we use CUDA 12.9 as an example here, and you may want to install a higher version of cuda-compat package in case vLLM's default CUDA version goes higher.
 
 ## ptxas fatal: Value 'sm_110a' is not defined for option 'gpu-name'
 
diff --git a/tests/cuda/test_cuda_compatibility_path.py b/tests/cuda/test_cuda_compatibility_path.py
new file mode 100644
index 000000000000..837d2c49cfb6
--- /dev/null
+++ b/tests/cuda/test_cuda_compatibility_path.py
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for CUDA forward compatibility path logic in env_override.py.
+
+Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs,
+including env var parsing, path detection, and deduplication.
+"""
+
+import os
+from unittest.mock import patch
+
+import pytest
+
+# Import the functions directly (they're module-level in env_override)
+# We must import them without triggering the module-level side effects,
+# so we import the functions by name after the module is already loaded.
+from vllm.env_override import (
+    _get_torch_cuda_version,
+    _maybe_set_cuda_compatibility_path,
+)
+
+
+class TestCudaCompatibilityEnvParsing:
+    """Test VLLM_ENABLE_CUDA_COMPATIBILITY env var parsing."""
+
+    def test_disabled_by_default(self, monkeypatch):
+        """Compat path is NOT set when env var is absent."""
+        monkeypatch.delenv("VLLM_ENABLE_CUDA_COMPATIBILITY", raising=False)
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        assert (
+            "LD_LIBRARY_PATH" not in os.environ
+            or os.environ.get("LD_LIBRARY_PATH", "") == ""
+        )
+
+    @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""])
+    def test_disabled_values(self, monkeypatch, value):
+        """Various falsy values should not activate compat path."""
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        # LD_LIBRARY_PATH should not be set (or remain empty)
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert "compat" not in ld_path
+
+    @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "])
+    def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value):
+        """Truthy values activate compat path when a valid path exists."""
+        compat_dir = tmp_path / "compat"
+        compat_dir.mkdir()
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert str(compat_dir) in ld_path
+
+
+class TestCudaCompatibilityPathDetection:
+    """Test path detection: custom override, conda, default."""
+
+    def test_custom_path_override(self, monkeypatch, tmp_path):
+        """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority."""
+        custom_dir = tmp_path / "my-compat"
+        custom_dir.mkdir()
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir))
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert ld_path.startswith(str(custom_dir))
+
+    def test_conda_prefix_fallback(self, monkeypatch, tmp_path):
+        """Falls back to $CONDA_PREFIX/cuda-compat if custom not set."""
+        conda_dir = tmp_path / "conda-env"
+        compat_dir = conda_dir / "cuda-compat"
+        compat_dir.mkdir(parents=True)
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
+        monkeypatch.setenv("CONDA_PREFIX", str(conda_dir))
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert str(compat_dir) in ld_path
+
+    def test_no_valid_path_does_nothing(self, monkeypatch):
+        """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged."""
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path")
+        monkeypatch.delenv("CONDA_PREFIX", raising=False)
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        with patch("vllm.env_override._get_torch_cuda_version", return_value=None):
+            _maybe_set_cuda_compatibility_path()
+        assert os.environ.get("LD_LIBRARY_PATH", "") == ""
+
+    def test_default_cuda_path_fallback(self, monkeypatch, tmp_path):
+        """Falls back to /usr/local/cuda-{ver}/compat via torch version."""
+        fake_cuda = tmp_path / "cuda-12.8" / "compat"
+        fake_cuda.mkdir(parents=True)
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
+        monkeypatch.delenv("CONDA_PREFIX", raising=False)
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        with (
+            patch("vllm.env_override._get_torch_cuda_version", return_value="12.8"),
+            patch(
+                "vllm.env_override.os.path.isdir",
+                side_effect=lambda p: p == "/usr/local/cuda-12.8/compat"
+                or os.path.isdir(p),
+            ),
+        ):
+            _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        assert "/usr/local/cuda-12.8/compat" in ld_path
+
+
+class TestCudaCompatibilityLdPathManipulation:
+    """Test LD_LIBRARY_PATH prepend and deduplication logic."""
+
+    def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path):
+        """Compat path is set when LD_LIBRARY_PATH is empty."""
+        compat_dir = tmp_path / "compat"
+        compat_dir.mkdir()
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
+        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)
+        _maybe_set_cuda_compatibility_path()
+        assert os.environ["LD_LIBRARY_PATH"] == str(compat_dir)
+
+    def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path):
+        """Compat path is prepended before existing entries."""
+        compat_dir = tmp_path / "compat"
+        compat_dir.mkdir()
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
+        monkeypatch.setenv("LD_LIBRARY_PATH", "/usr/lib:/other/lib")
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ["LD_LIBRARY_PATH"]
+        parts = ld_path.split(os.pathsep)
+        assert parts[0] == str(compat_dir)
+        assert "/usr/lib" in parts
+        assert "/other/lib" in parts
+
+    def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path):
+        """If compat path already in LD_LIBRARY_PATH, move to front."""
+        compat_dir = tmp_path / "compat"
+        compat_dir.mkdir()
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
+        monkeypatch.setenv(
+            "LD_LIBRARY_PATH",
+            f"/usr/lib:{compat_dir}:/other/lib",
+        )
+        _maybe_set_cuda_compatibility_path()
+        ld_path = os.environ["LD_LIBRARY_PATH"]
+        parts = ld_path.split(os.pathsep)
+        assert parts[0] == str(compat_dir)
+        assert parts.count(str(compat_dir)) == 1
+
+    def test_already_at_front_is_noop(self, monkeypatch, tmp_path):
+        """If compat path is already first, don't modify LD_LIBRARY_PATH."""
+        compat_dir = tmp_path / "compat"
+        compat_dir.mkdir()
+        original = f"{compat_dir}:/usr/lib"
+        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
+        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
+        monkeypatch.setenv("LD_LIBRARY_PATH", original)
+        _maybe_set_cuda_compatibility_path()
+        assert os.environ["LD_LIBRARY_PATH"] == original
+
+
+class TestGetTorchCudaVersion:
+    """Test _get_torch_cuda_version() helper."""
+
+    def test_returns_string_when_torch_available(self):
+        """Should return a CUDA version string like '12.8'."""
+        version = _get_torch_cuda_version()
+        # torch is installed in vllm's environment
+        assert version is None or isinstance(version, str)
+
+    def test_returns_none_when_torch_missing(self):
+        """Should return None when torch is not importable."""
+        with patch(
+            "vllm.env_override.importlib.util.find_spec",
+            return_value=None,
+        ):
+            assert _get_torch_cuda_version() is None
diff --git a/vllm/env_override.py b/vllm/env_override.py
index e5a40dc3cd8f..181d000a68a7 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -1,7 +1,89 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E402
+import importlib.util
 import os
 
+
+def _get_torch_cuda_version():
+    """Peripheral function to _maybe_set_cuda_compatibility_path().
+    PyTorch version must not be determined by importing directly
+    because it will trigger the CUDA initialization, losing the
+    chance to set the LD_LIBRARY_PATH beforehand.
+    """
+    try:
+        spec = importlib.util.find_spec("torch")
+        if not spec:
+            return None
+        if spec.origin:
+            torch_root = os.path.dirname(spec.origin)
+        elif spec.submodule_search_locations:
+            torch_root = spec.submodule_search_locations[0]
+        else:
+            return None
+        version_path = os.path.join(torch_root, "version.py")
+        if not os.path.exists(version_path):
+            return None
+        # Load the version module without importing torch
+        ver_spec = importlib.util.spec_from_file_location("torch.version", version_path)
+        if not ver_spec or not ver_spec.loader:
+            return None
+        module = importlib.util.module_from_spec(ver_spec)
+        # Avoid registering in sys.modules to not confuse future imports
+        ver_spec.loader.exec_module(module)
+        return getattr(module, "cuda", None)
+    except Exception:
+        return None
+
+
+def _maybe_set_cuda_compatibility_path():
+    """Set LD_LIBRARY_PATH for CUDA forward compatibility if enabled.
+
+    Must run before 'import torch' since torch loads CUDA shared libraries
+    at import time and the dynamic linker only consults LD_LIBRARY_PATH when
+    a library is first loaded.
+
+    CUDA forward compatibility is only supported on select professional and
+    datacenter NVIDIA GPUs. Consumer GPUs (GeForce, RTX) do not support it
+    and will get Error 803 if compat libs are loaded.
+    """
+    enable = os.environ.get("VLLM_ENABLE_CUDA_COMPATIBILITY", "0").strip().lower() in (
+        "1",
+        "true",
+    )
+    if not enable:
+        return
+
+    cuda_compat_path = os.environ.get("VLLM_CUDA_COMPATIBILITY_PATH", "")
+    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
+        conda_prefix = os.environ.get("CONDA_PREFIX", "")
+        conda_compat = os.path.join(conda_prefix, "cuda-compat")
+        if conda_prefix and os.path.isdir(conda_compat):
+            cuda_compat_path = conda_compat
+    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
+        torch_cuda_version = _get_torch_cuda_version()
+        if torch_cuda_version:
+            default_path = f"/usr/local/cuda-{torch_cuda_version}/compat"
+            if os.path.isdir(default_path):
+                cuda_compat_path = default_path
+    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
+        return
+
+    norm_path = os.path.normpath(cuda_compat_path)
+    existing = os.environ.get("LD_LIBRARY_PATH", "")
+    ld_paths = existing.split(os.pathsep) if existing else []
+
+    if ld_paths and ld_paths[0] and os.path.normpath(ld_paths[0]) == norm_path:
+        return  # Already at the front
+
+    new_paths = [norm_path] + [
+        p for p in ld_paths if not p or os.path.normpath(p) != norm_path
+    ]
+    os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(new_paths)
+
+
+_maybe_set_cuda_compatibility_path()
+
 import torch
 
 from vllm.logger import init_logger
diff --git a/vllm/envs.py b/vllm/envs.py
index 0d8cf021e4f2..d62438d5735b 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -239,6 +239,8 @@
     VLLM_WEIGHT_OFFLOADING_DISABLE_UVA: bool = False
     VLLM_DISABLE_LOG_LOGO: bool = False
     VLLM_LORA_DISABLE_PDL: bool = False
+    VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False
+    VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
 
 
 def get_default_cache_root():
@@ -1591,6 +1593,16 @@ def _get_or_set_default() -> str:
     # Disable PDL for LoRA, as enabling PDL with LoRA on SM100 causes
     # Triton compilation to fail.
     "VLLM_LORA_DISABLE_PDL": lambda: bool(int(os.getenv("VLLM_LORA_DISABLE_PDL", "0"))),
+    # Enable CUDA compatibility mode for datacenter GPUs with older
+    # driver versions than the CUDA toolkit major version of vLLM.
+    "VLLM_ENABLE_CUDA_COMPATIBILITY": lambda: (
+        os.environ.get("VLLM_ENABLE_CUDA_COMPATIBILITY", "0").strip().lower()
+        in ("1", "true")
+    ),
+    # Path to the CUDA compatibility libraries when CUDA compatibility is enabled.
+    "VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get(
+        "VLLM_CUDA_COMPATIBILITY_PATH", None
+    ),
 }
 
 
@@ -1731,6 +1743,8 @@ def compile_factors() -> dict[str, object]:
         "VLLM_CPU_MOE_PREPACK",
         "VLLM_CPU_SGL_KERNEL",
         "VLLM_TEST_FORCE_LOAD_FORMAT",
+        "VLLM_ENABLE_CUDA_COMPATIBILITY",
+        "VLLM_CUDA_COMPATIBILITY_PATH",
         "LOCAL_RANK",
         "CUDA_VISIBLE_DEVICES",
         "NO_COLOR",

From 86c3b5a808506e325fd7e59d86d83170fc98c93c Mon Sep 17 00:00:00 2001
From: "Roberto L. Castro"
 <38211239+LopezCastroRoberto@users.noreply.github.com>
Date: Thu, 26 Feb 2026 03:32:50 +0100
Subject: [PATCH 003/154] [BugFix] Fix fp4 quant kernel on CUDA 12.8 (#35210)

Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
---
 .../fp4/activation_nvfp4_quant_fusion_kernels.cu     |  6 ++++--
 csrc/quantization/fp4/nvfp4_quant_kernels.cu         | 12 +++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
index d0264c4d154c..8583b79fd58f 100644
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -107,7 +107,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
               (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
           reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
         } else {
-          out[inOffset] = out_val;
+          int64_t outOffset =
+              rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+          out[outOffset] = out_val;
         }
       }
     }
@@ -140,7 +142,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
   int const numBlocksPerSM =
       vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
 
-  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
+  int sf_n_unpadded = int(n / CVT_FP4_ELTS_PER_THREAD);
 
   int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
   int grid_x = std::min(
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
index c27fb69d44be..b521b4707a4d 100644
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -109,7 +109,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
     cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols,
-                             int32_t sf_n_unpadded, Type const* __restrict__ in,
+                             int32_t sf_n_unpadded, int32_t num_packed_cols,
+                             Type const* __restrict__ in,
                              float const* __restrict__ SFScale,
                              uint32_t* __restrict__ out,
                              uint32_t* __restrict__ SFout) {
@@ -131,7 +132,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   // Iterate over all rows and cols including padded ones -
   //  ensures we visit every single scale factor address to initialize it.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    if (colIdx < sf_n_unpadded) {
+    if (colIdx < num_packed_cols) {
       PackedVec in_vec;
       int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
 
@@ -222,7 +223,8 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
           reinterpret_cast<uint32_t*>(sf_out));
     });
   } else {
-    int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
+    int num_packed_cols = n / CVT_FP4_ELTS_PER_THREAD;
+    int grid_y = vllm::div_round_up(num_packed_cols, static_cast<int>(block.x));
     int grid_x = std::min(
         m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
     dim3 grid(grid_x, grid_y);
@@ -232,8 +234,8 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
       auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
       // NOTE: We don't support e8m0 scales at this moment.
       vllm::cvt_fp16_to_fp4_sf_major<cuda_type, false>
-          <<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, input_ptr,
-                                       input_sf_ptr,
+          <<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, num_packed_cols,
+                                       input_ptr, input_sf_ptr,
                                        reinterpret_cast<uint32_t*>(output_ptr),
                                        reinterpret_cast<uint32_t*>(sf_out));
     });

From 2aa414040243bc24447aa5a4f244f4104064d539 Mon Sep 17 00:00:00 2001
From: hujiaxin0 <524446785@qq.com>
Date: Thu, 26 Feb 2026 11:08:09 +0800
Subject: [PATCH 004/154] openpangu-vl support video input (#34134)

Signed-off-by: hujiaxin <524446785@qq.com>
Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Co-authored-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/multimodal/video.py | 87 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index f123799ca901..fb4e19fa6745 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -747,3 +747,90 @@ def load_bytes(
             **kwargs,
         )
         return out
+
+
+@VIDEO_LOADER_REGISTRY.register("openpangu")
+class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = 32,
+        fps: int = 1,
+        max_duration: int = 300,
+        frame_recovery: bool = False,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Load video frames with dynamic sampling based on duration.
+        Assume that total_num_frames = 10 and fps = 1.
+        The timestamp of frame 0 is 0.0.
+        The timestamp of frame 1 is 1.0.…
+        The timestamp of frame 9 (the last frame) should be 9.0, that is,
+        (total_frames_num – 1) / original_fps.
+
+        Args:
+            data: Raw video bytes
+            num_frames: Not used in dynamic backend
+            fps: Target FPS for sampling (default: 1)
+
+        Returns:
+            Tuple of (frames_array, metadata_dict)
+        """
+        import cv2
+
+        backend = cls().get_cv2_video_api()
+        cap = cv2.VideoCapture(BytesIO(data), backend, [])
+        if not cap.isOpened():
+            raise ValueError("Could not open video stream")
+
+        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        original_fps = float(cap.get(cv2.CAP_PROP_FPS))
+        # The timestamp of the rightmost frame, cannot be used to calculate frame 0.
+        if total_frames_num >= 1 and original_fps > 0:
+            total_duration = (total_frames_num - 1) / original_fps
+        else:
+            total_duration = 0
+
+        # `fps` is the FPS parameter passed in for sampling,
+        # -1 indicates that sampling can be performed directly without FPS limitation.
+        if fps > 0:
+            # Num_frames is the maximum number of frames to sample.
+            # If fewer frames are sampled at this sample_fps, the update duration will be longer. # noqa: E501
+            if num_frames >= int(total_duration * fps) + 1:
+                num_frames = int(total_duration * fps) + 1
+                # Under the new maximum frame rate, the video duration of the rightmost frame, # noqa: E501
+                # cannot be calculated for frame 0.
+                total_duration = min(total_duration, (num_frames - 1) / fps)
+        elif fps != -1:
+            raise ValueError(
+                f"requires dataset fps is -1 or greater than 0 but got {fps}"
+            )
+
+        sample_frame_timestamps = np.linspace(
+            0, total_duration, num_frames, dtype=float
+        )
+        frames_indices = [
+            min(total_frames_num - 1, round(t * original_fps))
+            for t in sample_frame_timestamps
+        ]
+
+        frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery(
+            cap, frames_indices, total_frames_num
+        )
+
+        if recovered_map:
+            logger.info(
+                "Frame recovery: %d frames recovered using forward scan.",
+                len(recovered_map),
+            )
+
+        metadata = {
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
+            "duration": total_duration,
+            "video_backend": "opencv_dynamic_openpangu",
+            "frames_indices": valid_frame_indices,
+            "do_sample_frames": False,
+        }
+        return frames, metadata

From 71dfce6aa6cc14d016154b4e3fd8cc40c05415f9 Mon Sep 17 00:00:00 2001
From: Hanjie Qiu <50634613+hjjq@users.noreply.github.com>
Date: Wed, 25 Feb 2026 19:17:20 -0800
Subject: [PATCH 005/154] [Kernel] Refactor FlashInfer allreduce for mnnvl
 backend (#34109)

Signed-off-by: hjjq <50634613+hjjq@users.noreply.github.com>
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
---
 .../kernels/benchmark_device_communicators.py | 113 ++++++--
 .../kernels/benchmark_fused_collective.py     | 210 ++++++++-------
 .../distributed/test_fusion_all_reduce.py     |  10 +-
 .../passes/fusion/allreduce_rms_fusion.py     | 144 ++++++----
 .../device_communicators/cuda_communicator.py |  28 +-
 .../flashinfer_all_reduce.py                  | 252 ++++++++++++++++++
 vllm/envs.py                                  |  14 +
 7 files changed, 592 insertions(+), 179 deletions(-)
 create mode 100644 vllm/distributed/device_communicators/flashinfer_all_reduce.py

diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
index 7b453fe7b680..d1005461ab93 100644
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -30,6 +30,9 @@
 from torch.distributed import ProcessGroup
 
 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.flashinfer_all_reduce import (
+    FlashInferAllReduce,
+)
 from vllm.distributed.device_communicators.pynccl import (
     PyNcclCommunicator,
     register_nccl_symmetric_ops,
@@ -44,7 +47,7 @@
 logger = init_logger(__name__)
 
 # Default sequence lengths to benchmark
-DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192]
 
 # Fixed hidden size and dtype for all benchmarks
 HIDDEN_SIZE = 8192
@@ -81,6 +84,7 @@ def __init__(
         self.symm_mem_comm = None
         self.symm_mem_comm_multimem = None
         self.symm_mem_comm_two_shot = None
+        self.fi_ar_comm = None
 
         self._init_communicators()
 
@@ -161,6 +165,22 @@ def _init_communicators(self):
             )
             self.symm_mem_comm_two_shot = None
 
+        try:
+            self.fi_ar_comm = FlashInferAllReduce(
+                group=self.cpu_group,
+                device=self.device,
+            )
+            if not self.fi_ar_comm.disabled:
+                logger.info("Rank %s: FlashInferAllReduce initialized", self.rank)
+            else:
+                logger.info("Rank %s: FlashInferAllReduce disabled", self.rank)
+                self.fi_ar_comm = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e
+            )
+            self.fi_ar_comm = None
+
     def benchmark_allreduce(
         self, sequence_length: int, num_warmup: int, num_trials: int
     ) -> dict[str, float]:
@@ -180,7 +200,8 @@ def benchmark_allreduce(
                     lambda t, c=comm: c.custom_all_reduce(t),
                     lambda t, c=comm: c.should_custom_ar(t),
                     comm.capture(),
-                    "1stage",  # env variable value
+                    {"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"},
+                    None,  # no destroy function
                 )
             )
             # CustomAllreduce two-shot
@@ -190,7 +211,8 @@ def benchmark_allreduce(
                     lambda t, c=comm: c.custom_all_reduce(t),
                     lambda t, c=comm: c.should_custom_ar(t),
                     comm.capture(),
-                    "2stage",  # env variable value
+                    {"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"},
+                    None,  # no destroy function
                 )
             )
 
@@ -202,7 +224,8 @@ def benchmark_allreduce(
                     lambda t, c=comm: c.all_reduce(t),
                     lambda t: True,  # Always available if initialized
                     nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
+                    None,  # no destroy function
                 )
             )
             communicators.append(
@@ -211,7 +234,8 @@ def benchmark_allreduce(
                     lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
                     lambda t: True,  # Always available if initialized
                     nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
+                    None,  # no destroy function
                 )
             )
 
@@ -223,7 +247,8 @@ def benchmark_allreduce(
                     lambda t, c=comm: c.all_reduce(t),
                     lambda t, c=comm: c.should_use_symm_mem(t),
                     nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
+                    None,  # no destroy function
                 )
             )
 
@@ -235,29 +260,67 @@ def benchmark_allreduce(
                     lambda t, c=comm: c.all_reduce(t),
                     lambda t, c=comm: c.should_use_symm_mem(t),
                     nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
+                    None,  # no destroy function needed
                 )
             )
 
-        # Benchmark each communicator
-        for name, allreduce_fn, should_use_fn, context, env_var in communicators:
-            # Set environment variable if needed
-            if env_var is not None:
-                os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
-            else:
-                # Clear the environment variable to avoid interference
-                os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
-
-            latency = self.benchmark_allreduce_single(
-                sequence_length,
-                allreduce_fn,
-                should_use_fn,
-                context,
-                num_warmup,
-                num_trials,
+        if self.fi_ar_comm is not None:
+            comm = self.fi_ar_comm
+            communicators.append(
+                (
+                    "flashinfer_trtllm",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_fi_ar(t),
+                    nullcontext(),
+                    {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"},
+                    lambda c=comm: c.destroy(),
+                )
             )
-            if latency is not None:
-                results[name] = latency
+            communicators.append(
+                (
+                    "flashinfer_mnnvl",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_fi_ar(t),
+                    nullcontext(),
+                    {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"},
+                    lambda c=comm: c.destroy(),
+                )
+            )
+
+        # Benchmark each communicator
+        for (
+            name,
+            allreduce_fn,
+            should_use_fn,
+            context,
+            env_dict,
+            destroy_fn,
+        ) in communicators:
+            # Save original values and apply new environment variables
+            saved_env = {key: os.environ.get(key) for key in env_dict}
+            for key, value in env_dict.items():
+                os.environ[key] = value
+            try:
+                latency = self.benchmark_allreduce_single(
+                    sequence_length,
+                    allreduce_fn,
+                    should_use_fn,
+                    context,
+                    num_warmup,
+                    num_trials,
+                )
+                if latency is not None:
+                    results[name] = latency
+            finally:
+                if destroy_fn is not None:
+                    destroy_fn()
+                # Restore environment variables to their original state
+                for key, original_value in saved_env.items():
+                    if original_value is None:
+                        os.environ.pop(key, None)
+                    else:
+                        os.environ[key] = original_value
 
         return results
 
diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py
index 633529edf16d..e18f6a7580fb 100644
--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -5,8 +5,11 @@
 Benchmark for FlashInfer fused collective operations vs standard operations.
 
 This benchmark compares:
-1. FlashInfer's allreduce_fusion (fused allreduce + rmsnorm + optional quant)
-2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
+1. FlashInfer's allreduce_fusion with trtllm backend
+   (fused allreduce + rmsnorm + optional FP8/FP4 quant)
+2. FlashInfer's allreduce_fusion with mnnvl backend
+   (fused allreduce + rmsnorm only, no quantization support)
+3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
 
 Usage with torchrun:
     torchrun --nproc_per_node=2 benchmark_fused_collective.py
@@ -48,8 +51,12 @@
 logger = init_logger(__name__)
 
 # Try to import FlashInfer
+TorchDistBackend = None
 try:
     import flashinfer.comm as flashinfer_comm  # type: ignore
+    from flashinfer.comm.mnnvl import (  # type: ignore
+        TorchDistBackend,
+    )
 
     if not (
         hasattr(flashinfer_comm, "allreduce_fusion")
@@ -74,11 +81,15 @@
     8: 64 * MiB,  # 64MB
 }
 
-# Global workspace tensor for FlashInfer
-_FI_WORKSPACE = None
+# Global workspace tensors for FlashInfer (keyed by backend name)
+_FI_WORKSPACES: dict = {}
+
+# Backends to benchmark
+FLASHINFER_BACKENDS = ["trtllm", "mnnvl"]
 
 
 def setup_flashinfer_workspace(
+    backend: str,
     world_size: int,
     rank: int,
     hidden_dim: int,
@@ -86,41 +97,54 @@ def setup_flashinfer_workspace(
     dtype: torch.dtype,
 ):
     """Setup FlashInfer workspace for fused allreduce operations."""
-    global _FI_WORKSPACE
+    global FI_WORKSPACES
 
     if flashinfer_comm is None:
-        return None, None
+        return None
 
     if world_size not in _FI_MAX_SIZES:
         logger.warning("FlashInfer not supported for world size %s", world_size)
-        return None, None
+        return None
 
     try:
+        kwargs = {}
+        if TorchDistBackend is not None:
+            kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD)
+
         workspace = flashinfer_comm.create_allreduce_fusion_workspace(
-            backend="trtllm",
+            backend=backend,
             world_size=world_size,
             rank=rank,
             max_token_num=max_token_num,
             hidden_dim=hidden_dim,
             dtype=dtype,
+            **kwargs,
         )
 
-        _FI_WORKSPACE = workspace
+        _FI_WORKSPACES[backend] = workspace
         return workspace
     except Exception as e:
-        logger.error("Failed to setup FlashInfer workspace: %s", e)
+        logger.error(
+            "Failed to setup FlashInfer workspace (backend=%s): %s", backend, e
+        )
         return None
 
 
-def cleanup_flashinfer_workspace(workspace):
-    """Cleanup FlashInfer workspace."""
-    if flashinfer_comm is None or workspace is None:
+def cleanup_flashinfer_workspaces():
+    """Cleanup all FlashInfer workspaces."""
+    if flashinfer_comm is None:
         return
 
-    try:
-        workspace.destroy()
-    except Exception as e:
-        logger.error("Failed to cleanup FlashInfer workspace: %s", e)
+    for backend, workspace in _FI_WORKSPACES.items():
+        try:
+            workspace.destroy()
+        except Exception as e:
+            logger.error(
+                "Failed to cleanup FlashInfer workspace (backend=%s): %s",
+                backend,
+                e,
+            )
+    _FI_WORKSPACES.clear()
 
 
 class FlashInferFusedAllReduceParams:
@@ -134,7 +158,7 @@ def __init__(
         self.fp32_acc = True
         self.max_token_num = max_token_num
 
-    def get_trtllm_fused_allreduce_kwargs(self):
+    def get_flashinfer_fused_allreduce_kwargs(self):
         return {
             "launch_with_pdl": self.launch_with_pdl,
             "fp32_acc": self.fp32_acc,
@@ -147,11 +171,12 @@ def flashinfer_fused_allreduce_rmsnorm(
     rms_gamma: torch.Tensor,
     rms_eps: float,
     allreduce_params: "FlashInferFusedAllReduceParams",
+    workspace: object,
     use_oneshot: bool,
     norm_out: torch.Tensor | None = None,
 ):
     """FlashInfer fused allreduce + rmsnorm operation."""
-    if flashinfer_comm is None or _FI_WORKSPACE is None:
+    if flashinfer_comm is None or workspace is None:
         raise RuntimeError("FlashInfer not available or workspace not initialized")
 
     if norm_out is None:
@@ -160,9 +185,13 @@ def flashinfer_fused_allreduce_rmsnorm(
     else:
         residual_out = input_tensor
 
+    layout_code = None
+    if workspace.backend == "trtllm":
+        layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
+
     flashinfer_comm.allreduce_fusion(
         input=input_tensor,
-        workspace=_FI_WORKSPACE,
+        workspace=workspace,
         pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
         residual_in=residual,
         residual_out=residual_out,
@@ -171,10 +200,10 @@ def flashinfer_fused_allreduce_rmsnorm(
         rms_eps=rms_eps,
         quant_out=None,
         scale_out=None,
-        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        layout_code=layout_code,
         scale_factor=None,
         use_oneshot=use_oneshot,
-        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+        **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
     )
 
 
@@ -185,12 +214,16 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
     rms_eps: float,
     scale_factor: torch.Tensor,
     allreduce_params: FlashInferFusedAllReduceParams,
+    workspace: object,
     use_oneshot: bool = True,
     norm_out: torch.Tensor | None = None,
     quant_out: torch.Tensor | None = None,
 ):
-    """FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
-    if flashinfer_comm is None or _FI_WORKSPACE is None:
+    """FlashInfer fused allreduce + rmsnorm + FP8 quantization.
+
+    Note: Only supported by the trtllm backend.
+    """
+    if flashinfer_comm is None or workspace is None:
         raise RuntimeError("FlashInfer not available or workspace not initialized")
 
     if norm_out is None:
@@ -201,7 +234,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
 
     flashinfer_comm.allreduce_fusion(
         input=input_tensor,
-        workspace=_FI_WORKSPACE,
+        workspace=workspace,
         pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
         residual_in=residual,
         residual_out=residual_out,
@@ -213,7 +246,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
         layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
         scale_factor=scale_factor,
         use_oneshot=use_oneshot,
-        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+        **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
     )
 
 
@@ -224,13 +257,17 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
     rms_eps: float,
     input_global_scale: torch.Tensor,
     allreduce_params: FlashInferFusedAllReduceParams,
+    workspace: object,
     quant_out: torch.Tensor,
     use_oneshot: bool,
     output_scale: torch.Tensor,
     norm_out: torch.Tensor | None = None,
 ):
-    """FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
-    if flashinfer_comm is None or _FI_WORKSPACE is None:
+    """FlashInfer fused allreduce + rmsnorm + FP4 quantization.
+
+    Note: Only supported by the trtllm backend.
+    """
+    if flashinfer_comm is None or workspace is None:
         raise RuntimeError("FlashInfer not available or workspace not initialized")
 
     if norm_out is None:
@@ -241,7 +278,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
 
     flashinfer_comm.allreduce_fusion(
         input=input_tensor,
-        workspace=_FI_WORKSPACE,
+        workspace=workspace,
         pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
         residual_in=residual,
         residual_out=residual_out,
@@ -253,7 +290,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
         layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
         scale_factor=input_global_scale,
         use_oneshot=use_oneshot,
-        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+        **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
     )
 
 
@@ -386,13 +423,16 @@ def run_benchmarks(
     dtype: torch.dtype,
     use_residual: bool,
     allreduce_params: FlashInferFusedAllReduceParams | None,
+    workspaces: dict,
     quant_modes: set[str],
     no_oneshot: bool,
 ):
     """Run all benchmarks for given configuration.
 
     Args:
-        quant_mode: "none", "fp8_only", "fp4_only", or "all"
+        allreduce_params: Shared parameters for FlashInfer fused allreduce.
+        workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace.
+        quant_modes: Set of quantization modes: "none", "fp8", "fp4".
     """
     (
         input_tensor,
@@ -454,10 +494,11 @@ def run_benchmarks(
                 logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
                 results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
 
-        # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
-        if flashinfer_comm is not None and allreduce_params is not None:
+        # FlashInfer Fused AllReduce + RMSNorm (all backends)
+        for backend, workspace in workspaces.items():
             for use_oneshot in use_oneshot_options:
                 suffix = "_oneshot" if use_oneshot else "_twoshot"
+                key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}"
                 try:
                     time_ms = benchmark_operation(
                         flashinfer_fused_allreduce_rmsnorm,
@@ -467,14 +508,17 @@ def run_benchmarks(
                         rms_gamma=rms_gamma,
                         rms_eps=rms_eps,
                         allreduce_params=allreduce_params,
+                        workspace=workspace,
                         use_oneshot=use_oneshot,
                     )
-                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
+                    results[key] = time_ms
                 except Exception as e:
-                    logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
-                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
-                        "inf"
+                    logger.error(
+                        "FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s",
+                        backend,
+                        e,
                     )
+                    results[key] = float("inf")
 
     if "fp8" in quant_modes:
         # Standard AllReduce + RMSNorm + FP8 Quant
@@ -540,10 +584,12 @@ def run_benchmarks(
                     "inf"
                 )
 
-        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
-        if flashinfer_comm is not None and allreduce_params is not None:
+        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only)
+        if "trtllm" in workspaces:
+            trtllm_ws = workspaces["trtllm"]
             for use_oneshot in use_oneshot_options:
                 suffix = "_oneshot" if use_oneshot else "_twoshot"
+                key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}"
                 try:
                     time_ms = benchmark_operation(
                         flashinfer_fused_allreduce_rmsnorm_fp8_quant,
@@ -555,19 +601,16 @@ def run_benchmarks(
                         scale_factor=scale_fp8,
                         quant_out=quant_out_fp8,
                         allreduce_params=allreduce_params,
+                        workspace=trtllm_ws,
                         use_oneshot=use_oneshot,
                     )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
-                        time_ms
-                    )
+                    results[key] = time_ms
                 except Exception as e:
                     logger.error(
-                        "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
+                        "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s",
                         e,
                     )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
-                        float("inf")
-                    )
+                    results[key] = float("inf")
 
     if "fp4" in quant_modes and current_platform.has_device_capability(100):
         # Standard AllReduce + RMSNorm + FP4 Quant
@@ -627,10 +670,12 @@ def run_benchmarks(
                     "inf"
                 )
 
-        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
-        if flashinfer_comm is not None and allreduce_params is not None:
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only)
+        if "trtllm" in workspaces:
+            trtllm_ws = workspaces["trtllm"]
             for use_oneshot in use_oneshot_options:
                 suffix = "_oneshot" if use_oneshot else "_twoshot"
+                key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}"
                 try:
                     time_ms = benchmark_operation(
                         flashinfer_fused_allreduce_rmsnorm_fp4_quant,
@@ -641,49 +686,18 @@ def run_benchmarks(
                         rms_eps=rms_eps,
                         input_global_scale=scale_fp4,
                         allreduce_params=allreduce_params,
+                        workspace=trtllm_ws,
                         quant_out=fp4_quant_out,
                         output_scale=fp4_output_scale,
                         use_oneshot=use_oneshot,
                     )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
-                        time_ms
-                    )
+                    results[key] = time_ms
                 except Exception as e:
                     logger.error(
-                        "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
+                        "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s",
                         e,
                     )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
-                        float("inf")
-                    )
-
-        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
-        if flashinfer_comm is not None and allreduce_params is not None:
-            try:
-                time_ms = benchmark_operation(
-                    flashinfer_fused_allreduce_rmsnorm_fp4_quant,
-                    input_tensor,
-                    residual=residual,
-                    norm_out=norm_out,
-                    rms_gamma=rms_gamma,
-                    rms_eps=rms_eps,
-                    input_global_scale=scale_fp4,
-                    allreduce_params=allreduce_params,
-                    quant_out=fp4_quant_out,
-                    output_scale=fp4_output_scale,
-                    use_oneshot=False,
-                )
-                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
-                    time_ms
-                )
-            except Exception as e:
-                logger.error(
-                    "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
-                    e,
-                )
-                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
-                    "inf"
-                )
+                    results[key] = float("inf")
 
     return results
 
@@ -1021,8 +1035,7 @@ def main():
 
     configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
 
-    # Setup FlashInfer workspace if available
-    workspace = None
+    # Setup FlashInfer workspaces for all backends
     allreduce_params = None
 
     if flashinfer_comm is not None:
@@ -1037,15 +1050,17 @@ def main():
             args.hidden_dim * max_element_size
         )
 
-        workspace = setup_flashinfer_workspace(
-            world_size,
-            rank,
-            args.hidden_dim,
-            max_num_token,
-            dtype=workspace_dtype,
-        )
+        for backend in FLASHINFER_BACKENDS:
+            setup_flashinfer_workspace(
+                backend=backend,
+                world_size=world_size,
+                rank=rank,
+                hidden_dim=args.hidden_dim,
+                max_token_num=max_num_token,
+                dtype=workspace_dtype,
+            )
 
-        if workspace is not None:
+        if _FI_WORKSPACES:
             allreduce_params = FlashInferFusedAllReduceParams(
                 max_token_num=max_num_token,
             )
@@ -1071,6 +1086,7 @@ def main():
                 dtype,
                 use_residual,
                 allreduce_params,
+                workspaces=_FI_WORKSPACES,
                 quant_modes=quant_modes,
                 no_oneshot=args.no_oneshot,
             )
@@ -1109,11 +1125,13 @@ def main():
 
     finally:
         # Cleanup
-        if workspace is not None:
-            cleanup_flashinfer_workspace(workspace)
+        cleanup_flashinfer_workspaces()
 
         dist.barrier()
 
 
 if __name__ == "__main__":
-    main()
+    from vllm.config import VllmConfig, set_current_vllm_config
+
+    with set_current_vllm_config(VllmConfig()):
+        main()
diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py
index d48f22970313..6d5113b1e84b 100644
--- a/tests/compile/passes/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py
@@ -142,7 +142,6 @@ def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
             *(scaled_fp4_quant(w, wg) for w, wg in zip(self.w, wgscale))
         )
         self.wq, self.wscale = list(wq_gen), list(wscale_gen)
-        print(f"{self.wq=}, {self.wscale=}")
 
     def forward(self, hidden_states):
         # avoid having graph input be an arg to a pattern directly
@@ -199,6 +198,7 @@ def ops_in_model_before(self):
 @pytest.mark.parametrize("hidden_size", [64])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
+@pytest.mark.parametrize("flashinfer_allreduce_backend", ["trtllm", "mnnvl"])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 @pytest.mark.skipif(
     not find_spec("flashinfer")
@@ -215,6 +215,7 @@ def test_all_reduce_fusion_pass_replace(
     dtype: torch.dtype,
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
+    flashinfer_allreduce_backend,
 ):
     num_processes = 2
     if (
@@ -238,6 +239,7 @@ def run_torch_spawn(fn, nprocs):
                 dtype,
                 enable_rms_norm_custom_op,
                 enable_quant_fp8_custom_op,
+                flashinfer_allreduce_backend,
             ),
             nprocs=nprocs,
         )
@@ -255,6 +257,7 @@ def all_reduce_fusion_pass_on_test_model(
     dtype: torch.dtype,
     enable_rms_norm_custom_op,
     enable_quant_fp8_custom_op,
+    flashinfer_allreduce_backend,
 ):
     set_random_seed(0)
 
@@ -270,6 +273,7 @@ def all_reduce_fusion_pass_on_test_model(
             "WORLD_SIZE": str(world_size),
             "MASTER_ADDR": "localhost",
             "MASTER_PORT": "12345",
+            "VLLM_FLASHINFER_ALLREDUCE_BACKEND": flashinfer_allreduce_backend,
         }
     )
 
@@ -317,6 +321,10 @@ def all_reduce_fusion_pass_on_test_model(
         compiled_model = torch.compile(model, backend=backend)
         compiled_model(hidden_states)
 
+        results_unfused = model(hidden_states)
+        results_fused = compiled_model(hidden_states)
+        torch.testing.assert_close(results_unfused, results_fused, atol=1e-2, rtol=1e-2)
+
         assert all_reduce_fusion_pass.matched_count == 4, (
             f"{all_reduce_fusion_pass.matched_count=}"
         )
diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index b6a1314af9ef..44dc3d67bb98 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -22,7 +22,9 @@
     kFp8StaticTensorSym,
 )
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.utils.torch_utils import (
+    direct_register_custom_op,
+)
 
 from ..inductor_pass import enable_fake_mode
 from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
@@ -44,8 +46,6 @@
     except ImportError:
         pass
 
-logger = init_logger(__name__)
-
 if hasattr(torch.ops._C, "scaled_fp4_quant"):
     STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default
 
@@ -82,7 +82,16 @@
 
 
 if flashinfer_comm is not None:
-    _FI_WORKSPACE = None
+    from vllm.distributed.device_communicators.flashinfer_all_reduce import (
+        destroy_fi_ar_workspace,
+        get_fi_ar_quant_workspace,
+        get_fi_ar_workspace,
+        initialize_fi_ar_quant_workspace,
+        initialize_fi_ar_workspace,
+    )
+
+    ar_fusion_patterns = flashinfer_comm.AllReduceFusionPattern
+
     MiB = 1024 * 1024
 
     def call_trtllm_fused_allreduce_norm(
@@ -122,9 +131,19 @@ def call_trtllm_fused_allreduce_norm(
             max_one_shot_size is None or current_tensor_size <= max_one_shot_size * MiB
         )
 
-        assert _FI_WORKSPACE is not None, (
-            "Flashinfer must be enabled when using flashinfer"
+        # Select workspace based on pattern: quant patterns use the
+        # trtllm quant workspace, non-quant patterns use the primary workspace.
+        if pattern_code in (
+            ar_fusion_patterns.kARResidualRMSNormFP8Quant,
+            ar_fusion_patterns.kARResidualRMSNormFP4Quant,
+        ):
+            workspace = get_fi_ar_quant_workspace()
+        else:
+            workspace = get_fi_ar_workspace()
+        assert workspace is not None, (
+            "Flashinfer workspace must be initialized when using flashinfer"
         )
+        assert flashinfer_comm is not None
         if norm_out is None:
             norm_out = allreduce_in
             residual_out = residual
@@ -133,25 +152,30 @@ def call_trtllm_fused_allreduce_norm(
             # as flashinfer does not support rms_norm
             # and allreduce_out together
             residual_out = allreduce_in
-        # For the sizes that are smaller than the max size,
-        # we only use flashinfer one shot allreduce
+
+        layout_code = None
+        # layout_code only supported by trtllm backend
+        if workspace.backend == "trtllm":
+            # in vllm we only support swizzled layout
+            layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
+
         flashinfer_comm.allreduce_fusion(
             input=allreduce_in,
-            workspace=_FI_WORKSPACE,
+            workspace=workspace,
             pattern=pattern_code,
-            residual_in=residual,
+            launch_with_pdl=launch_with_pdl,
+            output=None,
             residual_out=residual_out,
             norm_out=norm_out,
+            quant_out=quant_out,
+            scale_out=scale_out,
+            residual_in=residual,
             rms_gamma=rms_gamma,
             rms_eps=rms_eps,
-            launch_with_pdl=launch_with_pdl,
+            scale_factor=scale_factor,
+            layout_code=layout_code,
             use_oneshot=use_oneshot,
             fp32_acc=fp32_acc,
-            quant_out=quant_out,
-            scale_out=scale_out,
-            # in vllm we only support swizzled layout
-            layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
-            scale_factor=scale_factor,
         )
 
     def call_trtllm_fused_allreduce_norm_fake(
@@ -729,29 +753,36 @@ def __init__(self, config: VllmConfig) -> None:
             scope="global",
         )
 
-        try:
-            self.workspace = flashinfer_comm.create_allreduce_fusion_workspace(
-                backend="trtllm",
-                world_size=self.tp_size,
-                rank=rank,
-                max_token_num=self.max_token_num,
-                hidden_dim=self.hidden_dim,
-                dtype=self.model_dtype,
-            )
-        except RuntimeError as e:
-            if "multicast" not in str(e).lower():
-                raise
-            logger.warning_once(
-                "AllReduce fusion pass is disabled: flashinfer workspace "
-                "creation failed: %s. This is expected on GPUs without "
-                "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). "
-                "Falling back to non-fused allreduce.",
-                str(e),
-            )
-            return
+        for workspace_init_fn in [
+            initialize_fi_ar_workspace,
+            initialize_fi_ar_quant_workspace,
+        ]:
+            try:
+                workspace_init_fn(
+                    world_size=self.tp_size,
+                    rank=rank,
+                    max_token_num=self.max_token_num,
+                    hidden_dim=self.hidden_dim,
+                    dtype=self.model_dtype,
+                    group=self.group,
+                )
+            except Exception as e:
+                if "multicast" in str(e).lower():
+                    logger.warning(
+                        "AllReduce fusion pass is disabled: flashinfer workspace "
+                        "creation failed: %s. This is expected on GPUs without "
+                        "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). "
+                        "Falling back to non-fused allreduce.",
+                        str(e),
+                    )
+                else:
+                    logger.warning(
+                        "Failed to initialize FlashInfer All Reduce workspace: %s. "
+                        "AllReduce fusion pass will be disabled.",
+                        e,
+                    )
+                return
 
-        global _FI_WORKSPACE
-        _FI_WORKSPACE = self.workspace
         self.allreduce_params = FlashInferFusedAllReduceParams(
             world_size=self.tp_size,
             max_token_num=self.max_token_num,
@@ -762,32 +793,34 @@ def __init__(self, config: VllmConfig) -> None:
 
     @enable_fake_mode
     def register_patterns(self) -> None:
+        supports_quantization = get_fi_ar_quant_workspace() is not None
         for epsilon in [1e-5, 1e-6]:
-            AllReduceFusedRMSNormStaticQuantFP8Pattern(
-                epsilon,
-                self.model_dtype,
-                self.device,
-                self.allreduce_params,
-            ).register(self.patterns)
-            AllReduceFusedAddRMSNormStaticQuantFP8Pattern(
-                epsilon,
-                self.model_dtype,
-                self.device,
-                self.allreduce_params,
-            ).register(self.patterns)
-            if current_platform.has_device_capability(100):
-                AllReduceFusedRMSNormStaticQuantNVFP4Pattern(
+            if supports_quantization:
+                AllReduceFusedRMSNormStaticQuantFP8Pattern(
                     epsilon,
                     self.model_dtype,
                     self.device,
                     self.allreduce_params,
                 ).register(self.patterns)
-                AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(
+                AllReduceFusedAddRMSNormStaticQuantFP8Pattern(
                     epsilon,
                     self.model_dtype,
                     self.device,
                     self.allreduce_params,
                 ).register(self.patterns)
+                if current_platform.has_device_capability(100):
+                    AllReduceFusedRMSNormStaticQuantNVFP4Pattern(
+                        epsilon,
+                        self.model_dtype,
+                        self.device,
+                        self.allreduce_params,
+                    ).register(self.patterns)
+                    AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(
+                        epsilon,
+                        self.model_dtype,
+                        self.device,
+                        self.allreduce_params,
+                    ).register(self.patterns)
             AllReduceRMSNormPattern(
                 epsilon,
                 self.model_dtype,
@@ -825,6 +858,5 @@ def __call__(self, graph: fx.Graph) -> None:
     def __del__(self) -> None:
         if getattr(self, "disabled", True):
             return
-        if getattr(self, "workspace", None) is not None:
-            with contextlib.suppress(Exception):
-                self.workspace.destroy()
+        with contextlib.suppress(Exception):
+            destroy_fi_ar_workspace()
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 4c78871e1fda..62e2b90377fd 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -34,19 +34,25 @@ def __init__(
             # custom allreduce or torch symm mem can be used only by tp
             use_custom_allreduce = False
             use_torch_symm_mem = False
+            use_flashinfer_allreduce = False
         else:
             from vllm.distributed.parallel_state import _ENABLE_CUSTOM_ALL_REDUCE
 
             use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
             use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM
+            use_flashinfer_allreduce = envs.VLLM_ALLREDUCE_USE_FLASHINFER
 
         self.use_custom_allreduce = use_custom_allreduce
         self.use_torch_symm_mem = use_torch_symm_mem
+        self.use_flashinfer_allreduce = use_flashinfer_allreduce
 
         # lazy import to avoid documentation build error
         from vllm.distributed.device_communicators.custom_all_reduce import (
             CustomAllreduce,
         )
+        from vllm.distributed.device_communicators.flashinfer_all_reduce import (
+            FlashInferAllReduce,
+        )
         from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
         from vllm.distributed.device_communicators.quick_all_reduce import (
             QuickAllReduce,
@@ -65,12 +71,20 @@ def __init__(
         self.ca_comm: CustomAllreduce | None = None
         self.qr_comm: QuickAllReduce | None = None
         self.symm_mem_comm: SymmMemCommunicator | None = None
+        self.fi_ar_comm: FlashInferAllReduce | None = None
+
         if use_torch_symm_mem and current_platform.is_cuda():
             self.symm_mem_comm = SymmMemCommunicator(
                 group=self.cpu_group,
                 device=self.device,
             )
 
+        if self.use_flashinfer_allreduce and self.world_size > 1:
+            self.fi_ar_comm = FlashInferAllReduce(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
             self.ca_comm = CustomAllreduce(
@@ -136,7 +150,7 @@ def all_reduce(self, input_):
             out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_)
             if out is not None:
                 return out
-        # always try quick reduce first, then custom allreduce,
+        # always try quick reduce first, then flashinfer, then custom allreduce,
         # and then pynccl. (quick reduce just for ROCM MI3*)
         qr_comm = self.qr_comm
         if (
@@ -147,6 +161,15 @@ def all_reduce(self, input_):
             out = qr_comm.quick_all_reduce(input_)
             assert out is not None
             return out
+        fi_ar_comm = self.fi_ar_comm
+        if (
+            fi_ar_comm is not None
+            and not fi_ar_comm.disabled
+            and fi_ar_comm.should_use_fi_ar(input_)
+        ):
+            out = fi_ar_comm.all_reduce(input_)
+            assert out is not None
+            return out
         ca_comm = self.ca_comm
         if (
             ca_comm is not None
@@ -270,6 +293,9 @@ def destroy(self):
             self.pynccl_comm = None
         if self.ca_comm is not None:
             self.ca_comm = None
+        if self.fi_ar_comm is not None:
+            self.fi_ar_comm.destroy()
+            self.fi_ar_comm = None
         if self.all2all_manager is not None:
             self.all2all_manager.destroy()
             self.all2all_manager = None
diff --git a/vllm/distributed/device_communicators/flashinfer_all_reduce.py b/vllm/distributed/device_communicators/flashinfer_all_reduce.py
new file mode 100644
index 000000000000..ea16c93763cb
--- /dev/null
+++ b/vllm/distributed/device_communicators/flashinfer_all_reduce.py
@@ -0,0 +1,252 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+import vllm.envs as envs
+from vllm.config.compilation import PassConfig
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+fi_ar_available = False
+try:
+    import flashinfer.comm as flashinfer_comm  # type: ignore[no-redef]
+    from flashinfer.comm.mnnvl import (
+        TorchDistBackend,  # type: ignore[import-not-found, no-redef]
+    )
+
+    fi_ar_available = hasattr(flashinfer_comm, "allreduce_fusion")
+except ImportError:
+    pass
+
+# Global workspace for standalone allreduce and non-quant ar+rms fusion
+_fi_ar_workspace = None
+# Extra workspace for quant fusion patterns (only supported by trtllm backend)
+# Only created if primary workspace is not already trtllm
+_fi_ar_quant_workspace = None
+
+
+def get_fi_ar_workspace():
+    return _fi_ar_workspace
+
+
+def get_fi_ar_quant_workspace():
+    return _fi_ar_quant_workspace
+
+
+def initialize_fi_ar_workspace(
+    world_size: int,
+    rank: int,
+    max_token_num: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    group: ProcessGroup,
+) -> None:
+    """
+    Initialize the workspace if not already initialized.
+
+    Currently, this function is called by either the AllReduceFusionPass
+    or the FlashInferAllReduce backend for standalone allreduce.
+    If the fusion pass is enabled via
+    --compilation-config.pass_config.fuse_allreduce_rms=true,
+    it will create the workspace first, and the standalone backend
+    will reuse the workspace. Otherwise, the standalone backend will
+    create the workspace.
+    """
+    global _fi_ar_workspace
+    if _fi_ar_workspace is not None:
+        return
+
+    backend = envs.VLLM_FLASHINFER_ALLREDUCE_BACKEND
+    comm_backend = TorchDistBackend(group=group)
+    _fi_ar_workspace = flashinfer_comm.create_allreduce_fusion_workspace(
+        backend=backend,
+        world_size=world_size,
+        rank=rank,
+        max_token_num=max_token_num,
+        hidden_dim=hidden_dim,
+        dtype=dtype,
+        comm_backend=comm_backend,
+    )
+    assert _fi_ar_workspace is not None
+    logger.debug(
+        "Initialized FlashInfer All Reduce workspace: backend=%s, "
+        "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s",
+        backend,
+        world_size,
+        rank,
+        max_token_num,
+        hidden_dim,
+        dtype,
+    )
+
+
+def initialize_fi_ar_quant_workspace(
+    world_size: int,
+    rank: int,
+    max_token_num: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    group: ProcessGroup,
+) -> None:
+    """
+    Initialize the workspace used by quantization fusion patterns.
+
+    Currently this always creates a workspace for trtllm backend as only it
+    supports quantization fusion (FP8/FP4). If the primary workspace
+    is already trtllm, the quant workspace aliases to it.
+    """
+    global _fi_ar_quant_workspace
+    if _fi_ar_quant_workspace is not None:
+        return
+
+    # If primary workspace is already trtllm, reuse it
+    if _fi_ar_workspace is not None and _fi_ar_workspace.backend == "trtllm":
+        _fi_ar_quant_workspace = _fi_ar_workspace
+        return
+
+    comm_backend = TorchDistBackend(group=group)
+    _fi_ar_quant_workspace = flashinfer_comm.create_allreduce_fusion_workspace(
+        backend="trtllm",
+        world_size=world_size,
+        rank=rank,
+        max_token_num=max_token_num,
+        hidden_dim=hidden_dim,
+        dtype=dtype,
+        comm_backend=comm_backend,
+    )
+    assert _fi_ar_quant_workspace is not None
+    logger.debug(
+        "Initialized FlashInfer All Reduce workspace: backend=trtllm, "
+        "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s",
+        world_size,
+        rank,
+        max_token_num,
+        hidden_dim,
+        dtype,
+    )
+
+
+def destroy_fi_ar_workspace():
+    global _fi_ar_workspace
+    global _fi_ar_quant_workspace
+    if (
+        _fi_ar_quant_workspace is not None
+        and _fi_ar_quant_workspace is not _fi_ar_workspace
+    ):
+        _fi_ar_quant_workspace.destroy()
+    _fi_ar_quant_workspace = None
+    if _fi_ar_workspace is not None:
+        _fi_ar_workspace.destroy()
+        _fi_ar_workspace = None
+
+
+class FlashInferAllReduce:
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: int | str | torch.device,
+    ):
+        self.disabled = True
+
+        if not fi_ar_available:
+            logger.info(
+                "FlashInfer All Reduce is disabled because flashinfer is not available"
+            )
+            return
+
+        if not current_platform.is_cuda():
+            logger.info(
+                "FlashInfer All Reduce is disabled because it requires CUDA platform"
+            )
+            return
+
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+        self.rank = dist.get_rank(self.group)
+        self.device = device
+        if self.world_size == 1:
+            return
+
+        # Use the same threshold as the allreduce-rms fusion pass
+        # TODO: tune the threshold
+        MiB = 1024 * 1024
+        max_workspace_size = PassConfig.default_fi_allreduce_fusion_max_size_mb().get(
+            self.world_size, None
+        )
+        if not max_workspace_size:
+            logger.warning(
+                "FlashInfer All Reduce is disabled because it "
+                "is not supported for world_size=%d.",
+                self.world_size,
+            )
+            return
+        self.max_workspace_size = max_workspace_size * MiB
+        self.max_num_tokens = 0
+        self.disabled = False
+
+    def _ensure_workspace(self, hidden_dim: int, dtype: torch.dtype) -> bool:
+        """Ensure the all reduce workspace is initialized."""
+        if get_fi_ar_workspace() is not None:
+            return True
+        if self.max_num_tokens == 0:
+            element_size = torch.tensor([], dtype=dtype, device="cpu").element_size()
+            self.max_num_tokens = self.max_workspace_size // (hidden_dim * element_size)
+        try:
+            initialize_fi_ar_workspace(
+                world_size=self.world_size,
+                rank=self.rank,
+                max_token_num=self.max_num_tokens,
+                hidden_dim=hidden_dim,
+                dtype=dtype,
+                group=self.group,
+            )
+            return True
+        except Exception as e:
+            logger.warning(
+                "Failed to initialize FlashInfer All Reduce workspace: %s. "
+                "FlashInfer All Reduce will be disabled.",
+                e,
+            )
+            self.disabled = True
+            return False
+
+    def should_use_fi_ar(self, input_tensor: torch.Tensor) -> bool:
+        if self.disabled:
+            return False
+
+        if not input_tensor.is_cuda:
+            return False
+
+        if not input_tensor.is_contiguous():
+            return False
+
+        if len(input_tensor.shape) != 2:
+            return False
+
+        num_tokens, hidden_dim = input_tensor.shape
+        if not self.max_num_tokens:
+            element_size = torch.tensor([], dtype=input_tensor.dtype).element_size()
+            self.max_num_tokens = self.max_workspace_size // (hidden_dim * element_size)
+
+        if num_tokens > self.max_num_tokens:
+            return False
+
+        return self._ensure_workspace(hidden_dim, input_tensor.dtype)
+
+    def all_reduce(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        workspace = get_fi_ar_workspace()
+        return flashinfer_comm.allreduce_fusion(
+            input=input_tensor,
+            workspace=workspace,
+            pattern=flashinfer_comm.AllReduceFusionPattern.kAllReduce,
+        )
+
+    def destroy(self):
+        if not self.disabled:
+            destroy_fi_ar_workspace()
diff --git a/vllm/envs.py b/vllm/envs.py
index d62438d5735b..d560cfc7753c 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -168,6 +168,7 @@
     VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
         "latency"
     )
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto"
     VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
@@ -206,6 +207,7 @@
     VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
+    VLLM_ALLREDUCE_USE_FLASHINFER: bool = False
     VLLM_TUNED_CONFIG_FOLDER: str | None = None
     VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set()
     VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: bool = False
@@ -1290,6 +1292,14 @@ def _get_or_set_default() -> str:
         "latency",
         ["throughput", "latency", "masked_gemm"],
     ),
+    # Flashinfer fused allreduce backend.
+    # "auto" will default to "mnnvl", which performs mostly same/better than "trtllm".
+    # But "mnnvl" backend does not support fuse with quantization.
+    "VLLM_FLASHINFER_ALLREDUCE_BACKEND": env_with_choices(
+        "VLLM_FLASHINFER_ALLREDUCE_BACKEND",
+        "auto",
+        ["auto", "trtllm", "mnnvl"],
+    ),
     # Control the workspace buffer size for the FlashInfer backend.
     "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int(
         os.getenv("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", str(394 * 1024 * 1024))
@@ -1448,6 +1458,10 @@ def _get_or_set_default() -> str:
     "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
         int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
     ),
+    # Whether to use FlashInfer allreduce
+    "VLLM_ALLREDUCE_USE_FLASHINFER": lambda: bool(
+        int(os.getenv("VLLM_ALLREDUCE_USE_FLASHINFER", "0"))
+    ),
     # Experimental: use this to enable MCP tool calling for non harmony models
     "VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": lambda: bool(
         int(os.getenv("VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT", "0"))

From 13025e71e888330aa3277948120051ebbc2674c7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 25 Feb 2026 20:42:40 -0800
Subject: [PATCH 006/154] [Model Runner V2] Add coding style guide (#35325)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/model_runner.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index ccab6cec8c78..9e0cae6feef1 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -1,5 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+NOTE: Coding style guide for this file:
+This model runner is shared by all models: text and multimodal, generative
+and embedding, public and private. As a result, this file must only contain
+code that is common to every model. Model-specific behavior belongs in the
+appropriate model-specific files.
+
+In other words:
+* Be paranoid about changing this file. It should remain stable.
+* Be even more paranoid about adding new lines. It should remain minimal.
+
+Even for shared features (for example, different parallelism modes), keep the
+complexity out of this path. The less common the feature, the more it should be
+hidden. Prefer utility functions defined elsewhere and call them from here,
+instead of embedding feature-specific logic directly.
+"""
+
 import functools
 import gc
 import time

From 4171ff6dd9ce18f452c4e9267f5bf090c0989b04 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Date: Thu, 26 Feb 2026 05:00:10 +0000
Subject: [PATCH 007/154] [CPU][Feat]  Enable KleidiAI INT8_W4A8 for all input
 dtypes (#34890)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
---
 .../linear/mixed_precision/dynamic_4bit.py    | 32 +++++++++++++++++--
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py b/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py
index 3dfe06f1b130..d0515027628e 100644
--- a/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py
+++ b/vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py
@@ -42,12 +42,13 @@ def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
             not in [
                 torch.float32,
                 torch.bfloat16,
+                torch.float16,
             ]
         ):
             return (
                 False,
                 "Dynamic4bitLinearKernel on Arm requires Float32 or"
-                " BFloat16 activations",
+                " BFloat16 or Float16 activations",
             )
         if c.full_weight_shape[0] % c.group_size != 0:
             return (
@@ -118,8 +119,30 @@ def apply_weights(
         x: torch.Tensor,
         bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
+        # PyTorch / KleidiAI kernels natively support the following configs:
+        # - channelwise with bfloat16 / float32 activations
+        # - groupwise with float32 activations
+        # To support:
+        # - groupwise with bfloat16/float16 activations: we need to upcast
+        #   activations to float32 before matmul and downcast back to bfloat16/float16
+        # - channelwise with float16 activations, we need to upcast activations to
+        #   float32 before matmul and downcast back to float16
+        # Note: these activations will be dynamically quantized to int8 by the kernel.
+
         c = self.config
+        is_groupwise = c.group_size != c.partition_weight_shape[0]
+        # dtype of activations before they get dynamically quantized to int8
+        original_pre_quant_act_dtype = x.dtype
+        pre_quant_act_dtype = original_pre_quant_act_dtype
+        if (
+            is_groupwise and pre_quant_act_dtype == torch.bfloat16
+        ) or pre_quant_act_dtype == torch.float16:
+            pre_quant_act_dtype = torch.float32
+
         x_2d = x.reshape(-1, x.shape[-1])
+        if pre_quant_act_dtype != original_pre_quant_act_dtype:
+            x_2d = x_2d.to(pre_quant_act_dtype)
+
         out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
 
         w_q = getattr(layer, self.w_q_name)
@@ -129,5 +152,8 @@ def apply_weights(
             c.group_size,
             c.partition_weight_shape[0],
             c.partition_weight_shape[1],
-        )
-        return output.reshape(out_shape)
+        ).reshape(out_shape)
+
+        if pre_quant_act_dtype != original_pre_quant_act_dtype:
+            output = output.to(original_pre_quant_act_dtype)
+        return output

From 9d379410179b649f4e7651940debc35c4ac7c0a5 Mon Sep 17 00:00:00 2001
From: Jason Li <jasonlizhengjian@gmail.com>
Date: Wed, 25 Feb 2026 21:00:12 -0800
Subject: [PATCH 008/154] [torch.compile] Sequence Parallelism threshold
 compile ranges (#28672)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
Signed-off-by: Jason Li <jasonlizhengjian@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
---
 tests/compile/conftest.py                     |  34 +++++
 tests/compile/fusions_e2e/conftest.py         |  89 ++++++++++--
 .../compile/fusions_e2e/test_tp2_async_tp.py  | 133 ++++++++++++++++++
 tests/compile/test_config.py                  |   1 +
 .../test_sequence_parallelism_threshold.py    | 110 +++++++++++++++
 .../passes/fusion/sequence_parallelism.py     | 123 +++++++++++++---
 vllm/config/compilation.py                    |   9 +-
 vllm/config/vllm.py                           |  57 +++++++-
 8 files changed, 524 insertions(+), 32 deletions(-)
 create mode 100644 tests/compile/conftest.py
 create mode 100644 tests/compile/test_sequence_parallelism_threshold.py

diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py
new file mode 100644
index 000000000000..6aafac7bcad3
--- /dev/null
+++ b/tests/compile/conftest.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from contextlib import contextmanager
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from vllm.platforms.interface import DeviceCapability
+
+
+@pytest.fixture
+def mock_cuda_platform():
+    """
+    Fixture that returns a factory for creating mocked CUDA platforms.
+
+    Usage:
+        def test_something(mock_cuda_platform):
+            with mock_cuda_platform(is_cuda=True, capability=(9, 0)):
+                # test code
+    """
+
+    @contextmanager
+    def _mock_platform(is_cuda: bool = True, capability: tuple[int, int] | None = None):
+        mock_platform = MagicMock()
+        mock_platform.is_cuda.return_value = is_cuda
+        if capability is not None:
+            mock_platform.get_device_capability.return_value = DeviceCapability(
+                *capability
+            )
+        with patch("vllm.platforms.current_platform", mock_platform):
+            yield mock_platform
+
+    return _mock_platform
diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py
index 1d9f6cda9fd6..40b4de57f66f 100644
--- a/tests/compile/fusions_e2e/conftest.py
+++ b/tests/compile/fusions_e2e/conftest.py
@@ -94,7 +94,7 @@ def run(
             run_model(full_compilation_config, model_name, **model_kwargs)
 
         num_compile_ranges = len(full_compilation_config.get_compile_ranges())
-        assert num_compile_ranges in [1, 2]
+        assert num_compile_ranges in [1, 2, 3]
 
         print(f"Compile ranges: {full_compilation_config.get_compile_ranges()}")
         print("Fusion results:")
@@ -107,12 +107,33 @@ def run(
 
         # Now check the matches
         for match_name in matches_check:
-            num_ranges_activated = (
-                1 if match_name == "ar_rms_fusion" else num_compile_ranges
-            )
-            n_expected = tp_size * num_ranges_activated
-
             log_matches = list(int(ms) for ms in log_matches_dict[match_name])
+
+            # AR+RMS skips the largest range; SP skips the smallest.
+            # When both are enabled, AR+RMS activation count is
+            # model-dependent (hidden_size affects threshold), so derive
+            # from log data.
+            if (
+                match_name == "ar_rms_fusion"
+                and "sequence_parallel" in matches_check
+                and num_compile_ranges >= 2
+            ):
+                assert (
+                    len(log_matches) >= tp_size and len(log_matches) % tp_size == 0
+                ), (
+                    f"Expected multiple of {tp_size} ar_rms log entries, "
+                    f"found {len(log_matches)}"
+                )
+                num_ranges_activated = len(log_matches) // tp_size
+            elif (
+                match_name in ("ar_rms_fusion", "sequence_parallel")
+                and num_compile_ranges >= 2
+            ):
+                num_ranges_activated = num_compile_ranges - 1
+            else:
+                num_ranges_activated = num_compile_ranges
+
+            n_expected = tp_size * num_ranges_activated
             assert len(log_matches) == n_expected, (
                 f"Could not find {n_expected} {match_name} "
                 f"(found {len(log_matches)}) in:\n {log_holder.text}"
@@ -122,8 +143,8 @@ def run(
 
             if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check:
                 # AR+rms+quant takes precedence over rms+quant if activated.
-                # That means we get full matching where ar+rms+quant was not activated,
-                # and less where it was
+                # That means we get full matching where ar+rms+quant was not
+                # activated, and less where it was (only the smallest range).
                 assert sum(m == expected_matches for m in log_matches) == tp_size * (
                     num_ranges_activated - 1
                 ), "Expecting full rms+quant fusion where ar+rms+quant not activated"
@@ -135,6 +156,43 @@ def run(
                     f"Expecting at least {expected_matches - matches.ar_rms_fusion} "
                     f"where ar+rms+quant was activated"
                 )
+            elif (
+                match_name == "async_tp"
+                and "sequence_parallel" in matches_check
+                and num_compile_ranges >= 2
+            ):
+                # AsyncTP only finds patterns on ranges where SP ran.
+                n_sp_ranges = num_compile_ranges - 1
+                assert (
+                    sum(m == expected_matches for m in log_matches)
+                    == tp_size * n_sp_ranges
+                ), (
+                    f"Expecting {expected_matches} async_tp on "
+                    f"{tp_size * n_sp_ranges} SP-range entries, "
+                    f"found: {log_matches}"
+                )
+                assert sum(m == 0 for m in log_matches) == tp_size, (
+                    f"Expecting 0 async_tp on {tp_size} small-range entries "
+                    f"(no SP), found: {log_matches}"
+                )
+            elif (
+                match_name == "ar_rms_fusion"
+                and "sequence_parallel" in matches_check
+                and num_compile_ranges >= 2
+            ):
+                # SP consumes allreduce patterns first, so AR+RMS finds
+                # full matches only on the smallest range (no SP).
+                assert sum(m == expected_matches for m in log_matches) == tp_size, (
+                    f"Expecting {expected_matches} ar_rms on "
+                    f"{tp_size} small-range entries, found: {log_matches}"
+                )
+                assert sum(m == 0 for m in log_matches) == tp_size * (
+                    num_ranges_activated - 1
+                ), (
+                    f"Expecting 0 ar_rms on "
+                    f"{tp_size * (num_ranges_activated - 1)} large-range "
+                    f"entries (SP took precedence), found: {log_matches}"
+                )
             else:
                 expected_matches_list = [expected_matches] * n_expected
                 assert sorted(log_matches) == expected_matches_list, (
@@ -142,7 +200,7 @@ def run(
                     f"found: {sorted(log_matches)}"
                 )
 
-            if match_name == "ar_rms_fusion":
+            if match_name == "ar_rms_fusion" and num_compile_ranges >= 2:
                 log_matches = re.findall(
                     r"pass_manager.py:\d+] Skipping "
                     r".*AllReduceFusionPass.* with compile range",
@@ -155,4 +213,17 @@ def run(
                     f"(found {len(log_matches)}) in:\n {log_holder.text}"
                 )
 
+            if match_name == "sequence_parallel" and num_compile_ranges >= 2:
+                log_matches = re.findall(
+                    r"pass_manager.py:\d+] Skipping "
+                    r".*SequenceParallelismPass.* with compile range",
+                    log_holder.text,
+                )
+
+                n_expected = tp_size * (num_compile_ranges - num_ranges_activated)
+                assert len(log_matches) == n_expected, (
+                    f'Could not find {n_expected} "Skipping SequenceParallelismPass" '
+                    f"(found {len(log_matches)}) in:\n {log_holder.text}"
+                )
+
     return run
diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py
index 4769ca1e0b63..921839ea0692 100644
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -66,6 +66,9 @@ def test_tp2_async_tp_fp8_fusions(
             enable_qk_norm_rope_fusion=True,
             enable_sp=True,
             fuse_gemm_comms=True,
+            fuse_allreduce_rms=False,
+            # Override threshold for testing (models have small hidden_size)
+            sp_min_token_num=512,
         ),
     )
 
@@ -123,6 +126,9 @@ def test_tp2_async_tp_fusions(
             enable_qk_norm_rope_fusion=True,
             enable_sp=True,
             fuse_gemm_comms=True,
+            fuse_allreduce_rms=False,
+            # Override threshold for testing (models have small hidden_size)
+            sp_min_token_num=512,
         ),
     )
 
@@ -141,3 +147,130 @@ def test_tp2_async_tp_fusions(
         matches_check,
         tp_size=2,
     )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp8, llama4_scout_fp8],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_sp_ar_rms_fp8_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    matches = matches_fn(n_layers)
+
+    if is_blackwell():
+        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
+        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+            fuse_allreduce_rms=True,
+            # Override threshold for testing (models have small hidden_size)
+            sp_min_token_num=512,
+        ),
+    )
+
+    matches_check = [
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+        "ar_rms_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b, qwen3_a3b],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_sp_ar_rms_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            enable_qk_norm_rope_fusion=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+            fuse_allreduce_rms=True,
+            # Override threshold for testing (models have small hidden_size)
+            sp_min_token_num=512,
+        ),
+    )
+
+    matches_check = [
+        "norm_rope_fusion",
+        "ar_rms_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index eb2f0669ed5f..3ba70b6aad38 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
                 fuse_norm_quant=True,
                 fuse_act_quant=True,
                 eliminate_noops=True,
+                sp_min_token_num=512 if enable_sp else None,
             ),
             cudagraph_mode=cudagraph_mode,
         )
diff --git a/tests/compile/test_sequence_parallelism_threshold.py b/tests/compile/test_sequence_parallelism_threshold.py
new file mode 100644
index 000000000000..42e374cd95d7
--- /dev/null
+++ b/tests/compile/test_sequence_parallelism_threshold.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.compilation.passes.fusion.sequence_parallelism import (
+    SP_MIN_HIDDEN_SIZE,
+    SP_MIN_PER_GPU_SIZE_MB,
+    get_sequence_parallelism_threshold,
+)
+
+
+class TestGetSequenceParallelismThreshold:
+    """Tests for get_sequence_parallelism_threshold function."""
+
+    def test_non_cuda_returns_none(self, mock_cuda_platform):
+        """Non-CUDA platforms should return None."""
+        with mock_cuda_platform(is_cuda=False):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=8192, tp_size=2, element_size=2
+            )
+        assert result is None
+
+    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
+        """Unsupported device capabilities (e.g., sm80) should return None."""
+        with mock_cuda_platform(capability=(8, 0)):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=8192, tp_size=2, element_size=2
+            )
+        assert result is None
+
+    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
+        """H100 with hidden_size below threshold should return None."""
+        with mock_cuda_platform(capability=(9, 0)):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=4096,
+                tp_size=2,
+                element_size=2,  # 4096 < 8192
+            )
+        assert result is None
+
+    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
+        """H100 with large enough hidden_size should return calculated threshold."""
+        with mock_cuda_platform(capability=(9, 0)):
+            hidden_size = 8192
+            tp_size = 2
+            element_size = 2  # float16/bfloat16
+
+            result = get_sequence_parallelism_threshold(
+                hidden_size=hidden_size,
+                tp_size=tp_size,
+                element_size=element_size,
+            )
+
+            # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
+            MiB = 1024 * 1024
+            expected = int(
+                (SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * MiB)
+                // (hidden_size * element_size)
+            )
+            assert result == expected
+            assert result == 1024
+
+    @pytest.mark.parametrize(
+        "hidden_size,tp_size,element_size,expected",
+        [
+            # Boundary: exactly at min hidden size threshold, tp_size=1
+            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
+            (8192, 1, 2, 512),
+            # Larger hidden size reduces token threshold
+            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
+            (16384, 1, 2, 256),
+            # Larger tp_size increases token threshold
+            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
+            (8192, 4, 2, 2048),
+            # Larger element_size (fp32) reduces token threshold
+            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
+            (8192, 2, 4, 512),
+        ],
+    )
+    def test_threshold_calculation_variations(
+        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
+    ):
+        """Test threshold calculation with various parameter combinations."""
+        with mock_cuda_platform(capability=(9, 0)):
+            result = get_sequence_parallelism_threshold(
+                hidden_size=hidden_size,
+                tp_size=tp_size,
+                element_size=element_size,
+            )
+            assert result == expected
+
+    def test_hidden_size_boundary(self, mock_cuda_platform):
+        """Test behavior at the exact hidden_size boundary."""
+        with mock_cuda_platform(capability=(9, 0)):
+            # Just below threshold
+            result = get_sequence_parallelism_threshold(
+                hidden_size=SP_MIN_HIDDEN_SIZE[90] - 1,
+                tp_size=2,
+                element_size=2,
+            )
+            assert result is None
+
+            # Exactly at threshold
+            result = get_sequence_parallelism_threshold(
+                hidden_size=SP_MIN_HIDDEN_SIZE[90],
+                tp_size=2,
+                element_size=2,
+            )
+            assert result is not None
diff --git a/vllm/compilation/passes/fusion/sequence_parallelism.py b/vllm/compilation/passes/fusion/sequence_parallelism.py
index 5fb932d7284b..63de85932cb7 100644
--- a/vllm/compilation/passes/fusion/sequence_parallelism.py
+++ b/vllm/compilation/passes/fusion/sequence_parallelism.py
@@ -27,6 +27,63 @@
 
 logger = init_logger(__name__)
 
+# Min hidden size per device capability for sequence parallelism
+# Only apply sequence parallelism for models with hidden_size >= threshold
+SP_MIN_HIDDEN_SIZE: dict[int, int] = {
+    90: 8192,  # H100: only for models with hidden_size >= 8192
+}
+
+# Min size per GPU per device capability for sequence parallelism
+# Total min size = min_per_gpu_size * tp_size
+# This ensures the threshold scales appropriately with tensor parallelism
+SP_MIN_PER_GPU_SIZE_MB: dict[int, float] = {
+    90: 8,  # 8MB per GPU for H100
+}
+
+
+def get_sequence_parallelism_threshold(
+    hidden_size: int,
+    tp_size: int,
+    element_size: int,
+) -> int | None:
+    """
+    Calculate the minimum token threshold for applying sequence parallelism.
+
+    Returns None if sequence parallelism should not be applied based on model size.
+
+    Branching logic based on device capability:
+    - Check if hidden_size >= SP_MIN_HIDDEN_SIZE[device_capability]
+    - If not, returns None (SP disabled for small models on this device)
+    - If yes, calculates threshold based on per-GPU size
+
+    Formula: min_token_num = (min_per_gpu_size_mb * tp_size * MiB) //
+             (hidden_size * element_size)
+    """
+    from vllm.platforms import current_platform
+
+    if not current_platform.is_cuda():
+        return None
+
+    capability = current_platform.get_device_capability()
+    if capability is None:
+        return None
+    device_capability = capability.to_int()
+
+    # Check if device has configured thresholds
+    min_hidden_size = SP_MIN_HIDDEN_SIZE.get(device_capability)
+    min_per_gpu_size_mb = SP_MIN_PER_GPU_SIZE_MB.get(device_capability)
+
+    if min_hidden_size is None or min_per_gpu_size_mb is None:
+        return None
+
+    # Only apply sequence parallelism for models meeting the size threshold
+    if hidden_size < min_hidden_size:
+        return None
+
+    MiB = 1024 * 1024
+    min_size = min_per_gpu_size_mb * MiB * tp_size
+    return int(min_size // (hidden_size * element_size))
+
 
 def get_first_out_wrapper(
     fn: Callable[..., Sequence[torch.Tensor]],
@@ -309,6 +366,23 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
     def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
+        # Get min_token_num threshold
+        # Read min_token_num from config (calculated during config init)
+        self.min_token_num = None
+        if config.model_config is not None:
+            pass_config = config.compilation_config.pass_config
+            self.min_token_num = pass_config.sp_min_token_num
+
+            if self.min_token_num is not None:
+                # Take the min to avoid exceeding max_num_batched_tokens
+                max_batched = config.scheduler_config.max_num_batched_tokens
+                if max_batched is not None:
+                    self.min_token_num = min(self.min_token_num, max_batched)
+                logger.debug_once(
+                    f"Sequence parallelism min token threshold: {self.min_token_num}",
+                    scope="global",
+                )
+
         # Used to clean up redundant views created temporarily
         # to circumvent residual shape change issues
         self.noop_cleanup = NoOpEliminationPass(config)
@@ -339,29 +413,36 @@ def __init__(self, config: VllmConfig) -> None:
         self.dump_patterns(config, self.patterns)
 
     def is_applicable_for_range(self, compile_range: Range) -> bool:
-        # When sequence parallelism is enabled, the residual tensor from RMSNorm
-        # needs to be split along the sequence dimension. However, this dimension
-        # is symbolic during piecewise compilation, and splitting symbolic shapes
-        # is not supported.
-        #
-        # This pass is therefore only applied when the sequence dimension is
-        # concrete:
-        # 1. In full-graph compilation mode (no Dynamo splitting ops are used).
-        #   For this case we always pad num_tokens to be a multiple of
-        #   tensor_parallel_size, so there's no need to check shape % tp_size == 0.
-        # 2. For specific shape provided during compilation (e.g., from
-        #    `compile_sizes`), which must be divisible by the tensor-parallel
-        #    size.
+        """
+        Determines if sequence parallelism should be applied for the given
+        compile range.
+
+        SP is only beneficial for larger batch sizes where the communication
+        overhead is amortized. For small batches, the overhead of splitting
+        and gathering tensors across TP ranks outweighs the benefits.
+
+        Returns False (SP disabled) when:
+        - Using piecewise compilation with non-concrete or TP-indivisible sizes
+        - min_token_num is None (SP disabled for this device/config)
+        - The compile range starts below the minimum token threshold
+        """
+        # For piecewise compilation (not using inductor graph partition),
+        # we need concrete sizes that are divisible by TP for correct splitting
         if (
-            not self.compilation_config.splitting_ops
-            or self.compilation_config.use_inductor_graph_partition
+            not self.compilation_config.use_inductor_graph_partition
+            and self.compilation_config.splitting_ops
         ):
-            return True
-        tp_size = get_tensor_model_parallel_world_size()
-        result: bool = (compile_range.is_single_size()) and (
-            compile_range.end % tp_size == 0
-        )
-        return result
+            tp_size = get_tensor_model_parallel_world_size()
+            if not compile_range.is_single_size() or compile_range.end % tp_size != 0:
+                return False
+
+        # min_token_num is None when SP is disabled for this device/config
+        # (e.g., non-CUDA platform, unsupported GPU, or small hidden_size)
+        if self.min_token_num is None:
+            return False
+
+        # Only apply SP when batch size meets the minimum threshold
+        return compile_range.start >= self.min_token_num
 
     @VllmInductorPass.time_and_log
     def __call__(self, graph: fx.Graph) -> None:
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index ab6f3da06cdf..d22e9a96e0f3 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -118,7 +118,9 @@ class PassConfig:
     eliminate_noops: bool = Field(default=True)
     """Eliminate no-op ops."""
     enable_sp: bool = Field(default=None)
-    """Enable sequence parallelism."""
+    """Enable sequence parallelism. Requires TP>1. Automatically disabled
+    if the model's hidden_size is too small for SP to be beneficial
+    (threshold is device-capability dependent)."""
     fuse_gemm_comms: bool = Field(default=None)
     """Enable async TP."""
     fuse_allreduce_rms: bool = Field(default=None)
@@ -155,6 +157,11 @@ class PassConfig:
                 8: 1,  # 1MB
             },
         }, where key is the device capability"""
+    sp_min_token_num: int | None = None
+    """The minimum number of tokens above which vllm should use
+    sequence parallelism. Specified as an integer token count.
+    Unspecified will fallback to default values which are compute
+    capability and world size dependent."""
 
     # TODO(luka) better pass enabling system.
 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index ef71a05d3937..fba3c64a9af0 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -853,8 +853,33 @@ def has_blocked_weights():
                 logger.warning("Sequence Parallelism requires TP>1, disabling")
                 self.compilation_config.pass_config.enable_sp = False
                 self.compilation_config.pass_config.fuse_gemm_comms = False
+            else:
+                # Compute SP threshold early; disable if None (model too
+                # small) before +rms_norm gets forced into custom_ops.
+                pass_config = self.compilation_config.pass_config
+                if pass_config.sp_min_token_num is None:
+                    from vllm.compilation.passes.fusion.sequence_parallelism import (
+                        get_sequence_parallelism_threshold,
+                    )
+
+                    tp_size = self.parallel_config.tensor_parallel_size
+                    hidden_size = self.model_config.get_hidden_size()
+                    element_size = self.model_config.dtype.itemsize
+                    pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
+                        hidden_size, tp_size, element_size
+                    )
 
-            elif "-rms_norm" in self.compilation_config.custom_ops:
+                if pass_config.sp_min_token_num is None:
+                    logger.warning(
+                        "Model hidden_size too small for the SP "
+                        "threshold heuristic, disabling. To force SP, "
+                        "set pass_config.sp_min_token_num manually."
+                    )
+                    self.compilation_config.pass_config.enable_sp = False
+                    self.compilation_config.pass_config.fuse_gemm_comms = False
+
+        if self.compilation_config.pass_config.enable_sp:
+            if "-rms_norm" in self.compilation_config.custom_ops:
                 logger.warning(
                     "RMS norm force disabled, sequence parallelism might break"
                 )
@@ -1456,6 +1481,36 @@ def _set_compile_ranges(self):
                         "allreduce-rms fusion will be enabled for all num_tokens."
                     )
 
+        # Add the compile ranges for sequence parallelism
+        if compilation_config.pass_config.enable_sp:
+            pass_config = compilation_config.pass_config
+
+            # Calculate min_token_num if not explicitly provided
+            # User override works regardless of hidden_size
+            if pass_config.sp_min_token_num is None:
+                from vllm.compilation.passes.fusion.sequence_parallelism import (
+                    get_sequence_parallelism_threshold,
+                )
+
+                tp_size = self.parallel_config.tensor_parallel_size
+                hidden_size = self.model_config.get_hidden_size()
+                element_size = self.model_config.dtype.itemsize
+                pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
+                    hidden_size, tp_size, element_size
+                )
+
+            min_token_num = pass_config.sp_min_token_num
+            max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+            if min_token_num is not None and (
+                max_num_batched_tokens is not None
+                and min_token_num < max_num_batched_tokens
+                and min_token_num > 1
+            ):
+                # Add split point at min_token_num - 1 to ensure SP applies
+                # starting from min_token_num
+                # This creates ranges: [1, min-1] (no SP), [min, max] (SP applies)
+                computed_compile_ranges_split_points.append(min_token_num - 1)
+
         if compilation_config.pass_config.fuse_rope_kvcache:
             max_token_num = (
                 compilation_config.pass_config.rope_kvcache_fusion_max_token_num

From 4a9c07a0a2b8308a045476b48be29e37c349274b Mon Sep 17 00:00:00 2001
From: Daniele <36171005+dtrifiro@users.noreply.github.com>
Date: Thu, 26 Feb 2026 06:39:48 +0100
Subject: [PATCH 009/154] [BugFix] anthropic/serving_messages: fix tool call
 arguments streaming (#34887)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
---
 vllm/entrypoints/anthropic/serving.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index 8fb347aabed3..dc037313de33 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -432,6 +432,19 @@ async def message_stream_converter(
                                 data = chunk.model_dump_json(exclude_unset=True)
                                 yield wrap_data_with_event(data, "content_block_start")
                                 content_block_started = True
+                                if tool_call.function and tool_call.function.arguments:
+                                    chunk = AnthropicStreamEvent(
+                                        index=content_block_index,
+                                        type="content_block_delta",
+                                        delta=AnthropicDelta(
+                                            type="input_json_delta",
+                                            partial_json=tool_call.function.arguments,
+                                        ),
+                                    )
+                                    data = chunk.model_dump_json(exclude_unset=True)
+                                    yield wrap_data_with_event(
+                                        data, "content_block_delta"
+                                    )
 
                             else:
                                 chunk = AnthropicStreamEvent(

From 186ea22efefd2c6f4f9b7fcb657bd00f50cb465a Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Thu, 26 Feb 2026 01:35:16 -0500
Subject: [PATCH 010/154] [Misc][Harmony] Move Responses API only harmony utils
 to responses/harmony.py (#35339)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 .../openai/parser/test_harmony_utils.py       | 467 +--------------
 .../openai/responses/test_harmony_utils.py    | 463 +++++++++++++++
 .../openai/responses/test_mcp_tools.py        |  10 +-
 .../openai/parser/harmony_utils.py            | 518 +---------------
 vllm/entrypoints/openai/responses/harmony.py  | 552 ++++++++++++++++++
 vllm/entrypoints/openai/responses/serving.py  |  20 +-
 6 files changed, 1040 insertions(+), 990 deletions(-)
 create mode 100644 tests/entrypoints/openai/responses/test_harmony_utils.py
 create mode 100644 vllm/entrypoints/openai/responses/harmony.py

diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
index b73a0b0745c7..7842a1fcd757 100644
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -2,13 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from openai.types.responses import (
-    ResponseFunctionToolCall,
-    ResponseOutputMessage,
-    ResponseReasoningItem,
-)
-from openai.types.responses.response_output_item import McpCall
-from openai_harmony import Author, Message, Role, TextContent
+from openai_harmony import Message, Role
 
 from tests.entrypoints.openai.utils import verify_harmony_messages
 from vllm.entrypoints.openai.parser.harmony_utils import (
@@ -18,20 +12,21 @@
     has_custom_tools,
     parse_chat_input_to_harmony_message,
     parse_chat_output,
-    parse_input_to_harmony_message,
-    parse_output_message,
+)
+from vllm.entrypoints.openai.responses.harmony import (
+    response_previous_input_to_harmony,
 )
 
 
 class TestCommonParseInputToHarmonyMessage:
     """
     Tests for scenarios that are common to both Chat Completion
-    parse_chat_input_to_harmony_message and Responsees API
-    parse_input_to_harmony_message functions.
+    parse_chat_input_to_harmony_message and Responses API
+    response_previous_input_to_harmony functions.
     """
 
     @pytest.fixture(
-        params=[parse_chat_input_to_harmony_message, parse_input_to_harmony_message]
+        params=[parse_chat_input_to_harmony_message, response_previous_input_to_harmony]
     )
     def parse_function(self, request):
         return request.param
@@ -216,81 +211,6 @@ def test_array_content_with_missing_text(self, parse_function):
         assert messages[0].content[1].text == "actual text"
 
 
-class TestParseInputToHarmonyMessage:
-    """
-    Tests for scenarios that are specific to the Responses API
-    parse_input_to_harmony_message function.
-    """
-
-    def test_message_with_empty_content(self):
-        """Test parsing message with empty string content."""
-        chat_msg = {
-            "role": "user",
-            "content": "",
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].content[0].text == ""
-
-    def test_tool_message_with_string_content(self):
-        """Test parsing tool message with string content."""
-        chat_msg = {
-            "role": "tool",
-            "name": "get_weather",
-            "content": "The weather in San Francisco is sunny, 72°F",
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].author.role == Role.TOOL
-        assert messages[0].author.name == "functions.get_weather"
-        assert (
-            messages[0].content[0].text == "The weather in San Francisco is sunny, 72°F"
-        )
-        assert messages[0].channel == "commentary"
-
-    def test_tool_message_with_array_content(self):
-        """Test parsing tool message with array content."""
-        chat_msg = {
-            "role": "tool",
-            "name": "search_results",
-            "content": [
-                {"type": "text", "text": "Result 1: "},
-                {"type": "text", "text": "Result 2: "},
-                {
-                    "type": "image",
-                    "url": "http://example.com/img.png",
-                },  # Should be ignored
-                {"type": "text", "text": "Result 3"},
-            ],
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].author.role == Role.TOOL
-        assert messages[0].author.name == "functions.search_results"
-        assert messages[0].content[0].text == "Result 1: Result 2: Result 3"
-
-    def test_tool_message_with_empty_content(self):
-        """Test parsing tool message with None content."""
-        chat_msg = {
-            "role": "tool",
-            "name": "empty_tool",
-            "content": None,
-        }
-
-        messages = parse_input_to_harmony_message(chat_msg)
-
-        assert len(messages) == 1
-        assert messages[0].author.role == Role.TOOL
-        assert messages[0].author.name == "functions.empty_tool"
-        assert messages[0].content[0].text == ""
-
-
 class TestParseChatInputToHarmonyMessage:
     """
     Tests for scenarios that are specific to the Chat Completion API
@@ -888,200 +808,6 @@ def test_parse_chat_output_preamble_then_final(self) -> None:
         assert final_content == "Let me look that up.\nThe answer is 42."
 
 
-class TestParseOutputMessage:
-    """Tests for parse_output_message function."""
-
-    def test_commentary_with_no_recipient_creates_message(self):
-        """Test that commentary with recipient=None (preambles) creates message items.
-
-        Per Harmony format, preambles are intended to be shown to end-users,
-        unlike analysis channel content which is hidden reasoning.
-        See: https://cookbook.openai.com/articles/openai-harmony
-        """
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "I will now search for the weather information."
-        )
-        message = message.with_channel("commentary")
-        # recipient is None by default, representing a preamble
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseOutputMessage)
-        assert output_items[0].type == "message"
-        assert output_items[0].role == "assistant"
-        assert output_items[0].status == "completed"
-        assert len(output_items[0].content) == 1
-        assert output_items[0].content[0].type == "output_text"
-        assert (
-            output_items[0].content[0].text
-            == "I will now search for the weather information."
-        )
-
-    def test_commentary_with_function_recipient_creates_function_call(self):
-        """Test commentary with recipient='functions.X' creates function calls."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("functions.get_weather")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseFunctionToolCall)
-        assert output_items[0].type == "function_call"
-        assert output_items[0].name == "get_weather"
-        assert (
-            output_items[0].arguments
-            == '{"location": "San Francisco", "units": "celsius"}'
-        )
-        assert output_items[0].call_id.startswith("call_")
-        assert output_items[0].id.startswith("fc_")
-
-    def test_commentary_with_python_recipient_creates_reasoning(self):
-        """Test that commentary with recipient='python' creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("python")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert (
-            output_items[0].content[0].text
-            == "import numpy as np\nprint(np.array([1, 2, 3]))"
-        )
-
-    def test_commentary_with_browser_recipient_creates_reasoning(self):
-        """Test that commentary with recipient='browser' creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "Navigating to the specified URL"
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("browser")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert output_items[0].content[0].text == "Navigating to the specified URL"
-
-    def test_commentary_with_container_recipient_creates_reasoning(self):
-        """Test that commentary with recipient='container' creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "Running command in container"
-        )
-        message = message.with_channel("commentary")
-        message = message.with_recipient("container")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert output_items[0].content[0].text == "Running command in container"
-
-    def test_commentary_with_empty_content_and_no_recipient(self):
-        """Test edge case: empty commentary with recipient=None."""
-        message = Message.from_role_and_content(Role.ASSISTANT, "")
-        message = message.with_channel("commentary")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseOutputMessage)
-        assert output_items[0].content[0].text == ""
-
-    def test_commentary_with_multiple_contents_and_no_recipient(self):
-        """Test multiple content items in commentary with no recipient."""
-        contents = [
-            TextContent(text="Step 1: Analyze the request"),
-            TextContent(text="Step 2: Prepare to call functions"),
-        ]
-        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
-        message = message.with_channel("commentary")
-
-        output_items = parse_output_message(message)
-
-        # _parse_final_message returns single ResponseOutputMessage with
-        # multiple contents
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseOutputMessage)
-        assert len(output_items[0].content) == 2
-        assert output_items[0].content[0].text == "Step 1: Analyze the request"
-        assert output_items[0].content[1].text == "Step 2: Prepare to call functions"
-
-    def test_commentary_with_multiple_function_calls(self):
-        """Test multiple function calls in commentary channel."""
-        contents = [
-            TextContent(text='{"location": "San Francisco"}'),
-            TextContent(text='{"location": "New York"}'),
-        ]
-        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
-        message = message.with_channel("commentary")
-        message = message.with_recipient("functions.get_weather")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 2
-        assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
-        assert output_items[0].name == "get_weather"
-        assert output_items[1].name == "get_weather"
-        assert output_items[0].arguments == '{"location": "San Francisco"}'
-        assert output_items[1].arguments == '{"location": "New York"}'
-
-    def test_commentary_with_unknown_recipient_creates_mcp_call(self):
-        """Test that commentary with unknown recipient creates MCP call."""
-        message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
-        message = message.with_channel("commentary")
-        message = message.with_recipient("custom_tool")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], McpCall)
-        assert output_items[0].type == "mcp_call"
-        assert output_items[0].name == "custom_tool"
-        assert output_items[0].server_label == "custom_tool"
-
-    def test_analysis_channel_creates_reasoning(self):
-        """Test that analysis channel creates reasoning items."""
-        message = Message.from_role_and_content(
-            Role.ASSISTANT, "Analyzing the problem step by step..."
-        )
-        message = message.with_channel("analysis")
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 1
-        assert isinstance(output_items[0], ResponseReasoningItem)
-        assert output_items[0].type == "reasoning"
-        assert (
-            output_items[0].content[0].text == "Analyzing the problem step by step..."
-        )
-
-    def test_non_assistant_message_returns_empty(self):
-        """Test that non-assistant messages return empty list.
-
-        Per the implementation, tool messages to assistant (e.g., search results)
-        are not included in final output to align with OpenAI behavior.
-        """
-        message = Message.from_author_and_content(
-            Author.new(Role.TOOL, "functions.get_weather"),
-            "The weather is sunny, 72°F",
-        )
-
-        output_items = parse_output_message(message)
-
-        assert len(output_items) == 0
-
-
 def test_has_custom_tools() -> None:
     assert not has_custom_tools(set())
     assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"})
@@ -1091,185 +817,6 @@ def test_has_custom_tools() -> None:
     )
 
 
-def test_parse_mcp_call_basic() -> None:
-    """Test that MCP calls are parsed with correct type and server_label."""
-    message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}')
-    message = message.with_recipient("filesystem")
-    message = message.with_channel("commentary")
-
-    output_items = parse_output_message(message)
-
-    assert len(output_items) == 1
-    assert isinstance(output_items[0], McpCall)
-    assert output_items[0].type == "mcp_call"
-    assert output_items[0].name == "filesystem"
-    assert output_items[0].server_label == "filesystem"
-    assert output_items[0].arguments == '{"path": "/tmp"}'
-    assert output_items[0].status == "completed"
-
-
-def test_parse_mcp_call_dotted_recipient() -> None:
-    """Test that dotted recipients extract the tool name correctly."""
-    message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}')
-    message = message.with_recipient("repo_browser.list")
-    message = message.with_channel("commentary")
-
-    output_items = parse_output_message(message)
-
-    assert len(output_items) == 1
-    assert isinstance(output_items[0], McpCall)
-    assert output_items[0].name == "list"
-    assert output_items[0].server_label == "repo_browser"
-
-
-def test_mcp_vs_function_call() -> None:
-    """Test that function calls are not parsed as MCP calls."""
-    func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
-    func_message = func_message.with_recipient("functions.my_tool")
-    func_message = func_message.with_channel("commentary")
-
-    func_items = parse_output_message(func_message)
-
-    assert len(func_items) == 1
-    assert not isinstance(func_items[0], McpCall)
-    assert func_items[0].type == "function_call"
-
-
-def test_mcp_vs_builtin_tools() -> None:
-    """Test that built-in tools (python, container) are not parsed as MCP calls."""
-    # Test python (built-in tool) - should be reasoning, not MCP
-    python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')")
-    python_message = python_message.with_recipient("python")
-    python_message = python_message.with_channel("commentary")
-
-    python_items = parse_output_message(python_message)
-
-    assert len(python_items) == 1
-    assert not isinstance(python_items[0], McpCall)
-    assert python_items[0].type == "reasoning"
-
-
-def test_parse_remaining_state_commentary_channel() -> None:
-    """Test parse_remaining_state with commentary channel and various recipients."""
-    from unittest.mock import Mock
-
-    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
-
-    # Test 1: functions.* recipient → should return function tool call
-    parser_func = Mock()
-    parser_func.current_content = '{"arg": "value"}'
-    parser_func.current_role = Role.ASSISTANT
-    parser_func.current_channel = "commentary"
-    parser_func.current_recipient = "functions.my_tool"
-
-    func_items = parse_remaining_state(parser_func)
-
-    assert len(func_items) == 1
-    assert not isinstance(func_items[0], McpCall)
-    assert func_items[0].type == "function_call"
-    assert func_items[0].name == "my_tool"
-    assert func_items[0].status == "in_progress"
-
-    # Test 2: MCP tool (not builtin) → should return MCP call
-    parser_mcp = Mock()
-    parser_mcp.current_content = '{"path": "/tmp"}'
-    parser_mcp.current_role = Role.ASSISTANT
-    parser_mcp.current_channel = "commentary"
-    parser_mcp.current_recipient = "filesystem"
-
-    mcp_items = parse_remaining_state(parser_mcp)
-
-    assert len(mcp_items) == 1
-    assert isinstance(mcp_items[0], McpCall)
-    assert mcp_items[0].type == "mcp_call"
-    assert mcp_items[0].name == "filesystem"
-    assert mcp_items[0].server_label == "filesystem"
-    assert mcp_items[0].status == "in_progress"
-
-    # Test 3: Built-in tool (python)
-    # should NOT return MCP call, returns reasoning (internal tool interaction)
-    parser_builtin = Mock()
-    parser_builtin.current_content = "print('hello')"
-    parser_builtin.current_role = Role.ASSISTANT
-    parser_builtin.current_channel = "commentary"
-    parser_builtin.current_recipient = "python"
-
-    builtin_items = parse_remaining_state(parser_builtin)
-
-    # Built-in tools explicitly return reasoning
-    assert len(builtin_items) == 1
-    assert not isinstance(builtin_items[0], McpCall)
-    assert builtin_items[0].type == "reasoning"
-
-    # Test 4: No recipient (preamble) → should return message, not reasoning
-    parser_preamble = Mock()
-    parser_preamble.current_content = "I'll search for that information now."
-    parser_preamble.current_role = Role.ASSISTANT
-    parser_preamble.current_channel = "commentary"
-    parser_preamble.current_recipient = None
-
-    preamble_items = parse_remaining_state(parser_preamble)
-
-    assert len(preamble_items) == 1
-    assert isinstance(preamble_items[0], ResponseOutputMessage)
-    assert preamble_items[0].type == "message"
-    assert preamble_items[0].content[0].text == "I'll search for that information now."
-    assert preamble_items[0].status == "incomplete"  # streaming
-
-
-def test_parse_remaining_state_analysis_channel() -> None:
-    """Test parse_remaining_state with analysis channel and various recipients."""
-    from unittest.mock import Mock
-
-    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
-
-    # Test 1: functions.* recipient → should return function tool call
-    parser_func = Mock()
-    parser_func.current_content = '{"arg": "value"}'
-    parser_func.current_role = Role.ASSISTANT
-    parser_func.current_channel = "analysis"
-    parser_func.current_recipient = "functions.my_tool"
-
-    func_items = parse_remaining_state(parser_func)
-
-    assert len(func_items) == 1
-    assert not isinstance(func_items[0], McpCall)
-    assert func_items[0].type == "function_call"
-    assert func_items[0].name == "my_tool"
-    assert func_items[0].status == "in_progress"
-
-    # Test 2: MCP tool (not builtin) → should return MCP call
-    parser_mcp = Mock()
-    parser_mcp.current_content = '{"query": "test"}'
-    parser_mcp.current_role = Role.ASSISTANT
-    parser_mcp.current_channel = "analysis"
-    parser_mcp.current_recipient = "database"
-
-    mcp_items = parse_remaining_state(parser_mcp)
-
-    assert len(mcp_items) == 1
-    assert isinstance(mcp_items[0], McpCall)
-    assert mcp_items[0].type == "mcp_call"
-    assert mcp_items[0].name == "database"
-    assert mcp_items[0].server_label == "database"
-    assert mcp_items[0].status == "in_progress"
-
-    # Test 3: Built-in tool (container)
-    # should NOT return MCP call, falls through to reasoning
-    parser_builtin = Mock()
-    parser_builtin.current_content = "docker run"
-    parser_builtin.current_role = Role.ASSISTANT
-    parser_builtin.current_channel = "analysis"
-    parser_builtin.current_recipient = "container"
-
-    builtin_items = parse_remaining_state(parser_builtin)
-
-    # Should fall through to reasoning logic
-    assert len(builtin_items) == 1
-    assert not isinstance(builtin_items[0], McpCall)
-    assert builtin_items[0].type == "reasoning"
-
-
 class TestGetSystemMessage:
     """Tests for get_system_message channel configuration."""
 
diff --git a/tests/entrypoints/openai/responses/test_harmony_utils.py b/tests/entrypoints/openai/responses/test_harmony_utils.py
new file mode 100644
index 000000000000..e51538298ff9
--- /dev/null
+++ b/tests/entrypoints/openai/responses/test_harmony_utils.py
@@ -0,0 +1,463 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for vllm.entrypoints.openai.responses.harmony."""
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputMessage,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_output_item import McpCall
+from openai_harmony import Author, Message, Role, TextContent
+
+from vllm.entrypoints.openai.responses.harmony import (
+    harmony_to_response_output,
+    parser_state_to_response_output,
+    response_previous_input_to_harmony,
+)
+
+
+class TestResponsePreviousInputToHarmony:
+    """
+    Tests for scenarios that are specific to the Responses API
+    response_previous_input_to_harmony function.
+    """
+
+    def test_message_with_empty_content(self):
+        """Test parsing message with empty string content."""
+        chat_msg = {
+            "role": "user",
+            "content": "",
+        }
+
+        messages = response_previous_input_to_harmony(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].content[0].text == ""
+
+    def test_tool_message_with_string_content(self):
+        """Test parsing tool message with string content."""
+        chat_msg = {
+            "role": "tool",
+            "name": "get_weather",
+            "content": "The weather in San Francisco is sunny, 72°F",
+        }
+
+        messages = response_previous_input_to_harmony(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.TOOL
+        assert messages[0].author.name == "functions.get_weather"
+        assert (
+            messages[0].content[0].text == "The weather in San Francisco is sunny, 72°F"
+        )
+        assert messages[0].channel == "commentary"
+
+    def test_tool_message_with_array_content(self):
+        """Test parsing tool message with array content."""
+        chat_msg = {
+            "role": "tool",
+            "name": "search_results",
+            "content": [
+                {"type": "text", "text": "Result 1: "},
+                {"type": "text", "text": "Result 2: "},
+                {
+                    "type": "image",
+                    "url": "http://example.com/img.png",
+                },  # Should be ignored
+                {"type": "text", "text": "Result 3"},
+            ],
+        }
+
+        messages = response_previous_input_to_harmony(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.TOOL
+        assert messages[0].author.name == "functions.search_results"
+        assert messages[0].content[0].text == "Result 1: Result 2: Result 3"
+
+    def test_tool_message_with_empty_content(self):
+        """Test parsing tool message with None content."""
+        chat_msg = {
+            "role": "tool",
+            "name": "empty_tool",
+            "content": None,
+        }
+
+        messages = response_previous_input_to_harmony(chat_msg)
+
+        assert len(messages) == 1
+        assert messages[0].author.role == Role.TOOL
+        assert messages[0].author.name == "functions.empty_tool"
+        assert messages[0].content[0].text == ""
+
+
+class TestHarmonyToResponseOutput:
+    """Tests for harmony_to_response_output function."""
+
+    def test_commentary_with_no_recipient_creates_message(self):
+        """Test that commentary with recipient=None (preambles) creates message items.
+
+        Per Harmony format, preambles are intended to be shown to end-users,
+        unlike analysis channel content which is hidden reasoning.
+        See: https://cookbook.openai.com/articles/openai-harmony
+        """
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "I will now search for the weather information."
+        )
+        message = message.with_channel("commentary")
+        # recipient is None by default, representing a preamble
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseOutputMessage)
+        assert output_items[0].type == "message"
+        assert output_items[0].role == "assistant"
+        assert output_items[0].status == "completed"
+        assert len(output_items[0].content) == 1
+        assert output_items[0].content[0].type == "output_text"
+        assert (
+            output_items[0].content[0].text
+            == "I will now search for the weather information."
+        )
+
+    def test_commentary_with_function_recipient_creates_function_call(self):
+        """Test commentary with recipient='functions.X' creates function calls."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseFunctionToolCall)
+        assert output_items[0].type == "function_call"
+        assert output_items[0].name == "get_weather"
+        assert (
+            output_items[0].arguments
+            == '{"location": "San Francisco", "units": "celsius"}'
+        )
+        assert output_items[0].call_id.startswith("call_")
+        assert output_items[0].id.startswith("fc_")
+
+    def test_commentary_with_python_recipient_creates_reasoning(self):
+        """Test that commentary with recipient='python' creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("python")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert (
+            output_items[0].content[0].text
+            == "import numpy as np\nprint(np.array([1, 2, 3]))"
+        )
+
+    def test_commentary_with_browser_recipient_creates_reasoning(self):
+        """Test that commentary with recipient='browser' creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "Navigating to the specified URL"
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("browser")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert output_items[0].content[0].text == "Navigating to the specified URL"
+
+    def test_commentary_with_container_recipient_creates_reasoning(self):
+        """Test that commentary with recipient='container' creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "Running command in container"
+        )
+        message = message.with_channel("commentary")
+        message = message.with_recipient("container")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert output_items[0].content[0].text == "Running command in container"
+
+    def test_commentary_with_empty_content_and_no_recipient(self):
+        """Test edge case: empty commentary with recipient=None."""
+        message = Message.from_role_and_content(Role.ASSISTANT, "")
+        message = message.with_channel("commentary")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseOutputMessage)
+        assert output_items[0].content[0].text == ""
+
+    def test_commentary_with_multiple_contents_and_no_recipient(self):
+        """Test multiple content items in commentary with no recipient."""
+        contents = [
+            TextContent(text="Step 1: Analyze the request"),
+            TextContent(text="Step 2: Prepare to call functions"),
+        ]
+        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
+        message = message.with_channel("commentary")
+
+        output_items = harmony_to_response_output(message)
+
+        # _parse_final_message returns single ResponseOutputMessage with
+        # multiple contents
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseOutputMessage)
+        assert len(output_items[0].content) == 2
+        assert output_items[0].content[0].text == "Step 1: Analyze the request"
+        assert output_items[0].content[1].text == "Step 2: Prepare to call functions"
+
+    def test_commentary_with_multiple_function_calls(self):
+        """Test multiple function calls in commentary channel."""
+        contents = [
+            TextContent(text='{"location": "San Francisco"}'),
+            TextContent(text='{"location": "New York"}'),
+        ]
+        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
+        message = message.with_channel("commentary")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 2
+        assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
+        assert output_items[0].name == "get_weather"
+        assert output_items[1].name == "get_weather"
+        assert output_items[0].arguments == '{"location": "San Francisco"}'
+        assert output_items[1].arguments == '{"location": "New York"}'
+
+    def test_commentary_with_unknown_recipient_creates_mcp_call(self):
+        """Test that commentary with unknown recipient creates MCP call."""
+        message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
+        message = message.with_channel("commentary")
+        message = message.with_recipient("custom_tool")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], McpCall)
+        assert output_items[0].type == "mcp_call"
+        assert output_items[0].name == "custom_tool"
+        assert output_items[0].server_label == "custom_tool"
+
+    def test_analysis_channel_creates_reasoning(self):
+        """Test that analysis channel creates reasoning items."""
+        message = Message.from_role_and_content(
+            Role.ASSISTANT, "Analyzing the problem step by step..."
+        )
+        message = message.with_channel("analysis")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseReasoningItem)
+        assert output_items[0].type == "reasoning"
+        assert (
+            output_items[0].content[0].text == "Analyzing the problem step by step..."
+        )
+
+    def test_non_assistant_message_returns_empty(self):
+        """Test that non-assistant messages return empty list.
+
+        Per the implementation, tool messages to assistant (e.g., search results)
+        are not included in final output to align with OpenAI behavior.
+        """
+        message = Message.from_author_and_content(
+            Author.new(Role.TOOL, "functions.get_weather"),
+            "The weather is sunny, 72°F",
+        )
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 0
+
+
+def test_parse_mcp_call_basic() -> None:
+    """Test that MCP calls are parsed with correct type and server_label."""
+    message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}')
+    message = message.with_recipient("filesystem")
+    message = message.with_channel("commentary")
+
+    output_items = harmony_to_response_output(message)
+
+    assert len(output_items) == 1
+    assert isinstance(output_items[0], McpCall)
+    assert output_items[0].type == "mcp_call"
+    assert output_items[0].name == "filesystem"
+    assert output_items[0].server_label == "filesystem"
+    assert output_items[0].arguments == '{"path": "/tmp"}'
+    assert output_items[0].status == "completed"
+
+
+def test_parse_mcp_call_dotted_recipient() -> None:
+    """Test that dotted recipients extract the tool name correctly."""
+    message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}')
+    message = message.with_recipient("repo_browser.list")
+    message = message.with_channel("commentary")
+
+    output_items = harmony_to_response_output(message)
+
+    assert len(output_items) == 1
+    assert isinstance(output_items[0], McpCall)
+    assert output_items[0].name == "list"
+    assert output_items[0].server_label == "repo_browser"
+
+
+def test_mcp_vs_function_call() -> None:
+    """Test that function calls are not parsed as MCP calls."""
+    func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
+    func_message = func_message.with_recipient("functions.my_tool")
+    func_message = func_message.with_channel("commentary")
+
+    func_items = harmony_to_response_output(func_message)
+
+    assert len(func_items) == 1
+    assert not isinstance(func_items[0], McpCall)
+    assert func_items[0].type == "function_call"
+
+
+def test_mcp_vs_builtin_tools() -> None:
+    """Test that built-in tools (python, container) are not parsed as MCP calls."""
+    # Test python (built-in tool) - should be reasoning, not MCP
+    python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')")
+    python_message = python_message.with_recipient("python")
+    python_message = python_message.with_channel("commentary")
+
+    python_items = harmony_to_response_output(python_message)
+
+    assert len(python_items) == 1
+    assert not isinstance(python_items[0], McpCall)
+    assert python_items[0].type == "reasoning"
+
+
+def test_parser_state_to_response_output_commentary_channel() -> None:
+    """Test parser_state_to_response_output with commentary
+    channel and various recipients."""
+    from unittest.mock import Mock
+
+    # Test 1: functions.* recipient -> should return function tool call
+    parser_func = Mock()
+    parser_func.current_content = '{"arg": "value"}'
+    parser_func.current_role = Role.ASSISTANT
+    parser_func.current_channel = "commentary"
+    parser_func.current_recipient = "functions.my_tool"
+
+    func_items = parser_state_to_response_output(parser_func)
+
+    assert len(func_items) == 1
+    assert not isinstance(func_items[0], McpCall)
+    assert func_items[0].type == "function_call"
+    assert func_items[0].name == "my_tool"
+    assert func_items[0].status == "in_progress"
+
+    # Test 2: MCP tool (not builtin) -> should return MCP call
+    parser_mcp = Mock()
+    parser_mcp.current_content = '{"path": "/tmp"}'
+    parser_mcp.current_role = Role.ASSISTANT
+    parser_mcp.current_channel = "commentary"
+    parser_mcp.current_recipient = "filesystem"
+
+    mcp_items = parser_state_to_response_output(parser_mcp)
+
+    assert len(mcp_items) == 1
+    assert isinstance(mcp_items[0], McpCall)
+    assert mcp_items[0].type == "mcp_call"
+    assert mcp_items[0].name == "filesystem"
+    assert mcp_items[0].server_label == "filesystem"
+    assert mcp_items[0].status == "in_progress"
+
+    # Test 3: Built-in tool (python)
+    # should NOT return MCP call, returns reasoning (internal tool interaction)
+    parser_builtin = Mock()
+    parser_builtin.current_content = "print('hello')"
+    parser_builtin.current_role = Role.ASSISTANT
+    parser_builtin.current_channel = "commentary"
+    parser_builtin.current_recipient = "python"
+
+    builtin_items = parser_state_to_response_output(parser_builtin)
+
+    # Built-in tools explicitly return reasoning
+    assert len(builtin_items) == 1
+    assert not isinstance(builtin_items[0], McpCall)
+    assert builtin_items[0].type == "reasoning"
+
+    # Test 4: No recipient (preamble) → should return message, not reasoning
+    parser_preamble = Mock()
+    parser_preamble.current_content = "I'll search for that information now."
+    parser_preamble.current_role = Role.ASSISTANT
+    parser_preamble.current_channel = "commentary"
+    parser_preamble.current_recipient = None
+
+    preamble_items = parser_state_to_response_output(parser_preamble)
+
+    assert len(preamble_items) == 1
+    assert isinstance(preamble_items[0], ResponseOutputMessage)
+    assert preamble_items[0].type == "message"
+    assert preamble_items[0].content[0].text == "I'll search for that information now."
+    assert preamble_items[0].status == "incomplete"  # streaming
+
+
+def test_parser_state_to_response_output_analysis_channel() -> None:
+    """Test parser_state_to_response_output with analysis
+    channel and various recipients."""
+    from unittest.mock import Mock
+
+    # Test 1: functions.* recipient -> should return function tool call
+    parser_func = Mock()
+    parser_func.current_content = '{"arg": "value"}'
+    parser_func.current_role = Role.ASSISTANT
+    parser_func.current_channel = "analysis"
+    parser_func.current_recipient = "functions.my_tool"
+
+    func_items = parser_state_to_response_output(parser_func)
+
+    assert len(func_items) == 1
+    assert not isinstance(func_items[0], McpCall)
+    assert func_items[0].type == "function_call"
+    assert func_items[0].name == "my_tool"
+    assert func_items[0].status == "in_progress"
+
+    # Test 2: MCP tool (not builtin) -> should return MCP call
+    parser_mcp = Mock()
+    parser_mcp.current_content = '{"query": "test"}'
+    parser_mcp.current_role = Role.ASSISTANT
+    parser_mcp.current_channel = "analysis"
+    parser_mcp.current_recipient = "database"
+
+    mcp_items = parser_state_to_response_output(parser_mcp)
+
+    assert len(mcp_items) == 1
+    assert isinstance(mcp_items[0], McpCall)
+    assert mcp_items[0].type == "mcp_call"
+    assert mcp_items[0].name == "database"
+    assert mcp_items[0].server_label == "database"
+    assert mcp_items[0].status == "in_progress"
+
+    # Test 3: Built-in tool (container)
+    # should NOT return MCP call, falls through to reasoning
+    parser_builtin = Mock()
+    parser_builtin.current_content = "docker run"
+    parser_builtin.current_role = Role.ASSISTANT
+    parser_builtin.current_channel = "analysis"
+    parser_builtin.current_recipient = "container"
+
+    builtin_items = parser_state_to_response_output(parser_builtin)
+
+    # Should fall through to reasoning logic
+    assert len(builtin_items) == 1
+    assert not isinstance(builtin_items[0], McpCall)
+    assert builtin_items[0].type == "reasoning"
diff --git a/tests/entrypoints/openai/responses/test_mcp_tools.py b/tests/entrypoints/openai/responses/test_mcp_tools.py
index 310af4308c63..55445f1889b8 100644
--- a/tests/entrypoints/openai/responses/test_mcp_tools.py
+++ b/tests/entrypoints/openai/responses/test_mcp_tools.py
@@ -97,16 +97,16 @@ def test_get_tool_description(self):
         assert server.get_tool_description("test_server", allowed_tools=[]) is None
 
     def test_builtin_tools_consistency(self):
-        """MCP_BUILTIN_TOOLS must match _BUILTIN_TOOL_TO_MCP_SERVER_LABEL values."""
+        """MCP_BUILTIN_TOOLS must match BUILTIN_TOOL_TO_MCP_SERVER_LABEL values."""
         from vllm.entrypoints.openai.parser.harmony_utils import (
-            _BUILTIN_TOOL_TO_MCP_SERVER_LABEL,
+            BUILTIN_TOOL_TO_MCP_SERVER_LABEL,
             MCP_BUILTIN_TOOLS,
         )
 
-        assert set(_BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()) == MCP_BUILTIN_TOOLS, (
+        assert set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()) == MCP_BUILTIN_TOOLS, (
             f"MCP_BUILTIN_TOOLS {MCP_BUILTIN_TOOLS} does not match "
-            f"_BUILTIN_TOOL_TO_MCP_SERVER_LABEL values "
-            f"{set(_BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())}"
+            f"BUILTIN_TOOL_TO_MCP_SERVER_LABEL values "
+            f"{set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())}"
         )
 
 
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 9dfd5f518f77..9b4264456c51 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -2,27 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
-import json
 from collections.abc import Iterable, Sequence
 from typing import Literal
 
-from openai.types.responses import (
-    ResponseFunctionToolCall,
-    ResponseOutputItem,
-    ResponseOutputMessage,
-    ResponseOutputText,
-    ResponseReasoningItem,
-)
-from openai.types.responses.response_function_web_search import (
-    ActionFind,
-    ActionOpenPage,
-    ActionSearch,
-    ResponseFunctionWebSearch,
-)
-from openai.types.responses.response_output_item import McpCall
-from openai.types.responses.response_reasoning_item import (
-    Content as ResponseReasoningTextContent,
-)
 from openai.types.responses.tool import Tool
 from openai_harmony import (
     Author,
@@ -38,17 +20,10 @@
     ToolDescription,
     load_harmony_encoding,
 )
-from openai_harmony import Message as OpenAIHarmonyMessage
-from openai_harmony import Role as OpenAIHarmonyRole
 
 from vllm import envs
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
-from vllm.entrypoints.openai.responses.protocol import (
-    ResponseInputOutputItem,
-    ResponsesRequest,
-)
 from vllm.logger import init_logger
-from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
 
@@ -64,14 +39,14 @@
 # they are available and requested by the user.
 # Tool args are provided by MCP tool descriptions. Output
 # of the tools are stringified.
-_BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = {
+BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = {
     "python": "code_interpreter",
     "browser": "web_search_preview",
     "container": "container",
 }
 
 # Derive MCP_BUILTIN_TOOLS from the canonical mapping
-MCP_BUILTIN_TOOLS: set[str] = set(_BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())
+MCP_BUILTIN_TOOLS: set[str] = set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())
 
 
 def has_custom_tools(tool_types: set[str]) -> bool:
@@ -179,55 +154,6 @@ def get_user_message(content: str) -> Message:
     return Message.from_role_and_content(Role.USER, content)
 
 
-def parse_response_input(
-    response_msg: ResponseInputOutputItem,
-    prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
-) -> Message:
-    if not isinstance(response_msg, dict):
-        response_msg = response_msg.model_dump()
-    if "type" not in response_msg or response_msg["type"] == "message":
-        role = response_msg["role"]
-        content = response_msg["content"]
-        # Add prefix for developer messages.
-        # <|start|>developer<|message|># Instructions {instructions}<|end|>
-        text_prefix = "Instructions:\n" if role == "developer" else ""
-        if isinstance(content, str):
-            msg = Message.from_role_and_content(role, text_prefix + content)
-        else:
-            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
-            msg = Message.from_role_and_contents(role, contents)
-        if role == "assistant":
-            msg = msg.with_channel("final")
-    elif response_msg["type"] == "function_call_output":
-        call_id = response_msg["call_id"]
-        call_response: ResponseFunctionToolCall | None = None
-        for prev_response in reversed(prev_responses):
-            if (
-                isinstance(prev_response, ResponseFunctionToolCall)
-                and prev_response.call_id == call_id
-            ):
-                call_response = prev_response
-                break
-        if call_response is None:
-            raise ValueError(f"No call message found for {call_id}")
-        msg = Message.from_author_and_content(
-            Author.new(Role.TOOL, f"functions.{call_response.name}"),
-            response_msg["output"],
-        )
-    elif response_msg["type"] == "reasoning":
-        content = response_msg["content"]
-        assert len(content) == 1
-        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
-    elif response_msg["type"] == "function_call":
-        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
-        msg = msg.with_channel("commentary")
-        msg = msg.with_recipient(f"functions.{response_msg['name']}")
-        msg = msg.with_content_type("json")
-    else:
-        raise ValueError(f"Unknown input type: {response_msg['type']}")
-    return msg
-
-
 def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]:
     """
     Parse a list of messages from request.messages in the Chat Completion API to
@@ -390,139 +316,6 @@ def parse_chat_input_to_harmony_message(
     return msgs
 
 
-def parse_input_to_harmony_message(chat_msg) -> list[Message]:
-    """Parse a message from request.previous_input_messages
-    into Harmony messages.
-
-    Supports both OpenAI chat format ({"role": "..."}) and
-    Harmony format ({"author": {"role": "..."}}).
-    """
-    if not isinstance(chat_msg, dict):
-        chat_msg = chat_msg.model_dump(exclude_none=True)
-
-    if "author" in chat_msg and isinstance(chat_msg.get("author"), dict):
-        return [_parse_harmony_format_message(chat_msg)]
-
-    return _parse_chat_format_message(chat_msg)
-
-
-def _parse_harmony_format_message(chat_msg: dict) -> Message:
-    """Reconstruct a Message from Harmony-format dict,
-    preserving channel, recipient, and content_type."""
-    author_dict = chat_msg["author"]
-    role = author_dict.get("role")
-    name = author_dict.get("name")
-
-    raw_content = chat_msg.get("content", "")
-    if isinstance(raw_content, list):
-        # TODO: Support refusal and non-text content types.
-        contents = [TextContent(text=c.get("text", "")) for c in raw_content]
-    elif isinstance(raw_content, str):
-        contents = [TextContent(text=raw_content)]
-    else:
-        contents = [TextContent(text="")]
-
-    if name:
-        msg = Message.from_author_and_contents(Author.new(Role(role), name), contents)
-    else:
-        msg = Message.from_role_and_contents(Role(role), contents)
-
-    channel = chat_msg.get("channel")
-    if channel:
-        msg = msg.with_channel(channel)
-    recipient = chat_msg.get("recipient")
-    if recipient:
-        msg = msg.with_recipient(recipient)
-    content_type = chat_msg.get("content_type")
-    if content_type:
-        msg = msg.with_content_type(content_type)
-
-    return msg
-
-
-def _parse_chat_format_message(chat_msg: dict) -> list[Message]:
-    """Parse an OpenAI chat-format dict into Harmony messages."""
-    role = chat_msg.get("role")
-    if role is None:
-        raise ValueError(f"Message has no 'role' key: {chat_msg}")
-
-    # Assistant message with tool calls
-    tool_calls = chat_msg.get("tool_calls")
-    if role == "assistant" and tool_calls:
-        msgs: list[Message] = []
-        for call in tool_calls:
-            func = call.get("function", {})
-            name = func.get("name", "")
-            arguments = func.get("arguments", "") or ""
-            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
-            msg = msg.with_channel("commentary")
-            msg = msg.with_recipient(f"functions.{name}")
-            msg = msg.with_content_type("json")
-            msgs.append(msg)
-        return msgs
-
-    # Tool role message (tool output)
-    if role == "tool":
-        name = chat_msg.get("name", "")
-        if name and not name.startswith("functions."):
-            name = f"functions.{name}"
-        content = chat_msg.get("content", "") or ""
-        content = flatten_chat_text_content(content)
-        # NOTE: .with_recipient("assistant") is required on tool messages
-        # to match parse_chat_input_to_harmony_message behavior and ensure
-        # proper routing in the Harmony protocol.
-        msg = (
-            Message.from_author_and_content(Author.new(Role.TOOL, name), content)
-            .with_channel("commentary")
-            .with_recipient("assistant")
-        )
-        return [msg]
-
-    # Default: user/assistant/system messages
-    content = chat_msg.get("content", "")
-    if isinstance(content, str):
-        contents = [TextContent(text=content)]
-    else:
-        # TODO: Support refusal.
-        contents = [TextContent(text=c.get("text", "")) for c in content]
-    msg = Message.from_role_and_contents(role, contents)
-    return [msg]
-
-
-def construct_harmony_previous_input_messages(
-    request: ResponsesRequest,
-) -> list[OpenAIHarmonyMessage]:
-    messages: list[OpenAIHarmonyMessage] = []
-    if request.previous_input_messages:
-        for message in request.previous_input_messages:
-            # Handle both OpenAIHarmonyMessage objects and dictionary inputs
-            if isinstance(message, OpenAIHarmonyMessage):
-                message_role = message.author.role
-                # To match OpenAI, instructions, reasoning and tools are
-                # always taken from the most recent Responses API request
-                # not carried over from previous requests
-                if (
-                    message_role == OpenAIHarmonyRole.SYSTEM
-                    or message_role == OpenAIHarmonyRole.DEVELOPER
-                ):
-                    continue
-                messages.append(message)
-            else:
-                harmony_messages = parse_input_to_harmony_message(message)
-                for harmony_msg in harmony_messages:
-                    message_role = harmony_msg.author.role
-                    # To match OpenAI, instructions, reasoning and tools are
-                    # always taken from the most recent Responses API request
-                    # not carried over from previous requests
-                    if (
-                        message_role == OpenAIHarmonyRole.SYSTEM
-                        or message_role == OpenAIHarmonyRole.DEVELOPER
-                    ):
-                        continue
-                    messages.append(harmony_msg)
-    return messages
-
-
 def render_for_completion(messages: list[Message]) -> list[int]:
     conversation = Conversation.from_messages(messages)
     token_ids = get_encoding().render_conversation_for_completion(
@@ -531,313 +324,6 @@ def render_for_completion(messages: list[Message]) -> list[int]:
     return token_ids
 
 
-def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
-    """Parse browser tool calls (search, open, find) into web search items."""
-    if len(message.content) != 1:
-        raise ValueError("Invalid number of contents in browser message")
-    content = message.content[0]
-
-    # Parse JSON args (with retry detection)
-    try:
-        browser_call = json.loads(content.text)
-    except json.JSONDecodeError:
-        logger.warning(
-            "Invalid JSON in browser tool call, using error placeholder: %s",
-            content.text,
-        )
-        json_retry_output_message = (
-            f"Invalid JSON args, caught and retried: {content.text}"
-        )
-        browser_call = {
-            "query": json_retry_output_message,
-            "url": json_retry_output_message,
-            "pattern": json_retry_output_message,
-        }
-
-    # Create appropriate action based on recipient
-    if recipient == "browser.search":
-        action = ActionSearch(
-            query=f"cursor:{browser_call.get('query', '')}", type="search"
-        )
-    elif recipient == "browser.open":
-        action = ActionOpenPage(
-            url=f"cursor:{browser_call.get('url', '')}", type="open_page"
-        )
-    elif recipient == "browser.find":
-        action = ActionFind(
-            pattern=browser_call.get("pattern", ""),
-            url=f"cursor:{browser_call.get('url', '')}",
-            type="find",
-        )
-    else:
-        raise ValueError(f"Unknown browser action: {recipient}")
-
-    return ResponseFunctionWebSearch(
-        id=f"ws_{random_uuid()}",
-        action=action,
-        status="completed",
-        type="web_search_call",
-    )
-
-
-def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
-    """Parse function calls into function tool call items."""
-    function_name = recipient.split(".")[-1]
-    output_items = []
-    for content in message.content:
-        random_id = random_uuid()
-        response_item = ResponseFunctionToolCall(
-            arguments=content.text,
-            call_id=f"call_{random_id}",
-            type="function_call",
-            name=function_name,
-            id=f"fc_{random_id}",
-        )
-        output_items.append(response_item)
-    return output_items
-
-
-def _parse_reasoning(message: Message) -> list[ResponseOutputItem]:
-    """Parse reasoning/analysis content into reasoning items."""
-    output_items = []
-    for content in message.content:
-        reasoning_item = ResponseReasoningItem(
-            id=f"rs_{random_uuid()}",
-            summary=[],
-            type="reasoning",
-            content=[
-                ResponseReasoningTextContent(text=content.text, type="reasoning_text")
-            ],
-            status=None,
-        )
-        output_items.append(reasoning_item)
-    return output_items
-
-
-def _parse_final_message(message: Message) -> ResponseOutputItem:
-    """Parse final channel messages into output message items."""
-    contents = []
-    for content in message.content:
-        output_text = ResponseOutputText(
-            text=content.text,
-            annotations=[],  # TODO
-            type="output_text",
-            logprobs=None,  # TODO
-        )
-        contents.append(output_text)
-    return ResponseOutputMessage(
-        id=f"msg_{random_uuid()}",
-        content=contents,
-        role=message.author.role,
-        status="completed",
-        type="message",
-    )
-
-
-def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:
-    """
-    Parse MCP recipient into (server_label, tool_name).
-
-    For dotted recipients like "repo_browser.list":
-        - server_label: "repo_browser" (namespace/server)
-        - tool_name: "list" (specific tool)
-
-    For simple recipients like "filesystem":
-        - server_label: "filesystem"
-        - tool_name: "filesystem"
-    """
-    if "." in recipient:
-        server_label = recipient.split(".")[0]
-        tool_name = recipient.split(".")[-1]
-    else:
-        server_label = recipient
-        tool_name = recipient
-    return server_label, tool_name
-
-
-def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
-    """Parse MCP calls into MCP call items."""
-    # Handle built-in tools that need server_label mapping
-    if recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
-        server_label = _BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient]
-        tool_name = recipient
-    else:
-        server_label, tool_name = _parse_mcp_recipient(recipient)
-
-    output_items = []
-    for content in message.content:
-        response_item = McpCall(
-            arguments=content.text,
-            type="mcp_call",
-            name=tool_name,
-            server_label=server_label,
-            id=f"mcp_{random_uuid()}",
-            status="completed",
-        )
-        output_items.append(response_item)
-    return output_items
-
-
-def _parse_message_no_recipient(
-    message: Message,
-) -> list[ResponseOutputItem]:
-    """Parse a Harmony message with no recipient based on its channel."""
-    if message.channel == "analysis":
-        return _parse_reasoning(message)
-
-    if message.channel in ("commentary", "final"):
-        # Per Harmony format, preambles (commentary with no recipient) and
-        # final channel content are both intended to be shown to end-users.
-        # See: https://cookbook.openai.com/articles/openai-harmony
-        return [_parse_final_message(message)]
-
-    raise ValueError(f"Unknown channel: {message.channel}")
-
-
-def parse_output_message(message: Message) -> list[ResponseOutputItem]:
-    """
-    Parse a Harmony message into a list of output response items.
-    """
-    if message.author.role != "assistant":
-        # This is a message from a tool to the assistant (e.g., search result).
-        # Don't include it in the final output for now. This aligns with
-        # OpenAI's behavior on models like o4-mini.
-        return []
-
-    output_items: list[ResponseOutputItem] = []
-    recipient = message.recipient
-
-    if recipient is not None:
-        # Browser tool calls (browser.search, browser.open, browser.find)
-        if recipient.startswith("browser."):
-            output_items.append(_parse_browser_tool_call(message, recipient))
-
-        # Function calls (should only happen on commentary channel)
-        elif message.channel == "commentary" and recipient.startswith("functions."):
-            output_items.extend(_parse_function_call(message, recipient))
-
-        # Built-in MCP tools (python, browser, container)
-        elif recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
-            output_items.extend(_parse_reasoning(message))
-
-        # All other recipients are MCP calls
-        else:
-            output_items.extend(_parse_mcp_call(message, recipient))
-
-    # No recipient - handle based on channel for non-tool messages
-    else:
-        output_items.extend(_parse_message_no_recipient(message))
-
-    return output_items
-
-
-def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
-    if not parser.current_content:
-        return []
-    if parser.current_role != Role.ASSISTANT:
-        return []
-    current_recipient = parser.current_recipient
-    if current_recipient is not None and current_recipient.startswith("browser."):
-        return []
-
-    if current_recipient and parser.current_channel in ("commentary", "analysis"):
-        if current_recipient.startswith("functions."):
-            rid = random_uuid()
-            return [
-                ResponseFunctionToolCall(
-                    arguments=parser.current_content,
-                    call_id=f"call_{rid}",
-                    type="function_call",
-                    name=current_recipient.split(".")[-1],
-                    id=f"fc_{rid}",
-                    status="in_progress",
-                )
-            ]
-        # Built-in MCP tools (python, browser, container)
-        elif current_recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
-            return [
-                ResponseReasoningItem(
-                    id=f"rs_{random_uuid()}",
-                    summary=[],
-                    type="reasoning",
-                    content=[
-                        ResponseReasoningTextContent(
-                            text=parser.current_content, type="reasoning_text"
-                        )
-                    ],
-                    status=None,
-                )
-            ]
-        # All other recipients are MCP calls
-        else:
-            rid = random_uuid()
-            server_label, tool_name = _parse_mcp_recipient(current_recipient)
-            return [
-                McpCall(
-                    arguments=parser.current_content,
-                    type="mcp_call",
-                    name=tool_name,
-                    server_label=server_label,
-                    id=f"mcp_{rid}",
-                    status="in_progress",
-                )
-            ]
-
-    if parser.current_channel == "commentary":
-        # Per Harmony format, preambles (commentary with no recipient) are
-        # intended to be shown to end-users, unlike analysis channel content.
-        output_text = ResponseOutputText(
-            text=parser.current_content,
-            annotations=[],
-            type="output_text",
-            logprobs=None,
-        )
-        return [
-            ResponseOutputMessage(
-                id=f"msg_{random_uuid()}",
-                content=[output_text],
-                role="assistant",
-                status="incomplete",
-                type="message",
-            )
-        ]
-
-    if parser.current_channel == "analysis":
-        return [
-            ResponseReasoningItem(
-                id=f"rs_{random_uuid()}",
-                summary=[],
-                type="reasoning",
-                content=[
-                    ResponseReasoningTextContent(
-                        text=parser.current_content, type="reasoning_text"
-                    )
-                ],
-                status=None,
-            )
-        ]
-
-    if parser.current_channel == "final":
-        output_text = ResponseOutputText(
-            text=parser.current_content,
-            annotations=[],  # TODO
-            type="output_text",
-            logprobs=None,  # TODO
-        )
-        text_item = ResponseOutputMessage(
-            id=f"msg_{random_uuid()}",
-            content=[output_text],
-            role="assistant",
-            # if the parser still has messages (ie if the generator got cut
-            # abruptly), this should be incomplete
-            status="incomplete",
-            type="message",
-        )
-        return [text_item]
-
-    return []
-
-
 def get_stop_tokens_for_assistant_actions() -> list[int]:
     return get_encoding().stop_tokens_for_assistant_actions()
 
diff --git a/vllm/entrypoints/openai/responses/harmony.py b/vllm/entrypoints/openai/responses/harmony.py
new file mode 100644
index 000000000000..460f310926ad
--- /dev/null
+++ b/vllm/entrypoints/openai/responses/harmony.py
@@ -0,0 +1,552 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Harmony ↔ Responses API conversion utilities.
+
+Handles two directions:
+  1. Response Input → Harmony Messages  (input parsing)
+  2. Harmony Messages → Response Output Items  (output parsing)
+"""
+
+import json
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_web_search import (
+    ActionFind,
+    ActionOpenPage,
+    ActionSearch,
+    ResponseFunctionWebSearch,
+)
+from openai.types.responses.response_output_item import McpCall
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai_harmony import Author, Message, Role, StreamableParser, TextContent
+
+from vllm.entrypoints.openai.parser.harmony_utils import (
+    BUILTIN_TOOL_TO_MCP_SERVER_LABEL,
+    flatten_chat_text_content,
+)
+from vllm.entrypoints.openai.responses.protocol import (
+    ResponseInputOutputItem,
+    ResponsesRequest,
+)
+from vllm.logger import init_logger
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)
+
+# ---------------------------------------------------------------------------
+# 1. Private helpers for input parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_harmony_format_message(chat_msg: dict) -> Message:
+    """Reconstruct a Message from Harmony-format dict,
+    preserving channel, recipient, and content_type."""
+    author_dict = chat_msg["author"]
+    role = author_dict.get("role")
+    name = author_dict.get("name")
+
+    raw_content = chat_msg.get("content", "")
+    if isinstance(raw_content, list):
+        # TODO: Support refusal and non-text content types.
+        contents = [TextContent(text=c.get("text", "")) for c in raw_content]
+    elif isinstance(raw_content, str):
+        contents = [TextContent(text=raw_content)]
+    else:
+        contents = [TextContent(text="")]
+
+    if name:
+        msg = Message.from_author_and_contents(Author.new(Role(role), name), contents)
+    else:
+        msg = Message.from_role_and_contents(Role(role), contents)
+
+    channel = chat_msg.get("channel")
+    if channel:
+        msg = msg.with_channel(channel)
+    recipient = chat_msg.get("recipient")
+    if recipient:
+        msg = msg.with_recipient(recipient)
+    content_type = chat_msg.get("content_type")
+    if content_type:
+        msg = msg.with_content_type(content_type)
+
+    return msg
+
+
+def _parse_chat_format_message(chat_msg: dict) -> list[Message]:
+    """Parse an OpenAI chat-format dict into Harmony messages."""
+    role = chat_msg.get("role")
+    if role is None:
+        raise ValueError(f"Message has no 'role' key: {chat_msg}")
+
+    # Assistant message with tool calls
+    tool_calls = chat_msg.get("tool_calls")
+    if role == "assistant" and tool_calls:
+        msgs: list[Message] = []
+        for call in tool_calls:
+            func = call.get("function", {})
+            name = func.get("name", "")
+            arguments = func.get("arguments", "") or ""
+            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
+            msg = msg.with_channel("commentary")
+            msg = msg.with_recipient(f"functions.{name}")
+            msg = msg.with_content_type("json")
+            msgs.append(msg)
+        return msgs
+
+    # Tool role message (tool output)
+    if role == "tool":
+        name = chat_msg.get("name", "")
+        if name and not name.startswith("functions."):
+            name = f"functions.{name}"
+        content = chat_msg.get("content", "") or ""
+        content = flatten_chat_text_content(content)
+        # NOTE: .with_recipient("assistant") is required on tool messages
+        # to match parse_chat_input_to_harmony_message behavior and ensure
+        # proper routing in the Harmony protocol.
+        msg = (
+            Message.from_author_and_content(Author.new(Role.TOOL, name), content)
+            .with_channel("commentary")
+            .with_recipient("assistant")
+        )
+        return [msg]
+
+    # Default: user/assistant/system messages
+    content = chat_msg.get("content", "")
+    if isinstance(content, str):
+        contents = [TextContent(text=content)]
+    else:
+        # TODO: Support refusal.
+        contents = [TextContent(text=c.get("text", "")) for c in content]
+    msg = Message.from_role_and_contents(role, contents)
+    return [msg]
+
+
+# ---------------------------------------------------------------------------
+# 2. Public input parsing functions
+# ---------------------------------------------------------------------------
+
+
+def response_input_to_harmony(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
+) -> Message:
+    """Convert a single ResponseInputOutputItem into a Harmony Message."""
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        # Add prefix for developer messages.
+        # <|start|>developer<|message|># Instructions {instructions}<|end|>
+        text_prefix = "Instructions:\n" if role == "developer" else ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
+            msg = Message.from_role_and_contents(role, contents)
+        if role == "assistant":
+            msg = msg.with_channel("final")
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: ResponseFunctionToolCall | None = None
+        for prev_response in reversed(prev_responses):
+            if (
+                isinstance(prev_response, ResponseFunctionToolCall)
+                and prev_response.call_id == call_id
+            ):
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"],
+        )
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
+def response_previous_input_to_harmony(chat_msg) -> list[Message]:
+    """Parse a message from request.previous_input_messages
+    into Harmony messages.
+
+    Supports both OpenAI chat format ({"role": "..."}) and
+    Harmony format ({"author": {"role": "..."}}).
+    """
+    if not isinstance(chat_msg, dict):
+        chat_msg = chat_msg.model_dump(exclude_none=True)
+
+    if "author" in chat_msg and isinstance(chat_msg.get("author"), dict):
+        return [_parse_harmony_format_message(chat_msg)]
+
+    return _parse_chat_format_message(chat_msg)
+
+
+def construct_harmony_previous_input_messages(
+    request: ResponsesRequest,
+) -> list[Message]:
+    """Build a Harmony message list from request.previous_input_messages.
+
+    Filters out system/developer messages to match OpenAI behavior where
+    instructions are always taken from the most recent Responses API request.
+    """
+    messages: list[Message] = []
+    if request.previous_input_messages:
+        for message in request.previous_input_messages:
+            # Handle both Message objects and dictionary inputs
+            if isinstance(message, Message):
+                message_role = message.author.role
+                if message_role == Role.SYSTEM or message_role == Role.DEVELOPER:
+                    continue
+                messages.append(message)
+            else:
+                harmony_messages = response_previous_input_to_harmony(message)
+                for harmony_msg in harmony_messages:
+                    message_role = harmony_msg.author.role
+                    if message_role == Role.SYSTEM or message_role == Role.DEVELOPER:
+                        continue
+                    messages.append(harmony_msg)
+    return messages
+
+
+# ---------------------------------------------------------------------------
+# 3. Private helpers for output parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
+    """Parse browser tool calls (search, open, find) into web search items."""
+    if len(message.content) != 1:
+        raise ValueError("Invalid number of contents in browser message")
+    content = message.content[0]
+
+    # Parse JSON args (with retry detection)
+    try:
+        browser_call = json.loads(content.text)
+    except json.JSONDecodeError:
+        logger.warning(
+            "Invalid JSON in browser tool call, using error placeholder: %s",
+            content.text,
+        )
+        json_retry_output_message = (
+            f"Invalid JSON args, caught and retried: {content.text}"
+        )
+        browser_call = {
+            "query": json_retry_output_message,
+            "url": json_retry_output_message,
+            "pattern": json_retry_output_message,
+        }
+
+    # Create appropriate action based on recipient
+    if recipient == "browser.search":
+        action = ActionSearch(
+            query=f"cursor:{browser_call.get('query', '')}", type="search"
+        )
+    elif recipient == "browser.open":
+        action = ActionOpenPage(
+            url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+        )
+    elif recipient == "browser.find":
+        action = ActionFind(
+            pattern=browser_call.get("pattern", ""),
+            url=f"cursor:{browser_call.get('url', '')}",
+            type="find",
+        )
+    else:
+        raise ValueError(f"Unknown browser action: {recipient}")
+
+    return ResponseFunctionWebSearch(
+        id=f"ws_{random_uuid()}",
+        action=action,
+        status="completed",
+        type="web_search_call",
+    )
+
+
+def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
+    """Parse function calls into function tool call items."""
+    function_name = recipient.split(".")[-1]
+    output_items = []
+    for content in message.content:
+        random_id = random_uuid()
+        response_item = ResponseFunctionToolCall(
+            arguments=content.text,
+            call_id=f"call_{random_id}",
+            type="function_call",
+            name=function_name,
+            id=f"fc_{random_id}",
+        )
+        output_items.append(response_item)
+    return output_items
+
+
+def _parse_reasoning(message: Message) -> list[ResponseOutputItem]:
+    """Parse reasoning/analysis content into reasoning items."""
+    output_items = []
+    for content in message.content:
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            summary=[],
+            type="reasoning",
+            content=[
+                ResponseReasoningTextContent(text=content.text, type="reasoning_text")
+            ],
+            status=None,
+        )
+        output_items.append(reasoning_item)
+    return output_items
+
+
+def _parse_final_message(message: Message) -> ResponseOutputItem:
+    """Parse final channel messages into output message items."""
+    contents = []
+    for content in message.content:
+        output_text = ResponseOutputText(
+            text=content.text,
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        contents.append(output_text)
+    return ResponseOutputMessage(
+        id=f"msg_{random_uuid()}",
+        content=contents,
+        role=message.author.role,
+        status="completed",
+        type="message",
+    )
+
+
+def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:
+    """Parse MCP recipient into (server_label, tool_name).
+
+    For dotted recipients like "repo_browser.list":
+        - server_label: "repo_browser" (namespace/server)
+        - tool_name: "list" (specific tool)
+
+    For simple recipients like "filesystem":
+        - server_label: "filesystem"
+        - tool_name: "filesystem"
+    """
+    if "." in recipient:
+        server_label = recipient.split(".")[0]
+        tool_name = recipient.split(".")[-1]
+    else:
+        server_label = recipient
+        tool_name = recipient
+    return server_label, tool_name
+
+
+def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
+    """Parse MCP calls into MCP call items."""
+    # Handle built-in tools that need server_label mapping
+    if recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
+        server_label = BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient]
+        tool_name = recipient
+    else:
+        server_label, tool_name = _parse_mcp_recipient(recipient)
+
+    output_items = []
+    for content in message.content:
+        response_item = McpCall(
+            arguments=content.text,
+            type="mcp_call",
+            name=tool_name,
+            server_label=server_label,
+            id=f"mcp_{random_uuid()}",
+            status="completed",
+        )
+        output_items.append(response_item)
+    return output_items
+
+
+def _parse_message_no_recipient(
+    message: Message,
+) -> list[ResponseOutputItem]:
+    """Parse a Harmony message with no recipient based on its channel."""
+    if message.channel == "analysis":
+        return _parse_reasoning(message)
+
+    if message.channel in ("commentary", "final"):
+        # Per Harmony format, preambles (commentary with no recipient) and
+        # final channel content are both intended to be shown to end-users.
+        # See: https://cookbook.openai.com/articles/openai-harmony
+        return [_parse_final_message(message)]
+
+    raise ValueError(f"Unknown channel: {message.channel}")
+
+
+# ---------------------------------------------------------------------------
+# 4. Public output parsing functions
+# ---------------------------------------------------------------------------
+
+
+def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
+    """Parse a Harmony message into a list of output response items.
+
+    This is the main dispatcher that routes based on channel and recipient.
+    """
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+
+    output_items: list[ResponseOutputItem] = []
+    recipient = message.recipient
+
+    if recipient is not None:
+        # Browser tool calls (browser.search, browser.open, browser.find)
+        if recipient.startswith("browser."):
+            output_items.append(_parse_browser_tool_call(message, recipient))
+
+        # Function calls (should only happen on commentary channel)
+        elif message.channel == "commentary" and recipient.startswith("functions."):
+            output_items.extend(_parse_function_call(message, recipient))
+
+        # Built-in MCP tools (python, browser, container)
+        elif recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
+            output_items.extend(_parse_reasoning(message))
+
+        # All other recipients are MCP calls
+        else:
+            output_items.extend(_parse_mcp_call(message, recipient))
+
+    # No recipient - handle based on channel for non-tool messages
+    else:
+        output_items.extend(_parse_message_no_recipient(message))
+
+    return output_items
+
+
+def parser_state_to_response_output(
+    parser: StreamableParser,
+) -> list[ResponseOutputItem]:
+    """Extract in-progress response items from incomplete parser state.
+
+    Called when the parser has buffered content that hasn't formed a
+    complete message yet (e.g., generation was cut short).
+    """
+    if not parser.current_content:
+        return []
+    if parser.current_role != Role.ASSISTANT:
+        return []
+    current_recipient = parser.current_recipient
+    if current_recipient is not None and current_recipient.startswith("browser."):
+        return []
+
+    if current_recipient and parser.current_channel in ("commentary", "analysis"):
+        if current_recipient.startswith("functions."):
+            rid = random_uuid()
+            return [
+                ResponseFunctionToolCall(
+                    arguments=parser.current_content,
+                    call_id=f"call_{rid}",
+                    type="function_call",
+                    name=current_recipient.split(".")[-1],
+                    id=f"fc_{rid}",
+                    status="in_progress",
+                )
+            ]
+        # Built-in MCP tools (python, browser, container)
+        elif current_recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
+            return [
+                ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    summary=[],
+                    type="reasoning",
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=parser.current_content, type="reasoning_text"
+                        )
+                    ],
+                    status=None,
+                )
+            ]
+        # All other recipients are MCP calls
+        else:
+            rid = random_uuid()
+            server_label, tool_name = _parse_mcp_recipient(current_recipient)
+            return [
+                McpCall(
+                    arguments=parser.current_content,
+                    type="mcp_call",
+                    name=tool_name,
+                    server_label=server_label,
+                    id=f"mcp_{rid}",
+                    status="in_progress",
+                )
+            ]
+
+    if parser.current_channel == "commentary":
+        # Per Harmony format, preambles (commentary with no recipient) are
+        # intended to be shown to end-users, unlike analysis channel content.
+        output_text = ResponseOutputText(
+            text=parser.current_content,
+            annotations=[],
+            type="output_text",
+            logprobs=None,
+        )
+        return [
+            ResponseOutputMessage(
+                id=f"msg_{random_uuid()}",
+                content=[output_text],
+                role="assistant",
+                status="incomplete",
+                type="message",
+            )
+        ]
+
+    if parser.current_channel == "analysis":
+        return [
+            ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(
+                        text=parser.current_content, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+        ]
+
+    if parser.current_channel == "final":
+        output_text = ResponseOutputText(
+            text=parser.current_content,
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=[output_text],
+            role="assistant",
+            # if the parser still has messages (ie if the generator got cut
+            # abruptly), this should be incomplete
+            status="incomplete",
+            type="message",
+        )
+        return [text_item]
+
+    return []
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index c0ca87a98521..b9d526e25def 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -58,15 +58,11 @@
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.parser.harmony_utils import (
-    construct_harmony_previous_input_messages,
     get_developer_message,
     get_stop_tokens_for_assistant_actions,
     get_system_message,
     get_user_message,
     has_custom_tools,
-    parse_output_message,
-    parse_remaining_state,
-    parse_response_input,
     render_for_completion,
 )
 from vllm.entrypoints.openai.responses.context import (
@@ -76,6 +72,12 @@
     SimpleContext,
     StreamingHarmonyContext,
 )
+from vllm.entrypoints.openai.responses.harmony import (
+    construct_harmony_previous_input_messages,
+    harmony_to_response_output,
+    parser_state_to_response_output,
+    response_input_to_harmony,
+)
 from vllm.entrypoints.openai.responses.protocol import (
     InputTokensDetails,
     OutputTokensDetails,
@@ -954,9 +956,9 @@ def _make_response_output_items_with_harmony(
         output_items: list[ResponseOutputItem] = []
         num_init_messages = context.num_init_messages
         for msg in context.messages[num_init_messages:]:
-            output_items.extend(parse_output_message(msg))
+            output_items.extend(harmony_to_response_output(msg))
         # Handle the generation stopped in the middle (if any).
-        last_items = parse_remaining_state(context.parser)
+        last_items = parser_state_to_response_output(context.parser)
         if last_items:
             output_items.extend(last_items)
         return output_items
@@ -1103,13 +1105,13 @@ def _construct_input_messages_with_harmony(
             else:
                 prev_outputs = []
             for response_msg in request.input:
-                new_msg = parse_response_input(response_msg, prev_outputs)
+                new_msg = response_input_to_harmony(response_msg, prev_outputs)
                 if new_msg.author.role != "system":
                     messages.append(new_msg)
 
                 # User passes in a tool call request and its output. We need
-                # to add the tool call request to prev_outputs so that the
-                # parse_response_input can find the tool call request when
+                # to add the tool call request to prev_outputs so that
+                # response_input_to_harmony can find the tool call request when
                 # parsing the tool call output.
                 if isinstance(response_msg, ResponseFunctionToolCall):
                     prev_outputs.append(response_msg)

From d3a51da92a031f6c1758771a2b13976ace2eece2 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 26 Feb 2026 14:35:41 +0800
Subject: [PATCH 011/154] [Benchmark] Simplify SLA scan (#35306)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/benchmarking/cli.md                 |   5 +
 docs/benchmarking/sweeps.md              |  88 ++---
 tests/benchmarks/sweep/test_serve_sla.py | 298 ----------------
 vllm/benchmarks/sweep/plot.py            |   2 +-
 vllm/benchmarks/sweep/serve.py           |  87 +++--
 vllm/benchmarks/sweep/serve_sla.py       | 433 ++++++++---------------
 vllm/benchmarks/sweep/sla_sweep.py       | 138 --------
 vllm/benchmarks/sweep/startup.py         |   3 +-
 8 files changed, 254 insertions(+), 800 deletions(-)
 delete mode 100644 tests/benchmarks/sweep/test_serve_sla.py
 delete mode 100644 vllm/benchmarks/sweep/sla_sweep.py

diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
index 7bb91239c58e..8bbd9b0c0e3e 100644
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -4,6 +4,11 @@ This section guides you through running benchmark tests with the extensive datas
 
 It's a living document, updated as new features and datasets become available.
 
+!!! tip
+    The benchmarks described on this page are mainly for evaluating specific vLLM features as well as regression testing.
+
+    For benchmarking production vLLM servers, we recommend [GuideLLM](https://github.com/vllm-project/guidellm), an established performance benchmarking framework with live progress updates and automatic report generation. It is also more flexible than `vllm bench serve` in terms of dataset loading, request formatting, and workload patterns.
+
 ## Dataset Overview
 
 <style>
diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index d56d8ab451b3..e0a7a1b6ddc1 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -1,10 +1,15 @@
 # Parameter Sweeps
 
+`vllm bench sweep` is a suite of commands designed to run benchmarks across multiple configurations and compare them by visualizing the results.
+
 ## Online Benchmark
 
 ### Basic
 
-`vllm bench sweep serve` automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
+`vllm bench sweep serve` starts `vllm serve` and iteratively runs `vllm bench serve` for each server configuration.
+
+!!! tip
+    If you only need to run benchmarks for a single server configuration, consider using [GuideLLM](https://github.com/vllm-project/guidellm), an established performance benchmarking framework with live progress updates and automatic report generation. It is also more flexible than `vllm bench serve` in terms of dataset loading, request formatting, and workload patterns.
 
 Follow these steps to run the script:
 
@@ -50,14 +55,17 @@ Follow these steps to run the script:
     ```json
     [
         {
+            "_benchmark_name": "scenario_A",
             "random_input_len": 128,
             "random_output_len": 32
         },
         {
+            "_benchmark_name": "scenario_B",
             "random_input_len": 256,
             "random_output_len": 64
         },
         {
+            "_benchmark_name": "scenario_C",
             "random_input_len": 512,
             "random_output_len": 128
         }
@@ -77,6 +85,8 @@ vllm bench sweep serve \
     -o benchmarks/results
 ```
 
+By default, each parameter combination is benchmarked 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
+
 !!! important
     If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
     You can use `--dry-run` to preview the commands to be run.
@@ -86,60 +96,40 @@ vllm bench sweep serve \
     In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
 
 !!! note
-    By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
+    You should set `_benchmark_name` to provide a human-readable name for parameter combinations involving many variables.
+    This becomes mandatory if the file name would otherwise exceed the maximum path length allowed by the filesystem.
 
 !!! tip
-    You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
-  
-### SLA auto-tuner
+    You can use the `--resume` option to continue the parameter sweep if an unexpected error occurs, e.g., timeout when connecting to HF Hub.
 
-`vllm bench sweep serve_sla` is a wrapper over `vllm bench sweep serve` that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
+### SLA Scanner
 
-For example, to ensure E2E latency within different target values for 99% of requests:
-
-```json
-[
-    {
-        "p99_e2el_ms": "<=200"
-    },
-    {
-        "p99_e2el_ms": "<=500"
-    },
-    {
-        "p99_e2el_ms": "<=1000"
-    },
-    {
-        "p99_e2el_ms": "<=2000"
-    }
-]
-```
+`vllm bench sweep serve_sla` is a variant of `vllm bench sweep serve` that scans through values of request rate or concurrency (choose using `--sla-variable`) in order to find the tradeoff between latency and throughput. The results can then be [visualized](#visualization) to determine the feasible SLAs.
 
 Example command:
 
 ```bash
 vllm bench sweep serve_sla \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
-    --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
+    --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100' \
     --serve-params benchmarks/serve_hparams.json \
-    --bench-params benchmarks/bench_hparams.json \
-    --sla-params benchmarks/sla_hparams.json \
-    --sla-variable max_concurrency \
+    --bench-params benchmarks/bench_hparams.json
     -o benchmarks/results
 ```
 
-The algorithm for adjusting the SLA variable is as follows:
+The algorithm for scanning through different values of `sla_variable` can be summarized as follows:
 
-1. Run the benchmark once with maximum possible QPS, and once with minimum possible QPS. For each run, calculate the distance of the SLA metrics from their targets, resulting in data points of QPS vs SLA distance.
-2. Perform spline interpolation between the data points to estimate the QPS that results in zero SLA distance.
-3. Run the benchmark with the estimated QPS and add the resulting data point to the history.
-4. Repeat Steps 2 and 3 until the maximum QPS that passes SLA and the minimum QPS that fails SLA in the history are close enough to each other.
+1. Run the benchmark once with `sla_variable = 1` to simulate serial inference. This results in the lowest possible latency and throughput.
+2. Run the benchmark once with `sla_variable = num_prompts` to simulate batch inference over the whole dataset. This results in the highest possible latency and throughput.
+3. Estimate the maximum value of `sla_variable` that can be supported by the server without oversaturating it.
+4. Run the benchmark over intermediate values of `sla_variable` uniformly using the remaining iterations.
 
-!!! important
-    SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
+You can override the number of iterations in the algorithm by setting `--sla-iters`.
 
-    For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
+!!! tip
+    This is our equivalent of [GuideLLM's `--profile sweep`](https://github.com/vllm-project/guidellm/blob/v0.5.3/src/guidellm/benchmark/profiles.py#L575).
 
-### Startup
+## Startup Benchmark
 
 `vllm bench sweep startup` runs `vllm bench startup` across parameter combinations to compare cold/warm startup time for different engine settings.
 
@@ -202,15 +192,28 @@ vllm bench sweep startup \
 
 `vllm bench sweep plot` can be used to plot performance curves from parameter sweep results.
 
-Example command:
+Control the variables to plot via `--var-x` and `--var-y`, optionally applying `--filter-by` and `--bin-by` to the values. The plot is organized according to `--fig-by`, `--row-by`, `--col-by`, and `--curve-by`.
+
+Example commands for visualizing [SLA Scanner](#sla-scanner) results:
 
 ```bash
+# Latency increases as the request rate increases
 vllm bench sweep plot benchmarks/results/<timestamp> \
-    --var-x max_concurrency \
+    --var-x request_rate \
+    --var-y p99_ttft_ms \
     --row-by random_input_len \
     --col-by random_output_len \
-    --curve-by api_server_count,max_num_batched_tokens \
-    --filter-by 'max_concurrency<=1024'
+    --curve-by max_num_seqs,max_num_batched_tokens \
+    --filter-by 'request_rate<=128'
+
+# Tradeoff between latency and throughput
+vllm bench sweep plot benchmarks/results/<timestamp> \
+    --var-x request_throughput \
+    --var-y median_ttft_ms \
+    --row-by random_input_len \
+    --col-by random_output_len \
+    --curve-by max_num_seqs,max_num_batched_tokens \
+    --filter-by 'request_rate<=128'
 ```
 
 !!! tip
@@ -233,3 +236,6 @@ Example:
 vllm bench sweep plot_pareto benchmarks/results/<timestamp> \
   --label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size
 ```
+
+!!! tip
+    You can use `--dry-run` to preview the figures to be plotted.
diff --git a/tests/benchmarks/sweep/test_serve_sla.py b/tests/benchmarks/sweep/test_serve_sla.py
deleted file mode 100644
index 19f4740bc328..000000000000
--- a/tests/benchmarks/sweep/test_serve_sla.py
+++ /dev/null
@@ -1,298 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import json
-from collections.abc import Callable
-from pathlib import Path
-from unittest.mock import patch
-
-from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem
-from vllm.benchmarks.sweep.serve_sla import _get_sla_run_path, solve_sla
-from vllm.benchmarks.sweep.server import ServerProcess
-from vllm.benchmarks.sweep.sla_sweep import (
-    SLACriterionBase,
-    SLALessThan,
-    SLALessThanOrEqualTo,
-    SLASweepItem,
-)
-
-
-def _set_return_value(
-    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
-):
-    """
-    Create a patch for run_sla with a specific function
-    indicating the relationship between the benchmark combination
-    (which includes the SLA variable) and the SLA criterion.
-    """
-
-    def mock_run_sla(
-        server: ServerProcess | None,
-        bench_cmd: list[str],
-        *,
-        serve_comb: ParameterSweepItem,
-        bench_comb: ParameterSweepItem,
-        iter_path: Path,
-        num_runs: int,
-        dry_run: bool,
-    ):
-        iter_data = var2metric(bench_comb)
-
-        summary_path = _get_sla_run_path(iter_path, run_number=None)
-        summary_path.parent.mkdir(parents=True, exist_ok=True)
-        with summary_path.open("w") as f:
-            json.dump(iter_data, f, indent=4)
-
-        return iter_data
-
-    return patch("vllm.benchmarks.sweep.serve_sla.run_sla", side_effect=mock_run_sla)
-
-
-def _var2metric_linear():
-    def wrapped(bench_comb):
-        x = float(bench_comb["request_rate"])
-        y = x
-
-        return [{"request_throughput": y}]
-
-    return wrapped
-
-
-def _var2metric_concave(elbow_point: float):
-    def wrapped(bench_comb):
-        x = float(bench_comb["request_rate"])
-        if x < elbow_point:
-            y = 0.5 * (x - elbow_point) + elbow_point
-        else:
-            y = 1.5 * (x - elbow_point) + elbow_point
-
-        return [{"request_throughput": y}]
-
-    return wrapped
-
-
-def _var2metric_convex(elbow_point: float):
-    def wrapped(bench_comb):
-        x = float(bench_comb["request_rate"])
-        if x < elbow_point:
-            y = 1.5 * (x - elbow_point) + elbow_point
-        else:
-            y = 0.5 * (x - elbow_point) + elbow_point
-
-        return [{"request_throughput": y}]
-
-    return wrapped
-
-
-def _var2metric_quadratic(y_intercept: float):
-    def wrapped(bench_comb):
-        x = float(bench_comb["request_rate"])
-        y = y_intercept + 0.1 * x**2
-
-        return [{"request_throughput": y}]
-
-    return wrapped
-
-
-def _var2metric_sqrt(y_intercept: float):
-    def wrapped(bench_comb):
-        x = float(bench_comb["request_rate"])
-        y = y_intercept + 10 * x**0.5
-
-        return [{"request_throughput": y}]
-
-    return wrapped
-
-
-def _run_solve_sla(
-    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
-    criterion: SLACriterionBase,
-    base_path: Path,
-    min_value: int = 1,
-    max_value: int = 100,
-):
-    with _set_return_value(var2metric):
-        result = solve_sla(
-            server=None,
-            bench_cmd=[],
-            serve_comb=ParameterSweepItem(),
-            bench_comb=ParameterSweepItem(),
-            sla_comb=SLASweepItem({"request_throughput": criterion}),
-            base_path=base_path,
-            num_runs=1,
-            dry_run=False,
-            sla_variable="request_rate",
-            sla_min_value=min_value,
-            sla_max_value=max_value,
-        )
-        assert result is not None
-
-        return result
-
-
-def test_solve_linear_sla_le(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_linear(),
-        SLALessThanOrEqualTo(target=32),
-        tmp_path,
-    )
-
-    assert history.get_max_passing() == 32
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        1: True,
-        32: True,
-        33: False,
-    }
-
-
-def test_solve_linear_sla_lt(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_linear(),
-        SLALessThan(target=32),
-        tmp_path,
-    )
-
-    assert history.get_max_passing() == 31
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        1: True,
-        31: True,
-        32: False,
-    }
-
-
-def test_solve_linear_sla_oob(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_linear(),
-        SLALessThanOrEqualTo(target=32),
-        tmp_path,
-        min_value=64,
-    )
-
-    assert history.get_max_passing() == 64
-    assert history.get_min_failing() == 64
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        64: False,
-    }
-
-
-def test_solve_concave_sla_le(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_concave(elbow_point=32),
-        SLALessThanOrEqualTo(target=24),
-        tmp_path,
-    )
-
-    assert history.get_max_passing() == 16
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        1: True,
-        7: True,
-        13: True,
-        15: True,
-        16: True,
-        17: False,
-    }
-
-
-def test_solve_convex_sla_le(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_convex(elbow_point=32),
-        SLALessThanOrEqualTo(target=24),
-        tmp_path,
-    )
-
-    assert history.get_max_passing() == 26
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        1: True,
-        48: False,
-        30: False,
-        24: True,
-        26: True,
-        27: False,
-    }
-
-
-def test_solve_quadratic_sla_le(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_quadratic(y_intercept=10),
-        SLALessThanOrEqualTo(target=50),
-        tmp_path,
-    )
-
-    assert history.get_max_passing() == 20
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        1: True,
-        4: True,
-        20: True,
-        21: False,
-    }
-
-
-def test_solve_sqrt_sla_le(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_sqrt(y_intercept=10),
-        SLALessThanOrEqualTo(target=100),
-        tmp_path,
-    )
-
-    assert history.get_max_passing() == 81
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        100: False,
-        1: True,
-        89: False,
-        81: True,
-        82: False,
-    }
-
-
-def test_solve_reuse_history(tmp_path):
-    sla_data, history = _run_solve_sla(
-        _var2metric_linear(),
-        SLALessThanOrEqualTo(target=10),
-        tmp_path,
-        min_value=1,
-        max_value=20,
-    )
-
-    assert history.get_max_passing() == 10
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        20: False,
-        1: True,
-        10: True,
-        11: False,
-    }
-
-    sla_data, history = _run_solve_sla(
-        _var2metric_linear(),
-        SLALessThanOrEqualTo(target=30),
-        tmp_path,
-        min_value=21,
-        max_value=40,
-    )
-
-    assert history.get_max_passing() == 30
-
-    assert {val: margin <= 0 for val, margin in history.items()} == {
-        # Items from the past run
-        # (the margins are different because the target changed)
-        20: True,
-        1: True,
-        10: True,
-        11: True,
-        # Items from this run
-        40: False,
-        30: True,
-        31: False,
-    }
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 87323757ed77..53c7db3879ef 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -576,7 +576,7 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         parser.add_argument(
             "--var-y",
             type=str,
-            default="p99_e2el_ms",
+            default="p99_ttft_ms",
             help="The variable for the y-axis",
         )
         parser.add_argument(
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 8b129e49a9e9..7420f2518aae 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -92,7 +92,8 @@ def run_benchmark(
     run_data: dict[str, object]
 
     if output_path.exists():
-        print("Found existing results. Skipping.")
+        print("Found existing results.")
+        print("[SKIPPED BENCHMARK]")
 
         with output_path.open("rb") as f:
             run_data = json.load(f)
@@ -167,6 +168,43 @@ def _comb_needs_server(
     return False
 
 
+def server_ctx(
+    serve_cmd: list[str],
+    after_bench_cmd: list[str],
+    *,
+    show_stdout: bool,
+    serve_comb: ParameterSweepItem,
+    bench_params: ParameterSweep,
+    output_dir: Path,
+    dry_run: bool,
+    server_ready_timeout: int = 300,
+):
+    if not _comb_needs_server(serve_comb, bench_params, output_dir):
+        return contextlib.nullcontext()
+
+    return run_server(
+        serve_cmd,
+        after_bench_cmd,
+        show_stdout=show_stdout,
+        serve_overrides=serve_comb,
+        dry_run=dry_run,
+        server_ready_timeout=server_ready_timeout,
+    )
+
+
+def _comb_is_valid(
+    serve_comb: ParameterSweepItem,
+    bench_comb: ParameterSweepItem,
+    link_vars: list[tuple[str, str]],
+) -> bool:
+    return all(
+        serve_key in serve_comb
+        and bench_key in bench_comb
+        and serve_comb[serve_key] == bench_comb[bench_key]
+        for serve_key, bench_key in link_vars
+    )
+
+
 def run_comb(
     server: ServerProcess | None,
     bench_cmd: list[str],
@@ -176,7 +214,11 @@ def run_comb(
     base_path: Path,
     num_runs: int,
     dry_run: bool,
+    link_vars: list[tuple[str, str]],
 ):
+    if not _comb_is_valid(serve_comb, bench_comb, link_vars):
+        return None
+
     comb_data = list[dict[str, object]]()
 
     for run_number in range(num_runs):
@@ -208,37 +250,27 @@ def run_combs(
     after_bench_cmd: list[str],
     *,
     show_stdout: bool,
+    server_ready_timeout: int,
     serve_params: ParameterSweep,
     bench_params: ParameterSweep,
     output_dir: Path,
     num_runs: int,
     dry_run: bool,
-    links: list[tuple[str, str]],
-    server_ready_timeout: int = 300,
+    link_vars: list[tuple[str, str]],
 ):
     all_data = list[dict[str, object]]()
     for serve_comb in serve_params:
-        with (
-            run_server(
-                serve_cmd,
-                after_bench_cmd,
-                show_stdout=show_stdout,
-                serve_overrides=serve_comb,
-                dry_run=dry_run,
-                server_ready_timeout=server_ready_timeout,
-            )
-            if _comb_needs_server(serve_comb, bench_params, output_dir)
-            else contextlib.nullcontext()
+        with server_ctx(
+            serve_cmd,
+            after_bench_cmd,
+            show_stdout=show_stdout,
+            serve_comb=serve_comb,
+            bench_params=bench_params,
+            output_dir=output_dir,
+            dry_run=dry_run,
+            server_ready_timeout=server_ready_timeout,
         ) as server:
             for bench_comb in bench_params:
-                should_run = all(
-                    serve_key in serve_comb
-                    and bench_key in bench_comb
-                    and serve_comb[serve_key] == bench_comb[bench_key]
-                    for serve_key, bench_key in links
-                )
-                if not should_run:
-                    continue
                 base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
 
                 comb_data = run_comb(
@@ -249,6 +281,7 @@ def run_combs(
                     base_path=base_path,
                     num_runs=num_runs,
                     dry_run=dry_run,
+                    link_vars=link_vars,
                 )
 
                 if comb_data is not None:
@@ -269,14 +302,14 @@ class SweepServeArgs:
     bench_cmd: list[str]
     after_bench_cmd: list[str]
     show_stdout: bool
+    server_ready_timeout: int
     serve_params: ParameterSweep
     bench_params: ParameterSweep
     output_dir: Path
     num_runs: int
     dry_run: bool
     resume: str | None
-    link_vars: list[tuple[str, str]] | None
-    server_ready_timeout: int
+    link_vars: list[tuple[str, str]]
 
     parser_name: ClassVar[str] = "serve"
     parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings."
@@ -300,7 +333,9 @@ def from_cli_args(cls, args: argparse.Namespace):
         else:
             # i.e.: run bench_cmd without any modification
             bench_params = ParameterSweep.from_records([{}])
+
         link_vars = cls.parse_link_vars(args.link_vars)
+
         num_runs = args.num_runs
         if num_runs < 1:
             raise ValueError("`num_runs` should be at least 1.")
@@ -437,13 +472,13 @@ def run_main(args: SweepServeArgs):
             bench_cmd=args.bench_cmd,
             after_bench_cmd=args.after_bench_cmd,
             show_stdout=args.show_stdout,
+            server_ready_timeout=args.server_ready_timeout,
             serve_params=args.serve_params,
             bench_params=args.bench_params,
             output_dir=output_dir,
             num_runs=args.num_runs,
             dry_run=args.dry_run,
-            links=args.link_vars,
-            server_ready_timeout=args.server_ready_timeout,
+            link_vars=args.link_vars,
         )
     except BaseException as exc:
         raise RuntimeError(
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py
index 3b4d48dd2d89..89169ec15957 100644
--- a/vllm/benchmarks/sweep/serve_sla.py
+++ b/vllm/benchmarks/sweep/serve_sla.py
@@ -1,306 +1,162 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
-import contextlib
-import json
+import math
 from dataclasses import asdict, dataclass
 from datetime import datetime
 from pathlib import Path
 from typing import ClassVar, Literal, get_args
 
+import numpy as np
+from typing_extensions import assert_never
+
 from vllm.utils.import_utils import PlaceholderModule
 
 from .param_sweep import ParameterSweep, ParameterSweepItem
-from .serve import SweepServeArgs, run_benchmark, run_server
+from .serve import (
+    SweepServeArgs,
+    _get_comb_base_path,
+    run_comb,
+    server_ctx,
+)
 from .server import ServerProcess
-from .sla_sweep import SLASweep, SLASweepItem
-from .utils import sanitize_filename
 
 try:
     import pandas as pd
 except ImportError:
     pd = PlaceholderModule("pandas")
 
-try:
-    from scipy.interpolate import PchipInterpolator
-except ImportError:
-    PchipInterpolator = (
-        PlaceholderModule("scipy")
-        .placeholder_attr("interpolate")
-        .placeholder_attr("PchipInterpolator")
-    )
-
-
-def _get_sla_base_path(
-    output_dir: Path,
-    serve_comb: ParameterSweepItem,
-    bench_comb: ParameterSweepItem,
-):
-    parts = list[str]()
-    if serve_comb:
-        parts.extend(("SERVE-", serve_comb.as_text(sep="-")))
-    if bench_comb:
-        parts.extend(("BENCH-", bench_comb.as_text(sep="-")))
-
-    return output_dir / sanitize_filename("-".join(parts))
-
-
-def _get_sla_iter_path(
-    base_path: Path,
-    sla_comb: SLASweepItem,
-    sla_variable: str,
-    sla_value: int | None,
-):
-    if sla_value is None:
-        prefix = sla_comb.as_text(sep="-")
-        return base_path / f"SLA--{prefix}.json"
-
-    return base_path / f"{sla_variable}={sla_value}"
-
-
-def _get_sla_run_path(iter_path: Path, run_number: int | None):
-    if run_number is None:
-        return iter_path / "summary.json"
-
-    return iter_path / f"run={run_number}.json"
-
-
-def _iter_sla_val_paths(base_path: Path, sla_variable: str):
-    for iter_path in base_path.glob(f"{sla_variable}=*"):
-        sla_value = int(iter_path.name.removeprefix(f"{sla_variable}="))
-        summary_path = iter_path / "summary.json"
-        if summary_path.exists():
-            yield sla_value, summary_path
-
-
-def _sla_needs_server(
-    serve_comb: ParameterSweepItem,
-    bench_combs: ParameterSweep,
-    sla_combs: SLASweep,
-    sla_variable: str,
-    output_dir: Path,
-):
-    for bench_comb in bench_combs:
-        base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb)
-        for sla_comb in sla_combs:
-            if not _get_sla_iter_path(
-                base_path,
-                sla_comb,
-                sla_variable,
-                sla_value=None,
-            ).exists():
-                return True
-
-    return False
-
-
-def run_sla(
-    server: ServerProcess | None,
-    bench_cmd: list[str],
-    *,
-    serve_comb: ParameterSweepItem,
-    bench_comb: ParameterSweepItem,
-    iter_path: Path,
-    num_runs: int,
-    dry_run: bool,
-):
-    iter_data = list[dict[str, object]]()
-
-    for run_number in range(num_runs):
-        run_data = run_benchmark(
-            server,
-            bench_cmd,
-            serve_overrides=serve_comb,
-            bench_overrides=bench_comb,
-            run_number=run_number,
-            output_path=_get_sla_run_path(iter_path, run_number),
-            dry_run=dry_run,
-        )
-
-        if run_data is not None:
-            iter_data.append(run_data)
-
-    if dry_run:
-        return None
-
-    with _get_sla_run_path(iter_path, run_number=None).open("w") as f:
-        json.dump(iter_data, f, indent=4)
-
-    return iter_data
-
 
 SLAVariable = Literal["request_rate", "max_concurrency"]
 
 
-class SLAHistory(dict[int, float]):
-    def __init__(self, min_value: int, max_value: int) -> None:
-        super().__init__()
-
-        self.min_value = min_value
-        self.max_value = max_value
-
-    def get_xy(self) -> tuple[list[int], list[float]]:
-        xs = list[int]()
-        ys = list[float]()
-        for x, y in sorted(self.items()):
-            xs.append(x)
-            ys.append(y)
+def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable):
+    request_throughput = float(run_data["request_throughput"])  # type: ignore
+    if sla_variable == "request_rate":
+        return request_throughput
+    if sla_variable == "max_concurrency":
+        mean_latency_ms = float(run_data["mean_e2el_ms"])  # type: ignore
+        return request_throughput * mean_latency_ms / 1000
 
-        return xs, ys
+    assert_never(sla_variable)
 
-    def get_max_passing(self) -> float:
-        return max(
-            (val for val, margin in self.items() if margin <= 0),
-            default=self.min_value,
-        )
 
-    def get_min_failing(self) -> float:
-        return min(
-            (val for val, margin in self.items() if margin > 0),
-            default=self.max_value,
-        )
+def _estimate_sla_avg(runs: list[dict[str, object]], sla_variable: SLAVariable):
+    return sum(_estimate_sla_value(run, sla_variable) for run in runs) / len(runs)
 
 
-def _compute_margin(
-    sla_comb: SLASweepItem,
-    iter_data: list[dict[str, object]],
-):
-    assert iter_data, "Summary should not be empty"
-
-    iter_data_mean = {
-        k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data)  # type: ignore
-        for k in sla_comb
-    }
-
-    sla_margins = [
-        criterion.print_and_compute_margin(iter_data_mean, k)
-        for k, criterion in sla_comb.items()
-    ]
-
-    return max(sla_margins)
-
-
-def solve_sla(
+def run_comb_sla(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
-    sla_comb: SLASweepItem,
-    base_path: Path,
+    output_dir: Path,
     num_runs: int,
     dry_run: bool,
+    link_vars: list[tuple[str, str]],
     sla_variable: SLAVariable,
-    sla_min_value: int = 1,
-    sla_max_value: int = 8192,  # The value that represents infinite QPS
-):
-    sla_data = list[dict[str, object]]()
-    history = SLAHistory(min_value=sla_min_value, max_value=sla_max_value)
-
-    # Use results from previous runs
-    for past_sla_value, path in _iter_sla_val_paths(base_path, sla_variable):
-        with path.open("rb") as f:
-            past_iter_data = json.load(f)
-
-        sla_data.append(past_iter_data)
-        history[past_sla_value] = _compute_margin(sla_comb, past_iter_data)
-
-    # NOTE: We don't use equality here to be more robust against noisy results
-    while history.get_max_passing() + 1 < history.get_min_failing():
-        if max(history, default=sla_min_value) < sla_max_value:
-            val = sla_max_value
-        elif min(history, default=sla_max_value) > sla_min_value:
-            val = sla_min_value
-        else:
-            spl = PchipInterpolator(*history.get_xy(), extrapolate=False)
-            spl_roots = spl.solve()
-            if len(spl_roots) == 0:
-                # Fallback to binary search
-                val = int((history.get_max_passing() + history.get_min_failing()) / 2)
-            else:
-                val = int(spl_roots[0])
-
-            if val in history:
-                # Cover both sides (floor and ceil) of the root to be sure
-                # that it is indeed the target value
-                val += 1
-
-        val = max(sla_min_value, min(val, sla_max_value))
-        print(f"Testing {sla_variable}: {val} req/s")
-
-        iter_data = run_sla(
-            server,
-            bench_cmd,
-            serve_comb=serve_comb,
-            bench_comb=bench_comb | {sla_variable: val},
-            iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val),
-            num_runs=num_runs,
-            dry_run=dry_run,
-        )
-        if iter_data is None:
-            return None
+    sla_value: int,
+) -> list[dict[str, object]] | None:
+    bench_comb_sla = bench_comb | {sla_variable: sla_value}
 
-        margin = _compute_margin(sla_comb, iter_data)
-        if margin <= 0:
-            print(f"SLA criteria are met. ({margin=:.2f})")
-        else:
-            print(f"SLA criteria are not met. ({margin=:.2f})")
-
-        sla_data.extend(iter_data)
-        history[val] = margin
-
-    return sla_data, history
+    return run_comb(
+        server,
+        bench_cmd,
+        serve_comb=serve_comb,
+        bench_comb=bench_comb_sla,
+        base_path=_get_comb_base_path(output_dir, serve_comb, bench_comb_sla),
+        num_runs=num_runs,
+        dry_run=dry_run,
+        link_vars=link_vars,
+    )
 
 
-def search_sla(
+def explore_sla(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
-    sla_comb: SLASweepItem,
     sla_variable: SLAVariable,
-    base_path: Path,
+    sla_iters: int,
+    output_dir: Path,
     num_runs: int,
     dry_run: bool,
+    link_vars: list[tuple[str, str]],
 ):
     print("[SLA START]")
     print(f"Serve parameters: {serve_comb.as_text() or '(None)'}")
     print(f"Bench parameters: {bench_comb.as_text() or '(None)'}")
-    print(f"SLA criteria: {sla_comb.as_text()}")
+    print(f"Number of SLA iterations: {sla_iters}")
+
+    if sla_iters < 2:
+        raise ValueError("`sla_iters` should be at least 2")
 
-    result = solve_sla(
+    serial_comb_data = run_comb_sla(
+        server,
+        bench_cmd,
+        serve_comb=serve_comb,
+        bench_comb=bench_comb,
+        output_dir=output_dir,
+        num_runs=num_runs,
+        dry_run=dry_run,
+        link_vars=link_vars,
+        sla_variable=sla_variable,
+        sla_value=1,
+    )
+    batch_comb_data = run_comb_sla(
         server,
         bench_cmd,
         serve_comb=serve_comb,
         bench_comb=bench_comb,
-        sla_comb=sla_comb,
-        base_path=base_path,
+        output_dir=output_dir,
         num_runs=num_runs,
         dry_run=dry_run,
+        link_vars=link_vars,
         sla_variable=sla_variable,
+        sla_value=int(bench_comb.get("num_prompts", 1000)),  # type: ignore
     )
-    if result is None:
-        assert dry_run
-        print("Omitting SLA search.")
-        print("[SLA END]")
+
+    if serial_comb_data is None or batch_comb_data is None:
+        if dry_run:
+            print("Omitting intermediate SLA iterations.")
+            print("[SLA END]")
+
         return
 
-    sla_data, sla_history = result
-    sla_value = sla_history.get_max_passing()
-    print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.")
+    serial_sla_value = math.ceil(_estimate_sla_avg(serial_comb_data, sla_variable))
+    print(f"Serial inference: {sla_variable}={serial_sla_value}")
 
-    with _get_sla_iter_path(
-        base_path,
-        sla_comb,
-        sla_variable,
-        sla_value=None,
-    ).open("w") as f:
-        json.dump(sla_data, f, indent=4)
+    batch_sla_value = math.floor(_estimate_sla_avg(batch_comb_data, sla_variable))
+    print(f"Batch inference: {sla_variable}={batch_sla_value}")
+
+    # Avoid duplicated runs for intermediate values if the range between
+    # `serial_sla_value` and `batch_sla_value` is small
+    inter_sla_values = np.linspace(serial_sla_value, batch_sla_value, sla_iters)[1:-1]
+    inter_sla_values = sorted(set(map(round, inter_sla_values)))
+
+    inter_combs_data: list[dict[str, object]] = []
+    for inter_sla_value in inter_sla_values:
+        print(f"Exploring: {sla_variable}={inter_sla_value}")
+        inter_comb_data = run_comb_sla(
+            server,
+            bench_cmd,
+            serve_comb=serve_comb,
+            bench_comb=bench_comb,
+            output_dir=output_dir,
+            num_runs=num_runs,
+            dry_run=dry_run,
+            link_vars=link_vars,
+            sla_variable=sla_variable,
+            sla_value=inter_sla_value,
+        )
+        if inter_comb_data is not None:
+            inter_combs_data.extend(inter_comb_data)
 
     print("[SLA END]")
 
-    return sla_data
+    return serial_comb_data + inter_combs_data + batch_comb_data
 
 
 def run_slas(
@@ -309,13 +165,15 @@ def run_slas(
     after_bench_cmd: list[str],
     *,
     show_stdout: bool,
+    server_ready_timeout: int,
     serve_params: ParameterSweep,
     bench_params: ParameterSweep,
-    sla_params: SLASweep,
     sla_variable: SLAVariable,
+    sla_iters: int,
     output_dir: Path,
     num_runs: int,
     dry_run: bool,
+    link_vars: list[tuple[str, str]],
 ):
     if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params):
         raise ValueError(
@@ -325,41 +183,32 @@ def run_slas(
 
     all_data = list[dict[str, object]]()
     for serve_comb in serve_params:
-        with (
-            run_server(
-                serve_cmd,
-                after_bench_cmd,
-                show_stdout=show_stdout,
-                serve_overrides=serve_comb,
-                dry_run=dry_run,
-            )
-            if _sla_needs_server(
-                serve_comb,
-                bench_params,
-                sla_params,
-                sla_variable,
-                output_dir,
-            )
-            else contextlib.nullcontext()
+        with server_ctx(
+            serve_cmd,
+            after_bench_cmd,
+            show_stdout=show_stdout,
+            server_ready_timeout=server_ready_timeout,
+            serve_comb=serve_comb,
+            bench_params=bench_params,
+            output_dir=output_dir,
+            dry_run=dry_run,
         ) as server:
             for bench_comb in bench_params:
-                for sla_comb in sla_params:
-                    base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb)
-
-                    comb_data = search_sla(
-                        server,
-                        bench_cmd,
-                        serve_comb=serve_comb,
-                        bench_comb=bench_comb,
-                        sla_comb=sla_comb,
-                        sla_variable=sla_variable,
-                        base_path=base_path,
-                        num_runs=num_runs,
-                        dry_run=dry_run,
-                    )
-
-                    if comb_data is not None:
-                        all_data.extend(comb_data)
+                comb_data = explore_sla(
+                    server,
+                    bench_cmd,
+                    serve_comb=serve_comb,
+                    bench_comb=bench_comb,
+                    sla_variable=sla_variable,
+                    sla_iters=sla_iters,
+                    output_dir=output_dir,
+                    num_runs=num_runs,
+                    dry_run=dry_run,
+                    link_vars=link_vars,
+                )
+
+                if comb_data is not None:
+                    all_data.extend(comb_data)
 
     if dry_run:
         return None
@@ -372,26 +221,23 @@ def run_slas(
 
 @dataclass
 class SweepServeSLAArgs(SweepServeArgs):
-    sla_params: SLASweep
     sla_variable: SLAVariable
+    sla_iters: int
 
     parser_name: ClassVar[str] = "serve_sla"
-    parser_help: ClassVar[str] = "Tune a variable to meet SLAs under multiple settings."
+    parser_help: ClassVar[str] = (
+        "Explore the latency-throughput space for determining SLAs."
+    )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         # NOTE: Don't use super() as `from_cli_args` calls `cls()`
         base_args = SweepServeArgs.from_cli_args(args)
 
-        if args.sla_params:
-            sla_params = SLASweep.read_json(args.sla_params)
-        else:
-            sla_params = SLASweep.from_records([])
-
         return cls(
             **asdict(base_args),
-            sla_params=sla_params,
             sla_variable=args.sla_variable,
+            sla_iters=args.sla_iters,
         )
 
     @classmethod
@@ -399,25 +245,20 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         parser = super().add_cli_args(parser)
 
         sla_group = parser.add_argument_group("sla options")
-        sla_group.add_argument(
-            "--sla-params",
-            type=str,
-            required=True,
-            help="Path to JSON file containing a list of SLA constraints to satisfy. "
-            'Each constraint is expressed in `{"<KEY>": "<OP><VALUE>"}` format, '
-            'e.g.: `{"p99_e2el_ms": "<=500"}` means that '
-            "the E2E latency should be less than 500ms 99%% of the time. "
-            "Setting this option runs this script in SLA mode, which searches for "
-            "the maximum `sla_variable` that satisfies the constraints for "
-            "each combination of `serve_params`, `bench_params`, and `sla_params`.",
-        )
         sla_group.add_argument(
             "--sla-variable",
             type=str,
             choices=get_args(SLAVariable),
             default="request_rate",
-            help="Whether to tune request rate or maximum concurrency to satisfy "
-            "the SLA constraints.",
+            help="The variable to adjust in each iteration.",
+        )
+        sla_group.add_argument(
+            "--sla-iters",
+            type=int,
+            default=10,
+            help="Number of iterations used to explore the latency-throughput space. "
+            "This includes the first two iterations used to interpolate the value of "
+            "`sla_variable` for remaining iterations.",
         )
 
         return parser
@@ -436,13 +277,15 @@ def run_main(args: SweepServeSLAArgs):
             bench_cmd=args.bench_cmd,
             after_bench_cmd=args.after_bench_cmd,
             show_stdout=args.show_stdout,
+            server_ready_timeout=args.server_ready_timeout,
             serve_params=args.serve_params,
             bench_params=args.bench_params,
-            sla_params=args.sla_params,
             sla_variable=args.sla_variable,
+            sla_iters=args.sla_iters,
             output_dir=output_dir,
             num_runs=args.num_runs,
             dry_run=args.dry_run,
+            link_vars=args.link_vars,
         )
     except BaseException as exc:
         raise RuntimeError(
diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py
deleted file mode 100644
index 0a780860df27..000000000000
--- a/vllm/benchmarks/sweep/sla_sweep.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import json
-import os
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-
-from typing_extensions import override
-
-SLA_EPS = 1e-8
-"""Offset used to differentiate margins for equality checks."""
-
-
-@dataclass
-class SLACriterionBase(ABC):
-    target: float
-
-    @abstractmethod
-    def compute_margin(self, actual: float) -> float:
-        """
-        Return a negative value or `0` if this criterion is met;
-        otherwise a positive value indicating the distance to the target.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def format_cond(self, lhs: str) -> str:
-        raise NotImplementedError
-
-    def print_and_compute_margin(
-        self,
-        metrics: dict[str, float],
-        metrics_key: str,
-    ) -> float:
-        metric = metrics[metrics_key]
-        margin = self.compute_margin(metric)
-
-        cond = self.format_cond(f"{metrics_key} = {metric:.2f}")
-        print(f"Validating SLA: {cond} | " + ("PASSED" if margin <= 0 else "FAILED"))
-
-        return margin
-
-
-@dataclass
-class SLALessThan(SLACriterionBase):
-    @override
-    def compute_margin(self, actual: float) -> float:
-        return actual + SLA_EPS - self.target
-
-    @override
-    def format_cond(self, lhs: str) -> str:
-        return f"{lhs}<{self.target:.2f}"
-
-
-@dataclass
-class SLALessThanOrEqualTo(SLACriterionBase):
-    @override
-    def compute_margin(self, actual: float) -> float:
-        return actual - self.target
-
-    @override
-    def format_cond(self, lhs: str) -> str:
-        return f"{lhs}<={self.target:.2f}"
-
-
-@dataclass
-class SLAGreaterThan(SLACriterionBase):
-    @override
-    def compute_margin(self, actual: float) -> float:
-        return self.target + SLA_EPS - actual
-
-    @override
-    def format_cond(self, lhs: str) -> str:
-        return f"{lhs}>{self.target:.2f}"
-
-
-@dataclass
-class SLAGreaterThanOrEqualTo(SLACriterionBase):
-    @override
-    def compute_margin(self, actual: float) -> float:
-        return self.target - actual
-
-    @override
-    def format_cond(self, lhs: str) -> str:
-        return f"{lhs}>={self.target:.2f}"
-
-
-# NOTE: The ordering is important! Match longer op_keys first
-SLA_CRITERIA: dict[str, type[SLACriterionBase]] = {
-    "<=": SLALessThanOrEqualTo,
-    ">=": SLAGreaterThanOrEqualTo,
-    "<": SLALessThan,
-    ">": SLAGreaterThan,
-}
-
-
-class SLASweep(list["SLASweepItem"]):
-    @classmethod
-    def read_json(cls, filepath: os.PathLike):
-        with open(filepath, "rb") as f:
-            records = json.load(f)
-
-        return cls.from_records(records)
-
-    @classmethod
-    def from_records(cls, records: list[dict[str, str]]):
-        if not isinstance(records, list):
-            raise TypeError(
-                f"The SLA sweep should be a list of dictionaries, "
-                f"but found type: {type(records)}"
-            )
-
-        return cls(SLASweepItem.from_record(record) for record in records)
-
-
-class SLASweepItem(dict[str, SLACriterionBase]):
-    @classmethod
-    def from_record(cls, record: dict[str, str]):
-        sla_criteria: dict[str, SLACriterionBase] = {}
-
-        for metric_key, metric_value in record.items():
-            for op_key in SLA_CRITERIA:
-                if metric_value.startswith(op_key):
-                    sla_criteria[metric_key] = SLA_CRITERIA[op_key](
-                        float(metric_value.removeprefix(op_key))
-                    )
-                    break
-            else:
-                raise ValueError(
-                    f"Invalid operator for "
-                    f"SLA constraint '{metric_key}={metric_value}'. "
-                    f"Valid operators are: {sorted(SLA_CRITERIA)}",
-                )
-
-        return cls(sla_criteria)
-
-    def as_text(self, sep: str = ", ") -> str:
-        return sep.join(v.format_cond(k) for k, v in self.items())
diff --git a/vllm/benchmarks/sweep/startup.py b/vllm/benchmarks/sweep/startup.py
index 8d779b364484..b4d979b1609e 100644
--- a/vllm/benchmarks/sweep/startup.py
+++ b/vllm/benchmarks/sweep/startup.py
@@ -151,7 +151,8 @@ def run_benchmark(
     print(f"Output file: {output_path}")
 
     if output_path.exists():
-        print("Found existing results. Skipping.")
+        print("Found existing results.")
+        print("[SKIPPED BENCHMARK]")
 
         with output_path.open("r", encoding="utf-8") as f:
             run_data = json.load(f)

From a07c4c59392aeffd619ac0a6a1c10fc364cf6840 Mon Sep 17 00:00:00 2001
From: Ofir Zafrir <ofir.zafrir@intel.com>
Date: Thu, 26 Feb 2026 09:15:16 +0200
Subject: [PATCH 012/154] [BugFix][XPU] Fix speculative decoding on Intel XPU
 due to bug with `IGC_ForceOCLSIMDWidth=16` (#35298)

Signed-off-by: Ofir Zafrir <ofir.zafrir@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/platforms/xpu.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 454d2301ec3c..c97c3297e1ba 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -201,9 +201,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         if vllm_config.lora_config is not None:
             compilation_config.mode = CompilationMode.NONE
-        # decrease triton kernel compilation scratch space for speculative decoding
-        if vllm_config.speculative_config is not None:
-            os.environ["IGC_ForceOCLSIMDWidth"] = "16"  # noqa: SIM112
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
         # Only override worker_cls if it's still the default "auto"

From 9f9a675b23c8276237e98b396c3242c5135808fb Mon Sep 17 00:00:00 2001
From: Chaojun Zhang <chaojun.zhang@intel.com>
Date: Thu, 26 Feb 2026 15:46:44 +0800
Subject: [PATCH 013/154] [XPU][8/N] Fix kernel bugs in XPU LoRA and MOE LORA
 (#34115)

Signed-off-by: chzhang <chaojun.zhang@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
---
 tests/lora/test_fused_moe_lora_kernel.py      |   4 +-
 tests/lora/test_punica_xpu_ops.py             | 298 ++++++++++++++++++
 .../ops/{ipex_ops => xpu_ops}/__init__.py     |   2 +-
 .../ops/{ipex_ops => xpu_ops}/lora_ops.py     |  19 +-
 vllm/lora/punica_wrapper/punica_xpu.py        | 159 +++++++++-
 5 files changed, 462 insertions(+), 20 deletions(-)
 create mode 100644 tests/lora/test_punica_xpu_ops.py
 rename vllm/lora/ops/{ipex_ops => xpu_ops}/__init__.py (66%)
 rename vllm/lora/ops/{ipex_ops => xpu_ops}/lora_ops.py (74%)

diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
index b79b668f30c9..382999bca351 100644
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -18,6 +18,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.lora.ops.triton_ops import fused_moe_lora
+from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
 from vllm.utils.torch_utils import set_random_seed
 
@@ -244,8 +245,9 @@ def use_torch(
     return torch.stack(outputs, dim=0)
 
 
+DEVICE_TYPE = current_platform.device_type
 DTYPES = [torch.float16, torch.bfloat16]
-DEVICES = [f"cuda:{0}"]
+DEVICES = [f"{DEVICE_TYPE}:{0}"]
 SEED = [42]
 
 
diff --git a/tests/lora/test_punica_xpu_ops.py b/tests/lora/test_punica_xpu_ops.py
new file mode 100644
index 000000000000..585c97cfa547
--- /dev/null
+++ b/tests/lora/test_punica_xpu_ops.py
@@ -0,0 +1,298 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.lora.utils import (
+    PunicaTensors,
+    assert_close,
+    generate_data,
+    generate_data_for_expand_nslices,
+)
+from vllm.lora.ops.xpu_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
+from vllm.platforms import current_platform
+
+
+def torch_bgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    add_inputs: bool = True,
+):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype)
+    if len(selected_loras.shape) == 4:
+        selected_loras = selected_loras.squeeze(dim=1)
+    inputs = inputs.to(dtype=output_tensor.dtype)
+    outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
+
+    limit = output_tensor.shape[0]
+    if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
+        limit = 1
+
+    # LoRA adapter and model may add different amounts of padding to output
+    common_len = min(outputs.shape[1], output_tensor.shape[1])
+
+    if add_inputs:
+        output_tensor[:, :common_len] += outputs[:limit, :common_len]
+    else:
+        output_tensor[:, :common_len] = outputs[:limit, :common_len]
+
+
+def torch_bgmv_shrink(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    scaling: float = 1.0,
+):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype)
+    if len(selected_loras.shape) == 4:
+        selected_loras = selected_loras.squeeze(dim=1)
+    inputs = inputs.to(dtype=output_tensor.dtype)
+    outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
+
+    output_tensor[:, : outputs.shape[1]] = scaling * outputs[:]
+
+
+def torch_bgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = True,
+):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype)
+    inputs = inputs.to(dtype=output_tensor.dtype)
+    if len(selected_loras.shape) == 4:
+        selected_loras = selected_loras.squeeze(dim=1)
+    outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
+
+    if add_inputs:
+        output_tensor[:, slice_offset : slice_offset + slice_size] += outputs[:]
+    else:
+        output_tensor[:, slice_offset : slice_offset + slice_size] = outputs[:]
+
+
+def check_bgmv_shrink(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    scaling: float,
+):
+    """
+    Compare vllm.bgmv_shrink against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        "shrink",
+        device,
+    )
+
+    bgmv_shrink(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.our_out_tensor,
+        data.token_lora_mapping,
+        scaling,
+    )
+
+    torch_bgmv_shrink(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.ref_out_tensor,
+        data.token_lora_mapping,
+        scaling,
+    )
+
+    data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_expand(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    add_inputs: bool,
+):
+    """
+    Compare vllm.bgmv_expand against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        "expand",
+        device,
+    )
+
+    bgmv_expand(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.our_out_tensor,
+        data.token_lora_mapping,
+        add_inputs=add_inputs,
+    )
+    torch_bgmv_expand(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.ref_out_tensor,
+        data.token_lora_mapping,
+        add_inputs=add_inputs,
+    )
+    assert_close(data.ref_out_tensor, data.our_out_tensor)
+
+
+def check_bgmv_expand_slice(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    add_inputs: bool,
+):
+    """
+    Compare vllm.bgmv_expand_slice against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data_for_expand_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        nslices,
+        device,
+    )
+
+    slice_offset = 0
+    for index in range(nslices):
+        bgmv_expand_slice(
+            data.inputs_tensor,
+            data.lora_weights[index],
+            data.our_out_tensor,
+            data.token_lora_mapping,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=add_inputs,
+        )
+        torch_bgmv_expand_slice(
+            data.inputs_tensor,
+            data.lora_weights[index],
+            data.ref_out_tensor,
+            data.token_lora_mapping,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=add_inputs,
+        )
+
+        slice_offset += hidden_size
+    assert_close(data.ref_out_tensor, data.our_out_tensor)
+
+
+# General tests params that tests for variations in all dimensions
+# except hidden_size.
+test_params = {
+    "hidden_sizes": [2049],
+    "batches": [4],
+    "num_loras": [4],
+    "max_ranks": [32],
+}
+
+DTYPES = [torch.float16, torch.bfloat16]
+DEVICES = [f"xpu:{0}"]
+SEED = [0]
+
+
+@pytest.mark.parametrize("batches", test_params["batches"])
+@pytest.mark.parametrize("num_loras", test_params["num_loras"])
+@pytest.mark.parametrize("rank", test_params["max_ranks"])
+@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+@pytest.mark.skipif(not current_platform.is_xpu(), reason="skip for non xpu platform")
+def test_bgmv(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    if op_type == "shrink":
+        check_bgmv_shrink(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            dtype=dtype,
+            device=device,
+            scaling=0.5,
+        )
+    else:
+        check_bgmv_expand(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            dtype=dtype,
+            device=device,
+            add_inputs=True,
+        )
+
+
+@pytest.mark.parametrize("batches", test_params["batches"])
+@pytest.mark.parametrize("num_loras", test_params["num_loras"])
+@pytest.mark.parametrize("rank", test_params["max_ranks"])
+@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
+@pytest.mark.parametrize("nslices", [2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.skipif(not current_platform.is_xpu(), reason="skip for non xpu platform")
+def test_bgmv_expand_nslices(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+):
+    check_bgmv_expand_slice(
+        batches=batches,
+        num_loras=num_loras,
+        rank=rank,
+        hidden_size=hidden_size,
+        nslices=nslices,
+        dtype=dtype,
+        device=device,
+        add_inputs=True,
+    )
diff --git a/vllm/lora/ops/ipex_ops/__init__.py b/vllm/lora/ops/xpu_ops/__init__.py
similarity index 66%
rename from vllm/lora/ops/ipex_ops/__init__.py
rename to vllm/lora/ops/xpu_ops/__init__.py
index f5a5e0e6f951..f7f16bf23704 100644
--- a/vllm/lora/ops/ipex_ops/__init__.py
+++ b/vllm/lora/ops/xpu_ops/__init__.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.lora.ops.ipex_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
+from vllm.lora.ops.xpu_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
 
 __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
diff --git a/vllm/lora/ops/ipex_ops/lora_ops.py b/vllm/lora/ops/xpu_ops/lora_ops.py
similarity index 74%
rename from vllm/lora/ops/ipex_ops/lora_ops.py
rename to vllm/lora/ops/xpu_ops/lora_ops.py
index 0767f90b2f9e..6d1751c3738e 100644
--- a/vllm/lora/ops/ipex_ops/lora_ops.py
+++ b/vllm/lora/ops/xpu_ops/lora_ops.py
@@ -7,11 +7,6 @@
 
 logger = init_logger(__name__)
 
-try:
-    import intel_extension_for_pytorch as ipex
-except ImportError as e:
-    raise e
-
 
 def bgmv_shrink(
     inputs: torch.Tensor,
@@ -20,8 +15,8 @@ def bgmv_shrink(
     lora_indices_tensor: torch.Tensor,
     scaling: float = 1.0,
 ) -> None:
-    ipex.llm.functional.bgmv_shrink(
-        inputs, lora_a_weights, output_tensor, lora_indices_tensor, scaling
+    torch.ops._xpu_C.bgmv_shrink(
+        output_tensor, inputs, lora_a_weights, lora_indices_tensor, scaling
     )
 
 
@@ -32,8 +27,8 @@ def bgmv_expand(
     lora_indices_tensor: torch.Tensor,
     add_inputs: bool = True,
 ) -> None:
-    ipex.llm.functional.bgmv_expand(
-        inputs, lora_b_weights, output_tensor, lora_indices_tensor, add_inputs
+    torch.ops._xpu_C.bgmv_expand(
+        output_tensor, inputs, lora_b_weights, lora_indices_tensor, add_inputs
     )
 
 
@@ -46,10 +41,12 @@ def bgmv_expand_slice(
     slice_size: int,
     add_inputs: bool = True,
 ) -> None:
-    ipex.llm.functional.bgmv_expand_slice(
+    assert slice_size == lora_b_weights.size(-2)
+    assert slice_offset + slice_size <= output_tensor.size(1)
+    torch.ops._xpu_C.bgmv_expand_slice(
+        output_tensor,
         inputs,
         lora_b_weights,
-        output_tensor,
         lora_indices_tensor,
         slice_offset,
         slice_size,
diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py
index 00c00782896c..f031e1bfa341 100644
--- a/vllm/lora/punica_wrapper/punica_xpu.py
+++ b/vllm/lora/punica_wrapper/punica_xpu.py
@@ -11,8 +11,17 @@
 
 import torch
 
+from vllm import _custom_ops as ops
 from vllm.lora.layers import LoRAMapping
-from vllm.lora.ops.ipex_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
+from vllm.lora.ops.xpu_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
+from vllm.triton_utils import HAS_TRITON, triton
+from vllm.utils.math_utils import round_up
+
+if HAS_TRITON:
+    from vllm.lora.ops.triton_ops import (
+        LoRAKernelMeta,
+        fused_moe_lora,
+    )
 
 from .punica_base import PunicaWrapperBase
 
@@ -37,6 +46,12 @@ def __init__(
         torch._dynamo.mark_dynamic(self._embeddings_indices, 1)
         torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0)
 
+        self.lora_config = kwargs["lora_config"]
+        self.max_loras = self.lora_config.max_loras
+        self.token_mapping_meta = LoRAKernelMeta.make(
+            self.max_loras, max_num_batched_tokens, device=device
+        )
+
     def update_metadata(
         self,
         mapping: LoRAMapping,
@@ -206,11 +221,9 @@ def add_lora_linear(
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
-            # We set the buffer to be float32 by default, refer to:
-            # https://github.com/triton-lang/triton/issues/1387
             buffer = torch.zeros(  # type: ignore
                 (len(output_slices), x.size(0), r),
-                dtype=torch.float32,
+                dtype=x.dtype,
                 device=x.device,
             )
         self.add_shrink(
@@ -267,10 +280,142 @@ def add_lora_logits(
         x = x.view(-1, x.shape[-1])
         r = lora_b_stacked.size(-1)
         if buffer is None:
-            # We set the buffer to be float32 by default, refer to:
-            # https://github.com/triton-lang/triton/issues/1387
-            buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device)
+            buffer = torch.zeros((x.size(0), r), dtype=x.dtype, device=x.device)
         sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
         bgmv_shrink(x, lora_a_stacked, buffer, sampler_indices, scale)
         bgmv_expand(buffer, lora_b_stacked, y, sampler_indices, add_inputs=True)
         return y.view_as(y_org)
+
+    def moe_lora_align_block_size(
+        self,
+        topk_ids: torch.Tensor,
+        num_tokens: int,
+        block_size: int,
+        num_experts: int,
+        max_loras: int,
+        adapter_enabled: torch.Tensor,
+        expert_map: torch.Tensor | None = None,
+        pad_sorted_ids: bool = False,
+        naive_block_assignment: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Aligns tokens and experts into block-sized chunks for LoRA-based
+        mixture-of-experts (MoE) execution.
+        """
+        (token_lora_mapping, _, _, _, lora_ids, _, _) = (
+            self.token_mapping_meta.meta_args(
+                num_tokens, self.lora_config.specialize_active_lora
+            )
+        )
+        if naive_block_assignment:
+            expert_ids = topk_ids.reshape(-1)
+            sorted_ids = None
+            num_tokens_post_pad = None
+        else:
+            max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+            if pad_sorted_ids:
+                max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+            sorted_ids = torch.empty(
+                (max_loras * max_num_tokens_padded,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+            # Expert ids must be set default to -1 to prevent a blank block
+            expert_ids = torch.empty(
+                (max_loras * max_num_m_blocks,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            num_tokens_post_pad = torch.empty(
+                (max_loras), dtype=torch.int32, device=topk_ids.device
+            )
+
+            ops.moe_lora_align_block_size(
+                topk_ids,
+                token_lora_mapping,
+                num_experts,
+                block_size,
+                max_loras,
+                max_num_tokens_padded,
+                max_num_m_blocks,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+                adapter_enabled,
+                lora_ids,
+            )
+            if expert_map is not None:
+                expert_ids = expert_map[expert_ids]
+
+        return None, sorted_ids, expert_ids, num_tokens_post_pad
+
+    def add_lora_fused_moe(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        lora_a_stacked: tuple[torch.Tensor, ...],
+        lora_b_stacked: tuple[torch.Tensor, ...],
+        topk_weights: torch.Tensor,
+        sorted_token_ids: torch.Tensor | None,
+        expert_ids: torch.Tensor,
+        num_tokens_post_padded: torch.Tensor | None,
+        max_lora_rank: int,
+        top_k_num: int,
+        shrink_config,
+        expand_config,
+        adapter_enabled: torch.Tensor,
+        mul_routed_weight=False,
+        fully_sharded: bool = False,
+        offset: int = 0,
+        token_lora_mapping: torch.Tensor | None = None,
+    ):
+        """
+        Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer.
+        """
+        (
+            token_lora_mapping_meta,
+            _,
+            _,
+            _,
+            lora_ids,
+            _,
+            num_active_loras,
+        ) = self.token_mapping_meta.meta_args(
+            x.size(0), self.lora_config.specialize_active_lora
+        )
+        if token_lora_mapping is None:
+            token_lora_mapping = token_lora_mapping_meta
+        fused_moe_lora(
+            y,
+            x,
+            lora_a_stacked,
+            lora_b_stacked,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            token_lora_mapping,
+            max_lora_rank,
+            top_k_num,
+            lora_ids,
+            num_active_loras,
+            adapter_enabled,
+            shrink_config.get("BLOCK_SIZE_M", 64),
+            shrink_config.get("BLOCK_SIZE_N", 64),
+            shrink_config.get("BLOCK_SIZE_K", 32),
+            shrink_config.get("GROUP_SIZE_M", 8),
+            shrink_config.get("NUM_WARPS", 4),
+            shrink_config.get("NUM_STAGES", 3),
+            shrink_config.get("SPLIT_K", 1),
+            expand_config.get("BLOCK_SIZE_M", 64),
+            expand_config.get("BLOCK_SIZE_N", 64),
+            expand_config.get("BLOCK_SIZE_K", 32),
+            expand_config.get("GROUP_SIZE_M", 8),
+            expand_config.get("NUM_WARPS", 4),
+            expand_config.get("NUM_STAGES", 3),
+            expand_config.get("SPLIT_K", 1),
+            mul_routed_weight,
+            fully_sharded,
+            offset,
+        )

From 6042e66cd5304fc043d96aaa0c22d56f939af320 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Thu, 26 Feb 2026 02:05:40 -0600
Subject: [PATCH 014/154] [ROCm] Add extra step in config initialization to
 populate custom ops before compilation config init (#34848)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 vllm/config/vllm.py         |  2 +
 vllm/platforms/interface.py | 14 ++++++
 vllm/platforms/rocm.py      | 86 ++++++++++++++++++++-----------------
 3 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index fba3c64a9af0..127c16ac769a 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -809,6 +809,8 @@ def has_blocked_weights():
             if "-quant_fp8" not in custom_ops:
                 custom_ops.append("+quant_fp8")
 
+        current_platform.apply_config_platform_defaults(self)
+
         if self.compilation_config.mode is None:
             if self.optimization_level > OptimizationLevel.O0:
                 self.compilation_config.mode = CompilationMode.VLLM_COMPILE
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 75e716479cb9..5dae767572d2 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -393,6 +393,20 @@ def pre_register_and_update(
         """
         pass
 
+    @classmethod
+    def apply_config_platform_defaults(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Apply the platform-specific default values to the config.
+
+        This function is called during the initialization of global VllmConfig, after
+        parsing cli arguments.
+        It can modify the defaults of the config according to the platform. For example,
+        it can enable custom_ops based on the enabled features.
+
+        The config is passed by reference, so it can be modified in place.
+        """
+        pass
+
     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index e1e2ffb1d9a1..3808ecc6e47f 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -482,19 +482,61 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
         return device_props.total_memory
 
     @classmethod
-    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+    def apply_config_platform_defaults(cls, vllm_config: "VllmConfig") -> None:
         from vllm._aiter_ops import rocm_aiter_ops
         from vllm.config.compilation import CUDAGraphMode
 
-        cache_config = vllm_config.cache_config
         compilation_config = vllm_config.compilation_config
-        parallel_config = vllm_config.parallel_config
-        is_eager_execution = compilation_config == CUDAGraphMode.NONE
+        is_eager_execution = compilation_config.cudagraph_mode == CUDAGraphMode.NONE
         use_aiter_fused_moe = rocm_aiter_ops.is_fused_moe_enabled()
         use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
         use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled()
         use_aiter_fused_se = rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
         use_aiter_triton_rope = rocm_aiter_ops.is_triton_rotary_embed_enabled()
+        #  Aiter rms norm perform best when CUDA Graph capture is enabled.
+        if (
+            use_aiter_rms_norm
+            and not is_eager_execution
+            and "-rms_norm" not in compilation_config.custom_ops
+        ):
+            compilation_config.custom_ops.append("+rms_norm")
+
+        if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
+            compilation_config.custom_ops.append("+quant_fp8")
+
+        if use_aiter_fused_se and "-grouped_topk" in compilation_config.custom_ops:
+            logger.warning_once(
+                "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled, which "
+                "requires the 'grouped_topk' custom op. Overriding the "
+                "user-provided '-grouped_topk'."
+            )
+            compilation_config.custom_ops.remove("-grouped_topk")
+        # Ensure grouped_topk is always enabled when using AITER if
+        # its not disabled by user
+        if (
+            use_aiter_fused_moe
+            and "+grouped_topk" not in compilation_config.custom_ops
+            and "-grouped_topk" not in compilation_config.custom_ops
+        ):
+            compilation_config.custom_ops.append("+grouped_topk")
+        # Enable rotary embedding when using AITER if its not disabled by user
+        if (
+            use_aiter_triton_rope
+            and "+rotary_embedding" not in compilation_config.custom_ops
+            and "-rotary_embedding" not in compilation_config.custom_ops
+        ):
+            compilation_config.custom_ops.append("+rotary_embedding")
+
+        # Default dispatch to rocm's sparse_attn_indexer implementation
+        compilation_config.custom_ops.append("+sparse_attn_indexer")
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        from vllm.config.compilation import CUDAGraphMode
+
+        cache_config = vllm_config.cache_config
+        compilation_config = vllm_config.compilation_config
+        parallel_config = vllm_config.parallel_config
 
         if compilation_config.cudagraph_mode.has_full_cudagraphs():
             # decode context parallel does not support full cudagraphs
@@ -533,42 +575,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
 
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-        #  Aiter rms norm perform best when CUDA Graph capture is enabled.
-        if (
-            use_aiter_rms_norm
-            and not is_eager_execution
-            and "-rms_norm" not in compilation_config.custom_ops
-        ):
-            compilation_config.custom_ops.append("+rms_norm")
-
-        if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
-            compilation_config.custom_ops.append("+quant_fp8")
-
-        if use_aiter_fused_se and "-grouped_topk" in compilation_config.custom_ops:
-            logger.warning_once(
-                "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled, which "
-                "requires the 'grouped_topk' custom op. Overriding the "
-                "user-provided '-grouped_topk'."
-            )
-            compilation_config.custom_ops.remove("-grouped_topk")
-        # Ensure grouped_topk is always enabled when using AITER if
-        # its not disabled by user
-        if (
-            use_aiter_fused_moe
-            and "+grouped_topk" not in compilation_config.custom_ops
-            and "-grouped_topk" not in compilation_config.custom_ops
-        ):
-            compilation_config.custom_ops.append("+grouped_topk")
-        # Enable rotary embedding when using AITER if its not disabled by user
-        if (
-            use_aiter_triton_rope
-            and "+rotary_embedding" not in compilation_config.custom_ops
-            and "-rotary_embedding" not in compilation_config.custom_ops
-        ):
-            compilation_config.custom_ops.append("+rotary_embedding")
-
-        # Default dispatch to rocm's sparse_attn_indexer implementation
-        compilation_config.custom_ops.append("+sparse_attn_indexer")
 
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:

From ade81f17feeebef775e8cddf9a8f23848ec694a3 Mon Sep 17 00:00:00 2001
From: Kevin McKay <kevin.mckay@outlook.com>
Date: Thu, 26 Feb 2026 02:11:07 -0600
Subject: [PATCH 015/154] [Bugfix][Hardware][AMD] Gate FP4 ops on gfx950 to
 prevent MI300X crash (#35250)

Signed-off-by: c0de128 <kevin.mckay@outlook.com>
---
 vllm/_aiter_ops.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index 3414443e52cb..8ef34bfd6dce 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -1052,12 +1052,16 @@ def is_fp8bmm_enabled(cls) -> bool:
     @classmethod
     @if_aiter_supported
     def is_fp4bmm_enabled(cls) -> bool:
-        return cls._AITER_ENABLED and cls._FP4BMM_ENABLED
+        from vllm.platforms.rocm import on_gfx950
+
+        return cls._AITER_ENABLED and cls._FP4BMM_ENABLED and on_gfx950()
 
     @classmethod
     @if_aiter_supported
     def is_asm_fp4_gemm_dynamic_quant_enabled(cls) -> bool:
-        return cls._AITER_ENABLED and cls._FP4_GEMM_DYNAMIC_QUANT_ASM
+        from vllm.platforms.rocm import on_gfx950
+
+        return cls._AITER_ENABLED and cls._FP4_GEMM_DYNAMIC_QUANT_ASM and on_gfx950()
 
     @classmethod
     @if_aiter_supported

From 3827c8c55aaa6622fd96b0c846a38b94444ebb80 Mon Sep 17 00:00:00 2001
From: Krish Gupta <krishom70@gmail.com>
Date: Thu, 26 Feb 2026 14:44:07 +0530
Subject: [PATCH 016/154] [Test] Add tests for n parameter in chat completions
 API (#35283)

Signed-off-by: KrxGu <krishom70@gmail.com>
---
 tests/entrypoints/openai/test_chat.py | 206 ++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 0cc064cd8f12..c480adcc11bf 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -3,6 +3,7 @@
 
 # imports for structured outputs tests
 import json
+from collections import defaultdict
 
 import jsonschema
 import openai  # use the official client for correctness check
@@ -13,6 +14,11 @@
 import torch
 from openai import BadRequestError
 
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.sampling_params import SamplingParams
+
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
@@ -815,3 +821,203 @@ async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenA
 
     assert chat_output.keys() == invocation_output.keys()
     assert chat_output["choices"] == invocation_output["choices"]
+
+
+# Test n parameter for chat completions
+# Tests that the n parameter works correctly for regular sampling
+# (non-beam search) in chat completions, addressing issue #34305.
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_chat_completion_n_parameter_non_streaming(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    """Test that n parameter returns multiple choices for non-streaming requests."""
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the opposite of big?"},
+    ]
+
+    # Test with n=3
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=20,
+        temperature=0.7,
+        n=3,
+        stream=False,
+    )
+
+    assert len(chat_completion.choices) == 3
+
+    # Verify each choice has content and correct index
+    for i, choice in enumerate(chat_completion.choices):
+        assert choice.index == i
+        assert choice.message.content is not None
+        assert len(choice.message.content) > 0
+
+    # Verify all responses are different (highly likely with temperature > 0)
+    contents = [choice.message.content for choice in chat_completion.choices]
+    assert len(set(contents)) > 1, "Expected different responses with n=3"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_chat_completion_n_parameter_streaming(
+    client: openai.AsyncOpenAI, model_name: str
+):
+    """Test that n parameter returns multiple choices for streaming requests."""
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=15,
+        temperature=0.7,
+        n=2,
+        stream=True,
+    )
+
+    # Collect all chunks using defaultdict for dynamic handling
+    chunks_by_index = defaultdict(list)
+    async for chunk in stream:
+        for choice in chunk.choices:
+            if choice.delta.content:
+                chunks_by_index[choice.index].append(choice.delta.content)
+
+    # Verify both choices received content
+    assert len(chunks_by_index[0]) > 0, "Choice 0 received no content chunks"
+    assert len(chunks_by_index[1]) > 0, "Choice 1 received no content chunks"
+
+    # Reconstruct full responses
+    response_0 = "".join(chunks_by_index[0])
+    response_1 = "".join(chunks_by_index[1])
+
+    assert len(response_0) > 0, "Choice 0 has empty response"
+    assert len(response_1) > 0, "Choice 1 has empty response"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_chat_completion_n_with_seed(client: openai.AsyncOpenAI, model_name: str):
+    """Test that n parameter works correctly with seed parameter."""
+    messages = [
+        {"role": "user", "content": "Say hello."},
+    ]
+
+    # Test that seed parameter is accepted and works with n > 1
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.8,
+        n=2,
+        seed=42,
+        stream=False,
+    )
+
+    # Verify we get n=2 choices
+    assert len(chat_completion.choices) == 2
+
+    # Verify both choices have valid content
+    for i, choice in enumerate(chat_completion.choices):
+        assert choice.index == i
+        assert choice.message.content is not None
+        assert len(choice.message.content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_chat_completion_n_equals_1(client: openai.AsyncOpenAI, model_name: str):
+    """Test that n=1 (default) still works correctly."""
+    messages = [
+        {"role": "user", "content": "Hello!"},
+    ]
+
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.7,
+        n=1,
+        stream=False,
+    )
+
+    assert len(chat_completion.choices) == 1
+    assert chat_completion.choices[0].index == 0
+    assert chat_completion.choices[0].message.content is not None
+
+
+# Unit tests for n parameter in ChatCompletionRequest.to_sampling_params()
+def test_chat_completion_request_n_parameter_to_sampling_params():
+    """Test that n parameter is correctly passed to SamplingParams."""
+    # Test with n=3
+    request = ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "Hello"}],
+        n=3,
+        max_tokens=10,
+    )
+
+    sampling_params = request.to_sampling_params(
+        max_tokens=10,
+        default_sampling_params={},
+    )
+
+    assert isinstance(sampling_params, SamplingParams)
+    assert sampling_params.n == 3, f"Expected n=3, got n={sampling_params.n}"
+
+
+def test_chat_completion_request_n_parameter_default():
+    """Test that n parameter defaults to 1."""
+    request = ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "Hello"}],
+        # n not specified, should default to 1
+        max_tokens=10,
+    )
+
+    assert request.n == 1, "n should default to 1"
+    sampling_params = request.to_sampling_params(
+        max_tokens=10,
+        default_sampling_params={},
+    )
+
+    # SamplingParams.from_optional converts None to 1
+    assert sampling_params.n == 1, f"Expected n=1 (default), got n={sampling_params.n}"
+
+
+def test_chat_completion_request_n_parameter_various_values():
+    """Test n parameter with various values."""
+    for n_value in [1, 2, 5, 10]:
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "Test"}],
+            n=n_value,
+            max_tokens=10,
+        )
+
+        sampling_params = request.to_sampling_params(
+            max_tokens=10,
+            default_sampling_params={},
+        )
+
+        assert sampling_params.n == n_value, (
+            f"Expected n={n_value}, got n={sampling_params.n}"
+        )

From ab87f85231b07b107f16ddb9e985deb0d83975ae Mon Sep 17 00:00:00 2001
From: Jiangyun Zhu <riverclouds.zhu@qq.com>
Date: Thu, 26 Feb 2026 18:17:11 +0800
Subject: [PATCH 017/154] [Model] Ring 2.5 (#35102)

Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
---
 docs/models/supported_models.md               |    1 +
 tests/models/registry.py                      |    3 +
 .../layers/fla/ops/layernorm_guard.py         |   35 +-
 vllm/model_executor/layers/layernorm.py       |    1 +
 .../layers/mamba/linear_attn.py               |  189 ++-
 .../models/bailing_moe_linear.py              | 1246 +++++++++++++++++
 vllm/model_executor/models/registry.py        |    1 +
 .../model_arch_config_convertor.py            |    1 +
 8 files changed, 1407 insertions(+), 70 deletions(-)
 create mode 100644 vllm/model_executor/models/bailing_moe_linear.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index e2d505adebcc..d184041f33ca 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -372,6 +372,7 @@ th {
 | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
 | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ |
 | `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ |
+| `BailingMoeV2_5ForCausalLM` | Ling | `inclusionAI/Ling-2.5-1T`, `inclusionAI/Ring-2.5-1T` | | ✅︎ |
 | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
diff --git a/tests/models/registry.py b/tests/models/registry.py
index fe500254baa5..c522ce58bf04 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -206,6 +206,9 @@ def check_available_online(
     "BailingMoeV2ForCausalLM": _HfExamplesInfo(
         "inclusionAI/Ling-mini-2.0", trust_remote_code=True
     ),
+    "BailingMoeV2_5ForCausalLM": _HfExamplesInfo(
+        "inclusionAI/Ring-2.5-1T", trust_remote_code=True
+    ),
     "BambaForCausalLM": _HfExamplesInfo(
         "ibm-ai-platform/Bamba-9B-v1",
         extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"},
diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
index 74c08e032402..3abfbff9e9de 100644
--- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py
+++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
@@ -84,6 +84,7 @@ def layer_norm_fwd_kernel(
     HAS_Z: tl.constexpr,
     NORM_BEFORE_GATE: tl.constexpr,
     IS_RMS_NORM: tl.constexpr,
+    ACTIVATION: tl.constexpr,
 ):
     # Map the program id to the starting row of X and Y it should compute.
     row_start = tl.program_id(0) * ROWS_PER_BLOCK
@@ -112,7 +113,10 @@ def layer_norm_fwd_kernel(
     if HAS_Z and not NORM_BEFORE_GATE:
         Z_base = Z + rows[:, None] * stride_z_row + col_offsets
         z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
-        x *= z * tl.sigmoid(z)
+        if ACTIVATION == "swish" or ACTIVATION == "silu":
+            x *= z * tl.sigmoid(z)
+        elif ACTIVATION == "sigmoid":
+            x *= tl.sigmoid(z)
 
     # Compute mean and variance per row (reduce along axis 1)
     if not IS_RMS_NORM:
@@ -155,7 +159,10 @@ def layer_norm_fwd_kernel(
     if HAS_Z and NORM_BEFORE_GATE:
         Z_base = Z + rows[:, None] * stride_z_row + col_offsets
         z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
-        y *= z * tl.sigmoid(z)
+        if ACTIVATION == "swish" or ACTIVATION == "silu":
+            y *= z * tl.sigmoid(z)
+        elif ACTIVATION == "sigmoid":
+            y *= tl.sigmoid(z)
 
     # Write output
     tl.store(Y_base, y, mask=mask)
@@ -178,6 +185,7 @@ def layer_norm_fwd(
     group_size: int = None,
     norm_before_gate: bool = True,
     is_rms_norm: bool = False,
+    activation: str = "swish",
 ):
     M, N = x.shape
     if group_size is None:
@@ -232,9 +240,12 @@ def layer_norm_fwd(
         eps,
         BLOCK_N=BLOCK_N,
         ROWS_PER_BLOCK=rows_per_block,
+        HAS_BIAS=bias is not None,
+        HAS_Z=z is not None,
         NORM_BEFORE_GATE=norm_before_gate,
         IS_RMS_NORM=is_rms_norm,
         num_warps=num_warps,
+        ACTIVATION=activation,
     )
     return out, mean, rstd
 
@@ -252,6 +263,7 @@ def forward(
         group_size=None,
         norm_before_gate=True,
         is_rms_norm=False,
+        activation: str = "swish",
     ):
         """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
 
@@ -277,6 +289,7 @@ def forward(
             group_size=group_size,
             norm_before_gate=norm_before_gate,
             is_rms_norm=is_rms_norm,
+            activation=activation,
         )
         ctx.save_for_backward(x, weight, bias, mean, rstd, z)
         ctx.x_shape_og = x_shape_og
@@ -284,6 +297,7 @@ def forward(
         ctx.group_size = group_size
         ctx.norm_before_gate = norm_before_gate
         ctx.is_rms_norm = is_rms_norm
+        ctx.activation = activation
         return y.reshape(x_shape_og)
 
 
@@ -296,17 +310,25 @@ def layernorm_fn(
     group_size=None,
     norm_before_gate=True,
     is_rms_norm=False,
+    activation: str = "swish",
 ):
     return LayerNormFn.apply(
-        x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm
+        x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm, activation
     )
 
 
 def rmsnorm_fn(
-    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    activation: str = "swish",
 ):
     return LayerNormFn.apply(
-        x, weight, bias, z, eps, group_size, norm_before_gate, True
+        x, weight, bias, z, eps, group_size, norm_before_gate, True, activation
     )
 
 
@@ -359,6 +381,7 @@ def __init__(
         norm_before_gate: bool = False,
         device: torch.device | None = None,
         dtype: torch.dtype | None = None,
+        activation: str = "swish",
     ):
         """If group_size is not None, we do GroupNorm with each group having group_size elements.
         group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
@@ -366,6 +389,7 @@ def __init__(
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.eps = eps
+        self.activation = activation
         self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
         self.register_parameter("bias", None)
         self.group_size = group_size
@@ -385,4 +409,5 @@ def forward(self, x, z=None):
             eps=self.eps,
             group_size=self.group_size,
             norm_before_gate=self.norm_before_gate,
+            activation=self.activation,
         )
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index d8cf36bc225a..17b90c970123 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -592,6 +592,7 @@ def forward_cuda(
             eps=self.eps,
             group_size=self.group_size,
             norm_before_gate=self.norm_before_gate,
+            activation=self.activation,
         )
 
 
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 8b5f80f54527..802141881747 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
+from collections.abc import Callable
 
 import torch
 import torch.nn.functional as F
@@ -43,7 +44,6 @@ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
 
         self.weight.weight_loader = self.weight_loader
         self.variance_epsilon = eps
-        return
 
     @staticmethod
     def weight_loader(
@@ -56,7 +56,6 @@ def weight_loader(
         shard_size = loaded_weight.shape[0] // tp_world
         shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
         param.data.copy_(loaded_weight[shard])
-        return
 
     def _forward(
         self,
@@ -102,6 +101,101 @@ def forward_qk(
         return q, k
 
 
+def clear_linear_attention_cache_for_new_sequences(
+    kv_cache: torch.Tensor,
+    state_indices_tensor: torch.Tensor,
+    attn_metadata: LinearAttentionMetadata,
+) -> None:
+    num_prefills = getattr(attn_metadata, "num_prefills", 0)
+    if num_prefills <= 0:
+        return
+
+    num_decode_tokens = getattr(attn_metadata, "num_decode_tokens", 0)
+    for prefill_idx in range(num_prefills):
+        q_start = attn_metadata.query_start_loc[num_decode_tokens + prefill_idx]
+        q_end = attn_metadata.query_start_loc[num_decode_tokens + prefill_idx + 1]
+        query_len = q_end - q_start
+        context_len = (
+            attn_metadata.seq_lens[num_decode_tokens + prefill_idx] - query_len
+        )
+        if context_len == 0:
+            block_to_clear = state_indices_tensor[num_decode_tokens + prefill_idx]
+            kv_cache[block_to_clear, ...] = 0
+
+
+def linear_attention_decode(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slope_rate: torch.Tensor,
+    state_indices_tensor: torch.Tensor,
+    q_start: int = 0,
+    q_end: int | None = None,
+    slot_start: int = 0,
+    slot_end: int | None = None,
+    block_size: int = 32,
+) -> torch.Tensor:
+    q = q[q_start:q_end].unsqueeze(2).contiguous()
+    k = k[q_start:q_end].unsqueeze(2).contiguous()
+    v = v[q_start:q_end].unsqueeze(2).contiguous()
+    slot_id = state_indices_tensor[slot_start:slot_end]
+    return linear_decode_forward_triton(
+        q, k, v, kv_cache, slope_rate, slot_id, block_size
+    )
+
+
+def linear_attention_prefill_and_mix(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    kv_cache: torch.Tensor,
+    state_indices_tensor: torch.Tensor,
+    attn_metadata: LinearAttentionMetadata,
+    slope_rate: torch.Tensor,
+    block_size: int,
+    decode_fn: Callable[..., torch.Tensor],
+    prefix_fn: Callable[..., torch.Tensor],
+    layer_idx: int | None = None,
+) -> torch.Tensor:
+    hidden = []
+    for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)):
+        if _prefill_idx >= len(attn_metadata.query_start_loc):
+            break
+        if _prefill_idx >= len(state_indices_tensor):
+            break
+        offset = attn_metadata.num_decode_tokens
+        _start = attn_metadata.query_start_loc[offset + _prefill_idx]
+        _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1]
+        slot_id = state_indices_tensor[offset + _prefill_idx]
+        qs = q[_start:_end].transpose(0, 1).contiguous()
+        ks = k[_start:_end].transpose(0, 1).contiguous()
+        vs = v[_start:_end].transpose(0, 1).contiguous()
+        slice_layer_cache = kv_cache[slot_id, ...]
+        out_slice = prefix_fn(
+            qs,
+            ks,
+            vs,
+            slice_layer_cache,
+            slope_rate,
+            block_size,
+            layer_idx=layer_idx,
+        )
+        hidden.append(out_slice.contiguous())
+
+    if attn_metadata.num_decode_tokens > 0:
+        hidden_decode = decode_fn(
+            q, k, v, kv_cache, state_indices_tensor, attn_metadata
+        )
+        hidden.insert(0, hidden_decode)
+
+    if not hidden:
+        return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype)
+
+    hidden = torch.concat(hidden, dim=0).contiguous()
+    return hidden
+
+
 class MiniMaxText01LinearKernel:
     @staticmethod
     def jit_linear_forward_prefix(
@@ -258,50 +352,33 @@ def get_slopes_power_of_2(n):
     def _prefill_and_mix_infer(
         self, q, k, v, kv_cache, state_indices_tensor, attn_metadata
     ):
-        hidden = []
-        for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)):
-            if _prefill_idx >= len(attn_metadata.query_start_loc):
-                break
-            if _prefill_idx >= len(state_indices_tensor):
-                break
-            offset = attn_metadata.num_decode_tokens
-            _start = attn_metadata.query_start_loc[offset + _prefill_idx]
-            _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1]
-            slot_id = state_indices_tensor[offset + _prefill_idx]
-            qs = q[_start:_end].transpose(0, 1).contiguous()
-            ks = k[_start:_end].transpose(0, 1).contiguous()
-            vs = v[_start:_end].transpose(0, 1).contiguous()
-            slice_layer_cache = kv_cache[slot_id, ...]
-
-            out_slice = MiniMaxText01LinearKernel.jit_linear_forward_prefix(
-                qs,
-                ks,
-                vs,
-                slice_layer_cache,
-                self.tp_slope,
-                self.BLOCK,
-                layer_idx=self.layer_idx,
-            )
-            hidden.append(out_slice.contiguous())
-        if attn_metadata.num_decode_tokens > 0:
-            hidden_decode = self._decode_infer(
-                q, k, v, kv_cache, state_indices_tensor, attn_metadata
-            )
-            hidden.insert(0, hidden_decode)
-
-        if not hidden:
-            return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype)
-
-        hidden = torch.concat(hidden, dim=0).contiguous()
-        return hidden
+        return linear_attention_prefill_and_mix(
+            q=q,
+            k=k,
+            v=v,
+            kv_cache=kv_cache,
+            state_indices_tensor=state_indices_tensor,
+            attn_metadata=attn_metadata,
+            slope_rate=self.tp_slope,
+            block_size=self.BLOCK,
+            decode_fn=self._decode_infer,
+            prefix_fn=MiniMaxText01LinearKernel.jit_linear_forward_prefix,
+            layer_idx=self.layer_idx,
+        )
 
     def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
-        q = q[: attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-        k = k[: attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-        v = v[: attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-        slot_id = state_indices_tensor[: attn_metadata.num_decodes]
-        hidden = linear_decode_forward_triton(
-            q, k, v, kv_cache, self.tp_slope, slot_id, 32
+        hidden = linear_attention_decode(
+            q,
+            k,
+            v,
+            kv_cache,
+            self.tp_slope,
+            state_indices_tensor,
+            q_start=0,
+            q_end=attn_metadata.num_decode_tokens,
+            slot_start=0,
+            slot_end=attn_metadata.num_decodes,
+            block_size=32,
         )
         return hidden
 
@@ -338,27 +415,9 @@ def _forward(
         if attn_metadata is not None:
             kv_cache = self.kv_cache[forward_context.virtual_engine][0]
             state_indices_tensor = attn_metadata.state_indices_tensor
-
-            num_prefills = getattr(attn_metadata, "num_prefills", 0)
-            if num_prefills > 0:
-                num_decode_tokens = getattr(attn_metadata, "num_decode_tokens", 0)
-                for prefill_idx in range(num_prefills):
-                    q_start = attn_metadata.query_start_loc[
-                        num_decode_tokens + prefill_idx
-                    ]
-                    q_end = attn_metadata.query_start_loc[
-                        num_decode_tokens + prefill_idx + 1
-                    ]
-                    query_len = q_end - q_start
-                    context_len = (
-                        attn_metadata.seq_lens[num_decode_tokens + prefill_idx]
-                        - query_len
-                    )
-                    if context_len == 0:
-                        block_to_clear = state_indices_tensor[
-                            num_decode_tokens + prefill_idx
-                        ]
-                        kv_cache[block_to_clear, ...] = 0
+            clear_linear_attention_cache_for_new_sequences(
+                kv_cache, state_indices_tensor, attn_metadata
+            )
 
         decode_only = getattr(attn_metadata, "num_prefills", 0) == 0
         if attn_metadata is None:
diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py
new file mode 100644
index 000000000000..9b54ec634705
--- /dev/null
+++ b/vllm/model_executor/models/bailing_moe_linear.py
@@ -0,0 +1,1246 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.configuration_utils import PretrainedConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.forward_context import get_forward_context
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fla.ops.layernorm_guard import (
+    RMSNormGated,
+    layernorm_fn,
+)
+from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.abstract import MambaBase
+from vllm.model_executor.layers.mamba.linear_attn import (
+    MiniMaxText01LinearAttention,
+    MiniMaxText01LinearKernel,
+    MiniMaxText01RMSNormTP,
+    clear_linear_attention_cache_for_new_sequences,
+    linear_attention_decode,
+    linear_attention_prefill_and_mix,
+)
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateCopyFuncCalculator,
+    MambaStateDtypeCalculator,
+    MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.bailing_moe import BailingMLP
+from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
+
+from .interfaces import HasInnerState, IsHybrid, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+def is_linear_layer(layer_idx, layer_group_size):
+    if layer_idx is None:
+        return False
+    if layer_group_size > 0:
+        return (layer_idx + 1) % layer_group_size != 0
+    else:
+        return False
+
+
+def _build_rope_parameters(config: PretrainedConfig) -> dict | None:
+    rope_parameters = copy.deepcopy(getattr(config, "rope_parameters", None)) or {}
+    if "rope_theta" not in rope_parameters and hasattr(config, "rope_theta"):
+        rope_parameters["rope_theta"] = config.rope_theta
+    if "partial_rotary_factor" not in rope_parameters and hasattr(
+        config, "partial_rotary_factor"
+    ):
+        rope_parameters["partial_rotary_factor"] = config.partial_rotary_factor
+
+    rope_scaling = getattr(config, "rope_scaling", None)
+    if isinstance(rope_scaling, dict):
+        rope_scaling = copy.deepcopy(rope_scaling)
+        if "type" in rope_scaling and "rope_type" not in rope_scaling:
+            rope_scaling["rope_type"] = rope_scaling.pop("type")
+        rope_parameters.update(rope_scaling)
+
+    return rope_parameters or None
+
+
+class BailingMoeV25MLAAttention(nn.Module):
+    """
+    MLA Attention for BailingMoeV2.5 full attention layers.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        layer_id: int = 0,
+        prefix: str = "attention",
+        cache_config: CacheConfig | None = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.layer_id = layer_id
+        self.prefix = prefix
+
+        # MLA dimensions
+        self.qk_nope_head_dim = getattr(config, "qk_nope_head_dim", 128)
+        self.qk_rope_head_dim = getattr(config, "qk_rope_head_dim", 64)
+        self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
+        self.v_head_dim = getattr(config, "v_head_dim", 128)
+
+        # LoRA ranks
+        self.q_lora_rank = getattr(config, "q_lora_rank", None)
+        self.kv_lora_rank = getattr(config, "kv_lora_rank", 512)
+
+        tp_size = get_tensor_model_parallel_world_size()
+        assert self.num_heads % tp_size == 0
+        self.num_local_heads = self.num_heads // tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+
+        # KV projections
+        self.kv_a_layernorm = RMSNorm(
+            self.kv_lora_rank,
+            eps=config.rms_norm_eps,
+        )
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
+
+        # Output projection
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        if self.q_lora_rank is not None:
+            # Use fused_qkv_a_proj when q_lora_rank is set
+            self.fused_qkv_a_proj = MergedColumnParallelLinear(
+                self.hidden_size,
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fused_qkv_a_proj",
+                disable_tp=True,
+            )
+            self.q_a_layernorm = RMSNorm(
+                self.q_lora_rank,
+                eps=config.rms_norm_eps,
+            )
+            self.q_b_proj = ColumnParallelLinear(
+                self.q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
+            self.q_proj = None
+            self.kv_a_proj_with_mqa = None
+        else:
+            # Direct projections when no q_lora_rank
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.kv_a_proj_with_mqa",
+            )
+            self.fused_qkv_a_proj = None
+            self.q_a_layernorm = None
+            self.q_b_proj = None
+
+        rope_parameters = _build_rope_parameters(config)
+        max_position = getattr(config, "max_position_embeddings", 8192)
+        self.rotary_emb = get_rope(
+            head_size=self.qk_rope_head_dim,
+            max_position=max_position,
+            is_neox_style=False,
+            rope_parameters=rope_parameters or None,
+            dtype=torch.float32,
+        )
+
+        # Build MLAModules for MultiHeadLatentAttentionWrapper
+        mla_modules = MLAModules(
+            kv_a_layernorm=self.kv_a_layernorm,
+            kv_b_proj=self.kv_b_proj,
+            rotary_emb=self.rotary_emb,
+            o_proj=self.o_proj,
+            fused_qkv_a_proj=self.fused_qkv_a_proj,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
+            q_a_layernorm=self.q_a_layernorm,
+            q_b_proj=self.q_b_proj,
+            q_proj=self.q_proj,
+            indexer=None,
+            is_sparse=False,
+            topk_indices_buffer=None,
+        )
+
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
+            self.hidden_size,
+            self.num_local_heads,
+            self.scaling,
+            self.qk_nope_head_dim,
+            self.qk_rope_head_dim,
+            self.v_head_dim,
+            self.q_lora_rank,
+            self.kv_lora_rank,
+            mla_modules,
+            cache_config,
+            quant_config,
+            prefix,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+    ) -> torch.Tensor:
+        """Forward pass for MLA attention."""
+        return self.mla_attn(positions, hidden_states)
+
+
+class BailingMoEGate(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        params_dtype: torch.dtype | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        self.weight = nn.Parameter(
+            torch.empty(
+                (config.num_experts, config.hidden_size),
+                dtype=self.params_dtype,
+            ),
+        )
+        if getattr(config, "moe_router_enable_expert_bias", False):
+            self.expert_bias = nn.Parameter(
+                torch.empty((config.num_experts,), dtype=torch.float32),
+            )
+        else:
+            self.expert_bias = None
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states.to(self.weight.dtype), self.weight, None).to(
+            hidden_states.dtype
+        )
+        return logits
+
+
+class BailingMoeV25(nn.Module):
+    """Bailing MoE v2.5 - standalone implementation for linear attention model."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        layer_id: int = 0,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.layer_id = layer_id
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+        norm_topk_prob = getattr(config, "norm_topk_prob", None)
+        # Ring-2.5 reference implementations normalize routing weights by default.
+        self.norm_expert_prob = True if norm_topk_prob is None else bool(norm_topk_prob)
+        self.hidden_size = config.hidden_size
+        self.quant_config = quant_config
+        self.num_shared_experts = config.num_shared_experts
+        self.score_function = getattr(config, "score_function", None)
+        self.n_group = getattr(config, "n_group", None)
+        self.topk_group = getattr(config, "topk_group", None)
+        self.use_grouped_topk = self.n_group is not None and self.topk_group is not None
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+
+        router_dtype = getattr(config, "router_dtype", None)
+        if router_dtype is None or router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        else:
+            self.router_dtype = torch.bfloat16
+
+        # Gate for routing
+        self.gate = BailingMoEGate(
+            config=config,
+            params_dtype=self.router_dtype,
+            prefix=f"{prefix}.gate",
+        )
+        correction_bias = (
+            self.gate.expert_bias if self.gate.expert_bias is not None else None
+        )
+        if self.score_function is not None:
+            assert (self.score_function == "softmax" and correction_bias is None) or (
+                self.score_function == "sigmoid" and correction_bias is not None
+            ), (
+                "score_function and correction_bias should be "
+                "(softmax, None) or (sigmoid, not None)"
+            )
+
+        # Shared experts (using BailingMLP)
+        if self.num_shared_experts > 0:
+            if hasattr(config, "moe_shared_expert_intermediate_size"):
+                intermediate_size = config.moe_shared_expert_intermediate_size
+            else:
+                intermediate_size = config.moe_intermediate_size
+            intermediate_size *= config.num_shared_experts
+            self.shared_experts = BailingMLP(
+                intermediate_size=intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        # Routed experts using SharedFusedMoE
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            hidden_size=self.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=self.norm_expert_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            scoring_func=self.score_function,
+            e_score_correction_bias=correction_bias,
+            num_expert_group=self.n_group,
+            topk_group=self.topk_group,
+            use_grouped_topk=self.use_grouped_topk,
+            router_logits_dtype=self.router_dtype,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        # Ensure contiguous token-major layout before router/projections.
+        hidden_states = hidden_states.contiguous().view(-1, hidden_size)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states.to(self.router_dtype))
+        router_logits = router_logits.to(hidden_states.dtype)
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+
+        # Handle tuple return from SharedFusedMoE
+        if self.shared_experts is not None:
+            shared_output, final_hidden_states = final_hidden_states
+        else:
+            shared_output = None
+
+        final_hidden_states *= self.routed_scaling_factor
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+
+BailingRMSNormTP = MiniMaxText01RMSNormTP
+
+
+class BailingGroupRMSNormGate(RMSNormGated):
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        group_size=None,
+        norm_before_gate=True,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            hidden_size,
+            eps=eps,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            device=device,
+            dtype=dtype,
+            activation="sigmoid",
+        )
+        # Add custom weight loader for TP sharding
+        self.weight.weight_loader = self._weight_loader
+
+    @staticmethod
+    def _weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None:
+        """Load weight with TP sharding."""
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_size = loaded_weight.shape[0] // tp_size
+        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        param.data.copy_(loaded_weight[shard].contiguous())
+
+
+class BailingMoELinearAttention(nn.Module, MambaBase):
+    """
+    Bailing MoE Linear Attention implementation using minimax backend.
+
+    This implements the linear attention mechanism from sglang, adapted for vLLM's
+    v1 engine with MambaBase interface support.
+    """
+
+    @property
+    def mamba_type(self) -> str:
+        return "linear_attention"
+
+    def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
+        """Return state shape for linear attention cache.
+
+        Must match the calculation in get_mamba_state_shape_from_config.
+        """
+        return MambaStateShapeCalculator.linear_attention_state_shape(
+            num_heads=self.total_num_heads,
+            tp_size=self.tp_size,
+            head_dim=self.head_dim,
+        )
+
+    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
+        """Return state dtype for linear attention cache.
+
+        Must match the calculation in get_mamba_state_dtype_from_config.
+        """
+        return MambaStateDtypeCalculator.linear_attention_state_dtype(
+            self.model_config.dtype,
+            self.cache_config.mamba_cache_dtype,
+        )
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        layer_id: int = 0,
+        prefix: str = "linear_attn",
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+    ):
+        super().__init__()
+
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.total_kv_heads = config.num_attention_heads  # MHA
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.prefix = prefix
+
+        self.head_dim = (
+            config.head_dim
+            if hasattr(config, "head_dim")
+            else config.hidden_size // self.total_num_heads
+        )
+
+        self.hidden_inner_size = self.head_dim * self.total_num_heads
+        self.scaling = self.head_dim**-0.5
+
+        assert self.total_num_heads % self.tp_size == 0
+        self.tp_heads = self.total_num_heads // self.tp_size
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = getattr(config, "rope_theta", 600000)
+
+        self.tp_kv_heads = self.total_kv_heads // self.tp_size
+        self.q_size_per_rank = self.head_dim * self.tp_heads
+        self.kv_size_per_rank = self.head_dim * self.tp_kv_heads
+
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        self.linear_backend = "minimax"
+        self.linear_scale = self.linear_backend == "minimax"
+        self.linear_rope = getattr(config, "linear_rope", True)
+        if hasattr(config, "use_linear_silu"):
+            self.linear_silu = config.use_linear_silu
+        elif hasattr(config, "linear_silu"):
+            self.linear_silu = config.linear_silu
+        else:
+            self.linear_silu = False
+
+        # Block size for lightning attention
+        self.BLOCK = getattr(config, "block", 256)
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_heads,  # MHA: kv_heads = num_heads
+            bias=(config.use_bias or config.use_qkv_bias),
+            quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
+        )
+
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.g_proj = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_inner_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.g_proj",
+        )
+        self.dense = RowParallelLinear(
+            self.hidden_inner_size,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+            reduce_results=True,
+        )
+
+        self.group_norm_size = getattr(config, "group_norm_size", 1)
+        self.rms_norm_eps = float(getattr(config, "rms_norm_eps", 1e-5))
+        assert self.tp_size <= self.group_norm_size, (
+            "tp_size must be <= group_norm_size for local rms norm"
+        )
+        assert self.group_norm_size % self.tp_size == 0, (
+            "group_norm_size must be divisible by tp_size"
+        )
+
+        # When group_norm_size == 1, group_size equals hidden_size // tp_size
+        self.g_norm = BailingGroupRMSNormGate(
+            hidden_size=self.hidden_inner_size // self.tp_size,
+            eps=self.rms_norm_eps,
+            group_size=(
+                self.hidden_inner_size // self.group_norm_size
+                if self.group_norm_size > 1
+                else self.hidden_inner_size // self.tp_size
+            ),
+        )
+
+        # use fp32 rotary embedding
+        rope_parameters = _build_rope_parameters(config)
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=self.max_position_embeddings,
+            is_neox_style=True,
+            dtype=torch.float32,
+            rope_parameters=rope_parameters or None,
+        )
+
+        # Build slope tensor for linear attention decay
+        num_hidden_layers = config.num_hidden_layers
+        slope_rate = MiniMaxText01LinearAttention._build_slope_tensor(
+            self.total_num_heads
+        )
+        if num_hidden_layers <= 1:
+            self.slope_rate = slope_rate * (1 + 1e-5)
+        else:
+            self.slope_rate = slope_rate * (
+                1 - layer_id / (num_hidden_layers - 1) + 1e-5
+            )
+        self.tp_slope = self.slope_rate[
+            self.tp_rank * self.tp_heads : (self.tp_rank + 1) * self.tp_heads
+        ].contiguous()
+
+        # Register for compilation
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+
+    @staticmethod
+    def weight_direct_load(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+        """Load weight for linear attention layers.
+
+        For FP8 quantized parameters, we need to use the weight_loader if available,
+        as it handles special cases like tensor parallelism sharding.
+        """
+        # Check if param has a weight_loader (for vLLM ModelWeightParameter)
+        weight_loader = getattr(param, "weight_loader", None)
+        if weight_loader is not None:
+            # Use the weight_loader which handles TP sharding and quantization
+            weight_loader(param, loaded_weight)
+        else:
+            # Fall back to direct copy for standard tensors
+            assert param.size() == loaded_weight.size(), (
+                f"Shape mismatch: {param.shape} vs {loaded_weight.shape}"
+            )
+            param.data.copy_(loaded_weight)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        positions: torch.Tensor,
+    ) -> None:
+        """Forward method called by torch.ops.vllm.linear_attention"""
+        torch.ops.vllm.linear_attention(
+            hidden_states,
+            output,
+            positions,
+            self.prefix,
+        )
+
+    def _forward(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        positions: torch.Tensor,
+    ) -> None:
+        """Actual forward implementation."""
+        forward_context = get_forward_context()
+        attn_metadata: AttentionMetadata = forward_context.attn_metadata
+        if attn_metadata is not None:
+            assert isinstance(attn_metadata, dict)
+            attn_metadata = attn_metadata[self.prefix]
+            assert isinstance(attn_metadata, LinearAttentionMetadata)
+            num_actual_tokens = (
+                attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
+            )
+        else:
+            num_actual_tokens = hidden_states.shape[0]
+
+        # QKV projection
+        qkv, _ = self.query_key_value(hidden_states[:num_actual_tokens])
+
+        # use rotary_emb support fp32
+        qkv = qkv.to(torch.float32)
+        if self.linear_silu:
+            qkv = F.silu(qkv)
+
+        # Split q, k, v
+        q, k, v = torch.split(
+            qkv,
+            [self.q_size_per_rank, self.kv_size_per_rank, self.kv_size_per_rank],
+            dim=-1,
+        )
+
+        # Apply QK norm if needed
+        if self.use_qk_norm:
+            q = q.reshape(-1, self.tp_heads, self.head_dim)
+            k = k.reshape(-1, self.tp_kv_heads, self.head_dim)
+            q = layernorm_fn(
+                q,
+                self.query_layernorm.weight.data,
+                bias=None,
+                eps=self.rms_norm_eps,
+                is_rms_norm=True,
+            )
+            k = layernorm_fn(
+                k,
+                self.key_layernorm.weight.data,
+                bias=None,
+                eps=self.rms_norm_eps,
+                is_rms_norm=True,
+            )
+            q = q.reshape(-1, self.q_size_per_rank)
+            k = k.reshape(-1, self.kv_size_per_rank)
+
+        # Apply rotary embeddings
+        if self.linear_rope:
+            q, k = self.rotary_emb(positions[:num_actual_tokens], q, k)
+
+        # Reshape to [batch, heads, seq_len, head_dim]
+        q = q.view((qkv.shape[0], self.tp_heads, self.head_dim))
+        k = k.view((qkv.shape[0], self.tp_kv_heads, self.head_dim))
+        v = v.view((qkv.shape[0], self.tp_kv_heads, self.head_dim))
+
+        # Apply scaling if using minimax backend
+        if self.linear_scale:
+            q = q * self.scaling
+
+        # Get KV cache and state indices
+        if attn_metadata is not None:
+            kv_cache = self.kv_cache[forward_context.virtual_engine][0]
+            state_indices_tensor = attn_metadata.state_indices_tensor
+            clear_linear_attention_cache_for_new_sequences(
+                kv_cache, state_indices_tensor, attn_metadata
+            )
+
+        # Compute attention
+        decode_only = getattr(attn_metadata, "num_prefills", 0) == 0
+        if attn_metadata is None:
+            hidden = torch.empty(
+                (q.shape[0], q.shape[1] * q.shape[2]), device=q.device, dtype=q.dtype
+            )
+        else:
+            if not decode_only:
+                hidden = self._prefill_and_mix_infer(
+                    q, k, v, kv_cache, state_indices_tensor, attn_metadata
+                )
+            else:
+                hidden = self._decode_infer(
+                    q, k, v, kv_cache, state_indices_tensor, attn_metadata
+                )
+
+        # Apply group norm and gate (matching SGLang behavior)
+        gate, _ = self.g_proj(hidden_states[:num_actual_tokens])
+
+        if self.group_norm_size > 1:
+            hidden = self.g_norm(hidden, gate)
+        else:
+            hidden = self.g_norm(hidden)
+            hidden = F.sigmoid(gate) * hidden
+
+        hidden = hidden.to(hidden_states.dtype)
+
+        # Output projection
+        dense_out, _ = self.dense(hidden)
+        output[:num_actual_tokens] = dense_out
+
+    def _prefill_and_mix_infer(
+        self, q, k, v, kv_cache, state_indices_tensor, attn_metadata
+    ):
+        """Handle prefill (mixed with decode if any)."""
+        return linear_attention_prefill_and_mix(
+            q=q,
+            k=k,
+            v=v,
+            kv_cache=kv_cache,
+            state_indices_tensor=state_indices_tensor,
+            attn_metadata=attn_metadata,
+            slope_rate=self.tp_slope,
+            block_size=self.BLOCK,
+            decode_fn=self._decode_infer,
+            prefix_fn=MiniMaxText01LinearKernel.jit_linear_forward_prefix,
+            layer_idx=self.layer_id,
+        )
+
+    def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
+        """Handle decode (single token per sequence)."""
+        num_prefill_tokens = attn_metadata.num_prefill_tokens
+        num_prefills = attn_metadata.num_prefills
+        hidden = linear_attention_decode(
+            q,
+            k,
+            v,
+            kv_cache,
+            self.tp_slope,
+            state_indices_tensor,
+            q_start=num_prefill_tokens,
+            q_end=None,
+            slot_start=num_prefills,
+            slot_end=None,
+            block_size=32,
+        )
+        return hidden
+
+
+class BailingMoeV25DecoderLayer(nn.Module):
+    """Decoder layer supporting both linear and full attention."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        layer_id: int = 0,
+        prefix: str = "layer",
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+
+        # Determine attention type (0 = linear, 1 = full)
+        self.attention_type = getattr(config, "attention_type", 1)
+
+        if self.attention_type == 0:  # Linear attention
+            self.self_attn = BailingMoELinearAttention(
+                config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+                prefix=f"{prefix}.self_attn",
+                model_config=model_config,
+                cache_config=cache_config,
+            )
+        else:  # Full attention
+            self.self_attn = BailingMoeV25MLAAttention(
+                config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+                prefix=f"{prefix}.self_attn",
+                cache_config=cache_config,
+            )
+
+        # MLP/MoE
+        is_moe_layer = config.num_experts > 1 and layer_id >= getattr(
+            config, "first_k_dense_replace", 0
+        )
+
+        if is_moe_layer:
+            self.mlp = BailingMoeV25(
+                config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = BailingMLP(
+                intermediate_size=config.intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                reduce_results=True,
+                prefix=f"{prefix}.mlp",
+            )
+
+        # Layer norms
+        rms_norm_eps = float(getattr(config, "rms_norm_eps", 1e-5))
+        self.input_layernorm = RMSNorm(self.hidden_size, eps=rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(self.hidden_size, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        attn_metadata: AttentionMetadata | None = None,
+        residual: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # Input layernorm
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        # Self attention
+        if self.attention_type == 0:
+            # Linear attention uses output tensor
+            self_attention_output = torch.zeros_like(hidden_states)
+            self.self_attn(
+                hidden_states=hidden_states,
+                output=self_attention_output,
+                positions=positions,
+            )
+        else:
+            # Full attention
+            self_attention_output = self.self_attn(hidden_states, positions)
+
+        hidden_states, residual = self.post_attention_layernorm(
+            self_attention_output, residual
+        )
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
+class BailingMoeV25Model(nn.Module):
+    """Bailing MoE v2.5 Model with hybrid attention support."""
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        quant_config = vllm_config.quant_config
+        cache_config = vllm_config.cache_config
+
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_dim = config.hidden_size
+
+        # Determine layer types based on layer_group_size
+        self.layer_group_size = getattr(config, "layer_group_size", 1)
+        self.num_layers = config.num_hidden_layers
+
+        # decoder_attention_types: 0 = linear, 1 = full
+        self.decoder_attention_types = [
+            0 if is_linear_layer(i, self.layer_group_size) else 1
+            for i in range(self.num_layers)
+        ]
+
+        # Embeddings
+        if get_pp_group().is_first_rank:
+            self.word_embeddings = VocabParallelEmbedding(
+                self.vocab_size,
+                self.embed_dim,
+                org_num_embeddings=self.vocab_size,
+            )
+        else:
+            from vllm.model_executor.models.utils import PPMissingLayer
+
+            self.word_embeddings = PPMissingLayer()
+
+        # Layers
+        def layer_fn(prefix):
+            layer_idx = int(prefix.split(".")[-1])
+            layer_config = copy.deepcopy(config)
+            layer_config.attention_type = self.decoder_attention_types[layer_idx]
+
+            return BailingMoeV25DecoderLayer(
+                config=layer_config,
+                quant_config=quant_config,
+                layer_id=layer_idx,
+                prefix=prefix,
+                model_config=model_config,
+                cache_config=cache_config,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            self.num_layers, layer_fn, prefix=f"{prefix}.layers"
+        )
+
+        # Final norm
+        norm_kwargs = {}
+        if hasattr(config, "rms_norm_eps"):
+            norm_kwargs["eps"] = config.rms_norm_eps
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, **norm_kwargs)
+        else:
+            from vllm.model_executor.models.utils import PPMissingLayer
+
+            self.norm = PPMissingLayer()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.word_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        forward_context = get_forward_context()
+        attn_metadata = forward_context.attn_metadata
+
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is None:
+                hidden_states = self.word_embeddings(input_ids)
+            else:
+                hidden_states = inputs_embeds
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states, residual = layer(
+                hidden_states=hidden_states,
+                positions=positions,
+                attn_metadata=attn_metadata,
+                residual=residual,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        else:
+            if residual is not None:
+                hidden_states, _ = self.norm(hidden_states, residual)
+            else:
+                hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Get expert parameter mapping for MoE layers."""
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint weights with simplified mapping."""
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        # Stacked parameter mappings (fused projections)
+        stacked_mappings = [
+            (".fused_qkv_a_proj", ".q_a_proj", 0),
+            (".fused_qkv_a_proj", ".kv_a_proj_with_mqa", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        # Expert parameter mappings from FusedMoE
+        expert_mappings = list(self.get_expert_mapping())
+
+        def load_param(name: str, tensor: torch.Tensor, shard_id=None) -> bool:
+            """Load a single parameter."""
+            if name not in params_dict or is_pp_missing_parameter(name, self):
+                return False
+            if name.endswith(".bias") and name not in params_dict:
+                return False
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+
+            if shard_id is None:
+                weight_loader(param, tensor)
+            elif isinstance(shard_id, int):
+                weight_loader(param, tensor, shard_id)
+            else:
+                # Expert param: (expert_id, shard_id)
+                weight_loader(
+                    param, tensor, name, expert_id=shard_id[0], shard_id=shard_id[1]
+                )
+
+            loaded_params.add(name)
+            return True
+
+        def normalize_name(name: str) -> str | None:
+            """Normalize checkpoint name to model parameter name."""
+            # Skip special weights
+            if name.startswith("model.mtp"):
+                return None
+            # Remove 'model.' prefix if present
+            # (e.g., 'model.layers.0...' -> 'layers.0...')
+            name = name.removeprefix("model.")
+            # Map attention.dense based on layer type
+            if "attention.dense" in name:
+                layer_idx = (
+                    int(name.split("layers.")[1].split(".")[0])
+                    if "layers." in name
+                    else 0
+                )
+                attn_name = (
+                    "self_attn.dense"
+                    if is_linear_layer(layer_idx, self.config.layer_group_size)
+                    else "self_attn.o_proj"
+                )
+                name = name.replace("attention.dense", attn_name)
+
+            # Standard mappings
+            name = name.replace("attention.", "self_attn.")
+            name = name.replace(
+                "mlp.gate.e_score_correction_bias", "mlp.gate.expert_bias"
+            )
+
+            return maybe_remap_kv_scale_name(name, params_dict)
+
+        for orig_name, weight in weights:
+            norm_name = normalize_name(orig_name)
+            if norm_name is None:
+                continue
+
+            # Try stacked mappings
+            loaded = False
+            for param_suf, weight_suf, shard_id in stacked_mappings:
+                if weight_suf not in norm_name:
+                    continue
+                mapped = norm_name.replace(weight_suf, param_suf).replace(
+                    "attention.", "self_attn."
+                )
+                if load_param(mapped, weight, shard_id):
+                    loaded = True
+                    break
+            if loaded:
+                continue
+
+            # Handle expert weights
+            if "mlp.experts" in norm_name:
+                # Expert bias
+                if (
+                    "mlp.experts.e_score_correction_bias" in norm_name
+                    or "mlp.experts.expert_bias" in norm_name
+                ):
+                    alt = norm_name.replace(
+                        "mlp.experts.e_score_correction_bias", "mlp.gate.expert_bias"
+                    ).replace("mlp.experts.expert_bias", "mlp.gate.expert_bias")
+                    if load_param(alt, weight) or load_param(norm_name, weight):
+                        continue
+
+                # Routed experts
+                for param_name, weight_name, expert_id, shard_id in expert_mappings:
+                    if weight_name not in norm_name:
+                        continue
+                    mapped = norm_name.replace(weight_name, param_name)
+                    if load_param(mapped, weight, (expert_id, shard_id)):
+                        break
+                continue
+
+            # General parameters
+            load_param(norm_name, weight)
+
+        return loaded_params
+
+
+class BailingMoeV25ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsPP):
+    """Bailing MoE v2.5 For CausalLM."""
+
+    packed_modules_mapping = {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = BailingMoeV25Model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+            )
+            self.logits_processor = LogitsProcessor(config.vocab_size)
+        else:
+            self.lm_head = PPMissingLayer()
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.logits_processor(self.lm_head, hidden_states)
+
+    def make_empty_intermediate_tensors(
+        self, batch_size: int, dtype: torch.dtype, device: torch.device
+    ) -> IntermediateTensors:
+        return IntermediateTensors(
+            {
+                "hidden_states": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+                "residual": torch.zeros(
+                    (batch_size, self.config.hidden_size), dtype=dtype, device=device
+                ),
+            }
+        )
+
+    @classmethod
+    def get_mamba_state_shape_from_config(
+        cls,
+        vllm_config: VllmConfig,
+    ) -> tuple[tuple[int, ...], ...]:
+        """Calculate shape for linear attention cache."""
+        config = vllm_config.model_config.hf_config
+        tp_size = vllm_config.parallel_config.tensor_parallel_size
+
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+
+        # Return base state shape from linear attention (no padding)
+        return MambaStateShapeCalculator.linear_attention_state_shape(
+            num_heads=config.num_attention_heads,
+            tp_size=tp_size,
+            head_dim=head_dim,
+        )
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(
+        cls,
+        vllm_config: VllmConfig,
+    ) -> tuple[torch.dtype, ...]:
+        return MambaStateDtypeCalculator.linear_attention_state_dtype(
+            vllm_config.model_config.dtype,
+            vllm_config.cache_config.mamba_cache_dtype,
+        )
+
+    @classmethod
+    def get_mamba_state_copy_func(cls) -> tuple:
+        return MambaStateCopyFuncCalculator.linear_attention_state_copy_func()
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 7d9fc0226369..6bb8423db6f4 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -81,6 +81,7 @@
     "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),
     "BailingMoeForCausalLM": ("bailing_moe", "BailingMoeForCausalLM"),
     "BailingMoeV2ForCausalLM": ("bailing_moe", "BailingMoeV2ForCausalLM"),
+    "BailingMoeV2_5ForCausalLM": ("bailing_moe_linear", "BailingMoeV25ForCausalLM"),
     "BambaForCausalLM": ("bamba", "BambaForCausalLM"),
     "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
     "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index 5fc737e8ee90..b4e6508fad79 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -245,6 +245,7 @@ def is_deepseek_mla(self) -> bool:
             "longcat_flash",
             "pangu_ultra_moe",
             "pangu_ultra_moe_mtp",
+            "bailing_hybrid",
         ):
             return self.hf_text_config.kv_lora_rank is not None
         elif self.hf_text_config.model_type == "eagle":

From 02acd16861bc6388ab79b6d7c9abb20c0237426e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophie=20du=20Cou=C3=A9dic?= <sop@zurich.ibm.com>
Date: Thu, 26 Feb 2026 11:17:43 +0100
Subject: [PATCH 018/154] [Benchmarks] Plot benchmark timeline and requests
 statistics (#35220)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sophie du Couédic <sop@zurich.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 setup.py                 |   2 +-
 vllm/benchmarks/plot.py  | 316 +++++++++++++++++++++++++++++++++++++++
 vllm/benchmarks/serve.py | 166 +++++++++++++++++---
 3 files changed, 466 insertions(+), 18 deletions(-)
 create mode 100644 vllm/benchmarks/plot.py

diff --git a/setup.py b/setup.py
index 8dea355da7c8..a6f2019e5bdb 100644
--- a/setup.py
+++ b/setup.py
@@ -1033,7 +1033,7 @@ def _read_requirements(filename: str) -> list[str]:
     ext_modules=ext_modules,
     install_requires=get_requirements(),
     extras_require={
-        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
+        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"],
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.2.2"],
         "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
diff --git a/vllm/benchmarks/plot.py b/vllm/benchmarks/plot.py
new file mode 100644
index 000000000000..3f36ede721ad
--- /dev/null
+++ b/vllm/benchmarks/plot.py
@@ -0,0 +1,316 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Generate plots for benchmark results."""
+
+from pathlib import Path
+from typing import Any
+
+from vllm.utils.import_utils import PlaceholderModule
+
+try:
+    import plotly.express as px
+    import plotly.io as pio
+except ImportError:
+    _plotly = PlaceholderModule("plotly")
+    px = _plotly.placeholder_attr("express")
+    pio = _plotly.placeholder_attr("io")
+
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    _matplotlib = PlaceholderModule("matplotlib")
+    plt = _matplotlib.placeholder_attr("pyplot")
+
+
+def generate_timeline_plot(
+    results: list[dict[str, Any]],
+    output_path: Path,
+    colors: list[str] | None = None,
+    itl_thresholds: list[float] | None = None,
+    labels: list[str] | None = None,
+) -> None:
+    """
+    Generate an HTML timeline plot from benchmark results.
+
+    Args:
+        results: List of per-request result dictionaries containing:
+            - start_time: Request start time (seconds)
+            - ttft: Time to first token (seconds)
+            - itl: List of inter-token latencies (seconds)
+            - latency: Total request latency (seconds)
+            - prompt_len: Number of prompt tokens
+            - output_tokens: Number of output tokens
+        output_path: Path where the HTML file will be saved
+        colors: List of colors for ITL categories (default: green, orange, red, black)
+        itl_thresholds: ITL thresholds in seconds (default: [1.0, 4.0, 6.0])
+        labels: Labels for ITL categories (default based on thresholds)
+    """
+
+    # Set defaults
+    if colors is None:
+        colors = ["#109618", "#FF7F0E", "#D62728"]
+    if itl_thresholds is None:
+        itl_thresholds = [0.025, 0.050]
+    if labels is None:
+        labels = [
+            f"ITL < {itl_thresholds[0] * 1000:.0f}ms",
+            f"{itl_thresholds[0] * 1000:.0f}ms ≤ ITL < {itl_thresholds[1] * 1000:.0f}ms",  # noqa
+            f"ITL ≥ {itl_thresholds[1] * 1000:.0f}ms",
+        ]
+
+    labels_colors = {"TTFT": "#636EFA", **dict(zip(labels, colors))}
+    labels_order = ["TTFT"] + labels
+
+    timeline_data = construct_timeline_data(results, itl_thresholds, labels)
+
+    if not timeline_data:
+        print("No timeline data to plot")
+        return
+
+    # Create the plot
+    fig = px.timeline(
+        timeline_data,
+        x_start="start",
+        x_end="end",
+        y="request_id",
+        color="type",
+        color_discrete_map=labels_colors,
+        category_orders={"type": labels_order},
+        hover_data=[
+            "prompt_tokens",
+            "output_tokens",
+            "req_start_time",
+            "req_finish_time",
+            "segment_start",
+            "segment_end",
+            "duration",
+        ],
+    )
+
+    # Customize hover template to show only time without date
+    fig.update_traces(
+        hovertemplate="<b>%{y}</b><br>"
+        "Type: %{fullData.name}<br>"
+        "Start: %{customdata[4]}<br>"
+        "End: %{customdata[5]}<br>"
+        "Duration: %{customdata[6]}<br>"
+        "Prompt Tokens: %{customdata[0]}<br>"
+        "Output Tokens: %{customdata[1]}<br>"
+        "Request Start Time: %{customdata[2]}<br>"
+        "Request End Time: %{customdata[3]}<br>"
+        "<extra></extra>"
+    )
+
+    fig.update_yaxes(autorange="reversed")
+    fig.update_layout(
+        xaxis_title="Time",
+        yaxis_title="Request ID",
+        showlegend=True,
+    )
+
+    # Save to HTML
+    pio.write_html(fig, str(output_path))
+    print(f"Timeline plot saved to: {output_path}")
+
+
+def construct_timeline_data(
+    requests_data: list[dict[str, Any]],
+    itl_thresholds: list[float],
+    labels: list[str],
+) -> list[dict[str, Any]]:
+    """
+    Construct timeline data from request results.
+
+    Args:
+        requests_data: List of per-request result dictionaries
+        itl_thresholds: ITL thresholds in seconds
+        labels: Labels for ITL categories
+
+    Returns:
+        List of timeline segments for plotting
+    """
+
+    def tostr(sec_time: float) -> str:
+        """Convert seconds to HH:MM:SS.mmm format."""
+        h = int(sec_time // 3600)
+        assert h < 100, "time seems to last more than 100 hours"
+        m = int((sec_time % 3600) // 60)
+        s = sec_time % 60
+        return f"{h:02d}:{m:02d}:{s:06.3f}"
+
+    def itl_type(itl: float) -> str:
+        """Categorize ITL based on thresholds."""
+        if itl < itl_thresholds[0]:
+            return labels[0]
+        elif itl < itl_thresholds[1]:
+            return labels[1]
+        else:
+            return labels[2]
+
+    # Find the earliest start time to use as t0
+    t0 = None
+    for request in requests_data:
+        start_time = request.get("start_time")
+        if start_time is not None and (t0 is None or start_time < t0):
+            t0 = start_time
+
+    if t0 is None:
+        return []
+
+    timeline_data = []
+
+    for i, request in enumerate(requests_data):
+        start_time = request.get("start_time")
+        ttft = request.get("ttft")
+        itl = request.get("itl", [])
+        latency = request.get("latency")
+        prompt_len = request.get("prompt_len", 0)
+        output_tokens = request.get("output_tokens", 0)
+
+        # Skip requests without required data
+        if start_time is None or ttft is None or latency is None:
+            continue
+
+        # Normalize start time
+        start_time = start_time - t0
+        start_time_str = tostr(start_time)
+
+        # TTFT segment
+        ttft_end = start_time + ttft
+        ttft_end_str = tostr(ttft_end)
+
+        timeline_data.append(
+            {
+                "request_id": f"Req {i}",
+                "start": start_time_str,
+                "end": ttft_end_str,
+                "type": "TTFT",
+                "prompt_tokens": prompt_len,
+                "output_tokens": output_tokens,
+                "req_start_time": tostr(start_time),
+                "req_finish_time": tostr(start_time + latency),
+                "segment_start": start_time_str,
+                "segment_end": ttft_end_str,
+                "duration": f"{ttft:.3f}s",
+            }
+        )
+
+        # ITL segments
+        prev_time = ttft_end
+        prev_time_str = ttft_end_str
+
+        for itl_value in itl:
+            itl_end = prev_time + itl_value
+            itl_end_str = tostr(itl_end)
+
+            timeline_data.append(
+                {
+                    "request_id": f"Req {i}",
+                    "start": prev_time_str,
+                    "end": itl_end_str,
+                    "type": itl_type(itl_value),
+                    "prompt_tokens": prompt_len,
+                    "output_tokens": output_tokens,
+                    "req_start_time": tostr(start_time),
+                    "req_finish_time": tostr(start_time + latency),
+                    "segment_start": prev_time_str,
+                    "segment_end": itl_end_str,
+                    "duration": f"{itl_value:.3f}s",
+                }
+            )
+
+            prev_time = itl_end
+            prev_time_str = itl_end_str
+
+    return timeline_data
+
+
+def generate_dataset_stats_plot(
+    results: list[dict[str, Any]],
+    output_path: Path,
+) -> None:
+    """
+    Generate a matplotlib figure with dataset statistics.
+
+    Creates a figure with 4 subplots:
+    - Top-left: Prompt tokens distribution (histogram)
+    - Top-right: Output tokens distribution (histogram)
+    - Bottom-left: Prompt+output tokens distribution (histogram)
+    - Bottom-right: Stacked bar chart (request_id vs tokens)
+
+    Args:
+        results: List of per-request result dictionaries containing:
+            - prompt_len: Number of prompt tokens
+            - output_tokens: Number of output tokens
+        output_path: Path where the figure will be saved
+    """
+    # Extract data
+    prompt_tokens = []
+    output_tokens = []
+    total_tokens = []
+
+    for request in results:
+        prompt_len = request.get("prompt_len", 0)
+        output_len = request.get("output_tokens", 0)
+
+        prompt_tokens.append(prompt_len)
+        output_tokens.append(output_len)
+        total_tokens.append(prompt_len + output_len)
+
+    if not prompt_tokens:
+        print("No data available for dataset statistics plot")
+        return
+
+    # Create figure with 4 subplots
+    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))
+
+    # Top-left: Prompt tokens distribution
+    ax1.hist(prompt_tokens, bins=30, color="steelblue", edgecolor="black", alpha=0.7)
+    ax1.set_xlabel("Prompt Tokens")
+    ax1.set_ylabel("Frequency")
+    ax1.set_title("Prompt Tokens Distribution")
+    ax1.grid(True, alpha=0.3)
+
+    # Top-right: Output tokens distribution
+    ax2.hist(output_tokens, bins=30, color="coral", edgecolor="black", alpha=0.7)
+    ax2.set_xlabel("Output Tokens")
+    ax2.set_ylabel("Frequency")
+    ax2.set_title("Output Tokens Distribution")
+    ax2.grid(True, alpha=0.3)
+
+    # Bottom-left: Prompt+output tokens distribution
+    ax3.hist(
+        total_tokens, bins=30, color="mediumseagreen", edgecolor="black", alpha=0.7
+    )
+    ax3.set_xlabel("Total Tokens (Prompt + Output)")
+    ax3.set_ylabel("Frequency")
+    ax3.set_title("Total Tokens Distribution")
+    ax3.grid(True, alpha=0.3)
+
+    # Bottom-right: Stacked bar chart
+    request_ids = list(range(len(prompt_tokens)))
+    ax4.bar(
+        request_ids, prompt_tokens, label="Prompt Tokens", color="steelblue", alpha=0.7
+    )
+    ax4.bar(
+        request_ids,
+        output_tokens,
+        bottom=prompt_tokens,
+        label="Output Tokens",
+        color="coral",
+        alpha=0.7,
+    )
+    ax4.set_xlabel("Request ID")
+    ax4.set_ylabel("Tokens")
+    ax4.set_title("Tokens per Request (Stacked)")
+    ax4.legend()
+    ax4.grid(True, alpha=0.3, axis="y")
+
+    # Adjust layout to prevent overlap
+    plt.tight_layout()
+
+    # Save figure
+    plt.savefig(str(output_path), dpi=150, bbox_inches="tight")
+    plt.close(fig)
+
+    print(f"Dataset statistics plot saved to: {output_path}")
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 06e67f912a6d..f8bf52de0832 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -34,6 +34,7 @@
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
+from pathlib import Path
 from typing import Any, Literal
 
 import aiohttp
@@ -1183,6 +1184,49 @@ def save_to_pytorch_benchmark_format(
         write_to_json(pt_file, pt_records)
 
 
+def compute_result_filename(
+    args: argparse.Namespace,
+    model_id: str,
+    label: str,
+    current_dt: str,
+) -> str | None:
+    """Compute the result filename based on benchmark configuration.
+
+    Args:
+        args: Command line arguments containing result configuration
+        model_id: The model identifier
+        label: The benchmark label
+        current_dt: Current datetime string
+
+    Returns:
+        The computed filename path or None if no result saving is requested
+    """
+    if not (args.plot_timeline or args.save_result or args.append_result):
+        return None
+
+    base_model_id = model_id.split("/")[-1]
+    max_concurrency_str = (
+        f"-concurrency{args.max_concurrency}"
+        if args.max_concurrency is not None
+        else ""
+    )
+    label = label or args.backend
+
+    if args.ramp_up_strategy is not None:
+        file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+    else:
+        file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+
+    if args.result_filename:
+        file_name = args.result_filename
+
+    if args.result_dir:
+        os.makedirs(args.result_dir, exist_ok=True)
+        file_name = os.path.join(args.result_dir, file_name)
+
+    return file_name
+
+
 def add_cli_args(parser: argparse.ArgumentParser):
     add_dataset_parser(parser)
     parser.add_argument(
@@ -1535,6 +1579,30 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "connecting to servers with self-signed certificates.",
     )
 
+    parser.add_argument(
+        "--plot-timeline",
+        action="store_true",
+        help="Generate an HTML timeline plot showing request execution. "
+        "The plot will be saved alongside the results JSON file.",
+    )
+    parser.add_argument(
+        "--timeline-itl-thresholds",
+        type=float,
+        nargs=2,
+        default=[25.0, 50.0],
+        metavar=("THRESHOLD1", "THRESHOLD2"),
+        help="ITL thresholds in milliseconds for timeline plot coloring. "
+        "Specify two values to categorize inter-token latencies into three groups: "
+        "below first threshold (green), between thresholds (orange), "
+        "and above second threshold (red). Default: 25 50 (milliseconds).",
+    )
+    parser.add_argument(
+        "--plot-dataset-stats",
+        action="store_true",
+        help="Generate a matplotlib figure with dataset statistics showing "
+        "prompt tokens, output tokens, and combined token distributions.",
+    )
+
 
 def main(args: argparse.Namespace) -> dict[str, Any]:
     return asyncio.run(main_async(args))
@@ -1770,6 +1838,86 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     # Merge with benchmark result
     result_json = {**result_json, **benchmark_result}
 
+    # Compute file_name once before using it for plots or saving results
+    file_name = compute_result_filename(args, model_id, label, current_dt)
+
+    # Generate timeline plot if requested
+    if args.plot_timeline:
+        try:
+            from vllm.benchmarks.plot import generate_timeline_plot
+
+            # Prepare per-request data for timeline
+            per_request_data = []
+            start_times = benchmark_result.get("start_times", [])
+            ttfts = benchmark_result.get("ttfts", [])
+            itls = benchmark_result.get("itls", [])
+            input_lens = benchmark_result.get("input_lens", [])
+            output_lens = benchmark_result.get("output_lens", [])
+
+            if start_times and ttfts and itls:
+                for i in range(len(start_times)):
+                    # Calculate latency as ttft + sum of all itls
+                    latency = ttfts[i] + sum(itls[i]) if itls[i] else ttfts[i]
+
+                    per_request_data.append(
+                        {
+                            "start_time": start_times[i],
+                            "ttft": ttfts[i],
+                            "itl": itls[i],
+                            "latency": latency,
+                            "prompt_len": input_lens[i],
+                            "output_tokens": output_lens[i],
+                        }
+                    )
+
+                timeline_path = Path(file_name).with_suffix(".timeline.html")
+                # Convert thresholds from milliseconds to seconds
+                itl_thresholds_sec = [t / 1000.0 for t in args.timeline_itl_thresholds]
+                generate_timeline_plot(
+                    per_request_data, timeline_path, itl_thresholds=itl_thresholds_sec
+                )
+            else:
+                warnings.warn(
+                    "Timeline plot requires detailed metrics. "
+                    "Ensure the benchmark completed successfully.",
+                    stacklevel=2,
+                )
+        except Exception as e:
+            warnings.warn(f"Failed to generate timeline plot: {e}", stacklevel=2)
+
+    # Generate dataset statistics plot if requested
+    if args.plot_dataset_stats:
+        try:
+            from vllm.benchmarks.plot import generate_dataset_stats_plot
+
+            # Prepare per-request data for dataset stats
+            per_request_data = []
+            input_lens = benchmark_result.get("input_lens", [])
+            output_lens = benchmark_result.get("output_lens", [])
+
+            if input_lens and output_lens:
+                for req_input_len, req_output_len in zip(input_lens, output_lens):
+                    per_request_data.append(
+                        {
+                            "prompt_len": req_input_len,
+                            "output_tokens": req_output_len,
+                        }
+                    )
+
+                stats_path = Path(file_name).with_suffix(".dataset_stats.png")
+                generate_dataset_stats_plot(per_request_data, stats_path)
+            else:
+                warnings.warn(
+                    "Dataset statistics plot requires input and "
+                    "output length data. Ensure the benchmark completed "
+                    "successfully.",
+                    stacklevel=2,
+                )
+        except Exception as e:
+            warnings.warn(
+                f"Failed to generate dataset statistics plot: {e}", stacklevel=2
+            )
+
     if not args.save_detailed:
         # Remove fields with too many data points
         for field in [
@@ -1786,24 +1934,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             if field in benchmark_result:
                 del benchmark_result[field]
 
-        # Save to file
+    # Save to file
     if args.save_result or args.append_result:
-        base_model_id = model_id.split("/")[-1]
-        max_concurrency_str = (
-            f"-concurrency{args.max_concurrency}"
-            if args.max_concurrency is not None
-            else ""
-        )
-        label = label or args.backend
-        if args.ramp_up_strategy is not None:
-            file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
-        else:
-            file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
-        if args.result_filename:
-            file_name = args.result_filename
-        if args.result_dir:
-            os.makedirs(args.result_dir, exist_ok=True)
-            file_name = os.path.join(args.result_dir, file_name)
         with open(
             file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
         ) as outfile:

From e03ddcfbd4d686c91f6509c5451546437bbbf3e5 Mon Sep 17 00:00:00 2001
From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com>
Date: Thu, 26 Feb 2026 15:51:24 +0530
Subject: [PATCH 019/154] [Hardware][Powerpc]Enable prefix caching and chunked
 prefill for ppc64le (#35081)

Signed-off-by: Akash kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash kaothalkar <akash.kaothalkar@ibm.com>
---
 vllm/engine/arg_utils.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 036178887dbe..2e9cd6634895 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2076,20 +2076,19 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
             )
 
         # Disable chunked prefill and prefix caching for:
-        # POWER (ppc64le)/RISCV CPUs in V1
+        # RISCV CPUs in V1
         if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
-            CpuArchEnum.POWERPC,
             CpuArchEnum.RISCV,
         ):
             logger.info(
-                "Chunked prefill is not supported for POWER, "
-                "and RISC-V CPUs; "
+                "Chunked prefill is not supported for"
+                "RISC-V CPUs; "
                 "disabling it for V1 backend."
             )
             self.enable_chunked_prefill = False
             logger.info(
-                "Prefix caching is not supported for POWER, "
-                "and RISC-V CPUs; "
+                "Prefix caching is not supported for "
+                "RISC-V CPUs; "
                 "disabling it for V1 backend."
             )
             self.enable_prefix_caching = False

From 32693db8cea5cb9099c4e9d9876def97fdbc5387 Mon Sep 17 00:00:00 2001
From: HZY <19858181030@163.com>
Date: Thu, 26 Feb 2026 18:26:15 +0800
Subject: [PATCH 020/154] [Bugfix] [Qwen3.5]Fix Qwen3.5 FP8 quantization: tuple
 shard_id weight loading (#35289)

Signed-off-by: daowu.hzy <daowu.hzy@alibaba-inc.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/linear.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 6db3907ff911..5fc9fa073180 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -731,16 +731,16 @@ def weight_loader(
         loaded_shard_id: tuple[int, ...] | int | None = None,
     ):
         self.validate_shard_id(loaded_shard_id)
-        # FIXME(Isotr0py): Enable tuple shard_id for BNB quantization.
-        if isinstance(loaded_shard_id, tuple):
-            raise NotImplementedError(
-                "Shard id with multiple indices is not supported in weight_loader, "
-                "please use weight_loader_v2 instead."
-            )
         # Special case for GGUF
         # initialize GGUF param after we know the quantize type
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if isinstance(loaded_shard_id, tuple) and (
+            is_gguf_weight or is_gguf_weight_type
+        ):
+            raise NotImplementedError(
+                "Shard id with multiple indices is not supported for GGUF."
+            )
         if is_gguf_weight_type:
             if loaded_shard_id is not None:
                 param.data[loaded_shard_id].copy_(loaded_weight)
@@ -768,7 +768,7 @@ def weight_loader(
         # Special case for per-tensor scale to load scalar into fused array.
         needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
 
-        if loaded_shard_id is None:
+        if loaded_shard_id is None or isinstance(loaded_shard_id, tuple):
             # Loaded weight is already fused on disk (mlp).
             # (e.g., Phi-3's gate_up_proj).
             if output_dim is None:
@@ -780,10 +780,21 @@ def weight_loader(
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
                 return
+
+            output_sizes = (
+                self.output_sizes[loaded_shard_id[0] : loaded_shard_id[-1] + 1]
+                if loaded_shard_id is not None
+                else self.output_sizes
+            )
             current_shard_offset = 0
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit and isinstance(loaded_shard_id, tuple):
+                raise NotImplementedError(
+                    "Shard id with multiple indices is not supported "
+                    "for BNB quantization yet."
+                )
             shard_offsets: list[tuple[int, int, int]] = []
-            for i, output_size in enumerate(self.output_sizes):
+            for i, output_size in enumerate(output_sizes):
                 shard_offsets.append((i, current_shard_offset, output_size))
                 current_shard_offset += output_size
             packed_dim = getattr(param, "packed_dim", None)

From 5281713e1119d6312dba2e4d0a95a517dbc24b06 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Thu, 26 Feb 2026 18:54:55 +0800
Subject: [PATCH 021/154] [XPU] use fixed UMD version in dockerfile.xpu
 (#35392)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 docker/Dockerfile.xpu | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ba7dd848bdfd..d030b151e74e 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -6,8 +6,7 @@ ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/xpu"
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-    add-apt-repository -y ppa:kobuk-team/intel-graphics
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
 RUN apt clean && apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
@@ -28,9 +27,22 @@ RUN apt clean && apt-get update -y && \
     python3-pip
 
 RUN apt update && apt upgrade -y && \
-    apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc && \
     apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3
 
+# Install UMD
+RUN mkdir neo && \
+    cd neo && \
+    wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.24.8/intel-igc-core-2_2.24.8+20344_amd64.deb && \
+    wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.24.8/intel-igc-opencl-2_2.24.8+20344_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/25.48.36300.8/intel-ocloc_25.48.36300.8-0_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/25.48.36300.8/intel-opencl-icd_25.48.36300.8-0_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/25.48.36300.8/libigdgmm12_22.8.2_amd64.deb && \
+    wget https://github.com/intel/compute-runtime/releases/download/25.48.36300.8/libze-intel-gpu1_25.48.36300.8-0_amd64.deb && \
+    wget https://github.com/oneapi-src/level-zero/releases/download/v1.26.0/level-zero_1.26.0+u24.04_amd64.deb && \
+    dpkg -i *.deb && \
+    cd .. && \
+    rm -rf neo
+
 ENV PATH="/root/.local/bin:$PATH"
 ENV VIRTUAL_ENV="/opt/venv"
 ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python

From 01914445b0513ab355b1275acec2f2e5da4d91d6 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 26 Feb 2026 11:01:01 +0000
Subject: [PATCH 022/154] Remove `bc-lint` (#35274)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .github/.bc-linter.yml        | 24 ----------------
 .github/workflows/bc-lint.yml | 29 -------------------
 vllm/__init__.py              |  6 ----
 vllm/_bc_linter.py            | 54 -----------------------------------
 vllm/v1/core/sched/output.py  |  5 ----
 5 files changed, 118 deletions(-)
 delete mode 100644 .github/.bc-linter.yml
 delete mode 100644 .github/workflows/bc-lint.yml
 delete mode 100644 vllm/_bc_linter.py

diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
deleted file mode 100644
index 443dfa45af22..000000000000
--- a/.github/.bc-linter.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
-version: 1
-paths:
-# We temporarily disable globally, and will only enable with `annotations.include`
-# include:
-#   - "vllm/v1/attetion/*.py"
-#   - "vllm/v1/core/*.py"
-exclude:
-  - "**/*.py"
-
-scan:
-  functions: true        # check free functions and methods
-  classes: true          # check classes/dataclasses
-  public_only: true      # ignore names starting with "_" at any level
-
-annotations:
-  include:               # decorators that force‑include a symbol
-    - name: "bc_linter_include"  # matched by simple name or dotted suffix
-      propagate_to_members: false # for classes, include methods/inner classes
-  exclude:               # decorators that force‑exclude a symbol
-    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
-      propagate_to_members: true  # for classes, exclude methods/inner classes
-
-excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
deleted file mode 100644
index 823695a92132..000000000000
--- a/.github/workflows/bc-lint.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: BC Lint
-
-on:
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      - labeled
-      - unlabeled
-
-jobs:
-  bc_lint:
-    if: github.repository_owner == 'vllm-project'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Run BC Lint Action
-        uses: pytorch/test-infra/.github/actions/bc-lint@main
-        with:
-          repo: ${{ github.event.pull_request.head.repo.full_name }}
-          base_sha: ${{ github.event.pull_request.base.sha }}
-          head_sha: ${{ github.event.pull_request.head.sha }}
-          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
-          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
-          config_dir: .github
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 19b2cdc673c4..968d1a143b16 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -14,8 +14,6 @@
 import vllm.env_override  # noqa: F401
 
 MODULE_ATTRS = {
-    "bc_linter_skip": "._bc_linter:bc_linter_skip",
-    "bc_linter_include": "._bc_linter:bc_linter_include",
     "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs",
     "EngineArgs": ".engine.arg_utils:EngineArgs",
     "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
@@ -62,8 +60,6 @@
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.v1.executor.ray_utils import initialize_ray_cluster
-
-    from ._bc_linter import bc_linter_include, bc_linter_skip
 else:
 
     def __getattr__(name: str) -> typing.Any:
@@ -79,8 +75,6 @@ def __getattr__(name: str) -> typing.Any:
 
 __all__ = [
     "__version__",
-    "bc_linter_skip",
-    "bc_linter_include",
     "__version_tuple__",
     "LLM",
     "ModelRegistry",
diff --git a/vllm/_bc_linter.py b/vllm/_bc_linter.py
deleted file mode 100644
index 2929a8bce85a..000000000000
--- a/vllm/_bc_linter.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# vllm/_bc_linter.py
-from collections.abc import Callable
-from typing import Any, TypeVar, overload
-
-T = TypeVar("T")
-
-
-@overload
-def bc_linter_skip(obj: T) -> T: ...
-
-
-@overload
-def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]: ...
-
-
-def bc_linter_skip(obj: Any = None, *, reason: str | None = None):
-    """
-    No-op decorator to mark symbols/files for BC-linter suppression.
-
-    Usage:
-        @bc_linter_skip
-        def legacy_api(...): ...
-    """
-
-    def _wrap(x: T) -> T:
-        return x
-
-    return _wrap if obj is None else obj
-
-
-@overload
-def bc_linter_include(obj: T) -> T: ...
-
-
-@overload
-def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]: ...
-
-
-def bc_linter_include(obj: Any = None, *, reason: str | None = None):
-    """
-    Usage:
-        @bc_linter_include
-        def public_api(...): ...
-    """
-
-    def _wrap(x: T) -> T:
-        return x
-
-    return _wrap if obj is None else obj
-
-
-__all__ = ["bc_linter_skip", "bc_linter_include"]
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 7e53f4f2ec9e..0f6ac98fdc15 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -5,8 +5,6 @@
 from functools import cached_property
 from typing import TYPE_CHECKING
 
-from vllm._bc_linter import bc_linter_include
-
 if TYPE_CHECKING:
     import numpy as np
     import numpy.typing as npt
@@ -29,7 +27,6 @@
     Request = object
 
 
-@bc_linter_include
 @dataclass
 class NewRequestData:
     req_id: str
@@ -109,7 +106,6 @@ def anon_repr(self) -> str:
         )
 
 
-@bc_linter_include
 @dataclass
 class CachedRequestData:
     req_ids: list[str]
@@ -179,7 +175,6 @@ def make_empty(cls) -> "CachedRequestData":
         )
 
 
-@bc_linter_include
 @dataclass
 class SchedulerOutput:
     # list of the requests that are scheduled for the first time.

From c0615a296d44ce1963d795ea65dcff6172b4ae8d Mon Sep 17 00:00:00 2001
From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com>
Date: Thu, 26 Feb 2026 06:58:23 -0500
Subject: [PATCH 023/154] [Bugfix] Fix Qwen2.5-Omni and Qwen3-Omni
 mixed-modality embed regression (#35368)

Signed-off-by: linyueqian <linyueqian@outlook.com>
---
 .../processing/test_qwen2_5_omni_embed.py     | 358 ++++++++++++++++++
 .../models/qwen2_5_omni_thinker.py            |  30 +-
 .../models/qwen3_omni_moe_thinker.py          |  12 +-
 3 files changed, 379 insertions(+), 21 deletions(-)
 create mode 100644 tests/models/multimodal/processing/test_qwen2_5_omni_embed.py

diff --git a/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py b/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
new file mode 100644
index 000000000000..df5b077ce9d7
--- /dev/null
+++ b/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for Qwen2.5-Omni embed_input_ids to verify embeddings are
+correctly assigned to audio/image/video token positions.
+
+Regression test for: https://github.com/vllm-project/vllm/issues/34506
+  - Non-interleaved mixed modalities (audio + image + video) should correctly
+    assign audio embeddings to audio positions, image to image, video to video.
+  - Interleaved (use_audio_in_video) should also work correctly.
+"""
+
+from unittest.mock import Mock
+
+import pytest
+import torch
+
+from vllm.model_executor.models.qwen2_5_omni_thinker import (
+    check_interleaved_audio_video,
+    merge_interleaved_embeddings,
+)
+
+# Fake token IDs
+AUDIO_TOKEN_ID = 1001
+IMAGE_TOKEN_ID = 1002
+VIDEO_TOKEN_ID = 1003
+TEXT_TOKEN_ID = 0
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_token_seq(
+    audio_n: int, image_n: int, video_n: int, text_prefix: int = 3, text_sep: int = 2
+):
+    """
+    Build a flat token sequence:
+      [text_prefix] [AUDIO * audio_n] [text_sep] [IMAGE * image_n]
+      [text_sep] [VIDEO * video_n] [text_sep]
+    Returns (input_ids tensor, is_multimodal mask, positions dict).
+    """
+    tokens = (
+        [TEXT_TOKEN_ID] * text_prefix
+        + [AUDIO_TOKEN_ID] * audio_n
+        + [TEXT_TOKEN_ID] * text_sep
+        + [IMAGE_TOKEN_ID] * image_n
+        + [TEXT_TOKEN_ID] * text_sep
+        + [VIDEO_TOKEN_ID] * video_n
+        + [TEXT_TOKEN_ID] * text_sep
+    )
+    input_ids = torch.tensor(tokens)
+    is_multimodal = (
+        (input_ids == AUDIO_TOKEN_ID)
+        | (input_ids == IMAGE_TOKEN_ID)
+        | (input_ids == VIDEO_TOKEN_ID)
+    )
+    return input_ids, is_multimodal
+
+
+def make_interleaved_seq(
+    video_chunks: list[int], audio_chunks: list[int], text_prefix: int = 2
+):
+    """
+    Build an interleaved sequence like use_audio_in_video:
+      [text] [V*v0] [A*a0] [V*v1] [A*a1] ...
+    """
+    tokens = [TEXT_TOKEN_ID] * text_prefix
+    for v, a in zip(video_chunks, audio_chunks):
+        tokens += [VIDEO_TOKEN_ID] * v + [AUDIO_TOKEN_ID] * a
+    input_ids = torch.tensor(tokens)
+    is_multimodal = (input_ids == VIDEO_TOKEN_ID) | (input_ids == AUDIO_TOKEN_ID)
+    return input_ids, is_multimodal
+
+
+# ---------------------------------------------------------------------------
+# Tests for check_interleaved_audio_video
+# ---------------------------------------------------------------------------
+
+
+class TestCheckInterleavedAudioVideo:
+    def test_non_interleaved_audio_then_video(self):
+        """Audio entirely before video → not interleaved."""
+        input_ids, is_multimodal = make_token_seq(5, 0, 4)
+        is_video = is_multimodal & (input_ids == VIDEO_TOKEN_ID)
+        is_audio = is_multimodal & (input_ids == AUDIO_TOKEN_ID)
+        assert not check_interleaved_audio_video(
+            is_video, is_audio, is_video.sum().item(), is_audio.sum().item()
+        )
+
+    def test_non_interleaved_with_image(self):
+        """Audio + image + video (the mixed_modalities case) → not interleaved."""
+        input_ids, is_multimodal = make_token_seq(5, 4, 6)
+        is_video = is_multimodal & (input_ids == VIDEO_TOKEN_ID)
+        is_audio = is_multimodal & (input_ids == AUDIO_TOKEN_ID)
+        assert not check_interleaved_audio_video(
+            is_video, is_audio, is_video.sum().item(), is_audio.sum().item()
+        )
+
+    def test_no_audio(self):
+        """Video only → not interleaved."""
+        input_ids, is_multimodal = make_token_seq(0, 0, 6)
+        is_video = is_multimodal & (input_ids == VIDEO_TOKEN_ID)
+        is_audio = is_multimodal & (input_ids == AUDIO_TOKEN_ID)
+        assert not check_interleaved_audio_video(
+            is_video, is_audio, is_video.sum().item(), is_audio.sum().item()
+        )
+
+    def test_interleaved(self):
+        """V A V A interleaved → True."""
+        input_ids, is_multimodal = make_interleaved_seq([4, 4], [3, 3])
+        is_video = is_multimodal & (input_ids == VIDEO_TOKEN_ID)
+        is_audio = is_multimodal & (input_ids == AUDIO_TOKEN_ID)
+        assert check_interleaved_audio_video(
+            is_video, is_audio, is_video.sum().item(), is_audio.sum().item()
+        )
+
+
+# ---------------------------------------------------------------------------
+# Tests for embed_input_ids via a minimal mock
+# ---------------------------------------------------------------------------
+
+
+def make_mock_model(hidden: int = 8):
+    """
+    Return a minimal mock of Qwen2_5OmniThinkerForConditionalGeneration
+    that has enough structure to run embed_input_ids.
+    """
+    from vllm.model_executor.models.qwen2_5_omni_thinker import (
+        Qwen2_5OmniThinkerForConditionalGeneration,
+    )
+
+    model = Mock(spec=Qwen2_5OmniThinkerForConditionalGeneration)
+
+    # Config with token IDs
+    cfg = Mock()
+    cfg.video_token_index = VIDEO_TOKEN_ID
+    cfg.audio_token_index = AUDIO_TOKEN_ID
+    model.config = cfg
+
+    # embed_input_ids: simply embed each token as a one-hot-like vector
+    # token_id * ones so we can verify which embedding ends up where.
+    def fake_lm_embed(ids: torch.Tensor) -> torch.Tensor:
+        # Use .clone() so the tensor is contiguous (expand() creates a strided
+        # view with shared memory, which masked_scatter_ cannot handle).
+        return ids.float().unsqueeze(-1).expand(-1, hidden).clone()
+
+    lang_model = Mock()
+    lang_model.embed_input_ids = fake_lm_embed
+    model.get_language_model = Mock(return_value=lang_model)
+
+    # _embed_text_input_ids: delegate to SupportsMultiModal's implementation
+    from vllm.model_executor.models.interfaces import SupportsMultiModal
+
+    model._embed_text_input_ids = (
+        lambda *a, **kw: SupportsMultiModal._embed_text_input_ids(model, *a, **kw)
+    )
+
+    # super().embed_input_ids → use SupportsMultiModal.embed_input_ids
+    def fake_super_embed(
+        ids, mm_embs=None, *, is_multimodal=None, handle_oov_mm_token=False
+    ):
+        return SupportsMultiModal.embed_input_ids(
+            model,
+            ids,
+            mm_embs,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+    # Bind embed_input_ids as the real method
+    model.embed_input_ids = (
+        lambda *a, **kw: Qwen2_5OmniThinkerForConditionalGeneration.embed_input_ids(
+            model, *a, **kw
+        )
+    )
+
+    # Store super-embed for use inside the method
+    model._super_embed_input_ids = fake_super_embed
+
+    return model, hidden
+
+
+def build_mm_embeds(
+    audio_n, image_n, video_n, hidden, audio_val=10.0, image_val=20.0, video_val=30.0
+):
+    """
+    Build multimodal_embeddings list in position order (audio, image, video).
+    Each embedding is filled with a distinct constant so we can verify placement.
+    """
+    embs = []
+    if audio_n:
+        embs.append(torch.full((audio_n, hidden), audio_val))
+    if image_n:
+        embs.append(torch.full((image_n, hidden), image_val))
+    if video_n:
+        embs.append(torch.full((video_n, hidden), video_val))
+    return embs
+
+
+class TestEmbedInputIds:
+    def _run(self, audio_n, image_n, video_n, hidden=8):
+        """
+        Run embed_input_ids for a non-interleaved mixed-modality sequence.
+        Returns (result_embeds, input_ids, is_multimodal).
+        """
+        input_ids, is_multimodal = make_token_seq(audio_n, image_n, video_n)
+        mm_embeds = build_mm_embeds(audio_n, image_n, video_n, hidden)
+
+        model, _ = make_mock_model(hidden)
+        result = model.embed_input_ids(
+            input_ids, mm_embeds, is_multimodal=is_multimodal
+        )
+        return result, input_ids, is_multimodal
+
+    def test_audio_only(self):
+        """Audio-only: audio positions get audio embeddings."""
+        audio_n, hidden = 5, 8
+        audio_val = 10.0
+        result, input_ids, is_multimodal = self._run(audio_n, 0, 0, hidden)
+
+        audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0]
+        assert result[audio_pos].allclose(torch.full((audio_n, hidden), audio_val)), (
+            "Audio positions should get audio embeddings"
+        )
+
+    def test_video_only(self):
+        """Video-only: video positions get video embeddings."""
+        video_n, hidden = 6, 8
+        video_val = 30.0
+        result, input_ids, is_multimodal = self._run(0, 0, video_n, hidden)
+
+        video_pos = (input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0]
+        assert result[video_pos].allclose(torch.full((video_n, hidden), video_val)), (
+            "Video positions should get video embeddings"
+        )
+
+    def test_mixed_modalities_audio_goes_to_audio_pos(self):
+        """
+        Regression test for GitHub issue #34506:
+        With audio + image + video (non-interleaved), audio positions must
+        receive audio embeddings (not image or video embeddings).
+        """
+        audio_n, image_n, video_n, hidden = 5, 4, 6, 8
+        audio_val, image_val, video_val = 10.0, 20.0, 30.0
+
+        result, input_ids, is_multimodal = self._run(audio_n, image_n, video_n, hidden)
+
+        audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0]
+        image_pos = (input_ids == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
+        video_pos = (input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0]
+
+        mean_a = result[audio_pos].mean().item()
+        assert result[audio_pos].allclose(torch.full((audio_n, hidden), audio_val)), (
+            f"Audio emb wrong: expected {audio_val}, got mean={mean_a:.1f}"
+        )
+
+        mean_i = result[image_pos].mean().item()
+        assert result[image_pos].allclose(torch.full((image_n, hidden), image_val)), (
+            f"Image emb wrong: expected {image_val}, got mean={mean_i:.1f}"
+        )
+
+        mean_v = result[video_pos].mean().item()
+        assert result[video_pos].allclose(torch.full((video_n, hidden), video_val)), (
+            f"Video emb wrong: expected {video_val}, got mean={mean_v:.1f}"
+        )
+
+    def test_text_positions_unchanged(self):
+        """Text positions should keep their text embeddings."""
+        audio_n, image_n, video_n, hidden = 3, 2, 4, 8
+        result, input_ids, is_multimodal = self._run(audio_n, image_n, video_n, hidden)
+
+        text_pos = (~is_multimodal).nonzero(as_tuple=True)[0]
+        # Text tokens have value TEXT_TOKEN_ID=0, so embed → 0.0
+        assert result[text_pos].allclose(torch.zeros(len(text_pos), hidden)), (
+            "Text positions should keep text embeddings"
+        )
+
+    def test_interleaved_use_audio_in_video(self):
+        """
+        Interleaved (use_audio_in_video): video chunks interleaved with audio.
+        Video embeddings must go to video positions, audio to audio positions.
+        """
+        hidden = 8
+        audio_val, video_val = 10.0, 30.0
+        # Two video chunks of 4, two audio chunks of 3
+        video_chunks = [4, 4]
+        audio_chunks = [3, 3]
+        input_ids, is_multimodal = make_interleaved_seq(video_chunks, audio_chunks)
+
+        video_n = sum(video_chunks)  # 8
+        audio_n = sum(audio_chunks)  # 6
+
+        # mm_embeds come in [video, audio] order (video feature first in
+        # mm_features when positions are the same for use_audio_in_video)
+        mm_embeds = [
+            torch.full((video_n, hidden), video_val),
+            torch.full((audio_n, hidden), audio_val),
+        ]
+
+        model, _ = make_mock_model(hidden)
+        result = model.embed_input_ids(
+            input_ids, mm_embeds, is_multimodal=is_multimodal
+        )
+
+        video_pos = (input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0]
+        audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0]
+
+        assert result[video_pos].allclose(torch.full((video_n, hidden), video_val)), (
+            "Interleaved: video positions should get video embeddings"
+        )
+
+        assert result[audio_pos].allclose(torch.full((audio_n, hidden), audio_val)), (
+            "Interleaved: audio positions should get audio embeddings"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Tests for merge_interleaved_embeddings helper
+# ---------------------------------------------------------------------------
+
+
+class TestMergeInterleavedEmbeddings:
+    def test_basic_interleaved(self):
+        """Video chunks + audio chunks scattered to correct positions."""
+        hidden = 4
+        input_ids, is_multimodal = make_interleaved_seq([3, 3], [2, 2])
+
+        is_video = is_multimodal & (input_ids == VIDEO_TOKEN_ID)
+        is_audio = is_multimodal & (input_ids == AUDIO_TOKEN_ID)
+        num_video = is_video.sum().item()  # 6
+        num_audio = is_audio.sum().item()  # 4
+
+        inputs_embeds = torch.zeros(len(input_ids), hidden)
+        mm_embeds = [
+            torch.full((num_video, hidden), 30.0),
+            torch.full((num_audio, hidden), 10.0),
+        ]
+
+        result = merge_interleaved_embeddings(
+            inputs_embeds,
+            mm_embeds,
+            is_video,
+            is_audio,
+            is_multimodal,
+            num_video,
+            num_audio,
+        )
+
+        video_pos = is_video.nonzero(as_tuple=True)[0]
+        audio_pos = is_audio.nonzero(as_tuple=True)[0]
+        assert result[video_pos].allclose(torch.full((num_video, hidden), 30.0))
+        assert result[audio_pos].allclose(torch.full((num_audio, hidden), 10.0))
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 977b522b5ebf..a9fdb24346ac 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -1376,23 +1376,12 @@ def embed_input_ids(
         is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
-        from .utils import _merge_multimodal_embeddings
-
         if multimodal_embeddings is None or is_multimodal is None:
             return super().embed_input_ids(input_ids)
 
-        inputs_embeds = self._embed_text_input_ids(
-            input_ids,
-            self.get_language_model().embed_input_ids,
-            is_multimodal=is_multimodal,
-            handle_oov_mm_token=handle_oov_mm_token,
-        )
-
-        if len(multimodal_embeddings) == 0:
-            return inputs_embeds
-
         # Check for audio-in-video: interleaved video and audio tokens
-        # in the multimodal region.
+        # in the multimodal region. Only use the interleaved path when
+        # needed; otherwise fall back to the default parent implementation.
         video_token_id = self.config.video_token_index
         audio_token_id = self.config.audio_token_index
 
@@ -1403,6 +1392,12 @@ def embed_input_ids(
         num_audio = is_audio.sum().item()
 
         if check_interleaved_audio_video(is_video, is_audio, num_video, num_audio):
+            inputs_embeds = self._embed_text_input_ids(
+                input_ids,
+                self.get_language_model().embed_input_ids,
+                is_multimodal=is_multimodal,
+                handle_oov_mm_token=handle_oov_mm_token,
+            )
             return merge_interleaved_embeddings(
                 inputs_embeds,
                 multimodal_embeddings,
@@ -1413,9 +1408,12 @@ def embed_input_ids(
                 num_audio,
             )
 
-        # Default: standard merge (no interleaving)
-        return _merge_multimodal_embeddings(
-            inputs_embeds, multimodal_embeddings, is_multimodal
+        # Default: standard merge (no interleaving), same as parent class
+        return super().embed_input_ids(
+            input_ids,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
 
     def forward(
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 2943a319f8db..075215276b96 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1904,15 +1904,17 @@ def embed_input_ids(
                 num_audio,
             )
 
-        # Default: standard merge (no interleaving)
-        inputs_embeds = _merge_multimodal_embeddings(
-            inputs_embeds=inputs_embeds,
+        # Default: standard merge (no interleaving), same as parent class.
+        # multimodal_embeddings may have been updated above (deepstack
+        # main-scale). Use super() to stay consistent with the parent
+        # implementation and avoid issues seen in Qwen2.5-Omni (#34506).
+        return super().embed_input_ids(
+            input_ids,
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
 
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor | None,

From c6ca51598adced41f4a1f5481a137e4cb42c71cc Mon Sep 17 00:00:00 2001
From: Li-Yongwen <63399187+Li-Yongwen@users.noreply.github.com>
Date: Thu, 26 Feb 2026 20:18:38 +0800
Subject: [PATCH 024/154] [Bugfix] fix device_name for routing replay (#34336)

Signed-off-by: liyongwen <1310439159@qq.com>
---
 .../model_executor/layers/fused_moe/routed_experts_capturer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
index 7608e06aa22d..b061b3d38b8d 100644
--- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
+++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
@@ -20,6 +20,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.forward_context import get_forward_context
+from vllm.platforms import current_platform
 
 logger = logging.getLogger(__name__)
 
@@ -132,7 +133,7 @@ def init_buffer(
         self._device_buffer = torch.zeros(
             (max_num_batched_tokens, num_layers, num_experts_per_tok),
             dtype=torch.int32,
-            device="cuda",
+            device=current_platform.device_type,
         )
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
 

From ec13e549d3e1de13d05af759cc8bef3f7cf5e318 Mon Sep 17 00:00:00 2001
From: Asaf Gardin <39553475+Josephasafg@users.noreply.github.com>
Date: Thu, 26 Feb 2026 14:22:06 +0200
Subject: [PATCH 025/154] [Bugfix] Fix uint32 overflow in Mamba selective scan
 state pointer arithmetic (#35275)

Signed-off-by: Josephasafg <ajgard7@gmail.com>
---
 csrc/mamba/mamba_ssm/selective_scan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h
index 7d22dd8b84a3..e93455a57cd8 100644
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -15,7 +15,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 struct SSMParamsBase {
-    using index_t = uint32_t;
+    using index_t = size_t;
 
     int batch, dim, seqlen, dstate, n_groups, n_chunks;
     int dim_ngroups_ratio;

From 845ee348ef82d12b5d106384070f7578c843d3cd Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 26 Feb 2026 21:05:46 +0800
Subject: [PATCH 026/154] [Misc] Standardize handling of
 `mm_processor_kwargs.size` (#35284)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/lora/test_qwenvl.py                     | 23 +++++++---
 .../multimodal/processing/test_gemma3.py      | 12 ++---
 .../multimodal/processing/test_qwen2_vl.py    | 45 ++++++++++++++++---
 vllm/model_executor/models/ernie45_vl.py      | 21 +++++++--
 vllm/model_executor/models/hunyuan_vision.py  |  8 +++-
 vllm/model_executor/models/keye.py            |  8 +++-
 vllm/model_executor/models/paddleocr_vl.py    | 21 +++++++--
 vllm/model_executor/models/qwen2_vl.py        | 17 ++++++-
 vllm/model_executor/models/qwen3_vl.py        |  8 +++-
 9 files changed, 135 insertions(+), 28 deletions(-)

diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py
index 741e1aceee63..5f8fc26c16d3 100644
--- a/tests/lora/test_qwenvl.py
+++ b/tests/lora/test_qwenvl.py
@@ -2,6 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION
+
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
@@ -18,15 +21,25 @@ class TestConfig:
     enable_tower_connector_lora: bool = False
     max_model_len: int = 8192
     gpu_memory_utilization: float = 0.85
-    mm_processor_kwargs: dict[str, int] | None = None
+    mm_processor_kwargs: dict[str, object] | None = None
     mm_processor_cache_gb: float = 4
 
     def __post_init__(self):
         if self.mm_processor_kwargs is None:
-            self.mm_processor_kwargs = {
-                "min_pixels": 28 * 28,
-                "max_pixels": 1280 * 28 * 28,
-            }
+            # There is a bug in transformers v4 where size is ignored by
+            # `Qwen2VLProcessor.__call__`
+            if Version(TRANSFORMERS_VERSION) < Version("5.2.0"):
+                self.mm_processor_kwargs = {
+                    "min_pixels": 28 * 28,
+                    "max_pixels": 1280 * 28 * 28,
+                }
+            else:
+                self.mm_processor_kwargs = {
+                    "size": {
+                        "shortest_edge": 28 * 28,
+                        "longest_edge": 1280 * 28 * 28,
+                    }
+                }
 
 
 class Qwen2VLTester:
diff --git a/tests/models/multimodal/processing/test_gemma3.py b/tests/models/multimodal/processing/test_gemma3.py
index 884702cabb5f..2b4c213695ee 100644
--- a/tests/models/multimodal/processing/test_gemma3.py
+++ b/tests/models/multimodal/processing/test_gemma3.py
@@ -150,8 +150,11 @@ def test_batch_processing(self):
 
 
 @pytest.mark.parametrize("model_id", [GEMMA3_MODEL_ID])
+@pytest.mark.parametrize("mm_processor_kwargs", [{}])
 def test_get_image_size_with_most_features(
-    image_assets: ImageTestAssets, model_id: str
+    image_assets: ImageTestAssets,
+    model_id: str,
+    mm_processor_kwargs: dict[str, object],
 ):
     ctx = build_model_context(
         model_id,
@@ -160,15 +163,14 @@ def test_get_image_size_with_most_features(
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
-    hf_processor_mm_kwargs: dict[str, object] = {}
-    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
 
     max_image_size = processor.info.get_image_size_with_most_features()
     max_tokens = processor.info.get_num_image_tokens(
         image_width=max_image_size.width,
         image_height=max_image_size.height,
         processor=hf_processor,
-        mm_kwargs=hf_processor_mm_kwargs,
+        mm_kwargs=mm_processor_kwargs,
     )
 
     prompt = "<start_of_image>"
@@ -179,7 +181,7 @@ def test_get_image_size_with_most_features(
         processed_inputs = processor(
             prompt,
             mm_items=processor.info.parse_mm_data(mm_data),
-            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            hf_processor_mm_kwargs=mm_processor_kwargs,
         )
         mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
         num_patches_tensor = mm_kwargs_data["num_patches"]
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index fb28d0c74425..ad5e82945a39 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
@@ -15,6 +17,16 @@
     [
         ({}, 1426, (5704, 1176)),
         ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
+        (
+            {
+                "size": {
+                    "shortest_edge": 64**2,
+                    "longest_edge": 512**2,
+                },
+            },
+            330,
+            (1320, 1176),
+        ),
     ],
 )
 @pytest.mark.parametrize("num_imgs", [1, 2])
@@ -29,6 +41,12 @@ def test_processor_override(
     kwargs_on_init: bool,
 ):
     """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
+    if (
+        Version(TRANSFORMERS_VERSION) < Version("5.2.0")
+        and "size" in mm_processor_kwargs
+    ):
+        pytest.skip("`size` ignored by `Qwen2VLProcessor.__call__`")
+
     ctx = build_model_context(
         model_id,
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
@@ -60,21 +78,34 @@ def test_processor_override(
 
 
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
-@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
+@pytest.mark.parametrize(
+    "mm_processor_kwargs",
+    [
+        {"min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28},
+        {"min_pixels": 28 * 28, "max_pixels": 1283 * 28 * 28},
+        {"size": {"shortest_edge": 28 * 28, "longest_edge": 1280 * 28 * 28}},
+        {"size": {"shortest_edge": 28 * 28, "longest_edge": 1283 * 28 * 28}},
+    ],
+)
 def test_get_image_size_with_most_features(
     image_assets: ImageTestAssets,
     model_id: str,
-    max_pixels: int,
+    mm_processor_kwargs: dict[str, object],
 ):
+    if (
+        Version(TRANSFORMERS_VERSION) < Version("5.2.0")
+        and "size" in mm_processor_kwargs
+    ):
+        pytest.skip("`size` ignored by `Qwen2VLProcessor.__call__`")
+
     ctx = build_model_context(
         model_id,
-        mm_processor_kwargs={"max_pixels": max_pixels},
+        mm_processor_kwargs=mm_processor_kwargs,
         limit_mm_per_prompt={"image": 1},
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
 
-    hf_processor_mm_kwargs: dict[str, object] = {}
-    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
     merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size
 
     max_image_size = processor.info.get_image_size_with_most_features()
@@ -82,7 +113,7 @@ def test_get_image_size_with_most_features(
         image_width=max_image_size.width,
         image_height=max_image_size.height,
         image_processor=hf_processor.image_processor,
-        mm_kwargs=hf_processor_mm_kwargs,
+        mm_kwargs=mm_processor_kwargs,
     )
 
     prompt = "<|vision_start|><|image_pad|><|vision_end|>"
@@ -91,7 +122,7 @@ def test_get_image_size_with_most_features(
         processed_inputs = processor(
             prompt,
             mm_items=processor.info.parse_mm_data(mm_data),
-            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            hf_processor_mm_kwargs=mm_processor_kwargs,
         )
         grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
         t, h, w = grid_thw[0]
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 1df4adfac159..edf4c2c8d453 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -829,16 +829,31 @@ def _get_vision_info(
         spatial_conv_size = hf_config.spatial_conv_size
         temporal_conv_size = hf_config.temporal_conv_size
 
+        if self.ctx.model_config.trust_remote_code:
+            # Defined in HF Hub repo
+            min_pixels_key = "min_pixels"
+            max_pixels_key = "max_pixels"
+        else:
+            # Defined in Transformers library (requires v5.0 or above)
+            min_pixels_key = "shortest_edge"
+            max_pixels_key = "longest_edge"
+
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {min_pixels_key: override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {max_pixels_key: override_max_pixels}
 
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * spatial_conv_size,
-                min_pixels=size["min_pixels"],
-                max_pixels=size["max_pixels"],
+                min_pixels=size[min_pixels_key],
+                max_pixels=size[max_pixels_key],
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 3f2d0e7dda8e..b6fda25ddfbb 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -636,7 +636,13 @@ def _get_vision_info(
         spatial_merge_size = vision_config.spatial_merge_size
 
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}
 
         if do_resize:
             resized_height, resized_width = smart_resize(
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 2cb7dc42539d..4c43e413f695 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1021,7 +1021,13 @@ def _get_vision_info(
         temporal_patch_size = 1
 
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"min_pixels": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"max_pixels": override_max_pixels}
 
         if do_resize:
             resized_height, resized_width = smart_resize(
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 6c9304101fba..35132e72413f 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -155,15 +155,30 @@ def get_num_image_tokens(
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
 
+        if self.ctx.model_config.trust_remote_code:
+            # Defined in HF Hub repo
+            min_pixels_key = "min_pixels"
+            max_pixels_key = "max_pixels"
+        else:
+            # Defined in Transformers library (requires v5.0 or above)
+            min_pixels_key = "shortest_edge"
+            max_pixels_key = "longest_edge"
+
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {min_pixels_key: override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {max_pixels_key: override_max_pixels}
 
         resized_height, resized_width = smart_resize(
             height=image_height,
             width=image_width,
             factor=patch_size * merge_size,
-            min_pixels=size["min_pixels"],
-            max_pixels=size["max_pixels"],
+            min_pixels=size[min_pixels_key],
+            max_pixels=size[max_pixels_key],
         )
         preprocessed_size = ImageSize(width=resized_width, height=resized_height)
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index eed559bcb87c..c4c71faf3958 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -843,7 +843,13 @@ def _get_vision_info(
         temporal_patch_size = vision_config.temporal_patch_size
 
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}
 
         if do_resize:
             resized_height, resized_width = smart_resize(
@@ -930,7 +936,14 @@ def get_image_size_with_most_features(
             image_processor = self.get_image_processor()
 
             mm_kwargs = self.ctx.get_merged_mm_kwargs({})
-            size = mm_kwargs.get("size", image_processor.size)
+            size = image_processor.size
+            if override_size := mm_kwargs.get("size"):
+                size = size | override_size
+            if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+                size = size | {"shortest_edge": override_min_pixels}
+            if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+                size = size | {"longest_edge": override_max_pixels}
+
             max_pixels = size["longest_edge"]
 
         unit = patch_size * merge_size
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 1a017e56161e..304553ed3a91 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -647,7 +647,13 @@ def _get_vision_info(
         temporal_patch_size = vision_config.temporal_patch_size
 
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}
 
         if do_resize:
             if is_video:

From 7fea7250a46c88c1ba9684d7774d2c4ac17c4b90 Mon Sep 17 00:00:00 2001
From: stingoChen <40136864+stingoChen@users.noreply.github.com>
Date: Thu, 26 Feb 2026 22:11:07 +0800
Subject: [PATCH 027/154] [Bug] Fix missing <think> tag after tool call in
 MiniMax 2.1 (#35352)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 冬马 <chenxinke@cai-inc.com>
Co-authored-by: 冬马 <chenxinke@cai-inc.com>
---
 vllm/reasoning/minimax_m2_reasoning_parser.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index d0333a76b202..e4deaed41caa 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -87,10 +87,15 @@ class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
     def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
         self.end_token_id = self.vocab.get("</think>")
+        self.start_token_id = self.vocab.get("<think>")
 
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         end_token_id = self.end_token_id
-        return any(input_id == end_token_id for input_id in reversed(input_ids))
+        start_token_id = self.start_token_id
+        for input_id in reversed(input_ids):
+            if input_id in (end_token_id, start_token_id):
+                return input_id == end_token_id
+        return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         return input_ids

From 111d8690699927af686fa6750cfbbc692a1f8740 Mon Sep 17 00:00:00 2001
From: Jakub Zakrzewski <jzakrzewski@nvidia.com>
Date: Thu, 26 Feb 2026 15:17:17 +0100
Subject: [PATCH 028/154] [Model] Add nvidia/llama-nemotron-embed-vl-1b-v2
 multimodal embedding model (#35297)

Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com>
---
 docs/models/pooling_models.md                 |  61 ++++
 docs/models/supported_models.md               |   1 +
 .../embed/template/nemotron_embed_vl.jinja    |  20 ++
 .../pooling/test_llama_nemotron_vl_embed.py   | 148 +++++++++
 tests/models/registry.py                      |   3 +
 vllm/model_executor/models/config.py          |  37 +++
 vllm/model_executor/models/nemotron_vl.py     | 302 ++++++++++++++++--
 vllm/model_executor/models/registry.py        |   4 +
 8 files changed, 545 insertions(+), 31 deletions(-)
 create mode 100644 examples/pooling/embed/template/nemotron_embed_vl.jinja
 create mode 100644 tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py

diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index a65bf4db5c95..120addba2830 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -498,6 +498,67 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
 - Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
 - Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
 
+### Llama Nemotron Multimodal Embedding Models
+
+Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
+(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
+single-vector embeddings from text and/or images.
+
+| Architecture | Backbone | Example HF Models |
+|---|---|---|
+| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
+
+Start the server:
+
+```shell
+vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
+    --trust-remote-code \
+    --chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
+```
+
+!!! note
+    The chat template bundled with this model's tokenizer is not suitable for
+    the embeddings API. Use the provided override template above when serving
+    with the `messages`-based (chat-style) embeddings endpoint.
+
+    The override template uses the message `role` to automatically prepend the
+    appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
+    or `"document"` for passages (prepends `passage: `). Any other role omits
+    the prefix.
+
+Embed text queries:
+
+```shell
+curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
+    "model": "nvidia/llama-nemotron-embed-vl-1b-v2",
+    "messages": [
+        {
+            "role": "query",
+            "content": [
+                {"type": "text", "text": "What is machine learning?"}
+            ]
+        }
+    ]
+}'
+```
+
+Embed images via the chat-style `messages` field:
+
+```shell
+curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
+    "model": "nvidia/llama-nemotron-embed-vl-1b-v2",
+    "messages": [
+        {
+            "role": "document",
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        }
+    ]
+}'
+```
+
 ### BAAI/bge-m3
 
 The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index d184041f33ca..5f821ef7a4b4 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -821,6 +821,7 @@ The following table lists those that are tested in vLLM.
 |--------------|--------|--------|-------------------|----------------------|---------------------------|
 | `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
 | `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
+| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
 | `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
 | `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
 | `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
diff --git a/examples/pooling/embed/template/nemotron_embed_vl.jinja b/examples/pooling/embed/template/nemotron_embed_vl.jinja
new file mode 100644
index 000000000000..0e5f8f481f2e
--- /dev/null
+++ b/examples/pooling/embed/template/nemotron_embed_vl.jinja
@@ -0,0 +1,20 @@
+{%- if messages | length > 1 -%}
+    {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+
+{% set vars = namespace(prefix='', images=[], texts=[]) %}
+{%- for message in messages -%}
+    {%- if message['role'] == 'query' -%}
+        {%- set vars.prefix = 'query: ' %}
+    {%- elif message['role'] == 'document' -%}
+        {%- set vars.prefix = 'passage: ' %}
+    {%- endif -%}
+    {%- for content in message['content'] -%}
+        {%- if content['type'] == 'text' -%}
+            {%- set vars.texts = vars.texts + [content['text']] %}
+        {%- elif content['type'] == 'image' -%}
+            {%- set vars.images = vars.images + ['<image> '] %}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endfor -%}
+{{- bos_token }}{{ vars.prefix }}{{ (vars.images + vars.texts) | join('') }}
diff --git a/tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py b/tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py
new file mode 100644
index 000000000000..b02d77b9b7dc
--- /dev/null
+++ b/tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2).
+
+This model uses SigLIP vision encoder with bidirectional LLaMA for embeddings.
+"""
+
+import pytest
+import torch
+from transformers import AutoModel
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_embeddings_close
+
+# Prefixes used by the model API
+QUERY_PREFIX = "query: "
+PASSAGE_PREFIX = "passage: "
+
+# Text prompts for text-only embedding
+HF_TEXT_PROMPTS = [
+    # T -> X (text embedding queries)
+    f"{QUERY_PREFIX}The label of the object is stop sign",
+    f"{QUERY_PREFIX}cherry blossom",
+]
+
+# Image prompts using the model's expected format
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        # I -> X (image embedding as passage/document)
+        "stop_sign": f"{PASSAGE_PREFIX}<image>",
+        "cherry_blossom": f"{PASSAGE_PREFIX}<image>",
+    }
+)
+
+MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]
+
+
+def _run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    input_texts: list[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Run embedding comparison test between HF and vLLM.
+
+    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.
+    """
+    # Run vLLM inference first
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=2048,
+        enforce_eager=True,
+        trust_remote_code=True,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+
+    # Run HF inference using the model's encode_queries/encode_documents API
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
+        hf_outputs = []
+        for text, image in zip(input_texts, input_images):
+            with torch.inference_mode():
+                if text.startswith(QUERY_PREFIX):
+                    # Strip prefix and use encode_queries for query texts
+                    query_text = text[len(QUERY_PREFIX) :]
+                    embedding = hf_model.model.encode_queries([query_text])
+                elif text.startswith(PASSAGE_PREFIX):
+                    # Strip prefix and use encode_documents for passages/images
+                    passage_text = text[len(PASSAGE_PREFIX) :]
+                    if image is not None:
+                        # Image document - pass image to encode_documents
+                        embedding = hf_model.model.encode_documents(
+                            images=[image],
+                            texts=[passage_text],
+                        )
+                    else:
+                        # Text-only document
+                        embedding = hf_model.model.encode_documents(
+                            texts=[passage_text]
+                        )
+                else:
+                    raise ValueError(
+                        f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
+                    )
+
+                hf_outputs.append(embedding[0].tolist())
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_models_text(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    """Test text-only embedding."""
+    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,  # type: ignore
+        model,
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    """Test image embedding."""
+    input_texts_images = [
+        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,
+        model,
+        dtype=dtype,
+    )
diff --git a/tests/models/registry.py b/tests/models/registry.py
index c522ce58bf04..0978c93dac5e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -598,6 +598,9 @@ def check_available_online(
     "ColModernVBertForRetrieval": _HfExamplesInfo(
         "ModernVBERT/colmodernvbert-merged",
     ),
+    "LlamaNemotronVLModel": _HfExamplesInfo(
+        "nvidia/llama-nemotron-embed-vl-1b-v2", trust_remote_code=True
+    ),
     "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
     "Phi3VForCausalLM": _HfExamplesInfo(
         "TIGER-Lab/VLM2Vec-Full", trust_remote_code=True
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 27cf3a7929fc..ea0f118a0d2f 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -112,6 +112,42 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
         model_config.pooler_config.seq_pooling_type = pooling_type
 
 
+class LlamaNemotronVLConfig(VerifyAndUpdateConfig):
+    """Config handler for LlamaNemotronVL embedding models."""
+
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        from vllm.config.pooler import SequencePoolingType
+
+        hf_config = model_config.hf_config
+
+        # Set bidirectional attention on the language model config
+        hf_config.is_causal = False
+        if hasattr(hf_config, "llm_config"):
+            hf_config.llm_config.is_causal = False
+
+        if hasattr(hf_config, "vision_config"):
+            hf_config.patch_size = hf_config.vision_config.patch_size
+
+        # Set up pooling type
+        pooling_type_map: dict[str, SequencePoolingType] = {
+            "avg": "MEAN",
+            "cls": "CLS",
+            "last": "LAST",
+        }
+
+        # Get pooling type from config (check both top-level and llm_config)
+        pooling = getattr(hf_config, "pooling", None)
+        if pooling is None and hasattr(hf_config, "llm_config"):
+            pooling = getattr(hf_config.llm_config, "pooling", "avg")
+
+        pooling_type = pooling_type_map.get(pooling)
+        if pooling_type is None:
+            raise ValueError(f"pool_type {pooling!r} not supported")
+
+        model_config.pooler_config.seq_pooling_type = pooling_type
+
+
 class NomicBertModelConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -619,6 +655,7 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
     "Gemma3TextModel": Gemma3TextModelConfig,
     "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig,
     "LlamaBidirectionalModel": LlamaBidirectionalConfig,
+    "LlamaNemotronVLModel": LlamaNemotronVLConfig,
     "NomicBertModel": NomicBertModelConfig,
     "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
     "Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 7b87b6160b43..bef083c503eb 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -18,6 +18,7 @@
 from transformers.image_processing_utils_fast import BaseImageProcessorFast
 
 from vllm.config import VllmConfig
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.models.internvl import (
@@ -30,12 +31,14 @@
     InternVLProcessor,
 )
 from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.processing import PromptUpdateDetails
 from vllm.sequence import IntermediateTensors
 from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.processor import cached_image_processor_from_config
+from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -43,11 +46,13 @@
     SupportsMultiModal,
     SupportsPP,
 )
-from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
-
-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<image>"
+from .interfaces_base import VllmModelForPooling
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 
 
 def build_transform(input_size: int):
@@ -183,10 +188,12 @@ def image_to_pixel_values_nemotron_vl(
     min_num: int,
     max_num: int,
     use_thumbnail: bool,
+    transform: T.Compose | None = None,
 ) -> torch.Tensor:
     target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
 
-    transform = build_transform(input_size=input_size)
+    if transform is None:
+        transform = build_transform(input_size=input_size)
 
     images = dynamic_preprocess_nemotron_vl(
         image,
@@ -200,11 +207,15 @@ def image_to_pixel_values_nemotron_vl(
 
 
 class NemotronVLProcessor(InternVLProcessor):
+    IMG_START = "<img>"
+    IMG_END = "</img>"
+    IMG_CONTEXT = "<image>"
+
     def __init__(
         self,
         config: PretrainedConfig,
         tokenizer: TokenizerLike,
-        image_processor: BaseImageProcessorFast,
+        image_processor: BaseImageProcessorFast | None = None,
         *,
         min_dynamic_patch: int | None = None,
         max_dynamic_patch: int | None = None,
@@ -236,11 +247,18 @@ def __init__(
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
         self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = self.image_processor.use_thumbnail
+
+        if image_processor is not None:
+            self.use_thumbnail = image_processor.use_thumbnail
+        else:
+            self.use_thumbnail = getattr(config, "use_thumbnail", True)
 
     @property
     def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
+        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
+
+    def _get_transform(self) -> T.Compose:
+        return build_transform(input_size=self.image_size)
 
     def get_num_image_tokens(
         self,
@@ -283,10 +301,26 @@ def _images_to_pixel_values_lst(
                 min_num=min_num,
                 max_num=max_num,
                 use_thumbnail=self.use_thumbnail,
+                transform=self._get_transform(),
             )
             for image in images
         ]
 
+    def _replace_image_tokens(
+        self,
+        text: list[str],
+        pixel_values_lst: list[torch.Tensor],
+    ) -> list[str]:
+        """Replace <image> placeholders with image tokens."""
+        for pixel_values in pixel_values_lst:
+            num_patches = pixel_values.shape[0]
+            feature_size = num_patches * self.num_image_token
+            image_repl = self.get_image_repl(feature_size, num_patches)
+            # Use temporary placeholder to avoid replacing tokens we just inserted
+            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
+            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
+        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
+
     def _preprocess_image(
         self,
         text: list[str],
@@ -311,15 +345,7 @@ def _preprocess_image(
                 ),
             }
 
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-                image_repl = self.get_image_repl(feature_size, num_patches)
-                NVL_IMAGE_CONTEXT = image_repl.full.replace(
-                    "<image>", "<NVL_IMG_CONTEXT>"
-                )
-                text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
-            text = [t.replace("<NVL_IMG_CONTEXT>", IMG_CONTEXT) for t in text]
+            text = self._replace_image_tokens(text, pixel_values_lst)
         return text, image_inputs
 
     def get_image_repl(
@@ -327,10 +353,10 @@ def get_image_repl(
         feature_size: int,
         num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
+        repl_features = self.IMG_CONTEXT * feature_size
+        repl_full = self.IMG_START + repl_features + self.IMG_END
 
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
+        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
 
 
 class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
@@ -396,7 +422,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         with self._mark_language_model(vllm_config):
             self.language_model = init_vllm_registered_model(
                 vllm_config=vllm_config,
-                hf_config=config.text_config,
+                hf_config=config.get_text_config(),
                 prefix=maybe_prefix(prefix, "language_model"),
             )
 
@@ -413,7 +439,7 @@ def _patch_quant_config(
         # the awq models from OpenGVLab missing `modules_to_not_convert`
         # patch the quant_config to add `modules_to_not_convert` back
         if isinstance(quant_config, AWQConfig):
-            text_config = config.text_config
+            text_config = config.get_text_config()
             llm_quant_config = getattr(text_config, "quantization_config", None)
             if (not quant_config.modules_to_not_convert) and (
                 llm_quant_config is not None
@@ -429,10 +455,17 @@ def _init_vision_model(
     ):
         return AutoModel.from_config(config.vision_config, trust_remote_code=True)
 
-    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
-        vit_hidden_size = config.vit_hidden_size
-        vision_projection_hidden_size = config.projector_hidden_size
-        llm_hidden_size = config.text_config.hidden_size
+    def _init_mlp1(
+        self,
+        config: PretrainedConfig,
+        vit_hidden_size: int | None = None,
+        vision_projection_hidden_size: int | None = None,
+    ) -> nn.Module:
+        if vit_hidden_size is None:
+            vit_hidden_size = config.vit_hidden_size
+        if vision_projection_hidden_size is None:
+            vision_projection_hidden_size = config.projector_hidden_size
+        llm_hidden_size = config.get_text_config().hidden_size
 
         return nn.Sequential(
             nn.LayerNorm(
@@ -465,10 +498,18 @@ def pixel_shuffle(self, x, scale_factor=0.5):
             x = x.permute(0, 2, 1, 3).contiguous()
         return x
 
+    def _call_vision_model(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """Call vision model and return embeddings.
+
+        Override this method in subclasses to handle different vision model
+        interfaces (e.g., SigLIP vs C-RADIO).
+        """
+        vit_embeds = self.vision_model(x=pixel_values).features
+        return vit_embeds.to(dtype=torch.bfloat16)
+
     def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
         # https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/modeling.py#L177
-        vit_embeds = self.vision_model(x=pixel_values).features
-        vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
+        vit_embeds = self._call_vision_model(pixel_values)
 
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
@@ -523,15 +564,16 @@ def _process_image_input(
         image_embeds = self.extract_feature(image_input["pixel_values_flat"])
 
         num_patches = image_input["num_patches"]
+        hidden_size = self.config.get_text_config().hidden_size
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return (image_embeds.view(-1, self.config.text_config.hidden_size),)
+            return (image_embeds.view(-1, hidden_size),)
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
         feature_size = image_embeds.shape[1]
-        image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
+        image_embeds = image_embeds.view(-1, hidden_size)
         image_feature_sizes = [
             num_patches * feature_size for num_patches in num_patches
         ]
@@ -643,3 +685,201 @@ def get_mm_mapping(self) -> MultiModelKeys:
             connector="mlp1",
             tower_model="vision_model",
         )
+
+
+# --------------------------------------------------------
+# LlamaNemotronVL Embedding Model (nvidia/llama-nemotron-embed-vl-1b-v2)
+# Extends LlamaNemotronVLChatModel for embedding/pooling tasks:
+#   - SigLIP vision encoder (instead of C-RADIO)
+#   - Bidirectional (non-causal) LLaMA language model
+#   - Pooler output instead of generative logits
+# --------------------------------------------------------
+
+# SigLIP normalization constants
+SIGLIP_MEAN = (0.5, 0.5, 0.5)
+SIGLIP_STD = (0.5, 0.5, 0.5)
+
+
+def build_siglip_transform(input_size: int):
+    """Build transform for SigLIP vision encoder with normalization.
+
+    Extends the base transform from nemotron_vl with SigLIP-specific normalization.
+    """
+    base_transform = build_transform(input_size=input_size)
+    return T.Compose(
+        [
+            base_transform,
+            T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
+        ]
+    )
+
+
+class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
+    """
+    Processor for LlamaNemotronVL embedding model.
+
+    Inherits from NemotronVLProcessor and specializes it for embedding tasks:
+    - Uses SigLIP transform with normalization instead of base transform
+    - Uses different image context token (<IMG_CONTEXT> vs <image>)
+    """
+
+    IMG_CONTEXT = "<IMG_CONTEXT>"
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        processor_config: dict,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> None:
+        if min_dynamic_patch is None:
+            min_dynamic_patch = processor_config.get(
+                "min_input_tiles",
+                getattr(config, "min_dynamic_patch", 1),
+            )
+        if max_dynamic_patch is None:
+            max_dynamic_patch = processor_config.get(
+                "max_input_tiles",
+                getattr(config, "max_dynamic_patch", 1),
+            )
+        if dynamic_image_size is None:
+            dynamic_image_size = processor_config.get(
+                "dynamic_image_size",
+                getattr(config, "dynamic_image_size", True),
+            )
+        super().__init__(
+            config=config,
+            tokenizer=tokenizer,
+            image_processor=None,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+    def _get_transform(self) -> T.Compose:
+        """Override to add SigLIP normalization."""
+        return build_siglip_transform(input_size=self.image_size)
+
+    def _replace_image_tokens(
+        self,
+        text: list[str],
+        pixel_values_lst: list[torch.Tensor],
+    ) -> list[str]:
+        """Override with simpler token replacement for embedding model.
+
+        No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
+        not <image>, so there's no collision risk.
+        """
+        for pixel_values in pixel_values_lst:
+            num_patches = pixel_values.shape[0]
+            feature_size = num_patches * self.num_image_token
+            image_repl = self.get_image_repl(feature_size, num_patches)
+            text = [t.replace("<image>", image_repl.full, 1) for t in text]
+        return text
+
+
+class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
+    """Processing info for LlamaNemotronVL embedding model."""
+
+    def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
+        """Override to create embedding-specific processor without image_processor."""
+        model_config = self.ctx.model_config
+        processor_config = {}
+        if model_config.model is not None:
+            processor_config = (
+                get_hf_file_to_dict(
+                    "processor_config.json",
+                    model_config.model,
+                    model_config.revision,
+                )
+                or {}
+            )
+
+        return self.ctx.init_processor(
+            LlamaNemotronVLEmbedProcessor,
+            config=self.get_hf_config(),
+            tokenizer=self.get_tokenizer(),
+            processor_config=processor_config,
+            **kwargs,
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    BaseInternVLMultiModalProcessor[LlamaNemotronVLEmbedProcessingInfo],
+    info=LlamaNemotronVLEmbedProcessingInfo,
+    dummy_inputs=BaseInternVLDummyInputsBuilder[LlamaNemotronVLEmbedProcessingInfo],
+)
+class LlamaNemotronVLForEmbedding(LlamaNemotronVLChatModel, VllmModelForPooling):
+    """
+    LlamaNemotronVL model for embeddings.
+
+    Inherits from LlamaNemotronVLChatModel and specializes it for embedding tasks:
+    - Uses SigLIP vision encoder instead of C-RADIO
+    - Uses bidirectional LLaMA (via llm_config) instead of causal LLaMA
+    - Adds pooler for embedding output instead of generating logits
+    """
+
+    is_pooling_model = True
+
+    # Weight mapping from checkpoint format to vLLM format
+    # Different from parent class due to different vision model structure
+    weight_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # Language model mapping
+            "language_model.layers.": "language_model.model.layers.",
+            "language_model.embed_tokens.": "language_model.model.embed_tokens.",
+            "language_model.norm.": "language_model.model.norm.",
+            # Vision model mapping (SiglipVisionModel has nested vision_model)
+            "vision_model.encoder.": "vision_model.vision_model.encoder.",
+            "vision_model.embeddings.": "vision_model.vision_model.embeddings.",
+            "vision_model.post_layernorm.": "vision_model.vision_model.post_layernorm.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        config = vllm_config.model_config.hf_config
+
+        # Override: get img_context_token_id from config (parent sets None)
+        self.img_context_token_id = getattr(config, "img_context_token_id", None)
+
+        # Initialize pooler for embedding output
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        self.pooler = DispatchPooler.for_embedding(pooler_config)
+
+    def _init_vision_model(
+        self,
+        config: PretrainedConfig,
+        quant_config,
+        *,
+        prefix: str,
+    ) -> nn.Module:
+        """Override to use SigLIP instead of C-RADIO."""
+        return SiglipVisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=prefix,
+            use_head=False,
+        )
+
+    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
+        """Override to use different MLP structure for embedding model."""
+        return super()._init_mlp1(
+            config,
+            vit_hidden_size=config.vision_config.hidden_size,
+            vision_projection_hidden_size=config.get_text_config().hidden_size,
+        )
+
+    def _call_vision_model(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """Override to handle SigLIP interface."""
+        return self.vision_model(pixel_values)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Override to use different weight mapping for SigLIP."""
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.weight_mapper)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 6bb8423db6f4..cc871f9d344a 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -260,6 +260,10 @@
     "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
     "Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"),
     "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
+    "LlamaNemotronVLModel": (
+        "nemotron_vl",
+        "LlamaNemotronVLForEmbedding",
+    ),
     # Technically Terratorch models work on images, both in
     # input and output. I am adding it here because it piggy-backs on embedding
     # models for the time being.

From 05972ea7e5f81250cc4ceaae8a174cfffe7755ac Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:57:56 -0500
Subject: [PATCH 029/154] [Refactor] Remove dead or duplicate func utils or
 variables (#35318)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 benchmarks/backend_request_func.py            |  6 --
 benchmarks/benchmark_utils.py                 | 71 -------------------
 benchmarks/cutlass_benchmarks/utils.py        | 13 ----
 benchmarks/disagg_benchmarks/rate_limiter.py  | 45 ------------
 benchmarks/disagg_benchmarks/request_queue.py | 39 ----------
 .../layers/quantization/ptpc_fp8.py           |  5 --
 .../layers/quantization/utils/marlin_utils.py | 18 -----
 .../models/hyperclovax_vision.py              |  1 -
 vllm/v1/engine/core.py                        |  1 -
 9 files changed, 199 deletions(-)
 delete mode 100644 benchmarks/disagg_benchmarks/rate_limiter.py
 delete mode 100644 benchmarks/disagg_benchmarks/request_queue.py

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 831b76b66e09..a69637bfc437 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -649,9 +649,3 @@ def get_tokenizer(
     "sglang": async_request_openai_completions,
     "llama.cpp": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k
-    for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions, async_request_openai_chat_completions)
-]
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index f0d661f9d534..5865473e9542 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,78 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import json
-import math
-import os
 import time
 from types import TracebackType
-from typing import Any
-
-
-def convert_to_pytorch_benchmark_format(
-    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
-) -> list:
-    """
-    Save the benchmark results in the format used by PyTorch OSS benchmark with
-    on metric per record
-    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-    """
-    records = []
-    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        return records
-
-    for name, benchmark_values in metrics.items():
-        record = {
-            "benchmark": {
-                "name": "vLLM benchmark",
-                "extra_info": {
-                    "args": vars(args),
-                },
-            },
-            "model": {
-                "name": args.model,
-            },
-            "metric": {
-                "name": name,
-                "benchmark_values": benchmark_values,
-                "extra_info": extra_info,
-            },
-        }
-
-        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
-        # Save tensor_parallel_size parameter if it's part of the metadata
-        if not tp and "tensor_parallel_size" in extra_info:
-            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
-                extra_info["tensor_parallel_size"]
-            )
-
-        records.append(record)
-
-    return records
-
-
-class InfEncoder(json.JSONEncoder):
-    def clear_inf(self, o: Any):
-        if isinstance(o, dict):
-            return {k: self.clear_inf(v) for k, v in o.items()}
-        elif isinstance(o, list):
-            return [self.clear_inf(v) for v in o]
-        elif isinstance(o, float) and math.isinf(o):
-            return "inf"
-        return o
-
-    def iterencode(self, o: Any, *args, **kwargs) -> Any:
-        return super().iterencode(self.clear_inf(o), *args, **kwargs)
-
-
-def write_to_json(filename: str, records: list) -> None:
-    with open(filename, "w") as f:
-        json.dump(
-            records,
-            f,
-            cls=InfEncoder,
-            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
-        )
 
 
 # Collect time and generate time metrics
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
index b4f3c6bf94ed..6cbcf6b68c89 100644
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Cutlass bench utils
-from collections.abc import Iterable
 
 import torch
 
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
 
     # Compressed B, Metadata, Original A, B
     return b_compressed, e, a, b
-
-
-def make_n_rand_sparse_tensors(
-    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
-) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
-    ABs = []
-    for _ in range(num_tensors):
-        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
-        if b_comp is not None:
-            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
-    BComps, Es, As, Bs = zip(*ABs)
-    return list(BComps), list(Es), list(As), list(Bs)
diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py b/benchmarks/disagg_benchmarks/rate_limiter.py
deleted file mode 100644
index 87ac8cb6ab1a..000000000000
--- a/benchmarks/disagg_benchmarks/rate_limiter.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-
-
-class RateLimiter:
-    """Token bucket rate limiter implementation"""
-
-    def __init__(self, rate_limit):
-        self.rate_limit = rate_limit  # Requests per second
-        self.num_available_tokens = rate_limit  # Available tokens
-        self.last_refill = time.monotonic()  # Last token refill time
-        self.lock = asyncio.Lock()  # Synchronization lock
-
-    async def acquire(self):
-        """Acquire a token from the rate limiter"""
-        while True:
-            async with self.lock:
-                current_time = time.monotonic()
-                elapsed = current_time - self.last_refill
-
-                # Refill num_available_tokens if more than 1 second has passed
-                if elapsed > 1.0:
-                    self.num_available_tokens = self.rate_limit
-                    self.last_refill = current_time
-
-                # Check if num_available_tokens are available
-                if self.num_available_tokens > 0:
-                    self.num_available_tokens -= 1
-                    return True
-
-                # Calculate wait time if no num_available_tokens available
-                wait_time = 1.0 - elapsed
-            await asyncio.sleep(wait_time)
-
-    async def __aenter__(self):
-        """Enter async context manager - acquire token"""
-        await self.acquire()
-        return self
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        """Exit async context manager - no cleanup needed"""
-        pass
diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py
deleted file mode 100644
index 410bcb956050..000000000000
--- a/benchmarks/disagg_benchmarks/request_queue.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-from collections import deque
-
-
-class RequestQueue:
-    """Request queue manager with concurrency control"""
-
-    def __init__(self, max_concurrent, max_queue_size):
-        # Maximum concurrent requests
-        self.max_concurrent = max_concurrent
-        self.max_queue_size = max_queue_size  # Maximum queue size
-        # Concurrency control
-        self.semaphore = asyncio.Semaphore(max_concurrent)
-        self.queue = deque()  # Request queue
-        self.queue_size = 0  # Current queue size
-        self.lock = asyncio.Lock()  # Sync queue Lock
-
-    async def enqueue(self, task):
-        """Add a request task to the queue"""
-        async with self.lock:
-            if self.queue_size >= self.max_queue_size:
-                return False
-
-            self.queue.append(task)
-            self.queue_size += 1
-            return True
-
-    async def process(self):
-        """Process queued requests using semaphore for concurrency control"""
-        while True:
-            if self.queue:
-                async with self.semaphore, self.lock:
-                    task = self.queue.popleft()
-                    self.queue_size -= 1
-                    await task
-            await asyncio.sleep(0.01)  # Yield control to event loop
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 76410f2e4d5b..5d7b7b54adc8 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -7,7 +7,6 @@
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
-from vllm.logger import init_logger
 from vllm.model_executor.kernels.linear import (
     init_fp8_linear_kernel,
 )
@@ -26,10 +25,6 @@
 )
 from vllm.platforms import current_platform
 
-ACTIVATION_SCHEMES = ["static", "dynamic"]
-
-logger = init_logger(__name__)
-
 
 class PTPCFp8Config(Fp8Config):
     """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index c1147725cbb5..23ccfc536ebc 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
     return w2_packed.size(1) * marlin_tile_size
 
 
-def marlin_make_workspace(
-    output_size_per_partition: int, device: torch.device
-) -> torch.Tensor:
-    max_workspace_size = (
-        output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
-    ) * GPTQ_MARLIN_MAX_PARALLEL
-
-    return torch.zeros(
-        max_workspace_size, dtype=torch.int, device=device, requires_grad=False
-    )
-
-
 def marlin_make_workspace_new(
     device: torch.device, max_blocks_per_sm: int = 1
 ) -> torch.Tensor:
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
     )
 
 
-def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(
-        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
-    )
-
-
 def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 1fb0d5e5dd76..5b0dfe457d65 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -49,7 +49,6 @@
 )
 from .vision import get_vision_encoder_info
 
-EOT = "<|endofturn|>"
 IMAGE_TOKEN: str = "<|dummy3|>"
 VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index a55f1975e95d..39515cab7742 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -72,7 +72,6 @@
 
 logger = init_logger(__name__)
 
-POLLING_TIMEOUT_S = 2.5
 HANDSHAKE_TIMEOUT_MINS = 5
 
 _R = TypeVar("_R")  # Return type for collective_rpc

From ec8ab9d254d3b2e6b919a55277da599a7b9ab146 Mon Sep 17 00:00:00 2001
From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:00:49 -0600
Subject: [PATCH 030/154] [ROCm] Add dynamic mxfp4 quantization for DeepSeek V2
 projection layers (#34157)

Signed-off-by: Doug Lehr <douglehr@amd.com>
Signed-off-by: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
---
 .../layers/quantization/quark/quark.py        | 48 ++++++++-
 .../quark/schemes/quark_ocp_mx.py             | 97 ++++++++++++-------
 2 files changed, 106 insertions(+), 39 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 36f20c89ff11..dedc7db380f8 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -35,6 +35,7 @@
 )
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.platforms import current_platform
+from vllm.transformers_utils.config import get_config
 
 if TYPE_CHECKING:
     from vllm.model_executor.models.utils import WeightsMapper
@@ -59,6 +60,22 @@ def __init__(
         self.kv_cache_group = kv_cache_group
         self.kv_cache_config = kv_cache_config
         self.pack_method = pack_method
+        self.dynamic_mxfp4_quant = False
+
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
+        self.hf_config = get_config(
+            model=model_name,
+            trust_remote_code=False,  # or get from model_config if available
+            revision=revision,
+            config_format="auto",
+        )
+
+        quant_config = getattr(self.hf_config, "quantization_config", None)
+        if quant_config is not None:
+            quant_dtype = quant_config["global_quant_config"]["weight"]["dtype"]
+            model_type = self.hf_config.model_type
+            if quant_dtype == "fp4" and model_type == "deepseek_v3":
+                self.dynamic_mxfp4_quant = True
 
     def get_linear_method(self) -> "QuarkLinearMethod":
         return QuarkLinearMethod(self)
@@ -108,7 +125,20 @@ def get_quant_method(
         if should_ignore_layer(
             prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping
         ):
-            return UnquantizedLinearMethod()
+            if (
+                "self_attn" not in prefix  # only quantize attention projections
+                or not getattr(self, "dynamic_mxfp4_quant", False)
+                or not isinstance(layer, LinearBase)  # Ignore other methods
+            ):
+                return UnquantizedLinearMethod()
+
+            scheme = self.get_scheme(
+                layer=layer,
+                layer_name=prefix,
+                dynamic_mxfp4_quant=True,
+            )
+            layer.scheme = scheme
+            return QuarkLinearMethod(self)
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
             layer.scheme = scheme
@@ -450,7 +480,9 @@ def _matches_pattern(layer_name, pattern):
             )
             return global_quant_config
 
-    def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme":
+    def _get_scheme_from_config(
+        self, config: dict[str, Any], dynamic_mxfp4_quant: bool = False
+    ) -> "QuarkScheme":
         if config.get("output_tensors") or config.get("bias"):
             raise NotImplementedError(
                 "Currently, Quark models with output_tensors "
@@ -473,7 +505,9 @@ def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme":
                 input_symmetric=input_config.get("symmetric"),
             )
         elif self._is_w_ocp_mx_a_x(weight_config, input_config):
-            return QuarkOCP_MX(weight_config, input_config)
+            return QuarkOCP_MX(
+                weight_config, input_config, dynamic_mxfp4_quant=dynamic_mxfp4_quant
+            )
 
         raise NotImplementedError(
             "No quark compatible scheme was found. "
@@ -481,11 +515,15 @@ def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme":
             f"Input config: {input_config}"
         )
 
-    def get_scheme(self, layer: torch.nn.Module, layer_name: str) -> "QuarkScheme":
+    def get_scheme(
+        self, layer: torch.nn.Module, layer_name: str, dynamic_mxfp4_quant: bool = False
+    ) -> "QuarkScheme":
         layer_quant_config = self._find_matched_config(layer_name, layer)
 
         # Find the quant_scheme
-        scheme = self._get_scheme_from_config(layer_quant_config)
+        scheme = self._get_scheme_from_config(
+            layer_quant_config, dynamic_mxfp4_quant=dynamic_mxfp4_quant
+        )
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
         self._check_scheme_supported(scheme.get_min_capability())
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
index c5f50122eb7c..6917bb6f2deb 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -24,7 +24,12 @@
     OCP_MX_BLOCK_SIZE,
     OCP_MX_Scheme,
 )
-from vllm.model_executor.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from vllm.model_executor.parameter import (
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+    PackedvLLMParameter,
+)
+from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
 from .quark_scheme import QuarkScheme
@@ -169,13 +174,16 @@ def gemm_with_dynamic_quant_fake(
 
 class QuarkOCP_MX(QuarkScheme):
     def __init__(
-        self, weight_quant_spec: dict[str, Any], input_quant_spec: dict[str, Any]
+        self,
+        weight_quant_spec: dict[str, Any],
+        input_quant_spec: dict[str, Any],
+        dynamic_mxfp4_quant: bool = False,
     ):
         self.out_dtype = torch.get_default_dtype()
         self.qscheme = "per_group"
         self.weight_quant_spec = weight_quant_spec
         self.input_quant_spec = input_quant_spec
-
+        self.dynamic_mxfp4_quant = dynamic_mxfp4_quant
         self.weight_dtype = weight_quant_spec["dtype"].replace("fp", "mxfp")
         self.input_dtype = input_quant_spec["dtype"].replace("fp", "mxfp")
 
@@ -269,7 +277,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 layer.weight_scale.data, requires_grad=False
             )
         else:
-            if self.rocm_use_aiter_fp4_asm_gemm:
+            if self.dynamic_mxfp4_quant:
+                w_q, w_s = dynamic_mxfp4_quant(layer.weight)
+                layer.weight_scale = torch.nn.Parameter(
+                    w_s.T.contiguous(), requires_grad=False
+                )
+                layer.weight = torch.nn.Parameter(w_q, requires_grad=False)
+            elif self.rocm_use_aiter_fp4_asm_gemm:
                 # shuffle weight scale
                 weight_scale_shuffle = layer.weight_scale.data
                 sm, sn = weight_scale_shuffle.shape
@@ -302,36 +316,51 @@ def create_weights(
         weight_loader: Callable,
         **kwargs,
     ):
-        output_size_per_partition = sum(output_partition_sizes)
-        layer.logical_widths = output_partition_sizes
-
-        # WEIGHT
-        weight = PackedvLLMParameter(
-            data=torch.empty(
-                output_size_per_partition,
-                self.get_packed_dim(input_size_per_partition, self.weight_dtype),
-                dtype=torch.uint8,
-            ),
-            input_dim=1,
-            output_dim=0,
-            packed_dim=1,
-            packed_factor=self.packed_factor,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-        # WEIGHT SCALE
-        weight_scale = GroupQuantScaleParameter(
-            data=torch.empty(
-                output_size_per_partition,
-                input_size_per_partition // OCP_MX_BLOCK_SIZE,
-                dtype=torch.uint8,
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale", weight_scale)
+        if self.dynamic_mxfp4_quant:
+            weight = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition,
+                    dtype=params_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+
+            layer.register_parameter("weight", weight)
+            set_weight_attrs(weight, kwargs)
+        else:
+            output_size_per_partition = sum(output_partition_sizes)
+            layer.logical_widths = output_partition_sizes
+
+            # WEIGHT
+            weight = PackedvLLMParameter(
+                data=torch.empty(
+                    output_size_per_partition,
+                    self.get_packed_dim(input_size_per_partition, self.weight_dtype),
+                    dtype=torch.uint8,
+                ),
+                input_dim=1,
+                output_dim=0,
+                packed_dim=1,
+                packed_factor=self.packed_factor,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("weight", weight)
+
+            # WEIGHT SCALE
+            weight_scale = GroupQuantScaleParameter(
+                data=torch.empty(
+                    output_size_per_partition,
+                    input_size_per_partition // OCP_MX_BLOCK_SIZE,
+                    dtype=torch.uint8,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("weight_scale", weight_scale)
 
     def apply_weights(
         self,

From 9e2cabdf9c86e9fceca8842c8ea2a260281c31e8 Mon Sep 17 00:00:00 2001
From: Sage Moore <sage@neuralmagic.com>
Date: Thu, 26 Feb 2026 08:28:45 -0800
Subject: [PATCH 031/154] [ROCm] Update the torch version in rocm_build.txt to
 use the official 2.10 release (#34387)

Signed-off-by: Sage Moore <sage@neuralmagic.com>
---
 requirements/rocm-build.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index 01a71c2da38c..6f96c7d55742 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -1,7 +1,7 @@
 # Common dependencies
 -r common.txt
 
---extra-index-url https://download.pytorch.org/whl/test/rocm7.0
+--extra-index-url https://download.pytorch.org/whl/rocm7.1
 torch==2.10.0
 torchvision==0.25.0
 torchaudio==2.10.0
@@ -12,5 +12,5 @@ setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 wheel
 jinja2>=3.1.6
-amdsmi==6.4.3
+amdsmi==7.0.2
 timm>=1.0.17

From f2ad952f40a98e0bb7f89763c51a73124ccc20a6 Mon Sep 17 00:00:00 2001
From: Or Ozeri <oro@il.ibm.com>
Date: Thu, 26 Feb 2026 18:29:34 +0200
Subject: [PATCH 032/154] [BugFix][kv_offload]: Fix kernel block size detection
 (#35125)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
---
 vllm/v1/kv_offload/worker/cpu_gpu.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index a5abae51ef03..5cde5faa47ae 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -259,16 +259,20 @@ def __init__(
                 assert gpu_shape[0] == 2
                 split_k_and_v = True
 
-            try:
-                kv_cache_stride_order = attn_backend.get_kv_cache_stride_order(
-                    include_num_layers_dimension=has_layers_dim
-                )
-                assert len(kv_cache_stride_order) == len(gpu_shape)
-            except (AttributeError, NotImplementedError):
-                kv_cache_stride_order = tuple(range(len(gpu_shape)))
-
-            # permute test_shape according to stride_order
-            test_shape = tuple(test_shape[i] for i in kv_cache_stride_order)
+            if has_layers_dim:
+                # in the cross layers case, the registered kv cache tensor
+                # shape matches the physical layout, whereas test_shape
+                # is the logical layout.
+                # To match them, we need to permute test_shape
+                try:
+                    kv_cache_stride_order = attn_backend.get_kv_cache_stride_order(
+                        include_num_layers_dimension=has_layers_dim
+                    )
+                    assert len(kv_cache_stride_order) == len(gpu_shape)
+                except (AttributeError, NotImplementedError):
+                    kv_cache_stride_order = tuple(range(len(gpu_shape)))
+
+                test_shape = tuple(test_shape[i] for i in kv_cache_stride_order)
 
             # find block_size (16) dimension index
             block_size_idx = test_shape.index(16)

From ec8f943db1b8e5f3b32ed2ec29526b8a9a521088 Mon Sep 17 00:00:00 2001
From: hujia177 <hujia@meta.com>
Date: Thu, 26 Feb 2026 09:04:42 -0800
Subject: [PATCH 033/154] Add GlmOcrConfig for GLM-OCR model type recognition
 (#34982)

---
 vllm/transformers_utils/config.py           |  1 +
 vllm/transformers_utils/configs/__init__.py |  4 +
 vllm/transformers_utils/configs/glm_ocr.py  | 91 +++++++++++++++++++++
 3 files changed, 96 insertions(+)
 create mode 100644 vllm/transformers_utils/configs/glm_ocr.py

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 00129d52eaed..f5adb171b4dd 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -82,6 +82,7 @@ def __getitem__(self, key):
     deepseek_v32="DeepseekV3Config",
     flex_olmo="FlexOlmoConfig",
     funaudiochat="FunAudioChatConfig",
+    glm_ocr="GlmOcrConfig",
     hunyuan_vl="HunYuanVLConfig",
     isaac="IsaacConfig",
     kimi_linear="KimiLinearConfig",
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 541bc4de61b1..761f96a574e1 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -28,6 +28,8 @@
     "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
     "FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
     "FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
+    "GlmOcrConfig": "vllm.transformers_utils.configs.glm_ocr",
+    "GlmOcrVisionConfig": "vllm.transformers_utils.configs.glm_ocr",
     "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
@@ -83,6 +85,8 @@
     "FlexOlmoConfig",
     "FunAudioChatConfig",
     "FunAudioChatAudioEncoderConfig",
+    "GlmOcrConfig",
+    "GlmOcrVisionConfig",
     "HunYuanVLConfig",
     "HunYuanVLTextConfig",
     "HunYuanVLVisionConfig",
diff --git a/vllm/transformers_utils/configs/glm_ocr.py b/vllm/transformers_utils/configs/glm_ocr.py
new file mode 100644
index 000000000000..43656d276e28
--- /dev/null
+++ b/vllm/transformers_utils/configs/glm_ocr.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class GlmOcrVisionConfig(PretrainedConfig):
+    model_type = "glm_ocr_vision"
+
+    def __init__(
+        self,
+        hidden_size: int = 1024,
+        depth: int = 24,
+        num_heads: int = 16,
+        attention_bias: bool = True,
+        intermediate_size: int = 4096,
+        hidden_act: str = "silu",
+        hidden_dropout_prob: float = 0.0,
+        initializer_range: float = 0.02,
+        image_size: int = 336,
+        in_channels: int = 3,
+        patch_size: int = 14,
+        out_hidden_size: int = 1536,
+        rms_norm_eps: float = 1e-5,
+        spatial_merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.depth = depth
+        self.num_heads = num_heads
+        self.attention_bias = attention_bias
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.initializer_range = initializer_range
+        self.image_size = image_size
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.out_hidden_size = out_hidden_size
+        self.rms_norm_eps = rms_norm_eps
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+
+class GlmOcrConfig(PretrainedConfig):
+    model_type = "glm_ocr"
+
+    def __init__(
+        self,
+        text_config: dict | None = None,
+        vision_config: dict | None = None,
+        image_start_token_id: int = 59256,
+        image_end_token_id: int = 59257,
+        video_start_token_id: int = 59258,
+        video_end_token_id: int = 59259,
+        image_token_id: int = 59280,
+        video_token_id: int = 59281,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.image_start_token_id = image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.video_start_token_id = video_start_token_id
+        self.video_end_token_id = video_end_token_id
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_config = GlmOcrVisionConfig(**(vision_config or {}))
+
+        if isinstance(text_config, dict):
+            from transformers import AutoConfig
+
+            model_type = text_config.get("model_type", "chatglm")
+            self.text_config = AutoConfig.for_model(model_type, **text_config)
+        elif text_config is None:
+            from transformers import AutoConfig
+
+            self.text_config = AutoConfig.for_model("chatglm")
+        else:
+            self.text_config = text_config
+
+    def get_text_config(self) -> PretrainedConfig:
+        return self.text_config
+
+    def save_pretrained(self, save_directory, **kwargs):
+        self._auto_class = None
+        super().save_pretrained(save_directory, **kwargs)

From 99c7892c5bf20afc90e2ef0e1ad0a89637ae67a9 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 26 Feb 2026 12:14:54 -0500
Subject: [PATCH 034/154] [Perf] Optimize maxsim scores computation for pooling
 models, 13.9% E2E throughput improvement (#35330)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/entrypoints/pooling/score/test_utils.py | 40 +++++++++-
 vllm/entrypoints/pooling/score/serving.py     | 16 ++--
 vllm/entrypoints/pooling/score/utils.py       | 78 ++++++++++++++++++-
 3 files changed, 123 insertions(+), 11 deletions(-)

diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py
index d69da822dd06..e5e1fd606845 100644
--- a/tests/entrypoints/pooling/score/test_utils.py
+++ b/tests/entrypoints/pooling/score/test_utils.py
@@ -4,10 +4,15 @@
 from unittest.mock import patch
 
 import pytest
+import torch
 
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
-from vllm.entrypoints.pooling.score.utils import get_score_prompt
+from vllm.entrypoints.pooling.score.utils import (
+    compute_maxsim_score,
+    compute_maxsim_scores,
+    get_score_prompt,
+)
 from vllm.inputs import TokensPrompt
 from vllm.tokenizers import get_tokenizer
 
@@ -349,3 +354,36 @@ def test_post_process_tokens_called(
         assert_prompt_tokenization_consistent(
             cross_encoder_tokenizer, full_prompt, engine_prompt
         )
+
+
+def test_compute_maxsim_scores_matches_reference_per_pair() -> None:
+    generator = torch.Generator()
+    generator.manual_seed(7)
+
+    shared_query = torch.randn(5, 8, generator=generator)
+    q_embs = [
+        shared_query,  # 1:N style shared query
+        shared_query,
+        torch.randn(2, 8, generator=generator),
+        torch.randn(4, 8, generator=generator),
+    ]
+    d_embs = [
+        torch.randn(6, 8, generator=generator),
+        torch.randn(3, 8, generator=generator),
+        torch.randn(5, 8, generator=generator),
+        torch.randn(7, 8, generator=generator),
+    ]
+
+    batched_scores = compute_maxsim_scores(
+        q_embs,
+        d_embs,
+        max_batch_size=4,
+        max_score_matrix_elements=40,  # batch shrinking path.
+    )
+    reference_scores = [
+        compute_maxsim_score(q, d).to("cpu") for q, d in zip(q_embs, d_embs)
+    ]
+
+    assert len(batched_scores) == len(reference_scores)
+    for batched, reference in zip(batched_scores, reference_scores):
+        torch.testing.assert_close(batched, reference, rtol=1e-4, atol=1e-4)
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index 3fe18ca8b3a5..aec6e909d161 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -31,7 +31,7 @@
     ScoreInputs,
     _cosine_similarity,
     compress_token_type_ids,
-    compute_maxsim_score,
+    compute_maxsim_scores,
     get_score_prompt,
     parse_score_data_single,
     validate_score_input,
@@ -311,19 +311,17 @@ async def _late_interaction_score(
         # Compute MaxSim scores
         from vllm.outputs import PoolingOutput
 
+        maxsim_scores = compute_maxsim_scores(
+            [emb.outputs.data for emb in emb_data_1],
+            [emb.outputs.data for emb in emb_data_2],
+        )
+
         scores: list[PoolingRequestOutput] = []
         padding: list[int] = []
         if (pad_token_id := tokenizer.pad_token_id) is not None:
             padding = [pad_token_id]
 
-        for emb_1, emb_2 in zip(emb_data_1, emb_data_2):
-            # emb_1.outputs.data: [query_len, dim]
-            # emb_2.outputs.data: [doc_len, dim]
-            q_emb = emb_1.outputs.data
-            d_emb = emb_2.outputs.data
-
-            maxsim_score = compute_maxsim_score(q_emb, d_emb)
-
+        for emb_1, emb_2, maxsim_score in zip(emb_data_1, emb_data_2, maxsim_scores):
             tokens = emb_1.prompt_token_ids + padding + emb_2.prompt_token_ids
 
             scores.append(
diff --git a/vllm/entrypoints/pooling/score/utils.py b/vllm/entrypoints/pooling/score/utils.py
index 60e71ff73953..98c24856bf3c 100644
--- a/vllm/entrypoints/pooling/score/utils.py
+++ b/vllm/entrypoints/pooling/score/utils.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from typing import Any, TypeAlias, cast
 
 import torch
@@ -53,6 +53,82 @@ def compute_maxsim_score(q_emb: torch.Tensor, d_emb: torch.Tensor) -> torch.Tens
     return token_scores.amax(dim=-1).sum()
 
 
+def compute_maxsim_scores(
+    q_embs: Sequence[torch.Tensor],
+    d_embs: Sequence[torch.Tensor],
+    max_batch_size: int = 16,
+    max_score_matrix_elements: int = 16_000_000,
+) -> list[torch.Tensor]:
+    """Compute ColBERT MaxSim scores in padded mini-batches."""
+    if len(q_embs) != len(d_embs):
+        raise ValueError("q_embs and d_embs must have the same length")
+
+    num_pairs = len(q_embs)
+    if num_pairs == 0:
+        return []
+
+    for q_emb, d_emb in zip(q_embs, d_embs):
+        if q_emb.ndim != 2 or d_emb.ndim != 2:
+            raise ValueError("Each embedding tensor must be 2-D")
+        if q_emb.shape[1] != d_emb.shape[1]:
+            raise ValueError("Query and document embeddings must have same dim")
+
+    compute_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    scores: list[torch.Tensor] = []
+    start = 0
+    while start < num_pairs:
+        end = min(start + max_batch_size, num_pairs)
+        max_q = max(int(x.shape[0]) for x in q_embs[start:end])
+        max_d = max(int(x.shape[0]) for x in d_embs[start:end])
+
+        # keep score matrix bounded to avoid oversized allocations.
+        while (
+            end - start > 1
+            and (end - start) * max_q * max_d > max_score_matrix_elements
+        ):
+            end -= 1
+            max_q = max(int(x.shape[0]) for x in q_embs[start:end])
+            max_d = max(int(x.shape[0]) for x in d_embs[start:end])
+
+        batch_q = q_embs[start:end]
+        batch_d = d_embs[start:end]
+        batch_size = end - start
+        dim = int(batch_q[0].shape[1])
+        dtype = batch_q[0].dtype
+
+        q_batch = torch.zeros(
+            (batch_size, max_q, dim), dtype=dtype, device=compute_device
+        )
+        d_batch = torch.zeros(
+            (batch_size, max_d, dim), dtype=dtype, device=compute_device
+        )
+        q_mask = torch.zeros(
+            (batch_size, max_q), dtype=torch.bool, device=compute_device
+        )
+        d_mask = torch.zeros(
+            (batch_size, max_d), dtype=torch.bool, device=compute_device
+        )
+
+        # copy to padded tensors
+        for i, (q_emb, d_emb) in enumerate(zip(batch_q, batch_d)):
+            q_len = int(q_emb.shape[0])
+            d_len = int(d_emb.shape[0])
+            q_batch[i, :q_len] = q_emb.to(device=compute_device, dtype=dtype)
+            d_batch[i, :d_len] = d_emb.to(device=compute_device, dtype=dtype)
+            q_mask[i, :q_len] = True
+            d_mask[i, :d_len] = True
+
+        token_scores = torch.bmm(q_batch, d_batch.transpose(1, 2))
+        token_scores.masked_fill_(~d_mask.unsqueeze(1), float("-inf"))
+        max_per_query = token_scores.amax(dim=-1)
+        max_per_query.masked_fill_(~q_mask, 0)
+        batch_scores = max_per_query.sum(dim=-1).to("cpu")
+        scores.extend(batch_scores.unbind(0))
+        start = end
+
+    return [cast(torch.Tensor, score) for score in scores]
+
+
 class ScoreMultiModalParam(TypedDict, total=False):
     """
     A specialized parameter type for scoring multimodal content

From d940607629b03602f34ba4dd75c747162b01aedd Mon Sep 17 00:00:00 2001
From: Yiliu Dong <91178480+qianlihuang@users.noreply.github.com>
Date: Fri, 27 Feb 2026 01:31:28 +0800
Subject: [PATCH 035/154] [Core] Support `min_tokens` with speculative decoding
 (#32642)

Signed-off-by: qianlihuang <yiliu.dong@qq.com>
Co-authored-by: qianlihuang <yiliu.dong@qq.com>
---
 tests/v1/e2e/test_async_scheduling.py         |  3 +-
 .../logits_processors/test_custom_offline.py  |  7 ++-
 vllm/sampling_params.py                       |  4 +-
 vllm/v1/sample/logits_processor/__init__.py   |  7 +--
 vllm/v1/sample/logits_processor/builtin.py    | 54 +++++++++++++++++++
 vllm/v1/sample/logits_processor/state.py      |  4 +-
 vllm/v1/sample/rejection_sampler.py           |  7 +++
 7 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index 393c8dbeecfe..042e953866cf 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -32,8 +32,7 @@
 default_params = dict(
     temperature=0.0,  # greedy
     max_tokens=30,
-    # spec decoding currently doesn't support min_tokens
-    # min_tokens=28,
+    min_tokens=28,
 )
 
 
diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index 59317e91864e..29ec72186b8d 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -276,9 +276,12 @@ def test_rejects_custom_logitsprocs(
         monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
 
         llm = LLM(**llm_kwargs)
-        # Require that no logitsprocs have been loaded
+        # Require that no custom logitsprocs have been loaded
+        # (built-in processors may exist: MinTokensLogitsProcessor,
+        # LogitBiasLogitsProcessor, MinPLogitsProcessor)
         worker = llm.llm_engine.model_executor.driver_worker.worker
-        assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
+        for proc in worker.model_runner.input_batch.logitsprocs.all:
+            assert not isinstance(proc, DummyLogitsProcessor)
         return
 
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 4e5885b65864..2f015339e67d 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -678,9 +678,9 @@ def _validate_spec_decode(
             return
 
         # Some sampling parameters are not yet compatible with spec decoding.
-        if self.min_tokens > 1 or self.min_p > _SAMPLING_EPS or self.logit_bias:
+        if self.min_p > _SAMPLING_EPS or self.logit_bias:
             raise ValueError(
-                "The min_tokens, min_p, and logit_bias sampling parameters "
+                "The min_p and logit_bias sampling parameters "
                 "are not yet supported with speculative decoding."
             )
 
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index f7b70645fd18..693f7b125df3 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -202,10 +202,11 @@ def build_logitsprocs(
         if custom_logitsprocs:
             raise ValueError(STR_SPEC_DEC_REJECTS_LOGITSPROCS)
         logger.warning(
-            "min_p, logit_bias, and min_tokens parameters won't currently work "
-            "with speculative decoding enabled."
+            "min_p and logit_bias parameters won't work with speculative decoding."
+        )
+        return LogitsProcessors(
+            [MinTokensLogitsProcessor(vllm_config, device, is_pin_memory)]
         )
-        return LogitsProcessors()
 
     custom_logitsprocs_classes = _load_custom_logitsprocs(custom_logitsprocs)
     return LogitsProcessors(
diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index 82743f72b031..11a52711d671 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -3,6 +3,7 @@
 from collections.abc import Callable, Sequence
 from typing import TYPE_CHECKING, TypeVar
 
+import numpy as np
 import torch
 
 from vllm import SamplingParams
@@ -236,6 +237,59 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
             logits.index_put_(self.logits_slice, self.neg_inf_tensor)
         return logits
 
+    def apply_with_spec_decode(
+        self,
+        logits: torch.Tensor,
+        num_draft_tokens: list[int],
+    ) -> torch.Tensor:
+        """Spec-decode version of apply().
+        Priority: ``min_tokens`` > ``stop_token_ids`` / EOS.
+        Example: ``num_draft_tokens = [2, 3, 1]``
+          → ``logits`` shape ``[6, V]``, ``cumsum = [0, 2, 5, 6]``
+          → request 0 owns rows 0‑1, request 1 rows 2‑4, request 2 row 5.
+        """
+        if not self.min_toks:
+            return logits
+
+        num_draft_arr = np.array(num_draft_tokens, dtype=np.int64)
+        cumsum = np.concatenate([[0], np.cumsum(num_draft_arr)])
+
+        entries = [
+            (req_idx, min_tok, len(out_tok_ids), list(stop_tok_ids))
+            for req_idx, (min_tok, out_tok_ids, stop_tok_ids) in self.min_toks.items()
+            if stop_tok_ids
+        ]
+
+        if not entries:
+            return logits
+
+        all_rows: list[np.ndarray] = []  # row indices to mask
+        all_toks: list[np.ndarray] = []  # stop-token ids at those rows
+
+        for req_idx, min_tok, current_len, stop_toks in entries:
+            remaining = min_tok - current_len
+            # How many leading draft positions still need stop-token masking.
+            n_mask = int(min(max(remaining, 0), num_draft_arr[req_idx]))
+
+            if n_mask > 0:
+                offset = cumsum[req_idx]
+                row_indices = np.arange(offset, offset + n_mask, dtype=np.int64)
+                n_stop = len(stop_toks)
+                all_rows.append(np.repeat(row_indices, n_stop))
+                all_toks.append(np.tile(stop_toks, n_mask))
+
+        if all_rows:
+            rows_arr = np.concatenate(all_rows)
+            toks_arr = np.concatenate(all_toks)
+            # (row_indices, token_indices) for index_put_ to set -inf.
+            logits_slice = (
+                torch.from_numpy(rows_arr).to(self.device, non_blocking=True),
+                torch.from_numpy(toks_arr).to(self.device, non_blocking=True),
+            )
+            logits.index_put_(logits_slice, self.neg_inf_tensor)
+
+        return logits
+
 
 def process_dict_updates(
     req_entries: dict[int, T],
diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py
index c15219da5cf7..41cbba8dffb3 100644
--- a/vllm/v1/sample/logits_processor/state.py
+++ b/vllm/v1/sample/logits_processor/state.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterator
+from collections.abc import Iterable, Iterator
 from itertools import chain
 from typing import TYPE_CHECKING
 
@@ -148,7 +148,7 @@ def get_and_reset(self, batch_size: int) -> BatchUpdate | None:
 class LogitsProcessors:
     """Encapsulates initialized logitsproc objects."""
 
-    def __init__(self, logitsprocs: Iterator["LogitsProcessor"] | None = None) -> None:
+    def __init__(self, logitsprocs: Iterable["LogitsProcessor"] | None = None) -> None:
         self.argmax_invariant: list[LogitsProcessor] = []
         self.non_argmax_invariant: list[LogitsProcessor] = []
         if logitsprocs:
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 1efceba385a5..278d421eb910 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -10,6 +10,7 @@
 from vllm.logger import init_logger
 from vllm.triton_utils import tl, triton
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors, SamplerOutput
+from vllm.v1.sample.logits_processor.builtin import MinTokensLogitsProcessor
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words_with_drafts
 from vllm.v1.sample.ops.penalties import apply_all_penalties
@@ -292,6 +293,12 @@ def apply_logits_processors(
                 logits, bad_words_token_ids, output_token_ids, metadata.num_draft_tokens
             )
 
+        for processor in sampling_metadata.logitsprocs.non_argmax_invariant:
+            if isinstance(processor, MinTokensLogitsProcessor):
+                logits = processor.apply_with_spec_decode(
+                    logits, metadata.num_draft_tokens
+                )
+
         return logits
 
     @staticmethod

From 05970c772c1ca32be058d4cccfbb12aaf2032d70 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 26 Feb 2026 12:53:46 -0500
Subject: [PATCH 036/154] [Refactor] Remove dead code for attention benchmark
 script (#35418)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 benchmarks/attention_benchmarks/__init__.py |  2 -
 benchmarks/attention_benchmarks/common.py   | 93 ---------------------
 2 files changed, 95 deletions(-)

diff --git a/benchmarks/attention_benchmarks/__init__.py b/benchmarks/attention_benchmarks/__init__.py
index df7a6328569d..2d21288700a5 100644
--- a/benchmarks/attention_benchmarks/__init__.py
+++ b/benchmarks/attention_benchmarks/__init__.py
@@ -15,7 +15,6 @@
     BenchmarkConfig,
     BenchmarkResult,
     MockLayer,
-    MockModelConfig,
     ResultsFormatter,
     get_attention_scale,
     is_mla_backend,
@@ -36,7 +35,6 @@
     "ResultsFormatter",
     # Mock objects
     "MockLayer",
-    "MockModelConfig",
     # Utilities
     "setup_mla_dims",
     "get_attention_scale",
diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py
index 1de8bb0a55b7..6bba93e50238 100644
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -10,7 +10,6 @@
 from pathlib import Path
 from typing import Any
 
-import numpy as np
 import torch
 from batch_spec import get_batch_type, parse_batch_spec
 from rich.console import Console
@@ -62,10 +61,7 @@ def get_text_config(self):
 # Import AttentionLayerBase at module level to avoid circular dependencies
 try:
     from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-
-    _HAS_ATTENTION_LAYER_BASE = True
 except ImportError:
-    _HAS_ATTENTION_LAYER_BASE = False
     AttentionLayerBase = object  # Fallback
 
 
@@ -167,95 +163,6 @@ def get_kv_cache_spec(self):
         return self._kv_cache_spec
 
 
-class MockModelConfig:
-    """Mock model configuration."""
-
-    def __init__(
-        self,
-        num_q_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        dtype: torch.dtype = torch.float16,
-        max_model_len: int = 32768,
-    ):
-        self._n_q = num_q_heads
-        self._n_kv = num_kv_heads
-        self._d = head_dim
-        self.dtype = dtype
-        self.max_model_len = max_model_len
-
-    def get_num_attention_heads(self, _=None) -> int:
-        return self._n_q
-
-    def get_num_kv_heads(self, _=None) -> int:
-        return self._n_kv
-
-    def get_head_size(self) -> int:
-        return self._d
-
-    def get_num_layers(self) -> int:
-        """Mock method for layer count queries."""
-        return 1
-
-    def get_sliding_window_for_layer(self, _layer_idx: int):
-        """Mock method for sliding window queries."""
-        return None
-
-    def get_logits_soft_cap_for_layer(self, _layer_idx: int):
-        """Mock method for logits soft cap queries."""
-        return None
-
-    def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
-        """Mock method for SM scale queries."""
-        return 1.0 / (self.get_head_size() ** 0.5)
-
-
-class MockParallelConfig:
-    """Mock parallel configuration."""
-
-    pass
-
-
-class MockCompilationConfig:
-    """Mock compilation configuration."""
-
-    def __init__(self):
-        self.full_cuda_graph = False
-        self.static_forward_context = {}
-
-
-class MockVLLMConfig:
-    """Mock VLLM configuration."""
-
-    def __init__(self):
-        self.compilation_config = MockCompilationConfig()
-
-
-class MockRunner:
-    """Mock GPU runner for metadata builders."""
-
-    def __init__(
-        self,
-        seq_lens: np.ndarray,
-        query_start_locs: np.ndarray,
-        device: torch.device,
-        num_q_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        dtype: torch.dtype,
-    ):
-        self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
-        self.parallel_config = MockParallelConfig()
-        self.vllm_config = MockVLLMConfig()
-        self.seq_lens_np = seq_lens
-        self.query_start_loc_np = query_start_locs
-        self.device = device
-        self.attention_chunk_size = None
-        self.num_query_heads = num_q_heads
-        self.num_kv_heads = num_kv_heads
-        self.dtype = dtype
-
-
 @dataclass
 class ParameterSweep:
     """Configuration for sweeping a backend parameter."""

From a1f53addb132f75704710184f4c1cc4780343329 Mon Sep 17 00:00:00 2001
From: Runkai Tao <129432511+RunkaiTao@users.noreply.github.com>
Date: Thu, 26 Feb 2026 13:03:10 -0500
Subject: [PATCH 037/154] [BugFix] Align fused MoE-LoRA kernel config with
 actual weight shapes  (#34396)

Signed-off-by: Runkai Tao <rt572@physics.rutgers.edu>
---
 vllm/lora/layers/fused_moe.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index e08dcc87e90e..c13ed44e6f70 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -83,7 +83,11 @@ def _get_lora_moe_configs(
     ):
         if envs.VLLM_TUNED_CONFIG_FOLDER:
             hidden_size = layer.hidden_size
-            intermediate_size = layer.intermediate_size_per_partition
+            intermediate_size = (
+                self.w2_lora_a_stacked[0].shape[-1]
+                if op_prefix == "w2"
+                else self.w13_lora_b_stacked[0].shape[-2]
+            )
             shrink_config = get_lora_op_configs(
                 op_type=f"fused_moe_lora_{op_prefix}_shrink",
                 max_loras=num_loras,

From 5e58bdc7113a2c62a9bfb71304d0d1563b0da7f3 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Thu, 26 Feb 2026 13:44:50 -0500
Subject: [PATCH 038/154] [Bugfix] Remove erroneous lower bound on LoRA vocab
 size constraint (#35354)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 tests/lora/test_layers.py            | 4 ++--
 vllm/lora/layers/logits_processor.py | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index c9c55114360d..d3c1f3debb34 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -469,7 +469,7 @@ def _pretest():
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("vocab_size", [512, 32000, 258049, 300000])
+@pytest.mark.parametrize("vocab_size", [258049, 300000])
 @pytest.mark.parametrize("device", DEVICES)
 def test_lm_head_logits_processor_invalid_vocab_size(
     default_vllm_config, dist_init, vocab_size, device
@@ -489,7 +489,7 @@ def test_lm_head_logits_processor_invalid_vocab_size(
         logits_processor, 1024, torch.float16, device, None
     )
 
-    with pytest.raises(ValueError, match="vocab size must be > 32000 and <= 258048"):
+    with pytest.raises(ValueError, match="vocab size must be <= 258048"):
         lora_logits_processor.create_lora_weights(max_loras, lora_config)
 
 
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
index 217c46fbec4f..237a61eace1e 100644
--- a/vllm/lora/layers/logits_processor.py
+++ b/vllm/lora/layers/logits_processor.py
@@ -88,10 +88,8 @@ def create_lora_weights(
         model_config: PretrainedConfig | None = None,
     ) -> None:
         # TODO: Verify if this condition can be further relaxed
-        if self.base_layer.vocab_size <= 32000 or self.base_layer.vocab_size > 258048:
-            raise ValueError(
-                "When using LoRA, vocab size must be > 32000 and <= 258048"
-            )
+        if self.base_layer.vocab_size > 258048:
+            raise ValueError("When using LoRA, vocab size must be <= 258048")
         self.lora_a_stacked = torch.zeros(
             (
                 max_loras,

From b6d5a17298548e77cf5af456e029e5beb26b253c Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Thu, 26 Feb 2026 11:00:19 -0800
Subject: [PATCH 039/154] [Model Runner V2] Fix error-handling (#35063)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
---
 vllm/v1/worker/gpu/model_runner.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 9e0cae6feef1..26eb3ecf72af 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -227,6 +227,9 @@ def __init__(
         # KV Connector if configured.
         self.kv_connector: KVConnector = NO_OP_KV_CONNECTOR
 
+        # For transferring state from execute_model to subsequent sample_tokens call.
+        self.execute_model_state: tuple | None = None
+
     def update_max_model_len(self, max_model_len: int) -> None:
         self.max_model_len = max_model_len
         self.req_states.max_model_len = max_model_len
@@ -388,6 +391,7 @@ def _dummy_run(
 
         assert self.execute_model_state is not None
         hidden_states, _, input_batch, _ = self.execute_model_state
+        self.execute_model_state = None
         assert hidden_states is not None  # Last PP rank always has hidden_states
         sample_hidden_states = hidden_states[input_batch.logits_indices]
         return hidden_states, sample_hidden_states
@@ -1036,18 +1040,20 @@ def execute_model(
             aux_hidden_states,
             input_batch,
             kv_connector_output,
-        )  # type: ignore
+        )
         return None
 
     @torch.inference_mode()
     def sample_tokens(
         self, grammar_output: GrammarOutput | None
     ) -> AsyncOutput | ModelRunnerOutput | None:
-        assert self.execute_model_state is not None
+        if self.execute_model_state is None:
+            # The prior execute_model call must have failed.
+            return None
         hidden_states, aux_hidden_states, input_batch, kv_connector_output = (
             self.execute_model_state
         )
-        self.execute_model_state = None  # type: ignore
+        self.execute_model_state = None
 
         if not self.is_last_pp_rank:
             # Non-last PP rank: hidden_states is None because this rank produced

From c66aa48e993b74c46f83261654827b7349b2208c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 26 Feb 2026 11:20:35 -0800
Subject: [PATCH 040/154] [Model Runner V2] Add model states [1/N]  (#35350)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/cudagraph_utils.py | 59 ++++++++-----------
 vllm/v1/worker/gpu/input_batch.py     |  3 -
 vllm/v1/worker/gpu/model_runner.py    | 84 +++++++--------------------
 vllm/v1/worker/gpu/model_states.py    | 74 +++++++++++++++++++++++
 4 files changed, 119 insertions(+), 101 deletions(-)
 create mode 100644 vllm/v1/worker/gpu/model_states.py

diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
index d70a4c7ab18d..95369005d967 100644
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -22,6 +22,7 @@
 from vllm.v1.worker.gpu.block_table import BlockTables
 from vllm.v1.worker.gpu.dp_utils import make_num_tokens_across_dp
 from vllm.v1.worker.gpu.input_batch import InputBuffers
+from vllm.v1.worker.gpu.model_states import ModelState
 from vllm.v1.worker.utils import AttentionGroup
 
 
@@ -29,13 +30,11 @@ class CudaGraphManager:
     def __init__(
         self,
         vllm_config: VllmConfig,
-        uses_mrope: bool,
         use_aux_hidden_state_outputs: bool,
         device: torch.device,
     ):
         self.vllm_config = vllm_config
         self.scheduler_config = vllm_config.scheduler_config
-        self.uses_mrope = uses_mrope
         self.use_aux_hidden_state_outputs = use_aux_hidden_state_outputs
         self.device = device
 
@@ -88,8 +87,8 @@ def capture_graph(
         num_tokens: int,
         capture_cg_mode: CUDAGraphMode,
         model: nn.Module,
+        model_state: ModelState,
         input_buffers: InputBuffers,
-        mrope_positions: torch.Tensor | None,
         inputs_embeds: torch.Tensor | None,
         block_tables: BlockTables,
         attn_groups: list[list[AttentionGroup]],
@@ -113,13 +112,18 @@ def capture_graph(
             )
         else:
             num_reqs = min(num_tokens, self.max_num_reqs)
-        input_ids = input_buffers.input_ids[:num_tokens]
-        positions = input_buffers.positions[:num_tokens]
-        if self.uses_mrope:
-            assert mrope_positions is not None
-            positions = mrope_positions[:, :num_tokens]
-        if inputs_embeds is not None:
-            inputs_embeds = inputs_embeds[:num_tokens]
+
+        model_inputs = {
+            "input_ids": input_buffers.input_ids[:num_tokens],
+            "positions": input_buffers.positions[:num_tokens],
+            "inputs_embeds": (
+                inputs_embeds[:num_tokens] if inputs_embeds is not None else None
+            ),
+            # NOTE: Values returned by `prepare_dummy_inputs` will override the
+            # default values above.
+            **model_state.prepare_dummy_inputs(num_reqs, num_tokens),
+        }
+
         attn_metadata, slot_mappings = prepare_inputs_to_capture(
             num_reqs,
             num_tokens,
@@ -143,11 +147,7 @@ def capture_graph(
             num_tokens_across_dp=num_tokens_across_dp,
             slot_mapping=slot_mappings,
         ):
-            model_output = model(
-                input_ids=input_ids,
-                positions=positions,
-                inputs_embeds=inputs_embeds,
-            )
+            model_output = model(**model_inputs)
             if self.use_aux_hidden_state_outputs:
                 hidden_states, aux_hidden_states = model_output
             else:
@@ -164,9 +164,7 @@ def capture_graph(
             num_tokens=num_tokens,
             num_reqs=num_reqs,
             model=model,
-            input_ids=input_ids,
-            positions=positions,
-            inputs_embeds=inputs_embeds,
+            model_inputs=model_inputs,
             num_tokens_across_dp=num_tokens_across_dp,
             attn_metadata=attn_metadata,
             slot_mappings=slot_mappings,
@@ -178,9 +176,7 @@ def _capture_full_graph(
         num_tokens: int,
         num_reqs: int,
         model: nn.Module,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        inputs_embeds: torch.Tensor | None,
+        model_inputs: dict[str, torch.Tensor | None],
         num_tokens_across_dp: torch.Tensor,
         attn_metadata: dict[str, Any] | None,
         slot_mappings: dict[str, torch.Tensor] | None,
@@ -206,11 +202,8 @@ def _capture_full_graph(
             ),
             torch.cuda.graph(graph, self.pool),
         ):
-            model_output = model(
-                input_ids=input_ids,
-                positions=positions,
-                inputs_embeds=inputs_embeds,
-            )
+            model_output = model(**model_inputs)
+
             # Join offloader's copy stream after forward to avoid unjoined
             # stream error. The last layer's start_prefetch forks copy_stream,
             # but wait_prefetch only happens in the next forward pass.
@@ -235,9 +228,7 @@ def _capture_piecewise_graph(
         num_tokens: int,
         num_reqs: int,
         model: nn.Module,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        inputs_embeds: torch.Tensor | None,
+        model_inputs: dict[str, torch.Tensor | None],
         num_tokens_across_dp: torch.Tensor,
         attn_metadata: dict[str, Any] | None,
         slot_mappings: dict[str, torch.Tensor] | None,
@@ -256,18 +247,14 @@ def _capture_piecewise_graph(
             batch_descriptor=batch_descriptor,
             slot_mapping=slot_mappings,
         ):
-            model(
-                input_ids=input_ids,
-                positions=positions,
-                inputs_embeds=inputs_embeds,
-            )
+            model(**model_inputs)
 
     @torch.inference_mode()
     def capture(
         self,
         model: nn.Module,
+        model_state: ModelState,
         input_buffers: InputBuffers,
-        mrope_positions: torch.Tensor | None,
         inputs_embeds: torch.Tensor | None,
         block_tables: BlockTables,
         attn_groups: list[list[AttentionGroup]],
@@ -278,8 +265,8 @@ def capture(
             device=self.device,
             capture_fn=self.capture_graph,
             model=model,
+            model_state=model_state,
             input_buffers=input_buffers,
-            mrope_positions=mrope_positions,
             inputs_embeds=inputs_embeds,
             block_tables=block_tables,
             attn_groups=attn_groups,
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index a15da926da4e..87b8bbf18b81 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -65,8 +65,6 @@ class InputBatch:
     input_ids: torch.Tensor
     # [num_tokens_after_padding]
     positions: torch.Tensor
-    # [3, num_tokens_after_padding]
-    mrope_positions: torch.Tensor | None
     # [num_tokens_after_padding, hidden_size]
     inputs_embeds: torch.Tensor | None
 
@@ -143,7 +141,6 @@ def make_dummy(
             seq_lens=seq_lens,
             input_ids=input_ids,
             positions=positions,
-            mrope_positions=None,
             inputs_embeds=None,
             attn_metadata=None,  # type: ignore
             slot_mappings=None,  # type: ignore
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 26eb3ecf72af..949f09f54bad 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -77,7 +77,7 @@
 )
 from vllm.v1.worker.gpu.lora_utils import LoraState
 from vllm.v1.worker.gpu.mm.encoder_runner import EncoderRunner
-from vllm.v1.worker.gpu.mm.mrope_utils import MRopeState
+from vllm.v1.worker.gpu.model_states import ModelState
 from vllm.v1.worker.gpu.pp_utils import pp_broadcast, pp_receive
 from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.prompt_logprob import PromptLogprobsWorker
@@ -140,14 +140,6 @@ def __init__(
                 dtype=self.dtype,
                 device=self.device,
             )
-        self.uses_mrope = self.model_config.uses_mrope
-        if self.uses_mrope:
-            self.mrope_states = MRopeState(
-                max_num_reqs=self.max_num_reqs,
-                max_num_tokens=self.max_num_tokens,
-                max_model_len=self.max_model_len,
-                device=self.device,
-            )
 
         self.use_async_scheduling = self.scheduler_config.async_scheduling
         self.output_copy_stream = torch.cuda.Stream(self.device)
@@ -212,7 +204,6 @@ def __init__(
         # CUDA graphs.
         self.cudagraph_manager = CudaGraphManager(
             self.vllm_config,
-            self.uses_mrope,
             self.use_aux_hidden_state_outputs,
             self.device,
         )
@@ -271,6 +262,9 @@ def load_model(self, *args, **kwargs) -> None:
         if self.speculator is not None:
             prepare_communication_buffer_for_model(self.speculator)
 
+        # Initialize the components that require the model.
+        self.model_state = ModelState(self.vllm_config, self.model, self.device)
+
     def get_model(self) -> nn.Module:
         return self.model
 
@@ -481,16 +475,13 @@ def capture_model(self) -> int:
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         with self.maybe_setup_dummy_loras(self.lora_config):
-            mrope_positions = None
-            if self.uses_mrope:
-                mrope_positions = self.mrope_states.mrope_positions
             inputs_embeds = None
             if self.supports_mm_inputs:
                 inputs_embeds = self.encoder_runner.inputs_embeds
             self.cudagraph_manager.capture(
                 model=self.model,
+                model_state=self.model_state,
                 input_buffers=self.input_buffers,
-                mrope_positions=mrope_positions,
                 inputs_embeds=inputs_embeds,
                 block_tables=self.block_tables,
                 attn_groups=self.attn_groups,
@@ -554,14 +545,7 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
             if self.supports_mm_inputs:
                 self.encoder_runner.add_request(req_id, new_req_data.mm_features)
 
-            # Pre-compute M-RoPE positions for prefill.
-            if self.uses_mrope:
-                self.mrope_states.init_prefill_mrope_positions(
-                    req_index,
-                    self.model,  # type: ignore
-                    new_req_data.prefill_token_ids,
-                    mm_features=new_req_data.mm_features,
-                )
+            self.model_state.add_request(req_index, new_req_data)
 
             self.block_tables.append_block_ids(
                 req_index, new_req_data.block_ids, overwrite=True
@@ -577,8 +561,7 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
         if scheduler_output.scheduled_new_reqs:
             self.req_states.apply_staged_writes()
             self.sampler.apply_staged_writes()
-            if self.uses_mrope:
-                self.mrope_states.apply_staged_writes()
+            self.model_state.apply_staged_writes()
 
     def update_requests(self, scheduler_output: SchedulerOutput) -> None:
         # Add new blocks for the existing requests.
@@ -692,15 +675,6 @@ def prepare_inputs(
             )
         dcp_local_seq_lens = self.input_buffers.dcp_local_seq_lens[:num_reqs]
 
-        # Prepare M-RoPE positions.
-        if self.uses_mrope:
-            self.mrope_states.prepare_mrope_positions(
-                idx_mapping,
-                query_start_loc,
-                self.req_states.prefill_len.gpu,
-                self.req_states.num_computed_tokens.gpu,
-            )
-
         # Some input token ids are directly read from the last sampled tokens
         # and draft tokens. Also, get the logits indices to sample tokens from.
         logits_indices = combine_sampled_and_draft_tokens(
@@ -744,10 +718,6 @@ def prepare_inputs(
 
         input_ids = self.input_buffers.input_ids[:num_tokens_after_padding]
         positions = self.input_buffers.positions[:num_tokens_after_padding]
-        mrope_positions = None
-        if self.uses_mrope:
-            mrope_positions = self.mrope_states.mrope_positions
-            mrope_positions = mrope_positions[:, :num_tokens_after_padding]
         return InputBatch(
             req_ids=req_ids,
             num_reqs=num_reqs,
@@ -764,7 +734,6 @@ def prepare_inputs(
             seq_lens=seq_lens,
             input_ids=input_ids,
             positions=positions,
-            mrope_positions=mrope_positions,
             inputs_embeds=None,
             attn_metadata=attn_metadata,
             slot_mappings=slot_mappings_by_layer,
@@ -959,14 +928,24 @@ def execute_model(
                 input_buffers=self.input_buffers,
                 device=self.device,
             )
-            if self.uses_mrope:
-                input_batch.mrope_positions = self.mrope_states.mrope_positions[
-                    :, :num_tokens_after_padding
-                ]
             if not skip_attn_for_dummy_run:
                 self.prepare_dummy_attn_metadata(input_batch)
             # FIXME(woosuk): Fix warmup for LoRA.
 
+        model_inputs = {
+            "input_ids": input_batch.input_ids,
+            "positions": input_batch.positions,
+            "inputs_embeds": input_batch.inputs_embeds,
+            # NOTE: Values returned by `prepare_inputs` will override the default
+            # values above.
+            **self.model_state.prepare_inputs(input_batch, self.req_states),
+        }
+        if not self.is_first_pp_rank:
+            # Update for non-first PP ranks.
+            model_inputs["input_ids"] = None
+            model_inputs["inputs_embeds"] = None
+            model_inputs["intermediate_tensors"] = intermediate_tensors
+
         # Run model.
         if cudagraph_runtime_mode == CUDAGraphMode.FULL:
             # Use explicit cudagraph replay for FULL mode.
@@ -983,20 +962,6 @@ def execute_model(
                 aux_hidden_states = None
         else:
             # For piecewise and eager mode, just call model().
-            positions = input_batch.positions
-            if self.uses_mrope:
-                assert input_batch.mrope_positions is not None
-                positions = input_batch.mrope_positions
-
-            if self.is_first_pp_rank:
-                input_ids = input_batch.input_ids
-                inputs_embeds = input_batch.inputs_embeds
-                assert intermediate_tensors is None
-            else:
-                input_ids = None
-                inputs_embeds = None
-                assert intermediate_tensors is not None
-
             batch_descriptor = BatchDescriptor(
                 num_tokens=input_batch.num_tokens_after_padding,
                 has_lora=self.lora_config is not None,
@@ -1012,12 +977,7 @@ def execute_model(
                 slot_mapping=input_batch.slot_mappings,
             ):
                 self.kv_connector.pre_forward(scheduler_output)
-                model_output = self.model(
-                    input_ids=input_ids,
-                    positions=positions,
-                    inputs_embeds=inputs_embeds,
-                    intermediate_tensors=intermediate_tensors,
-                )
+                model_output = self.model(**model_inputs)
                 if self.use_aux_hidden_state_outputs:
                     hidden_states, aux_hidden_states = model_output
                 else:
diff --git a/vllm/v1/worker/gpu/model_states.py b/vllm/v1/worker/gpu/model_states.py
new file mode 100644
index 000000000000..03574b2ad55a
--- /dev/null
+++ b/vllm/v1/worker/gpu/model_states.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+
+from vllm.config import VllmConfig
+from vllm.v1.core.sched.output import NewRequestData
+from vllm.v1.worker.gpu.input_batch import InputBatch
+from vllm.v1.worker.gpu.mm.mrope_utils import MRopeState
+from vllm.v1.worker.gpu.states import RequestState
+
+
+class ModelState:
+    def __init__(self, vllm_config: VllmConfig, model: nn.Module, device: torch.device):
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.model = model
+        self.device = device
+
+        self.max_model_len = self.model_config.max_model_len
+        self.max_num_reqs = self.scheduler_config.max_num_seqs
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
+
+        self.uses_mrope = self.model_config.uses_mrope
+        if self.uses_mrope:
+            self.mrope_state = MRopeState(
+                max_num_reqs=self.max_num_reqs,
+                max_num_tokens=self.max_num_tokens,
+                max_model_len=self.max_model_len,
+                device=self.device,
+            )
+
+    def add_request(self, req_index: int, new_req_data: NewRequestData) -> None:
+        if self.uses_mrope:
+            # Pre-compute M-RoPE positions for prefill.
+            assert new_req_data.prefill_token_ids is not None
+            self.mrope_state.init_prefill_mrope_positions(
+                req_index,
+                self.model,  # type: ignore
+                new_req_data.prefill_token_ids,
+                mm_features=new_req_data.mm_features,
+            )
+
+    def apply_staged_writes(self) -> None:
+        if self.uses_mrope:
+            self.mrope_state.apply_staged_writes()
+
+    def prepare_inputs(
+        self, input_batch: InputBatch, req_states: RequestState
+    ) -> dict[str, torch.Tensor | None]:
+        if not self.uses_mrope:
+            # Common case (1D positions).
+            return {}
+
+        # Prepare M-RoPE positions.
+        self.mrope_state.prepare_mrope_positions(
+            input_batch.idx_mapping,
+            input_batch.query_start_loc,
+            req_states.prefill_len.gpu,
+            req_states.num_computed_tokens.gpu,
+        )
+        mrope_positions = self.mrope_state.mrope_positions[
+            :, : input_batch.num_tokens_after_padding
+        ]
+        return {"positions": mrope_positions}
+
+    def prepare_dummy_inputs(
+        self, num_reqs: int, num_tokens: int
+    ) -> dict[str, torch.Tensor | None]:
+        if not self.uses_mrope:
+            return {}
+        mrope_positions = self.mrope_state.mrope_positions[:, :num_tokens]
+        return {"positions": mrope_positions}

From 3d66502e1bf48d4ca92ca0d54f7c9bba39a8556c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 26 Feb 2026 11:47:02 -0800
Subject: [PATCH 041/154] [Model Runner V2] Prepare attn metadata in ModelState
 [2/N] (#35383)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/input_batch.py             |  11 +-
 vllm/v1/worker/gpu/model_runner.py            | 154 ++++++++----------
 vllm/v1/worker/gpu/model_states.py            |  31 ++++
 .../gpu/spec_decode/eagle/speculator.py       |   6 +-
 4 files changed, 110 insertions(+), 92 deletions(-)

diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 87b8bbf18b81..75655258c187 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Any
 
 import numpy as np
 import torch
@@ -60,6 +59,8 @@ class InputBatch:
     query_start_loc_np: np.ndarray
     # [num_reqs]
     seq_lens: torch.Tensor
+    # [num_reqs]
+    dcp_local_seq_lens: torch.Tensor | None
 
     # [num_tokens_after_padding]
     input_ids: torch.Tensor
@@ -68,11 +69,6 @@ class InputBatch:
     # [num_tokens_after_padding, hidden_size]
     inputs_embeds: torch.Tensor | None
 
-    # layer_name -> Metadata
-    attn_metadata: dict[str, Any]
-    # layer_name -> slot_mapping
-    slot_mappings: dict[str, torch.Tensor]
-
     # [total_num_logits]
     logits_indices: torch.Tensor
     # [num_reqs + 1]
@@ -139,11 +135,10 @@ def make_dummy(
             query_start_loc=query_start_loc,
             query_start_loc_np=query_start_loc_np,
             seq_lens=seq_lens,
+            dcp_local_seq_lens=None,
             input_ids=input_ids,
             positions=positions,
             inputs_embeds=None,
-            attn_metadata=None,  # type: ignore
-            slot_mappings=None,  # type: ignore
             logits_indices=logits_indices,
             cu_num_logits=cu_num_logits,
             cu_num_logits_np=cu_num_logits_np,
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 949f09f54bad..7dcdaf1d2029 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -46,7 +46,6 @@
 from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
 from vllm.v1.worker.gpu.async_utils import AsyncOutput
 from vllm.v1.worker.gpu.attn_utils import (
-    build_attn_metadata,
     build_slot_mappings_by_layer,
     get_kv_cache_spec,
     init_attn_backend,
@@ -317,31 +316,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         )
         self.kv_connector = get_kv_connector(self.vllm_config, kv_caches_dict)
 
-    def prepare_dummy_attn_metadata(self, input_batch: InputBatch) -> None:
-        block_tables = self.block_tables.get_dummy_block_tables(input_batch.num_reqs)
-        slot_mappings = self.block_tables.get_dummy_slot_mappings(
-            input_batch.num_tokens
-        )
-        slot_mappings_by_layer = build_slot_mappings_by_layer(
-            slot_mappings, self.kv_cache_config
-        )
-        attn_metadata = build_attn_metadata(
-            attn_groups=self.attn_groups,
-            num_reqs=input_batch.num_reqs,
-            num_tokens=input_batch.num_tokens,
-            query_start_loc_gpu=input_batch.query_start_loc,
-            query_start_loc_cpu=torch.from_numpy(input_batch.query_start_loc_np),
-            max_query_len=input_batch.num_scheduled_tokens.max().item(),
-            seq_lens=input_batch.seq_lens,
-            max_seq_len=self.max_model_len,
-            block_tables=block_tables,
-            slot_mappings=slot_mappings,
-            kv_cache_config=self.kv_cache_config,
-            dcp_local_seq_lens=self.input_buffers.dcp_local_seq_lens,
-        )
-        input_batch.attn_metadata = attn_metadata
-        input_batch.slot_mappings = slot_mappings_by_layer
-
     @torch.inference_mode()
     def _dummy_run(
         self, num_tokens: int, *args, skip_attn: bool = True, **kwargs
@@ -384,7 +358,7 @@ def _dummy_run(
             return None, None
 
         assert self.execute_model_state is not None
-        hidden_states, _, input_batch, _ = self.execute_model_state
+        input_batch, _, _, _, hidden_states, _, _ = self.execute_model_state
         self.execute_model_state = None
         assert hidden_states is not None  # Last PP rank always has hidden_states
         sample_hidden_states = hidden_states[input_batch.logits_indices]
@@ -546,7 +520,6 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
                 self.encoder_runner.add_request(req_id, new_req_data.mm_features)
 
             self.model_state.add_request(req_index, new_req_data)
-
             self.block_tables.append_block_ids(
                 req_index, new_req_data.block_ids, overwrite=True
             )
@@ -624,9 +597,6 @@ def prepare_inputs(
                 idx_mapping, total_num_logits, cu_num_logits, max_expand_len
             )
 
-        # Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
-        block_tables = self.block_tables.gather_block_tables(idx_mapping)
-
         # Get query_start_loc.
         query_start_loc_np = np.empty(self.max_num_reqs + 1, dtype=np.int32)
         query_start_loc_np[0] = 0
@@ -635,11 +605,8 @@ def prepare_inputs(
         # Some attention backends like FA3 require query_start_loc to be non-decreasing.
         query_start_loc_np[num_reqs + 1 :] = num_tokens
         async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc)
-
         query_start_loc_np = query_start_loc_np[: num_reqs + 1]
-        query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
         query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
-        max_query_len = num_scheduled_tokens.max().item()
 
         # Get prefill tokens if any.
         if self.req_states.any_prefills(idx_mapping_np):
@@ -663,6 +630,7 @@ def prepare_inputs(
         )
         seq_lens = self.input_buffers.seq_lens[:num_reqs]
 
+        dcp_local_seq_lens = None
         if self.use_dcp:
             # Prepare dcp local seq_lens.
             prepare_dcp_local_seq_lens(
@@ -673,7 +641,7 @@ def prepare_inputs(
                 self.dcp_rank,
                 self.cp_interleave,
             )
-        dcp_local_seq_lens = self.input_buffers.dcp_local_seq_lens[:num_reqs]
+            dcp_local_seq_lens = self.input_buffers.dcp_local_seq_lens[:num_reqs]
 
         # Some input token ids are directly read from the last sampled tokens
         # and draft tokens. Also, get the logits indices to sample tokens from.
@@ -689,35 +657,6 @@ def prepare_inputs(
             total_num_logits,
         )
 
-        # Compute slot mappings: [num_kv_cache_groups, num_tokens]
-        slot_mappings = self.block_tables.compute_slot_mappings(
-            idx_mapping,
-            query_start_loc,
-            self.input_buffers.positions[:num_tokens],
-        )
-        # Layer name -> slot mapping.
-        slot_mappings_by_layer = build_slot_mappings_by_layer(
-            slot_mappings, self.kv_cache_config
-        )
-
-        # Layer name -> attention metadata.
-        attn_metadata = build_attn_metadata(
-            attn_groups=self.attn_groups,
-            num_reqs=num_reqs,
-            num_tokens=num_tokens,
-            query_start_loc_gpu=query_start_loc,
-            query_start_loc_cpu=query_start_loc_cpu,
-            max_query_len=max_query_len,
-            seq_lens=self.input_buffers.seq_lens,
-            max_seq_len=self.max_model_len,
-            block_tables=block_tables,
-            slot_mappings=slot_mappings,
-            kv_cache_config=self.kv_cache_config,
-            dcp_local_seq_lens=dcp_local_seq_lens,
-        )
-
-        input_ids = self.input_buffers.input_ids[:num_tokens_after_padding]
-        positions = self.input_buffers.positions[:num_tokens_after_padding]
         return InputBatch(
             req_ids=req_ids,
             num_reqs=num_reqs,
@@ -732,17 +671,38 @@ def prepare_inputs(
             query_start_loc=query_start_loc,
             query_start_loc_np=query_start_loc_np,
             seq_lens=seq_lens,
-            input_ids=input_ids,
-            positions=positions,
+            dcp_local_seq_lens=dcp_local_seq_lens,
+            input_ids=self.input_buffers.input_ids[:num_tokens_after_padding],
+            positions=self.input_buffers.positions[:num_tokens_after_padding],
             inputs_embeds=None,
-            attn_metadata=attn_metadata,
-            slot_mappings=slot_mappings_by_layer,
             logits_indices=logits_indices,
             cu_num_logits=cu_num_logits,
             cu_num_logits_np=cu_num_logits_np,
             has_structured_output_reqs=scheduler_output.has_structured_output_requests,
         )
 
+    def prepare_attn(
+        self, input_batch: InputBatch
+    ) -> tuple[tuple[torch.Tensor, ...], torch.Tensor]:
+        # Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
+        block_tables = self.block_tables.gather_block_tables(input_batch.idx_mapping)
+        # Compute slot mappings: [num_kv_cache_groups, num_tokens]
+        slot_mappings = self.block_tables.compute_slot_mappings(
+            input_batch.idx_mapping,
+            input_batch.query_start_loc,
+            input_batch.positions,
+        )
+        return block_tables, slot_mappings
+
+    def prepare_dummy_attn(
+        self, input_batch: InputBatch
+    ) -> tuple[tuple[torch.Tensor, ...], torch.Tensor]:
+        block_tables = self.block_tables.get_dummy_block_tables(input_batch.num_reqs)
+        slot_mappings = self.block_tables.get_dummy_slot_mappings(
+            input_batch.num_tokens
+        )
+        return block_tables, slot_mappings
+
     @torch.inference_mode()
     def get_mm_embeddings(
         self,
@@ -899,6 +859,8 @@ def execute_model(
             input_batch = self.prepare_inputs(
                 scheduler_output, num_tokens_after_padding
             )
+            block_tables, slot_mappings = self.prepare_attn(input_batch)
+
             if self.lora_config:
                 # Activate LoRA adapters.
                 lora_inputs = self.lora_state.make_lora_inputs(
@@ -929,9 +891,28 @@ def execute_model(
                 device=self.device,
             )
             if not skip_attn_for_dummy_run:
-                self.prepare_dummy_attn_metadata(input_batch)
+                block_tables, slot_mappings = self.prepare_dummy_attn(input_batch)
+            else:
+                block_tables = None
+                slot_mappings = None
             # FIXME(woosuk): Fix warmup for LoRA.
 
+        attn_metadata = None
+        slot_mappings_by_layer = None
+        if not (dummy_run and skip_attn_for_dummy_run):
+            assert slot_mappings is not None
+            slot_mappings_by_layer = build_slot_mappings_by_layer(
+                slot_mappings, self.kv_cache_config
+            )
+            assert block_tables is not None
+            attn_metadata = self.model_state.prepare_attn(
+                input_batch,
+                block_tables,
+                slot_mappings,
+                self.attn_groups,
+                self.kv_cache_config,
+            )
+
         model_inputs = {
             "input_ids": input_batch.input_ids,
             "positions": input_batch.positions,
@@ -968,13 +949,13 @@ def execute_model(
             )
 
             with set_forward_context(
-                input_batch.attn_metadata,
+                attn_metadata,
                 self.vllm_config,
                 num_tokens=input_batch.num_tokens_after_padding,
                 cudagraph_runtime_mode=cudagraph_runtime_mode,
                 num_tokens_across_dp=num_tokens_across_dp,
                 batch_descriptor=batch_descriptor,
-                slot_mapping=input_batch.slot_mappings,
+                slot_mapping=slot_mappings_by_layer,
             ):
                 self.kv_connector.pre_forward(scheduler_output)
                 model_output = self.model(**model_inputs)
@@ -985,22 +966,23 @@ def execute_model(
                     aux_hidden_states = None
 
         kv_connector_output = self.kv_connector.post_forward(scheduler_output)
+        self.execute_model_state = (
+            input_batch,
+            model_inputs,
+            attn_metadata,
+            slot_mappings_by_layer,
+            hidden_states,
+            aux_hidden_states,
+            kv_connector_output,
+        )
 
         if not self.is_last_pp_rank:
             # Non-last PP rank: return IntermediateTensors for sending.
             assert isinstance(hidden_states, IntermediateTensors)
             hidden_states.kv_connector_output = kv_connector_output
-            self.execute_model_state = (None, None, input_batch, kv_connector_output)
             return hidden_states
-
         # Last rank (or no PP): hidden_states is a tensor for sampling.
         assert isinstance(hidden_states, torch.Tensor)
-        self.execute_model_state = (
-            hidden_states,
-            aux_hidden_states,
-            input_batch,
-            kv_connector_output,
-        )
         return None
 
     @torch.inference_mode()
@@ -1010,9 +992,15 @@ def sample_tokens(
         if self.execute_model_state is None:
             # The prior execute_model call must have failed.
             return None
-        hidden_states, aux_hidden_states, input_batch, kv_connector_output = (
-            self.execute_model_state
-        )
+        (
+            input_batch,
+            model_inputs,
+            attn_metadata,
+            slot_mappings_by_layer,
+            hidden_states,
+            aux_hidden_states,
+            kv_connector_output,
+        ) = self.execute_model_state
         self.execute_model_state = None
 
         if not self.is_last_pp_rank:
@@ -1075,6 +1063,8 @@ def sample_tokens(
         if self.speculator is not None:
             draft_tokens = self.speculator.propose(
                 input_batch,
+                attn_metadata,
+                slot_mappings_by_layer,
                 hidden_states,
                 aux_hidden_states,
                 num_sampled,
diff --git a/vllm/v1/worker/gpu/model_states.py b/vllm/v1/worker/gpu/model_states.py
index 03574b2ad55a..838f177b39cd 100644
--- a/vllm/v1/worker/gpu/model_states.py
+++ b/vllm/v1/worker/gpu/model_states.py
@@ -1,13 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
 import torch
 import torch.nn as nn
 
 from vllm.config import VllmConfig
 from vllm.v1.core.sched.output import NewRequestData
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
 from vllm.v1.worker.gpu.input_batch import InputBatch
 from vllm.v1.worker.gpu.mm.mrope_utils import MRopeState
 from vllm.v1.worker.gpu.states import RequestState
+from vllm.v1.worker.utils import AttentionGroup
 
 
 class ModelState:
@@ -72,3 +77,29 @@ def prepare_dummy_inputs(
             return {}
         mrope_positions = self.mrope_state.mrope_positions[:, :num_tokens]
         return {"positions": mrope_positions}
+
+    def prepare_attn(
+        self,
+        input_batch: InputBatch,
+        block_tables: tuple[torch.Tensor, ...],
+        slot_mappings: torch.Tensor,
+        attn_groups: list[list[AttentionGroup]],
+        kv_cache_config: KVCacheConfig,
+    ) -> dict[str, Any]:
+        query_start_loc_cpu = torch.from_numpy(input_batch.query_start_loc_np)
+        max_query_len = input_batch.num_scheduled_tokens.max().item()
+        attn_metadata = build_attn_metadata(
+            attn_groups=attn_groups,
+            num_reqs=input_batch.num_reqs,
+            num_tokens=input_batch.num_tokens,
+            query_start_loc_gpu=input_batch.query_start_loc,
+            query_start_loc_cpu=query_start_loc_cpu,
+            max_query_len=max_query_len,
+            seq_lens=input_batch.seq_lens,
+            max_seq_len=self.max_model_len,
+            block_tables=block_tables,
+            slot_mappings=slot_mappings,
+            kv_cache_config=kv_cache_config,
+            dcp_local_seq_lens=input_batch.dcp_local_seq_lens,
+        )
+        return attn_metadata
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
index 6cd13cebf995..0c85bf65ee9c 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
@@ -182,6 +182,8 @@ def capture_model(self) -> None:
     def propose(
         self,
         input_batch: InputBatch,
+        attn_metadata: dict[str, Any],
+        slot_mappings: dict[str, torch.Tensor],
         # [num_tokens, hidden_size]
         last_hidden_states: torch.Tensor,
         # num_layers x [num_tokens, hidden_size]
@@ -229,8 +231,8 @@ def propose(
         # TODO(woosuk): Support CUDA graph for prefill.
         last_hidden_states, hidden_states = self.run_model(
             num_tokens,
-            input_batch.attn_metadata,
-            input_batch.slot_mappings,
+            attn_metadata,
+            slot_mappings,
             num_tokens_across_dp=None,  # FIXME
         )
         sample_hidden_states = last_hidden_states[last_token_indices]

From 967572dd5f8da947aa4344f0e75516b6ee0ede9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=8D=E5=81=9A=E4=BA=86=E7=9D=A1=E5=A4=A7=E8=A7=89?=
 <64798754+stakeswky@users.noreply.github.com>
Date: Fri, 27 Feb 2026 04:30:45 +0800
Subject: [PATCH 042/154] fix(reasoning): Qwen3ReasoningParser returns
 truncated output as reasoning (#35230)

Signed-off-by: stakeswky <stakeswky@users.noreply.github.com>
Co-authored-by: stakeswky <stakeswky@users.noreply.github.com>
---
 .../reasoning/test_qwen3_reasoning_parser.py  | 82 +++++++++++++++++--
 vllm/reasoning/qwen3_reasoning_parser.py      | 25 ++++--
 2 files changed, 97 insertions(+), 10 deletions(-)

diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index db2bc16ffbbf..411c7ba485a8 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -9,6 +9,7 @@
     run_reasoning_extraction,
     run_reasoning_extraction_streaming,
 )
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 
 parser_name = "qwen3"
@@ -58,12 +59,14 @@ def qwen3_tokenizer(request):
     "content": "This is the rest",
 }
 
-# --- No think tokens at all (thinking disabled) ---
+# --- No think tokens at all (thinking enabled, truncated) ---
 
+# With thinking enabled (default), no think tokens means the output was
+# truncated before </think> could be generated. All output is reasoning.
 WITHOUT_THINK = {
     "output": "This is the rest",
-    "reasoning": None,
-    "content": "This is the rest",
+    "reasoning": "This is the rest",
+    "content": None,
 }
 # In streaming, the parser cannot distinguish "thinking disabled" from
 # "reasoning in progress" when no think tokens have appeared yet.
@@ -87,10 +90,12 @@ def qwen3_tokenizer(request):
     "reasoning": "This is a reasoning\nsection",
     "content": "This is the rest\nThat",
 }
+# Truncated output: <think> present but no </think> (thinking enabled).
+# Everything is reasoning because the output was cut off mid-thought.
 ONLY_OPEN_TAG = {
     "output": "<think>This is a reasoning section",
-    "reasoning": None,
-    "content": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
 }
 
 ONLY_OPEN_TAG_STREAM = {
@@ -99,6 +104,20 @@ def qwen3_tokenizer(request):
     "content": None,
 }
 
+# Truncated output without <think> prefix (Qwen3.5 style where <think>
+# is in the prompt). No </think> means truncation — all is reasoning.
+TRUNCATED_NO_START_TOKEN = {
+    "output": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
+TRUNCATED_NO_START_TOKEN_STREAM = {
+    "output": "This is a reasoning section",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+}
+
 TEST_CASES = [
     pytest.param(
         False,
@@ -170,6 +189,16 @@ def qwen3_tokenizer(request):
         ONLY_OPEN_TAG_STREAM,
         id="only_open_tag_stream",
     ),
+    pytest.param(
+        False,
+        TRUNCATED_NO_START_TOKEN,
+        id="truncated_no_start_token",
+    ),
+    pytest.param(
+        True,
+        TRUNCATED_NO_START_TOKEN_STREAM,
+        id="truncated_no_start_token_stream",
+    ),
 ]
 
 
@@ -249,3 +278,46 @@ def test_reasoning_streaming_multi_token_deltas(
 
     assert reconstructor.reasoning == expected_reasoning
     assert (reconstructor.other_content or None) == expected_content
+
+
+# --- Tests for enable_thinking=False (thinking explicitly disabled) ---
+
+
+THINKING_DISABLED_CASES = [
+    pytest.param(
+        "This is plain content",
+        None,
+        "This is plain content",
+        id="thinking_disabled_plain_content",
+    ),
+    pytest.param(
+        "Some output without think tokens",
+        None,
+        "Some output without think tokens",
+        id="thinking_disabled_no_think_tokens",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "output, expected_reasoning, expected_content", THINKING_DISABLED_CASES
+)
+def test_reasoning_thinking_disabled(
+    output: str,
+    expected_reasoning: str | None,
+    expected_content: str | None,
+    qwen3_tokenizer,
+):
+    """When enable_thinking=False, output without </think> is all content."""
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        qwen3_tokenizer,
+        chat_template_kwargs={"enable_thinking": False},
+    )
+
+    reasoning, content = parser.extract_reasoning(
+        model_output=output,
+        request=ChatCompletionRequest(messages=[], model="test-model"),
+    )
+
+    assert reasoning == expected_reasoning
+    assert content == expected_content
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 0c09d40999ec..df7b22a91a38 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -11,6 +11,7 @@
     ResponsesRequest,
 )
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tokenizers import TokenizerLike
 
 
 class Qwen3ReasoningParser(BaseThinkingReasoningParser):
@@ -33,6 +34,14 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
     it is stripped before extraction (non-streaming) or skipped (streaming).
     """
 
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        # Qwen3 defaults to thinking enabled; only treat output as
+        # pure content when the user explicitly disables it.
+        self.thinking_enabled = chat_kwargs.get("enable_thinking", True)
+
     @property
     def start_token(self) -> str:
         """The token that starts reasoning content."""
@@ -54,8 +63,11 @@ def extract_reasoning(
         If <think> is present (e.g. from a different template), it is
         stripped before extraction.
 
-        When thinking is disabled (no </think> in output), returns
-        (None, model_output) to indicate all output is content.
+        When thinking is explicitly disabled and no </think> appears,
+        returns (None, model_output) — all output is content.
+        Otherwise (thinking enabled, default), a missing </think> means
+        the output was truncated and everything is reasoning:
+        returns (model_output, None).
 
         Returns:
             tuple[Optional[str], Optional[str]]: reasoning content and content
@@ -68,9 +80,12 @@ def extract_reasoning(
         )
 
         if self.end_token not in model_output:
-            # No end token means thinking is disabled or the model
-            # did not produce reasoning. Treat everything as content.
-            return None, model_output
+            if not self.thinking_enabled:
+                # Thinking explicitly disabled — treat everything as content.
+                return None, model_output
+            # Thinking enabled but no </think>: output was truncated.
+            # Everything generated so far is reasoning.
+            return model_output, None
 
         # Extract reasoning content from the model output.
         reasoning, _, content = model_output.partition(self.end_token)

From 98217b09f9ce22429ce35badfa1d50e1f4fe4137 Mon Sep 17 00:00:00 2001
From: ElizaWszola <ewszola@redhat.com>
Date: Thu, 26 Feb 2026 22:29:01 +0100
Subject: [PATCH 043/154] [Performance] Extract KV cache update op from
 flashinfer forward (#35422)

Signed-off-by: ElizaWszola <ewszola@redhat.com>
---
 vllm/v1/attention/backends/flashinfer.py | 62 ++++++++++++++----------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 26d372c11319..80297720d7b3 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -381,6 +381,8 @@ def get_required_kv_cache_layout(cls) -> KVCacheLayoutType | None:
             return "HND"
         return None
 
+    forward_includes_kv_cache_update: bool = False
+
 
 @dataclass
 class FIPrefill:
@@ -1330,32 +1332,15 @@ def forward(
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
-        if self.kv_sharing_target_layer_name is None:
-            # Reshape the input keys and values and store them in the cache.
-            # Skip this if sharing KV cache with an earlier attention layer.
-            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-            # not padded. However, we don't need to do key[:num_actual_tokens]
-            # and value[:num_actual_tokens] because the reshape_and_cache_flash
-            # op uses the slot_mapping's shape to determine the number of
-            # actual tokens.
-            torch.ops._C_cache_ops.reshape_and_cache_flash(
-                key,
-                value,
-                kv_cache[:, 0],
-                kv_cache[:, 1],
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
+        # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2
+        # to process the cache when the kv_cache_dtype is fp8
+        if self.kv_sharing_target_layer_name is None and self.kv_cache_dtype.startswith(
+            "fp8"
+        ):
+            torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
+                self.kv_cache_dtype
             )
-
-            # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2
-            # to process the cache when the kv_cache_dtype is fp8
-            if self.kv_cache_dtype.startswith("fp8"):
-                torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
-                    self.kv_cache_dtype
-                )
-                kv_cache = kv_cache.view(torch_dtype)
+            kv_cache = kv_cache.view(torch_dtype)
 
         # Inputs and outputs may be padded for CUDA graphs
         query = query[:num_actual_tokens]
@@ -1599,6 +1584,33 @@ def forward(
                 )
         return output_padded
 
+    def do_kv_cache_update(
+        self,
+        layer: torch.nn.Module,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> None:
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+            # not padded. However, we don't need to do key[:num_actual_tokens]
+            # and value[:num_actual_tokens] because the reshape_and_cache_flash
+            # op uses the slot_mapping's shape to determine the number of
+            # actual tokens.
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                kv_cache[:, 0],
+                kv_cache[:, 1],
+                slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
+
 
 def fast_plan_decode(
     self,  # decode wrapper

From 832a780f3aed332287203217d0d946b8b03299b4 Mon Sep 17 00:00:00 2001
From: danielafrimi <45691845+danielafrimi@users.noreply.github.com>
Date: Thu, 26 Feb 2026 23:55:19 +0200
Subject: [PATCH 044/154] Nemotron: use per-layer config in
 NemotronHMLPDecoderLayer for heterogeneous models (#35396)

Signed-off-by: dafrimi <dafrimi@nvidia.com>
---
 vllm/model_executor/models/nemotron_h.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index f180e4acd711..446b01fe393e 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -298,6 +298,11 @@ def __init__(
 
         hybrid_override_pattern = config.hybrid_override_pattern
         mlp_index = hybrid_override_pattern[: layer_idx + 1].count("-") - 1
+        # Get per-layer config for heterogeneous models if exist
+        get_layer_config = getattr(config, "get_nemotron_h_config_for_layer", None)
+        layer_config = get_layer_config(layer_idx) if get_layer_config else config
+        config = layer_config
+
         if isinstance(config.intermediate_size, list):
             if len(config.intermediate_size) == 1:
                 intermediate_size = config.intermediate_size[0]

From d0105b84f00fadc18d9e7859d3e76887b7f5772c Mon Sep 17 00:00:00 2001
From: sychen52 <41452870+sychen52@users.noreply.github.com>
Date: Thu, 26 Feb 2026 13:56:24 -0800
Subject: [PATCH 045/154] add mixed precision support for modelopt (#35047)

Signed-off-by: Shiyang Chen <shiychen@nvidia.com>
---
 vllm/config/model.py                          |   1 +
 .../layers/quantization/__init__.py           |   9 +-
 .../layers/quantization/modelopt.py           | 307 +++++++++++++-----
 .../model_loader/weight_utils.py              |  16 +-
 4 files changed, 250 insertions(+), 83 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 5fb81ee424e1..012b2b1c9eb4 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -883,6 +883,7 @@ def _verify_quantization(self) -> None:
                 "modelopt",
                 "modelopt_fp4",
                 "modelopt_mxfp8",
+                "modelopt_mixed",
                 "petit_nvfp4",
                 # Ensure heavy backends are probed last to avoid unnecessary
                 # imports during override detection (e.g., MXFP4 imports Triton)
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 09e67f562d0c..2fb54e7751a0 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -18,6 +18,7 @@
     "modelopt",
     "modelopt_fp4",
     "modelopt_mxfp8",
+    "modelopt_mixed",
     "gguf",
     "gptq_marlin",
     "awq_marlin",
@@ -120,7 +121,12 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .gptq import GPTQConfig
     from .gptq_marlin import GPTQMarlinConfig
     from .inc import INCConfig
-    from .modelopt import ModelOptFp8Config, ModelOptMxFp8Config, ModelOptNvFp4Config
+    from .modelopt import (
+        ModelOptFp8Config,
+        ModelOptMixedPrecisionConfig,
+        ModelOptMxFp8Config,
+        ModelOptNvFp4Config,
+    )
     from .moe_wna16 import MoeWNA16Config
     from .mxfp4 import Mxfp4Config
     from .petit import PetitNvFp4Config
@@ -135,6 +141,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "modelopt": ModelOptFp8Config,
         "modelopt_fp4": ModelOptNvFp4Config,
         "modelopt_mxfp8": ModelOptMxFp8Config,
+        "modelopt_mixed": ModelOptMixedPrecisionConfig,
         "gguf": GGUFConfig,
         "gptq_marlin": GPTQMarlinConfig,
         "awq_marlin": AWQMarlinConfig,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 4c059da4170c..c0cc35b28271 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -114,6 +114,8 @@
     "NVFP4",
     # MXFP8
     "MXFP8",
+    # MIXED_PRECISION,
+    "MIXED_PRECISION",
 ]
 KV_CACHE_QUANT_ALGOS = ["FP8"]
 
@@ -235,6 +237,26 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
 
             self.exclude_modules = hf_to_vllm_mapper.apply_list(new_exclude_modules)
 
+    @staticmethod
+    def _extract_modelopt_quant_algo(
+        hf_quant_cfg: dict[str, Any] | None,
+    ) -> str | None:
+        """Extract upper-cased quant_algo from a modelopt config.
+
+        Returns the quant_algo string (upper-cased), or None if the config
+        is not a modelopt config.
+        """
+        if hf_quant_cfg is None:
+            return None
+        if hf_quant_cfg.get("quant_method", "").lower() != "modelopt":
+            return None
+        if "quantization" in hf_quant_cfg:
+            quant_config = hf_quant_cfg["quantization"]
+            if isinstance(quant_config, dict):
+                return str(quant_config.get("quant_algo", "")).upper()
+            return None
+        return str(hf_quant_cfg.get("quant_algo", "")).upper()
+
     @staticmethod
     def get_config_filenames() -> list[str]:
         return ["hf_quant_config.json"]
@@ -272,10 +294,20 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase":
             # "exclude_modules" is the key in the legacy hf_quant_config.json
             exclude_modules = quant_config.get("exclude_modules", [])
         else:
-            # Compressed-tensors style format:
+            # Compressed-tensors style format (config.json quantization_config):
             # {"quant_algo": "...", "quant_method": "modelopt"}
             quant_method = config.get("quant_algo")
-            kv_cache_quant_method = config.get("kv_cache_quant_algo")
+
+            # "kv_cache_scheme" (a dict) instead of "kv_cache_quant_algo" (a string).
+            kv_cache_scheme = config.get("kv_cache_scheme")
+            if isinstance(kv_cache_scheme, dict) and (
+                kv_cache_scheme.get("type") == "float"
+                and kv_cache_scheme.get("num_bits") == 8
+            ):
+                kv_cache_quant_method = "FP8"
+            else:
+                kv_cache_quant_method = None
+
             # "ignore" is the key in config.json
             exclude_modules = config.get("ignore", [])
             group_size_raw = config.get("group_size")
@@ -379,32 +411,9 @@ def get_min_capability(cls) -> int:
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
     ) -> QuantizationMethods | None:
-        """Detect if this ModelOpt config should be used based on
-        quantization config."""
-
-        if hf_quant_cfg is None:
-            return None
-
-        # Use the community standard 'quant_method'
-        quant_method = hf_quant_cfg.get("quant_method", "").lower()
-
-        # Only proceed if the method is explicitly "modelopt"
-        if quant_method != "modelopt":
-            return None
-
-        # Look for ModelOpt-specific config structure
-        if "quantization" in hf_quant_cfg:
-            quant_config = hf_quant_cfg["quantization"]
-            if isinstance(quant_config, dict):
-                quant_algo = str(quant_config.get("quant_algo", ""))
-                if quant_algo.upper() == "FP8":
-                    return "modelopt"
-        else:
-            # Check for compressed-tensors style config with specific quant_algo
-            quant_algo = str(hf_quant_cfg.get("quant_algo", ""))
-            if quant_algo.upper() == "FP8":
-                return "modelopt"
-
+        algo = cls._extract_modelopt_quant_algo(hf_quant_cfg)
+        if algo is not None and algo == "FP8":
+            return "modelopt"
         return None
 
     @classmethod
@@ -1031,32 +1040,9 @@ def get_min_capability(cls) -> int:
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
     ) -> QuantizationMethods | None:
-        """Detect if this ModelOpt FP4 config should be used based on
-        quantization config."""
-        if hf_quant_cfg is None:
-            return None
-
-        # Use the community standard 'quant_method'
-        quant_method = hf_quant_cfg.get("quant_method", "").lower()
-
-        # Only proceed if the method is explicitly "modelopt"
-        if quant_method != "modelopt":
-            return None
-
-        # Look for ModelOpt-specific config structure
-        if "quantization" in hf_quant_cfg:
-            quant_config = hf_quant_cfg["quantization"]
-            if isinstance(quant_config, dict):
-                quant_algo = quant_config.get("quant_algo", "")
-                if "NVFP4" in quant_algo:
-                    return "modelopt_fp4"
-        else:
-            # Check for compressed-tensors style config with specific
-            # quant_algo field
-            quant_algo = hf_quant_cfg.get("quant_algo", "")
-            if isinstance(quant_algo, str) and "FP4" in quant_algo.upper():
-                return "modelopt_fp4"
-
+        algo = cls._extract_modelopt_quant_algo(hf_quant_cfg)
+        if algo is not None and ("NVFP4" in algo or "FP4" in algo):
+            return "modelopt_fp4"
         return None
 
     @classmethod
@@ -1619,31 +1605,9 @@ def get_quant_method(
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
     ) -> QuantizationMethods | None:
-        """Detect if this ModelOpt MXFP8 config should be used based on
-        quantization config."""
-        if hf_quant_cfg is None:
-            return None
-
-        # Use the community standard 'quant_method'
-        quant_method = hf_quant_cfg.get("quant_method", "").lower()
-
-        # Only proceed if the method is explicitly "modelopt"
-        if quant_method != "modelopt":
-            return None
-
-        # Look for ModelOpt-specific config structure
-        if "quantization" in hf_quant_cfg:
-            quant_config = hf_quant_cfg["quantization"]
-            if isinstance(quant_config, dict):
-                quant_algo = str(quant_config.get("quant_algo", "")).upper()
-                if "MXFP8" in quant_algo:
-                    return "modelopt_mxfp8"
-        else:
-            # Check for compressed-tensors style config with specific quant_algo
-            quant_algo = str(hf_quant_cfg.get("quant_algo", "")).upper()
-            if "MXFP8" in quant_algo:
-                return "modelopt_mxfp8"
-
+        algo = cls._extract_modelopt_quant_algo(hf_quant_cfg)
+        if algo is not None and "MXFP8" in algo:
+            return "modelopt_mxfp8"
         return None
 
     @classmethod
@@ -1841,3 +1805,188 @@ def apply(
 # Register the method classes for ModelOptMxFp8Config
 ModelOptMxFp8Config.LinearMethodCls = ModelOptMxFp8LinearMethod
 ModelOptMxFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+
+
+class ModelOptMixedPrecisionConfig(ModelOptQuantConfigBase):
+    """Config class for ModelOpt MIXED_PRECISION.
+
+    Supports checkpoints where different layers use different quantization
+    algorithms (e.g., FP8 for dense layers and NVFP4 for MoE experts).
+    The per-layer algorithm is specified in the ``quantized_layers`` dict
+    inside ``config.json``'s ``quantization_config`` (preferred) or the
+    legacy ``hf_quant_config.json``.
+    """
+
+    def __init__(
+        self,
+        kv_cache_quant_method: str | None,
+        exclude_modules: list[str],
+        quantized_layers: dict[str, dict[str, Any]],
+        fp8_config: ModelOptFp8Config,
+        nvfp4_config: ModelOptNvFp4Config,
+    ) -> None:
+        super().__init__(exclude_modules)
+        self.kv_cache_quant_method = kv_cache_quant_method
+        self.quantized_layers = quantized_layers
+        self.fp8_config = fp8_config
+        self.nvfp4_config = nvfp4_config
+
+    def get_name(self) -> QuantizationMethods:
+        return "modelopt_mixed"
+
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89
+
+    @classmethod
+    def override_quantization_method(
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
+        algo = cls._extract_modelopt_quant_algo(hf_quant_cfg)
+        if algo is not None and algo == "MIXED_PRECISION":
+            return "modelopt_mixed"
+        return None
+
+    @classmethod
+    def _from_config(
+        cls,
+        *,
+        quant_method: str,
+        kv_cache_quant_method: str | None,
+        exclude_modules: list[str],
+        original_config: dict[str, Any],
+        group_size: int | None,
+        **kwargs: Any,
+    ) -> "ModelOptMixedPrecisionConfig":
+        if "quantization" in original_config:
+            quantized_layers = original_config["quantization"].get(
+                "quantized_layers", {}
+            )
+        else:
+            quantized_layers = original_config.get("quantized_layers", {})
+
+        if not quantized_layers:
+            raise ValueError(
+                "MIXED_PRECISION quant_algo requires a non-empty "
+                "'quantized_layers' mapping in the quantization config."
+            )
+
+        # Determine group_size from the first NVFP4 entry if not provided.
+        if group_size is None:
+            for layer_info in quantized_layers.values():
+                if layer_info.get("quant_algo", "").upper() == "NVFP4":
+                    group_size = layer_info.get("group_size", 16)
+                    break
+        if group_size is None:
+            group_size = 16
+
+        fp8_config = ModelOptFp8Config(
+            quant_method="FP8",
+            is_checkpoint_fp8_serialized=True,
+            kv_cache_quant_method=kv_cache_quant_method,
+            exclude_modules=[],
+        )
+        nvfp4_config = ModelOptNvFp4Config(
+            is_checkpoint_nvfp4_serialized=True,
+            kv_cache_quant_algo=kv_cache_quant_method,
+            exclude_modules=[],
+            group_size=group_size,
+        )
+
+        return cls(
+            kv_cache_quant_method=kv_cache_quant_method,
+            exclude_modules=exclude_modules,
+            quantized_layers=quantized_layers,
+            fp8_config=fp8_config,
+            nvfp4_config=nvfp4_config,
+        )
+
+    def _resolve_quant_algo(self, prefix: str) -> str | None:
+        """Look up the quant_algo for a vLLM-side layer prefix.
+
+        Tries three strategies in order:
+        1. Direct lookup in ``quantized_layers``.
+        2. Packed/fused-layer lookup (unfuse via ``packed_modules_mapping``).
+        3. Prefix-based lookup for FusedMoE (any child key starts with
+           ``prefix + "."``).
+
+        Returns the upper-cased quant_algo string, or *None* if the prefix
+        is not found.
+        """
+        # 1. Direct lookup
+        if prefix in self.quantized_layers:
+            return self.quantized_layers[prefix]["quant_algo"].upper()
+
+        # 2. Packed / fused layer lookup
+        proj_name = prefix.rsplit(".", 1)[-1]
+        if self.packed_modules_mapping and proj_name in self.packed_modules_mapping:
+            algos: set[str] = set()
+            base = prefix.rsplit(".", 1)[0]
+            for shard_name in self.packed_modules_mapping[proj_name]:
+                shard_prefix = f"{base}.{shard_name}"
+                if shard_prefix in self.quantized_layers:
+                    algos.add(self.quantized_layers[shard_prefix]["quant_algo"].upper())
+            if len(algos) == 1:
+                return algos.pop()
+            if len(algos) > 1:
+                raise ValueError(
+                    f"Mixed quant_algo within fused layer {prefix}: "
+                    f"{algos}. All shards must use the same quantization."
+                )
+
+        # 3. Prefix-based lookup (for FusedMoE / parent modules)
+        prefix_dot = prefix + "."
+        for key, info in self.quantized_layers.items():
+            if key.startswith(prefix_dot):
+                return info["quant_algo"].upper()
+
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> "QuantizeMethodBase | None":
+        """Return quantize-method based on layer."""
+        # KV-cache quantization
+        if isinstance(layer, Attention):
+            if self.kv_cache_quant_method:
+                return ModelOptFp8KVCacheMethod(self)
+            return None
+
+        # Excluded layers
+        if self.is_layer_excluded(prefix):
+            if isinstance(layer, LinearBase):
+                return UnquantizedLinearMethod()
+            return None
+
+        quant_algo = self._resolve_quant_algo(prefix)
+
+        if isinstance(layer, LinearBase):
+            if quant_algo == "FP8":
+                return ModelOptFp8LinearMethod(self.fp8_config)
+            if quant_algo == "NVFP4":
+                return ModelOptNvFp4LinearMethod(self.nvfp4_config)
+            # Layer not in quantized_layers — leave unquantized
+            return UnquantizedLinearMethod()
+
+        if isinstance(layer, FusedMoE):
+            if quant_algo == "FP8":
+                return ModelOptFp8MoEMethod(
+                    quant_config=self.fp8_config,
+                    moe_config=layer.moe_config,
+                )
+            if quant_algo == "NVFP4":
+                return ModelOptNvFp4FusedMoE(
+                    quant_config=self.nvfp4_config,
+                    moe_config=layer.moe_config,
+                )
+            return None
+
+        return None
+
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        super().apply_vllm_mapper(hf_to_vllm_mapper)
+        if self.quantized_layers:
+            self.quantized_layers = hf_to_vllm_mapper.apply_dict(self.quantized_layers)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 44dcd076e0bd..24b2f61b8671 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -287,7 +287,17 @@ def get_quant_config(
         )
 
     if hf_quant_config is not None:
-        return quant_cls.from_config(hf_quant_config)
+        # For modelopt_mixed, config.json's quantization_config may or may
+        # not contain the per-layer quantized_layers map.  Newer checkpoints
+        # embed it directly; older ones keep it only in hf_quant_config.json.
+        # If it is missing, fall through to the file-based loading path.
+        if (
+            model_config.quantization == "modelopt_mixed"
+            and "quantized_layers" not in hf_quant_config
+        ):
+            pass  # fall through to file-based loading below
+        else:
+            return quant_cls.from_config(hf_quant_config)
 
     # if hf_quant_config is None, we will try to get config from
     # hf_overrides
@@ -365,8 +375,8 @@ def get_quant_config(
 
         if model_config.quantization == "bitsandbytes":
             config["adapter_name_or_path"] = model_config.model
-        elif model_config.quantization == "modelopt":
-            if config["producer"]["name"] == "modelopt":
+        elif model_config.quantization in ("modelopt", "modelopt_mixed"):
+            if config.get("producer", {}).get("name") == "modelopt":
                 return quant_cls.from_config(config)
             else:
                 raise ValueError(

From 0f2f24c8b205b5bf2dadacf1f95f1ad9f7de73e0 Mon Sep 17 00:00:00 2001
From: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Date: Thu, 26 Feb 2026 14:08:16 -0800
Subject: [PATCH 046/154] [Bugfix] Fix MessageQueue connect_ip for cross-node
 data parallelism (#35429)

Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
---
 tests/distributed/test_mq_connect_ip.py | 79 +++++++++++++++++++++++++
 vllm/v1/executor/multiproc_executor.py  | 15 ++++-
 2 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 tests/distributed/test_mq_connect_ip.py

diff --git a/tests/distributed/test_mq_connect_ip.py b/tests/distributed/test_mq_connect_ip.py
new file mode 100644
index 000000000000..4b0cdda3ad9e
--- /dev/null
+++ b/tests/distributed/test_mq_connect_ip.py
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test that MessageQueue uses the local node's IP for binding,
+not a remote master_addr. This validates the fix for cross-node
+data-parallel where each DP group leader must bind to its own IP.
+
+The bug: multiproc_executor used `parallel_config.master_addr` as
+`connect_ip` for every DP group's MessageQueue. For DP groups whose
+leader is NOT on the master node, binding to master_addr fails with
+"Cannot assign requested address".
+
+The fix: use `get_ip()` (local node IP) instead of `master_addr`.
+"""
+
+import pytest
+import zmq
+
+from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+from vllm.utils.network_utils import get_ip
+
+
+def test_mq_bind_with_local_ip():
+    """MessageQueue with remote readers should successfully bind
+    when connect_ip is the local node's IP."""
+    # n_reader=2, n_local_reader=1 means 1 remote reader,
+    # which triggers the remote ZMQ socket bind.
+    mq = MessageQueue(
+        n_reader=2,
+        n_local_reader=1,
+        connect_ip=get_ip(),
+    )
+    handle = mq.export_handle()
+    assert handle.remote_subscribe_addr is not None
+    # The bound address should contain our local IP
+    local_ip = get_ip()
+    assert (
+        local_ip in handle.remote_subscribe_addr
+        or f"[{local_ip}]" in handle.remote_subscribe_addr
+    )
+    del mq
+
+
+def test_mq_bind_with_non_local_ip_fails():
+    """MessageQueue should fail to bind when connect_ip is a
+    non-local IP address (simulating the bug where master_addr
+    from a different node was used)."""
+    # Use a non-local IP that we definitely can't bind to.
+    # 198.51.100.1 is from TEST-NET-2 (RFC 5737), never locally assigned.
+    non_local_ip = "198.51.100.1"
+    with pytest.raises(zmq.error.ZMQError, match="Cannot assign requested address"):
+        MessageQueue(
+            n_reader=2,
+            n_local_reader=1,
+            connect_ip=non_local_ip,
+        )
+
+
+def test_mq_bind_defaults_to_local_ip():
+    """When connect_ip is None, MessageQueue should auto-detect
+    the local IP and bind successfully."""
+    mq = MessageQueue(
+        n_reader=2,
+        n_local_reader=1,
+        connect_ip=None,  # should fallback to get_ip()
+    )
+    handle = mq.export_handle()
+    assert handle.remote_subscribe_addr is not None
+    del mq
+
+
+if __name__ == "__main__":
+    test_mq_bind_with_local_ip()
+    print("PASSED: test_mq_bind_with_local_ip")
+    test_mq_bind_with_non_local_ip_fails()
+    print("PASSED: test_mq_bind_with_non_local_ip_fails")
+    test_mq_bind_defaults_to_local_ip()
+    print("PASSED: test_mq_bind_defaults_to_local_ip")
+    print("\nAll tests passed!")
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index b63cbd6586f2..9ea29df0058b 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -44,6 +44,7 @@
 from vllm.tracing import instrument, maybe_init_worker_tracer
 from vllm.utils.network_utils import (
     get_distributed_init_method,
+    get_ip,
     get_loopback_ip,
     get_open_port,
 )
@@ -128,11 +129,23 @@ def _init_executor(self) -> None:
             # For leader node within each dp rank,
             # each dp will have its own leader multiproc executor.
             max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
+            mq_connect_ip = get_ip()
+            logger.info(
+                "DP group leader: node_rank=%d, node_rank_within_dp=%d, "
+                "master_addr=%s, mq_connect_ip=%s (local), "
+                "world_size=%d, local_world_size=%d",
+                self.parallel_config.node_rank,
+                self.parallel_config.node_rank_within_dp,
+                self.parallel_config.master_addr,
+                mq_connect_ip,
+                self.world_size,
+                self.local_world_size,
+            )
             self.rpc_broadcast_mq = MessageQueue(
                 self.world_size,
                 self.local_world_size,
                 max_chunk_bytes=max_chunk_bytes,
-                connect_ip=self.parallel_config.master_addr,
+                connect_ip=mq_connect_ip,
             )
             scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
         # Create workers

From eb19955c37089056883831838dc155340ae67edd Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Thu, 26 Feb 2026 17:30:10 -0500
Subject: [PATCH 047/154] [WideEP] Remove pplx all2all backend (#33724)

Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
---
 .buildkite/test_areas/kernels.yaml            |   13 +-
 CMakeLists.txt                                |    2 +-
 csrc/ops.h                                    |   14 +-
 .../quantization/w8a8/cutlass/moe/moe_data.cu |   26 +-
 .../w8a8/cutlass/scaled_mm_entry.cu           |   43 +-
 csrc/torch_bindings.cpp                       |    8 +-
 docker/Dockerfile                             |    8 +-
 docker/versions.json                          |    3 -
 docs/design/fused_moe_modular_kernel.md       |    7 +-
 docs/design/moe_kernel_features.md            |    5 +-
 docs/governance/committers.md                 |    2 +-
 docs/serving/expert_parallel_deployment.md    |    9 +-
 .../elastic_ep/serve_deepseek_v2.sh           |    2 +-
 .../moe/modular_kernel_tools/common.py        |    7 -
 .../moe/modular_kernel_tools/mk_objects.py    |   14 -
 .../profile_modular_kernel.py                 |    2 +-
 .../moe/test_modular_kernel_combinations.py   |    8 +-
 tests/kernels/moe/test_pplx_cutlass_moe.py    |  365 ------
 tests/kernels/moe/test_pplx_moe.py            | 1021 -----------------
 tools/ep_kernels/README.md                    |    2 +-
 .../elastic_ep/install_eep_libraries.sh       |    7 -
 tools/ep_kernels/install_python_libraries.sh  |   18 -
 vllm/_custom_ops.py                           |    4 +-
 vllm/config/compilation.py                    |    2 +-
 vllm/config/parallel.py                       |    9 +-
 .../device_communicators/all2all.py           |   93 +-
 .../device_communicators/cuda_communicator.py |    6 +-
 vllm/distributed/eplb/eplb_state.py           |    2 +-
 .../layers/fused_moe/all2all_utils.py         |   53 +-
 .../model_executor/layers/fused_moe/config.py |   10 +-
 .../layers/fused_moe/cutlass_moe.py           |    2 +-
 .../layers/fused_moe/fused_batched_moe.py     |    6 +-
 .../layers/fused_moe/modular_kernel.py        |    6 +-
 .../layers/fused_moe/oracle/nvfp4.py          |    5 +-
 .../layers/fused_moe/pplx_prepare_finalize.py |  373 ------
 .../fused_moe/runner/default_moe_runner.py    |    3 +-
 .../fused_moe/topk_weight_and_reduce.py       |    9 +-
 .../layers/quantization/mxfp4.py              |    2 +-
 vllm/utils/import_utils.py                    |    5 -
 39 files changed, 107 insertions(+), 2069 deletions(-)
 delete mode 100644 tests/kernels/moe/test_pplx_cutlass_moe.py
 delete mode 100644 tests/kernels/moe/test_pplx_moe.py
 delete mode 100644 vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
index afc8fc49a2aa..c755c643651d 100644
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -155,5 +155,14 @@ steps:
   commands:
     - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
     - pytest -v -s kernels/moe/test_deepep_moe.py
-    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
-    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55127a514f1f..39714b8461e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
 
   # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
diff --git a/csrc/ops.h b/csrc/ops.h
index 5e2b475fa8c1..690342b37caa 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -269,13 +269,13 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     const int64_t n, const int64_t k, const bool swap_ab);
 
-void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
-                                  torch::Tensor& problem_sizes1,
-                                  torch::Tensor& problem_sizes2,
-                                  const torch::Tensor& expert_num_tokens,
-                                  const int64_t num_local_experts,
-                                  const int64_t padded_m, const int64_t n,
-                                  const int64_t k);
+void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
+                                     torch::Tensor& problem_sizes1,
+                                     torch::Tensor& problem_sizes2,
+                                     const torch::Tensor& expert_num_tokens,
+                                     const int64_t num_local_experts,
+                                     const int64_t padded_m, const int64_t n,
+                                     const int64_t k);
 
 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
index eae500cb6325..41cf170a2431 100644
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -263,12 +263,10 @@ void get_cutlass_moe_mm_data_caller(
 }
 
 template <bool SWAP_AB>
-__global__ void compute_pplx_data(int32_t* expert_offsets,
-                                  int32_t* problem_sizes1,
-                                  int32_t* problem_sizes2,
-                                  const int32_t* __restrict__ expert_num_tokens,
-                                  const int padded_m, const int n,
-                                  const int k) {
+__global__ void compute_batched_moe_data(
+    int32_t* expert_offsets, int32_t* problem_sizes1, int32_t* problem_sizes2,
+    const int32_t* __restrict__ expert_num_tokens, const int padded_m,
+    const int n, const int k) {
   int expert_idx = threadIdx.x;
   expert_offsets[expert_idx] = expert_idx * padded_m;
 
@@ -289,24 +287,22 @@ __global__ void compute_pplx_data(int32_t* expert_offsets,
   }
 }
 
-void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
-                                         torch::Tensor& problem_sizes1,
-                                         torch::Tensor& problem_sizes2,
-                                         const torch::Tensor& expert_num_tokens,
-                                         const int64_t num_local_experts,
-                                         const int64_t padded_m,
-                                         const int64_t n, const int64_t k) {
+void get_cutlass_batched_moe_mm_data_caller(
+    torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const torch::Tensor& expert_num_tokens,
+    const int64_t num_local_experts, const int64_t padded_m, const int64_t n,
+    const int64_t k) {
   auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
 
   if (num_local_experts * padded_m > SWAP_AB_THRESHOLD) {
-    compute_pplx_data<false><<<1, num_local_experts, 0, stream>>>(
+    compute_batched_moe_data<false><<<1, num_local_experts, 0, stream>>>(
         static_cast<int32_t*>(expert_offsets.data_ptr()),
         static_cast<int32_t*>(problem_sizes1.data_ptr()),
         static_cast<int32_t*>(problem_sizes2.data_ptr()),
         static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
         k);
   } else {
-    compute_pplx_data<true><<<1, num_local_experts, 0, stream>>>(
+    compute_batched_moe_data<true><<<1, num_local_experts, 0, stream>>>(
         static_cast<int32_t*>(expert_offsets.data_ptr()),
         static_cast<int32_t*>(problem_sizes1.data_ptr()),
         static_cast<int32_t*>(problem_sizes2.data_ptr()),
diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
index 82ccc19608cb..d6e82f1db9fa 100644
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -82,13 +82,11 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     const int64_t n, const int64_t k, const bool swap_ab);
 
-void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
-                                         torch::Tensor& problem_sizes1,
-                                         torch::Tensor& problem_sizes2,
-                                         const torch::Tensor& expert_num_tokens,
-                                         const int64_t num_local_experts,
-                                         const int64_t padded_m,
-                                         const int64_t n, const int64_t k);
+void get_cutlass_batched_moe_mm_data_caller(
+    torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const torch::Tensor& expert_num_tokens,
+    const int64_t num_local_experts, const int64_t padded_m, const int64_t n,
+    const int64_t k);
 #endif
 
 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -319,29 +317,30 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
       version_num, ". Required capability: 90, 100, or 120");
 }
 
-void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
-                                  torch::Tensor& problem_sizes1,
-                                  torch::Tensor& problem_sizes2,
-                                  const torch::Tensor& expert_num_tokens,
-                                  const int64_t num_local_experts,
-                                  const int64_t padded_m, const int64_t n,
-                                  const int64_t k) {
+void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
+                                     torch::Tensor& problem_sizes1,
+                                     torch::Tensor& problem_sizes2,
+                                     const torch::Tensor& expert_num_tokens,
+                                     const int64_t num_local_experts,
+                                     const int64_t padded_m, const int64_t n,
+                                     const int64_t k) {
   // This function currently gets compiled only if we have a valid cutlass moe
   // mm to run it for.
   int32_t version_num = get_sm_version_num();
 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
     (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
     (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
-  get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
-                                      problem_sizes2, expert_num_tokens,
-                                      num_local_experts, padded_m, n, k);
+  get_cutlass_batched_moe_mm_data_caller(expert_offsets, problem_sizes1,
+                                         problem_sizes2, expert_num_tokens,
+                                         num_local_experts, padded_m, n, k);
   return;
 #endif
-  TORCH_CHECK_NOT_IMPLEMENTED(
-      false,
-      "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
-      "for CUDA device capability: ",
-      version_num, ". Required capability: 90, 100, or 120");
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No compiled get_cutlass_batched_moe_mm_data: no "
+                              "cutlass_scaled_mm kernel "
+                              "for CUDA device capability: ",
+                              version_num,
+                              ". Required capability: 90, 100, or 120");
 }
 
 void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 39b6bc98a843..8be30b209106 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -489,19 +489,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
            &get_cutlass_moe_mm_problem_sizes_from_expert_offsets);
 
   // A function that computes data required to run fused MoE with w8a8 grouped
-  // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
+  // GEMM in batched expert format. It takes expert_num_tokens
   // as an input, and computes expert_offsets (token start indices of each
   // expert). In addition to this, it computes problem sizes for each expert's
   // multiplication used by the two mms called from fused MoE operation.
   ops.def(
-      "get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
+      "get_cutlass_batched_moe_mm_data(Tensor! expert_offsets, "
       "                             Tensor! problem_sizes1, "
       "                             Tensor! problem_sizes2, "
       "                             Tensor expert_num_tokens, "
       "                             int num_local_experts, int padded_m, "
       "                             int n, int k) -> ()");
-  ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
-           &get_cutlass_pplx_moe_mm_data);
+  ops.impl("get_cutlass_batched_moe_mm_data", torch::kCUDA,
+           &get_cutlass_batched_moe_mm_data);
 
   // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
   ops.def(
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 717f27b6b232..495a480b7582 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -308,7 +308,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 #################### CSRC BUILD IMAGE ####################
 
 #################### EXTENSIONS BUILD IMAGE ####################
-# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
+# Build DeepGEMM, DeepEP - runs in PARALLEL with csrc-build
 # This stage is independent and doesn't affect csrc cache
 FROM base AS extensions-build
 ARG CUDA_VERSION
@@ -335,10 +335,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
 RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
 
-# Build pplx-kernels and DeepEP wheels
+# Build DeepEP wheels
 COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
 # Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
-ARG PPLX_COMMIT_HASH=12cecfd
 ARG DEEPEP_COMMIT_HASH=73b6ea4
 ARG NVSHMEM_VER
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -347,7 +346,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     /tmp/install_python_libraries.sh \
         --workspace /tmp/ep_kernels_workspace \
         --mode wheel \
-        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
         ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \
         ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \
     find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
@@ -676,7 +674,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
-# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
+# Install EP kernels wheels (DeepEP) that have been built in the `build` stage
 RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system ep_kernels/dist/*.whl --verbose \
diff --git a/docker/versions.json b/docker/versions.json
index 24f4b6e7d1b7..fa090c10c443 100644
--- a/docker/versions.json
+++ b/docker/versions.json
@@ -52,9 +52,6 @@
     "DEEPGEMM_GIT_REF": {
       "default": "477618cd51baffca09c4b0b87e97c03fe827ef03"
     },
-    "PPLX_COMMIT_HASH": {
-      "default": "12cecfd"
-    },
     "DEEPEP_COMMIT_HASH": {
       "default": "73b6ea4"
     },
diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md
index 975df8ba29dc..9db356cdf531 100644
--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@@ -15,7 +15,7 @@ Based on the format of the input activations, FusedMoE implementations are broad
 The input activation format completely depends on the All2All Dispatch being used.
 
 * In the Contiguous variant, the All2All Dispatch returns the activations as a contiguous tensor of shape (M, K) along with TopK Ids and TopK weights of shape (M, num_topk). Look at `DeepEPHTPrepareAndFinalize` for an example.
-* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `PplxPrepareAndFinalize` or `DeepEPLLPrepareAndFinalize` for an example.
+* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `DeepEPLLPrepareAndFinalize` for an example.
 
 The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
 
@@ -132,7 +132,6 @@ class FusedMoEModularKernel:
 
 Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example,
 
-* PplxPrepareAndFinalize type is backed by Pplx All2All kernels,
 * DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and
 * DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels.
 
@@ -229,7 +228,7 @@ Doing this will add the new implementation to the test suite.
 ### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
 
 The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
-Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
+Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts`
 As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
 with incompatible types, the script will error.
 
@@ -238,7 +237,7 @@ with incompatible types, the script will error.
 Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
 The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
 `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
-Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
+Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts`
 
 ## FusedMoEPrepareAndFinalize Implementations
 
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 04ceeede3ebf..ac5acb66bdbf 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -33,7 +33,6 @@ th {
 | Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
 |---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
 | naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
-| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
 | deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
 | deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
 | flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferA2APrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize.FlashInferA2APrepareAndFinalize] |
@@ -68,7 +67,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
 
 There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
 
-Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`.
+Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `DeepEPLLPrepareAndFinalize`.
 
 Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.
 
@@ -110,5 +109,5 @@ The following table shows "families" of modular kernels that are intended to wor
 | backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
 |---------|-----------------------------------------|----------------------------------------------|
 | deepep_high_throughput | `DeepEPHTPrepareAndFinalize` |  `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
-| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` |  `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
+| deepep_low_latency | `DeepEPLLPrepareAndFinalize` |  `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
 | flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
diff --git a/docs/governance/committers.md b/docs/governance/committers.md
index 2f0780a08978..89aaadc2bbac 100644
--- a/docs/governance/committers.md
+++ b/docs/governance/committers.md
@@ -154,7 +154,7 @@ If you have PRs touching the area, please feel free to ping the area owner for r
 - FlashAttention: @LucasWilkinson
 - FlashInfer: @LucasWilkinson, @mgoin, @WoosukKwon
 - Blackwell Kernels: @mgoin, @yewentao256
-- DeepEP/DeepGEMM/pplx: @mgoin, @yewentao256
+- DeepEP/DeepGEMM: @mgoin, @yewentao256
 
 ### Integrations
 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index 82fde27d71fd..d469e20c9866 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -8,7 +8,7 @@ EP is typically coupled with Data Parallelism (DP). While DP can be used indepen
 
 Before using EP, you need to install the necessary dependencies. We are actively working on making this easier in the future:
 
-1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
+1. **Install DeepEP**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
 2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
 3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](../../tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
 
@@ -19,7 +19,6 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
 | Backend | Use Case | Features | Best For |
 |---------|----------|----------|----------|
 | `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
-| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development |
 | `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
 | `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
 | `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
@@ -71,12 +70,11 @@ For example, with `TP=2, DP=4` (8 GPUs total):
 The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
 
 ```bash
-# Single node EP deployment with pplx backend
+# Single node EP deployment
 vllm serve deepseek-ai/DeepSeek-V3-0324 \
     --tensor-parallel-size 1 \       # Tensor parallelism across 1 GPU
     --data-parallel-size 8 \         # Data parallelism across 8 processes
-    --enable-expert-parallel \       # Enable expert parallelism
-    --all2all-backend pplx           # Use pplx communication backend
+    --enable-expert-parallel         # Enable expert parallelism
 ```
 
 ## Multi-Node Deployment
@@ -197,7 +195,6 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
     --tensor-parallel-size 1 \       # Tensor parallelism
     --data-parallel-size 8 \         # Data parallelism
     --enable-expert-parallel \       # Enable EP
-    --all2all-backend pplx \         # Use pplx communication backend
     --enable-eplb \                  # Enable load balancer
     --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
 ```
diff --git a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
index b4e922099964..3ce89e1d86f0 100644
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -64,7 +64,7 @@ vllm serve "$MODEL_NAME" \
     --enforce-eager \
     --enable-expert-parallel \
     --enable-eplb \
-    --all2all-backend pplx \
+    --all2all-backend allgather_reducescatter \
     --num-redundant-experts "$REDUNDANT_EXPERTS" \
     --trust-remote-code \
     --host "$HOST" \
diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py
index 87cf0453bea1..9f67129616f9 100644
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -37,7 +37,6 @@
     has_deep_ep,
     has_deep_gemm,
     has_mori,
-    has_pplx,
 )
 
 from .mk_objects import (
@@ -206,10 +205,6 @@ def needs_deep_gemm(self):
         info = expert_info(self.fused_experts_type)
         return info.needs_deep_gemm
 
-    def needs_pplx(self):
-        info = prepare_finalize_info(self.prepare_finalize_type)
-        return info.backend == "pplx"
-
     def needs_deep_ep(self):
         info = prepare_finalize_info(self.prepare_finalize_type)
         return (
@@ -290,8 +285,6 @@ def is_valid(self) -> tuple[bool, str | None]:
             return False, "Needs DeepEP, but DeepEP not available."
         if self.needs_deep_gemm() and not has_deep_gemm():
             return False, "Needs DeepGEMM, but DeepGEMM not available."
-        if self.needs_pplx() and not has_pplx():  # noqa: SIM103
-            return False, "Needs PPLX, but PPLX not available."
         if self.needs_aiter() and not has_aiter():  # noqa: SIM103
             return False, "Needs Aiter, but Aiter not available."
         if self.needs_mori() and not has_mori():  # noqa: SIM103
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index 981f993427b9..0ea414c3af41 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -39,7 +39,6 @@
     has_deep_ep,
     has_deep_gemm,
     has_mori,
-    has_pplx,
 )
 
 
@@ -238,19 +237,6 @@ def expert_info(kind) -> ExpertInfo:
         supports_apply_weight_on_input=False,
     )
 
-if has_pplx():
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize,
-    )
-
-    register_prepare_and_finalize(
-        PplxPrepareAndFinalize,
-        batched_format,
-        common_float_and_int_types,
-        blocked_quantization_support=True,
-        backend="pplx",
-    )
-
 if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
     from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import (  # noqa: E501
         FlashInferCutlassMoEPrepareAndFinalize,
diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
index 3cdc7b82130b..702584f9da53 100644
--- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
+++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
@@ -125,7 +125,7 @@ def run(config: Config):
         description=(
             "Run single prepare-finalize & fused-experts combination test"
             "Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel "  # noqa: E501
-            "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
+            "--pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts"
         )
     )
     args = parser.parse_args()
diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py
index ec31e66140a1..cd1d0a0afe9f 100644
--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -14,7 +14,7 @@
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
-from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
 from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed
 from vllm.v1.worker.workspace import init_workspace_manager
 
@@ -39,12 +39,12 @@
 )
 
 has_any_multi_gpu_package = (
-    has_deep_ep() or has_deep_gemm() or has_pplx() or has_flashinfer_cutlass_fused_moe()
+    has_deep_ep() or has_deep_gemm() or has_flashinfer_cutlass_fused_moe()
 )
 
 meets_multi_gpu_requirements = pytest.mark.skipif(
     not has_any_multi_gpu_package,
-    reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages",
+    reason="Requires deep_ep or deep_gemm or flashinfer packages",
 )
 
 if current_platform.is_fp8_fnuz():
@@ -341,7 +341,7 @@ def test_modular_kernel_combinations_singlegpu(
         description=(
             "Run single prepare-finalize & fused-experts combination test"
             "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
-            "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
+            "--pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts"
         )
     )
     args = parser.parse_args()
diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py
deleted file mode 100644
index d8a6600743e2..000000000000
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-
-import pytest
-import torch
-
-from tests.kernels.utils import torch_experts
-from vllm import _custom_ops as ops
-from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.activation import MoEActivation
-from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig,
-    FusedMoEParallelConfig,
-    RoutingMethodType,
-    fp8_w8a8_moe_quant_config,
-)
-from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExpertsFp8
-from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
-from vllm.platforms import current_platform
-from vllm.utils.math_utils import cdiv
-from vllm.utils.torch_utils import set_random_seed
-from vllm.v1.worker.workspace import init_workspace_manager
-
-from ...utils import multi_gpu_test
-from .parallel_utils import ProcessGroupInfo, parallel_launch
-
-try:
-    from pplx_kernels import AllToAll
-    from pplx_kernels.nvshmem import (
-        nvshmem_alloc_empty_unique_id,
-        nvshmem_finalize,
-        nvshmem_get_unique_id,
-        nvshmem_init,
-    )
-
-    has_pplx = True
-except ImportError:
-    has_pplx = False
-
-requires_pplx = pytest.mark.skipif(
-    not has_pplx,
-    reason="Requires PPLX kernels",
-)
-
-NUM_EXPERTS = [40, 64]
-TOP_KS = [6, 8]
-
-
-def rank_chunk(num, r, w):
-    rem = num % w
-    return (num // w) + (1 if r < rem else 0)
-
-
-def chunk_by_rank(t, r, w):
-    num = t.shape[0]
-    chunk = rank_chunk(num, r, w)
-    rem = num % w
-    if rem == 0 or r < rem:
-        return t[(r * chunk) : (r + 1) * chunk].contiguous()
-    else:
-        long_chunks = (num // w + 1) * rem
-        short_chunks = (r - rem) * chunk
-        start = long_chunks + short_chunks
-        return t[start : start + chunk].contiguous()
-
-
-def pplx_cutlass_moe(
-    pgi: ProcessGroupInfo,
-    dp_size: int,
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    a1_scale: torch.Tensor,
-    out_dtype,
-    per_act_token: bool,
-    per_out_ch: bool,
-    group_name: str | None,
-):
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize,
-    )
-
-    init_workspace_manager(torch.cuda.current_device())
-
-    assert torch.cuda.current_device() == pgi.local_rank
-
-    num_tokens, hidden_dim = a.shape
-    intermediate_dim = w2.shape[2]
-    num_experts = w1.shape[0]
-    block_size = hidden_dim  # TODO support more cases
-    device = pgi.device
-    rank = pgi.rank
-    world_size = pgi.world_size
-    rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
-    max_num_tokens = rank_chunk(num_tokens, 0, world_size)
-    topk = topk_ids.shape[1]
-
-    if block_size == hidden_dim:
-        scale_elems = 4  # hack to circumvent pplx data format requirements
-    else:
-        scale_elems = (hidden_dim + block_size - 1) // block_size
-
-    args = dict(
-        max_num_tokens=max_num_tokens,
-        num_experts=num_experts,
-        experts_per_token=topk,
-        rank=rank,
-        world_size=world_size,
-        dp_size=dp_size,
-        hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim,  # because a.dtype.itemsize == 1
-        hidden_dim_scale_bytes=scale_elems * torch.float32.itemsize,
-    )
-
-    if group_name is None:
-        ata = AllToAll.internode(**args)
-    else:
-        args["group_name"] = group_name
-        ata = AllToAll.intranode(**args)
-
-    w1 = w1.to(device)
-    w2 = w2.to(device)
-    w1_scale = w1_scale.to(device)
-    w2_scale = w2_scale.to(device)
-    a1_scale = a1_scale.to(device)
-
-    assert num_experts % world_size == 0
-    num_local_experts = cdiv(num_experts, world_size)
-    num_dispatchers = pgi.world_size // dp_size
-
-    prepare_finalize = PplxPrepareAndFinalize(
-        ata,
-        max_num_tokens=max_num_tokens,
-        num_local_experts=num_local_experts,
-        num_dispatchers=num_dispatchers,
-    )
-
-    def make_moe_config() -> FusedMoEConfig:
-        return FusedMoEConfig(
-            num_experts=num_experts,
-            experts_per_token=topk,
-            hidden_dim=hidden_dim,
-            intermediate_size_per_partition=intermediate_dim,
-            num_local_experts=num_local_experts,
-            num_logical_experts=num_experts,
-            moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
-            activation=MoEActivation.SILU,
-            in_dtype=torch.bfloat16,
-            device="cuda",
-            routing_method=RoutingMethodType.Llama4,
-        )
-
-    experts = CutlassBatchedExpertsFp8(
-        moe_config=make_moe_config(),
-        quant_config=fp8_w8a8_moe_quant_config(
-            per_act_token_quant=per_act_token,
-            per_out_ch_quant=per_out_ch,
-            w1_scale=chunk_by_rank(w1_scale, rank, world_size),
-            w2_scale=chunk_by_rank(w2_scale, rank, world_size),
-            a1_scale=chunk_by_rank(a1_scale, rank, world_size)
-            if per_act_token
-            else a1_scale[rank],
-        ),
-        max_num_tokens=max_num_tokens,
-        num_dispatchers=num_dispatchers,
-    )
-
-    fused_cutlass_experts = FusedMoEModularKernel(
-        prepare_finalize,
-        experts,
-        inplace=False,
-    )
-
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weights, rank, world_size).to(device)
-    chunk_topk_ids = (
-        chunk_by_rank(topk_ids, rank, world_size).to(torch.uint32).to(device)
-    )
-
-    out = fused_cutlass_experts(
-        a_chunk,
-        chunk_by_rank(w1, rank, world_size),
-        chunk_by_rank(w2, rank, world_size),
-        chunk_topk_weight,
-        chunk_topk_ids,
-        global_num_experts=num_experts,
-        expert_map=None,  # TODO
-    )
-
-    torch.cuda.synchronize()
-
-    ata.destroy()
-
-    return out[:rank_num_tokens]
-
-
-vllm_config = VllmConfig()
-
-
-def _pplx_moe(
-    pgi: ProcessGroupInfo,
-    dp_size: int,
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    a1_scale: torch.Tensor,
-    out_dtype,
-    a_full: torch.Tensor,
-    w1_full: torch.Tensor,
-    w2_full: torch.Tensor,
-    per_act_token: bool,
-    per_out_ch: bool,
-    use_internode: bool,
-):
-    try:
-        if use_internode:
-            uid = (
-                nvshmem_get_unique_id()
-                if pgi.rank == 0
-                else nvshmem_alloc_empty_unique_id()
-            )
-            torch.distributed.broadcast(uid, src=0)
-            nvshmem_init(uid, pgi.rank, pgi.world_size)
-        else:
-            group_ranks = list(range(pgi.world_size))
-            cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-            group_name = cpu_group.group_name
-
-        with set_current_vllm_config(vllm_config):
-            torch_output = torch_experts(
-                a_full, w1_full, w2_full, topk_weights, topk_ids
-            )
-            pplx_output = pplx_cutlass_moe(
-                pgi,
-                dp_size,
-                a,
-                w1,
-                w2,
-                w1_scale,
-                w2_scale,
-                topk_weights,
-                topk_ids,
-                a1_scale,
-                out_dtype,
-                per_act_token,
-                per_out_ch,
-                group_name,
-            )
-
-            torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
-                pplx_output.device
-            )
-
-        # Uncomment if more debugging is needed
-        # print("PPLX OUT:", pplx_output)
-        # print("TORCH OUT:", torch_output)
-
-        torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
-    finally:
-        if use_internode:
-            nvshmem_finalize()
-
-
-@pytest.mark.parametrize("m", [2, 224])
-@pytest.mark.parametrize("n", [3072])
-@pytest.mark.parametrize("k", [1536])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("world_dp_size", [[2, 1]])  # , [4, 2]])
-@pytest.mark.parametrize("use_internode", [False])
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.skipif(
-    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()
-    ),
-    reason="Grouped gemm is not supported on this GPU type.",
-)
-@requires_pplx
-def test_cutlass_moe_pplx(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    per_act_token: bool,
-    per_out_ch: bool,
-    world_dp_size: tuple[int, int],
-    use_internode: bool,
-):
-    set_random_seed(7)
-
-    with set_current_vllm_config(vllm_config):
-        dtype = torch.half
-
-        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0
-        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10.0
-        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10.0
-
-        n_b_scales = 2 * n if per_out_ch else 1
-        k_b_scales = k if per_out_ch else 1
-
-        w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn)
-        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
-        w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32)
-        w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32)
-
-        for expert in range(e):
-            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
-                w1[expert], use_per_token_if_dynamic=per_out_ch
-            )
-            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
-                w2[expert], use_per_token_if_dynamic=per_out_ch
-            )
-
-        w1_d = torch.empty_like(w1)
-        w2_d = torch.empty_like(w2)
-        for expert in range(e):
-            w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half()
-            w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
-
-        score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
-
-        world_size, dp_size = world_dp_size
-        a_scale1 = (
-            torch.randn(
-                (m if per_act_token else 1, 1), device="cuda", dtype=torch.float32
-            )
-            / 10.0
-        )
-        if not per_act_token:
-            a_scale1 = a_scale1.repeat(world_size, 1)
-
-        parallel_launch(
-            world_size,
-            _pplx_moe,
-            dp_size,
-            a,
-            w1_q,
-            w2_q,
-            w1_scale,
-            w2_scale,
-            topk_weights,
-            topk_ids,
-            a_scale1,
-            dtype,
-            a,
-            w1_d,
-            w2_d,
-            per_act_token,
-            per_out_ch,
-            use_internode,
-        )
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
deleted file mode 100644
index deb3b9eb4d76..000000000000
--- a/tests/kernels/moe/test_pplx_moe.py
+++ /dev/null
@@ -1,1021 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for the MOE layers.
-
-Run `pytest tests/kernels/test_pplx_moe.py`.
-"""
-
-import copy
-import itertools
-import textwrap
-import traceback
-from collections.abc import Callable
-
-import pytest
-import torch
-
-try:
-    from pplx_kernels import AllToAll
-    from pplx_kernels.nvshmem import (
-        nvshmem_alloc_empty_unique_id,
-        nvshmem_finalize,
-        nvshmem_get_unique_id,
-        nvshmem_init,
-    )
-
-    has_pplx = True
-except ImportError:
-    has_pplx = False
-
-from tests.kernels.moe.modular_kernel_tools.parallel_utils import _set_vllm_config
-from tests.kernels.moe.utils import (
-    make_dummy_moe_config,
-    make_shared_experts,
-    make_test_weights,
-    naive_batched_moe,
-)
-from tests.kernels.quant_utils import dequant
-from tests.kernels.utils import torch_experts
-from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe import fused_topk, override_config
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
-from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
-from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
-from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate,
-)
-from vllm.utils.math_utils import round_up
-from vllm.utils.torch_utils import set_random_seed
-from vllm.v1.worker.workspace import init_workspace_manager
-
-from ...utils import multi_gpu_test
-from .parallel_utils import ProcessGroupInfo, parallel_launch
-
-requires_pplx = pytest.mark.skipif(
-    not has_pplx,
-    reason="Requires PPLX kernels",
-)
-
-BATCHED_MOE_MNK_FACTORS = [
-    (1, 128, 128),
-    (33, 2048, 128),
-    (64, 128, 2048),
-    (222, 128, 128),
-    (222, 2048, 1024),
-]
-
-PPLX_COMBOS = [
-    # TODO(bnell): figure out why this fails, seems to be test problem
-    # (1, 128, 128),
-    (2, 128, 512),
-    (3, 1024, 2048),
-    (4, 128, 128),
-    (32, 1024, 512),
-    (45, 512, 2048),
-    (64, 1024, 512),
-    (222, 2048, 1024),
-    (256, 1408, 2048),
-]
-
-NUM_EXPERTS = [8, 64]
-TOP_KS = [1, 2, 6]
-DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
-
-vllm_config = VllmConfig()
-
-
-def torch_prepare(
-    a: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_experts: int,
-    max_num_tokens: int | None = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    assert topk_ids.dim() == 2
-    assert topk_ids.shape[0] == a.shape[0]
-
-    num_tokens, hidden_dim = a.shape
-    topk = topk_ids.shape[1]
-
-    tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
-
-    assert tokens_per_expert.numel() == num_experts
-
-    if max_num_tokens is None:
-        max_num_tokens = int(tokens_per_expert.max().item())
-
-    b_a = torch.zeros(
-        (num_experts, max_num_tokens, hidden_dim), dtype=a.dtype, device=a.device
-    )
-
-    token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device)
-
-    for token in range(num_tokens):
-        for j in range(topk):
-            expert_id = topk_ids[token, j]
-            idx = token_counts[expert_id]
-            b_a[expert_id, idx : idx + 1, :] = a[token, :]
-            token_counts[expert_id] = token_counts[expert_id] + 1
-
-    return b_a, tokens_per_expert
-
-
-def torch_finalize(
-    b_out: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor
-) -> torch.Tensor:
-    num_tokens = topk_ids.shape[0]
-    num_experts = b_out.shape[0]
-    K = b_out.shape[-1]
-    out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device)
-    expert_counts = torch.zeros(num_experts, dtype=torch.int, device=b_out.device)
-    for token in range(num_tokens):
-        expert_ids = topk_ids[token]
-        for i in range(expert_ids.numel()):
-            expert_id = expert_ids[i]
-            idx = expert_counts[expert_id]
-            out[token, :] = (
-                out[token, :]
-                + b_out[expert_id, idx : idx + 1, :] * topk_weight[token, i]
-            )
-            expert_counts[expert_id] = expert_counts[expert_id] + 1
-
-    return out
-
-
-def torch_batched_moe(
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-) -> torch.Tensor:
-    num_experts = w1.shape[0]
-    b_a, tokens_per_expert = torch_prepare(a, topk_ids, num_experts)
-    assert b_a.dim() == 3
-    num_tokens, topk = topk_ids.shape
-    _, max_num_tokens, K = b_a.shape
-    assert num_experts == b_a.shape[0] and w2.shape[1] == K
-    out = torch.zeros(
-        (num_experts, max_num_tokens, K), dtype=b_a.dtype, device=b_a.device
-    )
-    tmp = torch.empty(
-        (max_num_tokens, w1.shape[1] // 2), dtype=b_a.dtype, device=b_a.device
-    )
-    for expert in range(num_experts):
-        num = tokens_per_expert[expert]
-        if num > 0:
-            torch.ops._C.silu_and_mul(
-                tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1)
-            )
-            out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1)
-
-    return torch_finalize(out, topk_weight, topk_ids)
-
-
-@pytest.mark.parametrize("m,n,k", BATCHED_MOE_MNK_FACTORS)
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_fused_moe_batched_experts(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    dtype: torch.dtype,
-    workspace_init,
-):
-    set_random_seed(7)
-
-    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-    score = torch.randn((m, e), device="cuda", dtype=dtype)
-
-    with set_current_vllm_config(vllm_config):
-        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        baseline_output = torch_experts(
-            a, w1, w2, topk_weight, topk_ids
-        )  # only for baseline
-        torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
-        batched_output = naive_batched_moe(
-            a, w1, w2, topk_weight, topk_ids
-        )  # pick torch_experts or this
-
-    torch.testing.assert_close(baseline_output, torch_output, atol=2e-2, rtol=0)
-    torch.testing.assert_close(baseline_output, batched_output, atol=2e-2, rtol=0)
-
-
-def create_pplx_prepare_finalize(
-    num_tokens: int,
-    hidden_dim: int,
-    topk: int,
-    num_experts: int,
-    rank: int,
-    dp_size: int,
-    world_size: int,
-    in_dtype: torch.dtype,
-    quant_dtype: torch.dtype | None,
-    block_shape: list[int] | None,
-    per_act_token_quant: bool,
-    group_name: str | None,
-):
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize,
-        pplx_hidden_dim_scale_bytes,
-    )
-
-    max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
-    num_local_experts = rank_chunk(num_experts, 0, world_size)
-
-    hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
-        max_num_tokens,
-        hidden_dim,
-        in_dtype,
-        quant_dtype,
-        per_act_token_quant=per_act_token_quant,
-        block_shape=block_shape,
-    )
-
-    args = dict(
-        max_num_tokens=max_num_tokens,
-        num_experts=num_experts,
-        experts_per_token=topk,
-        rank=rank,
-        world_size=world_size,
-        dp_size=dp_size,
-        hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim_bytes,
-        hidden_dim_scale_bytes=scale_bytes,
-    )
-
-    if group_name is None:
-        ata = AllToAll.internode(**args)
-    else:
-        args["group_name"] = group_name
-        ata = AllToAll.intranode(**args)
-
-    prepare_finalize = PplxPrepareAndFinalize(
-        ata,
-        max_num_tokens=max_num_tokens,
-        num_local_experts=num_local_experts,
-        num_dispatchers=world_size // dp_size,
-    )
-
-    return prepare_finalize, ata
-
-
-def rank_chunk(num: int, r: int, w: int) -> int:
-    rem = num % w
-    return (num // w) + (1 if r < rem else 0)
-
-
-def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
-    chunk = rank_chunk(t.shape[0], r, w)
-    return t[(r * chunk) : (r + 1) * chunk]
-
-
-def maybe_chunk_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
-    if t is not None:
-        return chunk_by_rank(t, r, w)
-    else:
-        return t
-
-
-def chunk_scales_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
-    if t is not None and t.numel() > 1:
-        chunk = rank_chunk(t.shape[0], r, w)
-        return t[(r * chunk) : (r + 1) * chunk]
-    else:
-        return t
-
-
-def chunk_scales(t: torch.Tensor | None, start: int, end: int) -> torch.Tensor | None:
-    if t is not None and t.numel() > 1:
-        return t[start:end]
-    else:
-        return t
-
-
-def dummy_work(a: torch.Tensor) -> torch.Tensor:
-    return a * 1.1
-
-
-def pplx_prepare_finalize(
-    pgi: ProcessGroupInfo,
-    dp_size: int,
-    a: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_experts: int,
-    quant_dtype: torch.dtype | None,
-    block_shape: list[int] | None,
-    per_act_token_quant: bool,
-    group_name: str | None,
-) -> torch.Tensor:
-    assert torch.cuda.current_device() == pgi.local_rank
-
-    topk = topk_ids.shape[1]
-    num_tokens, hidden_dim = a.shape
-    device = pgi.device
-    rank = pgi.rank
-    world_size = pgi.world_size
-
-    topk_ids = topk_ids.to(dtype=torch.uint32)
-
-    prepare_finalize, ata = create_pplx_prepare_finalize(
-        num_tokens,
-        hidden_dim,
-        topk,
-        num_experts,
-        rank,
-        dp_size,
-        world_size,
-        a.dtype,
-        quant_dtype,
-        block_shape,
-        per_act_token_quant,
-        group_name,
-    )
-
-    assert a.shape[0] == topk_ids.shape[0]
-
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
-
-    assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
-
-    out = torch.full(
-        a_chunk.shape,
-        torch.nan,
-        dtype=a.dtype,
-        device=device,
-    )
-
-    if quant_dtype is not None and not per_act_token_quant and block_shape is None:
-        a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-        a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    else:
-        a1_scale = None
-        a2_scale = None
-
-    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
-        a_chunk,
-        chunk_topk_weight,
-        chunk_topk_ids,
-        num_experts,
-        None,
-        False,
-        FusedMoEQuantConfig.make(
-            quant_dtype,
-            per_act_token_quant=per_act_token_quant,
-            per_out_ch_quant=False,
-            block_shape=block_shape,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-        ),
-    )
-
-    b_a = dummy_work(dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
-
-    prepare_finalize.finalize(
-        out,
-        b_a,
-        chunk_topk_weight,
-        chunk_topk_ids,
-        False,
-        weight_and_reduce_impl=TopKWeightAndReduceDelegate(),
-    )
-
-    torch.cuda.synchronize()
-
-    ata.destroy()
-
-    num_tokens = a_chunk.shape[0]
-
-    return out[:num_tokens]
-
-
-def _pplx_prepare_finalize(
-    pgi: ProcessGroupInfo,
-    dp_size: int,
-    a: torch.Tensor,
-    score: torch.Tensor,
-    topk: torch.Tensor,
-    num_experts: int,
-    quant_dtype: torch.dtype | None,
-    block_shape: list[int] | None,
-    per_act_token_quant: bool,
-    use_internode: bool,
-):
-    try:
-        if use_internode:
-            uid = (
-                nvshmem_get_unique_id()
-                if pgi.rank == 0
-                else nvshmem_alloc_empty_unique_id()
-            )
-            torch.distributed.broadcast(uid, src=0)
-            nvshmem_init(uid, pgi.rank, pgi.world_size)
-            group_name = None
-        else:
-            group_ranks = list(range(pgi.world_size))
-            cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-            group_name = cpu_group.group_name
-
-        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        m, k = a.shape
-
-        a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
-
-        torch_output = (
-            a_rep.view(m, topk, k) * topk_weight.view(m, topk, 1).to(a_rep.dtype)
-        ).sum(dim=1)
-
-        pplx_output = pplx_prepare_finalize(
-            pgi,
-            dp_size,
-            a,
-            topk_weight,
-            topk_ids,
-            num_experts,
-            quant_dtype,
-            block_shape,
-            per_act_token_quant,
-            group_name,
-        )
-
-        torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
-            pgi.device
-        )
-
-        torch.testing.assert_close(pplx_output, torch_output, atol=3e-2, rtol=3e-2)
-    finally:
-        if use_internode:
-            nvshmem_finalize()
-
-
-@pytest.mark.parametrize("mnk", PPLX_COMBOS)
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("world_dp_size", [[2, 1]])
-@pytest.mark.parametrize("per_act_token_quant", [False, True])
-@pytest.mark.parametrize("block_shape", [None, [128, 128]])
-@pytest.mark.parametrize("use_internode", [False])
-@pytest.mark.optional
-@requires_pplx
-@multi_gpu_test(num_gpus=2)
-def test_pplx_prepare_finalize_slow(
-    mnk: tuple[int, int, int],
-    e: int,
-    topk: int,
-    dtype: torch.dtype,
-    world_dp_size: tuple[int, int],
-    per_act_token_quant: bool,
-    block_shape: list[int] | None,
-    use_internode: bool,
-):
-    if dtype == torch.float8_e4m3fn:
-        use_fp8_w8a8 = True
-        act_dtype = torch.bfloat16
-        quant_dtype = dtype
-    else:
-        use_fp8_w8a8 = False
-        act_dtype = dtype
-        quant_dtype = None
-
-    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
-        pytest.skip("Skip quantization test for non-quantized type")
-
-    if per_act_token_quant and block_shape is not None:
-        pytest.skip("Skip illegal quantization combination")
-
-    set_random_seed(7)
-    m, n, k = mnk
-    world_size, dp_size = world_dp_size
-    device = "cuda"
-
-    a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
-    score = torch.randn((m, e), device=device, dtype=act_dtype)
-
-    parallel_launch(
-        world_size,
-        _pplx_prepare_finalize,
-        dp_size,
-        a,
-        score,
-        topk,
-        e,
-        quant_dtype,
-        block_shape,
-        per_act_token_quant,
-        use_internode,
-    )
-
-
-def pplx_moe(
-    group_name: str | None,
-    rank: int,
-    world_size: int,
-    dp_size: int,
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    w1_scale: torch.Tensor | None = None,
-    w2_scale: torch.Tensor | None = None,
-    a1_scale: torch.Tensor | None = None,
-    a2_scale: torch.Tensor | None = None,
-    quant_dtype: torch.dtype | None = None,
-    per_act_token_quant=False,
-    block_shape: list[int] | None = None,
-    use_compile: bool = False,
-    use_cudagraphs: bool = True,
-    shared_experts: torch.nn.Module | None = None,
-) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-    num_tokens, hidden_dim = a.shape
-    num_experts = w1.shape[0]
-    topk = topk_ids.shape[1]
-    max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
-
-    prepare_finalize, ata = create_pplx_prepare_finalize(
-        num_tokens,
-        hidden_dim,
-        topk,
-        num_experts,
-        rank,
-        dp_size,
-        world_size,
-        a.dtype,
-        quant_dtype,
-        block_shape,
-        per_act_token_quant,
-        group_name,
-    )
-
-    topk_ids = topk_ids.to(dtype=torch.uint32)
-
-    # Note: workers with the same dp_rank must use the exact same inputs.
-    a_chunk = chunk_by_rank(a, rank, world_size)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
-
-    # Chunking weights like this only works for batched format
-    w1_chunk = chunk_by_rank(w1, rank, world_size)
-    w2_chunk = chunk_by_rank(w2, rank, world_size)
-    w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
-    w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
-    a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
-    a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
-
-    quant_config = FusedMoEQuantConfig.make(
-        quant_dtype,
-        block_shape=block_shape,
-        per_act_token_quant=per_act_token_quant,
-        w1_scale=w1_scale_chunk,
-        w2_scale=w2_scale_chunk,
-        a1_scale=a1_scale_chunk,
-        a2_scale=a2_scale_chunk,
-    )
-
-    experts = BatchedTritonExperts(
-        max_num_tokens=max_num_tokens,
-        num_dispatchers=prepare_finalize.num_dispatchers(),
-        quant_config=quant_config,
-        moe_config=make_dummy_moe_config(),
-    )
-
-    fused_experts = FusedMoEModularKernel(
-        prepare_finalize,
-        experts,
-        shared_experts,
-        inplace=False,
-    )
-
-    # Note: for now use_compile will error out if the problem size is
-    # large enough to trigger chunking. I'm leaving the flag and
-    # setup code in case we are able to revisit this later.
-    if use_compile:
-        _fused_experts = torch.compile(
-            fused_experts, backend="inductor", fullgraph=True
-        )
-        torch._dynamo.mark_dynamic(a_chunk, 0)
-        torch._dynamo.mark_dynamic(chunk_topk_weight, 0)
-        torch._dynamo.mark_dynamic(chunk_topk_ids, 0)
-    else:
-        _fused_experts = fused_experts
-
-    out = _fused_experts(
-        a_chunk,
-        w1_chunk,
-        w2_chunk,
-        chunk_topk_weight,
-        chunk_topk_ids,
-        global_num_experts=num_experts,
-    )
-
-    if use_cudagraphs:
-        if isinstance(out, tuple):
-            out[0].fill_(0)
-            out[1].fill_(0)
-        else:
-            out.fill_(0)
-        stream = torch.cuda.Stream()
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph, stream=stream):
-            out = _fused_experts(
-                a_chunk,
-                w1_chunk,
-                w2_chunk,
-                chunk_topk_weight,
-                chunk_topk_ids,
-                global_num_experts=num_experts,
-            )
-
-        torch.cuda.synchronize()
-        graph.replay()
-
-    torch.cuda.synchronize()
-
-    ata.destroy()
-
-    return out
-
-
-def _pplx_moe(
-    pgi: ProcessGroupInfo,
-    dp_size: int,
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    score: torch.Tensor,
-    topk: int,
-    num_experts: int,
-    w1_s: torch.Tensor | None = None,
-    w2_s: torch.Tensor | None = None,
-    quant_dtype: torch.dtype | None = None,
-    per_act_token_quant: bool = False,
-    block_shape: list[int] | None = None,
-    use_internode: bool = False,
-    shared_experts: torch.nn.Module | None = None,
-):
-    try:
-        if use_internode:
-            uid = (
-                nvshmem_get_unique_id()
-                if pgi.rank == 0
-                else nvshmem_alloc_empty_unique_id()
-            )
-            torch.distributed.broadcast(uid, src=0)
-            nvshmem_init(uid, pgi.rank, pgi.world_size)
-            group_name = None
-        else:
-            group_ranks = list(range(pgi.world_size))
-            cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-            group_name = cpu_group.group_name
-
-        m, k = a.shape
-        e, _, n = w2.shape
-
-        moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
-
-        device = torch.device("cuda", pgi.rank)
-        rank = pgi.rank
-        world_size = pgi.world_size
-
-        a = a.to(device)
-        w1 = w1.to(device)
-        w2 = w2.to(device)
-        w1_s = w1_s.to(device) if w1_s is not None else None
-        w2_s = w2_s.to(device) if w2_s is not None else None
-
-        if quant_dtype is not None and not per_act_token_quant and block_shape is None:
-            a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-            a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-        else:
-            a1_scale = None
-            a2_scale = None
-
-        with set_current_vllm_config(vllm_config), override_config(moe_config):
-            topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-
-            shared_output = shared_experts(a) if shared_experts is not None else None
-
-            torch_output = torch_experts(
-                a,
-                w1,
-                w2,
-                topk_weight,
-                topk_ids,
-                w1_scale=w1_s,
-                w2_scale=w2_s,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale,
-                quant_dtype=quant_dtype,
-                per_act_token_quant=per_act_token_quant,
-                block_shape=block_shape,
-            )
-
-            batched_output = naive_batched_moe(
-                a,
-                w1,
-                w2,
-                topk_weight,
-                topk_ids,
-                w1_scale=w1_s,
-                w2_scale=w2_s,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale,
-                quant_dtype=quant_dtype,
-                per_act_token_quant=per_act_token_quant,
-                block_shape=block_shape,
-            )
-
-            pplx_outputs = pplx_moe(
-                group_name,
-                rank,
-                world_size,
-                dp_size,
-                a,
-                w1,
-                w2,
-                topk_weight,
-                topk_ids,
-                w1_scale=w1_s,
-                w2_scale=w2_s,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale,
-                quant_dtype=quant_dtype,
-                per_act_token_quant=per_act_token_quant,
-                block_shape=block_shape,
-                shared_experts=shared_experts,
-            )
-
-        if shared_experts is None:
-            pplx_shared_output = None
-            pplx_output = pplx_outputs
-            assert isinstance(pplx_output, torch.Tensor)
-        else:
-            pplx_shared_output, pplx_output = pplx_outputs
-
-        if shared_output is not None:
-            assert pplx_shared_output is not None
-            chunked_shared_output = chunk_by_rank(
-                shared_output, pgi.rank, pgi.world_size
-            ).to(pplx_shared_output.device)
-        else:
-            chunked_shared_output = None
-
-        chunked_batch_output = chunk_by_rank(
-            batched_output, pgi.rank, pgi.world_size
-        ).to(pplx_output.device)
-
-        torch.testing.assert_close(batched_output, torch_output, atol=3e-2, rtol=3e-2)
-
-        torch.testing.assert_close(
-            pplx_output, chunked_batch_output, atol=3e-2, rtol=3e-2
-        )
-
-        if shared_experts is not None:
-            assert chunked_shared_output is not None
-            assert pplx_shared_output is not None
-            torch.testing.assert_close(
-                pplx_shared_output, chunked_shared_output, atol=3e-2, rtol=3e-2
-            )
-
-    finally:
-        if use_internode:
-            nvshmem_finalize()
-
-
-@pytest.mark.parametrize("mnk", PPLX_COMBOS)
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("world_dp_size", [[2, 1]])
-@pytest.mark.parametrize("per_act_token_quant", [False, True])
-@pytest.mark.parametrize("block_shape", [None, [128, 128]])
-@pytest.mark.parametrize("use_internode", [False])
-@pytest.mark.optional
-@requires_pplx
-@multi_gpu_test(num_gpus=2)
-def test_pplx_moe_slow(
-    mnk: tuple[int, int, int],
-    e: int,
-    topk: int,
-    dtype: torch.dtype,
-    world_dp_size: tuple[int, int],
-    per_act_token_quant: bool,
-    block_shape: list[int] | None,
-    use_internode: bool,
-):
-    set_random_seed(7)
-    m, n, k = mnk
-    world_size, dp_size = world_dp_size
-
-    if dtype == torch.float8_e4m3fn:
-        use_fp8_w8a8 = True
-        quant_dtype = dtype
-    else:
-        use_fp8_w8a8 = False
-        quant_dtype = None
-
-    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
-        pytest.skip("Skip quantization test for non-quantized type")
-
-    if per_act_token_quant and block_shape is not None:
-        pytest.skip("Skip illegal quantization combination")
-
-    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
-    score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
-
-    (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights(
-        e,
-        n,
-        k,
-        quant_dtype=quant_dtype,
-        block_shape=block_shape,
-        per_out_ch_quant=per_act_token_quant,
-    )
-
-    parallel_launch(
-        world_size,
-        _pplx_moe,
-        dp_size,
-        a,
-        w1,
-        w2,
-        score,
-        topk,
-        e,
-        w1_s,
-        w2_s,
-        quant_dtype,
-        per_act_token_quant,
-        block_shape,
-        use_internode,
-    )
-
-
-def _pplx_test_loop(
-    pgi: ProcessGroupInfo,
-    dp_size: int,
-    use_internode: bool,
-    use_shared_experts: bool,
-    make_weights: bool,
-    test_fn: Callable,
-):
-    device = torch.device(f"cuda:{pgi.local_rank}")
-    init_workspace_manager(device)
-
-    def format_result(msg, ex=None):
-        if ex is not None:
-            x = str(ex)
-            newx = x.strip(" \n\t")[:16]
-            if len(newx) < len(x):
-                newx = newx + " ..."
-
-            prefix = "E\t"
-            print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
-            print(f"FAILED {msg} - {newx}\n")
-        else:
-            print(f"PASSED {msg}")
-
-    if use_shared_experts:
-        # Note: this config is only needed for the non-naive shared experts.
-        new_vllm_config = copy.deepcopy(vllm_config)
-        new_vllm_config.parallel_config.data_parallel_size = pgi.world_size
-        new_vllm_config.parallel_config.enable_expert_parallel = True
-        _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
-
-    set_random_seed(7)
-    combos = itertools.product(
-        PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
-    )
-    exceptions = []
-    count = 0
-    for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
-        count = count + 1
-        m, n, k = mnk
-
-        if dtype == torch.float8_e4m3fn:
-            use_fp8_w8a8 = True
-            quant_dtype = dtype
-        else:
-            use_fp8_w8a8 = False
-            quant_dtype = None
-
-        test_desc = (
-            f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
-            f"dtype={dtype}, per_act_token={per_act_token_quant}, "
-            f"block_shape={block_shape}, use_internode={use_internode}, "
-            f"use_shared_experts={use_shared_experts}"
-        )
-
-        if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
-            print(f"{test_desc} - Skip quantization test for non-quantized type.")
-            continue
-
-        if per_act_token_quant and block_shape is not None:
-            print(f"{test_desc} - Skip illegal quantization combination.")
-            continue
-
-        a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
-        score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
-
-        args = dict()
-        if make_weights:
-            (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights(
-                e,
-                n,
-                k,
-                quant_dtype=quant_dtype,
-                block_shape=block_shape,
-                per_out_ch_quant=per_act_token_quant,
-            )
-            args["w1"] = w1
-            args["w2"] = w2
-            args["w1_s"] = w1_s
-            args["w2_s"] = w2_s
-
-        if use_shared_experts:
-            args["shared_experts"] = make_shared_experts(
-                n,
-                k,
-                in_dtype=a.dtype,
-                quant_dtype=quant_dtype,
-            )
-
-        try:
-            test_fn(
-                pgi=pgi,
-                dp_size=dp_size,
-                a=a,
-                score=score,
-                topk=topk,
-                num_experts=e,
-                quant_dtype=quant_dtype,
-                per_act_token_quant=per_act_token_quant,
-                block_shape=block_shape,
-                use_internode=use_internode,
-                **args,
-            )
-            format_result(test_desc)
-        except Exception as ex:
-            format_result(test_desc, ex)
-            exceptions.append(ex)
-
-    if len(exceptions) > 0:
-        raise RuntimeError(
-            f"{len(exceptions)} of {count} tests failed in child process, "
-            f"rank={pgi.rank}."
-        )
-    else:
-        print(f"{count} of {count} tests passed in child process, rank={pgi.rank}.")
-
-
-@pytest.mark.parametrize("world_dp_size", [[2, 1]])
-@pytest.mark.parametrize("use_internode", [False])
-@requires_pplx
-@multi_gpu_test(num_gpus=2)
-def test_pplx_prepare_finalize(
-    world_dp_size: tuple[int, int],
-    use_internode: bool,
-):
-    set_random_seed(7)
-    world_size, dp_size = world_dp_size
-    parallel_launch(
-        world_size * dp_size,
-        _pplx_test_loop,
-        dp_size,
-        use_internode,
-        False,
-        False,
-        _pplx_prepare_finalize,
-    )
-
-
-@pytest.mark.parametrize("world_dp_size", [[2, 1]])
-@pytest.mark.parametrize("use_internode", [False])
-@pytest.mark.parametrize("use_shared_experts", [False, True])
-@requires_pplx
-@multi_gpu_test(num_gpus=2)
-def test_pplx_moe(
-    world_dp_size: tuple[int, int],
-    use_internode: bool,
-    use_shared_experts: bool,
-):
-    set_random_seed(7)
-    world_size, dp_size = world_dp_size
-    parallel_launch(
-        world_size,
-        _pplx_test_loop,
-        dp_size,
-        use_internode,
-        use_shared_experts,
-        True,
-        _pplx_moe,
-    )
diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md
index ab0e358802bf..b4eabe18ca1d 100644
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@@ -4,7 +4,7 @@ Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Tech
 
 Here we break down the requirements in 2 steps:
 
-1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
+1. Build and install the Python libraries ([DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
 
 Step 2 is necessary for multi-node deployment.
diff --git a/tools/ep_kernels/elastic_ep/install_eep_libraries.sh b/tools/ep_kernels/elastic_ep/install_eep_libraries.sh
index fe7b862159df..31519c287162 100755
--- a/tools/ep_kernels/elastic_ep/install_eep_libraries.sh
+++ b/tools/ep_kernels/elastic_ep/install_eep_libraries.sh
@@ -76,11 +76,4 @@ popd
 
 export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
 
-# build and install pplx, require pytorch installed
-pushd "$WORKSPACE"
-git clone https://github.com/ppl-ai/pplx-kernels
-cd pplx-kernels
-# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
-# PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v
 
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 148cb6e18d8d..3372dd10f4dc 100755
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -4,12 +4,10 @@ set -ex
 # usage: ./install_python_libraries.sh [options]
 #   --workspace <dir>    workspace directory (default: ./ep_kernels_workspace)
 #   --mode <mode>        "install" (default) or "wheel"
-#   --pplx-ref <commit>  pplx-kernels commit hash
 #   --deepep-ref <commit> DeepEP commit hash
 #   --nvshmem-ver <ver>  NVSHMEM version 
 
 CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
-PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
 DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
 NVSHMEM_VER=${NVSHMEM_VER:-"3.3.24"}  # Default supports both CUDA 12 and 13
 WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
@@ -35,14 +33,6 @@ while [[ $# -gt 0 ]]; do
             MODE="$2"
             shift 2
             ;;
-        --pplx-ref)
-            if [[ -z "$2" || "$2" =~ ^- ]]; then
-                echo "Error: --pplx-ref requires an argument." >&2
-                exit 1
-            fi
-            PPLX_COMMIT_HASH="$2"
-            shift 2
-            ;;
         --deepep-ref)
             if [[ -z "$2" || "$2" =~ ^- ]]; then
                 echo "Error: --deepep-ref requires an argument." >&2
@@ -188,14 +178,6 @@ do_build() {
     popd
 }
 
-# build pplx-kernels
-do_build \
-    "https://github.com/ppl-ai/pplx-kernels" \
-    "pplx-kernels" \
-    "setup.py" \
-    "$PPLX_COMMIT_HASH" \
-    ""
-
 # build DeepEP
 do_build \
     "https://github.com/deepseek-ai/DeepEP" \
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index e48ba6c997eb..37cf43620340 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -988,7 +988,7 @@ def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
     return output_tensor
 
 
-def get_cutlass_pplx_moe_mm_data(
+def get_cutlass_batched_moe_mm_data(
     expert_offsets: torch.Tensor,
     problem_sizes1: torch.Tensor,
     problem_sizes2: torch.Tensor,
@@ -1011,7 +1011,7 @@ def get_cutlass_pplx_moe_mm_data(
                                       multiplication in two grouped MMs used in
                                       the fused MoE operation.
     """
-    return torch.ops._C.get_cutlass_pplx_moe_mm_data(
+    return torch.ops._C.get_cutlass_batched_moe_mm_data(
         expert_offsets,
         problem_sizes1,
         problem_sizes2,
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index d22e9a96e0f3..01dc61cdcad0 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -1045,7 +1045,7 @@ def set_splitting_ops_for_v1(
                 "are optimized for prefill and are incompatible with CUDA Graphs. "
                 "In order to use CUDA Graphs for decode-optimized workloads, "
                 "use --all2all-backend with another option, such as "
-                "deepep_low_latency, pplx, or allgather_reducescatter."
+                "deepep_low_latency or allgather_reducescatter."
             )
             self.cudagraph_mode = CUDAGraphMode.NONE
 
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index cc2cfa97b503..fa4f72dccca4 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -152,7 +152,6 @@ class ParallelConfig:
 
     - "naive": Naive all2all implementation using broadcasts\n
     - "allgather_reducescatter": All2all based on allgather and reducescatter\n
-    - "pplx": Use pplx kernels\n
     - "deepep_high_throughput": Use deepep high-throughput kernels\n
     - "deepep_low_latency": Use deepep low-latency kernels\n
     - "mori": Use mori kernels\n
@@ -310,6 +309,13 @@ def _validate_parallel_config(self) -> Self:
                 f"but found: {self._api_process_rank}"
             )
 
+        if self.all2all_backend == "pplx":
+            logger.warning(
+                "The 'pplx' all2all backend has been removed. "
+                "Falling back to 'allgather_reducescatter'."
+            )
+            self.all2all_backend = "allgather_reducescatter"
+
         if self.data_parallel_size_local > self.data_parallel_size:
             raise ValueError(
                 f"data_parallel_size_local ({self.data_parallel_size_local}) "
@@ -442,7 +448,6 @@ def stateless_init_dp_group(self) -> ProcessGroup:
     # In this case, ensure the input to the experts is sequence parallel
     # to avoid the excess work.
     #
-    # Not needed for pplx-kernels as it can handle duplicate input tokens.
     @property
     def use_sequence_parallel_moe(self) -> bool:
         return (
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 678cd45800de..4acab4e3c88e 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -3,14 +3,13 @@
 from typing import Any
 
 import torch
-import torch.distributed as dist
 
 import vllm.envs as envs
 from vllm.distributed import get_dp_group, get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.utils.flashinfer import has_flashinfer_all2all
-from vllm.utils.import_utils import has_deep_ep, has_mori, has_pplx
+from vllm.utils.import_utils import has_deep_ep, has_mori
 
 from .base_device_communicator import All2AllManagerBase, Cache
 
@@ -235,96 +234,6 @@ def destroy(self):
         pass
 
 
-class PPLXAll2AllManager(All2AllManagerBase):
-    """
-    All2All communication based on PPLX kernels.
-    """
-
-    def __init__(self, cpu_group):
-        assert has_pplx(), (
-            "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
-            " to install pplx_kernels."
-        )
-        super().__init__(cpu_group)
-
-        if self.internode:
-            # inter-node communication needs nvshmem,
-            # intra-node communication uses p2p mapping directly
-            from pplx_kernels.nvshmem import (  # type: ignore[import-not-found]
-                nvshmem_alloc_empty_unique_id,
-                nvshmem_get_unique_id,
-                nvshmem_init,
-            )
-
-            logger.debug(
-                "Initialize NVSHMEM for pplx_kernels: rank=%d, world size=%d",
-                self.rank,
-                self.world_size,
-            )
-            uid = (
-                nvshmem_get_unique_id()
-                if self.rank == 0
-                else nvshmem_alloc_empty_unique_id()
-            )
-            dist.broadcast(
-                uid,
-                src=dist.get_process_group_ranks(self.cpu_group)[0],
-                group=self.cpu_group,
-            )
-            logger.debug("PPLX NVSHMEM UID = %s", uid)
-            nvshmem_init(uid, self.rank, self.world_size)
-
-        self.handle_cache = Cache()
-
-    def get_handle(self, kwargs):
-        import pplx_kernels as pplx  # type: ignore[import-not-found]
-
-        return self.handle_cache.get_or_create(
-            kwargs,
-            pplx.AllToAll.internode if self.internode else pplx.AllToAll.intranode,
-        )
-
-    def dispatch_router_logits(
-        self,
-        hidden_states: torch.Tensor,
-        router_logits: torch.Tensor,
-        is_sequence_parallel: bool = False,
-        extra_tensors: list[torch.Tensor] | None = None,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        raise NotImplementedError
-
-    def dispatch(
-        self,
-        hidden_states: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        is_sequence_parallel: bool = False,
-        extra_tensors: list[torch.Tensor] | None = None,
-    ) -> (
-        tuple[torch.Tensor, torch.Tensor, torch.Tensor]
-        | tuple[torch.Tensor, torch.Tensor, torch.Tensor, list[torch.Tensor]]
-    ):
-        raise NotImplementedError
-
-    def combine(
-        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
-    ) -> torch.Tensor:
-        raise NotImplementedError
-
-    def destroy(self):
-        with self.handle_cache._lock:
-            for _, handle in self.handle_cache._cache.items():
-                handle.destroy()
-
-        if self.internode:
-            from pplx_kernels.nvshmem import (
-                nvshmem_finalize,  # type: ignore[import-not-found]
-            )
-
-            logger.debug("PPLX NVSHMEM finalize")
-            nvshmem_finalize()
-
-
 class DeepEPAll2AllManagerBase(All2AllManagerBase):
     """
     All2All communication based on DeepEP High-Throughput kernels.
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 62e2b90377fd..dd571482f5c8 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -112,10 +112,6 @@ def __init__(
                 from .all2all import AgRsAll2AllManager
 
                 self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
-            elif self.all2all_backend == "pplx":
-                from .all2all import PPLXAll2AllManager
-
-                self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
             elif self.all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager
 
@@ -298,7 +294,7 @@ def destroy(self):
             self.fi_ar_comm = None
         if self.all2all_manager is not None:
             self.all2all_manager.destroy()
-            self.all2all_manager = None
+            self.all2all_manager = None  # type: ignore[assignment]
 
     def all_gatherv(
         self,
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 7c3701b4ea93..891f19cfe254 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -159,7 +159,7 @@ class EplbModelState:
 
     NOTE: The expert_load_view now records load for all physical experts
     rather than just local experts. This ensures consistent load statistics
-    across different dispatch methods (naive all-to-all, DeepEP, pplx-kernels).
+    across different dispatch methods (naive all-to-all, DeepEP).
     The recorded load will be multiplied by dp_size when using naive all-to-all
     due to each DP rank contributing the same token set to the calculation.
     See:
diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py
index bf8ec2dc6f20..8c1bfe1c3675 100644
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
 
 import torch
 
@@ -24,16 +25,11 @@
     MoEPrepareAndFinalizeNoEP,
 )
 from vllm.platforms import current_platform
-from vllm.utils.import_utils import has_deep_ep, has_mori, has_pplx
+from vllm.utils.import_utils import has_deep_ep, has_mori
 
 logger = init_logger(__name__)
 
 if current_platform.is_cuda_alike():
-    if has_pplx():
-        from .pplx_prepare_finalize import (
-            PplxPrepareAndFinalize,
-            pplx_hidden_dim_scale_bytes,
-        )
     if has_deep_ep():
         from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
         from .deepep_ll_prepare_finalize import (
@@ -120,51 +116,10 @@ def maybe_make_prepare_finalize(
 
     prepare_finalize: FusedMoEPrepareAndFinalize | None = None
 
-    if moe.use_pplx_kernels:
-        assert quant_config is not None
-
-        hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
-            moe.max_num_tokens,
-            moe.hidden_dim,
-            moe.in_dtype,
-            quant_config.quant_dtype,
-            per_act_token_quant=quant_config.per_act_token_quant,
-            block_shape=quant_config.block_shape,
-        )
-
-        all_to_all_args = dict(
-            max_num_tokens=moe.max_num_tokens,
-            num_experts=moe.num_experts,
-            experts_per_token=moe.experts_per_token,  # topk
-            rank=all2all_manager.rank,
-            world_size=all2all_manager.world_size,
-            # dp_size actually means tp_size, bug in pplx kernels
-            dp_size=all2all_manager.tp_group.world_size,
-            hidden_dim=moe.hidden_dim,
-            hidden_dim_bytes=hidden_dim_bytes,
-            hidden_dim_scale_bytes=hidden_scale_bytes,
-        )
-
-        num_dispatchers = (
-            all2all_manager.world_size // all2all_manager.tp_group.world_size
-        )
-
-        # Intranode pplx a2a takes a group name while internode does not.
-        if not all2all_manager.internode:
-            all_to_all_args["group_name"] = all2all_manager.cpu_group.group_name
-
-        handle = all2all_manager.get_handle(all_to_all_args)
-
-        prepare_finalize = PplxPrepareAndFinalize(
-            handle,
-            max_num_tokens=moe.max_num_tokens,
-            num_local_experts=moe.num_local_experts,
-            num_dispatchers=num_dispatchers,
-        )
-    elif moe.use_deepep_ht_kernels:
+    if moe.use_deepep_ht_kernels:
         assert moe.dp_size == all2all_manager.dp_world_size
 
-        all_to_all_args = dict()
+        all_to_all_args: dict[str, Any] = dict()
         handle = all2all_manager.get_handle(all_to_all_args)
         prepare_finalize = DeepEPHTPrepareAndFinalize(
             handle,
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 87e1e244b3af..33d69b57a934 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -939,10 +939,6 @@ def is_sequence_parallel(self) -> bool:
     def use_all2all_kernels(self):
         return self.dp_size > 1 and self.use_ep
 
-    @property
-    def use_pplx_kernels(self):
-        return self.use_all2all_kernels and self.all2all_backend == "pplx"
-
     @property
     def use_deepep_ht_kernels(self):
         return (
@@ -962,7 +958,7 @@ def use_fi_all2allv_kernels(self):
 
     @property
     def use_batched_activation_format(self):
-        return self.use_deepep_ll_kernels or self.use_pplx_kernels
+        return self.use_deepep_ll_kernels
 
     @property
     def use_naive_all2all_kernels(self):
@@ -1221,10 +1217,6 @@ def ep_rank(self):
     def use_ep(self):
         return self.moe_parallel_config.use_ep
 
-    @property
-    def use_pplx_kernels(self):
-        return self.moe_parallel_config.use_pplx_kernels
-
     @property
     def use_deepep_ht_kernels(self):
         return self.moe_parallel_config.use_deepep_ht_kernels
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index ae9430d29d9c..ac9ba56a6b70 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -166,7 +166,7 @@ def run_cutlass_moe_fp8(
         problem_sizes1 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
         problem_sizes2 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
 
-        ops.get_cutlass_pplx_moe_mm_data(
+        ops.get_cutlass_batched_moe_mm_data(
             expert_offsets,
             problem_sizes1,
             problem_sizes2,
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index fbd47f8c4236..24ae2d3c82c6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -493,7 +493,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     """
     A reference prepare/finalize class that reorganizes the tokens into
     expert batched format, i.e. E x max_num_tokens x K.  This is the format
-    that the PPLX dispatch/combine kernels use.
+    that the batched dispatch/combine kernels use.
     """
 
     def __init__(
@@ -648,7 +648,7 @@ def finalize(
 class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
     """
     A reference MoE expert class that operates on expert batched format,
-    i.e. E x max_num_tokens x K.  This is the format that the pplx
+    i.e. E x max_num_tokens x K.  This is the format that the batched
     dispatch/combine kernels use.
     """
 
@@ -880,7 +880,7 @@ def batched_moe_kernel_quantize_input(
 class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
     """
     A Triton based MoE expert class that operates on expert batched format,
-    i.e. E x max_num_tokens x K.  This is the format that the pplx
+    i.e. E x max_num_tokens x K.  This is the format that the batched
     dispatch/combine kernels use.
     """
 
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index c2c0e809d700..043b5ef2669b 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -1172,9 +1172,9 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
         # This happens when none of the tokens from the all2all reach this
         # EP rank. Also, note that this is only relevant for CUDAGraph
         # incompatible all2all kernels like the DeepEP high-throughput
-        # kernels. CUDAGraph compatible all2all kernels like the pplx
-        # kernels and the DeepEP low-latency kernels are always batched
-        # and can never run into the tensor.numel() == 0 case.
+        # kernels. CUDAGraph compatible all2all kernels like the DeepEP
+        # low-latency kernels are always batched and can never run into
+        # the tensor.numel() == 0 case.
         if M_full == 0:
             assert num_chunks == 0
             workspace13 = None
diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
index ee7db88ccbd0..b4f4b74ca2d5 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -143,10 +143,7 @@ def select_nvfp4_moe_backend(
     # NOTE(rob): this is kind of a hack. We need to peak into
     # the prepare-finalize selection to determine if we are using
     # the batched or standard expert format.
-    use_batched = (
-        config.moe_parallel_config.use_deepep_ll_kernels
-        or config.moe_parallel_config.use_pplx_kernels
-    )
+    use_batched = config.moe_parallel_config.use_deepep_ll_kernels
     activation_format = (
         mk.FusedMoEActivationFormat.BatchedExperts
         if use_batched
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
deleted file mode 100644
index 289ac0d1413d..000000000000
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Callable
-
-import pplx_kernels as pplx
-import torch
-
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceDelegate,
-)
-from vllm.model_executor.layers.fused_moe.utils import (
-    _validate_scale_shape,
-    moe_kernel_quantize_input,
-)
-from vllm.utils.math_utils import cdiv, round_up
-
-logger = init_logger(__name__)
-
-
-def pplx_hidden_dim_scale_bytes(
-    max_num_tokens: int,
-    hidden_dim: int,
-    in_dtype: torch.dtype,
-    quant_dtype: torch.dtype | str | None,
-    per_act_token_quant: bool,
-    block_shape: list[int] | None,
-):
-    # All pplx byte sizes must be 16-byte aligned.
-    align = 16
-
-    # For blocked per token: set to
-    #   cdiv(hidden_dim, block_size) * sizeof(float32)
-    # For per-token: set to 4 * sizeof(float32) (x4 for alignment)
-    if quant_dtype is not None:
-        assert isinstance(quant_dtype, torch.dtype)
-        assert quant_dtype.itemsize == 1
-        hidden_dim_bytes = hidden_dim * quant_dtype.itemsize
-        elem_size = torch.float32.itemsize
-
-        if per_act_token_quant:
-            # per-token (M x 1)
-            assert block_shape is None
-            hidden_scale_bytes = elem_size
-        elif block_shape is not None:
-            # per-group (M x K_tiles)
-            block_size = block_shape[1]
-            num_blocks = cdiv(hidden_dim, block_size)
-            hidden_scale_bytes = num_blocks * elem_size
-        else:
-            # per-tensor (1 x 1)
-            hidden_scale_bytes = elem_size
-    else:
-        hidden_dim_bytes = hidden_dim * in_dtype.itemsize
-        hidden_scale_bytes = 0
-
-    return (
-        round_up(hidden_dim_bytes, align),
-        round_up(hidden_scale_bytes, align),
-    )
-
-
-class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
-    """PPLX-based prepare and finalize for expert parallelism."""
-
-    def __init__(
-        self,
-        a2a: pplx.AllToAll,
-        max_num_tokens: int,
-        num_local_experts: int,
-        num_dispatchers: int,
-    ):
-        super().__init__()
-        assert max_num_tokens > 0
-        assert num_local_experts > 0
-        self.a2a = a2a
-        self.max_num_tokens = max_num_tokens
-        self.num_local_experts = num_local_experts
-        self.num_dispatchers_ = num_dispatchers
-
-    @property
-    def activation_format(self) -> mk.FusedMoEActivationFormat:
-        return mk.FusedMoEActivationFormat.BatchedExperts
-
-    def max_num_tokens_per_rank(self) -> int | None:
-        return self.max_num_tokens
-
-    def topk_indices_dtype(self) -> torch.dtype | None:
-        return torch.uint32
-
-    def num_dispatchers(self) -> int:
-        return self.num_dispatchers_
-
-    def output_is_reduced(self) -> bool:
-        return True
-
-    def supports_async(self) -> bool:
-        return True
-
-    def prepare_async(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: torch.Tensor | None,
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
-    ) -> tuple[Callable, mk.ReceiverType]:
-        if defer_input_quant:
-            raise NotImplementedError(
-                f"{self.__class__.__name__} does not support defer_input_quant=True. "
-                "Please select an MoE kernel that accepts quantized inputs."
-            )
-
-        num_tokens = a1.size(0)  # M
-        hidden_dim = a1.size(-1)  # K
-
-        assert topk_ids.size(0) == num_tokens
-        # expert_map should be None because with expert map, -1 id is used for
-        # non-local token; this causes error when casting ids to the
-        # topk_indices_dtype() int32
-        #
-        if expert_map is not None:
-            logger.warning_once(
-                "The PPLX backend does not support expert mapping. "
-                "The provided `expert_map` will be ignored."
-            )
-        expert_map = None  # noqa: F841
-
-        # Is this always going to be a1.device?
-        device = a1.device
-
-        if apply_router_weight_on_input:
-            topk = topk_ids.size(1)
-            # TODO: this only works for topK=1, will need to update for topK>1
-            assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1"
-            )
-            a1 = a1 * topk_weights.to(a1.dtype)
-
-        repeat_cols = 4
-        repeat_rows = 1 if quant_config.per_act_token_quant else a1.size(0)
-        # TODO(bnell): always pass quant_config.a1_scale?
-        a1q, a1q_scale = moe_kernel_quantize_input(
-            a1,
-            (None if quant_config.per_act_token_quant else quant_config.a1_scale),
-            quant_dtype=quant_config.quant_dtype,
-            per_act_token_quant=quant_config.per_act_token_quant,
-            block_shape=quant_config.block_shape,
-        )
-
-        _validate_scale_shape(
-            a1q, a1q_scale, quant_config.per_act_token_quant, quant_config.block_shape
-        )
-
-        orig_a_scale_block_shape: int | None = None
-
-        if a1q_scale is not None:
-            scalar_scales = a1q_scale.numel() == 1
-
-            # pplx requires 2-d scales even for scalar scales
-            if a1q_scale.dim() <= 1:
-                assert scalar_scales
-                a1q_scale = a1q_scale.view(1, 1)
-
-            orig_a_scale_block_shape = a1q_scale.shape[-1]
-
-            if not quant_config.is_block_quantized:
-                # TODO (bnell): use group_broadcast instead?
-                a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
-
-        assert a1q_scale is None or a1q_scale.ndim == 2, (
-            f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}"
-        )
-
-        expert_num_tokens = torch.empty(
-            self.num_local_experts,
-            dtype=torch.int32,
-            device=device,
-        )
-
-        expert_x = torch.empty(
-            (
-                self.num_local_experts,
-                self.max_num_tokens * self.num_dispatchers(),
-                hidden_dim,
-            ),
-            dtype=a1q.dtype,
-            device=device,
-        )
-
-        expert_x_scale: torch.Tensor | None = None
-        if a1q.dtype.itemsize == 1:
-            if quant_config.is_per_act_token:
-                # (M x 1) -> (E x M x K)
-                final_dim = expert_x.size(2)
-            elif quant_config.is_per_tensor:
-                # (1 x 1) -> (E x 1 x 1)
-                final_dim = 1
-            else:
-                # (M x K_tiles) -> (E x M x K_tiles)
-                assert quant_config.block_shape is not None
-                num_blocks = cdiv(expert_x.size(2), quant_config.block_shape[1])
-                final_dim = num_blocks
-
-            expert_x_scale_shape = (
-                self.num_local_experts,
-                expert_x.size(1),
-                round_up(final_dim, 4),  # round up for alignment
-            )
-
-            expert_x_scale = torch.empty(
-                expert_x_scale_shape,
-                dtype=torch.float32,
-                device=expert_x.device,
-            )
-
-        # This argument is optional, defaults to indices.size(0)
-        # There's not much point setting this unless it is != indices.size(0)
-        bound_m: torch.Tensor | None = None
-
-        self.a2a.dispatch(
-            out_expert_num_tokens=expert_num_tokens,
-            out_expert_x=expert_x,
-            out_expert_x_scale=expert_x_scale,
-            dp_x=a1q,
-            dp_x_scale=a1q_scale,
-            indices=topk_ids,
-            bound_m=bound_m,
-            do_send=True,
-            do_recv=False,
-        )
-
-        hook = lambda: self.a2a.dispatch(
-            out_expert_num_tokens=expert_num_tokens,
-            out_expert_x=expert_x,
-            out_expert_x_scale=expert_x_scale,
-            dp_x=a1q,
-            dp_x_scale=a1q_scale,
-            indices=topk_ids,
-            bound_m=bound_m,
-            do_send=False,
-            do_recv=True,
-        )
-
-        return (
-            hook,
-            lambda: self._receiver(
-                expert_num_tokens,
-                expert_x,
-                expert_x_scale,
-                orig_a_scale_block_shape,
-            ),
-        )
-
-    def _receiver(
-        self,
-        expert_num_tokens: torch.Tensor,
-        expert_x: torch.Tensor,
-        expert_x_scale: torch.Tensor | None,
-        orig_a_scale_block_shape: int | None,
-    ) -> mk.PrepareResultType:
-        if expert_x_scale is not None:
-            expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
-            assert expert_x_scale.ndim == 3
-
-        expert_tokens_meta = mk.ExpertTokensMetadata(
-            expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None
-        )
-
-        return expert_x, expert_x_scale, expert_tokens_meta, None, None
-
-    def prepare(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: torch.Tensor | None,
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
-    ) -> mk.PrepareResultType:
-        hook, receiver = self.prepare_async(
-            a1,
-            topk_weights,
-            topk_ids,
-            num_experts,
-            expert_map,
-            apply_router_weight_on_input,
-            quant_config,
-            defer_input_quant=defer_input_quant,
-        )
-        hook()
-        return receiver()
-
-    def finalize_async(
-        self,
-        output: torch.Tensor,
-        fused_expert_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-        weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> Callable:
-        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate), (
-            "Weight application and reduction happens in the combine kernel."
-        )
-
-        # This argument is optional
-        # There's not much point setting this unless it is != topk_ids.size(0)
-        bound_m: torch.Tensor | None = None
-
-        # TODO (bnell): fails in test_pplx_moe.py, figure out what's going on
-        # num_tokens = output.size(0)  # M
-        # assert topk_ids.size(0) == num_tokens, (
-        #    f"{topk_ids.size(0)} == {num_tokens}")
-        assert topk_ids.size() == topk_weights.size(), (
-            f"{topk_ids.size()} == {topk_weights.size()}"
-        )
-        assert output.size(0) <= self.max_num_tokens, (
-            f"{output.size(0)} <= {self.max_num_tokens}"
-        )
-        assert output.size(1) == fused_expert_output.size(-1)
-
-        # Set weights to 1 if we did them in dispatch. This is hacky.
-        if apply_router_weight_on_input:
-            topk_weights = torch.ones_like(topk_weights)
-
-        topk_ids_u32 = topk_ids.view(dtype=torch.uint32)
-
-        self.a2a.combine(
-            out_tokens=output,
-            indices=topk_ids_u32,
-            weights=topk_weights,
-            expert_y=fused_expert_output,
-            bound_m=bound_m,
-            do_send=True,
-            do_recv=False,
-        )
-
-        return lambda: self.a2a.combine(
-            out_tokens=output,
-            indices=topk_ids_u32,
-            weights=topk_weights,
-            expert_y=fused_expert_output,
-            bound_m=bound_m,
-            do_send=False,
-            do_recv=True,
-        )
-
-    def finalize(
-        self,
-        output: torch.Tensor,
-        fused_expert_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-        weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
-        receiver = self.finalize_async(
-            output,
-            fused_expert_output,
-            topk_weights,
-            topk_ids,
-            apply_router_weight_on_input,
-            weight_and_reduce_impl,
-        )
-        receiver()
diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
index 7e25c9687402..9c2adf7996ec 100644
--- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
@@ -216,8 +216,7 @@ def __init__(
     @property
     def use_dp_chunking(self) -> bool:
         return (
-            self.moe_config.moe_parallel_config.use_pplx_kernels
-            or self.moe_config.moe_parallel_config.use_deepep_ll_kernels
+            self.moe_config.moe_parallel_config.use_deepep_ll_kernels
             or self.moe_config.moe_parallel_config.use_mori_kernels
             or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels
         ) and envs.VLLM_ENABLE_MOE_DP_CHUNK
diff --git a/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py b/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
index 99d4038ec381..d7b50aea2ad6 100644
--- a/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
+++ b/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
@@ -14,10 +14,11 @@ class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
     implementation does not perform weight application and reduction
     but cannot address the needs of all the compatible PrepareAndFinalize
     implementations.
-    For example, BatchedTritonExperts is compatible with both
-    PplxPrepareAndFinalize and BatchedPrepareAndFinalize. PplxPrepareAndFinalize
-    does the weight-application + reduction as part of the pplx combine kernel.
-    But the BatchedPrepareAndFinalize needs an implementation. To facilitate
+    For example, BatchedTritonExperts is compatible with both batched
+    PrepareAndFinalize implementations like DeepEPLLPrepareAndFinalize and
+    BatchedPrepareAndFinalize. Some PrepareAndFinalize implementations do
+    the weight-application + reduction as part of the combine kernel, while
+    BatchedPrepareAndFinalize needs an explicit implementation. To facilitate
     this case, the BatchedTritonExperts could use TopKWeightAndReduceDelegate
     so the PrepareAndFinalize implementations could choose how to
     weight + reduce.
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index d81f0f80d2e7..9318bedfff17 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -798,7 +798,7 @@ def _interleave_mxfp4_cutlass_sm90(w):
             # batched activation format. As self.fused_experts is not
             # initialized at this point, we resort to checking the MoE config
             # directly.
-            is_batched_moe = self.moe.use_pplx_kernels or self.moe.use_deepep_ll_kernels
+            is_batched_moe = self.moe.use_deepep_ll_kernels
             if is_batched_moe:
                 num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8
             else:
diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py
index 4739120d4741..91e724012d4e 100644
--- a/vllm/utils/import_utils.py
+++ b/vllm/utils/import_utils.py
@@ -402,11 +402,6 @@ def _has_module(module_name: str) -> bool:
     return importlib.util.find_spec(module_name) is not None
 
 
-def has_pplx() -> bool:
-    """Whether the optional `pplx_kernels` package is available."""
-    return _has_module("pplx_kernels")
-
-
 def has_deep_ep() -> bool:
     """Whether the optional `deep_ep` package is available."""
     return _has_module("deep_ep")

From 31fb6f43dac735369851c1d908d3d6ed5d6dc1c2 Mon Sep 17 00:00:00 2001
From: pkousha <43781676+pkousha@users.noreply.github.com>
Date: Thu, 26 Feb 2026 14:35:58 -0800
Subject: [PATCH 048/154] [Kernel][perf] optimize NCCL symm_mem vs custom_AR
 selection thresholds (#33839)

Signed-off-by: <>
Signed-off-by: pkousha <43781676+pkousha@users.noreply.github.com>
Co-authored-by: Pouya Kousha <pkousha@login-eos01.eos.clusters.nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 .../device_communicators/all_reduce_utils.py  | 49 ++++++++++++++++---
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index ff2d7436b270..3c347ef756d4 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -27,6 +27,7 @@
 
 logger = init_logger(__name__)
 
+KiB = 1024
 MiB = 1024 * 1024
 # Max size for each world size in case symmetric memory is available
 # For different SM architectures
@@ -60,17 +61,44 @@
     },
 }
 
+# NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks.
+# PyNCCL-symm outperforms custom_AR for small and large tensor sizes,
+# while custom_AR wins for mid-range sizes.
+#
+# Benchmark results (8 GPUs):
+#   2K - 16K:   PyNCCL-symm wins (1.35x - 1.48x faster)
+#   32K - 64K:  custom_AR wins
+#   128K - 1G:  PyNCCL-symm wins (1.12x - 6.14x faster)
+#
+# Benchmark results (4 GPUs):
+#   2K - 16K:   PyNCCL-symm wins (1.21x - 1.30x faster)
+#   32K - 256K: custom_AR wins (1.07x - 1.35x faster)
+#   512K - 1G:  PyNCCL-symm wins (1.10x - 2.32x faster)
+#
+# The config defines ranges where custom_AR is preferred (symm_mem disabled).
 NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = {
     "min_world_size": 4,
-    "thresholds": {
-        4: 2 * MiB,  # 2 MB
-        8: 1 * MiB,  # 1 MB
+    # Ranges where custom_AR outperforms NCCL symm_mem: (lower_bound, upper_bound)
+    # NCCL symm_mem will NOT be used for sizes in range: lower < size < upper
+    "custom_ar_preferred_ranges": {
+        4: (16 * KiB, 512 * KiB),  # custom_AR wins for 32K-256K
+        8: (16 * KiB, 128 * KiB),  # custom_AR wins for 32K-64K
     },
     "always_use_above_world_size": 8,  # Always use symm mem for world_size > 8
 }
 
 
 def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor) -> bool:
+    """
+    Determine if NCCL symmetric memory allreduce should be used.
+
+    Based on H100 and GB200 benchmarks, NCCL symm_mem is preferred for:
+    - Small tensors (≤16K): Lower latency than custom_AR
+    - Large tensors (≥128K for 8 GPUs, ≥512K for 4 GPUs): Better bandwidth
+
+    Custom_AR is preferred for mid-range sizes where its P2P approach
+    has lower overhead than the symm_mem copy-in/copy-out pattern.
+    """
     from vllm.distributed.device_communicators.pynccl_allocator import (
         is_symmetric_memory_enabled,
     )
@@ -80,11 +108,20 @@ def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor)
 
     if not is_symmetric_memory_enabled():
         return False
+
     if world_size < NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["min_world_size"]:
         return False
-    threshold = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["thresholds"].get(world_size)
-    if threshold is not None and input_tensor.nbytes >= threshold:
-        return True
+
+    tensor_size = input_tensor.nbytes
+    custom_ar_range = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["custom_ar_preferred_ranges"].get(
+        world_size
+    )
+
+    if custom_ar_range is not None:
+        lower_bound, upper_bound = custom_ar_range
+        # Use symm_mem for small sizes (≤ lower_bound) and large sizes (≥ upper_bound)
+        # Use custom_AR (not symm_mem) for mid-range sizes
+        return tensor_size <= lower_bound or tensor_size >= upper_bound
     return world_size > NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"]
 
 

From 01923eec7092fd5b718cb9b45eb6df152abe9296 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Thu, 26 Feb 2026 14:50:16 -0800
Subject: [PATCH 049/154] [ROCm][Quantization] GPT OSS Upstream MoE wmxfp4_afp8
 with static scales (#30357)

Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
---
 .../fused_moe/gpt_oss_triton_kernels_moe.py   | 137 +++++++++++-
 vllm/model_executor/layers/fused_moe/layer.py |  13 +-
 .../layers/quantization/quark/quark_moe.py    | 202 +++++++++++++++---
 3 files changed, 315 insertions(+), 37 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 5617156bf2fc..2fcb7f193785 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -6,6 +6,7 @@
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
@@ -178,7 +179,40 @@ def triton_kernel_moe_forward(
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
     expert_map: torch.Tensor | None = None,
+    unpadded_N_w1=None,
+    unpadded_K_w1=None,
+    unpadded_N_w2=None,
+    unpadded_K_w2=None,
 ) -> torch.Tensor:
+    if (
+        quant_config is not None
+        and quant_config.use_mxfp4_w4a8
+        and rocm_aiter_ops.is_enabled()
+    ):
+        from aiter.ops.triton.moe_routing.routing import routing as aiter_routing
+
+        routing_data, gather_idx, scatter_idx = aiter_routing(
+            gating_output, topk, sm_first=not renormalize
+        )
+        return triton_kernel_fused_mxfp4_w4a8_experts(
+            None,
+            hidden_states,
+            w1,
+            w2,
+            routing_data,
+            gather_idx,
+            scatter_idx,
+            activation=activation.value,
+            quant_config=quant_config,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            unpadded_N_w1=unpadded_N_w1,
+            unpadded_K_w1=unpadded_K_w1,
+            unpadded_N_w2=unpadded_N_w2,
+            unpadded_K_w2=unpadded_K_w2,
+        )
+
     if expert_map is not None:
         # With expert parallelism, legacy_routing produces routing data
         # using global expert IDs which don't correspond to local weight
@@ -210,6 +244,9 @@ def triton_kernel_moe_forward(
         effective_global_num_experts = global_num_experts
 
     output = torch.empty_like(hidden_states)
+    effective_quant_config = (
+        quant_config if quant_config is not None else FUSED_MOE_UNQUANTIZED_CONFIG
+    )
 
     return triton_kernel_fused_experts(
         output,
@@ -221,7 +258,7 @@ def triton_kernel_moe_forward(
         scatter_idx,
         topk=topk,
         activation=activation,
-        quant_config=quant_config,
+        quant_config=effective_quant_config,
         apply_router_weight_on_input=apply_router_weight_on_input,
         global_num_experts=effective_global_num_experts,
         expert_map=effective_expert_map,
@@ -252,8 +289,7 @@ def triton_kernel_fused_experts(
     assert activation == MoEActivation.SWIGLUOAI, (
         "Only SWIGLUOAI activation is supported"
     )
-    if quant_config is None:
-        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
+    assert quant_config is not None
 
     # type check, uint8 means mxfp4
     assert hidden_states.dtype == torch.bfloat16
@@ -330,6 +366,98 @@ def triton_kernel_fused_experts(
     return output_tensor
 
 
+# This is a triton implementation of the fused_experts function
+def triton_kernel_fused_mxfp4_w4a8_experts(
+    output_tensor: torch.Tensor,
+    hidden_states: torch.Tensor,
+    w1,  # Tensor or triton_kernels.Tensor
+    w2,  # Tensor or triton_kernels.Tensor
+    routing_data,  # RoutingData
+    gather_indx,  # GatherIndx
+    scatter_indx,  # ScatterIndx
+    activation: str = "silu",
+    quant_config: FusedMoEQuantConfig | None = None,
+    swiglu_alpha: float = 1.702,
+    swiglu_limit: float = 7.0,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    expert_map: torch.Tensor | None = None,
+    a1q_scale: torch.Tensor | None = None,
+    unpadded_N_w1=None,
+    unpadded_K_w1=None,
+    unpadded_N_w2=None,
+    unpadded_K_w2=None,
+) -> torch.Tensor:
+    assert quant_config is not None
+    # type check, uint8 means mxfp4
+    assert hidden_states.dtype == torch.bfloat16
+    assert quant_config.w1_bias is None or quant_config.w1_bias.dtype == torch.float32
+    assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
+
+    # Shape check, only check non-mxfp4
+    assert hidden_states.shape[-1] == w1.shape[-2]
+    assert w2.shape[-1] == w1.shape[1]
+
+    E, _, N = w1.shape
+
+    if global_num_experts == -1:
+        global_num_experts = E
+
+    gammas = routing_data.gate_scal if routing_data else None
+
+    from aiter.ops.triton.moe_op_gemm_a8w4 import moe_gemm_a8w4
+    from aiter.ops.triton.quant_moe import downcast_to_static_fp8
+
+    assert quant_config.w1_precision is not None, (
+        "w1_precision in quant config can't be None"
+    )
+    assert quant_config.w2_precision is not None, (
+        "w2_precision in quant config can't be None"
+    )
+
+    hidden_states = downcast_to_static_fp8(
+        hidden_states, quant_config.w1_precision.flex_ctx.lhs_data.scale
+    )
+
+    intermediate_cache1 = moe_gemm_a8w4(
+        hidden_states,
+        w1.storage.data,
+        None,
+        quant_config.w1_precision.weight_scale.storage.data,
+        quant_config.w1_precision.flex_ctx.lhs_data.scale,
+        quant_config.w2_precision.flex_ctx.lhs_data.scale,
+        quant_config.w1_bias,
+        routing_data,
+        gather_indx=gather_indx,
+        gammas=gammas if apply_router_weight_on_input else None,
+        swizzle_mx_scale="CDNA4_SCALE",
+        out_dtype=torch.float8_e4m3fn,
+        apply_swiglu=True,
+        alpha=swiglu_alpha,
+        limit=swiglu_limit,
+        unpadded_N=unpadded_N_w1,
+        unpadded_K=unpadded_K_w1,
+    )
+
+    intermediate_cache3 = moe_gemm_a8w4(
+        intermediate_cache1,
+        w2.storage.data,
+        None,
+        quant_config.w2_precision.weight_scale.storage.data,
+        quant_config.w2_precision.flex_ctx.lhs_data.scale,
+        None,
+        quant_config.w2_bias,
+        routing_data,
+        scatter_indx=scatter_indx,
+        gammas=None if apply_router_weight_on_input else gammas,
+        swizzle_mx_scale="CDNA4_SCALE",
+        unpadded_N=unpadded_N_w2,
+        unpadded_K=unpadded_K_w2,
+    )
+
+    return intermediate_cache3
+
+
 def make_routing_data(
     topk_ids: torch.Tensor,
     topk_weights: torch.Tensor,
@@ -520,6 +648,9 @@ def apply(
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
+        if self.quant_config is None:
+            self.quant_config: FusedMoEQuantConfig = FUSED_MOE_UNQUANTIZED_CONFIG
+
         if expert_map is not None:
             topk_ids = expert_map[topk_ids]
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 679b79ce971b..a7dee700415e 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -525,16 +525,18 @@ def __init__(
 
         # Round up hidden size before creating moe_config.
         # This way moe_config is created with the correct hidden_size from the start.
+        unpadded_hidden_size = hidden_size
+        self.model_type = (
+            self.vllm_config.model_config.hf_config.model_type
+            if self.vllm_config.model_config is not None
+            else None
+        )
         hidden_size = maybe_roundup_hidden_size(
             hidden_size=hidden_size,
             act_dtype=moe_in_dtype,
             moe_parallel_config=self.moe_parallel_config,
             is_lora_enabled=vllm_config.lora_config is not None,
-            model_type=(
-                self.vllm_config.model_config.hf_config.model_type
-                if self.vllm_config.model_config is not None
-                else None
-            ),
+            model_type=self.model_type,
             is_mxfp4_quant=(
                 quant_config is not None and quant_config.is_mxfp4_quant(prefix, self)
             ),
@@ -610,6 +612,7 @@ def _get_quant_method() -> FusedMoEMethodBase:
         moe_quant_params = {
             "num_experts": self.local_num_experts,
             "hidden_size": hidden_size,
+            "unpadded_hidden_size": unpadded_hidden_size,
             "intermediate_size_per_partition": self.intermediate_size_per_partition,
             "params_dtype": params_dtype,
             "weight_loader": self.weight_loader,
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 8394857cf9c2..b2abbce1aa1e 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -5,8 +5,8 @@
 
 import torch
 
-import vllm.envs as envs
 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
@@ -32,6 +32,7 @@
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_fp8_moe_layer_for_marlin,
 )
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import _swizzle_mxfp4
 from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
     OCP_MX_BLOCK_SIZE,
     OCP_MX_Scheme,
@@ -49,7 +50,11 @@
 
 logger = init_logger(__name__)
 
-__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod", "QuarkOCP_MX_MoEMethod"]
+__all__ = [
+    "QuarkMoEMethod",
+    "QuarkOCP_MX_MoEMethod",
+    "QuarkOCP_MX_MoEMethod_OSS",
+]
 
 
 class QuarkMoEMethod(FusedMoEMethodBase):
@@ -71,14 +76,30 @@ def get_moe_method(
                 "output_tensors and bias "
                 "quantized are not supported"
             )
+
         weight_config = layer_quant_config.get("weight")
         input_config = layer_quant_config.get("input_tensors")
+
         if quant_config._is_fp8_w4a8(weight_config, input_config):
             return QuarkW4A8Fp8MoEMethod(weight_config, input_config, module.moe_config)
         elif quant_config._is_fp8_w8a8(weight_config, input_config):
             return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config)
         elif quant_config._is_w_ocp_mx_a_x(weight_config, input_config):
-            return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config)
+            emulate = not current_platform.supports_mx() or not (
+                rocm_aiter_ops.is_fused_moe_enabled()
+            )
+            if (
+                input_config.get("dtype") == "fp8_e4m3"
+                and not input_config.get("is_dynamic")
+                and not emulate
+            ):
+                return QuarkOCP_MX_MoEMethod_OSS(
+                    weight_config, input_config, module.moe_config
+                )
+            else:
+                return QuarkOCP_MX_MoEMethod(
+                    weight_config, input_config, module.moe_config
+                )
         else:
             raise RuntimeError("Unsupported FusedMoe scheme")
 
@@ -706,13 +727,11 @@ def __init__(
             get_current_vllm_config().model_config.hf_config, "model_type", None
         )
 
-        self._emulate = (
+        self.emulate = (
             not current_platform.supports_mx()
             or not self.ocp_mx_scheme.startswith("w_mxfp4")
         ) and (self.mxfp4_backend is None or not self.use_rocm_aiter_moe)
 
-        self.emulate = True if self.model_type == "gpt_oss" else self._emulate
-
         if self.emulate:
             logger.warning_once(
                 f"The current mode (supports_mx={current_platform.supports_mx()}, "
@@ -753,6 +772,7 @@ def create_weights(
         )
 
         params_dtype = torch.uint8
+        self.intermediate_size_per_partition = intermediate_size_per_partition
         if self.model_type == "gpt_oss":
             if current_platform.is_rocm():
                 intermediate_size_per_partition_after_pad = round_up(
@@ -765,6 +785,10 @@ def create_weights(
         else:
             intermediate_size_per_partition_after_pad = intermediate_size_per_partition
 
+        self.unpadded_hidden_size = extra_weight_attrs.get(
+            "unpadded_hidden_size", hidden_size
+        )
+
         # WEIGHTS
         w13_weight = torch.nn.Parameter(
             torch.empty(
@@ -991,30 +1015,20 @@ def apply(
         shared_experts_input: torch.Tensor | None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if not self.emulate:
-            if (
-                self.model_type == "gpt_oss"
-                and self.mxfp4_backend == Mxfp4Backend.TRITON
-            ):
-                raise NotImplementedError(
-                    "Triton kernel implemented fused MoE for GPT_OSS model "
-                    "in Quark(MoE) format is not integrated or provided yet."
-                )
-
-            else:
-                from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-                    rocm_aiter_fused_experts,
-                )
+            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+                rocm_aiter_fused_experts,
+            )
 
-                return rocm_aiter_fused_experts(
-                    x,
-                    layer.w13_weight,
-                    layer.w2_weight,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    activation=layer.activation,
-                    quant_config=self.moe_quant_config,
-                    expert_map=layer.expert_map,
-                )
+            return rocm_aiter_fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=layer.activation,
+                quant_config=self.moe_quant_config,
+                expert_map=layer.expert_map,
+            )
         else:
             from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -1031,3 +1045,133 @@ def apply(
                 expert_map=layer.expert_map,
                 quant_config=self.moe_quant_config,
             )
+
+
+class QuarkOCP_MX_MoEMethod_OSS(QuarkOCP_MX_MoEMethod):
+    def __init__(
+        self,
+        weight_config: dict[str, Any],
+        input_config: dict[str, Any],
+        moe: FusedMoEConfig,
+    ):
+        super().__init__(weight_config, input_config, moe)
+
+    def process_weights_after_loading(self, layer):
+        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
+
+        w13_bias = layer.w13_bias.to(torch.float32)
+        w2_bias = layer.w2_bias.to(torch.float32)
+
+        layer.w13_bias = torch.nn.Parameter(w13_bias, requires_grad=False)
+        layer.w2_bias = torch.nn.Parameter(w2_bias, requires_grad=False)
+
+        # FIXME warp need to be adjusted based on batch size
+        # only apply to  batched mode
+        if self.moe.use_ep:
+            num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8
+        else:
+            num_warps = 8
+
+        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
+            layer.w13_weight, layer.w13_weight_scale, num_warps
+        )
+        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
+            layer.w2_weight, layer.w2_weight_scale, num_warps
+        )
+
+        self.w13_weight_triton_tensor = w13_weight
+        self.w2_weight_triton_tensor = w2_weight
+
+        # need to delete the original weights to save memory on single GPU
+        del layer.w13_weight
+        del layer.w2_weight
+        layer.w13_weight = None
+        layer.w2_weight = None
+        torch.cuda.empty_cache()
+
+        if self.static_input_scales:
+            if layer.w13_input_scale is None or layer.w2_input_scale is None:
+                raise ValueError(
+                    "QuantConfig has static quantization, but found "
+                    "activation scales are None."
+                )
+            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                layer.w2_input_scale
+            ):
+                logger.warning_once(
+                    "Found input_scales that are not equal for "
+                    "fp8 MoE layer. Using the maximum across experts "
+                    "for each layer."
+                )
+
+            layer.w13_input_scale = torch.nn.Parameter(
+                layer.w13_input_scale.max().to(torch.float32), requires_grad=False
+            )
+            layer.w2_input_scale = torch.nn.Parameter(
+                layer.w2_input_scale.max().to(torch.float32), requires_grad=False
+            )
+
+            from triton_kernels.numerics import InFlexData
+
+            lhs_data13 = InFlexData(scale=layer.w13_input_scale)
+            lhs_data2 = InFlexData(scale=layer.w2_input_scale)
+
+            self.w13_precision_config = PrecisionConfig(
+                weight_scale=w13_scale,
+                flex_ctx=FlexCtx(rhs_data=w13_flex, lhs_data=lhs_data13),
+            )
+
+            self.w2_precision_config = PrecisionConfig(
+                weight_scale=w2_scale,
+                flex_ctx=FlexCtx(rhs_data=w2_flex, lhs_data=lhs_data2),
+            )
+
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return mxfp4_w4a8_moe_quant_config(
+            w1_scale=self.w13_precision_config,
+            w2_scale=self.w2_precision_config,
+            a1_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            w1_bias=layer.w13_bias,
+            w2_bias=layer.w2_bias,
+            block_shape=None,
+        )
+
+    @property
+    def is_monolithic(self) -> bool:
+        return True
+
+    def apply_monolithic(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        expert_map: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        if layer.enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet."
+            )
+
+        from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (  # noqa: E501
+            triton_kernel_moe_forward,
+        )
+
+        return triton_kernel_moe_forward(
+            hidden_states=x,
+            w1=self.w13_weight_triton_tensor,
+            w2=self.w2_weight_triton_tensor,
+            gating_output=router_logits,
+            topk=layer.top_k,
+            renormalize=layer.renormalize,
+            global_num_experts=layer.global_num_experts,
+            expert_map=expert_map,
+            quant_config=self.moe_quant_config,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+            unpadded_N_w1=self.intermediate_size_per_partition * 2,
+            unpadded_K_w1=self.unpadded_hidden_size,
+            unpadded_N_w2=self.unpadded_hidden_size,
+            unpadded_K_w2=self.intermediate_size_per_partition,
+        )

From 6283021142bbf5ee324395dad5e80b8661400329 Mon Sep 17 00:00:00 2001
From: Pavani Majety <pmajety@nvidia.com>
Date: Thu, 26 Feb 2026 15:38:19 -0800
Subject: [PATCH 050/154] [Bugfix] Fix KV Scale loading for MLA Models (#35430)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
---
 vllm/model_executor/layers/quantization/modelopt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index c0cc35b28271..999bb6325040 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -12,7 +12,7 @@
 from vllm.model_executor.kernels.linear import (
     init_fp8_linear_kernel,
 )
-from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.attention import Attention, MLAAttention
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
@@ -183,7 +183,7 @@ def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> "QuantizeMethodBase | None":
         # handle kv-cache first so we can focus only on weight quantization thereafter
-        if isinstance(layer, Attention):
+        if isinstance(layer, (Attention, MLAAttention)):
             return self.KVCacheMethodCls(self)
 
         # handle exclusion

From 56a6371706bc32c9e2e996ec29911c087a547ac0 Mon Sep 17 00:00:00 2001
From: Andrii Skliar <andreyws96@gmail.com>
Date: Fri, 27 Feb 2026 01:31:43 +0100
Subject: [PATCH 051/154] [Update] Use FlashInfer fast_decode_plan directly
 instead of replication (#34687)

Signed-off-by: Andrii <askliar@nvidia.com>
Co-authored-by: Andrii <askliar@nvidia.com>
---
 tests/kernels/attention/test_flashinfer.py | 203 +++++++++++++++++++
 vllm/v1/attention/backends/flashinfer.py   | 214 ++++++++-------------
 2 files changed, 286 insertions(+), 131 deletions(-)

diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py
index 570bf7fc865a..9a0847697629 100644
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@@ -84,6 +84,209 @@ def ref_paged_attn(
     return torch.cat(outputs, dim=0)
 
 
+def _make_paged_kv_metadata(
+    kv_lens: list[int],
+    block_size: int,
+    num_blocks: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Build paged-KV metadata tensors for fast_plan_decode tests.
+
+    Returns:
+        kv_indptr          – CPU int32, shape [num_seqs + 1]
+        kv_indices         – CUDA int32, shape [total_blocks]
+        kv_last_page_lens  – CPU int32, shape [num_seqs]
+        block_tables       – CUDA int32, shape [num_seqs, max_blocks_per_seq]
+    """
+    num_seqs = len(kv_lens)
+    max_blocks = (max(kv_lens) + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_blocks), dtype=torch.int32, device="cuda"
+    )
+
+    indptr_list = [0]
+    indices_list: list[int] = []
+    last_lens_list: list[int] = []
+    for i, seq_len in enumerate(kv_lens):
+        n = (seq_len + block_size - 1) // block_size
+        indices_list.extend(block_tables[i, :n].cpu().tolist())
+        indptr_list.append(indptr_list[-1] + n)
+        last_lens_list.append(seq_len % block_size or block_size)
+
+    return (
+        torch.tensor(indptr_list, dtype=torch.int32, device="cpu"),
+        torch.tensor(indices_list, dtype=torch.int32, device="cuda"),
+        torch.tensor(last_lens_list, dtype=torch.int32, device="cpu"),
+        block_tables,
+    )
+
+
+def _make_cg_decode_wrapper(
+    num_seqs: int,
+    kv_indices_buffer: torch.Tensor,
+    workspace_buffer: torch.Tensor,
+    use_tensor_cores: bool = True,
+) -> "flashinfer.BatchDecodeWithPagedKVCacheWrapper":
+    """Create a cudagraph-enabled BatchDecodeWithPagedKVCacheWrapper.
+
+    *kv_indices_buffer* is shared with the caller so that fast_plan_decode
+    can avoid the device-to-device index copy on subsequent (cudagraph) calls.
+    """
+    return flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer,
+        "NHD",
+        use_cuda_graph=True,
+        paged_kv_indptr_buffer=torch.zeros(
+            num_seqs + 1, dtype=torch.int32, device="cuda"
+        ),
+        paged_kv_indices_buffer=kv_indices_buffer,
+        paged_kv_last_page_len_buffer=torch.zeros(
+            num_seqs, dtype=torch.int32, device="cuda"
+        ),
+        use_tensor_cores=use_tensor_cores,
+    )
+
+
+def test_fast_decode_plan_importable() -> None:
+    """fast_decode_plan must be importable from flashinfer.decode.
+
+    This is a forward-compatibility smoke test: if FlashInfer reorganises its
+    public API the import will fail before any other test does.
+    """
+    from flashinfer.decode import fast_decode_plan  # noqa: F401
+
+    assert callable(fast_decode_plan)
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode
+def test_fast_plan_decode_warmup_uses_full_plan(dtype: torch.dtype) -> None:
+    """On the first call fast_plan_decode must route through self.plan() and
+    flip vllm_first_call to False on the wrapper object."""
+    from unittest.mock import patch
+
+    from vllm.v1.attention.backends.flashinfer import fast_plan_decode
+
+    torch.set_default_device("cuda")
+    set_random_seed(0)
+
+    kv_lens = [128, 64]
+    block_size = 16
+    num_seqs = len(kv_lens)
+    num_query_heads, num_kv_heads = 8, 2
+    head_size = 128
+
+    kv_indptr, kv_indices, kv_last_page_lens, _ = _make_paged_kv_metadata(
+        kv_lens, block_size, NUM_BLOCKS
+    )
+
+    workspace = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = _make_cg_decode_wrapper(num_seqs, kv_indices.clone(), workspace)
+
+    assert getattr(wrapper, "vllm_first_call", True) is True
+
+    with patch.object(wrapper, "plan", wraps=wrapper.plan) as mock_plan:
+        fast_plan_decode(
+            wrapper,
+            indptr_cpu=kv_indptr,
+            indices=kv_indices,
+            last_page_len_cpu=kv_last_page_lens,
+            num_qo_heads=num_query_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_size,
+            page_size=block_size,
+            q_data_type=dtype,
+            kv_data_type=dtype,
+        )
+        mock_plan.assert_called_once()
+
+    assert wrapper.vllm_first_call is False, (
+        "vllm_first_call should be False after the first fast_plan_decode call"
+    )
+
+
+@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode
+def test_fast_plan_decode_matches_full_plan(
+    kv_lens: list[int],
+    num_heads: tuple[int, int],
+    head_size: int,
+    block_size: int,
+    dtype: torch.dtype,
+) -> None:
+    """fast_plan_decode's cudagraph path (delegating to FlashInfer's
+    fast_decode_plan) must produce attention output numerically identical to
+    a standard plan() call.
+
+    Both the warmup call (self.plan) and the subsequent fast call
+    (fast_decode_plan) are verified against the same reference.
+    """
+    from vllm.v1.attention.backends.flashinfer import fast_plan_decode
+
+    torch.set_default_device("cuda")
+    set_random_seed(0)
+    num_seqs = len(kv_lens)
+    num_query_heads, num_kv_heads = num_heads
+
+    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
+    key_value_cache = torch.randn(
+        NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+
+    kv_indptr, kv_indices, kv_last_page_lens, _ = _make_paged_kv_metadata(
+        kv_lens, block_size, NUM_BLOCKS
+    )
+
+    # Reference output via the standard plan()
+    workspace_ref = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    ref_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_ref, "NHD", use_tensor_cores=True
+    )
+    ref_wrapper.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+        "NONE",
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
+    ref_output = ref_wrapper.run(query, key_value_cache)
+
+    # CUDAGraph wrapper exercised through fast_plan_decode
+    kv_indices_buf = kv_indices.clone()
+    workspace_cg = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    cg_wrapper = _make_cg_decode_wrapper(num_seqs, kv_indices_buf, workspace_cg)
+
+    plan_kwargs: dict = dict(
+        indptr_cpu=kv_indptr,
+        indices=kv_indices_buf,
+        last_page_len_cpu=kv_last_page_lens,
+        num_qo_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        page_size=block_size,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
+
+    # First call – warmup path (routes through self.plan)
+    fast_plan_decode(cg_wrapper, **plan_kwargs)
+    warmup_output = cg_wrapper.run(query, key_value_cache)
+    torch.testing.assert_close(warmup_output, ref_output, atol=1e-2, rtol=1e-2)
+
+    # Second call – fast path (routes through fast_decode_plan from FlashInfer)
+    fast_plan_decode(cg_wrapper, **plan_kwargs)
+    fast_output = cg_wrapper.run(query, key_value_cache)
+    torch.testing.assert_close(fast_output, ref_output, atol=1e-2, rtol=1e-2)
+
+
 @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 80297720d7b3..5300cf56c5e7 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -13,7 +13,7 @@
     BatchPrefillWithRaggedKVCacheWrapper,
     MultiLevelCascadeAttentionWrapper,
 )
-from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache
+from flashinfer.decode import fast_decode_plan, trtllm_batch_decode_with_kv_cache
 from flashinfer.prefill import trtllm_batch_context_with_kv_cache
 from flashinfer.utils import FP4Tensor
 from typing_extensions import override
@@ -199,14 +199,14 @@ def plan(
     ):
         """Plan the prefill operation with given parameters."""
         self._context.plan(
-            qo_indptr_cpu,
-            paged_kv_indptr_cpu,
-            paged_kv_indices,
-            paged_kv_last_page_len_cpu,
-            num_qo_heads * dcp_world_size,
-            num_kv_heads,
-            head_dim,
-            page_size,
+            qo_indptr=qo_indptr_cpu,
+            paged_kv_indptr=paged_kv_indptr_cpu,
+            paged_kv_indices=paged_kv_indices,
+            paged_kv_last_page_len=paged_kv_last_page_len_cpu,
+            num_qo_heads=num_qo_heads * dcp_world_size,
+            num_kv_heads=num_kv_heads,
+            head_dim_qk=head_dim,
+            page_size=page_size,
             causal=False,  # This is context run
             sm_scale=sm_scale,
             window_left=window_left,
@@ -818,6 +818,9 @@ def _compute_flashinfer_kv_metadata(
             page_size,
             paged_kv_last_page_len_np,
         )
+        self.paged_kv_last_page_len.gpu[:num_reqs].copy_(
+            self.paged_kv_last_page_len.cpu[:num_reqs], non_blocking=True
+        )
         return paged_kv_indices
 
     def build(
@@ -999,14 +1002,17 @@ def build(
 
             attn_metadata.cascade_wrapper = self._get_cascade_wrapper()
             attn_metadata.cascade_wrapper.plan(
-                [shared_qo_indptr_cpu, qo_indptr_cpu],
-                [shared_kv_page_indptr_cpu, paged_kv_indptr_cpu],
-                [shared_kv_page_indices_cpu, paged_kv_indices],
-                [shared_kv_last_page_len_cpu, paged_kv_last_page_len_cpu],
-                self.num_qo_heads,
-                self.num_kv_heads,
-                self.head_dim,
-                self.page_size,
+                qo_indptr_arr=[shared_qo_indptr_cpu, qo_indptr_cpu],
+                paged_kv_indptr_arr=[shared_kv_page_indptr_cpu, paged_kv_indptr_cpu],
+                paged_kv_indices_arr=[shared_kv_page_indices_cpu, paged_kv_indices],
+                paged_kv_last_page_len=[
+                    shared_kv_last_page_len_cpu,
+                    paged_kv_last_page_len_cpu,
+                ],
+                num_qo_heads=self.num_qo_heads,
+                num_kv_heads=self.num_kv_heads,
+                head_dim=self.head_dim,
+                page_size=self.page_size,
                 causal=True,
                 sm_scale=self.sm_scale,
                 window_left=self.window_left,
@@ -1084,14 +1090,14 @@ def build(
                         BatchPrefillWithPagedKVCacheWrapper,
                     )
                     prefill_wrapper.plan(
-                        qo_indptr_prefill_cpu,
-                        paged_kv_indptr_prefill_cpu,
-                        paged_kv_indices,
-                        paged_kv_last_page_len_prefill_cpu,
-                        self.num_qo_heads,
-                        self.num_kv_heads,
-                        self.head_dim,
-                        self.page_size,
+                        qo_indptr=qo_indptr_prefill_cpu,
+                        paged_kv_indptr=paged_kv_indptr_prefill_cpu,
+                        paged_kv_indices=paged_kv_indices,
+                        paged_kv_last_page_len=paged_kv_last_page_len_prefill_cpu,
+                        num_qo_heads=self.num_qo_heads,
+                        num_kv_heads=self.num_kv_heads,
+                        head_dim_qk=self.head_dim,
+                        page_size=self.page_size,
                         causal=True,
                         sm_scale=self.sm_scale,
                         window_left=self.window_left,
@@ -1132,14 +1138,15 @@ def build(
                 # in atten_metadata when using cudagraph.
                 fast_plan_decode(
                     decode_wrapper,
-                    self.paged_kv_indptr.cpu[: num_input_tokens + 1],
-                    paged_kv_indices,
-                    self.paged_kv_last_page_len.cpu[:num_input_tokens],
-                    seq_lens_cpu[:num_input_tokens],
-                    self.num_qo_heads * self.dcp_world_size,
-                    self.num_kv_heads,
-                    self.head_dim,
-                    self.page_size,
+                    indptr_cpu=self.paged_kv_indptr.cpu[: num_input_tokens + 1],
+                    indices=paged_kv_indices,
+                    last_page_len_cpu=self.paged_kv_last_page_len.cpu[
+                        :num_input_tokens
+                    ],
+                    num_qo_heads=self.num_qo_heads * self.dcp_world_size,
+                    num_kv_heads=self.num_kv_heads,
+                    head_dim=self.head_dim,
+                    page_size=self.page_size,
                     # Disable flashinfer's pos encoding and use vllm's rope.
                     pos_encoding_mode="NONE",
                     sm_scale=self.sm_scale,
@@ -1617,7 +1624,6 @@ def fast_plan_decode(
     indptr_cpu: torch.Tensor,
     indices: torch.Tensor,
     last_page_len_cpu: torch.Tensor,
-    seq_lens_cpu: torch.Tensor,
     num_qo_heads: int,
     num_kv_heads: int,
     head_dim: int,
@@ -1654,111 +1660,57 @@ def fast_plan_decode(
     # this warm up is to generate the _cached_module for the decode wrapper.
     if not self.is_cuda_graph_enabled or getattr(self, "vllm_first_call", True):
         self.plan(
-            indptr_cpu,
-            indices,
-            last_page_len_cpu,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            pos_encoding_mode,
-            window_left,
-            logits_soft_cap,
-            q_data_type,
-            kv_data_type,
-            o_data_type,
-            data_type,
-            sm_scale,
-            rope_scale,
-            rope_theta,
-            non_blocking,
-            None,  # block_tables
-            None,  # seq_lens
-            fixed_split_size,
-            disable_split_kv,
+            indptr=indptr_cpu,
+            indices=indices,
+            last_page_len=last_page_len_cpu,
+            num_qo_heads=num_qo_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            page_size=page_size,
+            pos_encoding_mode=pos_encoding_mode,
+            window_left=window_left,
+            logits_soft_cap=logits_soft_cap,
+            q_data_type=q_data_type,
+            kv_data_type=kv_data_type,
+            o_data_type=o_data_type,
+            data_type=data_type,
+            sm_scale=sm_scale,
+            rope_scale=rope_scale,
+            rope_theta=rope_theta,
+            non_blocking=non_blocking,
+            block_tables=None,
+            seq_lens=None,
+            fixed_split_size=fixed_split_size,
+            disable_split_kv=disable_split_kv,
         )
         self.vllm_first_call = False
         return
 
     assert self.is_cuda_graph_enabled, "Should be cudagraph only here"
 
-    batch_size = len(last_page_len_cpu)
-    if logits_soft_cap is None:
-        logits_soft_cap = 0.0
-
-    # Handle data types consistently
-    if data_type is not None:
-        if q_data_type is None:
-            q_data_type = data_type
-        if kv_data_type is None:
-            kv_data_type = data_type
-    elif q_data_type is None:
-        q_data_type = "float16"
-
-    if kv_data_type is None:
-        kv_data_type = q_data_type
-    q_data_type = (
-        getattr(torch, q_data_type) if isinstance(q_data_type, str) else q_data_type
-    )
-    kv_data_type = (
-        getattr(torch, kv_data_type) if isinstance(kv_data_type, str) else kv_data_type
+    fast_decode_plan(
+        self,
+        indptr=indptr_cpu,
+        indices=indices,
+        last_page_len=last_page_len_cpu,
+        num_qo_heads=num_qo_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_dim,
+        page_size=page_size,
+        pos_encoding_mode=pos_encoding_mode,
+        window_left=window_left,
+        logits_soft_cap=logits_soft_cap,
+        q_data_type=q_data_type,
+        kv_data_type=kv_data_type,
+        data_type=data_type,
+        sm_scale=sm_scale,
+        rope_scale=rope_scale,
+        rope_theta=rope_theta,
+        non_blocking=non_blocking,
+        fixed_split_size=fixed_split_size,
+        disable_split_kv=disable_split_kv,
     )
 
-    if batch_size != self._fixed_batch_size:
-        raise ValueError(
-            "The batch size should be fixed in cudagraph mode, the runtime "
-            "batch size {} mismatches the batch size set during "
-            "initialization {}".format(batch_size, self._fixed_batch_size)
-        )
-    if len(indices) > len(self._paged_kv_indices_buf):
-        raise ValueError(
-            "The size of indices should be less than or equal to the allocated buffer"
-        )
-
-    # host-to-device copy for the indptr buffer
-    self._paged_kv_indptr_buf.copy_(indptr_cpu, non_blocking=True)
-    # host-to-device copy for the last_page_len buffer
-    self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu, non_blocking=True)
-
-    qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
-
-    try:
-        # Make sure we pass exactly 19 arguments for tensor core version
-        args = [
-            self._float_workspace_buffer,
-            self._int_workspace_buffer,
-            self._pin_memory_int_workspace_buffer,
-            qo_indptr_host,
-            indptr_cpu,
-            seq_lens_cpu,
-            batch_size,  # total_num_rows
-            batch_size,
-            num_qo_heads,
-            num_kv_heads,
-            page_size,
-            self.is_cuda_graph_enabled,
-            head_dim,
-            head_dim,
-            False,  # causal
-            window_left,
-        ]
-        if self._backend == "fa2":
-            args.append(fixed_split_size)
-            args.append(disable_split_kv)
-            args.append(0)  # num_colocated_ctas
-        self._plan_info = self._cached_module.plan(
-            *args,
-        )
-    except Exception as e:
-        raise RuntimeError(f"Error in tensor core plan: {e}") from e
-
-    self._pos_encoding_mode = pos_encoding_mode
-    self._window_left = window_left
-    self._logits_soft_cap = logits_soft_cap
-    self._sm_scale = sm_scale
-    self._rope_scale = rope_scale
-    self._rope_theta = rope_theta
-
 
 @triton.jit
 def _copy_page_indices_kernel(

From 38c498b8e3aaec95049f384edfc56ca12cbe1839 Mon Sep 17 00:00:00 2001
From: roikoren755 <26850796+roikoren755@users.noreply.github.com>
Date: Fri, 27 Feb 2026 02:51:28 +0200
Subject: [PATCH 052/154] [Performance] Cublas Bf16 Gate with Fp32 Output
 (#35121)

Signed-off-by: Roi Koren <roik@nvidia.com>
---
 CMakeLists.txt                                |   3 +-
 csrc/moe/moe_ops.h                            |   4 +
 csrc/moe/router_gemm.cu                       |  52 ++++++++
 csrc/moe/torch_bindings.cpp                   |   4 +
 vllm/_custom_ops.py                           |  17 +++
 .../layers/fused_moe/__init__.py              |   2 +
 .../layers/fused_moe/router/gate_linear.py    | 117 ++++++++++++++++++
 vllm/model_executor/models/deepseek_v2.py     |  72 +----------
 vllm/model_executor/models/nemotron_h.py      |  15 +--
 9 files changed, 206 insertions(+), 80 deletions(-)
 create mode 100644 csrc/moe/router_gemm.cu
 create mode 100644 vllm/model_executor/layers/fused_moe/router/gate_linear.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39714b8461e4..479d6db1eebc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -971,7 +971,8 @@ set(VLLM_MOE_EXT_SRC
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC
     "csrc/moe/moe_wna16.cu"
-    "csrc/moe/grouped_topk_kernels.cu")
+    "csrc/moe/grouped_topk_kernels.cu"
+    "csrc/moe/router_gemm.cu")
 endif()
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index b71db3569447..d8d962887dab 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -58,6 +58,10 @@ void shuffle_rows(const torch::Tensor& input_tensor,
                   torch::Tensor& output_tensor);
 
 #ifndef USE_ROCM
+// cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
+torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
+                                    torch::Tensor const& weight);
+
 // DeepSeek V3 optimized router GEMM kernel for SM90+
 // Computes output = mat_a @ mat_b.T where:
 //   mat_a: [num_tokens, hidden_dim] in bf16
diff --git a/csrc/moe/router_gemm.cu b/csrc/moe/router_gemm.cu
new file mode 100644
index 000000000000..a939f8846ff1
--- /dev/null
+++ b/csrc/moe/router_gemm.cu
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+// bf16 x bf16 -> fp32 router GEMM via cuBLAS.
+// Uses CUBLAS_COMPUTE_32F so bf16 operands accumulate into fp32,
+// matching TRT-LLM's cuBLAS fallback behaviour in dsv3RouterGemmOp.
+
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cublas_v2.h>
+
+// cuBLAS column-major math for row-major PyTorch tensors:
+//   weight[N,K]_row  lda=K  -> cuBLAS sees (K,N) col-major; CUBLAS_OP_T ->
+//   (N,K) input[M,K]_row   ldb=K  -> cuBLAS sees (K,M) col-major; CUBLAS_OP_N
+//   -> (K,M) out[M,N]_row     ldc=N  -> cuBLAS sees (N,M) col-major (written as
+//   output^T)
+// cuBLAS: C(N,M) = weight(N,K) @ input(K,M)  =>  C^T = output[M,N]
+// params: m=N, n=M, k=K, lda=K (weight), ldb=K (input), ldc=N (output)
+
+torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
+                                    torch::Tensor const& weight) {
+  TORCH_CHECK(input.dtype() == torch::kBFloat16,
+              "router_gemm_bf16_fp32: input must be bfloat16");
+  TORCH_CHECK(weight.dtype() == torch::kBFloat16,
+              "router_gemm_bf16_fp32: weight must be bfloat16");
+  TORCH_CHECK(input.dim() == 2 && weight.dim() == 2,
+              "router_gemm_bf16_fp32: input and weight must be 2-D");
+  TORCH_CHECK(input.size(1) == weight.size(1),
+              "router_gemm_bf16_fp32: inner dimensions must match");
+
+  int64_t const M = input.size(0);
+  int64_t const N = weight.size(0);
+  int64_t const K = input.size(1);
+
+  auto out = torch::empty({M, N}, input.options().dtype(torch::kFloat32));
+
+  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+  TORCH_CUDABLAS_CHECK(
+      cublasSetStream(handle, at::cuda::getCurrentCUDAStream()));
+
+  float const alpha = 1.0f;
+  float const beta = 0.0f;
+
+  TORCH_CUDABLAS_CHECK(cublasGemmEx(
+      handle, CUBLAS_OP_T, CUBLAS_OP_N, static_cast<int>(N),
+      static_cast<int>(M), static_cast<int>(K), &alpha, weight.data_ptr(),
+      CUDA_R_16BF, static_cast<int>(K), input.data_ptr(), CUDA_R_16BF,
+      static_cast<int>(K), &beta, out.data_ptr(), CUDA_R_32F,
+      static_cast<int>(N), CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT));
+
+  return out;
+}
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 438599451452..7b627a6f8760 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -125,6 +125,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "Tensor)");
   m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 
+  // cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
+  m.def("router_gemm_bf16_fp32(Tensor input, Tensor weight) -> Tensor");
+  m.impl("router_gemm_bf16_fp32", torch::kCUDA, &router_gemm_bf16_fp32);
+
   // DeepSeek V3 optimized router GEMM for SM90+
   m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
   // conditionally compiled so impl registration is in source file
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 37cf43620340..69f080ae20d6 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2190,6 +2190,23 @@ def moe_wna16_gemm(
     )
 
 
+def router_gemm_bf16_fp32(input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
+    """bf16 x bf16 -> fp32 GEMM via cuBLAS. weight shape: (N, K)."""
+    return torch.ops._moe_C.router_gemm_bf16_fp32(input, weight)
+
+
+if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "router_gemm_bf16_fp32"):
+
+    @register_fake("_moe_C::router_gemm_bf16_fp32")
+    def router_gemm_bf16_fp32_fake(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+    ) -> torch.Tensor:
+        return torch.empty(
+            input.shape[0], weight.shape[0], dtype=torch.float32, device=input.device
+        )
+
+
 def dsv3_router_gemm(
     hidden_states: torch.Tensor,
     router_weight: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index c6cb31b629a0..be901bd24490 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -28,6 +28,7 @@
 from vllm.model_executor.layers.fused_moe.router.fused_moe_router import (
     FusedMoERouter,
 )
+from vllm.model_executor.layers.fused_moe.router.gate_linear import GateLinear
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
     UnquantizedFusedMoEMethod,
@@ -64,6 +65,7 @@ def get_config() -> dict[str, Any] | None:
     "FusedMoEPermuteExpertsUnpermute",
     "FusedMoEActivationFormat",
     "FusedMoEPrepareAndFinalize",
+    "GateLinear",
     "RoutingMethodType",
     "SharedFusedMoE",
     "ZeroExpertFusedMoE",
diff --git a/vllm/model_executor/layers/fused_moe/router/gate_linear.py b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
new file mode 100644
index 000000000000..77d8e756026d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.custom_op import PluggableLayer
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.platforms import current_platform
+
+
+@PluggableLayer.register("gate_linear")
+class GateLinear(ReplicatedLinear):
+    """MoE gate linear layer with three-tier GEMM dispatch:
+
+    1. DSV3 specialized kernel (SM90+, batch<=16, supported dims)
+    2. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 + fp32 out_dtype)
+    3. F.linear via ReplicatedLinear (ultimate fallback)
+
+    The ``out_dtype`` attribute is mutable and can be set after init
+    (e.g. when the required dtype depends on the expert quantization
+    method which is only known later).
+    """
+
+    # Dimensions supported by the DSV3 specialized kernel
+    DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
+    DSV3_SUPPORTED_HIDDEN_SIZES = [7168]
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+        out_dtype: torch.dtype | None = None,
+        params_dtype: torch.dtype | None = None,
+        force_fp32_compute: bool = False,
+        prefix: str = "",
+    ):
+        is_hopper_or_blackwell = current_platform.is_device_capability(
+            (9, 0)
+        ) or current_platform.is_device_capability_family(100)
+        can_use_specialized_kernels = (
+            current_platform.is_cuda() and is_hopper_or_blackwell and not bias
+        )
+
+        # If fp32 compute is required and no specialized kernel is available,
+        # store weights in fp32 so Tier 3 computes in fp32 natively.
+        if force_fp32_compute and not can_use_specialized_kernels:
+            params_dtype = torch.float32
+
+        super().__init__(
+            input_size,
+            output_size,
+            bias=bias,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=prefix,
+        )
+        self.out_dtype = out_dtype
+
+        # DSV3 specialized kernel eligibility (SM90+, exact dims)
+        self.allow_specialized_router_gemm = can_use_specialized_kernels
+        self.allow_dsv3_router_gemm = (
+            self.allow_specialized_router_gemm
+            and output_size in self.DSV3_SUPPORTED_NUM_EXPERTS
+            and input_size in self.DSV3_SUPPORTED_HIDDEN_SIZES
+        )
+
+        # cuBLAS bf16→fp32 eligibility
+        self.allow_cublas_router_gemm = (
+            self.allow_specialized_router_gemm
+            and self.weight.dtype == torch.bfloat16
+            and self.out_dtype == torch.float32
+        )
+
+    def set_out_dtype(self, out_dtype: torch.dtype) -> None:
+        """Set output dtype for the router logits after init.
+
+        Useful when the required dtype depends on the expert quantization
+        method which is only known after the gate is constructed.
+        """
+        if self.out_dtype is not None:
+            raise ValueError("out_dtype has already been set")
+        self.out_dtype = out_dtype
+
+        if (
+            not self.allow_cublas_router_gemm
+            and self.allow_specialized_router_gemm
+            and out_dtype == torch.float32
+        ):
+            self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16
+
+    def forward(
+        self, x: torch.Tensor
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
+        import vllm._custom_ops as ops
+
+        # Tier 1: DSV3 specialized kernel
+        if self.allow_dsv3_router_gemm and x.shape[0] <= 16:
+            output = ops.dsv3_router_gemm(
+                hidden_states=x,
+                router_weight=self.weight,
+                output_dtype=self.out_dtype,
+            )
+            return output, None
+
+        # Tier 2: cuBLAS bf16→fp32
+        if self.allow_cublas_router_gemm and x.dtype == torch.bfloat16:
+            output = ops.router_gemm_bf16_fp32(x, self.weight)
+            return output, None
+
+        # Tier 3: F.linear (ReplicatedLinear)
+        if self.out_dtype is not None and x.dtype != self.weight.dtype:
+            x = x.to(self.weight.dtype)
+        output, output_bias = super().forward(x)
+        if self.out_dtype is not None and output.dtype != self.out_dtype:
+            output = output.to(self.out_dtype)
+        return output, output_bias
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 768f4e20b34a..c3e1ddb7dbb9 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -47,7 +47,7 @@
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.fused_moe import GateLinear, SharedFusedMoE
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -221,73 +221,6 @@ def forward(self, x):
         return x
 
 
-class DeepSeekV2Gate(ReplicatedLinear):
-    def __init__(
-        self,
-        hidden_size: int,
-        n_experts: int,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        assert quant_config is None
-        super().__init__(
-            hidden_size,
-            n_experts,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate",
-        )
-
-        # Unquantized only, will be called "weight".
-        assert hasattr(self, "weight")
-        is_hopper_or_blackwell = current_platform.is_device_capability(
-            (9, 0)
-        ) or current_platform.is_device_capability_family(100)
-        SUPPORTED_NUM_EXPERTS = [256, 384]
-        SUPPORTED_HIDDEN_SIZES = [7168]
-
-        self.allow_dsv3_router_gemm = (
-            current_platform.is_cuda()
-            and is_hopper_or_blackwell
-            and n_experts in SUPPORTED_NUM_EXPERTS
-            and hidden_size in SUPPORTED_HIDDEN_SIZES
-        )
-
-        self._out_dtype: torch.dtype | None = None
-
-    def set_out_dtype(self, out_dtype: torch.dtype) -> None:
-        """
-        Set out dtype for the router logits. This is needed after
-        __init__, b/c we need to check if the trtllm kernel is
-        selected before we decide between bf16 and fp32.
-        """
-
-        if self._out_dtype is not None:
-            raise ValueError("out_dtype has already been set")
-        else:
-            self._out_dtype = out_dtype
-
-    @property
-    def out_dtype(self) -> torch.dtype:
-        if self._out_dtype is None:
-            raise ValueError("out_dtype has not been set yet")
-        return self._out_dtype
-
-    def forward(
-        self,
-        x: torch.Tensor,
-    ) -> tuple[torch.Tensor, None]:
-        """
-        Use specialized GEMM for low batch size for DSV3 and KIMI.
-        """
-        if self.allow_dsv3_router_gemm and x.shape[0] <= 16:
-            return ops.dsv3_router_gemm(
-                hidden_states=x, router_weight=self.weight, output_dtype=self.out_dtype
-            ), None
-        else:
-            return super().forward(x)
-
-
 class DeepseekV2MoE(nn.Module):
     def __init__(
         self,
@@ -316,10 +249,9 @@ def __init__(
                 "Only silu is supported for now."
             )
 
-        self.gate = DeepSeekV2Gate(
+        self.gate = GateLinear(
             config.hidden_size,
             config.n_routed_experts,
-            quant_config=None,
             prefix=f"{prefix}.gate",
         )
         if getattr(config, "topk_method", None) == "noaux_tc":
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 446b01fe393e..39ea0ea4837f 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -34,7 +34,7 @@
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
 from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import (
-    FusedMoE,
+    GateLinear,
     SharedFusedMoE,
     activation_without_mul,
 )
@@ -148,13 +148,11 @@ def __init__(
 
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
-        router_logits_dtype = torch.float32
-        self.gate = ReplicatedLinear(
+        self.gate = GateLinear(
             config.hidden_size,
             config.n_routed_experts,
-            bias=False,
-            params_dtype=router_logits_dtype,
-            quant_config=None,
+            out_dtype=torch.float32,
+            force_fp32_compute=True,
             prefix=f"{prefix}.gate",
         )
 
@@ -232,7 +230,6 @@ def __init__(
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
-            router_logits_dtype=router_logits_dtype,
             routed_input_transform=self.fc1_latent_proj,
         )
 
@@ -244,7 +241,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             hidden_states = sequence_parallel_chunk(hidden_states)
 
         # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+        router_logits, _ = self.gate(hidden_states)
 
         # SharedFusedMoE handles:
         #   - shared experts (with original hidden_states)
@@ -675,7 +672,7 @@ def _get_max_n_routed_experts(self) -> int:
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         if self.has_moe:
             # (param_name, weight_name, expert_id, shard_id)
-            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
                 # - FusedMoe.w1 (aka gate_proj) should be up_proj since that's
                 #   what the activation is applied to
                 # - FusedMoe.w3 (aka up_proj) should be ignored since we're

From 4fec53cfcb3135f223ce52d2c6b7bbf9ddf2be63 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 26 Feb 2026 19:58:03 -0500
Subject: [PATCH 053/154] [CI] Actually run
 tests/kernels/quantization/test_block_fp8.py in CI (#34274)

---
 .buildkite/test_areas/kernels.yaml           |  2 +-
 tests/kernels/quantization/test_block_fp8.py | 12 +++++-------
 vllm/utils/flashinfer.py                     |  2 +-
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
index c755c643651d..e1ecfeb8415f 100644
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -70,7 +70,7 @@ steps:
   - tests/kernels/moe/test_batched_deepgemm.py
   - tests/kernels/attention/test_deepgemm_attention.py
   commands:
-    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/quantization/test_block_fp8.py
     - pytest -v -s kernels/moe/test_deepgemm.py
     - pytest -v -s kernels/moe/test_batched_deepgemm.py
     - pytest -v -s kernels/attention/test_deepgemm_attention.py
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index 2c54267ef905..936516576ce1 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -37,13 +37,15 @@
 
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
+# Quantization test configs
 NUM_TOKENS = [7, 2050]
 D = [512, 4096, 5120, 13824]
 GROUP_SIZE = [64, 128, 512]
 COLUMN_MAJOR_SCALES = [True, False]
 TMA_ALIGNED_SCALES = [True, False]
-M = [1, 7, 8, 83, 84, 4096]
-N = [128, 512, 7168, 7748, 13824]
+# Matmul test configs
+M = [1, 7, 8, 83, 4096]
+N = [128, 512, 576, 7168, 13824]
 K = [256, 3884, 4096, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
@@ -162,8 +164,6 @@ def test_w8a8_block_fp8_cutlass_matmul():
     k_tiles = (K + block_k - 1) // block_k
 
     Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
-    # Hopper requires row-major format for scales
-    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(90) else Bs
 
     A_fp8, As = per_token_group_quant_fp8(
         A_fp32, block_size[1], column_major_scales=False
@@ -174,9 +174,7 @@ def test_w8a8_block_fp8_cutlass_matmul():
     )
 
     ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
-    out = cutlass_scaled_mm(
-        A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass, block_size, out_dtype
-    )
+    out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs, block_size, out_dtype)
 
     rel_diff = torch.mean(
         torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 333e66f68a87..8ed9e11187cd 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -734,7 +734,7 @@ def should_use_flashinfer_for_blockscale_fp8_gemm(
 
     # Verify DeepGEMM N/K dims requirements
     # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
-    # test inside kernels/quatization/test_block_fp8.py
+    # test inside kernels/quantization/test_block_fp8.py
     N_MULTIPLE = 64
     K_MULTIPLE = 128
 

From d43048ce0585e6c9178f212aa0b7aeed95eb48df Mon Sep 17 00:00:00 2001
From: daniel-salib <danielsalib@meta.com>
Date: Thu, 26 Feb 2026 17:49:06 -0800
Subject: [PATCH 054/154] =?UTF-8?q?[Bugfix]=20Emit=20reasoning=5Fpart=20ev?=
 =?UTF-8?q?ents=20in=20simple=20streaming=20path=20for=20Resp=E2=80=A6=20(?=
 =?UTF-8?q?#35184)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniel Salib <danielsalib@meta.com>
---
 .../openai/responses/test_simple.py           | 21 +++++
 vllm/entrypoints/openai/responses/serving.py  | 79 +++++++++++++------
 2 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/tests/entrypoints/openai/responses/test_simple.py b/tests/entrypoints/openai/responses/test_simple.py
index b67f0d34115a..bbf3cc80ad43 100644
--- a/tests/entrypoints/openai/responses/test_simple.py
+++ b/tests/entrypoints/openai/responses/test_simple.py
@@ -6,6 +6,7 @@
 from openai import OpenAI
 
 from ....utils import RemoteOpenAIServer
+from .conftest import validate_streaming_event_stack
 
 MODEL_NAME = "Qwen/Qwen3-8B"
 
@@ -219,3 +220,23 @@ async def test_extra_sampling_params(client: OpenAI, model_name: str):
     assert response.status in ["completed", "incomplete"]
     assert len(response.output) > 0
     assert response.output[0].content[0].text  # Has text output
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_streaming_types(
+    pairs_of_event_types: dict[str, str], client: OpenAI, model_name: str
+):
+    stream = await client.responses.create(
+        model=model_name,
+        input="tell me a story about a cat in 20 words",
+        reasoning={"effort": "low"},
+        tools=[],
+        stream=True,
+        background=False,
+    )
+    events = []
+    async for event in stream:
+        events.append(event)
+
+    validate_streaming_event_stack(events, pairs_of_event_types)
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index b9d526e25def..3cfb6fffc3ea 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -85,6 +85,8 @@
     ResponseCreatedEvent,
     ResponseInProgressEvent,
     ResponseInputOutputMessage,
+    ResponseReasoningPartAddedEvent,
+    ResponseReasoningPartDoneEvent,
     ResponsesRequest,
     ResponsesResponse,
     ResponseUsage,
@@ -1339,6 +1341,19 @@ async def _process_simple_streaming_events(
                                 ),
                             )
                         )
+                        yield _increment_sequence_number_and_return(
+                            ResponseReasoningPartAddedEvent(
+                                type="response.reasoning_part.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                                content_index=current_content_index,
+                                part=ResponseReasoningTextContent(
+                                    text="",
+                                    type="reasoning_text",
+                                ),
+                            )
+                        )
                     else:
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemAddedEvent(
@@ -1354,22 +1369,21 @@ async def _process_simple_streaming_events(
                                 ),
                             )
                         )
-                    yield _increment_sequence_number_and_return(
-                        ResponseContentPartAddedEvent(
-                            type="response.content_part.added",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            content_index=current_content_index,
-                            part=ResponseOutputText(
-                                type="output_text",
-                                text="",
-                                annotations=[],
-                                logprobs=[],
-                            ),
+                        yield _increment_sequence_number_and_return(
+                            ResponseContentPartAddedEvent(
+                                type="response.content_part.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                                content_index=current_content_index,
+                                part=ResponseOutputText(
+                                    type="output_text",
+                                    text="",
+                                    annotations=[],
+                                    logprobs=[],
+                                ),
+                            )
                         )
-                    )
-                    current_content_index += 1
                     first_delta_sent = True
                 # todo(kebe7jun) tool call support
 
@@ -1397,6 +1411,19 @@ async def _process_simple_streaming_events(
                             text=reason_content,
                         )
                     )
+                    yield _increment_sequence_number_and_return(
+                        ResponseReasoningPartDoneEvent(
+                            type="response.reasoning_part.done",
+                            sequence_number=-1,
+                            item_id=current_item_id,
+                            output_index=current_output_index,
+                            content_index=current_content_index,
+                            part=ResponseReasoningTextContent(
+                                text=reason_content,
+                                type="reasoning_text",
+                            ),
+                        )
+                    )
                     current_content_index = 0
                     reasoning_item = ResponseReasoningItem(
                         type="reasoning",
@@ -1418,6 +1445,8 @@ async def _process_simple_streaming_events(
                             item=reasoning_item,
                         )
                     )
+                    current_output_index += 1
+                    current_item_id = str(uuid.uuid4())
                     yield _increment_sequence_number_and_return(
                         ResponseOutputItemAddedEvent(
                             type="response.output_item.added",
@@ -1432,8 +1461,6 @@ async def _process_simple_streaming_events(
                             ),
                         )
                     )
-                    current_output_index += 1
-                    current_item_id = str(uuid.uuid4())
                     yield _increment_sequence_number_and_return(
                         ResponseContentPartAddedEvent(
                             type="response.content_part.added",
@@ -1449,7 +1476,6 @@ async def _process_simple_streaming_events(
                             ),
                         )
                     )
-                    current_content_index += 1
                     # reset previous delta messages
                     previous_delta_messages = []
 
@@ -1485,7 +1511,6 @@ async def _process_simple_streaming_events(
                             ),
                         )
                     )
-                current_content_index += 1
 
                 previous_delta_messages.append(delta_message)
         if previous_delta_messages:
@@ -1505,7 +1530,19 @@ async def _process_simple_streaming_events(
                         text=reason_content,
                     )
                 )
-                current_content_index += 1
+                yield _increment_sequence_number_and_return(
+                    ResponseReasoningPartDoneEvent(
+                        type="response.reasoning_part.done",
+                        sequence_number=-1,
+                        item_id=current_item_id,
+                        output_index=current_output_index,
+                        content_index=current_content_index,
+                        part=ResponseReasoningTextContent(
+                            text=reason_content,
+                            type="reasoning_text",
+                        ),
+                    )
+                )
                 reasoning_item = ResponseReasoningItem(
                     type="reasoning",
                     content=[
@@ -1543,7 +1580,6 @@ async def _process_simple_streaming_events(
                         item_id=current_item_id,
                     )
                 )
-                current_content_index += 1
                 part = ResponseOutputText(
                     text=final_content,
                     type="output_text",
@@ -1559,7 +1595,6 @@ async def _process_simple_streaming_events(
                         part=part,
                     )
                 )
-                current_content_index += 1
                 item = ResponseOutputMessage(
                     type="message",
                     role="assistant",

From c29ee9c32647cc6cc3c51a6bc070267d48b0bcc4 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Thu, 26 Feb 2026 18:54:11 -0800
Subject: [PATCH 055/154] [compile] Invalidate cache for cpu flags (#35119)

Signed-off-by: angelayi <yiangela7@gmail.com>
---
 vllm/envs.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index d560cfc7753c..07d9f81eac99 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1752,10 +1752,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_ENABLE_V1_MULTIPROCESSING",
         "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",
         "VLLM_CPU_KVCACHE_SPACE",
-        "VLLM_CPU_OMP_THREADS_BIND",
-        "VLLM_CPU_NUM_OF_RESERVED_CPU",
         "VLLM_CPU_MOE_PREPACK",
-        "VLLM_CPU_SGL_KERNEL",
         "VLLM_TEST_FORCE_LOAD_FORMAT",
         "VLLM_ENABLE_CUDA_COMPATIBILITY",
         "VLLM_CUDA_COMPATIBILITY_PATH",

From 06be53563ba24b44d788d43ef1afc99a5aef98f4 Mon Sep 17 00:00:00 2001
From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com>
Date: Thu, 26 Feb 2026 19:18:52 -0800
Subject: [PATCH 056/154] [Core]Extract is_last_rank in Ray for tpu to override
 (#33012)

Signed-off-by: Chenyaaang <chenyangli@google.com>
---
 vllm/v1/executor/ray_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 21403e1c0e5f..67c5a58f7ea9 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -108,7 +108,7 @@ def execute_model_ray(
 
             if isinstance(output, AsyncModelRunnerOutput):
                 output = output.get_output()
-            if not get_pp_group().is_last_rank:
+            if not self._is_last_rank():
                 # Case where there are no scheduled requests
                 # but may still be finished requests.
                 assert not output or not output.req_ids
@@ -128,6 +128,9 @@ def override_env_vars(self, vars: dict[str, str]):
         def _is_intermediate_tensors(self, output) -> bool:
             return isinstance(output, IntermediateTensors)
 
+        def _is_last_rank(self) -> bool:
+            return get_pp_group().is_last_rank
+
     ray_import_err = None
 
 except ImportError as e:

From cabdaa761975179afcc90dc6965430bce033e098 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Fri, 27 Feb 2026 04:42:51 +0100
Subject: [PATCH 057/154] [Misc] Move
 `GPUModelRunner.prepare_kernel_block_sizes` to utils (#35400)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 tests/v1/worker/test_gpu_model_runner.py |   8 +-
 vllm/v1/worker/gpu_model_runner.py       | 121 +--------------------
 vllm/v1/worker/utils.py                  | 129 ++++++++++++++++++++++-
 3 files changed, 135 insertions(+), 123 deletions(-)

diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index cb38aa70d3f0..93e6822e6397 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -38,7 +38,7 @@
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
-from vllm.v1.worker.utils import AttentionGroup
+from vllm.v1.worker.utils import AttentionGroup, select_common_block_size
 
 BLOCK_SIZE = 16
 NUM_BLOCKS = 10
@@ -209,7 +209,7 @@ def test_select_common_block_size_prefers_manager_block_size():
         AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
     ]
 
-    selected_size = GPUModelRunner.select_common_block_size(128, attn_groups)
+    selected_size = select_common_block_size(128, attn_groups)
     assert selected_size == 128
 
 
@@ -221,7 +221,7 @@ def test_select_common_block_size_uses_largest_shared_int():
         AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
     ]
 
-    selected_size = GPUModelRunner.select_common_block_size(256, attn_groups)
+    selected_size = select_common_block_size(256, attn_groups)
     assert selected_size == 64
 
 
@@ -234,7 +234,7 @@ def test_select_common_block_size_no_valid_option():
     ]
 
     with pytest.raises(ValueError):
-        GPUModelRunner.select_common_block_size(48, attn_groups)
+        select_common_block_size(48, attn_groups)
 
 
 def test_update_states_new_request(model_runner, dist_init):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d82b83b8c496..a3e0adfae218 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -115,7 +115,6 @@
     AttentionMetadataBuilder,
     AttentionType,
     CommonAttentionMetadata,
-    MultipleOf,
 )
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadataBuilder
@@ -189,6 +188,7 @@
     AttentionGroup,
     add_kv_sharing_layers_to_kv_cache_groups,
     bind_kv_cache,
+    prepare_kernel_block_sizes,
     sanity_check_mm_encoder_outputs,
 )
 
@@ -5678,78 +5678,6 @@ def calculate_reorder_batch_threshold(self) -> None:
             return
         self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds)  # type: ignore[assignment]
 
-    @staticmethod
-    def select_common_block_size(
-        kv_manager_block_size: int, attn_groups: list[AttentionGroup]
-    ) -> int:
-        """
-        Select a block size that is supported by all backends and is a factor of
-        kv_manager_block_size.
-
-        If kv_manager_block_size is supported by all backends, return it directly.
-        Otherwise, return the max supported size.
-
-        Args:
-            kv_manager_block_size: Block size of KV cache
-            attn_groups: List of attention groups
-
-        Returns:
-            The selected block size
-
-        Raises:
-            ValueError: If no valid block size found
-        """
-
-        def block_size_is_supported(
-            backends: list[type[AttentionBackend]], block_size: int
-        ) -> bool:
-            """
-            Check if the block size is supported by all backends.
-            """
-            for backend in backends:
-                is_supported = False
-                for supported_size in backend.get_supported_kernel_block_sizes():
-                    if isinstance(supported_size, int):
-                        if block_size == supported_size:
-                            is_supported = True
-                    elif isinstance(supported_size, MultipleOf):
-                        if block_size % supported_size.base == 0:
-                            is_supported = True
-                    else:
-                        raise ValueError(f"Unknown supported size: {supported_size}")
-                if not is_supported:
-                    return False
-            return True
-
-        backends = [group.backend for group in attn_groups]
-
-        # Case 1: if the block_size of kv cache manager is supported by all backends,
-        # return it directly
-        if block_size_is_supported(backends, kv_manager_block_size):
-            return kv_manager_block_size
-
-        # Case 2: otherwise, the block_size must be an `int`-format supported size of
-        # at least one backend. Iterate over all `int`-format supported sizes in
-        # descending order and return the first one that is supported by all backends.
-        # Simple proof:
-        # If the supported size b is in MultipleOf(x_i) format for all attention
-        # backends i, and b a factor of kv_manager_block_size, then
-        # kv_manager_block_size also satisfies MultipleOf(x_i) for all i. We will
-        # return kv_manager_block_size in case 1.
-        all_int_supported_sizes = set(
-            supported_size
-            for backend in backends
-            for supported_size in backend.get_supported_kernel_block_sizes()
-            if isinstance(supported_size, int)
-        )
-
-        for supported_size in sorted(all_int_supported_sizes, reverse=True):
-            if kv_manager_block_size % supported_size != 0:
-                continue
-            if block_size_is_supported(backends, supported_size):
-                return supported_size
-        raise ValueError(f"No common block size for {kv_manager_block_size}. ")
-
     def may_reinitialize_input_batch(
         self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int]
     ) -> None:
@@ -5846,49 +5774,6 @@ def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]:
         for attn_groups in self.attn_groups:
             yield from attn_groups
 
-    def _prepare_kernel_block_sizes(self, kv_cache_config: KVCacheConfig) -> list[int]:
-        """
-        Generate kernel_block_sizes that matches each block_size.
-
-        For attention backends that support virtual block splitting,
-        use the supported block sizes from the backend.
-        For other backends (like Mamba), use the same block size (no splitting).
-
-        Args:
-            kv_cache_config: The KV cache configuration.
-
-        Returns:
-            list[int]: List of kernel block sizes for each cache group.
-        """
-        kernel_block_sizes = []
-        for kv_cache_gid, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
-            kv_cache_spec = kv_cache_group.kv_cache_spec
-            if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
-                # All layers in the UniformTypeKVCacheSpecs have the same type,
-                # Pick an arbitrary one to dispatch.
-                kv_cache_spec = next(iter(kv_cache_spec.kv_cache_specs.values()))
-            if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
-                continue
-            elif isinstance(kv_cache_spec, AttentionSpec):
-                # This is an attention backend that supports virtual
-                # block splitting. Get the supported block sizes from
-                # all backends in the group.
-                attn_groups = self.attn_groups[kv_cache_gid]
-                kv_manager_block_size = kv_cache_group.kv_cache_spec.block_size
-                selected_kernel_size = self.select_common_block_size(
-                    kv_manager_block_size, attn_groups
-                )
-                kernel_block_sizes.append(selected_kernel_size)
-            elif isinstance(kv_cache_spec, MambaSpec):
-                # This is likely Mamba or other non-attention cache,
-                # no splitting.
-                kernel_block_sizes.append(kv_cache_spec.block_size)
-            else:
-                raise NotImplementedError(
-                    f"unknown kv cache spec {kv_cache_group.kv_cache_spec}"
-                )
-        return kernel_block_sizes
-
     def _reshape_kv_cache_tensors(
         self,
         kv_cache_config: KVCacheConfig,
@@ -6120,7 +6005,9 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         # backends for that group only supports block_size 64, we will return
         # kernel_block_size 64 and split the 256-token-block to 4 blocks with 64
         # tokens each.
-        kernel_block_sizes = self._prepare_kernel_block_sizes(kv_cache_config)
+        kernel_block_sizes = prepare_kernel_block_sizes(
+            kv_cache_config, self.attn_groups
+        )
 
         # create metadata builders
         self.initialize_metadata_builders(kv_cache_config, kernel_block_sizes)
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index f13c75a7ae78..7280679800b4 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -13,8 +13,20 @@
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import MemorySnapshot, format_gib
-from vllm.v1.attention.backend import AttentionBackend, AttentionMetadataBuilder
-from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadataBuilder,
+    MultipleOf,
+)
+from vllm.v1.kv_cache_interface import (
+    AttentionSpec,
+    EncoderOnlyAttentionSpec,
+    KVCacheConfig,
+    KVCacheGroupSpec,
+    KVCacheSpec,
+    MambaSpec,
+    UniformTypeKVCacheSpecs,
+)
 
 logger = init_logger(__name__)
 
@@ -59,6 +71,119 @@ def get_metadata_builder(self, ubatch_id: int = 0) -> AttentionMetadataBuilder:
         return self.metadata_builders[ubatch_id]
 
 
+def select_common_block_size(
+    kv_manager_block_size: int, attn_groups: list[AttentionGroup]
+) -> int:
+    """
+    Select a block size that is supported by all backends and is a factor of
+    kv_manager_block_size.
+
+    If kv_manager_block_size is supported by all backends, return it directly.
+    Otherwise, return the max supported size.
+
+    Args:
+        kv_manager_block_size: Block size of KV cache.
+        attn_groups: List of attention groups.
+
+    Returns:
+        The selected block size.
+
+    Raises:
+        ValueError: If no valid block size found.
+    """
+
+    def block_size_is_supported(
+        backends: list[type[AttentionBackend]], block_size: int
+    ) -> bool:
+        """Check if the block size is supported by all backends."""
+        for backend in backends:
+            is_supported = False
+            for supported_size in backend.get_supported_kernel_block_sizes():
+                if isinstance(supported_size, int):
+                    if block_size == supported_size:
+                        is_supported = True
+                elif isinstance(supported_size, MultipleOf):
+                    if block_size % supported_size.base == 0:
+                        is_supported = True
+                else:
+                    raise ValueError(f"Unknown supported size: {supported_size}")
+            if not is_supported:
+                return False
+        return True
+
+    backends = [group.backend for group in attn_groups]
+
+    # Case 1: if the block_size of kv cache manager is supported by all backends,
+    # return it directly.
+    if block_size_is_supported(backends, kv_manager_block_size):
+        return kv_manager_block_size
+
+    # Case 2: otherwise, the block_size must be an `int`-format supported size of
+    # at least one backend. Iterate over all `int`-format supported sizes in
+    # descending order and return the first one that is supported by all backends.
+    # Simple proof:
+    # If the supported size b is in MultipleOf(x_i) format for all attention
+    # backends i, and b a factor of kv_manager_block_size, then
+    # kv_manager_block_size also satisfies MultipleOf(x_i) for all i. We will
+    # return kv_manager_block_size in case 1.
+    all_int_supported_sizes = set(
+        supported_size
+        for backend in backends
+        for supported_size in backend.get_supported_kernel_block_sizes()
+        if isinstance(supported_size, int)
+    )
+
+    for supported_size in sorted(all_int_supported_sizes, reverse=True):
+        if kv_manager_block_size % supported_size != 0:
+            continue
+        if block_size_is_supported(backends, supported_size):
+            return supported_size
+    raise ValueError(f"No common block size for {kv_manager_block_size}. ")
+
+
+def prepare_kernel_block_sizes(
+    kv_cache_config: KVCacheConfig, attn_groups: list[list[AttentionGroup]]
+) -> list[int]:
+    """
+    Generate kernel_block_sizes that matches each block_size.
+
+    For attention backends that support virtual block splitting,
+    use the supported block sizes from the backend.
+    For other backends (like Mamba), use the same block size (no splitting).
+
+    Args:
+        kv_cache_config: The KV cache configuration.
+        attn_groups: Attention groups indexed by KV cache group id.
+
+    Returns:
+        List of kernel block sizes for each cache group.
+    """
+    kernel_block_sizes = []
+    for kv_cache_gid, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
+        kv_cache_spec = kv_cache_group.kv_cache_spec
+        if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
+            # All layers in the UniformTypeKVCacheSpecs have the same type,
+            # pick an arbitrary one to dispatch.
+            kv_cache_spec = next(iter(kv_cache_spec.kv_cache_specs.values()))
+        if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
+            continue
+        if isinstance(kv_cache_spec, AttentionSpec):
+            # This is an attention backend that supports virtual block splitting.
+            kv_manager_block_size = kv_cache_group.kv_cache_spec.block_size
+            selected_kernel_size = select_common_block_size(
+                kv_manager_block_size, attn_groups[kv_cache_gid]
+            )
+            kernel_block_sizes.append(selected_kernel_size)
+        elif isinstance(kv_cache_spec, MambaSpec):
+            # This is likely Mamba or other non-attention cache, no splitting.
+            kernel_block_sizes.append(kv_cache_spec.block_size)
+        else:
+            raise NotImplementedError(
+                f"unknown kv cache spec {kv_cache_group.kv_cache_spec}"
+            )
+    return kernel_block_sizes
+
+
 def sanity_check_mm_encoder_outputs(
     mm_embeddings: MultiModalEmbeddings,
     expected_num_items: int,

From 1e5ad9b74f70f4690dff629598d414ee27116b85 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 27 Feb 2026 11:46:30 +0800
Subject: [PATCH 058/154] [Bugfix] Fix Qwen3NextForCausalLM
 packed_modules_mapping (#35413)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/qwen3_next.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 777d1d7bf147..c57265cc7944 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -412,6 +412,8 @@ def __init__(
             prefix=f"{prefix}.in_proj_qkvz",
         )
         # ba_proj doesn't support blockwise fp8 quantization.
+        # # in_proj_ba is defined as MergedColumnParallelLinear for
+        # compatibility with Qwen3_5.
         self.in_proj_ba = MergedColumnParallelLinear(
             input_size=self.hidden_size,
             output_sizes=[self.num_v_heads] * 2,
@@ -1326,6 +1328,8 @@ class Qwen3NextForCausalLM(
             "v_proj",
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj_qkvz": ["in_proj_qkvz"],
+        "in_proj_ba": ["in_proj_ba"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

From a532c838492aced7d4c9ac30f694368c57371050 Mon Sep 17 00:00:00 2001
From: gnovack <gnovack@amazon.com>
Date: Thu, 26 Feb 2026 19:50:43 -0800
Subject: [PATCH 059/154] use 'max_active_experts' for moe lora input size
 (#33197)

Signed-off-by: gnovack <gnovack@amazon.com>
---
 tests/lora/test_moe_lora_align_sum.py  | 2 ++
 vllm/lora/punica_wrapper/punica_gpu.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py
index 3a17f3eba6e8..bb46b4d86807 100644
--- a/tests/lora/test_moe_lora_align_sum.py
+++ b/tests/lora/test_moe_lora_align_sum.py
@@ -47,6 +47,8 @@ def test_moe_lora_align_block_size(
     # compute paddings
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
     max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+    if topk_ids.numel() < num_experts:
+        max_num_tokens_padded = topk_ids.numel() * block_size
     max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size)
 
     # init output tensors
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index b75d297ba5c4..5f2604892ce9 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -351,6 +351,8 @@ def moe_lora_align_block_size(
             max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
             if pad_sorted_ids:
                 max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+            if topk_ids.numel() < num_experts:
+                max_num_tokens_padded = topk_ids.numel() * block_size
             sorted_ids = torch.empty(
                 (max_loras * max_num_tokens_padded,),
                 dtype=torch.int32,

From 062b7896324df60d5c03f66201902122f6cb8901 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 26 Feb 2026 22:50:46 -0500
Subject: [PATCH 060/154] [Bug] Fix outdated links in source code (#35314)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .github/mergify.yml                           | 3 +--
 docs/design/metrics.md                        | 2 +-
 tools/profiler/print_layerwise_table.py       | 5 ++++-
 tools/profiler/visualize_layerwise_profile.py | 6 ++++--
 vllm/model_executor/models/config.py          | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 080767ca7218..9c53342d1737 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -259,8 +259,7 @@ pull_request_rules:
       - files=benchmarks/run_structured_output_benchmark.sh
       - files=docs/features/structured_outputs.md
       - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+      - files=examples/online_serving/structured_outputs/structured_outputs.py
       - files~=^tests/v1/structured_output/
       - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 37cc61d4626b..a977ce9b9bb2 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -656,7 +656,7 @@ vLLM has support for OpenTelemetry tracing:
 - Added by <https://github.com/vllm-project/vllm/pull/4687> and reinstated by <https://github.com/vllm-project/vllm/pull/20372>
 - Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces`
 - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
-- [User-facing docs](../examples/online_serving/opentelemetry.md)
+- [User-facing docs](../../examples/online_serving/opentelemetry/README.md)
 - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
 - [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)
 
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index d7a24a598593..06a8c58537b3 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -33,7 +33,10 @@ def get_entries(node, curr_depth=0):
         "--json-trace",
         type=str,
         required=True,
-        help="json trace file output by examples/offline_inference/profiling.py",
+        help=(
+            "JSON trace file generated by scripts that use "
+            "vllm.profiler.layerwise_profile"
+        ),
     )
     parser.add_argument(
         "--phase",
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index ed4bf0beb716..83b8b3a7520d 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -564,8 +564,10 @@ def make_plot_title_suffix(profile_json: dict) -> str:
         "--json-trace",
         type=str,
         required=True,
-        help="json trace file output by \
-                              examples/offline_inference/profiling.py",
+        help=(
+            "JSON trace file generated by scripts that use "
+            "vllm.profiler.layerwise_profile"
+        ),
     )
     parser.add_argument(
         "--output-directory", type=str, required=False, help="Directory to output plots"
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index ea0f118a0d2f..2ec219d40233 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -213,7 +213,7 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
                     "Nomic context extension is disabled. "
                     "Changing max_model_len from %s to %s. "
                     "To enable context extension, see: "
-                    "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html",
+                    "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.py",
                     max_model_len_before,
                     model_config.max_model_len,
                 )

From 1a8c71674e8bf522506bfe7ea904808df17ad661 Mon Sep 17 00:00:00 2001
From: Daniel Huang <pilotflyer824@gmail.com>
Date: Thu, 26 Feb 2026 19:50:56 -0800
Subject: [PATCH 061/154] [BugFix] Repo utils debug print patch (#35434)

Signed-off-by: Daniel Huang <daniel1.huang@intel.com>
---
 vllm/transformers_utils/repo_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/repo_utils.py b/vllm/transformers_utils/repo_utils.py
index 552e053b29db..e485b60412b5 100644
--- a/vllm/transformers_utils/repo_utils.py
+++ b/vllm/transformers_utils/repo_utils.py
@@ -285,7 +285,7 @@ def get_hf_file_to_dict(
             EntryNotFoundError,
             LocalEntryNotFoundError,
         ) as e:
-            logger.debug("File or repository not found in hf_hub_download", e)
+            logger.debug("File or repository not found in hf_hub_download:", exc_info=e)
             return None
         except HfHubHTTPError as e:
             logger.warning(

From 487e5c51f72739981d7af3d0d42386ae4fe7824f Mon Sep 17 00:00:00 2001
From: Jiangyun Zhu <riverclouds.zhu@qq.com>
Date: Fri, 27 Feb 2026 12:18:52 +0800
Subject: [PATCH 062/154] [Bugfix] disable allreduce_rms_fusion by default when
 pp size > 1 (#35424)

Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
---
 vllm/config/vllm.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 127c16ac769a..70f7821ab249 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
         # tp-dp combination broken:
         # https://github.com/vllm-project/vllm/issues/34458
         and cfg.parallel_config.data_parallel_size == 1
+        # tp-pp combination broken:
+        # https://github.com/vllm-project/vllm/issues/35426
+        and cfg.parallel_config.pipeline_parallel_size == 1
     )
 
 

From 516cf26698f07d2c92dd61bd5541ceb1376401bc Mon Sep 17 00:00:00 2001
From: zofia <110436990+zufangzhu@users.noreply.github.com>
Date: Fri, 27 Feb 2026 13:19:51 +0800
Subject: [PATCH 063/154] [Bug] correct out dtype of rms_norm_gated native path
 (#35369)

Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/model_executor/layers/layernorm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 17b90c970123..ff78f0886b46 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -577,7 +577,7 @@ def forward_native(
         if z is not None and self.norm_before_gate:
             out = out * F.silu(z)
 
-        return out
+        return out.to(x.dtype)
 
     def forward_cuda(
         self, x: torch.Tensor, z: torch.Tensor | None = None

From a572baff5e5fde4aa3fb92961de04eb043dc5cf4 Mon Sep 17 00:00:00 2001
From: Chengyi Nie <54555896+chengyinie@users.noreply.github.com>
Date: Thu, 26 Feb 2026 21:51:14 -0800
Subject: [PATCH 064/154] [Model Performance] Add Qwen3MoE tuned MoE configs
 for H200 (#35457)

Signed-off-by: Chengyi Nie <cnie@roblox.com>
Co-authored-by: Chengyi Nie <cnie@roblox.com>
---
 ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 147 ++++++++++++++++++
 .../E=128,N=96,device_name=NVIDIA_H200.json   | 147 ++++++++++++++++++
 2 files changed, 294 insertions(+)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200.json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000000..620fe9365aa7
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.6.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000000..fc7dda8a7844
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.6.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}

From 07bdabef03c78c9dc6beb549185930888ec6f2ce Mon Sep 17 00:00:00 2001
From: Wang Xingran <72983099+wangxingran222@users.noreply.github.com>
Date: Fri, 27 Feb 2026 15:06:08 +0800
Subject: [PATCH 065/154] [Bugfix] Use 'sum' reduction instead of 'avg' in
 Async TP reduce-scatter (#33088)

Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
Co-authored-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/compilation/passes/fusion/collective_fusion.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/compilation/passes/fusion/collective_fusion.py b/vllm/compilation/passes/fusion/collective_fusion.py
index 55a5a2e5df42..a9b64adcb3f1 100644
--- a/vllm/compilation/passes/fusion/collective_fusion.py
+++ b/vllm/compilation/passes/fusion/collective_fusion.py
@@ -53,7 +53,7 @@ def replacement(mul: torch.Tensor, mm_weight: torch.Tensor) -> torch.Tensor:
             gemm_rs = torch.ops.symm_mem.fused_matmul_reduce_scatter(
                 mul,
                 mm_weight,
-                "avg",
+                "sum",
                 scatter_dim=0,
                 group_name=self.tp.device_group.group_name,
             )
@@ -150,7 +150,7 @@ def replacement(
                 mat2,
                 scale_a,
                 scale_b,
-                "avg",
+                "sum",
                 scatter_dim,  # orig_scatter_dim
                 scatter_dim,  # scatter_dim_after_maybe_reshape
                 self.tp.device_group.group_name,
@@ -285,7 +285,7 @@ def replacement(
                 mat2,
                 scale_a,
                 scale_b,
-                "avg",
+                "sum",
                 scatter_dim,  # orig_scatter_dim
                 scatter_dim,  # scatter_dim_after_maybe_reshape
                 self.tp.device_group.group_name,

From b66a74649e40022c6b1180b98fbb1b6e4b4af74a Mon Sep 17 00:00:00 2001
From: Umut Polat <52835619+umut-polat@users.noreply.github.com>
Date: Fri, 27 Feb 2026 11:01:06 +0300
Subject: [PATCH 066/154] [Bugfix] Replace assert with ValueError for
 response_format validation in completions endpoint (#35456)

Signed-off-by: umut-polat <52835619+umut-polat@users.noreply.github.com>
---
 .../openai/test_completion_error.py           | 13 ++++++++
 .../entrypoints/openai/completion/protocol.py | 30 ++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py
index e48cc32e5400..1e7a3d0a8c39 100644
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@@ -221,6 +221,19 @@ async def mock_generate(*args, **kwargs):
     assert chunks[-1] == "data: [DONE]\n\n"
 
 
+def test_json_schema_response_format_missing_schema():
+    """When response_format type is 'json_schema' but the json_schema field
+    is not provided, request construction should raise a validation error
+    so the API returns 400 instead of 500."""
+    with pytest.raises(Exception, match="json_schema.*must be provided"):
+        CompletionRequest(
+            model=MODEL_NAME,
+            prompt="Test prompt",
+            max_tokens=10,
+            response_format={"type": "json_schema"},
+        )
+
+
 def test_negative_prompt_token_ids_nested():
     """Negative token IDs in prompt (nested list) should raise validation error."""
     with pytest.raises(Exception, match="greater than or equal to 0"):
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index 226dd6c1a2bb..02e6e0d036ac 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -259,7 +259,7 @@ def to_sampling_params(
                 structured_outputs_kwargs["json"] = json_schema.json_schema
             elif response_format.type == "structural_tag":
                 structural_tag = response_format
-                assert structural_tag is not None and isinstance(
+                assert isinstance(
                     structural_tag,
                     (
                         LegacyStructuralTagResponseFormat,
@@ -313,6 +313,34 @@ def to_sampling_params(
             skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
+    @model_validator(mode="before")
+    @classmethod
+    def validate_response_format(cls, data):
+        response_format = data.get("response_format")
+        if response_format is None:
+            return data
+
+        rf_type = (
+            response_format.get("type")
+            if isinstance(response_format, dict)
+            else getattr(response_format, "type", None)
+        )
+
+        if rf_type == "json_schema":
+            json_schema = (
+                response_format.get("json_schema")
+                if isinstance(response_format, dict)
+                else getattr(response_format, "json_schema", None)
+            )
+            if json_schema is None:
+                raise VLLMValidationError(
+                    "When response_format type is 'json_schema', the "
+                    "'json_schema' field must be provided.",
+                    parameter="response_format",
+                )
+
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_structured_outputs_count(cls, data):

From 9c3fe9936b929b5503d780bd4e8e3cd524de1c4e Mon Sep 17 00:00:00 2001
From: Max Hu <hyoung2991@gmail.com>
Date: Fri, 27 Feb 2026 20:20:23 +0800
Subject: [PATCH 067/154] Flashinfer cuDNN backend for Qwen3 VL ViT attention
 (#34580)

Signed-off-by: Max Hu <maxhu@nvidia.com>
Signed-off-by: Max Hu <hyoung2991@gmail.com>
Co-authored-by: Max Hu <maxhu@nvidia.com>
Co-authored-by: Shang Wang <shangw@nvidia.com>
---
 tests/kernels/attention/test_mha_attn.py      | 110 +++++++++++
 .../layers/attention/mm_encoder_attention.py  | 172 +++++++++++++++++-
 vllm/model_executor/models/qwen2_5_vl.py      |   3 +
 vllm/model_executor/models/qwen3_vl.py        |  52 ++++--
 vllm/platforms/cuda.py                        |   1 +
 vllm/v1/attention/ops/vit_attn_wrappers.py    |  88 +++++++++
 6 files changed, 405 insertions(+), 21 deletions(-)

diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index d76c57f9eb1a..bc99ed57637a 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -9,9 +9,12 @@
 import itertools
 from unittest.mock import patch
 
+import numpy as np
 import pytest
 import torch
 
+from vllm.config import get_current_vllm_config
+from vllm.config.multimodal import MultiModalConfig
 from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
@@ -224,3 +227,110 @@ def test_mha_attn_varlen_forward(
         ref_output.append(output_i)
     ref_output = torch.cat(ref_output, dim=1)
     torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("var_seq_len", VAR_SEQ_LENS)
+@pytest.mark.parametrize(
+    "dtype",
+    [torch.bfloat16, torch.half],
+)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_mha_attn_varlen_forward_flashinfer(
+    default_vllm_config,
+    var_seq_len: list[int],
+    dtype: torch.dtype,
+    device: str,
+):
+    """Test MMEncoderAttention varlen forward with FLASHINFER backend (head_size=72).
+
+    Exercises the path that uses --mm-encoder-attn-backend=FLASHINFER with
+    recomputed cu_seqlens, max_seqlen, and sequence_lengths as in qwen3_vl
+    vision encoder.
+    """
+    pytest.importorskip("flashinfer")
+
+    num_heads = 16
+    head_size = 72
+    set_random_seed(0)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    # Override vllm config so get_vit_attn_backend returns FLASHINFER (simulates
+    # --mm-encoder-attn-backend=FLASHINFER).
+    vllm_config = get_current_vllm_config()
+    old_model_config = getattr(vllm_config, "model_config", None)
+    minimal_model_config = type(
+        "MinimalModelConfig",
+        (),
+        {
+            "multimodal_config": MultiModalConfig(
+                mm_encoder_attn_backend=AttentionBackendEnum.FLASHINFER
+            ),
+        },
+    )()
+    vllm_config.model_config = minimal_model_config
+    try:
+        total_len = sum(var_seq_len)
+        # Stride of second dim = 3 * num_heads * head_size (same as qwen2_5_vl
+        # after qkv rearrange and unbind: qkv shape (b, s, 3, head, head_dim)).
+        qkv = torch.randn(1, total_len, 3, num_heads, head_size)
+        q, k, v = qkv.unbind(dim=2)
+
+        cu_seqlens_np = np.array(
+            [0] + list(itertools.accumulate(var_seq_len)), dtype=np.int32
+        )
+        hidden_size = num_heads * head_size
+        tp_size = 1
+
+        sequence_lengths_np = MMEncoderAttention.maybe_compute_sequence_lengths(
+            AttentionBackendEnum.FLASHINFER, cu_seqlens_np
+        )
+        sequence_lengths = torch.from_numpy(sequence_lengths_np).to(
+            device, dtype=torch.int32, non_blocking=True
+        )
+
+        max_seqlen_val = MMEncoderAttention.compute_max_seqlen(
+            AttentionBackendEnum.FLASHINFER, cu_seqlens_np
+        )
+        max_seqlen = torch.tensor(max_seqlen_val, device=device, dtype=torch.int32)
+
+        cu_seqlens_np = MMEncoderAttention.maybe_recompute_cu_seqlens(
+            AttentionBackendEnum.FLASHINFER,
+            cu_seqlens_np,
+            hidden_size,
+            tp_size,
+        )
+        cu_seqlens = torch.from_numpy(cu_seqlens_np).to(
+            device, dtype=torch.int32, non_blocking=True
+        )
+
+        scale = 1.0 / head_size**0.5
+        attn = MMEncoderAttention(
+            num_heads,
+            head_size,
+            scale=scale,
+            num_kv_heads=num_heads,
+        )
+        assert attn.attn_backend == AttentionBackendEnum.FLASHINFER
+
+        output = attn(
+            q,
+            k,
+            v,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+            sequence_lengths=sequence_lengths,
+        )
+
+        ref_output = []
+        for q_i, k_i, v_i in zip(
+            torch.split(q, var_seq_len, dim=1),
+            torch.split(k, var_seq_len, dim=1),
+            torch.split(v, var_seq_len, dim=1),
+        ):
+            output_i = ref_attention(q_i, k_i, v_i, scale=scale)
+            ref_output.append(output_i)
+        ref_output = torch.cat(ref_output, dim=1)
+        torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
+    finally:
+        vllm_config.model_config = old_model_config
diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py
index e59806abb04d..d89366bbdfe3 100644
--- a/vllm/model_executor/layers/attention/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -2,21 +2,93 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 
+import numpy as np
 import torch
 
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.models.vision import get_vit_attn_backend
+from vllm.utils.math_utils import round_up
 from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.ops.vit_attn_wrappers import (
     vit_flash_attn_wrapper,
+    vit_flashinfer_wrapper,
     vit_torch_sdpa_wrapper,
     vit_triton_attn_wrapper,
 )
 
 logger = init_logger(__name__)
 
+# Batch buckets for cuDNN graph caching.
+# Graphs use batch size and max sequence length as cache key.
+# This avoids creating a new graph for each unique set of
+# batch size and max sequence length at runtime.
+# From the cuDNN team's performance measurements, there
+# is no significant kernel performance difference between padding
+# to a smaller batch size/seq length and padding to larger
+# ones. The bucketing here is solely used to avoid memory
+# operation overhead, which won't be needed if we have CUDA
+# graph support in the future.
+# TODO: Remove buckets after issue #34763
+# (cuda graph support) is addressed.
+FLASHINFER_BATCH_BUCKETS = [8, 16, 32, 64]
+FLASHINFER_MAX_SEQLEN_BUCKETS = [
+    1 * 1024,
+    2 * 1024,
+    4 * 1024,
+    8 * 1024,
+    16 * 1024,
+    32 * 1024,
+    64 * 1024,
+    128 * 1024,
+]
+
+# Workspace buffer for FlashInfer CuDNN backend
+FLASHINFER_CUDNN_WORKSPACE_SIZE_BYTES = 128 * 1024 * 1024
+_flashinfer_workspace_buffer: torch.Tensor | None = None
+
+
+def _get_flashinfer_workspace_buffer() -> torch.Tensor:
+    global _flashinfer_workspace_buffer
+    if _flashinfer_workspace_buffer is None:
+        _flashinfer_workspace_buffer = torch.zeros(
+            FLASHINFER_CUDNN_WORKSPACE_SIZE_BYTES,
+            dtype=torch.uint8,
+            device="cuda",
+        )
+    return _flashinfer_workspace_buffer
+
+
+def add_padding_to_seqlens(
+    seq: np.ndarray,
+    batch_size: int,
+    padding_value: int,
+) -> np.ndarray:
+    batch_size_padded = next(
+        (b for b in FLASHINFER_BATCH_BUCKETS if b >= batch_size),
+        round_up(batch_size, FLASHINFER_BATCH_BUCKETS[0]),
+    )
+    if batch_size_padded == batch_size:
+        return seq
+    return np.concatenate(
+        [
+            seq,
+            np.full((batch_size_padded - batch_size,), padding_value, dtype=seq.dtype),
+        ]
+    )
+
+
+def bucket_flashinfer_max_seqlen(
+    real_max_seqlen: int,
+) -> int:
+    if real_max_seqlen <= 0:
+        return FLASHINFER_MAX_SEQLEN_BUCKETS[0]
+    return next(
+        (s for s in FLASHINFER_MAX_SEQLEN_BUCKETS if s >= real_max_seqlen),
+        round_up(real_max_seqlen, FLASHINFER_MAX_SEQLEN_BUCKETS[-1]),
+    )
+
 
 # --8<-- [start:mm_encoder_attn]
 @CustomOp.register("mm_encoder_attn")
@@ -24,6 +96,67 @@ class MMEncoderAttention(CustomOp):
     """Multi-headed attention without any cache, used for multimodal encoder."""
 
     # --8<-- [end:mm_encoder_attn]
+    @classmethod
+    def compute_max_seqlen(
+        cls,
+        attn_backend: AttentionBackendEnum,
+        cu_seqlens: np.ndarray,
+    ) -> int:
+        max_seqlen = 0
+        if (
+            attn_backend
+            in (
+                AttentionBackendEnum.FLASH_ATTN,
+                AttentionBackendEnum.ROCM_AITER_FA,
+                AttentionBackendEnum.TRITON_ATTN,
+                AttentionBackendEnum.FLASHINFER,
+            )
+            and len(cu_seqlens) >= 2
+        ):
+            max_seqlen = int((cu_seqlens[1:] - cu_seqlens[:-1]).max())
+        if attn_backend == AttentionBackendEnum.FLASHINFER:
+            max_seqlen = bucket_flashinfer_max_seqlen(max_seqlen)
+        return max_seqlen
+
+    @classmethod
+    def maybe_compute_sequence_lengths(
+        cls,
+        attn_backend: AttentionBackendEnum,
+        cu_seqlens: np.ndarray,
+    ) -> np.ndarray | None:
+        if attn_backend != AttentionBackendEnum.FLASHINFER:
+            return None
+        sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        sequence_lengths = add_padding_to_seqlens(
+            sequence_lengths, len(sequence_lengths), 0
+        )
+        return sequence_lengths
+
+    @classmethod
+    def maybe_recompute_cu_seqlens(
+        cls,
+        attn_backend: AttentionBackendEnum,
+        cu_seqlens: np.ndarray,
+        hidden_size: int,
+        tp_size: int,
+    ) -> np.ndarray:
+        if attn_backend != AttentionBackendEnum.FLASHINFER:
+            return cu_seqlens
+
+        batch_size = len(cu_seqlens) - 1
+        scale = hidden_size // tp_size
+        cu_seqlens = cu_seqlens * scale
+
+        cu_seqlens_qko = cu_seqlens
+        cu_seqlens_v = cu_seqlens * 3
+
+        cu_seqlens_qko = add_padding_to_seqlens(
+            cu_seqlens_qko, batch_size, cu_seqlens_qko[-1]
+        )
+        cu_seqlens_v = add_padding_to_seqlens(
+            cu_seqlens_v, batch_size, cu_seqlens_v[-1]
+        )
+        return np.concatenate([cu_seqlens_qko, cu_seqlens_v])
 
     def __init__(
         self,
@@ -46,10 +179,9 @@ def __init__(
 
         self.num_heads = num_heads
         self.head_size = head_size
-        self.scale = scale
+        self.scale = 1.0 / (head_size**0.5) if scale is None else scale
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
         self.layer_name = prefix
-
         assert self.num_heads % self.num_kv_heads == 0, (
             f"num_heads ({self.num_heads}) is not "
             f"divisible by num_kv_heads ({self.num_kv_heads})"
@@ -75,6 +207,9 @@ def __init__(
             get_flash_attn_version() if self.is_flash_attn_backend else None
         )
 
+        if self.attn_backend == AttentionBackendEnum.FLASHINFER:
+            _get_flashinfer_workspace_buffer()
+
         logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
 
     @classmethod
@@ -201,6 +336,27 @@ def _forward_triton(
             output = output.reshape(bsz, q_len, -1)
         return output
 
+    def _forward_flashinfer(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        cu_seqlens: torch.Tensor | None = None,
+        max_seqlen: torch.Tensor | None = None,
+        sequence_lengths: torch.Tensor
+        | None = None,  # Only used for FlashInfer CuDNN backend
+    ) -> torch.Tensor:
+        return vit_flashinfer_wrapper(
+            q=query,
+            k=key,
+            v=value,
+            scale=self.scale,
+            workspace_buffer=_get_flashinfer_workspace_buffer(),
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+            sequence_lengths=sequence_lengths,
+        )
+
     def forward_native(
         self,
         query: torch.Tensor,
@@ -208,6 +364,8 @@ def forward_native(
         value: torch.Tensor,
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        sequence_lengths: torch.Tensor
+        | None = None,  # Only used for FlashInfer CuDNN backend
     ) -> torch.Tensor:
         return self._forward_sdpa(query, key, value, cu_seqlens)
 
@@ -218,11 +376,17 @@ def forward_cuda(
         value: torch.Tensor,
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        sequence_lengths: torch.Tensor
+        | None = None,  # Only used for FlashInfer CuDNN backend
     ) -> torch.Tensor:
         if self.is_flash_attn_backend:
             return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
         elif self.attn_backend == AttentionBackendEnum.TRITON_ATTN:
             return self._forward_triton(query, key, value, cu_seqlens, max_seqlen)
+        elif self.attn_backend == AttentionBackendEnum.FLASHINFER:
+            return self._forward_flashinfer(
+                query, key, value, cu_seqlens, max_seqlen, sequence_lengths
+            )
         elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
             return self._forward_sdpa(query, key, value, cu_seqlens)
         else:
@@ -238,6 +402,8 @@ def forward_cpu(
         value: torch.Tensor,
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        sequence_lengths: torch.Tensor
+        | None = None,  # Only used for FlashInfer CuDNN backend
     ) -> torch.Tensor:
         return self._forward_sdpa(query, key, value, cu_seqlens)
 
@@ -248,6 +414,8 @@ def forward_xpu(
         value: torch.Tensor,
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        sequence_lengths: torch.Tensor
+        | None = None,  # Only used for FlashInfer CuDNN backend
     ) -> torch.Tensor:
         if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
             return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 9e5f1175a32f..3eeefbb3f26b 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -357,6 +357,7 @@ def forward(
         rotary_pos_emb_cos: torch.Tensor,
         rotary_pos_emb_sin: torch.Tensor,
         max_seqlen: torch.Tensor,  # Only used for Flash Attention
+        sequence_lengths: torch.Tensor,  # Only used for FlashInfer CuDNN backend
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -398,6 +399,7 @@ def forward(
             value=v,
             cu_seqlens=cu_seqlens,
             max_seqlen=max_seqlen,
+            sequence_lengths=sequence_lengths,
         )
 
         context_layer = einops.rearrange(
@@ -463,6 +465,7 @@ def forward(
             rotary_pos_emb_cos=rotary_pos_emb_cos,
             rotary_pos_emb_sin=rotary_pos_emb_sin,
             max_seqlen=max_seqlen,
+            sequence_lengths=None,
         )
         x_fused_norm, residual = self.norm2(x, residual=x_attn)
         x = residual + self.mlp(x_fused_norm)
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 304553ed3a91..e5bdbd8029c2 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -51,9 +51,12 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
-from vllm.distributed import get_pp_group
+from vllm.distributed import get_pp_group, parallel_state
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -92,7 +95,6 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.math_utils import round_up
-from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interfaces import (
     MultiModalEmbeddings,
@@ -244,6 +246,7 @@ def forward(
         rotary_pos_emb_cos: torch.Tensor,
         rotary_pos_emb_sin: torch.Tensor,
         max_seqlen: torch.Tensor,  # Only used for Flash Attention
+        sequence_lengths: torch.Tensor,  # Only used for FlashInfer CuDNN backend
     ) -> torch.Tensor:
         x = x + self.attn(
             self.norm1(x),
@@ -251,6 +254,7 @@ def forward(
             rotary_pos_emb_cos=rotary_pos_emb_cos,
             rotary_pos_emb_sin=rotary_pos_emb_sin,
             max_seqlen=max_seqlen,
+            sequence_lengths=sequence_lengths,
         )
 
         x = x + self.mlp(self.norm2(x))
@@ -332,6 +336,13 @@ def __init__(
         )
         self.num_grid_per_side = int(self.num_position_embeddings**0.5)
 
+        use_data_parallel = is_vit_use_data_parallel()
+        self.tp_size = (
+            1
+            if use_data_parallel
+            else parallel_state.get_tensor_model_parallel_world_size()
+        )
+
         # NOTE: This is used for creating empty tensor for all_gather for
         # DP ViT. Here out_hidden_size is enlarged due to deepstack
         self.out_hidden_size = vision_config.out_hidden_size * (
@@ -513,19 +524,6 @@ def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
 
         return torch.cat(outputs, dim=0)
 
-    def compute_attn_mask_seqlen(
-        self,
-        cu_seqlens: torch.Tensor,
-    ) -> torch.Tensor:
-        max_seqlen = torch.zeros([], device=cu_seqlens.device)
-        if self.attn_backend in (
-            AttentionBackendEnum.FLASH_ATTN,
-            AttentionBackendEnum.ROCM_AITER_FA,
-            AttentionBackendEnum.TRITON_ATTN,
-        ):
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
-        return max_seqlen
-
     def forward(
         self,
         x: torch.Tensor,
@@ -549,11 +547,26 @@ def forward(
             axis=0, dtype=np.int32
         )
         cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens])
-        cu_seqlens = torch.from_numpy(cu_seqlens)
-
+        sequence_lengths = MMEncoderAttention.maybe_compute_sequence_lengths(
+            self.attn_backend, cu_seqlens
+        )
+        if sequence_lengths is not None:
+            sequence_lengths = torch.from_numpy(sequence_lengths).to(
+                self.device, non_blocking=True
+            )
+        max_seqlen = torch.tensor(
+            MMEncoderAttention.compute_max_seqlen(self.attn_backend, cu_seqlens),
+            dtype=torch.int32,
+            device=self.device,
+        )
+        cu_seqlens = MMEncoderAttention.maybe_recompute_cu_seqlens(
+            self.attn_backend,
+            cu_seqlens,
+            self.hidden_size,
+            self.tp_size,
+        )
+        cu_seqlens = torch.from_numpy(cu_seqlens).to(self.device, non_blocking=True)
         hidden_states = hidden_states.unsqueeze(1)
-        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
-        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
         deepstack_feature_lists = []
         for layer_num, blk in enumerate(self.blocks):
@@ -563,6 +576,7 @@ def forward(
                 rotary_pos_emb_cos=rotary_pos_emb_cos,
                 rotary_pos_emb_sin=rotary_pos_emb_sin,
                 max_seqlen=max_seqlen,
+                sequence_lengths=sequence_lengths,
             )
             if layer_num in self.deepstack_visual_indexes:
                 deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index ddd4df418613..d3312fe1560e 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -414,6 +414,7 @@ def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
             AttentionBackendEnum.FLASH_ATTN,
             AttentionBackendEnum.TRITON_ATTN,
             AttentionBackendEnum.TORCH_SDPA,
+            AttentionBackendEnum.FLASHINFER,
         ]
 
     @classmethod
diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py
index f5c748fbcced..6ffe110adaa4 100644
--- a/vllm/v1/attention/ops/vit_attn_wrappers.py
+++ b/vllm/v1/attention/ops/vit_attn_wrappers.py
@@ -268,3 +268,91 @@ def vit_torch_sdpa_wrapper(
     return torch.ops.vllm.torch_sdpa_wrapper(
         q, k, v, scale, cu_seqlens, enable_gqa=enable_gqa
     )
+
+
+def flashinfer_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    scale: float,
+    workspace_buffer: torch.Tensor,
+    cu_seqlens: torch.Tensor | None = None,
+    max_seqlen: torch.Tensor | None = None,
+    sequence_lengths: torch.Tensor | None = None,
+) -> torch.Tensor:
+    from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache
+
+    is_reshaped = q.dim() == 4
+
+    if is_reshaped:
+        reshape_batch_size = q.shape[0]
+        q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
+    # cuDNN <= 9.10.2.21 requires q, k to be contiguous
+    # this comes with no cost for ViTs with RoPE because
+    # RoPE has already made q and k contiguous.
+    q, k = q.contiguous(), k.contiguous()
+
+    assert len(cu_seqlens) % 2 == 0, "cu_seqlens must be divisible by 2"
+    cu_seqlength = len(cu_seqlens) // 2
+    batch_offsets_qko = cu_seqlens[:cu_seqlength].view(-1, 1, 1, 1)
+    batch_offsets_v = cu_seqlens[cu_seqlength:].view(-1, 1, 1, 1)
+    sequence_lengths = sequence_lengths.view(-1, 1, 1, 1)
+    max_seqlen = max_seqlen.item()
+
+    output, _ = cudnn_batch_prefill_with_kv_cache(
+        q,
+        k,
+        v,
+        scale,
+        workspace_buffer,
+        max_token_per_sequence=max_seqlen,
+        max_sequence_kv=max_seqlen,
+        actual_seq_lens_q=sequence_lengths,
+        actual_seq_lens_kv=sequence_lengths,
+        causal=False,
+        return_lse=False,
+        batch_offsets_q=batch_offsets_qko,
+        batch_offsets_k=batch_offsets_qko,
+        batch_offsets_v=batch_offsets_v,
+        batch_offsets_o=batch_offsets_qko,
+    )
+
+    if is_reshaped:
+        output = einops.rearrange(output, "(b s) h d -> b s h d", b=reshape_batch_size)
+
+    return output
+
+
+def vit_flashinfer_wrapper_fake(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    scale: float,
+    workspace_buffer: torch.Tensor,
+    cu_seqlens: torch.Tensor | None = None,
+    max_seqlen: torch.Tensor | None = None,
+    sequence_lengths: torch.Tensor | None = None,
+) -> torch.Tensor:
+    return torch.empty_like(q)
+
+
+direct_register_custom_op(
+    op_name="flashinfer_wrapper",
+    op_func=flashinfer_wrapper,
+    fake_impl=vit_flashinfer_wrapper_fake,
+)
+
+
+def vit_flashinfer_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    scale: float,
+    workspace_buffer: torch.Tensor,
+    cu_seqlens: torch.Tensor | None = None,
+    max_seqlen: torch.Tensor | None = None,
+    sequence_lengths: torch.Tensor | None = None,
+) -> torch.Tensor:
+    return torch.ops.vllm.flashinfer_wrapper(
+        q, k, v, scale, workspace_buffer, cu_seqlens, max_seqlen, sequence_lengths
+    )

From 6467b635b6acfd5b30dd31804c79ef70ad4bf834 Mon Sep 17 00:00:00 2001
From: Tib <34336452+Tib-Gridello@users.noreply.github.com>
Date: Fri, 27 Feb 2026 13:53:35 +0100
Subject: [PATCH 068/154] [Bugfix] Add missing activation attr to RMSNormGated
 (#35423)

Signed-off-by: tibG <naps@qubes.milou>
Co-authored-by: tibG <naps@qubes.milou>
---
 vllm/model_executor/layers/layernorm.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index ff78f0886b46..72f42de06ee5 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -510,6 +510,7 @@ def __init__(
         norm_before_gate: bool = False,
         device: torch.device | None = None,
         dtype: torch.dtype | None = None,
+        activation: str = "swish",
     ):
         """Initialize RMSNormGated.
 
@@ -524,10 +525,12 @@ def __init__(
                               If False and z is provided: out = norm(x * silu(z))
             device: Device to create parameters on
             dtype: Data type for parameters
+            activation: Activation function name for gating
         """
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.eps = eps
+        self.activation = activation
         self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
         self.register_parameter("bias", None)
         self.group_size = group_size

From 66c1751d13b7b3c294418a88fbfbfe2ec49d5d3f Mon Sep 17 00:00:00 2001
From: Jason Li <jasonlizhengjian@gmail.com>
Date: Fri, 27 Feb 2026 05:36:37 -0800
Subject: [PATCH 069/154] [compile] Cleanup: Remove unnecessary +rms_norm
 forcing for sequence parallelism (#35410)

Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
---
 vllm/config/vllm.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 70f7821ab249..7f7b21316c7b 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -860,7 +860,7 @@ def has_blocked_weights():
                 self.compilation_config.pass_config.fuse_gemm_comms = False
             else:
                 # Compute SP threshold early; disable if None (model too
-                # small) before +rms_norm gets forced into custom_ops.
+                # small for SP to be beneficial).
                 pass_config = self.compilation_config.pass_config
                 if pass_config.sp_min_token_num is None:
                     from vllm.compilation.passes.fusion.sequence_parallelism import (
@@ -883,14 +883,6 @@ def has_blocked_weights():
                     self.compilation_config.pass_config.enable_sp = False
                     self.compilation_config.pass_config.fuse_gemm_comms = False
 
-        if self.compilation_config.pass_config.enable_sp:
-            if "-rms_norm" in self.compilation_config.custom_ops:
-                logger.warning(
-                    "RMS norm force disabled, sequence parallelism might break"
-                )
-            else:
-                self.compilation_config.custom_ops.append("+rms_norm")
-
         if self.compilation_config.fast_moe_cold_start is None:
             # resolve default behavior: try to be as safe as possible
             # this config is unsafe if any spec decoding draft model has a MOE.

From fbe3f0120a5e786a1459c982c72185311c78a276 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 27 Feb 2026 14:13:27 +0000
Subject: [PATCH 070/154] Revert "Add GlmOcrConfig for GLM-OCR model type
 recognition" (#35512)

---
 vllm/transformers_utils/config.py           |  1 -
 vllm/transformers_utils/configs/__init__.py |  4 -
 vllm/transformers_utils/configs/glm_ocr.py  | 91 ---------------------
 3 files changed, 96 deletions(-)
 delete mode 100644 vllm/transformers_utils/configs/glm_ocr.py

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index f5adb171b4dd..00129d52eaed 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -82,7 +82,6 @@ def __getitem__(self, key):
     deepseek_v32="DeepseekV3Config",
     flex_olmo="FlexOlmoConfig",
     funaudiochat="FunAudioChatConfig",
-    glm_ocr="GlmOcrConfig",
     hunyuan_vl="HunYuanVLConfig",
     isaac="IsaacConfig",
     kimi_linear="KimiLinearConfig",
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 761f96a574e1..541bc4de61b1 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -28,8 +28,6 @@
     "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
     "FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
     "FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
-    "GlmOcrConfig": "vllm.transformers_utils.configs.glm_ocr",
-    "GlmOcrVisionConfig": "vllm.transformers_utils.configs.glm_ocr",
     "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
@@ -85,8 +83,6 @@
     "FlexOlmoConfig",
     "FunAudioChatConfig",
     "FunAudioChatAudioEncoderConfig",
-    "GlmOcrConfig",
-    "GlmOcrVisionConfig",
     "HunYuanVLConfig",
     "HunYuanVLTextConfig",
     "HunYuanVLVisionConfig",
diff --git a/vllm/transformers_utils/configs/glm_ocr.py b/vllm/transformers_utils/configs/glm_ocr.py
deleted file mode 100644
index 43656d276e28..000000000000
--- a/vllm/transformers_utils/configs/glm_ocr.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import Any
-
-from transformers.configuration_utils import PretrainedConfig
-
-
-class GlmOcrVisionConfig(PretrainedConfig):
-    model_type = "glm_ocr_vision"
-
-    def __init__(
-        self,
-        hidden_size: int = 1024,
-        depth: int = 24,
-        num_heads: int = 16,
-        attention_bias: bool = True,
-        intermediate_size: int = 4096,
-        hidden_act: str = "silu",
-        hidden_dropout_prob: float = 0.0,
-        initializer_range: float = 0.02,
-        image_size: int = 336,
-        in_channels: int = 3,
-        patch_size: int = 14,
-        out_hidden_size: int = 1536,
-        rms_norm_eps: float = 1e-5,
-        spatial_merge_size: int = 2,
-        temporal_patch_size: int = 2,
-        **kwargs: Any,
-    ):
-        super().__init__(**kwargs)
-        self.hidden_size = hidden_size
-        self.depth = depth
-        self.num_heads = num_heads
-        self.attention_bias = attention_bias
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.initializer_range = initializer_range
-        self.image_size = image_size
-        self.in_channels = in_channels
-        self.patch_size = patch_size
-        self.out_hidden_size = out_hidden_size
-        self.rms_norm_eps = rms_norm_eps
-        self.spatial_merge_size = spatial_merge_size
-        self.temporal_patch_size = temporal_patch_size
-
-
-class GlmOcrConfig(PretrainedConfig):
-    model_type = "glm_ocr"
-
-    def __init__(
-        self,
-        text_config: dict | None = None,
-        vision_config: dict | None = None,
-        image_start_token_id: int = 59256,
-        image_end_token_id: int = 59257,
-        video_start_token_id: int = 59258,
-        video_end_token_id: int = 59259,
-        image_token_id: int = 59280,
-        video_token_id: int = 59281,
-        **kwargs: Any,
-    ):
-        super().__init__(**kwargs)
-        self.image_start_token_id = image_start_token_id
-        self.image_end_token_id = image_end_token_id
-        self.video_start_token_id = video_start_token_id
-        self.video_end_token_id = video_end_token_id
-        self.image_token_id = image_token_id
-        self.video_token_id = video_token_id
-        self.vision_config = GlmOcrVisionConfig(**(vision_config or {}))
-
-        if isinstance(text_config, dict):
-            from transformers import AutoConfig
-
-            model_type = text_config.get("model_type", "chatglm")
-            self.text_config = AutoConfig.for_model(model_type, **text_config)
-        elif text_config is None:
-            from transformers import AutoConfig
-
-            self.text_config = AutoConfig.for_model("chatglm")
-        else:
-            self.text_config = text_config
-
-    def get_text_config(self) -> PretrainedConfig:
-        return self.text_config
-
-    def save_pretrained(self, save_directory, **kwargs):
-        self._auto_class = None
-        super().save_pretrained(save_directory, **kwargs)

From 6d4f9d3ad5aa3750697edcf013ad080619ae25e9 Mon Sep 17 00:00:00 2001
From: haosdent <haosdent@gmail.com>
Date: Fri, 27 Feb 2026 22:27:06 +0800
Subject: [PATCH 071/154] [Bugfix] Fix DCP + FA3 crash due to missing
 num_splits in _forward_with_dcp (#35082)

Signed-off-by: haosdent <haosdent@gmail.com>
---
 vllm/v1/attention/backends/flash_attn.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index d903bd89ce15..940dc7515491 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -847,6 +847,7 @@ def _forward_with_dcp(
             q_descale=q_descale,
             k_descale=k_descale,
             v_descale=v_descale,
+            num_splits=attn_metadata.max_num_splits,
         )
         # FA returns LSE in shape [ H, B ] but cp_lse_ag_out_rs wants [ B, H ]
         context_attn_out_cor, context_lse_cor = cp_lse_ag_out_rs(
@@ -876,6 +877,7 @@ def _forward_with_dcp(
             q_descale=q_descale,
             k_descale=k_descale,
             v_descale=v_descale,
+            num_splits=attn_metadata.max_num_splits,
         )
         assert context_attn_out_cor.shape == query_attn_out.shape
         assert context_lse_cor.shape == query_lse.shape

From e8249378e414d76387a027a6ffb5bde8b9aff765 Mon Sep 17 00:00:00 2001
From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com>
Date: Fri, 27 Feb 2026 09:48:25 -0500
Subject: [PATCH 072/154] [Bugfix] Fix check_interleaved_audio_video false
 positive for batched non-interleaved requests (#35487)

Signed-off-by: linyueqian <linyueqian@outlook.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
---
 .../processing/test_qwen2_5_omni_embed.py     | 26 +++++++++++++++++
 .../models/qwen2_5_omni_thinker.py            | 29 ++++++++++++++++---
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py b/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
index df5b077ce9d7..5001b98b6d27 100644
--- a/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
+++ b/tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
@@ -116,6 +116,32 @@ def test_interleaved(self):
             is_video, is_audio, is_video.sum().item(), is_audio.sum().item()
         )
 
+    def test_batched_non_interleaved_no_false_positive(self):
+        """
+        Regression test for https://github.com/vllm-project/vllm/issues/35394.
+
+        5 identical non-interleaved mixed-modality requests batched together:
+        each has [audio][image][video] in separate blocks with text between them.
+        Across the batch, audio from request N falls between video blocks of
+        request N and request N+1, causing the global ranges to overlap.
+        check_interleaved_audio_video must return False (not a false positive).
+        """
+        # Build one request: [text][audio*5][text][image*4][text][video*6][text]
+        single_ids, _ = make_token_seq(5, 4, 6)
+        # Batch 5 identical requests (separated by text tokens to simulate padding)
+        sep = torch.tensor([TEXT_TOKEN_ID] * 3)
+        batched_ids = torch.cat([single_ids, sep] * 5)
+        is_multimodal = (
+            (batched_ids == AUDIO_TOKEN_ID)
+            | (batched_ids == IMAGE_TOKEN_ID)
+            | (batched_ids == VIDEO_TOKEN_ID)
+        )
+        is_video = is_multimodal & (batched_ids == VIDEO_TOKEN_ID)
+        is_audio = is_multimodal & (batched_ids == AUDIO_TOKEN_ID)
+        assert not check_interleaved_audio_video(
+            is_video, is_audio, is_video.sum().item(), is_audio.sum().item()
+        ), "Batched non-interleaved requests should not be detected as interleaved"
+
 
 # ---------------------------------------------------------------------------
 # Tests for embed_input_ids via a minimal mock
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index a9fdb24346ac..ee2bb837a498 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -122,8 +122,17 @@ def check_interleaved_audio_video(
     """
     Check if video and audio positions are interleaved in the multimodal region.
 
-    Returns:
-        True if video and audio tokens are interleaved, False otherwise.
+    Returns True only for the use_audio_in_video=True case, where video and
+    audio tokens alternate within a single contiguous region with no gaps.
+
+    A simple range-overlap check produces false positives when multiple
+    non-interleaved requests are batched together: audio tokens from request N
+    fall between video tokens from request N and request N+1, making the
+    global ranges overlap even though each individual request is non-interleaved.
+
+    To distinguish true interleaving from this batching artefact we require
+    that every position in the combined [first_VA, last_VA] range is occupied
+    by either a video or an audio token (no text/image gaps).
     """
     if num_video == 0 or num_audio == 0:
         return False
@@ -131,10 +140,22 @@ def check_interleaved_audio_video(
     video_pos = is_video.nonzero(as_tuple=True)[0]
     audio_pos = is_audio.nonzero(as_tuple=True)[0]
 
-    return (
+    # Quick range-overlap pre-check (necessary but not sufficient).
+    if not (
         video_pos[0].item() < audio_pos[-1].item()
         and audio_pos[0].item() < video_pos[-1].item()
-    )
+    ):
+        return False
+
+    # Density check: for true use_audio_in_video interleaving every position
+    # in the combined span is a video or audio token.  Batched non-interleaved
+    # requests have text/image tokens between the per-request V and A blocks.
+    # combined_start/end encompass all V/A tokens, so num_video + num_audio
+    # equals the number of V/A tokens in range; compare directly to span size.
+    combined_start = min(video_pos[0].item(), audio_pos[0].item())
+    combined_end = max(video_pos[-1].item(), audio_pos[-1].item())
+    total_in_range = combined_end - combined_start + 1
+    return (num_video + num_audio) == total_in_range
 
 
 def merge_interleaved_embeddings(

From 9251ed5c4fc6c954a5cdc5399d9d4f25ea5a8dd3 Mon Sep 17 00:00:00 2001
From: Koushik Dutta <koushd@gmail.com>
Date: Fri, 27 Feb 2026 06:58:28 -0800
Subject: [PATCH 073/154] [Bugfix] Handle case when kimi ends reasoning with a
 tool call (#33646)

Signed-off-by: Koushik Dutta <koushd@gmail.com>
Co-authored-by: mondaylord <20212010046@fudan.edu.cn>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
---
 vllm/reasoning/__init__.py                 |   4 +-
 vllm/reasoning/kimi_k2_reasoning_parser.py | 228 +++++++++++++++++++++
 2 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 vllm/reasoning/kimi_k2_reasoning_parser.py

diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 8be56b56e9ca..df75e8584f50 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -53,8 +53,8 @@
         "HunyuanA13BReasoningParser",
     ),
     "kimi_k2": (
-        "deepseek_v3_reasoning_parser",
-        "DeepSeekV3ReasoningWithThinkingParser",
+        "kimi_k2_reasoning_parser",
+        "KimiK2ReasoningParser",
     ),
     "minimax_m2": (
         "minimax_m2_reasoning_parser",
diff --git a/vllm/reasoning/kimi_k2_reasoning_parser.py b/vllm/reasoning/kimi_k2_reasoning_parser.py
new file mode 100644
index 000000000000..8dd1a76e503c
--- /dev/null
+++ b/vllm/reasoning/kimi_k2_reasoning_parser.py
@@ -0,0 +1,228 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
+
+
+class KimiK2ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for Kimi K2 model.
+
+    The Kimi K2 model uses <think>...</think> tokens to denote reasoning text,
+    and may implicitly end reasoning by starting a tool call section using
+    <|tool_calls_section_begin|>.
+    Thinking may also begin without a </think> token.
+
+    Kimi's thinking mode can be disabled via chat_template_kwargs.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction."
+            )
+
+        # Check if thinking is disabled via chat_template_kwargs
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+        thinking = bool(chat_kwargs.get("thinking", True))
+
+        # If thinking is not enabled, use identity parser to fall through
+        if not thinking:
+            self._identity_parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
+        else:
+            self._identity_parser = None
+
+        # Token definitions
+        self._start_token = "<think>"
+        self._end_token = "</think>"
+        self._tool_section_start_token = "<|tool_calls_section_begin|>"
+
+        # Get token IDs
+        self._start_token_id = self.vocab.get(self._start_token)
+        self._end_token_id = self.vocab.get(self._end_token)
+        self._tool_section_start_token_id = self.vocab.get(
+            self._tool_section_start_token
+        )
+
+        if self._start_token_id is None or self._end_token_id is None:
+            raise RuntimeError(
+                "KimiK2ReasoningParser could not locate think start/end "
+                "tokens in the tokenizer!"
+            )
+
+    def _is_identity_mode(self) -> bool:
+        """Check if parser is in identity mode (no reasoning extraction)."""
+        return self._identity_parser is not None
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        """
+        Check if the reasoning content ends in the input_ids.
+
+        Reasoning ends when we see either:
+        1. The end token (</think>)
+        2. The tool section start token (<|tool_calls_section_begin|>)
+        """
+        if self._is_identity_mode():
+            return self._identity_parser.is_reasoning_end(input_ids)
+
+        start_token_id = self._start_token_id
+        end_token_id = self._end_token_id
+        tool_section_start_token_id = self._tool_section_start_token_id
+
+        for i in range(len(input_ids) - 1, -1, -1):
+            if input_ids[i] == start_token_id:
+                return False
+            if input_ids[i] == end_token_id:
+                return True
+            # Implicit reasoning end via tool call section
+            if (
+                tool_section_start_token_id is not None
+                and input_ids[i] == tool_section_start_token_id
+            ):
+                return True
+        return False
+
+    def is_reasoning_end_streaming(
+        self, input_ids: Sequence[int], delta_ids: Sequence[int]
+    ) -> bool:
+        """
+        Check if the reasoning content ends in the input_ids on a decode step.
+        """
+        if self._is_identity_mode():
+            return self._identity_parser.is_reasoning_end_streaming(
+                input_ids, delta_ids
+            )
+
+        # Check for explicit end token or implicit tool section start in delta
+        if self._end_token_id in delta_ids:
+            return True
+        return (
+            self._tool_section_start_token_id is not None
+            and self._tool_section_start_token_id in delta_ids
+        )
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract content token ids from the input_ids.
+        """
+        if self._is_identity_mode():
+            return self._identity_parser.extract_content_ids(input_ids)
+
+        if self._end_token_id in input_ids:
+            end_token_index = (
+                len(input_ids) - 1 - input_ids[::-1].index(self._end_token_id)
+            )
+
+            if end_token_index != -1:
+                return input_ids[end_token_index + 1 :]
+
+        if (
+            self._tool_section_start_token_id is not None
+            and self._tool_section_start_token_id in input_ids
+        ):
+            tool_section_index = (
+                len(input_ids)
+                - 1
+                - input_ids[::-1].index(self._tool_section_start_token_id)
+            )
+
+            if tool_section_index != -1:
+                return input_ids[tool_section_index:]
+
+        # still reasoning (no content)
+        return []
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        """
+        Extract reasoning content from the model output.
+        """
+        if self._is_identity_mode():
+            return self._identity_parser.extract_reasoning(model_output, request)
+
+        # thinking does not require a think start token but consume it if present
+        start_token_index = model_output.find(self._start_token)
+        start_token_index = 0 if start_token_index != 0 else len(self._start_token)
+        end_token_index = model_output.find(self._end_token)
+
+        if end_token_index != -1:
+            return (
+                model_output[start_token_index:end_token_index],
+                model_output[end_token_index + len(self._end_token) :] or None,
+            )
+
+        tool_section_index = model_output.find(self._tool_section_start_token)
+        if tool_section_index != -1:
+            return (
+                model_output[start_token_index:tool_section_index],
+                model_output[tool_section_index:] or None,
+            )
+
+        # still reasoning (no content)
+        return (
+            model_output[start_token_index:],
+            None,
+        )
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extract reasoning content from a delta message during streaming.
+        """
+        if self._is_identity_mode():
+            return self._identity_parser.extract_reasoning_streaming(
+                previous_text,
+                current_text,
+                delta_text,
+                previous_token_ids,
+                current_token_ids,
+                delta_token_ids,
+            )
+
+        # If reasoning has already ended in previous tokens, this is content
+        if self.is_reasoning_end(previous_token_ids):
+            return DeltaMessage(content=delta_text)
+
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and delta_token_ids[0] in [
+            self._start_token_id,
+            self._end_token_id,
+        ]:
+            return None
+
+        if self._end_token_id in delta_token_ids:
+            end_index = delta_text.find(self._end_token)
+            reasoning = delta_text[:end_index]
+            content = delta_text[end_index + len(self._end_token) :]
+            return DeltaMessage(
+                reasoning=reasoning, content=content if content else None
+            )
+
+        if self._tool_section_start_token_id in delta_token_ids:
+            tool_index = delta_text.find(self._tool_section_start_token)
+            reasoning = delta_text[:tool_index]
+            content = delta_text[tool_index:]
+            return DeltaMessage(reasoning=reasoning, content=content)
+
+        # still reasoning (no end token)
+        return DeltaMessage(reasoning=delta_text)

From 5de98abc122dd4049adf1234c9fc20b5cc83d6cb Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Fri, 27 Feb 2026 07:53:47 -0800
Subject: [PATCH 074/154] Add @BoyuanFeng to CODEOWNERS (#35317)

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 .github/CODEOWNERS            | 2 +-
 docs/governance/committers.md | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index adf50a185e55..047ece980834 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,7 +2,7 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
 /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
 /vllm/lora @jeejeelee
 /vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
diff --git a/docs/governance/committers.md b/docs/governance/committers.md
index 89aaadc2bbac..df874418f1c4 100644
--- a/docs/governance/committers.md
+++ b/docs/governance/committers.md
@@ -55,6 +55,7 @@ Sorted alphabetically by GitHub handle:
 - [@ywang96](https://github.com/ywang96): Multimodality, benchmarks
 - [@zhuohan123](https://github.com/zhuohan123): Project lead, RL integration, numerics
 - [@zou3519](https://github.com/zou3519): Compilation
+- [@BoyuanFeng](https://github.com/BoyuanFeng): Compilation, CUDAGraph
 
 ### Emeritus Committers
 
@@ -113,7 +114,7 @@ If you have PRs touching the area, please feel free to ping the area owner for r
 - Multi-modal Input Processing: Components that load and process image/video/audio data into feature tensors
     - @DarkLight1337, @ywang96, @Isotr0py
 - torch compile: The torch.compile integration in vLLM, custom passes & transformations
-    - @ProExpertProg, @zou3519, @youkaichao
+    - @ProExpertProg, @zou3519, @youkaichao, @BoyuanFeng
 - State space models: The state space models implementation in vLLM
     - @tdoublep, @tlrmchlsmth
 - Reasoning and tool calling parsers

From 876312f0b59b24f95704a37c93675e36a018a140 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 27 Feb 2026 07:54:24 -0800
Subject: [PATCH 075/154] [Core] Fix `gpu_worker.py` pre-commit errors (#35312)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
---
 vllm/v1/worker/gpu_worker.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e35d0ef689b7..3aeb20839a9f 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -744,7 +744,8 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None):
 
             # Create the profiler wrapper only on the first start call
             if self.profiler is None:
-                if self.profiler_config.profiler == "torch":
+                profiler_type = self.profiler_config.profiler
+                if profiler_type == "torch":
                     self.profiler = TorchProfilerWrapper(
                         self.profiler_config,
                         worker_name=trace_name,
@@ -754,9 +755,12 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None):
                     logger.debug(
                         "Starting torch profiler with trace name: %s", trace_name
                     )
-                elif self.profiler_config.profiler == "cuda":
+                elif profiler_type == "cuda":
                     self.profiler = CudaProfilerWrapper(self.profiler_config)
                     logger.debug("Starting CUDA profiler")
+                else:
+                    logger.warning("Unrecognized profiler: %s", profiler_type)
+                    return
                 self.profiler.start()
             else:
                 # Profiler already initialized. Restart profiling but keep

From 9098ce690c802887fb36a8f3ee95bb18aacd2f76 Mon Sep 17 00:00:00 2001
From: Yanan Cao <gmagogsfm@users.noreply.github.com>
Date: Fri, 27 Feb 2026 09:21:35 -0800
Subject: [PATCH 076/154] [Kernel] [Helion] [7/N] Use HOP to represent Helion
 Kernel call to enable fx tracing and pattern matching (#34390)

Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
---
 tests/kernels/helion/test_pattern_matching.py | 203 ++++++++++++++++++
 tests/kernels/helion/test_register.py         |  64 +++++-
 vllm/kernels/helion/register.py               | 127 +++++++++--
 3 files changed, 368 insertions(+), 26 deletions(-)
 create mode 100644 tests/kernels/helion/test_pattern_matching.py

diff --git a/tests/kernels/helion/test_pattern_matching.py b/tests/kernels/helion/test_pattern_matching.py
new file mode 100644
index 000000000000..1cab249a18c8
--- /dev/null
+++ b/tests/kernels/helion/test_pattern_matching.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test make_fx tracing and inductor pattern matching with HelionKernelWrapper."""
+
+import contextlib
+from unittest.mock import Mock, patch
+
+import pytest
+import torch
+
+from vllm.utils.import_utils import has_helion
+
+if not has_helion():
+    pytest.skip(
+        "Helion is not installed. Install with: pip install vllm[helion]",
+        allow_module_level=True,
+    )
+
+import helion
+import helion.language as hl
+from helion._compat import requires_torch_version
+
+if not requires_torch_version("2.11"):
+    pytest.skip(
+        "HigherOrderOp requires PyTorch >= 2.11",
+        allow_module_level=True,
+    )
+
+from helion._compiler._dynamo.higher_order_ops import (
+    helion_kernel_side_table,
+    helion_kernel_wrapper_mutation,
+)
+from torch._inductor.pattern_matcher import (
+    PatternMatcherPass,
+    fwd_only,
+    register_replacement,
+    select_decomp_table,
+)
+from torch.fx.experimental.proxy_tensor import make_fx
+
+from vllm.kernels.helion.config_manager import ConfigManager
+from vllm.kernels.helion.register import HelionKernelWrapper
+
+
+@contextlib.contextmanager
+def _helion_mock_context():
+    configs = {
+        "default": helion.Config(block_sizes=[64], num_warps=2, num_stages=2),
+    }
+    mock_config_manager = Mock(spec=ConfigManager)
+    mock_config_manager.get_platform_configs = Mock(return_value=configs)
+
+    with (
+        patch(
+            "vllm.kernels.helion.config_manager.ConfigManager.get_instance",
+            return_value=mock_config_manager,
+        ),
+        patch(
+            "vllm.kernels.helion.utils.get_canonical_gpu_name",
+            return_value="nvidia_h200",
+        ),
+    ):
+        yield
+
+
+class TestMakeFxHop:
+    def setup_method(self):
+        helion_kernel_side_table.reset_table()
+
+    def test_make_fx_symbolic(self):
+        def raw_add_scale(
+            x: torch.Tensor, y: torch.Tensor, scale: float
+        ) -> tuple[torch.Tensor, int, torch.Tensor]:
+            out_x = torch.empty_like(x)
+            out_y = torch.empty_like(x)
+            for tile in hl.tile(x.size()):
+                out_x[tile] = x[tile] + y[tile] * scale
+                out_y[tile] = out_x[tile] * 2.0
+            return out_x, 42, out_y
+
+        input_x = torch.randn(7, 13)
+        input_y = torch.randn(7, 13)
+        scale = 0.5
+
+        with _helion_mock_context():
+            wrapper = HelionKernelWrapper(
+                raw_kernel_func=raw_add_scale,
+                op_name="test_make_fx",
+                fake_impl=lambda *a, **kw: None,
+            )
+            wrapper.register_config_picker(lambda args, keys: "default")
+
+            def fn(x, y):
+                return wrapper(x, y, scale)
+
+            gm = make_fx(fn, tracing_mode="symbolic")(input_x, input_y)
+
+        hop_nodes = [
+            n
+            for n in gm.graph.nodes
+            if n.op == "call_function" and n.target is helion_kernel_wrapper_mutation
+        ]
+        assert len(hop_nodes) == 1
+        node = hop_nodes[0]
+
+        assert node.kwargs["constant_args"]["scale"] == scale
+        assert set(node.kwargs["tensor_args"]) == {"x", "y"}
+
+        specs = node.kwargs["output_spec"]["leaf_specs"]
+        tensor_specs = [s for s in specs if s["type"] == "tensor"]
+        scalar_specs = [s for s in specs if s["type"] == "scalar"]
+        assert len(tensor_specs) == 2
+        assert len(scalar_specs) == 1
+
+        for spec in tensor_specs:
+            assert spec["dtype"] == input_x.dtype
+
+        assert scalar_specs[0]["scalar_value"] == 42
+
+        for val in node.meta["val"]:
+            assert all(isinstance(s, torch.SymInt) for s in val.shape)
+
+        # Both out_x and out_y are empty_like(x), so output shapes == input shape
+        input_node = next(n for n in gm.graph.nodes if n.op == "placeholder")
+        input_shape = input_node.meta["val"].shape
+        for val in node.meta["val"]:
+            assert len(val.shape) == len(input_shape)
+            for out_s, in_s in zip(val.shape, input_shape):
+                assert out_s == in_s
+
+    def test_pattern_matcher_replaces_with_helion_hop(self):
+        def raw_silu_mul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            M, N = x.size()
+            out = torch.empty_like(x)
+            for tile_m, tile_n in hl.tile([M, N]):
+                out[tile_m, tile_n] = (
+                    torch.nn.functional.silu(x[tile_m, tile_n]) * y[tile_m, tile_n]
+                )
+            return out
+
+        with _helion_mock_context():
+            wrapper = HelionKernelWrapper(
+                raw_kernel_func=raw_silu_mul,
+                op_name="test_pm_silu_mul",
+                fake_impl=lambda *a, **kw: None,
+            )
+            wrapper.register_config_picker(lambda args, keys: "default")
+
+            def pattern(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return torch.nn.functional.silu(x) * y
+
+            def replacement(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return wrapper(x, y)
+
+            inputs = [torch.randn(8, 16), torch.randn(8, 16)]
+
+            pm_pass = PatternMatcherPass(pass_name="test_helion_replacement")
+            register_replacement(pattern, replacement, inputs, fwd_only, pm_pass)
+
+            def model(x, y):
+                return torch.nn.functional.silu(x) * y
+
+            decompositions = select_decomp_table()
+            input_x = torch.randn(8, 16)
+            input_y = torch.randn(8, 16)
+            gm = make_fx(model, decompositions, tracing_mode="symbolic")(
+                input_x, input_y
+            )
+
+            def count_hop_nodes(graph):
+                return sum(
+                    1
+                    for n in graph.nodes
+                    if n.op == "call_function"
+                    and n.target is helion_kernel_wrapper_mutation
+                )
+
+            assert count_hop_nodes(gm.graph) == 0
+
+            match_count = pm_pass.apply(gm.graph)
+            gm.graph.lint()
+            gm.recompile()
+
+            assert match_count == 1
+            assert count_hop_nodes(gm.graph) == 1
+
+            hop_node = next(
+                n
+                for n in gm.graph.nodes
+                if n.op == "call_function"
+                and n.target is helion_kernel_wrapper_mutation
+            )
+
+            # raw_silu_mul returns empty_like(x), so output shape == input shape
+            for val in hop_node.meta["val"]:
+                assert all(isinstance(s, torch.SymInt) for s in val.shape)
+
+            input_node = next(n for n in gm.graph.nodes if n.op == "placeholder")
+            input_shape = input_node.meta["val"].shape
+            output_shape = hop_node.meta["val"][0].shape
+            assert len(output_shape) == len(input_shape)
+            for out_s, in_s in zip(output_shape, input_shape):
+                assert out_s == in_s
diff --git a/tests/kernels/helion/test_register.py b/tests/kernels/helion/test_register.py
index 02b05be74d1d..bee72d58a06c 100644
--- a/tests/kernels/helion/test_register.py
+++ b/tests/kernels/helion/test_register.py
@@ -4,8 +4,7 @@
 Unit tests for Helion kernel registration.
 
 Tests ConfiguredHelionKernel, HelionKernelWrapper, and PresetConfigSearch
-including config picker registration, custom autotuner integration, and
-PyTorch op registration.
+including config picker registration and custom autotuner integration.
 """
 
 from unittest.mock import Mock, patch
@@ -25,6 +24,7 @@
 
 from vllm.kernels.helion.config_manager import ConfigManager
 from vllm.kernels.helion.register import (
+    _HOP_AVAILABLE,
     ConfiguredHelionKernel,
     HelionKernelWrapper,
     get_kernel_by_name,
@@ -451,9 +451,51 @@ def fake_impl(*args, **kwargs):
         ):
             wrapper.get_configured_op()
 
-    def test_get_configured_op_returns_cached_op(self, sample_kernel, sample_configs):
-        """Test get_configured_op returns cached op when already registered."""
+    def test_get_configured_op_returns_cached_kernel(
+        self, sample_kernel, sample_configs
+    ):
+        """Test get_configured_op returns cached ConfiguredHelionKernel."""
+
+        def fake_impl(*args, **kwargs):
+            return torch.zeros_like(args[0])
+
+        def default_picker(args, config_keys):
+            return "default"
+
+        wrapper = HelionKernelWrapper(
+            raw_kernel_func=sample_kernel,
+            op_name="test_kernel",
+            fake_impl=fake_impl,
+        )
+        wrapper._config_picker = default_picker
+
+        mock_config_manager = Mock(spec=ConfigManager)
+        mock_config_manager.get_platform_configs = Mock(return_value=sample_configs)
+
+        with (
+            patch(
+                "vllm.kernels.helion.config_manager.ConfigManager.get_instance",
+                return_value=mock_config_manager,
+            ),
+            patch(
+                "vllm.kernels.helion.utils.get_canonical_gpu_name",
+                return_value="nvidia_h200",
+            ),
+            patch("vllm.kernels.helion.register.helion.kernel") as mock_kernel,
+        ):
+            mock_decorated = Mock()
+            mock_kernel.return_value = Mock(return_value=mock_decorated)
+
+            result1 = wrapper.get_configured_op()
+            result2 = wrapper.get_configured_op()
+            assert result1 is result2
 
+    @pytest.mark.skipif(
+        _HOP_AVAILABLE, reason="CustomOp path not used when HOP available"
+    )
+    def test_get_or_register_custom_op_returns_cached_op(
+        self, sample_kernel, sample_configs
+    ):
         def fake_impl(*args, **kwargs):
             return torch.zeros_like(args[0])
 
@@ -488,12 +530,15 @@ def default_picker(args, config_keys):
         ):
             mock_decorated = Mock()
             mock_kernel.return_value = Mock(return_value=mock_decorated)
-            result = wrapper.get_configured_op()
+            result = wrapper._get_or_register_custom_op()
             assert result is existing_op
 
-    def test_get_configured_op_registers_new_op(self, sample_kernel, sample_configs):
-        """Test get_configured_op creates and registers new op."""
-
+    @pytest.mark.skipif(
+        _HOP_AVAILABLE, reason="CustomOp path not used when HOP available"
+    )
+    def test_get_or_register_custom_op_registers_new_op(
+        self, sample_kernel, sample_configs
+    ):
         def fake_impl(*args, **kwargs):
             return torch.zeros_like(args[0])
 
@@ -542,11 +587,10 @@ def register_side_effect(op_name, op_func, **kwargs):
         ):
             mock_decorated = Mock()
             mock_kernel.return_value = Mock(return_value=mock_decorated)
-            result = wrapper.get_configured_op()
+            result = wrapper._get_or_register_custom_op()
 
             mock_register.assert_called_once()
             assert result is new_op
-            # Check that op_func is the decorated kernel, not ConfiguredHelionKernel
             assert mock_register.call_args[1]["op_func"] is mock_decorated
 
 
diff --git a/vllm/kernels/helion/register.py b/vllm/kernels/helion/register.py
index 3114631ddec1..cd0ef83fc0a2 100644
--- a/vllm/kernels/helion/register.py
+++ b/vllm/kernels/helion/register.py
@@ -31,8 +31,8 @@
 
 Key Classes
 -----------
-- HelionKernelWrapper: Wraps raw kernel + config_picker, creates configured ops
-- ConfiguredHelionKernel: Platform-specific kernel registered as PyTorch custom op
+- HelionKernelWrapper: Wraps raw kernel + config_picker, creates configured kernels
+- ConfiguredHelionKernel: Platform-specific kernel with pre-tuned configs
 - PresetConfigSearch: Custom autotuner that returns pre-tuned configs
 """
 
@@ -53,10 +53,27 @@
     )
 
 import helion
+from helion._compat import requires_torch_version
 from helion.autotuner.base_search import BaseAutotuner
 from helion.runtime.config import Config
 from helion.runtime.settings import default_autotuner_fn
 
+# TODO(gmagogsfm): Remove CustomOp fallback path (_get_or_register_custom_op,
+# vllm_helion_lib, direct_register_custom_op) once vLLM requires PyTorch >= 2.11.
+_HOP_AVAILABLE = requires_torch_version("2.11")
+
+if _HOP_AVAILABLE:
+    import torch.utils._pytree as pytree
+    from helion._compiler._dynamo.higher_order_ops import (
+        helion_kernel_side_table,
+        helion_kernel_wrapper_mutation,
+    )
+    from helion._compiler._dynamo.variables import infer_output_spec
+    from torch.fx.experimental.proxy_tensor import (
+        disable_proxy_modes_tracing,
+        get_proxy_mode,
+    )
+
 logger = init_logger(__name__)
 
 vllm_helion_lib = Library("vllm_helion", "FRAGMENT")  # noqa
@@ -233,7 +250,7 @@ def _create_decorated_kernel(self) -> Callable[..., Any]:
 
 
 class HelionKernelWrapper:
-    """Wrapper for Helion kernels that creates config-specific PyTorch custom ops."""
+    """Wrapper for Helion kernels with pre-tuned config selection and HOP support."""
 
     def __init__(
         self,
@@ -252,11 +269,86 @@ def __init__(
         self._config_picker: (
             Callable[[tuple[Any, ...], list[str]], str | None] | None
         ) = None
+        self._configured_kernel: ConfiguredHelionKernel | None = None
         self._input_generator: Callable[[], dict[str, tuple[Any, ...]]] | None = None
 
     def __call__(self, *args, **kwargs):
-        configured_op = self.get_configured_op()
-        return configured_op(*args, **kwargs)
+        # CustomOp fallback: register as torch custom op for torch.compile
+        # compatibility on older PyTorch lacking HOP/EffectType support
+        if not _HOP_AVAILABLE:
+            custom_op = self._get_or_register_custom_op()
+            return custom_op(*args, **kwargs)
+        # HOP tracing: record HigherOrderOp in the FX graph
+        if get_proxy_mode() is not None:
+            return self._call_via_hop(args, kwargs)
+        # Eager: run the configured kernel directly
+        return self.get_configured_op()(*args, **kwargs)
+
+    def _call_via_hop(
+        self,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        kernel = self.get_configured_op()._decorated_kernel
+        kernel_idx = helion_kernel_side_table.add_kernel(kernel)
+
+        constant_args, tensor_args = self._partition_args(kernel, args, kwargs)
+
+        all_named = {**constant_args, **tensor_args}
+        full_args = tuple(
+            all_named.get(n, p.default)
+            for n, p in kernel.signature.parameters.items()  # type: ignore[attr-defined]
+            if n in all_named or p.default is not p.empty
+        )
+
+        with disable_proxy_modes_tracing():
+            output_spec = infer_output_spec(kernel, full_args)
+
+        hop_result = helion_kernel_wrapper_mutation(
+            kernel_idx=kernel_idx,
+            constant_args=constant_args,
+            tensor_args=tensor_args,
+            output_spec=output_spec,
+        )
+
+        tree_spec_str = output_spec.get("tree_spec_str")
+        if tree_spec_str is None:
+            return None
+        tree_spec = pytree.treespec_loads(tree_spec_str)
+
+        hop_iter = iter(hop_result)
+        reconstructed = []
+        for spec in output_spec["leaf_specs"]:
+            is_constant_scalar = spec["type"] == "scalar" and not isinstance(
+                spec.get("scalar_value"), torch.SymInt
+            )
+            if is_constant_scalar:
+                reconstructed.append(spec["scalar_value"])
+            else:
+                reconstructed.append(next(hop_iter))
+        return pytree.tree_unflatten(reconstructed, tree_spec)
+
+    @staticmethod
+    def _partition_args(
+        kernel: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        constant_args: dict[str, Any] = {}
+        tensor_args: dict[str, Any] = {}
+        params = list(kernel.signature.parameters.keys())
+        for i, val in enumerate(args):
+            name = params[i]
+            if isinstance(val, torch.Tensor):
+                tensor_args[name] = val
+            else:
+                constant_args[name] = val
+        for name, val in kwargs.items():
+            if isinstance(val, torch.Tensor):
+                tensor_args[name] = val
+            else:
+                constant_args[name] = val
+        return constant_args, tensor_args
 
     def register_config_picker(
         self, picker_func: Callable[[tuple[Any, ...], list[str]], str | None]
@@ -309,29 +401,32 @@ def run_autotune(
         )
         return autotune_kernel.autotune(inputs)
 
-    def get_configured_op(self) -> Any:
+    def get_configured_op(self) -> ConfiguredHelionKernel:
         assert self._config_picker is not None, (
             f"No config picker registered for kernel '{self.op_name}'. "
             f"Use @{self.op_name}.register_config_picker to register one."
         )
 
+        if self._configured_kernel is None:
+            self._configured_kernel = ConfiguredHelionKernel(
+                op_name=self.op_name,
+                config_picker=self._config_picker,
+                raw_kernel_func=self.raw_kernel_func,
+                helion_settings=self.helion_settings,
+            )
+
+        return self._configured_kernel
+
+    def _get_or_register_custom_op(self) -> Any:
         if hasattr(torch.ops.vllm_helion, self.op_name):
-            logger.debug("Op vllm_helion::%s already registered", self.op_name)
             return getattr(torch.ops.vllm_helion, self.op_name)
 
-        configured_kernel = ConfiguredHelionKernel(
-            op_name=self.op_name,
-            config_picker=self._config_picker,
-            raw_kernel_func=self.raw_kernel_func,
-            helion_settings=self.helion_settings,
-        )
+        configured_kernel = self.get_configured_op()
 
         logger.info("Registering op: vllm_helion::%s", self.op_name)
         direct_register_custom_op(
             op_name=self.op_name,
-            op_func=configured_kernel._decorated_kernel,  # Register decorated kernel
-            # TODO(gmagogsfm): Implement automatic mutation/aliasing detection
-            # for Helion kernels.
+            op_func=configured_kernel._decorated_kernel,
             mutates_args=None,
             fake_impl=self._fake_impl,
             target_lib=vllm_helion_lib,

From 905d76b51dc3b98c4d0ee35317493f21f9e6b5d0 Mon Sep 17 00:00:00 2001
From: fort726 <38447663+fort726@users.noreply.github.com>
Date: Sat, 28 Feb 2026 02:26:02 +0900
Subject: [PATCH 077/154] [Model] Add huggingface skt/A.X-K1 model (#32407)

Signed-off-by: Sungwan(Alex) Kim <sw0726.kim@sktelecom.com>
Signed-off-by: fort726 <38447663+fort726@users.noreply.github.com>
Co-authored-by: Sungwan(Alex) Kim <sw0726.kim@sktelecom.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
---
 docs/models/supported_models.md               |    1 +
 tests/models/registry.py                      |    1 +
 vllm/model_executor/models/AXK1.py            | 1168 +++++++++++++++++
 vllm/model_executor/models/registry.py        |    1 +
 vllm/transformers_utils/configs/AXK1.py       |  215 +++
 vllm/transformers_utils/configs/__init__.py   |    2 +
 .../model_arch_config_convertor.py            |    9 +-
 7 files changed, 1396 insertions(+), 1 deletion(-)
 create mode 100644 vllm/model_executor/models/AXK1.py
 create mode 100644 vllm/transformers_utils/configs/AXK1.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 5f821ef7a4b4..eca66041da03 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -369,6 +369,7 @@ th {
 | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
 | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
 | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
+| `AXK1ForCausalLM` | A.X-K1 | `skt/A.X-K1`, etc. | | ✅︎ |
 | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
 | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ |
 | `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ |
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 0978c93dac5e..c8e47ad502f9 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -194,6 +194,7 @@ def check_available_online(
     "ArcticForCausalLM": _HfExamplesInfo(
         "Snowflake/snowflake-arctic-instruct", trust_remote_code=True
     ),
+    "AXK1ForCausalLM": _HfExamplesInfo("skt/A.X-K1", trust_remote_code=True),
     "BaiChuanForCausalLM": _HfExamplesInfo(
         "baichuan-inc/Baichuan-7B", trust_remote_code=True
     ),
diff --git a/vllm/model_executor/models/AXK1.py b/vllm/model_executor/models/AXK1.py
new file mode 100644
index 000000000000..f5ed4400fb65
--- /dev/null
+++ b/vllm/model_executor/models/AXK1.py
@@ -0,0 +1,1168 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only A.X K1 model."""
+
+import typing
+from collections.abc import Callable, Iterable
+from itertools import islice
+
+import torch
+from torch import nn
+
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.deepseek_v2 import (
+    DeepseekAttention,
+    DeepseekV2MLP,
+    yarn_get_mscale,
+)
+from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.AXK1 import AXK1Config
+
+from .interfaces import MixtureOfExperts, SupportsEagle, SupportsLoRA, SupportsPP
+from .utils import (
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class AXK1MLP(DeepseekV2MLP):
+    pass
+
+
+class AXK1MoE(nn.Module):
+    def __init__(
+        self,
+        config: AXK1Config,
+        parallel_config: ParallelConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.n_routed_experts
+        self.n_shared_experts: int = config.n_shared_experts
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+        if config.topk_method == "noaux_tc":
+            self.gate.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.n_routed_experts, dtype=torch.float32)
+            )
+        else:
+            self.gate.e_score_correction_bias = None
+
+        # Load balancing settings.
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
+
+        self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        self.is_fusion_moe_shared_experts_enabled = (
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        )
+        if config.n_shared_experts is None or self.is_fusion_moe_shared_experts_enabled:
+            self.shared_experts = None
+        else:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+
+            self.shared_experts = AXK1MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                is_sequence_parallel=self.is_sequence_parallel,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            gate=self.gate,
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            prefix=f"{prefix}.experts",
+            scoring_func=config.scoring_func,
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            # aiter applies routed_scaling_factor internally
+            routed_scaling_factor=1.0
+            if not self.is_rocm_aiter_moe_enabled
+            else self.routed_scaling_factor,
+            e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            n_shared_experts=config.n_shared_experts
+            if self.is_fusion_moe_shared_experts_enabled
+            else None,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # Chunk the hidden states so they aren't replicated across TP ranks.
+        # This avoids duplicate computation in self.experts.
+        # TODO: We can replace the all_reduce at the end of attn with a
+        # reduce_scatter instead of chunking here.
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        if self.experts.is_internal_router:
+            # In this case, the gate/router runs inside the FusedMoE class
+            fused_moe_out = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
+        else:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            fused_moe_out = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
+
+        shared_output, final_hidden_states = fused_moe_out
+        if self.shared_experts is None:
+            assert shared_output is None
+
+        # Fix FP16 overflow
+        # See AXK1DecoderLayer for more details.
+        if hidden_states.dtype != torch.float16:
+            if not self.is_rocm_aiter_moe_enabled:
+                final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= 1.0 / self.routed_scaling_factor
+
+        if self.shared_experts is not None:
+            assert shared_output is not None
+            final_hidden_states += shared_output
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+def _get_llama_4_scaling(
+    original_max_position_embeddings: int, scaling_beta: float, positions: torch.Tensor
+) -> torch.Tensor:
+    scaling = 1 + scaling_beta * torch.log(
+        1 + torch.floor(positions / original_max_position_embeddings)
+    )
+    # Broadcast over num_heads and head_dim
+    return scaling[..., None, None]
+
+
+class AXK1Attention(nn.Module):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        config: AXK1Config,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        topk_indices_buffer: torch.Tensor | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+        assert topk_indices_buffer is None, (
+            "topk_indices_buffer is not \
+        supported for AXK1Attention"
+        )
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_a_proj",
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa",
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        if config.rope_parameters["rope_type"] != "default":
+            config.rope_parameters["rope_type"] = (
+                "deepseek_yarn"
+                if config.rope_parameters.get("apply_yarn_scaling", True)
+                else "deepseek_llama_scaling"
+            )
+
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=config.rope_parameters,
+            is_neox_style=False,
+        )
+
+        if config.rope_parameters["rope_type"] == "deepseek_yarn":
+            mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
+            scaling_factor = config.rope_parameters["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        self.attn = Attention(
+            self.num_local_heads,
+            self.qk_head_dim,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        llama_4_scaling: torch.Tensor | None,
+    ) -> torch.Tensor:
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+        kv_a = self.kv_a_layernorm(kv_a)
+        kv = self.kv_b_proj(kv_a)[0]
+        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
+        k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
+
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        q[..., self.qk_nope_head_dim :] = q_pe
+        k = torch.empty_like(q)
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
+
+        # Apply llama 4 scaling if provided
+        if llama_4_scaling is not None:
+            q *= llama_4_scaling
+
+        # padding value to qk_head_dim for alignment
+        v = torch.nn.functional.pad(
+            v, [0, self.qk_head_dim - self.v_head_dim], value=0
+        ).view(-1, self.num_local_heads * self.qk_head_dim)
+        attn_output = self.attn(q, k, v)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.qk_head_dim)[
+            ..., : self.v_head_dim
+        ].reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class AXK1MLAAttention(nn.Module):
+    """
+    Main reference: DeepseekV2 paper, and FlashInfer Implementation
+    (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
+
+        For more info see MLACommonImpl in:
+        vllm/v1/attention/backends/mla/utils.py
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        config: AXK1Config,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        max_position_embeddings: int = 8192,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        topk_indices_buffer: torch.Tensor | None = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.fused_qkv_a_proj = MergedColumnParallelLinear(
+                self.hidden_size,
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fused_qkv_a_proj",
+                disable_tp=True,
+            )
+        else:
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.kv_a_proj_with_mqa",
+            )
+
+        if self.q_lora_rank is not None:
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                self.q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_b_proj",
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        if config.rope_parameters["rope_type"] != "default":
+            config.rope_parameters["rope_type"] = (
+                "deepseek_yarn"
+                if config.rope_parameters.get("apply_yarn_scaling", True)
+                else "deepseek_llama_scaling"
+            )
+
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=config.rope_parameters,
+            is_neox_style=False,
+        )
+
+        if config.rope_parameters["rope_type"] == "deepseek_yarn":
+            mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
+            scaling_factor = config.rope_parameters["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        mla_modules = MLAModules(
+            kv_a_layernorm=self.kv_a_layernorm,
+            kv_b_proj=self.kv_b_proj,
+            rotary_emb=self.rotary_emb,
+            o_proj=self.o_proj,
+            fused_qkv_a_proj=self.fused_qkv_a_proj
+            if self.q_lora_rank is not None
+            else None,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa
+            if self.q_lora_rank is None
+            else None,
+            q_a_layernorm=self.q_a_layernorm if self.q_lora_rank is not None else None,
+            q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
+            q_proj=self.q_proj if self.q_lora_rank is None else None,
+            indexer=None,
+            indexer_rotary_emb=None,
+            is_sparse=False,
+            topk_indices_buffer=topk_indices_buffer,
+        )
+
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
+            self.hidden_size,
+            self.num_local_heads,
+            self.scaling,
+            self.qk_nope_head_dim,
+            self.qk_rope_head_dim,
+            self.v_head_dim,
+            self.q_lora_rank,
+            self.kv_lora_rank,
+            mla_modules,
+            cache_config,
+            quant_config,
+            prefix,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        llama_4_scaling: torch.Tensor | None,
+    ) -> torch.Tensor:
+        return self.mla_attn(positions, hidden_states, llama_4_scaling)
+
+
+class AXK1DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        prefix: str,
+        config: AXK1Config | None = None,
+    ) -> None:
+        super().__init__()
+
+        if config is None:
+            config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        parallel_config = vllm_config.parallel_config
+        self.config = config
+
+        self.hidden_size = config.hidden_size
+        max_position_embeddings = config.max_position_embeddings
+        # DecoderLayers are created with `make_layers` which passes the prefix
+        # with the layer's index.
+        layer_idx = int(prefix.split(sep=".")[-1])
+        self.layer_idx = layer_idx
+
+        # verify MLA attention specific fields
+        qk_nope_head_dim = config.qk_nope_head_dim
+        qk_rope_head_dim = config.qk_rope_head_dim
+        v_head_dim = config.v_head_dim
+        kv_lora_rank = config.kv_lora_rank
+        use_mha = all(dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim))
+        self.use_mha = use_mha
+
+        if use_mha:
+            attn_cls = DeepseekAttention
+        elif model_config.use_mla:
+            attn_cls = AXK1MLAAttention
+        else:
+            attn_cls = AXK1Attention
+        self.self_attn = attn_cls(
+            vllm_config=vllm_config,
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            q_lora_rank=config.q_lora_rank,
+            kv_lora_rank=kv_lora_rank,
+            max_position_embeddings=max_position_embeddings,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            topk_indices_buffer=None,
+        )
+
+        self.is_layer_sparse = self._is_layer_sparse()
+        if self.is_layer_sparse:
+            self.mlp = AXK1MoE(
+                config=config,
+                parallel_config=parallel_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = AXK1MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+    def _is_layer_sparse(self) -> bool:
+        return (
+            self.config.n_routed_experts is not None
+            and self.layer_idx >= self.config.first_k_dense_replace
+            and self.layer_idx % self.config.moe_layer_freq == 0
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        llama_4_scaling: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states.clone()
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        attn_kwargs = {
+            "positions": positions,
+            "hidden_states": hidden_states,
+        }
+        if not self.use_mha:
+            attn_kwargs["llama_4_scaling"] = llama_4_scaling
+        hidden_states = self.self_attn(**attn_kwargs)
+
+        if (
+            not isinstance(self.self_attn, DeepseekAttention)
+            and hidden_states.dtype == torch.float16
+        ):
+            # Fix FP16 overflow
+            # We scale both hidden_states and residual before
+            # rmsnorm, and rmsnorm result would not affect by scale.
+            hidden_states *= 1.0 / self.routed_scaling_factor
+            if self.layer_idx == 0:
+                # The residual is shared by all layers, we only scale it on
+                # first layer.
+                residual *= 1.0 / self.routed_scaling_factor
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        if self.is_layer_sparse:
+            hidden_states = self.post_mlp_layernorm(hidden_states)
+
+        if isinstance(self.mlp, AXK1MLP) and hidden_states.dtype == torch.float16:
+            # Fix FP16 overflow
+            # Scaling the AXK1MLP output, it is the input of
+            # input_layernorm of next decoder layer.
+            # The scaling of AXK1MOE output would be done in the forward
+            # of AXK1MOE
+            hidden_states *= 1.0 / self.routed_scaling_factor
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class AXK1Model(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: AXK1Config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.device = current_platform.device_type
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: AXK1DecoderLayer(vllm_config, prefix),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        # Compute llama 4 scaling once per forward pass if enabled
+        llama_4_scaling_config = getattr(self.config, "llama_4_scaling", None)
+        llama_4_scaling: torch.Tensor | None
+        if llama_4_scaling_config is not None:
+            llama_4_scaling = _get_llama_4_scaling(
+                original_max_position_embeddings=llama_4_scaling_config[
+                    "original_max_position_embeddings"
+                ],
+                scaling_beta=llama_4_scaling_config["beta"],
+                positions=positions,
+            )
+        else:
+            llama_4_scaling = None
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(
+                positions, hidden_states, residual, llama_4_scaling
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class AXK1MixtureOfExperts(MixtureOfExperts):
+    moe_mlp_layers: list[AXK1MoE]
+    """
+    List of MoE MLP layers in the model.
+    """
+
+    def extract_moe_parameters(self, example_moe: AXK1MoE | None):
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("AXK1: No AXK1MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class AXK1ForCausalLM(
+    nn.Module, SupportsPP, AXK1MixtureOfExperts, SupportsLoRA, SupportsEagle
+):
+    packed_modules_mapping = {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    model_cls = AXK1Model
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config: AXK1Config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+
+        qk_nope_head_dim = config.qk_nope_head_dim
+        qk_rope_head_dim = config.qk_rope_head_dim
+        self.use_mha = all(dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim))
+
+        if self.use_mha:
+            self.packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
+
+        # `packed_modules_mapping` needs to be modified before
+        # initializing AXK1Model, as it is passed inplace to
+        # quantization config init and may be used to select the
+        # quant_method for relevant layers during initialization.
+        self.fuse_qkv_a_proj = config.q_lora_rank is not None
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.model = self.model_cls(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+        # Set MoE hyperparameters
+        self.num_moe_layers = (
+            self.config.num_hidden_layers - self.config.first_k_dense_replace
+        )
+        self.set_moe_parameters()
+
+    def set_moe_parameters(self):
+        self.expert_weights = []
+
+        self.num_expert_groups = getattr(self.config, "n_group", 1)
+
+        self.moe_layers = []
+        self.moe_mlp_layers = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, AXK1DecoderLayer)
+            if isinstance(layer.mlp, AXK1MoE):
+                # Pick last one layer since the first ones may be dense layers.
+                example_moe = layer.mlp
+                self.moe_mlp_layers.append(layer.mlp)
+                self.moe_layers.append(layer.mlp.experts)
+
+        self.extract_moe_parameters(example_moe)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return SharedFusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        rocm_aiter_moe_shared_expert_enabled = (
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+        )
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        mla_params_mapping = [
+            ("fused_qkv_a_proj", "q_a_proj", 0),
+            ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
+        ]
+        mha_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        if self.use_mha:
+            stacked_params_mapping.extend(mha_params_mapping)
+        else:
+            stacked_params_mapping.extend(mla_params_mapping)
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts
+            + (
+                self.config.n_shared_experts
+                if rocm_aiter_moe_shared_expert_enabled
+                else 0
+            ),
+            num_redundant_experts=self.num_redundant_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            is_fusion_moe_shared_experts_layer = (
+                rocm_aiter_moe_shared_expert_enabled and ("mlp.shared_experts" in name)
+            )
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                if is_fusion_moe_shared_experts_layer:
+                    continue
+                name_mapped = name.replace(weight_name, param_name)
+
+                # QKV fusion is optional, fall back to normal
+                # weight loading if it's not enabled
+                # if go with fusion option, then update name
+                if (
+                    param_name == "fused_qkv_a_proj"
+                ) and name_mapped not in params_dict:
+                    continue
+                else:
+                    name = name_mapped
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                is_expert_weight = False
+
+                # Special handling: when AITER fusion_shared_experts is enabled,
+                # checkpoints may provide a single widened shared_experts tensor
+                # without explicit expert indices
+                # (e.g. ...mlp.shared_experts.gate_proj.weight).
+                # For models with multiple shared experts, split that tensor
+                # evenly into per-shared-expert slices and load them into
+                # appended expert slots mlp.experts.{n_routed_experts + j}.*
+                # accordingly.
+                num_chunks = 1
+                if is_fusion_moe_shared_experts_layer:
+                    num_chunks = getattr(self.config, "n_shared_experts", 1) or 1
+                    # Determine split axis based on op type
+                    # gate/up: ColumnParallel → split along dim 0
+                    # down: RowParallel → split along dim 1
+                    split_dim = (
+                        1
+                        if ("down_proj.weight" in name and loaded_weight.ndim > 1)
+                        else 0
+                    )
+                    total = loaded_weight.shape[split_dim]
+                    assert total % num_chunks == 0, (
+                        f"Shared expert weight dim {total} "
+                        f"not divisible by num_chunks {num_chunks}"
+                    )
+                    chunk_size = total // num_chunks
+
+                for j in range(num_chunks):
+                    chunk_name = name
+                    weight_to_load = loaded_weight
+
+                    if is_fusion_moe_shared_experts_layer:
+                        chunk_slice = slice(j * chunk_size, (j + 1) * chunk_size)
+                        if loaded_weight.ndim == 1:
+                            weight_to_load = loaded_weight[chunk_slice]
+                        elif split_dim == 0:
+                            weight_to_load = loaded_weight[chunk_slice, :]
+                        else:
+                            weight_to_load = loaded_weight[:, chunk_slice]
+                        # Synthesize an expert-style name so expert mapping
+                        # can route it
+                        chunk_name = name.replace(
+                            "mlp.shared_experts",
+                            f"mlp.experts.{self.config.n_routed_experts + j}",
+                        )
+
+                    # Use expert_params_mapping to locate the destination
+                    # param and delegate to its expert-aware weight_loader
+                    # with expert_id.
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if weight_name not in chunk_name:
+                            continue
+
+                        # Anyway, this is an expert weight and should not be
+                        # attempted to load as other weights later
+                        is_expert_weight = True
+
+                        # Do not modify `name` since the loop may continue here
+                        # Instead, create a new variable
+                        name_mapped = chunk_name.replace(weight_name, param_name)
+
+                        if is_pp_missing_parameter(name_mapped, self):
+                            continue
+
+                        param = params_dict[name_mapped]
+                        # We should ask the weight loader to return success or
+                        # not here since otherwise we may skip experts with
+                        # other available replicas.
+                        weight_loader = typing.cast(
+                            Callable[..., bool], param.weight_loader
+                        )
+                        success = weight_loader(
+                            param,
+                            weight_to_load,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                            return_success=True,
+                        )
+                        if success:
+                            if not is_fusion_moe_shared_experts_layer:
+                                name = name_mapped
+                            else:
+                                loaded_params.add(name_mapped)
+                            break
+                    else:
+                        if is_expert_weight:
+                            # We've checked that this is an expert weight
+                            # However it's not mapped locally to this rank
+                            # So we simply skip it
+                            continue
+
+                        # Skip loading extra bias for GPTQ models.
+                        if name.endswith(".bias") and name not in params_dict:
+                            continue
+
+                        # Remapping the name of FP8 kv-scale.
+                        name = maybe_remap_kv_scale_name(name, params_dict)
+                        if name is None:
+                            continue
+
+                        if is_pp_missing_parameter(name, self):
+                            continue
+
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+            if not is_fusion_moe_shared_experts_layer:
+                loaded_params.add(name)
+
+        return loaded_params
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: AXK1Config, weight_name: str
+) -> int | None:
+    if config.num_nextn_predict_layers and config.num_nextn_predict_layers > 0:
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+                return layer_idx + i
+    return None
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index cc871f9d344a..75d656d499b5 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -75,6 +75,7 @@
     "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
     "ArceeForCausalLM": ("arcee", "ArceeForCausalLM"),
     "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
+    "AXK1ForCausalLM": ("AXK1", "AXK1ForCausalLM"),
     # baichuan-7b, upper case 'C' in the class name
     "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),
     # baichuan-13b, lower case 'c' in the class name
diff --git a/vllm/transformers_utils/configs/AXK1.py b/vllm/transformers_utils/configs/AXK1.py
new file mode 100644
index 000000000000..5c19a37324b0
--- /dev/null
+++ b/vllm/transformers_utils/configs/AXK1.py
@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers import PretrainedConfig
+
+
+class AXK1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`AXK1Model`].
+    It is used to instantiate an A.X model according to the specified arguments,
+    defining the model architecture. Instantiating a configuration with the defaults
+    will yield a similar configuration to that of the A.X K1.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control
+    the model outputs. Read the documentation from [`PretrainedConfig`] for more
+    information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 163840):
+            Vocabulary size of the A.X K1 model. Defines the number of different
+            tokens that can be represented by the `inputs_ids` passed when calling
+            [`AXK1Model`]
+        hidden_size (`int`, *optional*, defaults to 7168):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 18432):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MoE representations.
+        num_hidden_layers (`int`, *optional*, defaults to 61):
+            Number of hidden layers in the Transformer decoder.
+        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
+            Number of nextn predict layers in the AXK1 Model.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer
+            decoder.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts, None means dense model.
+        n_routed_experts (`int`, *optional*, defaults to 192):
+            Number of routed experts, None means dense model.
+        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+            Scaling factor or routed experts.
+        topk_method (`str`, *optional*, defaults to `noaux_tc`):
+            Topk method used in routed gate.
+        n_group (`int`, *optional*, defaults to 8):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 4):
+            Number of selected groups for each token(for each token, ensuring the
+            selected experts is only within `topk_group` groups).
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of selected experts, None means dense model.
+        moe_layer_freq (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer: one expert layer for every
+            `moe_layer_freq - 1` dense layers.
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers
+            (embed->dense->dense->...->dense->moe->moe...->lm_head).
+                      \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to True):
+            Whether to normalize the weights of the routed experts.
+        scoring_func (`str`, *optional*, defaults to 'sigmoid'):
+            Method of computing expert weights.
+        aux_loss_alpha (`float`, *optional*, defaults to 0.0001):
+            Auxiliary loss weight coefficient.
+        seq_aux = (`bool`, *optional*, defaults to True):
+            Whether to compute the auxiliary loss for each individual sample.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement
+            Grouped Query Attention. If `num_key_value_heads=num_attention_heads`,
+            the model will use Multi Head Attention (MHA), if `num_key_value_heads=1
+            the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and
+            value head should be constructed by meanpooling all the original heads
+            within that group. For more details checkout
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf).
+            If it is not specified, will default to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions
+            (not used by all models). Only relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 163691):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 163691):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining.
+            Please refer to
+            [this document](https://huggingface.co/docs/transformers/parallelism)
+            to understand more about it. This value is necessary to ensure exact
+            reproducibility of the pretraining results. Please refer to
+            [this issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings.
+            Currently supports two scaling strategies: linear and dynamic.
+            Their scaling factor must be a float greater than 1. The expected format
+            is  `{"type": strategy name, "factor": scaling factor}`. When using this
+            flag, don't update `max_position_embeddings` to the expected new maximum.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection
+            layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    """
+
+    model_type = "AXK1"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 163840,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        moe_intermediate_size: int = 2048,
+        num_hidden_layers: int = 61,
+        num_nextn_predict_layers: int | None = 1,
+        num_attention_heads: int = 64,
+        num_key_value_heads: int = 64,
+        n_shared_experts: int | None = 1,
+        n_routed_experts: int | None = 192,
+        ep_size: int | None = 8,  ## Ignored - Expert parallel size
+        routed_scaling_factor: float | None = 2.5,
+        kv_lora_rank: int | None = 512,
+        q_lora_rank: int | None = 1536,
+        qk_rope_head_dim: int | None = 64,
+        v_head_dim: int | None = 128,
+        qk_nope_head_dim: int | None = 128,
+        topk_method: str | None = "noaux_tc",
+        n_group: int | None = 8,
+        topk_group: int | None = 4,
+        num_experts_per_tok: int | None = 8,
+        moe_layer_freq: int | None = 1,
+        first_k_dense_replace: int = 1,
+        norm_topk_prob: bool = True,
+        scoring_func: str | None = "sigmoid",
+        aux_loss_alpha: float | None = 0.0001,
+        seq_aux: float | None = True,
+        hidden_act: str | None = "silu",
+        max_position_embeddings: int | None = 131072,
+        initializer_range: float | None = 0.02,
+        rms_norm_eps: float = 1e-6,
+        use_cache: bool | None = True,
+        pad_token_id: int | None = None,
+        bos_token_id: int | None = 163691,
+        eos_token_id: int | None = 163691,
+        pretraining_tp: int | None = 1,
+        tie_word_embeddings: bool | None = False,
+        rope_theta: float | None = 10000.0,
+        rope_scaling: dict[str, Any] | None = None,
+        rope_parameters: dict[str, Any] | None = None,
+        attention_bias: bool | None = False,
+        attention_dropout: float | None = 0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rope_parameters = rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 541bc4de61b1..8b5d08b8a09e 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -16,6 +16,7 @@
 
 _CLASS_TO_MODULE: dict[str, str] = {
     "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
+    "AXK1Config": "vllm.transformers_utils.configs.AXK1",
     "BagelConfig": "vllm.transformers_utils.configs.bagel",
     "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
     "ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert",
@@ -70,6 +71,7 @@
 
 __all__ = [
     "AfmoeConfig",
+    "AXK1Config",
     "BagelConfig",
     "ChatGLMConfig",
     "ColModernVBertConfig",
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index b4e6508fad79..bb45f137e395 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -233,6 +233,7 @@ def is_deepseek_mla(self) -> bool:
         if not hasattr(self.hf_text_config, "model_type"):
             return False
         elif self.hf_text_config.model_type in (
+            "AXK1",
             "deepseek_v2",
             "deepseek_v3",
             "deepseek_v32",
@@ -253,7 +254,13 @@ def is_deepseek_mla(self) -> bool:
             # underlying architecture
             return (
                 self.hf_text_config.model.model_type
-                in ("deepseek_v2", "deepseek_v3", "deepseek_v32", "deepseek_mtp")
+                in (
+                    "AXK1",
+                    "deepseek_v2",
+                    "deepseek_v3",
+                    "deepseek_v32",
+                    "deepseek_mtp",
+                )
                 and self.hf_text_config.kv_lora_rank is not None
             )
         return False

From 1d897ff04f90c46041ed3966dea671a6ae532184 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 27 Feb 2026 09:34:37 -0800
Subject: [PATCH 078/154] [Misc] Fill in some v1 CODEOWNERS gaps (#35524)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
---
 .github/CODEOWNERS | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 047ece980834..653d6c42e9af 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -54,11 +54,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/kv_offload @ApostaC @orozery
-/vllm/v1/worker/gpu/kv_connector.py @orozery
+/vllm/v1/engine @njhill
+/vllm/v1/executor @njhill
+/vllm/v1/worker @njhill
 /vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
 
 # Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon
+/vllm/v1/worker/gpu @WoosukKwon @njhill
+/vllm/v1/worker/gpu/kv_connector.py @orozery
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 

From 157722da756daa6f967433903680745abc0c4861 Mon Sep 17 00:00:00 2001
From: Huamin Li <3ericli@gmail.com>
Date: Fri, 27 Feb 2026 09:50:37 -0800
Subject: [PATCH 079/154] [perf] Use pinned memory for async H2D transfer in
 do_mamba_copy_block (#35480)

Signed-off-by: Huamin Li <3ericli@gmail.com>
---
 tests/v1/e2e/test_mamba_prefix_cache.py | 20 +++---
 tests/v1/worker/test_mamba_utils.py     |  1 +
 vllm/v1/worker/gpu_model_runner.py      | 14 ++++
 vllm/v1/worker/mamba_utils.py           | 94 ++++++++++++++++---------
 4 files changed, 85 insertions(+), 44 deletions(-)

diff --git a/tests/v1/e2e/test_mamba_prefix_cache.py b/tests/v1/e2e/test_mamba_prefix_cache.py
index 38cfdcdb3ecf..5aa72ccb3b03 100644
--- a/tests/v1/e2e/test_mamba_prefix_cache.py
+++ b/tests/v1/e2e/test_mamba_prefix_cache.py
@@ -325,6 +325,7 @@ def fake_preprocess_mamba_fn(
         requests: dict[str, CachedRequestState],
         forward_context: dict[str, Any],
         mamba_state_copy_funcs: tuple[MambaStateCopyFunc, ...],
+        copy_bufs: mamba_utils.MambaCopyBuffers,
     ):
         nonlocal copy_info
         copy_info = None
@@ -337,6 +338,7 @@ def fake_preprocess_mamba_fn(
             requests,
             forward_context,
             mamba_state_copy_funcs,
+            copy_bufs,
         )
         if cur_step_action is not None:
             check_copy_info(
@@ -355,6 +357,7 @@ def fake_post_process_mamba_fn(
         mamba_state_idx: dict[str, int],
         forward_context: dict[str, Any],
         mamba_state_copy_funcs: tuple[MambaStateCopyFunc, ...],
+        copy_bufs: mamba_utils.MambaCopyBuffers,
     ):
         nonlocal copy_info
         copy_info = None
@@ -366,6 +369,7 @@ def fake_post_process_mamba_fn(
             mamba_state_idx,
             forward_context,
             mamba_state_copy_funcs,
+            copy_bufs,
         )
         if cur_step_action is not None:
             check_copy_info(
@@ -376,19 +380,15 @@ def fake_post_process_mamba_fn(
             )
         return ret
 
-    def fake_copy_fn(
-        src_state_list: list[int],
-        dest_state_list: list[int],
-        num_elements_list: list[int],
-    ):
+    def fake_copy_fn(copy_bufs: mamba_utils.MambaCopyBuffers):
         nonlocal copy_info
         assert copy_info is None
+        n = copy_bufs.offset
+        src_state_list = copy_bufs.src_ptrs.cpu[:n].tolist()
+        dest_state_list = copy_bufs.dst_ptrs.cpu[:n].tolist()
+        num_elements_list = copy_bufs.sizes.cpu[:n].tolist()
         copy_info = (src_state_list, dest_state_list, num_elements_list)
-        return original_copy_fn(
-            src_state_list,
-            dest_state_list,
-            num_elements_list,
-        )
+        return original_copy_fn(copy_bufs)
 
     return fake_preprocess_mamba_fn, fake_post_process_mamba_fn, fake_copy_fn
 
diff --git a/tests/v1/worker/test_mamba_utils.py b/tests/v1/worker/test_mamba_utils.py
index 38eb250fb17a..df3b7de9b4c9 100644
--- a/tests/v1/worker/test_mamba_utils.py
+++ b/tests/v1/worker/test_mamba_utils.py
@@ -62,6 +62,7 @@ def test_resumed_req_ids_cleared_from_mamba_state_idx():
             {},
             {},
             (),
+            MagicMock(),
         )
 
     assert mamba_state_idx == {"keep": 99}
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a3e0adfae218..768a7ee4b3f5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -755,6 +755,7 @@ def __init__(
         self.execute_model_state: ExecuteModelState | None = None
         self.kv_connector_output: KVConnectorOutput | None = None
         self.mamba_state_idx: dict[str, int] = {}
+        self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None
         self.layerwise_nvtx_hooks_registered = False
 
     def update_max_model_len(self, max_model_len: int) -> None:
@@ -849,6 +850,16 @@ def _make_buffer(
             with_numpy=numpy,
         )
 
+    def _get_mamba_copy_bufs(self) -> mamba_utils.MambaCopyBuffers:
+        if self._mamba_copy_bufs is None:
+            self._mamba_copy_bufs = mamba_utils.MambaCopyBuffers.create(
+                self.max_num_reqs,
+                self.kv_cache_config,
+                self.model.get_mamba_state_copy_func(),
+                self._make_buffer,
+            )
+        return self._mamba_copy_bufs
+
     def _init_model_kwargs(self):
         model_kwargs = dict[str, Any]()
 
@@ -1211,6 +1222,7 @@ def _update_states_after_model_execute(
                 self.mamba_state_idx,
                 self.compilation_config.static_forward_context,
                 self.model.get_mamba_state_copy_func(),
+                self._get_mamba_copy_bufs(),
             )
 
     def _update_streaming_request(
@@ -3505,6 +3517,7 @@ def execute_model(
                     self.requests,
                     self.compilation_config.static_forward_context,
                     self.model.get_mamba_state_copy_func(),
+                    self._get_mamba_copy_bufs(),
                 )
 
             use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
@@ -5997,6 +6010,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         kv_cache_config = deepcopy(kv_cache_config)
         self.kv_cache_config = kv_cache_config
+        self._mamba_copy_bufs = None
         self.may_add_encoder_only_layers_to_kv_cache_config()
         self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
         self.initialize_attn_backend(kv_cache_config)
diff --git a/vllm/v1/worker/mamba_utils.py b/vllm/v1/worker/mamba_utils.py
index 4f8a3bd050dc..2bd5d2b3fea8 100644
--- a/vllm/v1/worker/mamba_utils.py
+++ b/vllm/v1/worker/mamba_utils.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import dataclasses
 import itertools
+from collections.abc import Callable
 from typing import Any
 
 import torch
@@ -13,6 +15,7 @@
 from vllm.utils.math_utils import cdiv
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, MambaSpec
+from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.gpu_input_batch import CachedRequestState
 from vllm.v1.worker.lora_model_runner_mixin import GPUInputBatch
 
@@ -59,10 +62,36 @@ def get_mamba_groups(kv_cache_config: KVCacheConfig) -> tuple[list[int], MambaSp
     return mamba_group_ids, mamba_specs[0]
 
 
+@dataclasses.dataclass
+class MambaCopyBuffers:
+    src_ptrs: CpuGpuBuffer
+    dst_ptrs: CpuGpuBuffer
+    sizes: CpuGpuBuffer
+    offset: int = 0
+
+    @classmethod
+    def create(
+        cls,
+        max_num_reqs: int,
+        kv_cache_config: KVCacheConfig,
+        copy_funcs: tuple[MambaStateCopyFunc, ...],
+        make_buffer: Callable[..., CpuGpuBuffer],
+    ) -> "MambaCopyBuffers":
+        mamba_group_ids, _ = get_mamba_groups(kv_cache_config)
+        entries_per_req = sum(
+            len(kv_cache_config.kv_cache_groups[gid].layer_names)
+            for gid in mamba_group_ids
+        ) * len(copy_funcs)
+        n = max_num_reqs * entries_per_req
+        return cls(
+            src_ptrs=make_buffer(n, dtype=torch.int64),
+            dst_ptrs=make_buffer(n, dtype=torch.int64),
+            sizes=make_buffer(n, dtype=torch.int32),
+        )
+
+
 def collect_mamba_copy_meta(
-    src_state_list: list[int],
-    dest_state_list: list[int],
-    num_elements_list: list[int],
+    copy_bufs: MambaCopyBuffers,
     kv_cache_config: KVCacheConfig,
     mamba_state_copy_funcs: tuple[MambaStateCopyFunc, ...],
     mamba_group_ids: list[int],
@@ -71,10 +100,15 @@ def collect_mamba_copy_meta(
     accept_token_bias: int,
     req_state: CachedRequestState,
     forward_context: dict[str, Any],
-):
+) -> None:
     if src_block_idx == dest_block_idx and accept_token_bias == 0:
         return
 
+    src_ptrs_np = copy_bufs.src_ptrs.np
+    dst_ptrs_np = copy_bufs.dst_ptrs.np
+    sizes_np = copy_bufs.sizes.np
+    offset = copy_bufs.offset
+
     for mamba_group_id in mamba_group_ids:
         block_ids = req_state.block_ids[mamba_group_id]
         dest_block_id = block_ids[dest_block_idx]
@@ -87,25 +121,23 @@ def collect_mamba_copy_meta(
                     state, block_ids, src_block_idx, accept_token_bias + 1
                 )
 
-                src_state_list.append(copy_spec.start_addr)
-                dest_state_list.append(state[dest_block_id].data_ptr())
-                num_elements_list.append(copy_spec.num_elements * state.element_size())
+                src_ptrs_np[offset] = copy_spec.start_addr
+                dst_ptrs_np[offset] = state[dest_block_id].data_ptr()
+                sizes_np[offset] = copy_spec.num_elements * state.element_size()
+                offset += 1
 
+    copy_bufs.offset = offset
 
-def do_mamba_copy_block(
-    src_state_list: list[int],
-    dest_state_list: list[int],
-    num_elements_list: list[int],
-):
-    if len(src_state_list) == 0:
-        return
-    assert len(src_state_list) == len(dest_state_list)
-    assert len(src_state_list) == len(num_elements_list)
-    src_state_ptrs = torch.tensor(src_state_list, device="cuda", dtype=torch.int64)
-    dst_state_ptrs = torch.tensor(dest_state_list, device="cuda", dtype=torch.int64)
-    num_elements = torch.tensor(num_elements_list, device="cuda", dtype=torch.int32)
 
-    batch_memcpy(src_state_ptrs, dst_state_ptrs, num_elements)
+def do_mamba_copy_block(copy_bufs: MambaCopyBuffers):
+    n = copy_bufs.offset
+    if n == 0:
+        return
+    batch_memcpy(
+        copy_bufs.src_ptrs.copy_to_gpu(n),
+        copy_bufs.dst_ptrs.copy_to_gpu(n),
+        copy_bufs.sizes.copy_to_gpu(n),
+    )
 
 
 def preprocess_mamba(
@@ -117,6 +149,7 @@ def preprocess_mamba(
     requests: dict[str, CachedRequestState],
     forward_context: dict[str, Any],
     mamba_state_copy_funcs: tuple[MambaStateCopyFunc, ...],
+    copy_bufs: MambaCopyBuffers,
 ):
     """
     Copy the mamba state of previous step to the last
@@ -138,9 +171,7 @@ def preprocess_mamba(
     for req_id in itertools.chain(finished_req_ids, preempted_req_ids, resumed_req_ids):
         mamba_state_idx.pop(req_id, None)
 
-    src_state_list: list[int] = []
-    dest_state_list: list[int] = []
-    num_elements_list: list[int] = []
+    copy_bufs.offset = 0
     for i, req_id in enumerate(input_batch.req_ids):
         req_state = requests[req_id]
         prev_state_idx = mamba_state_idx.get(req_id)
@@ -169,9 +200,7 @@ def preprocess_mamba(
         mamba_state_idx[req_id] = curr_state_idx
         if prev_state_idx != -1 and prev_state_idx != curr_state_idx:
             collect_mamba_copy_meta(
-                src_state_list,
-                dest_state_list,
-                num_elements_list,
+                copy_bufs,
                 kv_cache_config,
                 mamba_state_copy_funcs,
                 mamba_group_ids,
@@ -182,7 +211,7 @@ def preprocess_mamba(
                 forward_context,
             )
             input_batch.num_accepted_tokens_cpu[i] = 1
-    do_mamba_copy_block(src_state_list, dest_state_list, num_elements_list)
+    do_mamba_copy_block(copy_bufs)
 
 
 def postprocess_mamba(
@@ -193,6 +222,7 @@ def postprocess_mamba(
     mamba_state_idx: dict[str, int],
     forward_context: dict[str, Any],
     mamba_state_copy_funcs: tuple[MambaStateCopyFunc, ...],
+    copy_bufs: MambaCopyBuffers,
 ):
     """
     If a blocks is converted from partial block to full block in this step, copy the
@@ -203,9 +233,7 @@ def postprocess_mamba(
     num_accepted_tokens_cpu = input_batch.num_accepted_tokens_cpu
     # NOTE: can be optimized as this function always returns the same result
     mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config)
-    src_state_list: list[int] = []
-    dest_state_list: list[int] = []
-    num_elements_list: list[int] = []
+    copy_bufs.offset = 0
     for i, req_id in enumerate(input_batch.req_ids):
         req_state = requests[req_id]
         num_computed_tokens = req_state.num_computed_tokens
@@ -225,9 +253,7 @@ def postprocess_mamba(
             src_block_idx = mamba_state_idx[req_id]
             dest_block_idx = aligned_new_computed_tokens // mamba_spec.block_size - 1
             collect_mamba_copy_meta(
-                src_state_list,
-                dest_state_list,
-                num_elements_list,
+                copy_bufs,
                 kv_cache_config,
                 mamba_state_copy_funcs,
                 mamba_group_ids,
@@ -239,4 +265,4 @@ def postprocess_mamba(
             )
             if src_block_idx == dest_block_idx:
                 num_accepted_tokens_cpu[i] = 1
-    do_mamba_copy_block(src_state_list, dest_state_list, num_elements_list)
+    do_mamba_copy_block(copy_bufs)

From b602e4f299a596a14402e6a4ead5e51abb180c49 Mon Sep 17 00:00:00 2001
From: Martin Hickey <martin.hickey@ie.ibm.com>
Date: Fri, 27 Feb 2026 17:51:09 +0000
Subject: [PATCH 080/154] [Doc] Fix link to Llama chat template for usability
 (#35525)

Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/serving/openai_compatible_server.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 97ed7d45fbbf..1053b614eb57 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -84,7 +84,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
 a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
 specifies how roles, messages, and other chat-specific tokens are encoded in the input.
 
-An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
+An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/#prompt-template-for-meta-llama-3)
 
 Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
 you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat

From c8aca0c9e1b35ee4a1683a01467e638b23076a37 Mon Sep 17 00:00:00 2001
From: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
Date: Fri, 27 Feb 2026 20:07:38 +0200
Subject: [PATCH 081/154] Support parakeet as audio encoder for
 nemotron-nano-vl (#35100)

Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
---
 .../model_executor/models/nano_nemotron_vl.py | 274 ++++++++++++++++--
 vllm/model_executor/models/parakeet.py        | 145 +++++++++
 vllm/transformers_utils/configs/parakeet.py   |  49 ++++
 3 files changed, 448 insertions(+), 20 deletions(-)
 create mode 100644 vllm/model_executor/models/parakeet.py
 create mode 100644 vllm/transformers_utils/configs/parakeet.py

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 46cf7fe97829..51b36b1cae38 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -44,6 +44,7 @@
 )
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM
+from vllm.model_executor.models.parakeet import ParakeetExtractor, ProjectedParakeet
 from vllm.model_executor.models.radio import RadioModel, calc_seq_lens
 from vllm.model_executor.models.utils import (
     init_vllm_registered_model,
@@ -55,12 +56,14 @@
     compute_retention_mask,
 )
 from vllm.multimodal.inputs import (
+    AudioItem,
     MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
 )
 from vllm.multimodal.parse import (
+    AudioProcessorItems,
     ImageEmbeddingItems,
     ImageProcessorItems,
     ImageSize,
@@ -91,9 +94,29 @@
 # Alternative: Set a specific higher limit
 # Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
 
+
+class NanoNemotronVLAudioFeatureInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Number of audio clips
+        - t: Audio feature length
+        - f: Feature size (mel bins)
+    """
+
+    type: Literal["audio_features"] = "audio_features"
+    input_audio_features: Annotated[torch.Tensor, TensorShape("b", "t", "f")]
+    feature_attention_mask: Annotated[torch.Tensor, TensorShape("b", "t")]
+    audio_feature_lengths: Annotated[torch.Tensor, TensorShape("b")]
+
+
+MAX_AUDIO_LEN_S = 10 * 60  # 10 minutes
+
 IMG_START = "<img>"
 IMG_END = "</img>"
 IMG_CONTEXT = "<image>"
+AUDIO_START = "<so_start>"
+AUDIO_END = "<so_end>"
+AUDIO_CONTEXT = "<so_embedding>"
 
 # Profiling
 # MAX_FRAMES = 16
@@ -820,6 +843,11 @@ def __init__(
         self.video_token = video_token
         self.video_pruning_rate = video_pruning_rate
 
+        self.audio_extractor: ParakeetExtractor | None = None
+        raw_sound_config = getattr(config, "sound_config", None)
+        if raw_sound_config is not None:
+            self.audio_extractor = ParakeetExtractor(raw_sound_config)
+
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
         self._img_start_token_ids = tokenizer.encode(
@@ -952,11 +980,53 @@ def _preprocess_video(
                 text = [t.replace("<video>", video_repl_text, 1) for t in text]
         return text, video_inputs
 
+    def _preprocess_audio(
+        self,
+        text: list[str],
+        audios: list[npt.NDArray],
+    ):
+        if len(audios) == 0:
+            return text, {}
+        assert self.audio_extractor is not None
+
+        extractor = self.audio_extractor
+
+        parts = [x for x in re.split(f"({re.escape(AUDIO_CONTEXT)})", text[0]) if x]
+        token_count = parts.count(AUDIO_CONTEXT)
+        if token_count != len(audios):
+            raise ValueError(
+                "Number of audio tokens in text does not match the number "
+                f"of audios (tokens={token_count}, audios={len(audios)})."
+            )
+        audio_index = 0
+        for idx, part in enumerate(parts):
+            if part == AUDIO_CONTEXT:
+                audio_repl = self.get_audio_repl(audios[audio_index])
+                parts[idx] = audio_repl.full
+                audio_index += 1
+        text = ["".join(parts)]
+        audio_inputs = extractor(
+            audios,
+            sampling_rate=extractor.sampling_rate,
+            return_tensors="pt",
+        )
+        input_audio_features = audio_inputs.input_features
+        feature_attention_mask = audio_inputs.attention_mask
+        audio_feature_lengths = feature_attention_mask.sum(dim=1)
+        audio_inputs = {
+            "input_audio_features": input_audio_features,
+            "feature_attention_mask": feature_attention_mask,
+            "audio_feature_lengths": audio_feature_lengths,
+        }
+
+        return text, audio_inputs
+
     def __call__(
         self,
         text: str | list[str] | None = None,
         images: Image.Image | list[Image.Image] | None = None,
         videos: list[tuple[npt.NDArray, dict[str, Any]]] | None = None,
+        audios: AudioItem | list[AudioItem] | None = None,
         return_tensors: str | TensorType | None = None,
         max_num_tiles: int | None = None,
     ) -> BatchFeature:
@@ -964,8 +1034,8 @@ def __call__(
         if max_num_tiles is None:
             max_num_tiles = self.max_num_tiles
 
-        text, images, videos = [
-            self._make_batch_input(x) for x in (text, images, videos)
+        text, images, videos, audios = [
+            self._make_batch_input(x) for x in (text, images, videos, audios)
         ]
 
         text, image_inputs = self._preprocess_image(
@@ -980,17 +1050,22 @@ def __call__(
             max_num_tiles=1,
         )
 
+        text, audio_inputs = self._preprocess_audio(
+            text=text,
+            audios=audios,
+        )
+
         text_inputs = self.tokenizer(text, add_special_tokens=False)
 
+        combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}
+
         if self.dynamic_tiler is None:
             batch = BatchFeature(
-                {**text_inputs, **video_inputs, **image_inputs},
+                {**combined_inputs, **image_inputs},
                 tensor_type=return_tensors,
             )
         else:
-            batch = BatchFeature(
-                {**text_inputs, **video_inputs}, tensor_type=return_tensors
-            )
+            batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
             # allow images to be exempt from the BatchFeature validation:
             # We will .stack() them in _parse_and_validate_image_input
             batch.update(image_inputs)
@@ -1006,6 +1081,15 @@ def get_image_repl(
 
         return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
 
+    def get_audio_repl(
+        self,
+        audio: npt.NDArray,
+    ) -> PromptUpdateDetails[str]:
+        assert self.audio_extractor is not None
+        num_tokens = self.audio_extractor.audio_token_count(len(audio))
+        repl_full = f"{AUDIO_START}{AUDIO_CONTEXT * num_tokens}{AUDIO_END}"
+        return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)
+
     @classmethod
     def get_video_repl(
         cls,
@@ -1147,15 +1231,28 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
     def supports_video(self):
         return self.get_hf_processor().supports_video
 
+    @property
+    def audio_extractor(self) -> ParakeetExtractor | None:
+        return self.get_hf_processor().audio_extractor
+
     def get_data_parser(self):
+        target_sr = None
+        target_channels = None
+        if extractor := self.audio_extractor:
+            target_sr = extractor.sampling_rate
+            target_channels = 1
+
         return MultiModalDataParser(
             video_needs_metadata=True,
+            target_sr=target_sr,
+            target_channels=target_channels,
             expected_hidden_size=self._get_expected_hidden_size(),
         )
 
     def get_supported_mm_limits(self):
         video_limit = {"video": None} if self.supports_video else {}
-        return {**super().get_supported_mm_limits(), **video_limit}
+        audio_limit = {"audio": None} if self.audio_extractor is not None else {}
+        return {**super().get_supported_mm_limits(), **video_limit, **audio_limit}
 
     def get_video_token(self) -> str | None:
         return IMG_CONTEXT
@@ -1304,7 +1401,16 @@ def _get_mm_fields_config(
         else:
             video_fields = {}
 
-        return image_fields | video_fields
+        if self.info.audio_extractor is not None:
+            audio_fields = dict(
+                input_audio_features=MultiModalFieldConfig.batched("audio"),
+                feature_attention_mask=MultiModalFieldConfig.batched("audio"),
+                audio_feature_lengths=MultiModalFieldConfig.batched("audio"),
+            )
+        else:
+            audio_fields = {}
+
+        return image_fields | video_fields | audio_fields
 
     def _get_prompt_updates(
         self,
@@ -1373,6 +1479,20 @@ def get_video_replacement_internvl(item_idx: int):
                 ),
             ]
 
+        def get_audio_replacement(item_idx: int):
+            audios = mm_items.get_items("audio", AudioProcessorItems)
+            return hf_processor.get_audio_repl(audios.get(item_idx))
+
+        if self.info.audio_extractor is not None:
+            prompt_repl = [
+                *prompt_repl,
+                PromptReplacement(
+                    modality="audio",
+                    target=AUDIO_CONTEXT,
+                    replacement=get_audio_replacement,
+                ),
+            ]
+
         return prompt_repl
 
 
@@ -1422,8 +1542,13 @@ class NanoNemotronVLDummyInputsBuilder(
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_videos = mm_counts.get("video", 0)
+        num_audios = mm_counts.get("audio", 0)
 
-        return super().get_dummy_text(mm_counts) + "<video>" * num_videos
+        return (
+            super().get_dummy_text(mm_counts)
+            + "<video>" * num_videos
+            + AUDIO_CONTEXT * num_audios
+        )
 
     def _get_dummy_videos(
         self,
@@ -1482,7 +1607,25 @@ def get_dummy_mm_data(
             }
         else:
             dummy_video = {}
-        return {**dummy_image, **dummy_video}
+
+        if extractor := self.info.audio_extractor:
+            num_audios = mm_counts.get("audio", 0)
+            audio_overrides = mm_options.get("audio") if mm_options else None
+            tokens_per_audio = max(1, seq_len // max(num_audios, 1))
+            max_audio_num_samples = MAX_AUDIO_LEN_S * extractor.sampling_rate
+            calculated_max_audio_num_samples = extractor.audio_length(tokens_per_audio)
+            audio_len = min(max_audio_num_samples, calculated_max_audio_num_samples)
+            dummy_audio = {
+                "audio": self._get_dummy_audios(
+                    length=audio_len,
+                    num_audios=num_audios,
+                    overrides=audio_overrides,
+                )
+            }
+        else:
+            dummy_audio = {}
+
+        return {**dummy_image, **dummy_video, **dummy_audio}
 
 
 @MULTIMODAL_REGISTRY.register_processor(
@@ -1499,12 +1642,15 @@ def get_placeholder_str(cls, modality: str, i: int) -> str | None:
             return "<image>"
         if modality.startswith("video"):
             return "<video>"
+        if modality.startswith("audio"):
+            return AUDIO_CONTEXT
         return None
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        config = vllm_config.model_config.hf_config
-        multimodal_config = vllm_config.model_config.multimodal_config
+        model_config = vllm_config.model_config
+        config = model_config.hf_config
+        multimodal_config = model_config.multimodal_config
         image_size = config.force_image_size
         patch_size = config.patch_size
         self.patch_size = patch_size
@@ -1523,10 +1669,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 hf_config=config.text_config,
                 prefix=maybe_prefix(prefix, "language_model"),
             )
-
-        with self._mark_tower_model(vllm_config, {"image", "video"}):
+        llm_dtype = self.language_model.config.dtype
+        assert isinstance(llm_dtype, torch.dtype)
+        self.llm_dtype = llm_dtype
+        with self._mark_tower_model(vllm_config, {"image", "video", "audio"}):
             self.vision_model = self.get_vit_model_from_radio_config(config).to(
-                self.language_model.config.dtype
+                llm_dtype
             )
 
             # Construct the vision projection.
@@ -1547,14 +1695,26 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 ReLUSquaredActivation(),
                 nn.Linear(vision_projection_hidden_size, llm_hidden_size, bias=False),
             )
-            self.mlp1 = mlp1.to(self.language_model.config.dtype)
+            self.mlp1 = mlp1.to(llm_dtype)
+            self.sound_encoder: ProjectedParakeet | None = None
+            if getattr(config, "sound_config", None) is not None:
+                logger.info_once(
+                    "Found sound config, initializing sound encoder for Nemotron AVLM",
+                    scope="global",
+                )
+                self.sound_encoder = ProjectedParakeet(
+                    config.sound_config,
+                    dtype=llm_dtype,
+                    llm_hidden_size=llm_hidden_size,
+                    max_model_len=model_config.max_model_len,
+                )
 
         self.config = config
         self.model_config = vllm_config.model_config
 
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
-        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        tokenizer = cached_tokenizer_from_config(model_config)
         self._img_start_token_ids = tokenizer.encode(
             IMG_START, add_special_tokens=False
         )
@@ -1566,7 +1726,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             config
         )
         if self.dynamic_resolution:
-            logger.info("Dynamic resolution is enabled for NanoNemotronVLProcessor")
+            logger.info_once(
+                "Dynamic resolution is enabled for NanoNemotronVLProcessor",
+                scope="global",
+            )
 
     def pixel_shuffle(self, x, scale_factor=0.5):
         n, w, h, c = x.size()
@@ -1780,6 +1943,51 @@ def _process_video_input(
 
         return final_video_embeddings
 
+    def _process_audio_input(
+        self, audio_input: NanoNemotronVLAudioFeatureInputs
+    ) -> tuple[torch.Tensor, ...]:
+        assert self.sound_encoder is not None
+        input_audio_features = audio_input.input_audio_features
+        feature_attention_mask = audio_input.feature_attention_mask
+        target_device = next(self.sound_encoder.parameters()).device
+
+        # When cross-request batching combines audio clips with different
+        # time dimensions, _reduce_data returns a list instead of a stacked
+        # tensor. Pad to the max time dim and stack; the attention mask
+        # already marks valid positions so zero-padding is safe.
+        if isinstance(input_audio_features, list):
+            feature_sizes = [f.shape[-2] for f in input_audio_features]
+            max_t = max(feature_sizes)
+            padded_feats = [
+                torch.nn.functional.pad(feat, (0, 0, 0, max_t - feat_size))
+                for feat, feat_size in zip(
+                    input_audio_features, feature_sizes, strict=True
+                )
+            ]
+            padded_masks = [
+                torch.nn.functional.pad(mask, (0, max_t - mask.shape[-1]))
+                for mask in feature_attention_mask
+            ]
+            input_audio_features = torch.stack(padded_feats)
+            feature_attention_mask = torch.stack(padded_masks)
+
+        input_audio_features = input_audio_features.to(
+            dtype=self.llm_dtype, device=target_device
+        )
+        feature_attention_mask = feature_attention_mask.to(device=target_device)
+        sound_embeds = self.sound_encoder(input_audio_features, feature_attention_mask)
+
+        valid_input_lens = feature_attention_mask.sum(dim=1)
+        valid_output_lens = self.sound_encoder.encoder._get_subsampling_output_length(
+            valid_input_lens
+        )
+        truncated_embeds = []
+        for i in range(sound_embeds.shape[0]):
+            valid_len = valid_output_lens[i].item()
+            truncated_embeds.append(sound_embeds[i, :valid_len])
+
+        return tuple(truncated_embeds)
+
     def _create_final_video_embeddings(
         self,
         video_embeddings: torch.Tensor,
@@ -1887,6 +2095,18 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
                 modalities["images"] = self._parse_and_validate_image_input(**kwargs)
             if input_key in ("pixel_values_flat_video",) and "videos" not in modalities:
                 modalities["videos"] = self._parse_and_validate_video_input(**kwargs)
+            if (
+                input_key
+                in (
+                    "input_audio_features",
+                    "feature_attention_mask",
+                    "audio_feature_lengths",
+                )
+                and "audios" not in modalities
+            ):
+                modalities["audios"] = NanoNemotronVLAudioFeatureInputs(
+                    **kwargs, validate=False
+                )
 
         return modalities
 
@@ -1917,6 +2137,10 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
                 video_input = modalities["videos"]
                 video_embeddings = self._process_video_input(video_input)
                 multimodal_embeddings += tuple(video_embeddings)
+            if modality == "audios":
+                audio_input = modalities["audios"]
+                audio_embeddings = self._process_audio_input(audio_input)
+                multimodal_embeddings += tuple(audio_embeddings)
 
         return multimodal_embeddings
 
@@ -1947,8 +2171,8 @@ def get_mm_mapping(self) -> MultiModelKeys:
         """
         return MultiModelKeys.from_string_field(
             language_model="language_model",
-            connector="mlp1",
-            tower_model="vision_model",
+            connector=["mlp1", "sound_encoder.projection"],
+            tower_model=["vision_model", "sound_encoder.encoder"],
         )
 
     def compute_logits(
@@ -1969,9 +2193,13 @@ def is_adapter_weights(weight: tuple[str, torch.Tensor]):
         def is_vision_weights(name: str) -> bool:
             return name.startswith("vision_model.radio_model.")
 
+        def is_sound_weights(name: str) -> bool:
+            return name.startswith("sound")
+
         # Separate weights by component
         llm_weights = []
         vision_weights = []
+        sound_weights = []
 
         for name, w in weights:
             if is_llm(name):
@@ -1987,9 +2215,15 @@ def is_vision_weights(name: str) -> bool:
                 # Convert: vision_model.radio_model.* → radio_model.*
                 hf_key = name[len("vision_model.") :]  # Remove "vision_model." prefix
                 vision_weights.append((hf_key, w))
+            elif is_sound_weights(name):
+                assert self.sound_encoder is not None
+                sound_weights.append((name, w))
 
         self.language_model.load_weights(llm_weights)
         self.vision_model.load_weights(vision_weights)
+        if self.sound_encoder is not None:
+            assert len(sound_weights) > 0
+            self.sound_encoder.load_weights(sound_weights)
 
     def print_architecture(self, detailed: bool = True, save_to_file: str = None):
         """
diff --git a/vllm/model_executor/models/parakeet.py b/vllm/model_executor/models/parakeet.py
new file mode 100644
index 000000000000..8c5539251c45
--- /dev/null
+++ b/vllm/model_executor/models/parakeet.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Modules below used for the audio encoder component in: models/nano_nemotron_vl.py
+"""
+
+from collections.abc import Iterable
+from dataclasses import asdict
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import ParakeetEncoder as HFParakeetEncoder
+from transformers import ParakeetFeatureExtractor, PretrainedConfig
+
+from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.transformers_utils.configs.parakeet import ExtractorConfig, ParakeetConfig
+
+
+class ParakeetProjection(nn.Module):
+    def __init__(self, config: ParakeetConfig) -> None:
+        super().__init__()
+        sound_hidden_size = config.hidden_size
+        proj_hidden_size = config.projection_hidden_size
+        llm_hidden_size = config.llm_hidden_size
+        bias = config.projection_bias
+
+        self.norm = nn.LayerNorm(sound_hidden_size, eps=config.projection_eps)
+        self.linear1 = nn.Linear(sound_hidden_size, proj_hidden_size, bias=bias)
+        self.activation = ReLUSquaredActivation()
+        self.linear2 = nn.Linear(proj_hidden_size, llm_hidden_size, bias=bias)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.norm(hidden_states)
+        hidden_states = self.linear1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.linear2(hidden_states)
+        return hidden_states
+
+
+class ProjectedParakeet(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        *,
+        dtype: torch.dtype,
+        llm_hidden_size: int,
+        max_model_len: int,
+    ) -> None:
+        super().__init__()
+        self.config = ParakeetConfig.from_hf_config(
+            config, llm_hidden_size=llm_hidden_size, max_model_len=max_model_len
+        )
+        self.encoder = HFParakeetEncoder(self.config)
+        self.encoder = self.encoder.to(dtype)
+        self.projection = ParakeetProjection(self.config)
+        self.projection = self.projection.to(dtype)
+
+    def forward(
+        self, input_features: torch.Tensor, attention_mask: torch.Tensor | None = None
+    ) -> torch.Tensor:
+        outputs = self.encoder(
+            input_features=input_features, attention_mask=attention_mask
+        )
+        outputs = outputs.last_hidden_state
+        outputs = outputs.to(dtype=torch.bfloat16)
+        outputs = self.projection(outputs)
+        return outputs
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loaded_params: set[str] = set()
+        params_dict = dict(self.named_parameters())
+        buffers_dict = dict(self.named_buffers())
+
+        if isinstance(weights, dict):
+            weights_list = list(weights.items())
+        else:
+            weights_list = list(weights)
+
+        for name, weight in weights_list:
+            if name.startswith("sound_encoder.encoder.feature_extractor."):
+                # Feature extractor buffers are handled outside the encoder.
+                continue
+            if name.startswith("sound_encoder."):
+                target_name = name[len("sound_encoder.") :]
+            elif name.startswith("sound_projection."):
+                target_name = f"projection.{name[len('sound_projection.') :]}"
+            else:
+                continue
+
+            target = params_dict.get(target_name)
+            if target is None:
+                target = buffers_dict.get(target_name)
+            if target is None:
+                raise ValueError(f"Unknown weight: {name}")
+            weight_loader = getattr(target, "weight_loader", default_weight_loader)
+            with torch.no_grad():
+                weight_loader(target, weight)
+            loaded_params.add(target_name)
+
+        return loaded_params
+
+
+class ParakeetExtractor(ParakeetFeatureExtractor):
+    def __init__(self, config: PretrainedConfig) -> None:
+        self.config = ExtractorConfig.from_hf_config(config)
+        super().__init__(**asdict(self.config))
+        self._clip_target_samples = int(
+            round(self.config.clip_duration_s * self.sampling_rate)
+        )
+        self._tail_min_samples = int(
+            round(self.config.clip_min_duration_s * self.sampling_rate)
+        )
+
+    def _normalize_audio_length(self, audio_len: int) -> int:
+        # Match mcore's compute_params() logic for clip/minduration handling.
+        target_len = max(audio_len, self._tail_min_samples)
+        tail_remainder = target_len % self._clip_target_samples
+        if 0 < tail_remainder < self._tail_min_samples:
+            padding = self._tail_min_samples - tail_remainder
+            target_len += padding
+        assert isinstance(target_len, int)
+        return target_len
+
+    def audio_token_count(self, audio_len: int) -> int:
+        audio_len = self._normalize_audio_length(audio_len)
+        num_frames = audio_len // self.hop_length
+        n_tokens = HFParakeetEncoder._get_subsampling_output_length(
+            self, torch.tensor([num_frames], dtype=torch.float)
+        )
+        return max(1, n_tokens.item())
+
+    def __call__(self, raw_speech: list[np.ndarray], *args, **kwargs):
+        padded = []
+        for p in raw_speech:
+            assert p.ndim == 1
+            audio_len = int(p.shape[0])
+            target_len = self._normalize_audio_length(audio_len)
+            p = np.pad(p, (0, target_len - audio_len))
+            padded.append(p)
+        return super().__call__(padded, *args, **kwargs)
+
+    def audio_length(self, audio_tokens: int) -> int:
+        return int(audio_tokens * self.config.subsampling_factor * self.hop_length)
diff --git a/vllm/transformers_utils/configs/parakeet.py b/vllm/transformers_utils/configs/parakeet.py
new file mode 100644
index 000000000000..efd4c466478b
--- /dev/null
+++ b/vllm/transformers_utils/configs/parakeet.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
+from transformers import ParakeetEncoderConfig, PretrainedConfig
+
+
+class ParakeetConfig(ParakeetEncoderConfig):
+    llm_hidden_size: int
+    projection_hidden_size: int
+    projection_bias: bool
+    projection_eps: float = 1e-5
+    sampling_rate: int
+
+    @staticmethod
+    def from_hf_config(
+        config: PretrainedConfig, *, llm_hidden_size: int, max_model_len: int
+    ) -> "ParakeetConfig":
+        assert isinstance(config, PretrainedConfig)
+        return ParakeetConfig(
+            **config.to_dict(),
+            scale_input=False,
+            attention_bias=False,
+            llm_hidden_size=llm_hidden_size,
+            max_position_embeddings=max_model_len
+            + 1,  # + 1 because it seems like max_model_len+1 can be passed
+        )
+
+
+@dataclass(kw_only=True, frozen=True)
+class ExtractorConfig:
+    feature_size: int
+    sampling_rate: int
+    subsampling_factor: int
+    subsampling_conv_kernel_size: int
+    subsampling_conv_stride: int
+    clip_duration_s: int = 30
+    clip_min_duration_s: float = 0.1
+
+    @staticmethod
+    def from_hf_config(config: PretrainedConfig) -> "ExtractorConfig":
+        assert isinstance(config, PretrainedConfig)
+        return ExtractorConfig(
+            feature_size=config.num_mel_bins,
+            sampling_rate=config.sampling_rate,
+            subsampling_factor=config.subsampling_factor,
+            subsampling_conv_kernel_size=config.subsampling_conv_kernel_size,
+            subsampling_conv_stride=config.subsampling_conv_stride,
+        )

From fd6de37fcafe9540ed821256877127df75d74db8 Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Fri, 27 Feb 2026 19:34:49 +0100
Subject: [PATCH 082/154] [BugFix] Fix 3D rope in transformers backend (#35097)

Signed-off-by: raushan <raushan@huggingface.co>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../models/transformers/multimodal.py              | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index f7b5d8899502..3360ce59a763 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -218,7 +218,7 @@ def apply(
             if "mm_token_type_ids" in processed_data
             else "token_type_ids"
         )
-        mm_token_type_ids = processed_data.pop(token_type_key)
+        mm_token_type_ids = processed_data.get(token_type_key)
 
         # We can infer vLLM style placeholder from token type ids, if we split
         # it for each input `mm_data`.
@@ -353,6 +353,7 @@ def embed_multimodal(self, **kwargs):
 
         num_image_patches = kwargs.pop("num_image_patches")
         kwargs.pop("token_type_ids", None)  # used only in `forward`
+        kwargs.pop("mm_token_type_ids", None)  # used only in `model.get_rope_index`
 
         if pixel_values is not None:
             # ROCm: Force math SDP backend for vision encoder to avoid accuracy issues
@@ -443,6 +444,7 @@ def get_mrope_input_positions(
             {
                 "image_grid_thw",
                 "video_grid_thw",
+                "mm_token_type_ids",
                 "second_per_grid_ts",
                 "audio_feature_lengths",
                 "use_audio_in_video",
@@ -451,7 +453,7 @@ def get_mrope_input_positions(
         if any(
             v
             for k, v in kwargs.items()
-            if k not in {"image_grid_thw", "video_grid_thw"}
+            if k not in {"image_grid_thw", "mm_token_type_ids"}
         ):
             raise NotImplementedError(
                 "Transformers modeling backend only supports images."
@@ -459,6 +461,7 @@ def get_mrope_input_positions(
 
         image_grid_thw = kwargs.get("image_grid_thw", [])
         video_grid_thw = kwargs.get("video_grid_thw", [])
+        mm_token_type_ids = kwargs.get("mm_token_type_ids")
 
         image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
             image_grid_thw
@@ -467,10 +470,17 @@ def get_mrope_input_positions(
             video_grid_thw
         )
 
+        # In v4 `get_rope_index` doesn't have wildcard `kwargs`, and
+        # can't accept arbitrary args, even if its value is `None`
+        kwargs = {}
+        if mm_token_type_ids:
+            kwargs["mm_token_type_ids"] = torch.cat(mm_token_type_ids)
+
         mrope_positions, mrope_position_delta = self.model.get_rope_index(
             input_ids=torch.tensor(input_tokens).unsqueeze(0),
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
+            **kwargs,
         )
 
         mrope_positions = mrope_positions[:, 0]

From b1d9f5372d802513e9e009a5d572dd594a09e1dc Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 27 Feb 2026 10:43:30 -0800
Subject: [PATCH 083/154] [Model Runner V2] Warmup kernels (#35172)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
---
 vllm/sampling_params.py      |  18 ++++++
 vllm/v1/worker/gpu/warmup.py | 105 +++++++++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_worker.py |  16 ++++--
 3 files changed, 133 insertions(+), 6 deletions(-)
 create mode 100644 vllm/v1/worker/gpu/warmup.py

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 2f015339e67d..e36a90d6c568 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -840,6 +840,24 @@ def __repr__(self) -> str:
             f"extra_args={self.extra_args})"
         )
 
+    @staticmethod
+    def for_sampler_warmup() -> "SamplingParams":
+        """Set parameters to exercise all sampler logic."""
+        return SamplingParams(
+            temperature=0.9,
+            top_p=0.9,
+            top_k=50,
+            min_p=0.1,
+            frequency_penalty=0.5,
+            presence_penalty=0.5,
+            repetition_penalty=1.2,
+            min_tokens=2,
+            logit_bias={0: -1.0, 1: 0.5},
+            _bad_words_token_ids=[[0], [1, 2]],
+            logprobs=5,
+            prompt_logprobs=1,
+        )
+
 
 class BeamSearchParams(
     msgspec.Struct,
diff --git a/vllm/v1/worker/gpu/warmup.py b/vllm/v1/worker/gpu/warmup.py
new file mode 100644
index 000000000000..ffe5b33f7ba8
--- /dev/null
+++ b/vllm/v1/worker/gpu/warmup.py
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import numpy as np
+import torch
+
+from vllm import PoolingParams, SamplingParams
+from vllm.v1.core.sched.output import (
+    CachedRequestData,
+    GrammarOutput,
+    NewRequestData,
+    SchedulerOutput,
+)
+from vllm.v1.request import Request
+from vllm.v1.worker.gpu.model_runner import GPUModelRunner
+
+
+@torch.inference_mode()
+def warmup_kernels(model_runner: GPUModelRunner) -> None:
+    """Run two execute_model + sample_tokens iterations to JIT compile
+    triton kernels.
+
+    The first iteration simulates a prefill with requests of 2 prompt
+    tokens each. The second iteration simulates a decode step with all
+    requests generating 1 token each.
+    """
+    prompt_token_ids = [0, 1]
+    prompt_len = len(prompt_token_ids)
+    num_reqs = min(
+        model_runner.scheduler_config.max_num_seqs,
+        model_runner.scheduler_config.max_num_batched_tokens // prompt_len,
+    )
+
+    num_kv_cache_groups = len(model_runner.kv_cache_config.kv_cache_groups)
+    req_ids = [f"_warmup_{i}_" for i in range(num_reqs)]
+
+    # SamplingParams exercising all sampling features.
+    if model_runner.is_pooling_model:
+        sampling_params = None
+        pooling_params = PoolingParams()
+    else:
+        sampling_params = SamplingParams.for_sampler_warmup()
+        pooling_params = None
+
+    # Step 1: Prefill all requests with 2 prompt tokens each.
+    new_reqs = [
+        NewRequestData.from_request(
+            Request(req_ids[i], prompt_token_ids, sampling_params, pooling_params),
+            # Each request uses a distinct block per KV cache group.
+            block_ids=tuple([i] for _ in range(num_kv_cache_groups)),
+            prefill_token_ids=prompt_token_ids,
+        )
+        for i in range(num_reqs)
+    ]
+
+    prefill_output = SchedulerOutput.make_empty()
+    prefill_output.scheduled_new_reqs = new_reqs
+    prefill_output.num_scheduled_tokens = {rid: prompt_len for rid in req_ids}
+    prefill_output.total_num_scheduled_tokens = prompt_len * num_reqs
+    prefill_output.num_common_prefix_blocks = [0] * num_kv_cache_groups
+
+    # Disable KV connector for warmup run.
+    model_runner.kv_connector.set_disabled(True)
+    model_runner.execute_model(prefill_output)
+
+    if not model_runner.is_pooling_model:
+        # Warm up sampler and perform a decode step for non-pooling models.
+
+        grammar_output = None
+        if model_runner.is_last_pp_rank:
+            # Build a GrammarOutput to exercise the structured output bitmask
+            # kernel during the prefill step.
+            vocab_size = model_runner.model_config.get_vocab_size()
+            bitmask_width = (vocab_size + 31) // 32
+            grammar_bitmask = np.full(
+                (len(req_ids), bitmask_width), fill_value=-1, dtype=np.int32
+            )
+            grammar_output = GrammarOutput(
+                structured_output_request_ids=req_ids, grammar_bitmask=grammar_bitmask
+            )
+
+        model_runner.sample_tokens(grammar_output)
+
+        # Step 2: Decode all requests with 1 token each.
+        cached_req_data = CachedRequestData.make_empty()
+        cached_req_data.req_ids = list(req_ids)
+        cached_req_data.new_block_ids = [None] * num_reqs
+        cached_req_data.num_computed_tokens = [prompt_len] * num_reqs
+        cached_req_data.num_output_tokens = [1] * num_reqs
+
+        decode_output = SchedulerOutput.make_empty()
+        decode_output.scheduled_cached_reqs = cached_req_data
+        decode_output.num_scheduled_tokens = {rid: 1 for rid in req_ids}
+        decode_output.total_num_scheduled_tokens = num_reqs
+        decode_output.num_common_prefix_blocks = [0] * num_kv_cache_groups
+
+        model_runner.execute_model(decode_output)
+        model_runner.sample_tokens(None)
+
+    # Clean up - process finish_req_ids.
+    cleanup_output = SchedulerOutput.make_empty()
+    cleanup_output.finished_req_ids = set(req_ids)
+    model_runner.execute_model(cleanup_output)
+    model_runner.kv_connector.set_disabled(False)
+    torch.cuda.synchronize()
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 3aeb20839a9f..fcc0fdf88380 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -61,6 +61,7 @@
 from vllm.v1.worker.worker_base import WorkerBase
 from vllm.v1.worker.workspace import init_workspace_manager
 
+from .gpu.warmup import warmup_kernels
 from .utils import request_memory
 
 logger = init_logger(__name__)
@@ -558,12 +559,15 @@ def compile_or_warm_up_model(self) -> None:
 
             logger.debug(msg)
 
-        # Warm up sampler and preallocate memory buffer for logits and other
-        # sampling related tensors of max possible shape to avoid memory
-        # fragmentation issue.
-        # NOTE: This is called after `capture_model` on purpose to prevent
-        # memory buffers from being cleared by `torch.cuda.empty_cache`.
-        if get_pp_group().is_last_rank:
+        if self.use_v2_model_runner:
+            # V2: Run full execute_model + sample_tokens to JIT compile triton kernels.
+            warmup_kernels(self.model_runner)
+        elif get_pp_group().is_last_rank:
+            # V1: Warm up sampler and preallocate memory buffer for logits and other
+            # sampling related tensors of max possible shape to avoid memory
+            # fragmentation issue.
+            # NOTE: This is called after `capture_model` on purpose to prevent
+            # memory buffers from being cleared by `torch.cuda.empty_cache`.
             max_num_reqs = min(
                 self.scheduler_config.max_num_seqs,
                 self.scheduler_config.max_num_batched_tokens,

From 29b35477b0661f527d2b951ff5125f5c58fce3fe Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@fb.com>
Date: Fri, 27 Feb 2026 14:34:16 -0500
Subject: [PATCH 084/154] [compile] Fix caching error over pytree slice node.
 (#35308)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>
---
 tests/compile/test_aot_compile.py | 21 +++++++++++++++++++++
 vllm/compilation/caching.py       | 21 +++++++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index fbacbb6bfa5a..4cfdc1b2e7f6 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -16,6 +16,7 @@
 import vllm.model_executor.layers.activation
 from vllm.compilation.caching import (
     StandaloneCompiledArtifacts,
+    VllmSerializableFunction,
 )
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
@@ -156,6 +157,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
             assert torch.allclose(ret, expected)
 
 
+@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
+    def foo(x: torch.Tensor):
+        return x[slice(0, x.shape[0])]
+
+    vllm_config = make_vllm_config()
+
+    example_input = torch.randn(10, 10)
+    torch._dynamo.mark_dynamic(example_input, 0)
+    gm = torch.fx.symbolic_trace(foo)
+    assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code
+    with use_vllm_config(vllm_config):
+        payload = VllmSerializableFunction.serialize_compile_artifacts(
+            VllmSerializableFunction(gm, (example_input,), "", foo)
+        )
+        fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
+
+    assert gm.code == fn.graph_module.code
+
+
 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
 def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
     """
diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
index 07f9db4190b9..3917a4f28cf9 100644
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import contextlib
 import hashlib
 import inspect
 import os
@@ -144,6 +145,18 @@ def __setstate__(self, state: dict[str, dict[str, Any]]) -> None:
         self.loaded_submodule_store = {}
 
 
+@contextlib.contextmanager
+def patch_pytree_map_over_slice():
+    pytree._private_register_pytree_node(
+        slice, lambda x: ([x.start, x.stop, x.step], None), lambda x, c: slice(*x)
+    )
+
+    try:
+        yield
+    finally:
+        pytree._deregister_pytree_node(slice)
+
+
 class VllmSerializableFunction(SerializableCallable):  # type: ignore[misc]
     """
     A wrapper around a compiled function by vllm. It will forward the tensor
@@ -235,7 +248,10 @@ def _graph_reducer_override(
                 lambda inp: torch.empty_like(inp, device="meta"),
                 state["example_inputs"],
             )
-        with patch.object(GraphPickler, "reducer_override", _graph_reducer_override):
+        with (
+            patch.object(GraphPickler, "reducer_override", _graph_reducer_override),
+            patch_pytree_map_over_slice(),
+        ):
             state["graph_module"] = GraphPickler.dumps(
                 state["graph_module"], Options(ops_filter=None)
             )
@@ -261,7 +277,8 @@ def deserialize_compile_artifacts(cls, data: bytes) -> "VllmSerializableFunction
 
         state = pickle.loads(data)
         fake_mode = FakeTensorMode(shape_env=ShapeEnv())
-        state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
+        with patch_pytree_map_over_slice():
+            state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
         state["graph_module"].recompile()
         state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode)
 

From 2decec9856033347f3129f1d1b2ec015e1ad88ea Mon Sep 17 00:00:00 2001
From: SteadfastAsArt <35479342+SteadfastAsArt@users.noreply.github.com>
Date: Sat, 28 Feb 2026 03:39:23 +0800
Subject: [PATCH 085/154] [Transformers backend] Ignore MTP weights when
 num_nextn_predict_layers=0 (#34888)

Signed-off-by: SteadfastAsArt <695488173@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/transformers/base.py | 14 +++++++++++++-
 vllm/model_executor/models/utils.py             |  3 ++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 9e3c0a53574c..1ca73853ad12 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -300,14 +300,26 @@ def _recursive_replace(module: nn.Module, prefix: str):
             for child_name, child_module in module.named_children():
                 new_module = child_module
                 qual_name = maybe_prefix(prefix, child_name)
-                # Populate Eagle3 attrs
                 if (
                     isinstance(module, nn.ModuleList)
                     and len(module) == self.text_config.num_hidden_layers
                 ):
+                    # Populate Eagle3 attrs
                     self._target_class = type(child_module)
                     layer_name = qual_name.removeprefix("model.")
                     self._layer_names[int(child_name)] = layer_name
+                    # MTP weights should not be loaded into the base model
+                    num_hidden_layers = self.text_config.num_hidden_layers
+                    names = (
+                        "n_predict",  # Override from SpeculativeConfig
+                        "num_nextn_predict_layers",  # Most models
+                        "mtp_num_hidden_layers",  # Qwen 3.5
+                    )
+                    n_predict = getattr_iter(self.text_config, names, 0)
+                    for i in range(num_hidden_layers, num_hidden_layers + n_predict):
+                        mtp_prefix = f"{prefix}.{i}."
+                        if mtp_prefix not in self.ignore_unexpected_prefixes:
+                            self.ignore_unexpected_prefixes.append(mtp_prefix)
                 # Replace modules as needed
                 if isinstance(child_module, nn.Linear):
                     generator = (p for p in tp_plan if re.match(p, qual_name))
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index c55693bcff93..abc953b7f980 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -311,8 +311,9 @@ def _load_module(
 
                     continue
 
+                named_parameters = module.named_parameters(recurse=True)
                 desc_param_keys = {
-                    base_prefix + k for k, _ in module.named_parameters(recurse=True)
+                    maybe_prefix(base_prefix, k) for k, _ in named_parameters
                 }
                 msg = (
                     f"There is no module or parameter named {prefix!r} "

From 234a65b781d9dc51d28aebb208096baa8fe0458e Mon Sep 17 00:00:00 2001
From: Lucas Kabela <lucaskabela@meta.com>
Date: Fri, 27 Feb 2026 11:51:36 -0800
Subject: [PATCH 086/154] [Bugfix] Add monkeypatch to prevent race condition
 from writing (#35420)

Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
---
 vllm/compilation/compiler_interface.py | 43 ++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index c00486af644a..e021ce9e3993 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -184,6 +184,47 @@ def is_compile_cache_enabled(
     )
 
 
+def _patch_standalone_compile_atomic_save() -> None:
+    """Backport of pytorch/pytorch#162432 for torch < 2.10.0.
+
+    Patches CompiledArtifact.save() to use write_atomic for binary format,
+    preventing corrupt cache files when multiple processes compile
+    concurrently.
+    """
+    from torch._inductor.codecache import write_atomic
+    from torch._inductor.standalone_compile import CompiledArtifact as cls
+
+    if getattr(cls.save, "_vllm_patched", False):
+        return
+
+    original_save = cls.save
+
+    def _save(
+        self: Any, *, path: str, format: Literal["binary", "unpacked"] = "binary"
+    ) -> None:
+        if format != "binary":
+            return original_save(self, path=path, format=format)
+        from torch._dynamo.utils import dynamo_timed
+        from torch._inductor.codecache import torch_key
+        from torch.utils._appending_byte_serializer import BytesWriter
+
+        with dynamo_timed("CompiledArtifact.save"):
+            assert self._artifacts is not None
+            artifact_bytes, cache_info = self._artifacts
+            assert len(cache_info.aot_autograd_artifacts) == 1, cache_info
+            key = cache_info.aot_autograd_artifacts[0]
+            assert not os.path.isdir(path)
+            writer = BytesWriter()
+            writer.write_bytes(torch_key())
+            writer.write_str(key)
+            writer.write_bytes(artifact_bytes)
+            write_atomic(path, writer.to_bytes())
+
+    _save._vllm_patched = True  # type: ignore[attr-defined]
+    cls.save = _save  # type: ignore[assignment]
+    logger.debug("Patched %s.save for atomic writes (torch < 2.10)", cls.__name__)
+
+
 class InductorStandaloneAdaptor(CompilerInterface):
     """
     The adaptor for the Inductor compiler.
@@ -197,6 +238,8 @@ class InductorStandaloneAdaptor(CompilerInterface):
     name = "inductor_standalone"
 
     def __init__(self, save_format: Literal["binary", "unpacked"]) -> None:
+        if not is_torch_equal_or_newer("2.10.0"):
+            _patch_standalone_compile_atomic_save()
         self.save_format = save_format
 
     def compute_hash(self, vllm_config: VllmConfig) -> str:

From 1d532f9d8fb205942035313293af701ee580a7e2 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Fri, 27 Feb 2026 15:14:31 -0500
Subject: [PATCH 087/154] [DP] Only use DP padding when cudagraphs are actually
 used  (#34102)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 tests/v1/cudagraph/test_cudagraph_dispatch.py | 18 +++-
 vllm/config/compilation.py                    |  8 +-
 vllm/forward_context.py                       |  3 +-
 vllm/v1/cudagraph_dispatcher.py               | 62 +++++++-----
 vllm/v1/spec_decode/eagle.py                  | 97 +++++++++----------
 vllm/v1/worker/dp_utils.py                    | 48 ++++-----
 vllm/v1/worker/gpu_model_runner.py            | 30 ++----
 7 files changed, 139 insertions(+), 127 deletions(-)

diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index debf9aeaa4d7..52e927cee8ec 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -176,10 +176,14 @@ def test_dispatcher(self, cudagraph_mode_str, compilation_mode, lora_config):
         assert rt_mode == CUDAGraphMode.NONE
         assert key == BatchDescriptor(num_tokens=15)
 
-        # 4. disable_full should have a fall back mode (e.g., cascade attention)
+        # 4. invalid_modes={FULL} should have a fall back mode
+        #    (e.g., cascade attention)
         desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False)
         rt_mode, key = dispatcher.dispatch(
-            num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True
+            num_tokens=8,
+            uniform_decode=False,
+            has_lora=False,
+            invalid_modes={CUDAGraphMode.FULL},
         )
 
         if "PIECEWISE" in cudagraph_mode_str:  # string contains check
@@ -188,6 +192,16 @@ def test_dispatcher(self, cudagraph_mode_str, compilation_mode, lora_config):
         else:
             assert rt_mode == CUDAGraphMode.NONE
 
+        # 5. valid_modes={NONE} always returns NONE even when keys exist
+        rt_mode, key = dispatcher.dispatch(
+            num_tokens=8,
+            uniform_decode=False,
+            has_lora=False,
+            valid_modes={CUDAGraphMode.NONE},
+        )
+        assert rt_mode == CUDAGraphMode.NONE
+        assert key == BatchDescriptor(num_tokens=8)
+
     @pytest.mark.parametrize(
         "cudagraph_mode_str,compilation_mode,expected_modes",
         [
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 01dc61cdcad0..54dbf24f5f66 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -87,8 +87,12 @@ def has_piecewise_cudagraphs(self) -> bool:
     def separate_routine(self) -> bool:
         return isinstance(self.value, tuple)
 
-    def valid_runtime_modes(self) -> bool:
-        return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
+    @classmethod
+    def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
+        return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
+
+    def is_valid_runtime_mode(self) -> bool:
+        return self in CUDAGraphMode.valid_runtime_modes()
 
     def __str__(self) -> str:
         return self.name
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index a0753b19e434..15e3263ba9db 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -241,7 +241,7 @@ class ForwardContext:
     additional_kwargs: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self):
-        assert self.cudagraph_runtime_mode.valid_runtime_modes(), (
+        assert self.cudagraph_runtime_mode.is_valid_runtime_mode(), (
             f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}"
         )
 
@@ -347,7 +347,6 @@ def set_forward_context(
                 num_tokens_unpadded=num_tokens,
                 parallel_config=vllm_config.parallel_config,
                 allow_microbatching=False,
-                allow_dp_padding=False,
             )
             assert num_tokens_across_dp is not None
         dp_metadata = DPMetadata.make(
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 26ca82b8fe65..1578209e612e 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Set as AbstractSet
 from dataclasses import replace
 from itertools import product
 
@@ -232,8 +233,9 @@ def dispatch(
         num_tokens: int,
         uniform_decode: bool = False,
         has_lora: bool = False,
-        disable_full: bool = False,
         num_active_loras: int = 0,
+        valid_modes: AbstractSet[CUDAGraphMode] | None = None,
+        invalid_modes: AbstractSet[CUDAGraphMode] | None = None,
     ) -> tuple[CUDAGraphMode, BatchDescriptor]:
         """
         Given conditions(e.g.,batch descriptor and if using piecewise only),
@@ -246,15 +248,29 @@ def dispatch(
             uniform_decode: Whether the batch is uniform decode (i.e. uniform and query
                 length is uniform_decode_query_len).
             has_lora: Whether LoRA is active.
-            disable_full: If True, skip FULL cudagraph checks and
-                return PIECEWISE or NONE only. (can be used for features like
-                cascade attention that are not supported by full cudagraphs)
             num_active_loras: Number of distinct active LoRA adapters.
+            valid_modes: Set of cudagraph modes that are allowed. None means
+                all modes are allowed.
+            invalid_modes: Set of cudagraph modes to exclude. Subtracted from
+                valid_modes to compute allowed modes. (e.g., {FULL} for
+                features like cascade attention not supported by full
+                cudagraphs). None means no modes are excluded.
         """
+        allowed_modes = valid_modes or CUDAGraphMode.valid_runtime_modes()
+
+        if invalid_modes:
+            allowed_modes -= invalid_modes
+
+        assert len(allowed_modes) >= 1, (
+            f"No allowed cudagraph modes: valid_modes={valid_modes}, "
+            f"invalid_modes={invalid_modes}"
+        )
+
         if (
             not self.keys_initialized
             or self.cudagraph_mode == CUDAGraphMode.NONE
             or num_tokens > self.compilation_config.max_cudagraph_capture_size
+            or allowed_modes <= {CUDAGraphMode.NONE}
         ):
             return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
 
@@ -281,24 +297,26 @@ def dispatch(
             num_tokens, uniform_decode, has_lora, effective_num_active_loras
         )
 
-        # check if key exists for full cudagraph
-        # For pure FULL mode, keys are registered with uniform=False.
-        batch_desc_to_check = batch_desc
-        if self.cudagraph_mode == CUDAGraphMode.FULL:
-            batch_desc_to_check = replace(batch_desc, uniform=False)
-        if (
-            not disable_full
-            and batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.FULL]
-        ):
-            return CUDAGraphMode.FULL, batch_desc_to_check
-
-        # also check if the relaxed key exists for more "general"
-        # piecewise cudagraph
-        batch_desc_to_check = replace(batch_desc, num_reqs=None, uniform=False)
-        if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]:
-            return CUDAGraphMode.PIECEWISE, batch_desc_to_check
-
-        # finally, just return no cudagraphs and a trivial batch descriptor
+        if CUDAGraphMode.FULL in allowed_modes:
+            # check if key exists for full cudagraph
+            # For pure FULL mode, keys are registered with uniform=False.
+            batch_desc_to_check = batch_desc
+            if self.cudagraph_mode == CUDAGraphMode.FULL:
+                batch_desc_to_check = replace(batch_desc, uniform=False)
+            if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.FULL]:
+                return CUDAGraphMode.FULL, batch_desc_to_check
+
+        if CUDAGraphMode.PIECEWISE in allowed_modes:
+            # also check if the relaxed key exists for more "general"
+            # piecewise cudagraph
+            batch_desc_to_check = replace(batch_desc, num_reqs=None, uniform=False)
+            if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]:
+                return CUDAGraphMode.PIECEWISE, batch_desc_to_check
+
+        assert CUDAGraphMode.NONE in allowed_modes, (
+            f"No matching cudagraph found and NONE is not in "
+            f"allowed_modes={allowed_modes}"
+        )
         return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
 
     def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index a46ba8f909e1..e53de6a1de4f 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -448,17 +448,10 @@ def propose(
             assert draft_indexer_metadata is not None
             per_layer_attn_metadata[layer_name] = draft_indexer_metadata
 
-        num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
-            num_tokens_unpadded=num_tokens, num_tokens_padded=num_tokens
+        cudagraph_runtime_mode, num_input_tokens, num_tokens_across_dp = (
+            self._determine_batch_execution_and_padding(num_tokens)
         )
 
-        cudagraph_runtime_mode, batch_desc = self.cudagraph_dispatcher.dispatch(
-            num_tokens_dp_padded
-        )
-        num_input_tokens = batch_desc.num_tokens
-        if num_tokens_across_dp is not None:
-            num_tokens_across_dp[self.dp_rank] = num_input_tokens
-
         if self.supports_mm_inputs:
             mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
 
@@ -549,16 +542,9 @@ def propose(
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]
 
-        batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp(
-            num_tokens_unpadded=batch_size, num_tokens_padded=batch_size
-        )
-
-        cudagraph_runtime_mode, batch_desc = self.cudagraph_dispatcher.dispatch(
-            batch_size_dp_padded
+        cudagraph_runtime_mode, input_batch_size, batch_size_across_dp = (
+            self._determine_batch_execution_and_padding(batch_size)
         )
-        input_batch_size = batch_desc.num_tokens
-        if batch_size_across_dp is not None:
-            batch_size_across_dp[self.dp_rank] = input_batch_size
 
         common_attn_metadata.num_actual_tokens = batch_size
         common_attn_metadata.max_query_len = 1
@@ -1568,19 +1554,11 @@ def dummy_run(
             self.num_speculative_tokens if not is_graph_capturing else 1
         ):
             if fwd_idx <= 1:
-                num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
-                    num_tokens_unpadded=num_tokens, num_tokens_padded=num_tokens
-                )
-                if use_cudagraphs:
-                    cudagraph_runtime_mode, batch_desc = (
-                        self.cudagraph_dispatcher.dispatch(num_tokens_dp_padded)
+                cudagraph_runtime_mode, num_input_tokens, num_tokens_across_dp = (
+                    self._determine_batch_execution_and_padding(
+                        num_tokens, use_cudagraphs=use_cudagraphs
                     )
-                    num_input_tokens = batch_desc.num_tokens
-                else:
-                    cudagraph_runtime_mode = CUDAGraphMode.NONE
-                    num_input_tokens = num_tokens_dp_padded
-                if num_tokens_across_dp is not None:
-                    num_tokens_across_dp[self.dp_rank] = num_input_tokens
+                )
 
             # Make sure to use EAGLE's own buffer during cudagraph capture.
             if (
@@ -1680,28 +1658,49 @@ def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None:
             == 1
         ), "All drafting layers should belong to the same kv cache group"
 
-    def _pad_batch_across_dp(
+    def _determine_batch_execution_and_padding(
         self,
-        num_tokens_unpadded: int,
-        num_tokens_padded: int,
-    ) -> tuple[int, torch.Tensor]:
-        # TODO(Flechman): support DBO ubatching
-        should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp(
-            num_tokens_unpadded=num_tokens_unpadded,
-            parallel_config=self.vllm_config.parallel_config,
-            allow_microbatching=False,
-            allow_dp_padding=self.cudagraph_dispatcher.cudagraph_mode
-            != CUDAGraphMode.NONE,
-            num_tokens_padded=num_tokens_padded,
-            uniform_decode=None,
-            num_scheduled_tokens_per_request=None,
+        num_tokens: int,
+        use_cudagraphs: bool = True,
+    ) -> tuple[CUDAGraphMode, int, torch.Tensor | None]:
+        cudagraph_mode, batch_desc = self.cudagraph_dispatcher.dispatch(
+            num_tokens,
+            valid_modes=({CUDAGraphMode.NONE} if not use_cudagraphs else None),
         )
-        assert not should_ubatch, "DBO ubatching not implemented for EAGLE"
+        num_tokens_padded = batch_desc.num_tokens
+
+        # Extra coordination when running data-parallel since we need to
+        # coordinate across ranks
+        # TODO(Flechman): support DBO ubatching
+        should_ubatch, num_tokens_across_dp = False, None
+        if self.vllm_config.parallel_config.data_parallel_size > 1:
+            should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = (
+                coordinate_batch_across_dp(
+                    num_tokens_unpadded=num_tokens,
+                    parallel_config=self.vllm_config.parallel_config,
+                    allow_microbatching=False,
+                    num_tokens_padded=num_tokens_padded,
+                    cudagraph_mode=cudagraph_mode.value,
+                )
+            )
+            assert not should_ubatch, "DBO ubatching not implemented for EAGLE"
+
+            # Extract DP-synced values
+            if num_tokens_across_dp is not None:
+                dp_rank = self.dp_rank
+                num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
+                # Re-dispatch with DP padding so we have the correct
+                # batch_descriptor
+                cudagraph_mode, batch_desc = self.cudagraph_dispatcher.dispatch(
+                    num_tokens_padded,
+                    valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
+                )
+                # Assert to make sure the agreed upon token count is correct
+                # otherwise num_tokens_across_dp will no-longer be valid
+                assert batch_desc.num_tokens == num_tokens_padded
+                num_tokens_across_dp[dp_rank] = num_tokens_padded
 
-        num_tokens_dp_padded = num_tokens_padded
-        if num_toks_across_dp is not None:
-            num_tokens_dp_padded = int(num_toks_across_dp[self.dp_rank].item())
-        return num_tokens_dp_padded, num_toks_across_dp
+        return cudagraph_mode, num_tokens_padded, num_tokens_across_dp
 
 
 class EagleProposer(SpecDecodeBaseProposer):
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 82de0cba9194..688c16a3133c 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -37,7 +37,6 @@ def _get_device_and_group(parallel_config: ParallelConfig):
 
 def _run_ar(
     should_ubatch: bool,
-    should_dp_pad: bool,
     orig_num_tokens_per_ubatch: int,
     padded_num_tokens_per_ubatch: int,
     cudagraph_mode: int,
@@ -46,12 +45,11 @@ def _run_ar(
     dp_size = parallel_config.data_parallel_size
     dp_rank = parallel_config.data_parallel_rank
     device, group = _get_device_and_group(parallel_config)
-    tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32)
+    tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
     tensor[0][dp_rank] = orig_num_tokens_per_ubatch
     tensor[1][dp_rank] = padded_num_tokens_per_ubatch
     tensor[2][dp_rank] = 1 if should_ubatch else 0
-    tensor[3][dp_rank] = 1 if should_dp_pad else 0
-    tensor[4][dp_rank] = cudagraph_mode
+    tensor[3][dp_rank] = cudagraph_mode
     dist.all_reduce(tensor, group=group)
     return tensor
 
@@ -97,14 +95,13 @@ def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
     If any rank has NONE (0), all ranks use NONE.
     This ensures all ranks send consistent values (all padded or all unpadded).
     """
-    return int(tensor[4, :].min().item())
+    return int(tensor[3, :].min().item())
 
 
 def _synchronize_dp_ranks(
     num_tokens_unpadded: int,
     num_tokens_padded: int,
     should_attempt_ubatching: bool,
-    should_attempt_dp_padding: bool,
     cudagraph_mode: int,
     parallel_config: ParallelConfig,
 ) -> tuple[bool, torch.Tensor | None, int]:
@@ -113,8 +110,8 @@ def _synchronize_dp_ranks(
     run with microbatching or none of them do.
 
     2. Determines the total number of tokens that each rank will run.
-    When running microbatched or if should_attempt_dp_padding is True, all
-    ranks will be padded out so that the run with the same number of tokens
+    When running microbatched or if cudagraph is enabled (synced across ranks),
+    all ranks will be padded out so that they run with the same number of tokens.
 
     3. Synchronizes cudagraph_mode across ranks by taking the minimum.
 
@@ -133,29 +130,26 @@ def _synchronize_dp_ranks(
     # will run and if we are using ubatching or not.
     tensor = _run_ar(
         should_ubatch=should_attempt_ubatching,
-        should_dp_pad=should_attempt_dp_padding,
         orig_num_tokens_per_ubatch=num_tokens_unpadded,
         padded_num_tokens_per_ubatch=num_tokens_padded,
         cudagraph_mode=cudagraph_mode,
         parallel_config=parallel_config,
     )
 
-    should_dp_pad = bool(torch.all(tensor[3] == 1).item())
-
-    # DP ranks should all have the same value for should_attempt_dp_padding.
-    assert should_attempt_dp_padding == should_dp_pad
+    # Synchronize cudagraph_mode across ranks first (take min).
+    # This is needed before DP padding decision since we use the synced
+    # cudagraph mode to determine whether DP padding is needed.
+    synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
 
     # Check conditions for microbatching
     should_ubatch = _post_process_ubatch(tensor, parallel_config.num_ubatches)
 
-    if should_ubatch and not should_dp_pad:
-        logger.debug_once(
-            "Microbatching has been triggered and requires DP padding. "
-            "Enabling DP padding even though it has been explicitly "
-            "disabled.",
-            scope="global",
-        )
-        should_dp_pad = True
+    # DP padding is needed when cudagraph is enabled (synced across ranks)
+    # or when ubatching/DBO is active (ubatching requires uniform batch
+    # sizes across DP ranks currently).
+    # Use the synced runtime cudagraph mode rather than the compilation config
+    # so we can avoid padding when cudagraph is not enabled for this step.
+    should_dp_pad = synced_cudagraph_mode != 0 or should_ubatch
 
     # Pad all DP ranks up to the maximum token count across ranks if
     # should_dp_pad is True
@@ -164,16 +158,12 @@ def _synchronize_dp_ranks(
         should_dp_pad,
     )
 
-    # Synchronize cudagraph_mode across ranks (take min)
-    synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
-
     return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode
 
 
 def coordinate_batch_across_dp(
     num_tokens_unpadded: int,
     allow_microbatching: bool,
-    allow_dp_padding: bool,
     parallel_config: ParallelConfig,
     num_tokens_padded: int | None = None,
     uniform_decode: bool | None = None,
@@ -187,7 +177,6 @@ def coordinate_batch_across_dp(
     Args:
         num_tokens_unpadded: Number of tokens without accounting for padding
         allow_microbatching: If microbatching should be attempted
-        allow_dp_padding: If all DP ranks should be padded up to the same value
         parallel_config: The parallel config
         num_tokens_padded: Number of tokens including any non-DP padding (CUDA graphs,
             TP, etc)
@@ -195,15 +184,15 @@ def coordinate_batch_across_dp(
             only contains single token decodes
         num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
             number of tokens per request.
-        cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL)
+        cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL).
+            DP padding is enabled when synced cudagraph mode across ranks is not NONE.
 
     Returns: tuple[
         ubatch_slices: if this is set then all DP ranks have agreed to
         microbatch
         num_tokens_after_padding: A tensor containing the total number of
         tokens per-microbatch for each DP rank including padding. Will be
-        padded up to the max value across all DP ranks when allow_dp_padding
-        is True.
+        padded up to the max value across all DP ranks when cudagraph is enabled.
         synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
     ]
 
@@ -231,7 +220,6 @@ def coordinate_batch_across_dp(
             num_tokens_unpadded,
             num_tokens_padded,
             should_attempt_ubatching,
-            allow_dp_padding,
             cudagraph_mode,
             parallel_config,
         )
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 768a7ee4b3f5..5e8de1429711 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2300,7 +2300,7 @@ def _prepare_kv_sharing_fast_prefill(
         )
         # Dispatch for the decoder portion of the model.
         _, batch_desc = self.cudagraph_dispatcher.dispatch(
-            num_logits, disable_full=True
+            num_logits, invalid_modes={CUDAGraphMode.FULL}
         )
         num_logits_padded = batch_desc.num_tokens
         logits_indices_padded = self.kv_sharing_fast_prefill_logits_indices[
@@ -3174,20 +3174,19 @@ def _determine_batch_execution_and_padding(
         has_lora = num_active_loras > 0 if force_has_lora is None else force_has_lora
 
         num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
-        dispatch_cudagraph = (
-            lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
+
+        def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None):
+            return self.cudagraph_dispatcher.dispatch(
                 num_tokens=num_tokens,
                 has_lora=has_lora,
                 uniform_decode=uniform_decode,
-                disable_full=disable_full,
                 num_active_loras=num_active_loras,
+                valid_modes={CUDAGraphMode.NONE} if force_eager else valid_modes,
+                invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
             )
-            if not force_eager
-            else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
-        )
 
         cudagraph_mode, batch_descriptor = dispatch_cudagraph(
-            num_tokens_padded, use_cascade_attn or has_encoder_output
+            num_tokens_padded, disable_full=use_cascade_attn or has_encoder_output
         )
         num_tokens_padded = batch_descriptor.num_tokens
         if self.compilation_config.pass_config.enable_sp:
@@ -3204,20 +3203,11 @@ def _determine_batch_execution_and_padding(
         # across ranks
         should_ubatch, num_tokens_across_dp = False, None
         if self.vllm_config.parallel_config.data_parallel_size > 1:
-            # Disable DP padding when running eager to avoid excessive padding when
-            # running prefills. This lets us set cudagraph_mode="NONE" on the prefiller
-            # in a P/D setup and still use CUDA graphs (enabled by this padding) on the
-            # decoder.
-            allow_dp_padding = (
-                self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            )
-
             should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = (
                 coordinate_batch_across_dp(
                     num_tokens_unpadded=num_tokens,
                     parallel_config=self.parallel_config,
                     allow_microbatching=allow_microbatching,
-                    allow_dp_padding=allow_dp_padding,
                     num_tokens_padded=num_tokens_padded,
                     uniform_decode=uniform_decode,
                     num_scheduled_tokens_per_request=num_scheduled_tokens_np,
@@ -3232,7 +3222,7 @@ def _determine_batch_execution_and_padding(
                 # Re-dispatch with DP padding so we have the correct batch_descriptor
                 cudagraph_mode, batch_descriptor = dispatch_cudagraph(
                     num_tokens_padded,
-                    disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
+                    valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
                 )
                 # Assert to make sure the agreed upon token count is correct otherwise
                 # num_tokens_across_dp will no-longer be valid
@@ -4724,7 +4714,7 @@ def _dummy_run(
 
         assert (
             cudagraph_runtime_mode is None
-            or cudagraph_runtime_mode.valid_runtime_modes()
+            or cudagraph_runtime_mode.is_valid_runtime_mode()
         )
 
         # If cudagraph_mode.decode_mode() == FULL and
@@ -5336,7 +5326,7 @@ def _capture_cudagraphs(
     ):
         assert (
             cudagraph_runtime_mode != CUDAGraphMode.NONE
-            and cudagraph_runtime_mode.valid_runtime_modes()
+            and cudagraph_runtime_mode.is_valid_runtime_mode()
         ), f"Invalid cudagraph runtime mode: {cudagraph_runtime_mode}"
 
         if not batch_descriptors:

From 1f3dbd95fd13849e974f1f31ff36b3e91d7768bc Mon Sep 17 00:00:00 2001
From: Jakub Zakrzewski <jzakrzewski@nvidia.com>
Date: Fri, 27 Feb 2026 21:41:24 +0100
Subject: [PATCH 088/154] [Bugfix][Model] Fix gpt-oss batch invariance (#35404)

Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com>
---
 vllm/model_executor/layers/linear.py  |  7 +------
 vllm/model_executor/models/gpt_oss.py | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 5fc9fa073180..7c228cc9083d 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -28,7 +28,6 @@
 )
 from vllm.model_executor.layers.utils import (
     dispatch_unquantized_gemm,
-    is_layer_moe_router_gate,
 )
 from vllm.model_executor.parameter import (
     BasevLLMParameter,
@@ -257,11 +256,7 @@ def apply(
         x: torch.Tensor,
         bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        if (
-            vllm_is_batch_invariant()
-            and current_platform.is_cuda_alike()
-            and is_layer_moe_router_gate(getattr(layer, "prefix", ""))
-        ):
+        if vllm_is_batch_invariant() and current_platform.is_cuda_alike():
             return linear_batch_invariant(x, layer.weight, bias)
         return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
 
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index fd7050861678..ce13048d1e8f 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -23,7 +23,11 @@
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_BLOCK_SIZE
@@ -165,7 +169,14 @@ def __init__(
         self.hidden_size = config.hidden_size
         self.experts_per_token = config.num_experts_per_tok
         self.world_size = dist.get_world_size() if dist.is_initialized() else 1
-        self.router = torch.nn.Linear(config.hidden_size, config.num_local_experts)
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=True,
+            quant_config=None,
+            prefix=f"{prefix}.router",
+            return_bias=False,
+        )
         assert config.intermediate_size % self.world_size == 0
         self.experts = FusedMoE(
             num_experts=config.num_local_experts,

From 2ce6f3cf67934ebe199188c9a1f83ff1c2d8ba96 Mon Sep 17 00:00:00 2001
From: Aaron Hao <ahao@anyscale.com>
Date: Fri, 27 Feb 2026 12:45:21 -0800
Subject: [PATCH 089/154] [Feat][RL][2/2] Native Weight Syncing API: IPC
 (#34171)

Signed-off-by: hao-aaron <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
---
 .buildkite/test-amd.yaml                      |   3 +-
 .buildkite/test_areas/distributed.yaml        |   3 +-
 .../new_weight_syncing/rlhf_async_new_apis.py |   8 +-
 .../new_weight_syncing/rlhf_ipc.py            | 149 ++++++
 .../{rlhf.py => rlhf_nccl.py}                 |  12 +-
 .../new_weight_syncing/rlhf_http_ipc.py       | 181 +++++++
 .../rlhf_http_nccl.py}                        |   8 +-
 tests/distributed/test_weight_transfer.py     | 455 +++++++++++++++++-
 tools/pre_commit/check_forbidden_imports.py   |   2 +
 vllm/config/weight_transfer.py                |   2 +-
 vllm/distributed/weight_transfer/base.py      |  29 +-
 vllm/distributed/weight_transfer/factory.py   |   6 +
 .../distributed/weight_transfer/ipc_engine.py | 291 +++++++++++
 .../weight_transfer/nccl_engine.py            |  85 ++--
 14 files changed, 1189 insertions(+), 45 deletions(-)
 create mode 100644 examples/offline_inference/new_weight_syncing/rlhf_ipc.py
 rename examples/offline_inference/new_weight_syncing/{rlhf.py => rlhf_nccl.py} (97%)
 create mode 100644 examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
 rename examples/online_serving/{rlhf_http.py => new_weight_syncing/rlhf_http_nccl.py} (98%)
 create mode 100644 vllm/distributed/weight_transfer/ipc_engine.py

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ffdf4b83c0e2..65701b78bce6 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -278,7 +278,8 @@ steps:
   - popd
   # NEW rlhf examples
   - pushd ../examples/offline_inference/new_weight_syncing
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
   - popd
 
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 9b5b002f4b70..0a75bc50e484 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -103,7 +103,8 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   # NEW rlhf examples
   - cd new_weight_syncing
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
 
 - label: Distributed Tests (8 GPUs)(H100)
   timeout_in_minutes: 10
diff --git a/examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py b/examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
index 8714eb92b0b3..88b89fbfc403 100644
--- a/examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
+++ b/examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
@@ -42,6 +42,7 @@
     WeightTransferUpdateRequest,
 )
 from vllm.distributed.weight_transfer.nccl_engine import (
+    NCCLTrainerSendWeightsArgs,
     NCCLWeightTransferEngine,
     NCCLWeightTransferInitInfo,
     NCCLWeightTransferUpdateInfo,
@@ -152,11 +153,14 @@ def init_weight_transfer_group(self, world_size):
 
     def broadcast_weights(self, packed: bool = True):
         """Broadcast weights to the inference engine."""
-        NCCLWeightTransferEngine.trainer_send_weights(
-            iterator=self.model.named_parameters(),
+        trainer_args = NCCLTrainerSendWeightsArgs(
             group=self.model_update_group,
             packed=packed,
         )
+        NCCLWeightTransferEngine.trainer_send_weights(
+            iterator=self.model.named_parameters(),
+            trainer_args=trainer_args,
+        )
 
     @torch.inference_mode()
     def generate(self, token_ids: list[int], max_new_tokens: int) -> list[int]:
diff --git a/examples/offline_inference/new_weight_syncing/rlhf_ipc.py b/examples/offline_inference/new_weight_syncing/rlhf_ipc.py
new file mode 100644
index 000000000000..169b1026ad4a
--- /dev/null
+++ b/examples/offline_inference/new_weight_syncing/rlhf_ipc.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray,
+with IPC-based weight syncing APIs
+
+The script colocates the training and inference workloads onto the same GPU using Ray.
+
+The example performs the following steps:
+
+* Request a placement group of 1 GPU.
+* Place the inference model on the above GPU using the placement group.
+* Place and load the training model on the same GPU using the placement group.
+* Generate text from a list of prompts using the inference engine.
+* Update the weights of the training model and broadcast the updated weights
+  to the inference engine by using CUDA IPC handles. Note that
+  for demonstration purposes we simply zero out the weights.
+
+This example assumes a single-node cluster with a single GPU,
+but can be extended to multiple GPUs.
+"""
+
+import os
+
+import ray
+from ray.util.placement_group import placement_group
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from transformers import AutoModelForCausalLM
+
+from vllm import LLM, SamplingParams
+from vllm.config import WeightTransferConfig
+from vllm.distributed.weight_transfer.ipc_engine import (
+    IPCTrainerSendWeightsArgs,
+    IPCWeightTransferEngine,
+)
+
+
+class MyLLM(LLM):
+    """Configure the vLLM worker for Ray placement group execution."""
+
+    def __init__(self, *args, **kwargs):
+        # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
+        # so that vLLM can manage its own device placement within the worker.
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        # Each worker uses 0.4 GPU so that two instances fit on the same GPU.
+        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
+        os.environ["VLLM_RAY_BUNDLE_INDICES"] = "0"
+        # needed for ipc handle serialization
+        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
+        super().__init__(*args, **kwargs)
+
+
+# Load the OPT-125M model onto GPU 0 for the training workload.
+
+MODEL_NAME = "facebook/opt-125m"
+
+
+@ray.remote
+class TrainModel:
+    def __init__(self, llm_handle: ray.actor.ActorHandle):
+        self.train_model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+        )
+        self.train_model.to("cuda:0")
+        self.llm_handle = llm_handle
+
+    def init_weight_transfer(self):
+        # IPC backend doesn't need initialization info
+        ray.get(
+            self.llm_handle.init_weight_transfer_engine.remote(dict(init_info=dict()))
+        )
+
+    def broadcast_weights(self, llm_handle: ray.actor.ActorHandle):
+        """Broadcast weights to the inference engine using IPC."""
+        self.llm_handle = llm_handle
+        trainer_args = IPCTrainerSendWeightsArgs(mode="ray", llm_handle=llm_handle)
+        IPCWeightTransferEngine.trainer_send_weights(
+            iterator=self.train_model.named_parameters(),
+            trainer_args=trainer_args,
+        )
+
+
+ray.init()
+
+pg_colocate = placement_group([{"GPU": 1, "CPU": 0}])
+ray.get(pg_colocate.ready())
+
+
+llm = ray.remote(
+    num_cpus=0,
+    num_gpus=0,
+    scheduling_strategy=PlacementGroupSchedulingStrategy(
+        placement_group=pg_colocate,
+        placement_group_capture_child_tasks=True,
+    ),
+)(MyLLM).remote(
+    model=MODEL_NAME,
+    enforce_eager=True,
+    tensor_parallel_size=1,
+    distributed_executor_backend="ray",
+    gpu_memory_utilization=0.7,
+    weight_transfer_config=WeightTransferConfig(backend="ipc"),
+    load_format="dummy",
+)
+
+train_model = TrainModel.options(
+    num_gpus=0.1,
+    num_cpus=0,
+    scheduling_strategy=PlacementGroupSchedulingStrategy(
+        placement_group=pg_colocate, placement_group_capture_child_tasks=True
+    ),
+).remote(llm)
+
+
+# Generate text from the prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+sampling_params = SamplingParams(temperature=0)
+
+outputs = ray.get(llm.generate.remote(prompts, sampling_params))
+
+print("-" * 50)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    print("-" * 50)
+
+ray.get(llm.sleep.remote(level=0))
+
+ray.get(train_model.init_weight_transfer.remote())
+# Synchronize the updated weights to the inference engine using batched API.
+ray.get(train_model.broadcast_weights.remote(llm))
+
+ray.get(llm.wake_up.remote(tags=["scheduling"]))
+
+# Generate text with the updated model.
+outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+print("-" * 50)
+for output in outputs_updated:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    print("-" * 50)
diff --git a/examples/offline_inference/new_weight_syncing/rlhf.py b/examples/offline_inference/new_weight_syncing/rlhf_nccl.py
similarity index 97%
rename from examples/offline_inference/new_weight_syncing/rlhf.py
rename to examples/offline_inference/new_weight_syncing/rlhf_nccl.py
index b3a3ca62f5a6..5d5f24a93f35 100644
--- a/examples/offline_inference/new_weight_syncing/rlhf.py
+++ b/examples/offline_inference/new_weight_syncing/rlhf_nccl.py
@@ -36,6 +36,7 @@
 from vllm import LLM, SamplingParams
 from vllm.config import WeightTransferConfig
 from vllm.distributed.weight_transfer.nccl_engine import (
+    NCCLTrainerSendWeightsArgs,
     NCCLWeightTransferEngine,
 )
 from vllm.utils.network_utils import get_ip, get_open_port
@@ -90,11 +91,14 @@ def init_weight_transfer_group(self, world_size):
 
     def broadcast_weights(self, packed: bool = True):
         """Broadcast weights to the inference engine."""
-        NCCLWeightTransferEngine.trainer_send_weights(
-            iterator=self.model.named_parameters(),
+        trainer_args = NCCLTrainerSendWeightsArgs(
             group=self.model_update_group,
             packed=packed,
         )
+        NCCLWeightTransferEngine.trainer_send_weights(
+            iterator=self.model.named_parameters(),
+            trainer_args=trainer_args,
+        )
 
 
 # Initialize Ray and set the visible devices. The vLLM engine will
@@ -156,6 +160,8 @@ def broadcast_weights(self, packed: bool = True):
     print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
     print("-" * 50)
 
+ray.get(llm.sleep.remote(level=0))
+
 # Set up the communication channel between the training process and the
 # inference engine.
 master_address, master_port = ray.get(train_model.get_master_address_and_port.remote())
@@ -197,6 +203,8 @@ def broadcast_weights(self, packed: bool = True):
 train_handle = train_model.broadcast_weights.remote(packed=True)
 ray.get([train_handle, inference_handle])
 
+ray.get(llm.wake_up.remote(tags=["scheduling"]))
+
 # Generate text with the updated model. The output is expected to be normal
 # because the weights are updated.
 outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
diff --git a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
new file mode 100644
index 000000000000..d73eba64c267
--- /dev/null
+++ b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Demonstrates reinforcement learning from human feedback (RLHF) using vLLM
+via HTTP API, with IPC-based weight syncing APIs.
+
+Unlike rlhf_nccl.py which uses NCCL and can use separate GPUs, this script
+uses CUDA IPC which requires the training model and vLLM server to be on the
+same GPU. Memory must be carefully managed to fit both models.
+
+Unlike rlhf.py which creates a vLLM instance programmatically, this script
+assumes you have already started a vLLM server using `vllm serve`. It uses:
+- OpenAI-compatible API for inference requests
+- HTTP endpoints for weight transfer control plane
+- CUDA IPC for actual weight data transfer
+
+Prerequisites:
+    Start a vLLM server with weight transfer enabled and reduced GPU memory
+    utilization to leave room for the training model:
+
+    $ VLLM_SERVER_DEV_MODE=1 VLLM_ALLOW_INSECURE_SERIALIZATION=1 \
+        vllm serve facebook/opt-125m --enforce-eager \
+        --weight-transfer-config '{"backend": "ipc"}' \
+        --load-format dummy \
+        --gpu-memory-utilization 0.5
+
+    Then run this script:
+
+    $ python rlhf_http_ipc.py
+
+The example performs the following steps:
+
+* Load the training model on GPU 0 (same GPU as the vLLM server).
+* Generate text using the vLLM server via OpenAI-compatible API. The output
+  is expected to be nonsense because the server is initialized with dummy weights.
+* Initialize weight transfer via HTTP endpoint (no-op for IPC).
+* Broadcast the real weights from the training model to the vLLM server
+  using CUDA IPC handles.
+* Generate text again to show normal output after the weight update.
+"""
+
+import os
+
+import requests
+import torch
+from openai import OpenAI
+from transformers import AutoModelForCausalLM
+
+from vllm.distributed.weight_transfer.ipc_engine import (
+    IPCTrainerSendWeightsArgs,
+    IPCWeightTransferEngine,
+)
+
+BASE_URL = "http://localhost:8000"
+MODEL_NAME = "facebook/opt-125m"
+
+# Enable insecure serialization for IPC handle serialization
+os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
+
+
+def generate_completions(client: OpenAI, model: str, prompts: list[str]) -> list[str]:
+    """Generate completions using the OpenAI-compatible API."""
+    results = []
+    for prompt in prompts:
+        response = client.completions.create(
+            model=model,
+            prompt=prompt,
+            max_tokens=32,
+            temperature=0,
+        )
+        results.append(response.choices[0].text)
+    return results
+
+
+def init_weight_transfer_engine(base_url: str) -> None:
+    """Initialize weight transfer via HTTP endpoint (no-op for IPC)."""
+    url = f"{base_url}/init_weight_transfer_engine"
+    payload = {"init_info": dict()}
+    response = requests.post(url, json=payload, timeout=60)
+    response.raise_for_status()
+
+
+def pause_generation(base_url: str) -> None:
+    """Pause generation via HTTP endpoint."""
+    url = f"{base_url}/pause"
+    response = requests.post(url, timeout=60)
+    response.raise_for_status()
+
+
+def resume_generation(base_url: str) -> None:
+    """Resume generation via HTTP endpoint."""
+    url = f"{base_url}/resume"
+    response = requests.post(url, timeout=60)
+    response.raise_for_status()
+
+
+def get_world_size(base_url: str) -> int:
+    """Get world size from the vLLM server."""
+    url = f"{base_url}/get_world_size"
+    response = requests.get(url, timeout=10)
+    response.raise_for_status()
+    return response.json()["world_size"]
+
+
+def main():
+    # IPC requires the training model to be on the same GPU as the vLLM server
+    # The server should be started on GPU 0 with reduced memory utilization
+    device = "cuda:0"
+    torch.cuda.set_device(device)
+
+    # Load the training model on the same GPU as the server
+    # Use bfloat16 to reduce memory footprint
+    print(f"Loading training model: {MODEL_NAME} on {device}")
+    print(
+        "Note: Ensure the vLLM server was started with --gpu-memory-utilization 0.5 "
+        "or lower to leave room for the training model."
+    )
+    train_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.bfloat16)
+    train_model.to(device)
+    train_model.eval()  # Set to eval mode to save memory
+
+    # Create OpenAI client pointing to the vLLM server
+    client = OpenAI(
+        base_url=f"{BASE_URL}/v1",
+        api_key="EMPTY",  # vLLM doesn't require an API key by default
+    )
+
+    # Test prompts
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Generate text before weight update. The output is expected to be nonsense
+    # because the server is initialized with dummy weights.
+    print("-" * 50)
+    print("Generating text BEFORE weight update (expect nonsense):")
+    print("-" * 50)
+    outputs = generate_completions(client, MODEL_NAME, prompts)
+    for prompt, generated_text in zip(prompts, outputs):
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    print("Initializing weight transfer (IPC backend)...")
+
+    # Initialize weight transfer on vLLM server (no-op for IPC, but still required)
+    init_weight_transfer_engine(BASE_URL)
+
+    # Pause generation before weight sync
+    pause_generation(BASE_URL)
+
+    # Broadcast weights via IPC handles using HTTP mode
+    print("Broadcasting weights via CUDA IPC (HTTP)...")
+    trainer_args = IPCTrainerSendWeightsArgs(mode="http", url=BASE_URL)
+    IPCWeightTransferEngine.trainer_send_weights(
+        iterator=train_model.named_parameters(),
+        trainer_args=trainer_args,
+    )
+
+    # Resume generation after weight sync
+    resume_generation(BASE_URL)
+
+    # Generate text after weight update. The output is expected to be normal
+    # because the real weights are now loaded.
+    print("-" * 50)
+    print("Generating text AFTER weight update:")
+    print("-" * 50)
+    outputs_updated = generate_completions(client, MODEL_NAME, prompts)
+    for prompt, generated_text in zip(prompts, outputs_updated):
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    # Note: The training model and IPC handles remain in memory.
+    # In a real RLHF training loop, you would update the training model
+    # and create new IPC handles for each weight update.
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/rlhf_http.py b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
similarity index 98%
rename from examples/online_serving/rlhf_http.py
rename to examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
index 721a038a6600..b8a6b180a8d1 100644
--- a/examples/online_serving/rlhf_http.py
+++ b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
@@ -39,6 +39,7 @@
 from transformers import AutoModelForCausalLM
 
 from vllm.distributed.weight_transfer.nccl_engine import (
+    NCCLTrainerSendWeightsArgs,
     NCCLWeightTransferEngine,
 )
 from vllm.utils.network_utils import get_ip, get_open_port
@@ -214,11 +215,14 @@ def main():
 
     # Broadcast all weights from trainer to vLLM workers
     print("Broadcasting weights via NCCL...")
-    NCCLWeightTransferEngine.trainer_send_weights(
-        iterator=train_model.named_parameters(),
+    trainer_args = NCCLTrainerSendWeightsArgs(
         group=model_update_group,
         packed=True,
     )
+    NCCLWeightTransferEngine.trainer_send_weights(
+        iterator=train_model.named_parameters(),
+        trainer_args=trainer_args,
+    )
 
     # Wait for update_weights to complete
     update_thread.join()
diff --git a/tests/distributed/test_weight_transfer.py b/tests/distributed/test_weight_transfer.py
index 4c348dd799b5..04747e7327bc 100644
--- a/tests/distributed/test_weight_transfer.py
+++ b/tests/distributed/test_weight_transfer.py
@@ -3,18 +3,26 @@
 """Tests for weight transfer engine backends.
 
 Unit tests for engine classes (parsing, validation, registry).
-Integration test for NCCL weight transfer between processes using Ray.
+Integration tests for NCCL and IPC weight transfer between processes using Ray.
 """
 
+import base64
+import pickle
 from unittest.mock import MagicMock
 
 import pytest
 import ray
 import torch
+from torch.multiprocessing.reductions import reduce_tensor
 
 from vllm.config.parallel import ParallelConfig
 from vllm.config.weight_transfer import WeightTransferConfig
 from vllm.distributed.weight_transfer import WeightTransferEngineFactory
+from vllm.distributed.weight_transfer.ipc_engine import (
+    IPCWeightTransferEngine,
+    IPCWeightTransferInitInfo,
+    IPCWeightTransferUpdateInfo,
+)
 from vllm.distributed.weight_transfer.nccl_engine import (
     NCCLWeightTransferEngine,
     NCCLWeightTransferInitInfo,
@@ -155,9 +163,29 @@ def test_create_engine_nccl(self):
         engine = WeightTransferEngineFactory.create_engine(config, parallel_config)
         assert isinstance(engine, NCCLWeightTransferEngine)
 
+    def test_create_engine_ipc(self):
+        """Test factory creates IPC engine."""
+        config = WeightTransferConfig(backend="ipc")
+        parallel_config = create_mock_parallel_config()
+        engine = WeightTransferEngineFactory.create_engine(config, parallel_config)
+        assert isinstance(engine, IPCWeightTransferEngine)
+
     def test_create_engine_invalid_backend(self):
         """Test factory raises for invalid backend."""
-        config = WeightTransferConfig(backend="invalid")
+        # Pydantic validates Literal types at construction, so we can't create
+        # a config with an invalid backend. Instead, we test by directly
+        # accessing the registry or using model_construct to bypass validation.
+        from pydantic import ValidationError
+
+        # Test that Pydantic prevents invalid backend at construction
+        with pytest.raises(ValidationError):
+            WeightTransferConfig(backend="invalid")
+
+        # Test factory error by creating a config with valid backend but
+        # then manually modifying the backend attribute (bypassing validation)
+        config = WeightTransferConfig(backend="nccl")
+        # Use object.__setattr__ to bypass Pydantic validation
+        object.__setattr__(config, "backend", "invalid")
         parallel_config = create_mock_parallel_config()
         with pytest.raises(ValueError, match="Invalid weight transfer backend"):
             WeightTransferEngineFactory.create_engine(config, parallel_config)
@@ -344,3 +372,426 @@ def test_nccl_weight_transfer_between_processes():
         f"Received shape: {result['received_shape']}, "
         f"Received sum: {result['received_sum']}"
     )
+
+
+# --- Unit Tests: IPCWeightTransferUpdateInfo Validation ---
+
+
+class TestIPCWeightTransferUpdateInfoValidation:
+    """Test IPCWeightTransferUpdateInfo dataclass validation."""
+
+    def test_valid_update_info(self):
+        """Test creating valid IPCWeightTransferUpdateInfo."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        # Create a dummy tensor and IPC handle
+        dummy_tensor = torch.ones(10, 10, device="cuda:0")
+        ipc_handle = reduce_tensor(dummy_tensor)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle}]
+
+        info = IPCWeightTransferUpdateInfo(
+            names=["layer.weight"],
+            dtype_names=["float32"],
+            shapes=[[10, 10]],
+            ipc_handles=ipc_handles,
+        )
+        assert info.names == ["layer.weight"]
+        assert info.dtype_names == ["float32"]
+        assert info.shapes == [[10, 10]]
+        assert len(info.ipc_handles) == 1
+
+    def test_mismatched_dtype_names_raises(self):
+        """Test that mismatched dtype_names length raises ValueError."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        dummy_tensor = torch.ones(10, 10, device="cuda:0")
+        ipc_handle = reduce_tensor(dummy_tensor)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle}, {gpu_uuid: ipc_handle}]
+
+        with pytest.raises(ValueError, match="dtype_names"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight", "layer.bias"],
+                dtype_names=["float32"],  # Only one dtype
+                shapes=[[10, 10], [10]],
+                ipc_handles=ipc_handles,
+            )
+
+    def test_mismatched_shapes_raises(self):
+        """Test that mismatched shapes length raises ValueError."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        dummy_tensor = torch.ones(10, 10, device="cuda:0")
+        ipc_handle = reduce_tensor(dummy_tensor)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle}, {gpu_uuid: ipc_handle}]
+
+        with pytest.raises(ValueError, match="shapes"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight", "layer.bias"],
+                dtype_names=["float32", "float32"],
+                shapes=[[10, 10]],  # Only one shape
+                ipc_handles=ipc_handles,
+            )
+
+    def test_mismatched_ipc_handles_raises(self):
+        """Test that mismatched ipc_handles length raises ValueError."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        dummy_tensor = torch.ones(10, 10, device="cuda:0")
+        ipc_handle = reduce_tensor(dummy_tensor)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle}]  # Only one handle
+
+        with pytest.raises(ValueError, match="ipc_handles"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight", "layer.bias"],
+                dtype_names=["float32", "float32"],
+                shapes=[[10, 10], [10]],
+                ipc_handles=ipc_handles,
+            )
+
+    def test_valid_update_info_from_pickled(self):
+        """Test creating IPCWeightTransferUpdateInfo from pickled handles."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        dummy_tensor = torch.ones(10, 10, device="cuda:0")
+        ipc_handle = reduce_tensor(dummy_tensor)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle}]
+
+        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")
+
+        info = IPCWeightTransferUpdateInfo(
+            names=["layer.weight"],
+            dtype_names=["float32"],
+            shapes=[[10, 10]],
+            ipc_handles_pickled=pickled,
+        )
+        assert info.ipc_handles == ipc_handles
+        assert info.ipc_handles_pickled is None
+
+    def test_both_handles_and_pickled_raises(self):
+        """Test that providing both ipc_handles and ipc_handles_pickled raises."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        dummy_tensor = torch.ones(10, 10, device="cuda:0")
+        ipc_handle = reduce_tensor(dummy_tensor)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle}]
+
+        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")
+
+        with pytest.raises(ValueError, match="Cannot specify both"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight"],
+                dtype_names=["float32"],
+                shapes=[[10, 10]],
+                ipc_handles=ipc_handles,
+                ipc_handles_pickled=pickled,
+            )
+
+    def test_neither_handles_nor_pickled_raises(self):
+        """Test that providing neither ipc_handles nor ipc_handles_pickled raises."""
+        with pytest.raises(ValueError, match="must be provided"):
+            IPCWeightTransferUpdateInfo(
+                names=["layer.weight"],
+                dtype_names=["float32"],
+                shapes=[[10, 10]],
+            )
+
+    def test_empty_lists_valid(self):
+        """Test that empty lists are valid."""
+        info = IPCWeightTransferUpdateInfo(
+            names=[],
+            dtype_names=[],
+            shapes=[],
+            ipc_handles=[],
+        )
+        assert len(info.names) == 0
+
+
+# --- Unit Tests: IPC Engine Parsing ---
+
+
+class TestIPCEngineParsing:
+    """Test IPCWeightTransferEngine parsing methods."""
+
+    def test_parse_update_info_valid(self):
+        """Test parsing valid update info dict."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        config = WeightTransferConfig(backend="ipc")
+        parallel_config = create_mock_parallel_config()
+        engine = IPCWeightTransferEngine(config, parallel_config)
+
+        # Create dummy IPC handles
+        dummy_tensor1 = torch.ones(100, 100, device="cuda:0")
+        dummy_tensor2 = torch.ones(50, device="cuda:0")
+        ipc_handle1 = reduce_tensor(dummy_tensor1)
+        ipc_handle2 = reduce_tensor(dummy_tensor2)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle1}, {gpu_uuid: ipc_handle2}]
+
+        update_info = engine.parse_update_info(
+            {
+                "names": ["w1", "w2"],
+                "dtype_names": ["float32", "bfloat16"],
+                "shapes": [[100, 100], [50]],
+                "ipc_handles": ipc_handles,
+            }
+        )
+
+        assert isinstance(update_info, IPCWeightTransferUpdateInfo)
+        assert update_info.names == ["w1", "w2"]
+        assert update_info.dtype_names == ["float32", "bfloat16"]
+        assert update_info.shapes == [[100, 100], [50]]
+        assert len(update_info.ipc_handles) == 2
+
+    def test_parse_update_info_pickled(self):
+        """Test parsing update info with pickled IPC handles (HTTP path)."""
+        if torch.cuda.device_count() < 1:
+            pytest.skip("Need at least 1 GPU for this test")
+
+        config = WeightTransferConfig(backend="ipc")
+        parallel_config = create_mock_parallel_config()
+        engine = IPCWeightTransferEngine(config, parallel_config)
+
+        dummy_tensor1 = torch.ones(100, 100, device="cuda:0")
+        dummy_tensor2 = torch.ones(50, device="cuda:0")
+        ipc_handle1 = reduce_tensor(dummy_tensor1)
+        ipc_handle2 = reduce_tensor(dummy_tensor2)
+        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
+        ipc_handles = [{gpu_uuid: ipc_handle1}, {gpu_uuid: ipc_handle2}]
+
+        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")
+
+        update_info = engine.parse_update_info(
+            {
+                "names": ["w1", "w2"],
+                "dtype_names": ["float32", "bfloat16"],
+                "shapes": [[100, 100], [50]],
+                "ipc_handles_pickled": pickled,
+            }
+        )
+
+        assert isinstance(update_info, IPCWeightTransferUpdateInfo)
+        assert update_info.names == ["w1", "w2"]
+        assert len(update_info.ipc_handles) == 2
+        assert update_info.ipc_handles_pickled is None
+        assert gpu_uuid in update_info.ipc_handles[0]
+        assert gpu_uuid in update_info.ipc_handles[1]
+
+
+# --- Integration Test: IPC Weight Transfer Between Ray Tasks ---
+
+
+def get_physical_gpu_id(device_index: int = 0) -> str:
+    """Get physical GPU UUID for a device."""
+    props = torch.cuda.get_device_properties(device_index)
+    return str(props.uuid)
+
+
+@ray.remote(num_gpus=0.5)
+class TrainerActor:
+    """Trainer actor that creates and holds CUDA IPC handles."""
+
+    def __init__(self, tensor_shape: list[int], tensor_dtype: str):
+        # Create tensor on GPU and keep it alive
+        dtype = getattr(torch, tensor_dtype)
+        self.tensor = torch.ones(tensor_shape, dtype=dtype, device="cuda:0")
+        self.tensor.fill_(42.0)  # Fill with 42 to verify correct transfer
+
+        # Create IPC handle (tensor must stay alive for IPC to work)
+        ipc_handle = reduce_tensor(self.tensor)
+        gpu_uuid = get_physical_gpu_id(0)
+
+        torch.cuda.synchronize()
+
+        self.ipc_handle_dict = {
+            "ipc_handle": ipc_handle,
+            "gpu_uuid": gpu_uuid,
+            "shape": tensor_shape,
+            "dtype": tensor_dtype,
+        }
+
+    def get_ipc_handle_dict(self) -> dict:
+        """Return IPC handle dict. Tensor stays alive in this actor."""
+        return self.ipc_handle_dict
+
+
+@ray.remote(num_gpus=0.5)
+def inference_receive_ipc_tensor(
+    ipc_handle_dict: dict,
+    mode: str = "ray",
+) -> dict:
+    """Inference task that receives tensor via IPCWeightTransferEngine."""
+    from unittest.mock import MagicMock
+
+    import torch
+
+    from vllm.config.parallel import ParallelConfig
+    from vllm.config.weight_transfer import WeightTransferConfig
+    from vllm.distributed.weight_transfer.ipc_engine import (
+        IPCWeightTransferEngine,
+    )
+
+    # Create engine with mock parallel config
+    config = WeightTransferConfig(backend="ipc")
+    parallel_config = MagicMock(spec=ParallelConfig)
+    parallel_config.rank = 0
+    parallel_config.world_size = 1
+    parallel_config.data_parallel_rank = 0
+
+    engine = IPCWeightTransferEngine(config, parallel_config)
+
+    # Initialize the engine (no-op for IPC)
+    init_info = IPCWeightTransferInitInfo()
+    engine.init_transfer_engine(init_info)
+
+    # Receive weights with a no-op load_weights that captures the tensor
+    received_tensors = []
+
+    def noop_load_weights(weights: list[tuple[str, torch.Tensor]]):
+        for name, tensor in weights:
+            # Clone tensor to keep it after engine cleans up
+            received_tensors.append((name, tensor.clone()))
+
+    # Build update dict and go through parse_update_info (exercises __post_init__)
+    ipc_handles = [{ipc_handle_dict["gpu_uuid"]: ipc_handle_dict["ipc_handle"]}]
+
+    if mode == "ray":
+        update_dict: dict = {
+            "names": ["test.weight"],
+            "dtype_names": [ipc_handle_dict["dtype"]],
+            "shapes": [ipc_handle_dict["shape"]],
+            "ipc_handles": ipc_handles,
+        }
+    elif mode == "http":
+        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")
+        update_dict = {
+            "names": ["test.weight"],
+            "dtype_names": [ipc_handle_dict["dtype"]],
+            "shapes": [ipc_handle_dict["shape"]],
+            "ipc_handles_pickled": pickled,
+        }
+    else:
+        raise ValueError(f"Unknown mode: {mode}")
+
+    update_info = engine.parse_update_info(update_dict)
+    engine.receive_weights(update_info, noop_load_weights)
+    torch.cuda.synchronize()
+
+    # Verify we received the tensor
+    success = False
+    received_shape = None
+    received_sum = None
+
+    if len(received_tensors) == 1:
+        name, tensor = received_tensors[0]
+        received_shape = list(tensor.shape)
+        received_sum = tensor.sum().item()
+        # Check shape matches and values are all 42s (trainer sends 42s)
+        if received_shape == ipc_handle_dict["shape"]:
+            expected_sum = 42.0 * torch.tensor(ipc_handle_dict["shape"]).prod().item()
+            if abs(received_sum - expected_sum) < 0.01:
+                success = True
+
+    engine.shutdown()
+
+    return {
+        "success": success,
+        "received_shape": received_shape,
+        "received_sum": received_sum,
+    }
+
+
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 1,
+    reason="Need at least 1 GPU to run IPC weight transfer test.",
+)
+@pytest.mark.parametrize("mode", ["ray", "http"])
+def test_ipc_weight_transfer_between_processes(mode: str):
+    """Test IPC weight transfer from trainer to inference process using Ray.
+
+    Parametrized over transport modes:
+    - 'ray':  ipc_handles passed directly.
+    - 'http': ipc_handles pickled + base64-encoded, unpickled via __post_init__.
+
+    IPC requires same-GPU access, so we use a placement group to co-locate
+    the trainer actor and inference task on the same GPU.
+    """
+    from ray.util.placement_group import placement_group
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+    ray.init(ignore_reinit_error=True)
+
+    # Create a placement group to ensure both processes are on the same GPU
+    # Use fractional GPUs so both tasks can share the same GPU bundle
+    pg = placement_group([{"GPU": 1, "CPU": 2}])
+    ray.get(pg.ready())
+
+    scheduling_strategy = PlacementGroupSchedulingStrategy(
+        placement_group=pg,
+        placement_group_capture_child_tasks=True,
+    )
+
+    # Tensor to transfer: 100x100 filled with 42s
+    tensor_shape = [100, 100]
+    tensor_dtype = "float32"
+
+    # Create trainer actor that holds the tensor and IPC handle (stays alive)
+    trainer_actor = TrainerActor.options(  # type: ignore[attr-defined]
+        scheduling_strategy=scheduling_strategy
+    ).remote(tensor_shape, tensor_dtype)
+
+    # Get IPC handle dict (tensor stays alive in trainer actor)
+    ipc_handle_dict = ray.get(trainer_actor.get_ipc_handle_dict.remote())
+
+    # Receive tensor in inference process using IPC handles (on same GPU)
+    # Trainer actor stays alive during this operation
+    inference_result = ray.get(
+        inference_receive_ipc_tensor.options(
+            scheduling_strategy=scheduling_strategy
+        ).remote(ipc_handle_dict, mode=mode)
+    )
+
+    assert inference_result["success"], (
+        f"IPC weight transfer failed (mode={mode}). "
+        f"Received shape: {inference_result['received_shape']}, "
+        f"Received sum: {inference_result['received_sum']}"
+    )
+
+
+def test_ipc_receive_weights_missing_gpu_uuid_raises():
+    """Test that receive_weights raises if GPU UUID not found in IPC handles."""
+    if torch.cuda.device_count() < 1:
+        pytest.skip("Need at least 1 GPU for this test")
+
+    config = WeightTransferConfig(backend="ipc")
+    parallel_config = create_mock_parallel_config()
+    engine = IPCWeightTransferEngine(config, parallel_config)
+
+    # Create IPC handle with wrong GPU UUID
+    dummy_tensor = torch.ones(10, 10, device="cuda:0")
+    ipc_handle = reduce_tensor(dummy_tensor)
+    wrong_uuid = "wrong-uuid-12345"
+    ipc_handles = [{wrong_uuid: ipc_handle}]
+
+    update_info = IPCWeightTransferUpdateInfo(
+        names=["w"],
+        dtype_names=["float32"],
+        shapes=[[10, 10]],
+        ipc_handles=ipc_handles,
+    )
+
+    with pytest.raises(ValueError, match="IPC handle not found"):
+        engine.receive_weights(update_info, lambda x: None)
diff --git a/tools/pre_commit/check_forbidden_imports.py b/tools/pre_commit/check_forbidden_imports.py
index 009e9bcbc4c5..786610138351 100644
--- a/tools/pre_commit/check_forbidden_imports.py
+++ b/tools/pre_commit/check_forbidden_imports.py
@@ -37,6 +37,8 @@ class ForbiddenImport:
             "vllm/distributed/device_communicators/all_reduce_utils.py",
             "vllm/distributed/device_communicators/shm_broadcast.py",
             "vllm/distributed/device_communicators/shm_object_storage.py",
+            "vllm/distributed/weight_transfer/ipc_engine.py",
+            "tests/distributed/test_weight_transfer.py",
             "vllm/utils/hashing.py",
             "tests/multimodal/media/test_base.py",
             "tests/tokenizers_/test_hf.py",
diff --git a/vllm/config/weight_transfer.py b/vllm/config/weight_transfer.py
index 855b0d915bbb..1da1f96cb7e4 100644
--- a/vllm/config/weight_transfer.py
+++ b/vllm/config/weight_transfer.py
@@ -9,5 +9,5 @@
 class WeightTransferConfig:
     """Configuration for weight transfer during RL training."""
 
-    backend: Literal["nccl"] = "nccl"
+    backend: Literal["nccl", "ipc"] = "nccl"
     """The backend to use for weight transfer."""
diff --git a/vllm/distributed/weight_transfer/base.py b/vllm/distributed/weight_transfer/base.py
index b87f190fcf7a..788dcef128e5 100644
--- a/vllm/distributed/weight_transfer/base.py
+++ b/vllm/distributed/weight_transfer/base.py
@@ -3,7 +3,7 @@
 """Base class for weight transfer engines."""
 
 from abc import ABC, abstractmethod
-from collections.abc import Callable
+from collections.abc import Callable, Iterator
 from dataclasses import KW_ONLY, dataclass, field
 from typing import Any, Generic, TypeVar
 
@@ -156,3 +156,30 @@ def shutdown(self) -> None:
         This should be called when the worker is shutting down.
         """
         raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def trainer_send_weights(
+        iterator: Iterator[tuple[str, torch.Tensor]],
+        trainer_args: dict[str, Any] | Any,
+    ) -> None:
+        """
+        Send weights from trainer to inference workers.
+
+        This is a static method that can be called from the trainer process
+        to send weights to all inference workers.
+
+        Args:
+            iterator: Iterator of model parameters. Returns (name, tensor) tuples.
+                     The tensors should be on the appropriate device for the backend.
+            trainer_args: Dictionary containing backend-specific arguments needed
+                         to send weights. The structure depends on the backend:
+                         - NCCL: Contains 'group', 'src', 'packed', etc.
+                         - IPC: Contains 'mode' ('http' or 'ray'),
+                                'llm_handle' (for Ray), 'url' (for HTTP), etc.
+
+        Example:
+            >>> param_iter = ((n, p) for n, p in model.named_parameters())
+            >>> engine.trainer_send_weights(param_iter, trainer_args)
+        """
+        raise NotImplementedError
diff --git a/vllm/distributed/weight_transfer/factory.py b/vllm/distributed/weight_transfer/factory.py
index 7235e30d1af6..f8e9c864fcc1 100644
--- a/vllm/distributed/weight_transfer/factory.py
+++ b/vllm/distributed/weight_transfer/factory.py
@@ -114,3 +114,9 @@ def create_engine(
     "vllm.distributed.weight_transfer.nccl_engine",
     "NCCLWeightTransferEngine",
 )
+
+WeightTransferEngineFactory.register_engine(
+    "ipc",
+    "vllm.distributed.weight_transfer.ipc_engine",
+    "IPCWeightTransferEngine",
+)
diff --git a/vllm/distributed/weight_transfer/ipc_engine.py b/vllm/distributed/weight_transfer/ipc_engine.py
new file mode 100644
index 000000000000..2edbec6259aa
--- /dev/null
+++ b/vllm/distributed/weight_transfer/ipc_engine.py
@@ -0,0 +1,291 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""IPC-based weight transfer engine using CUDA IPC for communication."""
+
+import base64
+import pickle
+from collections.abc import Callable, Iterator
+from dataclasses import asdict, dataclass
+from typing import Any
+
+import requests
+import torch
+from torch.multiprocessing.reductions import reduce_tensor
+
+from vllm.config.parallel import ParallelConfig
+from vllm.config.weight_transfer import WeightTransferConfig
+from vllm.distributed.weight_transfer.base import (
+    WeightTransferEngine,
+    WeightTransferInitInfo,
+    WeightTransferUpdateInfo,
+)
+
+
+@dataclass
+class IPCTrainerSendWeightsArgs:
+    """Arguments for IPC trainer_send_weights method."""
+
+    mode: str
+    """Transport mode: 'http' or 'ray'."""
+    llm_handle: Any = None
+    """Ray ObjectRef to LLM handle (required for 'ray' mode)."""
+    url: str | None = None
+    """Base URL for HTTP endpoint (required for 'http' mode)."""
+
+    def __post_init__(self):
+        """Validate that required arguments are provided for the selected mode."""
+        if self.mode == "ray" and self.llm_handle is None:
+            raise ValueError("llm_handle is required for 'ray' mode")
+        if self.mode == "http" and self.url is None:
+            raise ValueError("url is required for 'http' mode")
+        if self.mode not in ("ray", "http"):
+            raise ValueError(f"mode must be 'ray' or 'http', got {self.mode}")
+
+
+@dataclass
+class IPCWeightTransferInitInfo(WeightTransferInitInfo):
+    """Initialization info for IPC weight transfer backend. No init needed for IPC."""
+
+    pass
+
+
+@dataclass
+class IPCWeightTransferUpdateInfo(WeightTransferUpdateInfo):
+    """Update info for IPC weight transfer backend.
+
+    Accepts IPC handles either directly via ``ipc_handles`` (Ray transport)
+    or as a base64-encoded pickle via ``ipc_handles_pickled`` (HTTP transport).
+    Exactly one of the two must be provided; if ``ipc_handles_pickled`` is set
+    it is unpickled into ``ipc_handles`` during ``__post_init__``.
+    """
+
+    names: list[str]
+    dtype_names: list[str]
+    shapes: list[list[int]]
+    ipc_handles: list[dict[str, tuple[Callable, tuple]]] | None = None
+    """IPC handles mapping physical GPU UUID to (func, args) tuple.
+    Each handle is a dictionary mapping GPU UUID strings to IPC handle tuples."""
+    ipc_handles_pickled: str | None = None
+    """Base64-encoded pickled IPC handles, used for HTTP transport."""
+
+    def __post_init__(self):
+        if self.ipc_handles_pickled is not None:
+            if self.ipc_handles is not None:
+                raise ValueError(
+                    "Cannot specify both `ipc_handles` and `ipc_handles_pickled`"
+                )
+            self.ipc_handles = pickle.loads(base64.b64decode(self.ipc_handles_pickled))
+            self.ipc_handles_pickled = None
+
+        if self.ipc_handles is None:
+            raise ValueError(
+                "Either `ipc_handles` or `ipc_handles_pickled` must be provided"
+            )
+
+        num_params = len(self.names)
+        if len(self.dtype_names) != num_params:
+            raise ValueError(
+                f"`dtype_names` should be of the same size as `names`: "
+                f"got {len(self.dtype_names)} and {len(self.names)}"
+            )
+        if len(self.shapes) != num_params:
+            raise ValueError(
+                f"`shapes` should be of the same size as `names`: "
+                f"got {len(self.shapes)} and {len(self.names)}"
+            )
+        if len(self.ipc_handles) != num_params:
+            raise ValueError(
+                f"`ipc_handles` should be of the same size as `names`: "
+                f"got {len(self.ipc_handles)} and {len(self.names)}"
+            )
+
+
+class IPCWeightTransferEngine(
+    WeightTransferEngine[IPCWeightTransferInitInfo, IPCWeightTransferUpdateInfo]
+):
+    """
+    Weight transfer engine using CUDA IPC for communication between trainer and workers.
+
+    This implementation uses CUDA IPC to transfer weights from the trainer (rank 0)
+    to all inference workers in a process group. IPC handles are used to share
+    memory between processes on the same node.
+    """
+
+    # Define backend-specific dataclass types
+    init_info_cls = IPCWeightTransferInitInfo
+    update_info_cls = IPCWeightTransferUpdateInfo
+
+    def __init__(
+        self, config: WeightTransferConfig, parallel_config: ParallelConfig
+    ) -> None:
+        """
+        Initialize the IPC weight transfer engine.
+
+        Args:
+            config: The configuration for the weight transfer engine
+            parallel_config: The configuration for the parallel setup
+        """
+        super().__init__(config, parallel_config)
+
+    def init_transfer_engine(self, init_info: IPCWeightTransferInitInfo) -> None:
+        """
+        Initialize the weight transfer mechanism.
+        This is called once at the beginning of training.
+        No initialization needed for IPC backend.
+
+        Args:
+            init_info: IPC initialization info (empty)
+        """
+        pass
+
+    def receive_weights(
+        self,
+        update_info: IPCWeightTransferUpdateInfo,
+        load_weights: Callable[[list[tuple[str, torch.Tensor]]], None],
+    ) -> None:
+        """
+        Receive weights from the trainer via CUDA IPC handles.
+
+        Args:
+            update_info: IPC update info containing parameter names, dtypes, shapes,
+                        and IPC handles. Each IPC handle is a mapping between physical
+                        GPU UUID and the IPC handle tuple (func, args).
+            load_weights: Callable that loads weights into the model. Called
+                         incrementally for each weight to avoid OOM.
+        """
+        assert update_info.ipc_handles is not None
+        weights = []
+        for name, _dtype_name, _shape, ipc_handle in zip(
+            update_info.names,
+            update_info.dtype_names,
+            update_info.shapes,
+            update_info.ipc_handles,
+        ):
+            device_index = torch.cuda.current_device()
+            props = torch.cuda.get_device_properties(device_index)
+            physical_gpu_id = str(props.uuid)
+
+            if physical_gpu_id not in ipc_handle:
+                raise ValueError(
+                    f"IPC handle not found for GPU UUID {physical_gpu_id}. "
+                    f"Available UUIDs: {list(ipc_handle.keys())}"
+                )
+
+            handle = ipc_handle[physical_gpu_id]
+
+            func, args = handle
+            list_args = list(args)  # type: ignore
+            # Index 6 is the device_index parameter in torch's
+            # IPC handle tuple (rebuild_cuda_tensor). Update it
+            # to the current device since the logical index can
+            # differ between sender and receiver.
+            list_args[6] = device_index
+            weight = func(*list_args)  # type: ignore
+            weights.append((name, weight))
+
+        load_weights(weights)
+
+    def shutdown(self) -> None:
+        """
+        Shutdown the weight transfer engine.
+        """
+        pass
+
+    @staticmethod
+    def trainer_send_weights(
+        iterator: Iterator[tuple[str, torch.Tensor]],
+        trainer_args: dict[str, Any] | IPCTrainerSendWeightsArgs,
+    ) -> None:
+        """
+        Send weights from trainer to inference workers via CUDA IPC.
+
+        Supports two modes:
+        - 'ray': Sends weights via Ray RPC to a Ray-based LLM handle
+        - 'http': Sends weights via HTTP POST to a vLLM HTTP server
+
+        Args:
+            iterator: Iterator of model parameters. Returns (name, tensor) tuples.
+                     Tensors should be on the same GPU as the inference workers.
+            trainer_args: Dictionary containing IPC-specific arguments.
+                         Should contain keys from IPCTrainerSendWeightsArgs:
+                         - mode: 'ray' or 'http'
+                         - llm_handle: Ray ObjectRef (for 'ray' mode)
+                         - url: Base URL string (for 'http' mode)
+
+        Example (Ray mode):
+            >>> from vllm.distributed.weight_transfer.ipc_engine import (
+            ...     IPCWeightTransferEngine,
+            ...     IPCTrainerSendWeightsArgs,
+            ... )
+            >>> param_iter = ((n, p) for n, p in model.named_parameters())
+            >>> args = IPCTrainerSendWeightsArgs(mode="ray", llm_handle=llm_handle)
+            >>> IPCWeightTransferEngine.trainer_send_weights(param_iter, asdict(args))
+
+        Example (HTTP mode):
+            >>> args = IPCTrainerSendWeightsArgs(
+            ...     mode="http", url="http://localhost:8000"
+            ... )
+            >>> IPCWeightTransferEngine.trainer_send_weights(param_iter, asdict(args))
+        """
+        # Parse trainer args - accept either dict or dataclass instance
+        if isinstance(trainer_args, dict):
+            args = IPCTrainerSendWeightsArgs(**trainer_args)
+        else:
+            args = trainer_args
+
+        # Get physical GPU UUID
+        device_index = torch.cuda.current_device()
+        props = torch.cuda.get_device_properties(device_index)
+        gpu_uuid = str(props.uuid)
+
+        # Collect weight metadata and create IPC handles
+        names = []
+        dtype_names = []
+        shapes = []
+        ipc_handles = []
+
+        for name, tensor in iterator:
+            names.append(name)
+            dtype_names.append(str(tensor.dtype).split(".")[-1])
+            shapes.append(list(tensor.shape))
+
+            # Create IPC handle for this weight tensor
+            # The tensor must remain in memory for IPC to work
+            weight = tensor.detach().contiguous()
+            ipc_handle = reduce_tensor(weight)
+            ipc_handles.append({gpu_uuid: ipc_handle})
+
+        # Send weights based on mode
+        if args.mode == "ray":
+            # Ray mode: send via Ray RPC
+            import ray
+
+            update_info = asdict(
+                IPCWeightTransferUpdateInfo(
+                    names=names,
+                    dtype_names=dtype_names,
+                    shapes=shapes,
+                    ipc_handles=ipc_handles,
+                )
+            )
+            ray.get(
+                args.llm_handle.update_weights.remote(dict(update_info=update_info))
+            )
+        elif args.mode == "http":
+            # HTTP mode: send via HTTP POST with pickled handles
+            # Pickle and base64 encode IPC handles for HTTP transmission
+            pickled_handles = base64.b64encode(pickle.dumps(ipc_handles)).decode(
+                "utf-8"
+            )
+
+            url = f"{args.url}/update_weights"
+            payload = {
+                "update_info": {
+                    "names": names,
+                    "dtype_names": dtype_names,
+                    "shapes": shapes,
+                    "ipc_handles_pickled": pickled_handles,
+                }
+            }
+            response = requests.post(url, json=payload, timeout=300)
+            response.raise_for_status()
diff --git a/vllm/distributed/weight_transfer/nccl_engine.py b/vllm/distributed/weight_transfer/nccl_engine.py
index 5c90198bf616..e8a1091b905a 100644
--- a/vllm/distributed/weight_transfer/nccl_engine.py
+++ b/vllm/distributed/weight_transfer/nccl_engine.py
@@ -35,6 +35,32 @@ class NCCLWeightTransferInitInfo(WeightTransferInitInfo):
     world_size: int
 
 
+@dataclass
+class NCCLTrainerSendWeightsArgs:
+    """Arguments for NCCL trainer_send_weights method."""
+
+    group: Any
+    """Process group (PyNcclCommunicator) for NCCL communication."""
+    src: int = 0
+    """Source rank (default 0, trainer is typically rank 0)."""
+    post_iter_func: Callable[[tuple[str, torch.Tensor]], torch.Tensor] | None = None
+    """Optional function to apply to each (name, tensor) pair before broadcasting.
+    If None, extracts just the tensor."""
+    packed: bool = False
+    """Whether to use packed tensor broadcasting for efficiency.
+    When True, multiple tensors are batched together before broadcasting
+    to reduce NCCL communication overhead."""
+    stream: torch.cuda.Stream | None = None
+    """CUDA stream to use for broadcasting if packed is False.
+    If packed is True, new streams will be created for each buffer."""
+    packed_buffer_size_bytes: int = DEFAULT_PACKED_BUFFER_SIZE_BYTES
+    """Size in bytes for each packed tensor buffer.
+    Must match the value used in NCCLWeightTransferUpdateInfo."""
+    packed_num_buffers: int = DEFAULT_PACKED_NUM_BUFFERS
+    """Number of buffers for double/triple buffering during packed transfer.
+    Must match the value used in NCCLWeightTransferUpdateInfo."""
+
+
 @dataclass
 class NCCLWeightTransferUpdateInfo(WeightTransferUpdateInfo):
     """Update info for NCCL weight transfer backend."""
@@ -47,7 +73,7 @@ class NCCLWeightTransferUpdateInfo(WeightTransferUpdateInfo):
     When True, multiple tensors are batched together before broadcasting
     to reduce NCCL communication overhead."""
     packed_buffer_size_bytes: int = DEFAULT_PACKED_BUFFER_SIZE_BYTES
-    """Size in bytes for each packed tensor buffer. Default is 1GB.
+    """Size in bytes for each packed tensor buffer.
     Both producer and consumer must use the same value."""
     packed_num_buffers: int = DEFAULT_PACKED_NUM_BUFFERS
     """Number of buffers for double/triple buffering during packed transfer.
@@ -186,47 +212,38 @@ def shutdown(self) -> None:
     @staticmethod
     def trainer_send_weights(
         iterator: Iterator[tuple[str, torch.Tensor]],
-        group: Any,
-        src: int = 0,
-        post_iter_func: Callable[[tuple[str, torch.Tensor]], torch.Tensor]
-        | None = None,
-        packed: bool = False,
-        stream: torch.cuda.Stream | None = None,
-        packed_buffer_size_bytes: int = DEFAULT_PACKED_BUFFER_SIZE_BYTES,
-        packed_num_buffers: int = DEFAULT_PACKED_NUM_BUFFERS,
+        trainer_args: dict[str, Any] | NCCLTrainerSendWeightsArgs,
     ) -> None:
         """Broadcast weights from trainer to vLLM workers.
 
         Args:
             iterator: Iterator of model parameters. Returns (name, tensor) tuples
-            group: Process group (PyNcclCommunicator)
-            src: Source rank (default 0, trainer is typically rank 0)
-            post_iter_func: Optional function to apply to each (name, tensor) pair
-                           before broadcasting. If None, extracts just the tensor.
-            packed: Whether to use packed tensor broadcasting for efficiency.
-                   When True, multiple tensors are batched together before
-                   broadcasting to reduce NCCL communication overhead.
-            stream: CUDA stream to use for broadcasting if packed is False.
-                    If packed is True, new streams will be created for each buffer.
-            packed_buffer_size_bytes: Size in bytes for each packed tensor buffer.
-                   Must match the value used in NCCLWeightTransferUpdateInfo.
-            packed_num_buffers: Number of buffers for double/triple buffering.
-                   Must match the value used in NCCLWeightTransferUpdateInfo.
+            trainer_args: Dictionary or NCCLTrainerSendWeightsArgs instance containing
+                         NCCL-specific arguments. If a dict, should contain keys from
+                         NCCLTrainerSendWeightsArgs.
 
         Example:
             >>> from vllm.distributed.weight_transfer.nccl_engine import (
             ...     NCCLWeightTransferEngine,
+            ...     NCCLTrainerSendWeightsArgs,
             ... )
             >>> param_iter = ((n, p) for n, p in model.named_parameters())
-            >>> NCCLWeightTransferEngine.trainer_send_weights(
-            ...     param_iter, group, packed=True
-            ... )
+            >>> args = NCCLTrainerSendWeightsArgs(group=group, packed=True)
+            >>> NCCLWeightTransferEngine.trainer_send_weights(param_iter, args)
         """
-        if post_iter_func is None:
+        # Parse trainer args - accept either dict or dataclass instance
+        if isinstance(trainer_args, dict):
+            args = NCCLTrainerSendWeightsArgs(**trainer_args)
+        else:
+            args = trainer_args
+
+        if args.post_iter_func is None:
             # Default: extract just the tensor from (name, tensor) tuple
             post_iter_func = lambda x: x[1]
+        else:
+            post_iter_func = args.post_iter_func
 
-        if packed:
+        if args.packed:
             # Use packed tensor broadcasting for efficiency
             from vllm.distributed.weight_transfer.packed_tensor import (
                 packed_broadcast_producer,
@@ -234,18 +251,20 @@ def trainer_send_weights(
 
             packed_broadcast_producer(
                 iterator=iterator,
-                group=group,
-                src=src,
+                group=args.group,
+                src=args.src,
                 post_iter_func=post_iter_func,
-                buffer_size_bytes=packed_buffer_size_bytes,
-                num_buffers=packed_num_buffers,
+                buffer_size_bytes=args.packed_buffer_size_bytes,
+                num_buffers=args.packed_num_buffers,
             )
         else:
             # Use simple one-by-one broadcasting
             for item in iterator:
                 tensor = post_iter_func(item)
-                group.broadcast(
-                    tensor, src=src, stream=stream or torch.cuda.current_stream()
+                args.group.broadcast(
+                    tensor,
+                    src=args.src,
+                    stream=args.stream or torch.cuda.current_stream(),
                 )
 
     @staticmethod

From 9fa6c68fa627c7ab041c48ac9987fb093719597f Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Fri, 27 Feb 2026 15:32:55 -0600
Subject: [PATCH 090/154] [ROCm] Enabling encoder and encoder-decoder on ROCm
 and AITER unified backends (#35334)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 docs/design/attention_backends.md             |  4 +-
 .../backends/rocm_aiter_unified_attn.py       | 31 ++++++++
 vllm/v1/attention/backends/rocm_attn.py       | 78 +++++++++++++++++--
 3 files changed, 106 insertions(+), 7 deletions(-)

diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index 3244ce7cc501..3d0fcd6c7716 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -171,8 +171,8 @@ Priority is **1 = highest** (tried first).
 | `FLASH_ATTN_DIFFKV` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
 | `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
-| `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | Decoder | N/A |
-| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
+| `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
+| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
 | `TREE_ATTN` |  | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
 | `TRITON_ATTN` |  | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
 
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index db6fd97c9dd9..130ccaa2d6fd 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -55,6 +55,16 @@ def use_cascade_attention(*args, **kwargs) -> bool:
     def get_builder_cls() -> type["RocmAttentionMetadataBuilder"]:
         return RocmAttentionMetadataBuilder
 
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """RocmAiterUnifiedAttention supports all attention types."""
+        return attn_type in (
+            AttentionType.DECODER,
+            AttentionType.ENCODER,
+            AttentionType.ENCODER_ONLY,
+            AttentionType.ENCODER_DECODER,
+        )
+
 
 class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl):
     def fused_output_quant_supported(self, quant_key: QuantKey):
@@ -143,6 +153,19 @@ def forward(
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
+        # Handle encoder attention differently - no KV cache needed
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return self._forward_encoder_attention(
+                query[:num_actual_tokens],
+                key[:num_actual_tokens],
+                value[:num_actual_tokens],
+                output[:num_actual_tokens],
+                attn_metadata,
+                layer,
+            )
+
         key_cache, value_cache = kv_cache.unbind(0)
 
         if self.kv_cache_dtype.startswith("fp8"):
@@ -195,6 +218,10 @@ def do_kv_cache_update(
         kv_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
     ):
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return
         key_cache, value_cache = kv_cache.unbind(0)
 
         # Reshape the input keys and values and store them in the cache.
@@ -224,6 +251,10 @@ def do_rope_and_kv_cache_update(
         kv_cache: torch.Tensor,
         layer_slot_mapping: torch.Tensor,
     ):
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return
         key_cache, value_cache = kv_cache.unbind(0)
         flash_layout = True
 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index d72293dec250..d4bfa764febe 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -205,6 +205,16 @@ def get_name() -> str:
     def get_impl_cls() -> type["RocmAttentionImpl"]:
         return RocmAttentionImpl
 
+    @classmethod
+    def supports_attn_type(cls, attn_type: str) -> bool:
+        """RocmAttention supports all attention types."""
+        return attn_type in (
+            AttentionType.DECODER,
+            AttentionType.ENCODER,
+            AttentionType.ENCODER_ONLY,
+            AttentionType.ENCODER_DECODER,
+        )
+
     @staticmethod
     def get_kv_cache_shape(
         num_blocks: int,
@@ -244,6 +254,7 @@ def __init__(
         kv_sharing_target_layer_name: int | None = None,
         sinks: torch.Tensor | None = None,
     ) -> None:
+        self.attn_type = attn_type
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -266,11 +277,6 @@ def __init__(
 
         RocmAttentionBackend.validate_head_size(head_size)
 
-        if attn_type not in [AttentionType.DECODER, AttentionType.ENCODER_DECODER]:
-            raise NotImplementedError(
-                "Encoder self-attention is not implemented for RocmAttentionImpl"
-            )
-
         self.fp8_dtype = current_platform.fp8_dtype()
 
         self.sinks = sinks
@@ -281,6 +287,54 @@ def __init__(
                 f"num_heads: {num_heads}."
             )
 
+    def _forward_encoder_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        output: torch.Tensor,
+        attn_metadata: FlashAttentionMetadata,
+        layer: torch.nn.Module,
+    ) -> torch.Tensor:
+        """Forward pass for encoder attention without KV cache.
+
+        Args:
+            query: shape = [num_encoder_tokens, num_heads, head_size]
+            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            output: shape = [num_encoder_tokens, num_heads, head_size]
+            attn_metadata: Encoder attention metadata
+            layer: The attention layer
+        """
+        # For encoder attention, process FP8 quantization if needed
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError(
+                "quantization is not supported for encoder attention"
+            )
+
+        # Use encoder-specific metadata for sequence information
+        query_start_loc = attn_metadata.query_start_loc
+        seq_lens = attn_metadata.seq_lens
+        max_query_len = attn_metadata.max_query_len
+
+        # Call flash attention directly on Q, K, V tensors
+        from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd
+
+        context_attention_fwd(
+            q=query,
+            k=key,
+            v=value,
+            o=output,
+            b_start_loc=query_start_loc,
+            b_seq_len=seq_lens,
+            max_input_len=max_query_len,
+            is_causal=False,
+            softmax_scale=self.scale,
+            sliding_window_q=self.sliding_window[0],
+            sliding_window_k=self.sliding_window[1],
+        )
+        return output
+
     def forward(
         self,
         layer: torch.nn.Module,
@@ -330,6 +384,16 @@ def forward(
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            return self._forward_encoder_attention(
+                query[:num_actual_tokens],
+                key[:num_actual_tokens],
+                value[:num_actual_tokens],
+                output[:num_actual_tokens],
+                attn_metadata,
+                layer,
+            )
+
         key_cache, value_cache = PagedAttention.split_kv_cache(
             kv_cache, self.num_kv_heads, self.head_size
         )
@@ -380,6 +444,8 @@ def do_kv_cache_update(
         kv_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
     ):
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            return
         key_cache, value_cache = PagedAttention.split_kv_cache(
             kv_cache, self.num_kv_heads, self.head_size
         )
@@ -432,6 +498,8 @@ def do_rope_and_kv_cache_update(
         kv_cache: torch.Tensor,
         layer_slot_mapping: torch.Tensor,
     ):
+        if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
+            return
         key_cache, value_cache = PagedAttention.split_kv_cache(
             kv_cache,
             layer.num_kv_heads,  # type: ignore[attr-defined]

From e3691988d0bf7c915d544d9204b53506815464ca Mon Sep 17 00:00:00 2001
From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
Date: Fri, 27 Feb 2026 16:42:30 -0600
Subject: [PATCH 091/154] [ROCm]: fix aiter rope functionalization (#35533)

Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
---
 .../passes/utility/fix_functionalization.py            | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/passes/utility/fix_functionalization.py b/vllm/compilation/passes/utility/fix_functionalization.py
index c7df5f92e874..1b656d0c890e 100644
--- a/vllm/compilation/passes/utility/fix_functionalization.py
+++ b/vllm/compilation/passes/utility/fix_functionalization.py
@@ -37,6 +37,14 @@ def __call__(self, graph: torch.fx.Graph) -> None:
 
         self.nodes_to_remove: list[torch.fx.Node] = []
         count = 0
+
+        rope_targets = [torch.ops._C.rotary_embedding.default]
+
+        if hasattr(torch.ops.vllm, "rocm_aiter_triton_rotary_embedding"):
+            rope_targets.append(
+                torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default
+            )
+
         for node in graph.nodes:
             if not is_func(node, auto_functionalized):
                 continue  # Avoid deep if-elif nesting
@@ -44,7 +52,7 @@ def __call__(self, graph: torch.fx.Graph) -> None:
             kwargs = node.kwargs
             at_target = node.args[0]
 
-            if at_target == torch.ops._C.rotary_embedding.default:
+            if at_target in rope_targets:
                 query = kwargs["query"]
                 key = kwargs["key"]
                 getitem_nodes = self.getitem_users(node)

From a201ad72d87eaaa1fe20e2f42378be4ddbc867f4 Mon Sep 17 00:00:00 2001
From: "Roberto L. Castro"
 <38211239+LopezCastroRoberto@users.noreply.github.com>
Date: Sat, 28 Feb 2026 01:28:17 +0100
Subject: [PATCH 092/154] [Refactor][Kernel] Add global helper to deduplicate
 vectorized memory ops (#35105)

Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
---
 csrc/activation_kernels.cu                    | 295 +++++-----------
 csrc/cuda_vec_utils.cuh                       | 334 ++++++++++++++++++
 .../activation_nvfp4_quant_fusion_kernels.cu  |  36 +-
 csrc/quantization/fp4/nvfp4_experts_quant.cu  |   4 +-
 csrc/quantization/fp4/nvfp4_quant_kernels.cu  |  28 +-
 csrc/quantization/fp4/nvfp4_utils.cuh         | 149 +-------
 6 files changed, 474 insertions(+), 372 deletions(-)
 create mode 100644 csrc/cuda_vec_utils.cuh

diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 99fa42f75e99..758a77795553 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -5,117 +5,11 @@
 #include <cmath>
 
 #include "cuda_compat.h"
+#include "cuda_vec_utils.cuh"
 #include "dispatch_utils.h"
 
 namespace vllm {
 
-struct alignas(32) u32x8_t {
-  uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
-};
-
-__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
-    defined(CUDA_VERSION) && CUDA_VERSION >= 12090
-  asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
-               : "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
-                 "=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
-               : "l"(ptr));
-#else
-  const uint4* uint_ptr = reinterpret_cast<const uint4*>(ptr);
-  uint4 top_half = __ldg(&uint_ptr[0]);
-  uint4 bottom_half = __ldg(&uint_ptr[1]);
-  val.u0 = top_half.x;
-  val.u1 = top_half.y;
-  val.u2 = top_half.z;
-  val.u3 = top_half.w;
-  val.u4 = bottom_half.x;
-  val.u5 = bottom_half.y;
-  val.u6 = bottom_half.z;
-  val.u7 = bottom_half.w;
-#endif
-}
-
-__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
-    defined(CUDA_VERSION) && CUDA_VERSION >= 12090
-  asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
-               :
-               : "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
-                 "r"(val.u4), "r"(val.u5), "r"(val.u6), "r"(val.u7)
-               : "memory");
-#else
-  uint4* uint_ptr = reinterpret_cast<uint4*>(ptr);
-  uint_ptr[0] = make_uint4(val.u0, val.u1, val.u2, val.u3);
-  uint_ptr[1] = make_uint4(val.u4, val.u5, val.u6, val.u7);
-#endif
-}
-
-template <bool support_256>
-struct VecTraits;
-
-template <>
-struct VecTraits<true> {
-  static constexpr int ARCH_MAX_VEC_SIZE = 32;
-  using vec_t = u32x8_t;
-};
-
-template <>
-struct VecTraits<false> {
-  static constexpr int ARCH_MAX_VEC_SIZE = 16;
-  using vec_t = int4;
-};
-
-template <typename T>
-struct PackedTraits;
-
-template <>
-struct PackedTraits<c10::BFloat16> {
-  using packed_t = __nv_bfloat162;
-};
-
-template <>
-struct PackedTraits<c10::Half> {
-  using packed_t = __half2;
-};
-
-template <>
-struct PackedTraits<float> {
-  using packed_t = float2;
-};
-
-template <typename packed_t>
-__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
-  if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
-    return __bfloat1622float2(val);
-  } else if constexpr (std::is_same_v<packed_t, __half2>) {
-    return __half22float2(val);
-  } else if constexpr (std::is_same_v<packed_t, float2>) {
-    return float2(val);
-  }
-}
-
-template <typename packed_t>
-__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
-  if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
-    return __float22bfloat162_rn(val);
-  } else if constexpr (std::is_same_v<packed_t, __half2>) {
-    return __float22half2_rn(val);
-  } else if constexpr (std::is_same_v<packed_t, float2>) {
-    return float2(val);
-  }
-}
-
-template <typename packed_t>
-__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
-                                               const packed_t& y) {
-  if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
-                std::is_same_v<packed_t, __half2>) {
-    return __hmul2(x, y);
-  } else if constexpr (std::is_same_v<packed_t, float2>) {
-    return make_float2(x.x * y.x, x.y * y.y);
-  }
-}
-
 template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
           bool act_first>
 __device__ __forceinline__ scalar_t compute(const scalar_t& x,
@@ -131,16 +25,6 @@ __device__ __forceinline__ packed_t packed_compute(const packed_t& x,
                    : packed_mul(x, PACKED_ACT_FN(y));
 }
 
-// Check if all pointers are 16-byte aligned for int4 vectorized access
-__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
-  return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
-}
-
-// Check if all pointers are 16-byte aligned for longlong4_32a vectorized access
-__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
-  return (reinterpret_cast<uintptr_t>(ptr) & 31) == 0;
-}
-
 // Activation and gating kernel template.
 template <typename scalar_t, typename packed_t,
           scalar_t (*ACT_FN)(const scalar_t&),
@@ -155,36 +39,32 @@ __global__ void act_and_mul_kernel(
   scalar_t* out_ptr = out + blockIdx.x * d;
 
   if constexpr (use_vec) {
-    // Fast path: 128-bit/256-bit vectorized loop
-    using vec_t = typename VecTraits<use_256b>::vec_t;
-    constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
-    constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
+    using cuda_t = typename CUDATypeConverter<scalar_t>::Type;
+    using pvec_t = PackedVec<cuda_t, use_256b>;
 
-    const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
-    const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
-    vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
-    const int num_vecs = d / 2 / VEC_SIZE;
+    const pvec_t* x_vec = reinterpret_cast<const pvec_t*>(x_ptr);
+    const pvec_t* y_vec = reinterpret_cast<const pvec_t*>(y_ptr);
+    pvec_t* out_vec = reinterpret_cast<pvec_t*>(out_ptr);
+    const int num_vecs = d / 2 / pvec_t::NUM_ELTS;
 
     for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
-      vec_t x, y;
+      pvec_t x, y;
       if constexpr (use_256b) {
         ld256(x, &x_vec[i]);
         ld256(y, &y_vec[i]);
       } else {
-        x = VLLM_LDG(&x_vec[i]);
-        y = VLLM_LDG(&y_vec[i]);
+        ld128(x, &x_vec[i]);
+        ld128(y, &y_vec[i]);
       }
-      auto* xp = reinterpret_cast<packed_t*>(&x);
-      auto* yp = reinterpret_cast<packed_t*>(&y);
 #pragma unroll
-      for (int j = 0; j < VEC_SIZE; j++) {
-        xp[j] =
-            packed_compute<packed_t, PACKED_ACT_FN, act_first>(xp[j], yp[j]);
+      for (int j = 0; j < pvec_t::NUM_ELTS; j++) {
+        x.elts[j] = packed_compute<packed_t, PACKED_ACT_FN, act_first>(
+            x.elts[j], y.elts[j]);
       }
       if constexpr (use_256b) {
         st256(x, &out_vec[i]);
       } else {
-        out_vec[i] = x;
+        st128(x, &out_vec[i]);
       }
     }
   } else {
@@ -272,51 +152,54 @@ packed_gelu_tanh_kernel(const packed_t& val) {
 // Launch activation and gating kernel.
 // Use ACT_FIRST (bool) indicating whether to apply the activation function
 // first.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST)     \
-  auto dtype = input.scalar_type();                                         \
-  int d = input.size(-1) / 2;                                               \
-  int64_t num_tokens = input.numel() / input.size(-1);                      \
-  if (num_tokens == 0) {                                                    \
-    return;                                                                 \
-  }                                                                         \
-  dim3 grid(num_tokens);                                                    \
-  int cc_major = at::cuda::getCurrentDeviceProperties()->major;             \
-  int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16;         \
-  int vec_size = support_vec / at::elementSize(dtype);                      \
-  const bool use_vec = (d % vec_size == 0);                                 \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));         \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();             \
-  if (use_vec) {                                                            \
-    dim3 block(std::min(d / vec_size, 1024));                               \
-    if (cc_major >= 10 && num_tokens > 128) {                               \
-      VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] {       \
-        vllm::act_and_mul_kernel<                                           \
-            scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t,      \
-            KERNEL<scalar_t>,                                               \
-            PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
-            ACT_FIRST, true, true><<<grid, block, 0, stream>>>(             \
-            out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);       \
-      });                                                                   \
-    } else {                                                                \
-      VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] {       \
-        vllm::act_and_mul_kernel<                                           \
-            scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t,      \
-            KERNEL<scalar_t>,                                               \
-            PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
-            ACT_FIRST, true, false><<<grid, block, 0, stream>>>(            \
-            out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);       \
-      });                                                                   \
-    }                                                                       \
-  } else {                                                                  \
-    dim3 block(std::min(d, 1024));                                          \
-    VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] {         \
-      vllm::act_and_mul_kernel<                                             \
-          scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t,        \
-          KERNEL<scalar_t>,                                                 \
-          PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>,   \
-          ACT_FIRST, false><<<grid, block, 0, stream>>>(                    \
-          out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);         \
-    });                                                                     \
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST)        \
+  auto dtype = input.scalar_type();                                            \
+  int d = input.size(-1) / 2;                                                  \
+  int64_t num_tokens = input.numel() / input.size(-1);                         \
+  if (num_tokens == 0) {                                                       \
+    return;                                                                    \
+  }                                                                            \
+  dim3 grid(num_tokens);                                                       \
+  int cc_major = at::cuda::getCurrentDeviceProperties()->major;                \
+  int support_vec =                                                            \
+      (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128)            \
+          ? vllm::VecTraits<true>::ARCH_MAX_VEC_SIZE                           \
+          : vllm::VecTraits<false>::ARCH_MAX_VEC_SIZE;                         \
+  int vec_size = support_vec / at::elementSize(dtype);                         \
+  const bool use_vec = (d % vec_size == 0);                                    \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
+  if (use_vec) {                                                               \
+    dim3 block(std::min(d / vec_size, 1024));                                  \
+    if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) {         \
+      VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] {          \
+        vllm::act_and_mul_kernel<                                              \
+            scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type,      \
+            KERNEL<scalar_t>,                                                  \
+            PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>, \
+            ACT_FIRST, true, true><<<grid, block, 0, stream>>>(                \
+            out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);          \
+      });                                                                      \
+    } else {                                                                   \
+      VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] {          \
+        vllm::act_and_mul_kernel<                                              \
+            scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type,      \
+            KERNEL<scalar_t>,                                                  \
+            PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>, \
+            ACT_FIRST, true, false><<<grid, block, 0, stream>>>(               \
+            out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);          \
+      });                                                                      \
+    }                                                                          \
+  } else {                                                                     \
+    dim3 block(std::min(d, 1024));                                             \
+    VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] {            \
+      vllm::act_and_mul_kernel<                                                \
+          scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type,        \
+          KERNEL<scalar_t>,                                                    \
+          PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>,   \
+          ACT_FIRST, false><<<grid, block, 0, stream>>>(                       \
+          out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);            \
+    });                                                                        \
   }
 
 void silu_and_mul(torch::Tensor& out,    // [..., d]
@@ -378,35 +261,31 @@ __global__ void act_and_mul_kernel_with_param(
   scalar_t* out_ptr = out + blockIdx.x * d;
 
   if constexpr (use_vec) {
-    // Fast path: 128-bit/256-bit vectorized loop
-    using vec_t = typename VecTraits<use_256b>::vec_t;
-    constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
-    constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
+    using cuda_t = typename CUDATypeConverter<scalar_t>::Type;
+    using pvec_t = PackedVec<cuda_t, use_256b>;
 
-    const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
-    const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
-    vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
-    const int num_vecs = d / 2 / VEC_SIZE;
+    const pvec_t* x_vec = reinterpret_cast<const pvec_t*>(x_ptr);
+    const pvec_t* y_vec = reinterpret_cast<const pvec_t*>(y_ptr);
+    pvec_t* out_vec = reinterpret_cast<pvec_t*>(out_ptr);
+    const int num_vecs = d / 2 / pvec_t::NUM_ELTS;
 
     for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
-      vec_t x, y;
+      pvec_t x, y;
       if constexpr (use_256b) {
         ld256(x, &x_vec[i]);
         ld256(y, &y_vec[i]);
       } else {
-        x = VLLM_LDG(&x_vec[i]);
-        y = VLLM_LDG(&y_vec[i]);
+        ld128(x, &x_vec[i]);
+        ld128(y, &y_vec[i]);
       }
-      auto* xp = reinterpret_cast<packed_t*>(&x);
-      auto* yp = reinterpret_cast<packed_t*>(&y);
 #pragma unroll
-      for (int j = 0; j < VEC_SIZE; j++) {
-        xp[j] = packed_mul(PACKED_ACT_FN(xp[j], param), yp[j]);
+      for (int j = 0; j < pvec_t::NUM_ELTS; j++) {
+        x.elts[j] = packed_mul(PACKED_ACT_FN(x.elts[j], param), y.elts[j]);
       }
       if constexpr (use_256b) {
         st256(x, &out_vec[i]);
       } else {
-        out_vec[i] = x;
+        st128(x, &out_vec[i]);
       }
     }
   } else {
@@ -499,21 +378,24 @@ __global__ void swigluoai_and_mul_kernel(
   }                                                                            \
   dim3 grid(num_tokens);                                                       \
   int cc_major = at::cuda::getCurrentDeviceProperties()->major;                \
-  int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16;            \
+  int support_vec =                                                            \
+      (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128)            \
+          ? vllm::VecTraits<true>::ARCH_MAX_VEC_SIZE                           \
+          : vllm::VecTraits<false>::ARCH_MAX_VEC_SIZE;                         \
   int vec_size = support_vec / at::elementSize(dtype);                         \
   const bool use_vec = (d % vec_size == 0);                                    \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
   if (use_vec) {                                                               \
     dim3 block(std::min(d / vec_size, 1024));                                  \
-    if (cc_major >= 10 && num_tokens > 128) {                                  \
+    if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) {         \
       VLLM_DISPATCH_FLOATING_TYPES(                                            \
           dtype, "act_and_mul_kernel_with_param", [&] {                        \
             vllm::act_and_mul_kernel_with_param<                               \
-                scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t,     \
+                scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type,  \
                 KERNEL<scalar_t>,                                              \
                 PACKED_KERNEL<                                                 \
-                    typename vllm::PackedTraits<scalar_t>::packed_t>,          \
+                    typename vllm::PackedTypeConverter<scalar_t>::Type>,       \
                 true, true><<<grid, block, 0, stream>>>(                       \
                 out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d,       \
                 PARAM);                                                        \
@@ -522,10 +404,10 @@ __global__ void swigluoai_and_mul_kernel(
       VLLM_DISPATCH_FLOATING_TYPES(                                            \
           dtype, "act_and_mul_kernel_with_param", [&] {                        \
             vllm::act_and_mul_kernel_with_param<                               \
-                scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t,     \
+                scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type,  \
                 KERNEL<scalar_t>,                                              \
                 PACKED_KERNEL<                                                 \
-                    typename vllm::PackedTraits<scalar_t>::packed_t>,          \
+                    typename vllm::PackedTypeConverter<scalar_t>::Type>,       \
                 true, false><<<grid, block, 0, stream>>>(                      \
                 out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d,       \
                 PARAM);                                                        \
@@ -535,9 +417,9 @@ __global__ void swigluoai_and_mul_kernel(
     dim3 block(std::min(d, 1024));                                             \
     VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \
       vllm::act_and_mul_kernel_with_param<                                     \
-          scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t,           \
+          scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type,        \
           KERNEL<scalar_t>,                                                    \
-          PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>,      \
+          PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>,   \
           false><<<grid, block, 0, stream>>>(                                  \
           out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, PARAM);     \
     });                                                                        \
@@ -629,14 +511,17 @@ __global__ void activation_kernel(
   }                                                                      \
   dim3 grid(num_tokens);                                                 \
   int cc_major = at::cuda::getCurrentDeviceProperties()->major;          \
-  int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16;      \
+  int support_vec =                                                      \
+      (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128)      \
+          ? vllm::VecTraits<true>::ARCH_MAX_VEC_SIZE                     \
+          : vllm::VecTraits<false>::ARCH_MAX_VEC_SIZE;                   \
   int vec_size = support_vec / at::elementSize(dtype);                   \
   const bool use_vec = (d % vec_size == 0);                              \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));      \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
   if (use_vec) {                                                         \
     dim3 block(std::min(d / vec_size, 1024));                            \
-    if (cc_major >= 10 && num_tokens > 128) {                            \
+    if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) {   \
       VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] {     \
         vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, true>  \
             <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
diff --git a/csrc/cuda_vec_utils.cuh b/csrc/cuda_vec_utils.cuh
new file mode 100644
index 000000000000..82a19f10a70e
--- /dev/null
+++ b/csrc/cuda_vec_utils.cuh
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+#pragma once
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#include <cassert>
+
+#ifdef USE_ROCM
+  #include <hip/hip_runtime.h>
+#else
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+  #include <cuda_runtime.h>
+#endif
+
+// Device-side: SM100+ architecture with CUDA 12.9+ toolkit, which
+// together enable 256-bit (v8.u32) PTX load/store instructions.
+// Use for PTX instruction selection with architecture fallback paths.
+#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
+    defined(CUDA_VERSION) && CUDA_VERSION >= 12090
+  #define VLLM_256B_PTX_ENABLED 1
+#else
+  #define VLLM_256B_PTX_ENABLED 0
+#endif
+
+namespace vllm {
+
+// ============================================================
+// Types and traits
+// ============================================================
+
+// 256-bit (32-byte) aligned vector type: 8 x uint32_t
+struct alignas(32) u32x8_t {
+  uint32_t d[8];
+};
+
+// VecTraits — select between 128-bit (int4) and 256-bit
+// (u32x8_t) vector types at compile time.
+template <bool support_256>
+struct VecTraits;
+
+template <>
+struct VecTraits<true> {
+  static constexpr int ARCH_MAX_VEC_SIZE = 32;
+  using vec_t = u32x8_t;
+};
+
+template <>
+struct VecTraits<false> {
+  static constexpr int ARCH_MAX_VEC_SIZE = 16;
+  using vec_t = int4;
+};
+
+// PackedTypeConverter — map between CUDA scalar and packed types
+//   half  <-> half2,  __nv_bfloat16 <-> __nv_bfloat162, etc.
+template <typename T>
+struct PackedTypeConverter {
+  static_assert(sizeof(T) == 0,
+                "PackedTypeConverter is not specialized for this type.");
+};
+
+template <>
+struct PackedTypeConverter<half2> {
+  using Type = half;
+};
+
+template <>
+struct PackedTypeConverter<half> {
+  using Type = half2;
+};
+
+template <>
+struct PackedTypeConverter<__nv_bfloat162> {
+  using Type = __nv_bfloat16;
+};
+
+template <>
+struct PackedTypeConverter<__nv_bfloat16> {
+  using Type = __nv_bfloat162;
+};
+
+template <>
+struct PackedTypeConverter<float> {
+  using Type = float2;
+};
+
+template <>
+struct PackedTypeConverter<float2> {
+  using Type = float;
+};
+
+template <>
+struct PackedTypeConverter<c10::Half> {
+  using Type = half2;
+};
+
+template <>
+struct PackedTypeConverter<c10::BFloat16> {
+  using Type = __nv_bfloat162;
+};
+
+// CUDATypeConverter — map PyTorch scalar types to CUDA scalar
+//   c10::Half -> half,  c10::BFloat16 -> __nv_bfloat16
+template <typename T>
+struct CUDATypeConverter {
+  using Type = T;
+};
+
+template <>
+struct CUDATypeConverter<c10::Half> {
+  using Type = half;
+};
+
+template <>
+struct CUDATypeConverter<c10::BFloat16> {
+  using Type = __nv_bfloat16;
+};
+
+// PackedVec — typed vector container for packed element access.
+//   Derives alignment and element count from VecTraits.
+//   Type is the CUDA scalar type (e.g. half, __nv_bfloat16).
+template <class Type, bool use_256b>
+struct alignas(VecTraits<use_256b>::ARCH_MAX_VEC_SIZE) PackedVec {
+  static constexpr int NUM_ELTS =
+      VecTraits<use_256b>::ARCH_MAX_VEC_SIZE /
+      sizeof(typename PackedTypeConverter<Type>::Type);
+  typename PackedTypeConverter<Type>::Type elts[NUM_ELTS];
+};
+
+// ============================================================
+// Load / store primitives
+// ============================================================
+
+// 256-bit load / store — SM100+ only (PTX v8 instructions).
+__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
+#if VLLM_256B_PTX_ENABLED
+  asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
+               : "=r"(val.d[0]), "=r"(val.d[1]), "=r"(val.d[2]), "=r"(val.d[3]),
+                 "=r"(val.d[4]), "=r"(val.d[5]), "=r"(val.d[6]), "=r"(val.d[7])
+               : "l"(ptr));
+#else
+  assert(false && "ld256 requires SM100+ with CUDA 12.9+");
+#endif
+}
+
+__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
+#if VLLM_256B_PTX_ENABLED
+  asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
+               :
+               : "l"(ptr), "r"(val.d[0]), "r"(val.d[1]), "r"(val.d[2]),
+                 "r"(val.d[3]), "r"(val.d[4]), "r"(val.d[5]), "r"(val.d[6]),
+                 "r"(val.d[7])
+               : "memory");
+#else
+  assert(false && "st256 requires SM100+ with CUDA 12.9+");
+#endif
+}
+
+// Generic ld256 / st256 for any 32-byte aligned type (e.g. PackedVec).
+// Non-template overloads above are preferred for u32x8_t.
+template <typename T>
+__device__ __forceinline__ void ld256(T& val, const T* ptr) {
+  static_assert(sizeof(T) == 32, "ld256 requires a 32-byte type");
+  ld256(reinterpret_cast<u32x8_t&>(val), reinterpret_cast<const u32x8_t*>(ptr));
+}
+
+template <typename T>
+__device__ __forceinline__ void st256(T& val, T* ptr) {
+  static_assert(sizeof(T) == 32, "st256 requires a 32-byte type");
+  st256(reinterpret_cast<u32x8_t&>(val), reinterpret_cast<u32x8_t*>(ptr));
+}
+
+// 128-bit load / store via __ldg (read-only cache hint).
+template <typename T>
+__device__ __forceinline__ void ld128(T& val, const T* ptr) {
+  static_assert(sizeof(T) == 16, "ld128 requires a 16-byte type");
+  *reinterpret_cast<int4*>(&val) = __ldg(reinterpret_cast<const int4*>(ptr));
+}
+
+template <typename T>
+__device__ __forceinline__ void st128(T& val, T* ptr) {
+  static_assert(sizeof(T) == 16, "st128 requires a 16-byte type");
+  *reinterpret_cast<int4*>(ptr) = *reinterpret_cast<int4*>(&val);
+}
+
+// 256-bit cache-streaming (.cs) load / store  — SM100+ only.
+__forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
+#if VLLM_256B_PTX_ENABLED
+  u32x8_t val;
+  asm volatile("ld.global.cs.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];"
+               : "=r"(val.d[0]), "=r"(val.d[1]), "=r"(val.d[2]), "=r"(val.d[3]),
+                 "=r"(val.d[4]), "=r"(val.d[5]), "=r"(val.d[6]), "=r"(val.d[7])
+               : "l"(addr));
+  return val;
+#else
+  assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
+  return {};
+#endif
+}
+
+__forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
+#if VLLM_256B_PTX_ENABLED
+  asm volatile(
+      "st.global.cs.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};" ::"l"(addr),
+      "r"(val.d[0]), "r"(val.d[1]), "r"(val.d[2]), "r"(val.d[3]), "r"(val.d[4]),
+      "r"(val.d[5]), "r"(val.d[6]), "r"(val.d[7]));
+#else
+  assert(false && "st256_cs requires SM100+ with CUDA 12.9+");
+#endif
+}
+
+// 32-bit cache-streaming (.cs) load / store  — SM100+ only.
+__forceinline__ __device__ int ld32_cs(const int* addr) {
+#if VLLM_256B_PTX_ENABLED
+  int val;
+  asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
+  return val;
+#else
+  assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
+  return 0;
+#endif
+}
+
+__forceinline__ __device__ void st32_cs(int* addr, int val) {
+#if VLLM_256B_PTX_ENABLED
+  asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
+#else
+  assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
+#endif
+}
+
+// Predicated 256-bit / 128-bit cache-global (.cg) loads.
+// Returns zero if pred is false.  SM100+ only.
+__device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
+                                                 bool pred) {
+#if VLLM_256B_PTX_ENABLED
+  asm volatile(
+      "{\n"
+      "  .reg .pred pr;\n"
+      "  setp.ne.u32 pr, %8, 0;\n"
+      "  mov.u32 %0, 0;\n"
+      "  mov.u32 %1, 0;\n"
+      "  mov.u32 %2, 0;\n"
+      "  mov.u32 %3, 0;\n"
+      "  mov.u32 %4, 0;\n"
+      "  mov.u32 %5, 0;\n"
+      "  mov.u32 %6, 0;\n"
+      "  mov.u32 %7, 0;\n"
+      "  @pr ld.global.cg.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%9];\n"
+      "}\n"
+      : "=r"(val.d[0]), "=r"(val.d[1]), "=r"(val.d[2]), "=r"(val.d[3]),
+        "=r"(val.d[4]), "=r"(val.d[5]), "=r"(val.d[6]), "=r"(val.d[7])
+      : "r"((int)pred), "l"(ptr));
+#else
+  assert(false && "ld256_cg_or_zero requires SM100+ with CUDA 12.9+");
+#endif
+}
+
+__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
+                                                 bool pred) {
+#if VLLM_256B_PTX_ENABLED
+  uint32_t r0, r1, r2, r3;
+
+  asm volatile(
+      "{\n"
+      "  .reg .pred pr;\n"
+      "  setp.ne.u32 pr, %4, 0;\n"
+      "  mov.u32 %0, 0;\n"
+      "  mov.u32 %1, 0;\n"
+      "  mov.u32 %2, 0;\n"
+      "  mov.u32 %3, 0;\n"
+      "  @pr ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%5];\n"
+      "}\n"
+      : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
+      : "r"((int)pred), "l"(ptr));
+
+  val = uint4{r0, r1, r2, r3};
+#else
+  assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
+#endif
+}
+
+// ============================================================
+// Alignment helpers
+// ============================================================
+
+__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
+}
+
+__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) & 31) == 0;
+}
+
+// ============================================================
+// Packed type conversion and arithmetic
+// ============================================================
+
+template <typename packed_t>
+__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
+  if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
+    return __bfloat1622float2(val);
+  } else if constexpr (std::is_same_v<packed_t, __half2>) {
+    return __half22float2(val);
+  } else if constexpr (std::is_same_v<packed_t, float2>) {
+    return float2(val);
+  }
+}
+
+template <typename packed_t>
+__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
+  if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
+    return __float22bfloat162_rn(val);
+  } else if constexpr (std::is_same_v<packed_t, __half2>) {
+    return __float22half2_rn(val);
+  } else if constexpr (std::is_same_v<packed_t, float2>) {
+    return float2(val);
+  }
+}
+
+template <typename packed_t>
+__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
+                                               const packed_t& y) {
+  if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
+                std::is_same_v<packed_t, __half2>) {
+    return __hmul2(x, y);
+  } else if constexpr (std::is_same_v<packed_t, float2>) {
+    return make_float2(x.x * y.x, x.y * y.y);
+  }
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
index 8583b79fd58f..3539096c9feb 100644
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -39,12 +39,12 @@ namespace vllm {
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
     silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols,
-                             int32_t num_padded_cols,
+                             int32_t num_packed_cols,
                              Type const* __restrict__ in,
                              float const* __restrict__ SFScale,
                              uint32_t* __restrict__ out,
                              uint32_t* __restrict__ SFout) {
-  using PackedVec = vllm::PackedVec<Type>;
+  using PackedVec = vllm::PackedVec<Type, CVT_FP4_PACK16>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -63,7 +63,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
 
   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    if (colIdx < num_padded_cols) {
+    if (colIdx < num_packed_cols) {
       PackedVec in_vec;
       PackedVec in_vec2;
       int64_t inOffset =
@@ -73,19 +73,19 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
 
       bool valid = (rowIdx < numRows) && (elem_idx < numCols);
       if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-        ld256_or_zero_cg_u32<Type>(
-            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
-            valid);
+        ld256_cg_or_zero(reinterpret_cast<u32x8_t&>(in_vec),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
+                         valid);
+        ld256_cg_or_zero(reinterpret_cast<u32x8_t&>(in_vec2),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
+                         valid);
       } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-        ld128_or_zero_cg_u32<Type>(
-            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
-            valid);
+        ld128_cg_or_zero(reinterpret_cast<uint4&>(in_vec),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
+                         valid);
+        ld128_cg_or_zero(reinterpret_cast<uint4&>(in_vec2),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
+                         valid);
       }
 
       // Compute silu and mul
@@ -142,9 +142,9 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
   int const numBlocksPerSM =
       vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
 
-  int sf_n_unpadded = int(n / CVT_FP4_ELTS_PER_THREAD);
+  int num_packed_cols = int(n / CVT_FP4_ELTS_PER_THREAD);
 
-  int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
+  int grid_y = vllm::div_round_up(num_packed_cols, static_cast<int>(block.x));
   int grid_x = std::min(
       int(m), std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
   dim3 grid(grid_x, grid_y);
@@ -154,7 +154,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
         using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
         auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
         vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
-            m, n, sf_n_unpadded, input_ptr, input_sf_ptr,
+            m, n, num_packed_cols, input_ptr, input_sf_ptr,
             reinterpret_cast<uint32_t*>(output_ptr),
             reinterpret_cast<uint32_t*>(sf_out));
       });
diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu
index 32685c201102..3162b6cdb8a9 100644
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@@ -43,7 +43,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                     uint32_t* input_offset_by_experts,
                     uint32_t* output_scale_offset_by_experts, int n_experts,
                     bool low_latency) {
-  using PackedVec = PackedVec<Type>;
+  using PackedVec = PackedVec<Type, CVT_FP4_PACK16>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -155,7 +155,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
                     float const* SFScale, uint32_t* out, uint32_t* SFout,
                     uint32_t* input_offset_by_experts,
                     uint32_t* output_scale_offset_by_experts, int n_experts) {
-  using PackedVec = PackedVec<Type>;
+  using PackedVec = PackedVec<Type, CVT_FP4_PACK16>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
index b521b4707a4d..773047c22500 100644
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -42,7 +42,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                     Type const* __restrict__ in,
                     float const* __restrict__ SFScale,
                     uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) {
-  using PackedVec = vllm::PackedVec<Type>;
+  using PackedVec = vllm::PackedVec<Type, CVT_FP4_PACK16>;
 
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@@ -71,13 +71,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
       // If we are outside valid rows OR outside valid columns -> Use Zeros
       bool valid = (rowIdx < numRows) && (elem_idx < numCols);
       if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
+        ld256_cg_or_zero(reinterpret_cast<u32x8_t&>(in_vec),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
+                         valid);
       } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
+        ld128_cg_or_zero(reinterpret_cast<uint4&>(in_vec),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
+                         valid);
       }
 
       auto sf_out =
@@ -114,7 +114,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                              float const* __restrict__ SFScale,
                              uint32_t* __restrict__ out,
                              uint32_t* __restrict__ SFout) {
-  using PackedVec = PackedVec<Type>;
+  using PackedVec = PackedVec<Type, CVT_FP4_PACK16>;
 
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@@ -139,13 +139,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
       // If we are outside valid rows OR outside valid columns -> Use Zeros
       bool valid = (rowIdx < numRows) && (elem_idx < numCols);
       if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
+        ld256_cg_or_zero(reinterpret_cast<u32x8_t&>(in_vec),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
+                         valid);
       } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
+        ld128_cg_or_zero(reinterpret_cast<uint4&>(in_vec),
+                         &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
+                         valid);
       }
 
       auto sf_out =
diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh
index 3e7adb9e2931..c1df1860c1a1 100644
--- a/csrc/quantization/fp4/nvfp4_utils.cuh
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@@ -19,8 +19,10 @@
 #include <cuda_runtime.h>
 #include <cuda_fp8.h>
 
-#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
-     defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
+#include "../../cuda_vec_utils.cuh"
+
+#if defined(NVFP4_ENABLE_ELTS16) && defined(CUDA_VERSION) && \
+    CUDA_VERSION >= 12090
   #define ELTS_PER_THREAD 16
 constexpr int CVT_FP4_ELTS_PER_THREAD = 16;
 constexpr bool CVT_FP4_PACK16 = true;
@@ -34,68 +36,6 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
 
 namespace vllm {
 
-// Convert PyTorch cpp type to CUDA type
-template <typename T>
-struct CUDATypeConverter {
-  using Type = T;
-};
-
-template <>
-struct CUDATypeConverter<at::Half> {
-  using Type = half;
-};
-
-template <>
-struct CUDATypeConverter<at::BFloat16> {
-  using Type = __nv_bfloat16;
-};
-
-// Get type2 from type or vice versa (applied to half and bfloat16)
-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
-
-template <>
-struct TypeConverter<half2> {
-  using Type = half;
-};
-
-template <>
-struct TypeConverter<half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = __nv_bfloat16;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
-     defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
-// Define a 32 bytes packed data type.
-template <class Type>
-struct alignas(32) PackedVec {
-  typename TypeConverter<Type>::Type elts[8];
-};
-#else
-// Define a 16 bytes packed data type.
-template <class Type>
-struct alignas(16) PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-#endif
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
 template <typename Int>
 __host__ __device__ inline Int round_up(Int x, Int y) {
   static_assert(std::is_integral_v<Int>,
@@ -208,56 +148,6 @@ __device__ __forceinline__ float reciprocal_approximate_ftz(float a) {
   return b;
 }
 
-template <class Type>
-__device__ __forceinline__ void ld128_or_zero_cg_u32(PackedVec<Type>& out,
-                                                     const void* ptr,
-                                                     bool pred) {
-  uint32_t r0, r1, r2, r3;
-
-  asm volatile(
-      "{\n"
-      "  .reg .pred pr;\n"
-      "  setp.ne.u32 pr, %4, 0;\n"
-      "  mov.u32 %0, 0;\n"
-      "  mov.u32 %1, 0;\n"
-      "  mov.u32 %2, 0;\n"
-      "  mov.u32 %3, 0;\n"
-      "  @pr ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%5];\n"
-      "}\n"
-      : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
-      : "r"((int)pred), "l"(ptr));
-
-  *reinterpret_cast<uint4*>(&out) = uint4{r0, r1, r2, r3};
-}
-
-template <class Type>
-__device__ __forceinline__ void ld256_or_zero_cg_u32(PackedVec<Type>& out,
-                                                     const void* ptr,
-                                                     bool pred) {
-  uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
-
-  asm volatile(
-      "{\n"
-      "  .reg .pred pr;\n"
-      "  setp.ne.u32 pr, %8, 0;\n"
-      "  mov.u32 %0, 0;\n"
-      "  mov.u32 %1, 0;\n"
-      "  mov.u32 %2, 0;\n"
-      "  mov.u32 %3, 0;\n"
-      "  mov.u32 %4, 0;\n"
-      "  mov.u32 %5, 0;\n"
-      "  mov.u32 %6, 0;\n"
-      "  mov.u32 %7, 0;\n"
-      "  @pr ld.global.cg.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%9];\n"
-      "}\n"
-      : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
-        "=r"(r7)
-      : "r"((int)pred), "l"(ptr));
-
-  reinterpret_cast<uint4*>(&out)[0] = uint4{r0, r1, r2, r3};
-  reinterpret_cast<uint4*>(&out)[1] = uint4{r4, r5, r6, r7};
-}
-
 // Compute SF output offset for swizzled tensor core layout.
 // SF layout: [numMTiles, numKTiles, 32, 4, 4]
 // Caller must precompute: numKTiles = (numCols + 63) / 64
@@ -315,8 +205,8 @@ __device__ __forceinline__ uint8_t* sf_out_rowmajor_u8(int row, int pack,
 
 // Quantizes the provided PackedVec into the uint32_t output
 template <class Type, int CVT_FP4_NUM_THREADS_PER_SF, bool UE8M0_SF = false>
-__device__ __forceinline__ fp4_packed_t
-cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
+__device__ __forceinline__ fp4_packed_t cvt_warp_fp16_to_fp4(
+    PackedVec<Type, CVT_FP4_PACK16>& vec, float SFScaleVal, uint8_t* SFout) {
   // Get absolute maximum values among the local 8 values.
   auto localMax = __habs2(vec.elts[0]);
 
@@ -372,11 +262,7 @@ cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
 
 #pragma unroll
   for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, half>) {
-      fp2Vals[i] = __half22float2(vec.elts[i]);
-    } else {
-      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-    }
+    fp2Vals[i] = cast_to_float2(vec.elts[i]);
     fp2Vals[i].x *= outputScale;
     fp2Vals[i].y *= outputScale;
   }
@@ -395,22 +281,19 @@ __device__ __forceinline__ float2 silu2(float2 x) {
 }
 
 template <class Type>
-__inline__ __device__ PackedVec<Type> compute_silu_mul(
-    const PackedVec<Type>& x_vec, const PackedVec<Type>& y_vec) {
-  PackedVec<Type> result;
+__inline__ __device__ PackedVec<Type, CVT_FP4_PACK16> compute_silu_mul(
+    const PackedVec<Type, CVT_FP4_PACK16>& x_vec,
+    const PackedVec<Type, CVT_FP4_PACK16>& y_vec) {
+  PackedVec<Type, CVT_FP4_PACK16> result;
 
 #pragma unroll
   for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
     // silu_mul in float32
-    if constexpr (std::is_same_v<Type, half>) {
-      float2 silu_vec = silu2(__half22float2(x_vec.elts[i]));
-      result.elts[i] = __float22half2_rn(
-          __fmul2_rn(silu_vec, __half22float2(y_vec.elts[i])));
-    } else {
-      float2 silu_vec = silu2(__bfloat1622float2(x_vec.elts[i]));
-      result.elts[i] = __float22bfloat162_rn(
-          __fmul2_rn(silu_vec, __bfloat1622float2(y_vec.elts[i])));
-    }
+    using packed_t = typename PackedTypeConverter<Type>::Type;
+    float2 silu_vec = silu2(cast_to_float2(x_vec.elts[i]));
+    float2 y_f2 = cast_to_float2(y_vec.elts[i]);
+    result.elts[i] = cast_to_packed<packed_t>(
+        make_float2(silu_vec.x * y_f2.x, silu_vec.y * y_f2.y));
   }
   return result;
 }

From 5323672bc2b448e94ba027b16d99c93aba9c72a4 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 28 Feb 2026 08:42:37 +0800
Subject: [PATCH 093/154] [misc] cleanup one level of error stack when nixl
 fails to initialize (#35517)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index b3f2ae703fdf..87091d650b17 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -2506,6 +2506,9 @@ def __del__(self):
 
     def shutdown(self):
         """Shutdown the connector worker."""
+        if not hasattr(self, "_handshake_initiation_executor"):
+            # error happens during init, no need to shutdown
+            return
         self._handshake_initiation_executor.shutdown(wait=False)
         for handles in self._recving_transfers.values():
             for handle in handles:

From 405f28d38df2ea4f320635c6e87e206b3ccbea2f Mon Sep 17 00:00:00 2001
From: Umut Polat <52835619+umut-polat@users.noreply.github.com>
Date: Sat, 28 Feb 2026 04:19:21 +0300
Subject: [PATCH 094/154] [Misc] Clean up ResponsesRequest model validators
 (#35531)

Signed-off-by: umut-polat <52835619+umut-polat@users.noreply.github.com>
---
 vllm/entrypoints/openai/responses/protocol.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py
index b0ffd0314792..1ec88ccc3aa6 100644
--- a/vllm/entrypoints/openai/responses/protocol.py
+++ b/vllm/entrypoints/openai/responses/protocol.py
@@ -328,8 +328,9 @@ def to_sampling_params(
         # Also check text.format for OpenAI-style json_schema
         if self.text is not None and self.text.format is not None:
             if structured_outputs is not None:
-                raise ValueError(
-                    "Cannot specify both structured_outputs and text.format"
+                raise VLLMValidationError(
+                    "Cannot specify both structured_outputs and text.format",
+                    parameter="structured_outputs",
                 )
             response_format = self.text.format
             if (
@@ -378,14 +379,19 @@ def is_include_output_logprobs(self) -> bool:
         )
 
     @model_validator(mode="before")
+    @classmethod
     def validate_background(cls, data):
         if not data.get("background"):
             return data
         if not data.get("store", True):
-            raise ValueError("background can only be used when `store` is true")
+            raise VLLMValidationError(
+                "background can only be used when `store` is true",
+                parameter="background",
+            )
         return data
 
     @model_validator(mode="before")
+    @classmethod
     def validate_prompt(cls, data):
         if data.get("prompt") is not None:
             raise VLLMValidationError(
@@ -394,16 +400,19 @@ def validate_prompt(cls, data):
         return data
 
     @model_validator(mode="before")
+    @classmethod
     def check_cache_salt_support(cls, data):
         if data.get("cache_salt") is not None and (
             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
         ):
-            raise ValueError(
-                "Parameter 'cache_salt' must be a non-empty string if provided."
+            raise VLLMValidationError(
+                "Parameter 'cache_salt' must be a non-empty string if provided.",
+                parameter="cache_salt",
             )
         return data
 
     @model_validator(mode="before")
+    @classmethod
     def function_call_parsing(cls, data):
         """Parse function_call dictionaries into ResponseFunctionToolCall objects.
         This ensures Pydantic can properly resolve union types in the input field.

From 86ac7bcf8483d87951a876cd2ed28341f60c95e0 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 27 Feb 2026 18:03:01 -0800
Subject: [PATCH 095/154] [Model Runner V2] Support pooling models (#35120)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/async_utils.py         |  36 ++++++++
 vllm/v1/worker/gpu/input_batch.py         |  32 +++++++
 vllm/v1/worker/gpu/model_runner.py        | 104 +++++++++++++++++++---
 vllm/v1/worker/gpu/pool/__init__.py       |   0
 vllm/v1/worker/gpu/pool/pooling_runner.py |  45 ++++++++++
 vllm/v1/worker/gpu_worker.py              |   6 ++
 6 files changed, 209 insertions(+), 14 deletions(-)
 create mode 100644 vllm/v1/worker/gpu/pool/__init__.py
 create mode 100644 vllm/v1/worker/gpu/pool/pooling_runner.py

diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py
index e628e38bd02c..f87459efa069 100644
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -70,6 +70,42 @@ def get_output(self) -> ModelRunnerOutput:
         return self.model_runner_output
 
 
+class AsyncPoolingOutput(AsyncModelRunnerOutput):
+    def __init__(
+        self,
+        model_runner_output: ModelRunnerOutput,
+        pooler_output: torch.Tensor,
+        is_valid: torch.Tensor | None,
+        main_stream: torch.cuda.Stream,
+        copy_stream: torch.cuda.Stream,
+        copy_event: torch.cuda.Event,
+    ):
+        self.model_runner_output = model_runner_output
+        self.pooler_output = pooler_output
+        self.is_valid = is_valid
+        self.copy_event = copy_event
+
+        with stream(copy_stream, main_stream):
+            copy_stream.wait_stream(main_stream)
+            self.pooler_output_cpu = self.pooler_output.to("cpu", non_blocking=True)
+            if self.is_valid is not None:
+                self.is_valid_cpu = self.is_valid.to("cpu", non_blocking=True)
+            else:
+                self.is_valid_cpu = None
+            self.copy_event.record(copy_stream)
+
+    def get_output(self) -> ModelRunnerOutput:
+        self.copy_event.synchronize()
+        pooler_output = self.pooler_output_cpu.unbind(dim=0)
+        if self.is_valid_cpu is not None:
+            is_valid_cpu = self.is_valid_cpu.tolist()
+            for i, is_valid in enumerate(is_valid_cpu):
+                if not is_valid:
+                    pooler_output[i] = None
+        self.model_runner_output.pooler_output = pooler_output
+        return self.model_runner_output
+
+
 def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
     return x.to("cpu", non_blocking=True).numpy()
 
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 75655258c187..5918cc374997 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -499,6 +499,38 @@ def post_update(
     )
 
 
+@triton.jit
+def _post_update_pool_kernel(
+    idx_mapping_ptr,
+    num_computed_tokens_ptr,
+    query_start_loc_ptr,
+):
+    batch_id = tl.program_id(0)
+    query_start = tl.load(query_start_loc_ptr + batch_id)
+    query_end = tl.load(query_start_loc_ptr + batch_id + 1)
+    query_len = query_end - query_start
+
+    req_state_idx = tl.load(idx_mapping_ptr + batch_id)
+    num_computed = tl.load(num_computed_tokens_ptr + req_state_idx)
+    tl.store(num_computed_tokens_ptr + req_state_idx, num_computed + query_len)
+
+
+def post_update_pool(
+    # [num_reqs]
+    idx_mapping: torch.Tensor,
+    # [max_num_reqs]
+    num_computed_tokens: torch.Tensor,
+    # [num_reqs + 1]
+    query_start_loc: torch.Tensor,
+) -> None:
+    num_reqs = idx_mapping.shape[0]
+    _post_update_pool_kernel[(num_reqs,)](
+        idx_mapping,
+        num_computed_tokens,
+        query_start_loc,
+    )
+
+
 @triton.jit
 def _expand_idx_mapping_kernel(
     idx_mapping_ptr,
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 7dcdaf1d2029..8bca1a17fb8c 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -38,13 +38,14 @@
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
+from vllm.tasks import SupportedTask
 from vllm.utils.mem_utils import DeviceMemoryProfiler, format_gib
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
-from vllm.v1.worker.gpu.async_utils import AsyncOutput
+from vllm.v1.worker.gpu.async_utils import AsyncOutput, AsyncPoolingOutput
 from vllm.v1.worker.gpu.attn_utils import (
     build_slot_mappings_by_layer,
     get_kv_cache_spec,
@@ -66,6 +67,7 @@
     expand_idx_mapping,
     get_num_sampled_and_rejected,
     post_update,
+    post_update_pool,
     prepare_pos_seq_lens,
     prepare_prefill_inputs,
 )
@@ -77,6 +79,7 @@
 from vllm.v1.worker.gpu.lora_utils import LoraState
 from vllm.v1.worker.gpu.mm.encoder_runner import EncoderRunner
 from vllm.v1.worker.gpu.model_states import ModelState
+from vllm.v1.worker.gpu.pool.pooling_runner import PoolingRunner
 from vllm.v1.worker.gpu.pp_utils import pp_broadcast, pp_receive
 from vllm.v1.worker.gpu.sample.output import SamplerOutput
 from vllm.v1.worker.gpu.sample.prompt_logprob import PromptLogprobsWorker
@@ -119,7 +122,6 @@ def __init__(
             self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
                 self.cache_config.cache_dtype
             ]
-        self.is_pooling_model = False
 
         self.vocab_size = self.model_config.get_vocab_size()
         self.max_model_len = self.model_config.max_model_len
@@ -217,6 +219,10 @@ def __init__(
         # KV Connector if configured.
         self.kv_connector: KVConnector = NO_OP_KV_CONNECTOR
 
+        # Pooling models.
+        self.is_pooling_model = self.model_config.runner_type == "pooling"
+        self.pooling_runner: PoolingRunner | None = None
+
         # For transferring state from execute_model to subsequent sample_tokens call.
         self.execute_model_state: tuple | None = None
 
@@ -224,9 +230,13 @@ def update_max_model_len(self, max_model_len: int) -> None:
         self.max_model_len = max_model_len
         self.req_states.max_model_len = max_model_len
 
-    @staticmethod
-    def get_supported_tasks() -> tuple[str]:
-        return ("generate",)
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        tasks: list[SupportedTask] = []
+        if self.model_config.runner_type == "generate":
+            tasks.append("generate")
+        if self.pooling_runner is not None:
+            tasks.extend(self.pooling_runner.get_supported_pooling_tasks())
+        return tuple(tasks)
 
     def load_model(self, *args, **kwargs) -> None:
         time_before_load = time.perf_counter()
@@ -263,6 +273,8 @@ def load_model(self, *args, **kwargs) -> None:
 
         # Initialize the components that require the model.
         self.model_state = ModelState(self.vllm_config, self.model, self.device)
+        if self.is_pooling_model:
+            self.pooling_runner = PoolingRunner(self.model)
 
     def get_model(self) -> nn.Module:
         return self.model
@@ -388,16 +400,24 @@ def _dummy_sampler_run(self, hidden_states: torch.Tensor) -> None:
             expanded_local_pos,
         )
 
+    @torch.inference_mode()
+    def _dummy_pooler_run(self, hidden_states: torch.Tensor) -> None:
+        assert self.pooling_runner is not None
+        self.pooling_runner.dummy_pooler_run(hidden_states)
+
     @torch.inference_mode()
     def profile_run(self) -> None:
         hidden_states, sample_hidden_states = self._dummy_run(
             self.max_num_tokens, skip_attn=True
         )
 
-        # Only run sampler on last PP rank (non-last ranks return None).
+        # Only run sampler/pooler on last PP rank (non-last ranks return None).
         if self.is_last_pp_rank:
             assert sample_hidden_states is not None
-            self._dummy_sampler_run(sample_hidden_states)
+            if self.pooling_runner is None:
+                self._dummy_sampler_run(sample_hidden_states)
+            else:
+                self._dummy_pooler_run(hidden_states)
 
             if self.speculator is not None:
                 num_tokens_across_dp = make_num_tokens_across_dp(
@@ -505,7 +525,6 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
         for new_req_data in scheduler_output.scheduled_new_reqs:
             assert new_req_data.prompt_token_ids is not None
             assert new_req_data.prefill_token_ids is not None
-            assert new_req_data.sampling_params is not None
             req_id = new_req_data.req_id
             prompt_len = len(new_req_data.prompt_token_ids)
             self.req_states.add_request(
@@ -523,14 +542,16 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
             self.block_tables.append_block_ids(
                 req_index, new_req_data.block_ids, overwrite=True
             )
-            self.sampler.add_request(
-                req_index, prompt_len, new_req_data.sampling_params
-            )
-            self.prompt_logprobs_worker.add_request(
-                req_id, req_index, new_req_data.sampling_params
-            )
             self.lora_state.add_request(req_id, req_index, new_req_data.lora_request)
 
+            if new_req_data.sampling_params is not None:
+                self.sampler.add_request(
+                    req_index, prompt_len, new_req_data.sampling_params
+                )
+                self.prompt_logprobs_worker.add_request(
+                    req_id, req_index, new_req_data.sampling_params
+                )
+
         if scheduler_output.scheduled_new_reqs:
             self.req_states.apply_staged_writes()
             self.sampler.apply_staged_writes()
@@ -1083,3 +1104,58 @@ def sample_tokens(
 
     def take_draft_token_ids(self) -> DraftTokenIds | None:
         return self.draft_tokens_handler.get_draft_tokens()
+
+    @torch.inference_mode()
+    def pool(self) -> AsyncPoolingOutput | ModelRunnerOutput | None:
+        if self.execute_model_state is None:
+            # The prior execute_model call must have failed.
+            return None
+
+        input_batch, _, _, _, hidden_states, _, kv_connector_output = (
+            self.execute_model_state
+        )
+        self.execute_model_state = None
+
+        if not self.is_last_pp_rank:
+            self.postprocess_pool(input_batch)
+            return None
+
+        assert self.pooling_runner is not None
+        pooler_output, is_valid = self.pooling_runner.pool(
+            hidden_states, input_batch, self.req_states
+        )
+        self.postprocess_pool(input_batch)
+
+        # Build the model runner output.
+        model_runner_output = ModelRunnerOutput(
+            req_ids=input_batch.req_ids,
+            req_id_to_index={req_id: i for i, req_id in enumerate(input_batch.req_ids)},
+            kv_connector_output=kv_connector_output,
+        )
+        async_output = AsyncPoolingOutput(
+            model_runner_output=model_runner_output,
+            pooler_output=pooler_output,
+            is_valid=is_valid,
+            main_stream=self.main_stream,
+            copy_stream=self.output_copy_stream,
+            copy_event=self.output_copy_event,
+        )
+        if self.use_async_scheduling:
+            return async_output
+        return async_output.get_output()
+
+    def postprocess_pool(self, input_batch: InputBatch) -> None:
+        # Update the number of computed tokens.
+        post_update_pool(
+            input_batch.idx_mapping,
+            self.req_states.num_computed_tokens.gpu,
+            input_batch.query_start_loc,
+        )
+
+        # Update the number of computed prefill tokens.
+        idx_mapping_np = input_batch.idx_mapping_np
+        computed_prefill = self.req_states.num_computed_prefill_tokens
+        computed_prefill[idx_mapping_np] += input_batch.num_scheduled_tokens
+        np.minimum(
+            computed_prefill, self.req_states.prefill_len.np, out=computed_prefill
+        )
diff --git a/vllm/v1/worker/gpu/pool/__init__.py b/vllm/v1/worker/gpu/pool/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/v1/worker/gpu/pool/pooling_runner.py b/vllm/v1/worker/gpu/pool/pooling_runner.py
new file mode 100644
index 000000000000..7098aad54c32
--- /dev/null
+++ b/vllm/v1/worker/gpu/pool/pooling_runner.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import cast
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from vllm.model_executor.models import VllmModelForPooling, is_pooling_model
+from vllm.tasks import PoolingTask
+from vllm.v1.worker.gpu.input_batch import InputBatch
+from vllm.v1.worker.gpu.states import RequestState
+
+
+# NOTE(woosuk): Currently, this class only supports the "LAST" pooling task
+# on decoder-only models. How to support other pooling tasks and models
+# is to be determined.
+class PoolingRunner:
+    def __init__(self, model: nn.Module):
+        self.model = cast(VllmModelForPooling, model)
+
+    def get_supported_pooling_tasks(self) -> list[PoolingTask]:
+        if not is_pooling_model(self.model):
+            return []
+        assert "embed" in self.model.pooler.get_supported_tasks()
+        return ["embed"]
+
+    def pool(
+        self,
+        hidden_states: torch.Tensor,
+        input_batch: InputBatch,
+        req_states: RequestState,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # TODO(woosuk): Support different types of pooling tasks.
+        last_hidden_states = hidden_states[input_batch.logits_indices]
+        # TODO(woosuk): Make normalization optional.
+        last_hidden_states = F.normalize(last_hidden_states, p=2, dim=-1)
+
+        prompt_len = req_states.prompt_len.gpu[input_batch.idx_mapping]
+        is_valid = input_batch.seq_lens == prompt_len
+        return last_hidden_states, is_valid
+
+    def dummy_pooler_run(self, hidden_states: torch.Tensor) -> None:
+        F.normalize(hidden_states, p=2, dim=-1)
+        return
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index fcc0fdf88380..06410b2eb2b3 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -700,6 +700,12 @@ def execute_model(
             output = self.model_runner.execute_model(
                 scheduler_output, intermediate_tensors
             )
+            if (
+                self.use_v2_model_runner
+                and self.model_runner.is_pooling_model
+                and output is None
+            ):
+                output = self.model_runner.pool()  # type: ignore
             if isinstance(
                 output, ModelRunnerOutput | AsyncModelRunnerOutput | NoneType
             ):

From 1a014a0a9327ed64a1bdec8e1afa43b9ea70a3c1 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 27 Feb 2026 18:32:38 -0800
Subject: [PATCH 096/154] [Model Runner V2] Move MM encoder to Model States
 [3/N] (#35564)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/cudagraph_utils.py         |  6 --
 vllm/v1/worker/gpu/input_batch.py             |  3 -
 vllm/v1/worker/gpu/mm/encoder_cache.py        | 40 ++++++++
 vllm/v1/worker/gpu/mm/encoder_runner.py       | 40 ++------
 vllm/v1/worker/gpu/model_runner.py            | 98 +++++++------------
 vllm/v1/worker/gpu/model_states.py            | 58 ++++++++++-
 .../gpu/spec_decode/eagle/speculator.py       |  1 -
 7 files changed, 135 insertions(+), 111 deletions(-)
 create mode 100644 vllm/v1/worker/gpu/mm/encoder_cache.py

diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
index 95369005d967..6e43043bc2ff 100644
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -89,7 +89,6 @@ def capture_graph(
         model: nn.Module,
         model_state: ModelState,
         input_buffers: InputBuffers,
-        inputs_embeds: torch.Tensor | None,
         block_tables: BlockTables,
         attn_groups: list[list[AttentionGroup]],
         kv_cache_config: KVCacheConfig,
@@ -116,9 +115,6 @@ def capture_graph(
         model_inputs = {
             "input_ids": input_buffers.input_ids[:num_tokens],
             "positions": input_buffers.positions[:num_tokens],
-            "inputs_embeds": (
-                inputs_embeds[:num_tokens] if inputs_embeds is not None else None
-            ),
             # NOTE: Values returned by `prepare_dummy_inputs` will override the
             # default values above.
             **model_state.prepare_dummy_inputs(num_reqs, num_tokens),
@@ -255,7 +251,6 @@ def capture(
         model: nn.Module,
         model_state: ModelState,
         input_buffers: InputBuffers,
-        inputs_embeds: torch.Tensor | None,
         block_tables: BlockTables,
         attn_groups: list[list[AttentionGroup]],
         kv_cache_config: KVCacheConfig,
@@ -267,7 +262,6 @@ def capture(
             model=model,
             model_state=model_state,
             input_buffers=input_buffers,
-            inputs_embeds=inputs_embeds,
             block_tables=block_tables,
             attn_groups=attn_groups,
             kv_cache_config=kv_cache_config,
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 5918cc374997..974f117d27b3 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -66,8 +66,6 @@ class InputBatch:
     input_ids: torch.Tensor
     # [num_tokens_after_padding]
     positions: torch.Tensor
-    # [num_tokens_after_padding, hidden_size]
-    inputs_embeds: torch.Tensor | None
 
     # [total_num_logits]
     logits_indices: torch.Tensor
@@ -138,7 +136,6 @@ def make_dummy(
             dcp_local_seq_lens=None,
             input_ids=input_ids,
             positions=positions,
-            inputs_embeds=None,
             logits_indices=logits_indices,
             cu_num_logits=cu_num_logits,
             cu_num_logits_np=cu_num_logits_np,
diff --git a/vllm/v1/worker/gpu/mm/encoder_cache.py b/vllm/v1/worker/gpu/mm/encoder_cache.py
new file mode 100644
index 000000000000..1fcbe6429943
--- /dev/null
+++ b/vllm/v1/worker/gpu/mm/encoder_cache.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.multimodal.inputs import MultiModalFeatureSpec
+
+
+class EncoderCache:
+    def __init__(self):
+        # req_id -> MM features
+        self.mm_features: dict[str, list[MultiModalFeatureSpec]] = {}
+        # MM hash -> encoder outputs
+        self.encoder_outputs: dict[str, torch.Tensor] = {}
+
+    def add_request(
+        self, req_id: str, mm_features: list[MultiModalFeatureSpec]
+    ) -> None:
+        self.mm_features[req_id] = mm_features
+
+    def remove_request(self, req_id: str) -> None:
+        self.mm_features.pop(req_id, None)
+
+    def reset_mm_cache(self) -> None:
+        """
+        Clear the multi-modal cache that was used during profiling,
+        but no longer needed during inference.
+        """
+        # TODO: Implement MM budget for encoder dummy run
+        pass
+
+    def reset_encoder_cache(self) -> None:
+        """Clear the GPU-side encoder cache storing vision embeddings.
+
+        This should be called when model weights are updated to ensure
+        stale embeddings computed with old weights are not reused.
+        """
+        self.encoder_outputs.clear()
+
+    def free_encoder_cache(self, mm_hash: str) -> None:
+        self.encoder_outputs.pop(mm_hash, None)
diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py
index 941e77e390a8..c0676d05d268 100644
--- a/vllm/v1/worker/gpu/mm/encoder_runner.py
+++ b/vllm/v1/worker/gpu/mm/encoder_runner.py
@@ -4,8 +4,9 @@
 import torch
 
 from vllm.model_executor.models.interfaces import SupportsMultiModal
-from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem
+from vllm.multimodal.inputs import MultiModalKwargsItem
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
+from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
 from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs
 
 
@@ -14,44 +15,19 @@ def __init__(
         self,
         max_num_tokens: int,
         hidden_size: int,
+        encoder_cache: EncoderCache,
         dtype: torch.dtype,
         device: torch.device,
     ):
         self.max_num_tokens = max_num_tokens
         self.hidden_size = hidden_size
+        self.encoder_cache = encoder_cache
         self.dtype = dtype
         self.device = device
 
         self.inputs_embeds = torch.zeros(
             max_num_tokens, hidden_size, dtype=dtype, device=device
         )
-        self.req_id_to_mm_features: dict[str, list[MultiModalFeatureSpec]] = {}
-        self.encoder_cache: dict[str, torch.Tensor] = {}
-
-    def reset_mm_cache(self) -> None:
-        """
-        Clear the multi-modal cache that was used during profiling,
-        but no longer needed during inference.
-        """
-        # TODO: Implement MM budget for encoder dummy run
-        pass
-
-    def reset_encoder_cache(self) -> None:
-        """Clear the GPU-side encoder cache storing vision embeddings.
-
-        This should be called when model weights are updated to ensure
-        stale embeddings computed with old weights are not reused.
-        """
-        self.encoder_cache.clear()
-
-    def add_request(self, req_id: str, mm_features: list[MultiModalFeatureSpec]):
-        self.req_id_to_mm_features[req_id] = mm_features
-
-    def free_encoder_cache(self, mm_hash: str) -> None:
-        self.encoder_cache.pop(mm_hash, None)
-
-    def remove_request(self, req_id: str) -> None:
-        self.req_id_to_mm_features.pop(req_id, None)
 
     def prepare_mm_inputs(
         self, scheduled_encoder_inputs: dict[str, list[int]]
@@ -59,7 +35,7 @@ def prepare_mm_inputs(
         mm_hashes: list[str] = []
         mm_kwargs: list[tuple[str, MultiModalKwargsItem]] = []
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            mm_features = self.req_id_to_mm_features[req_id]
+            mm_features = self.encoder_cache.mm_features[req_id]
             for mm_input_id in encoder_input_ids:
                 mm_feature = mm_features[mm_input_id]
                 if mm_feature.data is None:
@@ -90,7 +66,7 @@ def execute_mm_encoder(
             encoder_outputs.extend(curr_group_outputs)
 
         # Cache the encoder outputs by mm_hash
-        self.encoder_cache.update(zip(mm_hashes, encoder_outputs))
+        self.encoder_cache.encoder_outputs.update(zip(mm_hashes, encoder_outputs))
         return encoder_outputs
 
     def gather_mm_embeddings(
@@ -122,7 +98,7 @@ def gather_mm_embeddings(
                 # OPTIMIZATION: Skip decode requests.
                 continue
 
-            mm_features = self.req_id_to_mm_features[req_id]
+            mm_features = self.encoder_cache.mm_features[req_id]
             for mm_feature in mm_features:
                 pos_info = mm_feature.mm_position
                 start_pos = pos_info.offset
@@ -148,7 +124,7 @@ def gather_mm_embeddings(
                     continue
 
                 mm_hash = mm_feature.identifier
-                encoder_output = self.encoder_cache.get(mm_hash, None)
+                encoder_output = self.encoder_cache.encoder_outputs.get(mm_hash, None)
                 assert encoder_output is not None, f"Encoder cache miss for {mm_hash}."
 
                 if (is_embed := pos_info.is_embed) is not None:
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 8bca1a17fb8c..188a2694e991 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -77,7 +77,7 @@
     get_kv_connector,
 )
 from vllm.v1.worker.gpu.lora_utils import LoraState
-from vllm.v1.worker.gpu.mm.encoder_runner import EncoderRunner
+from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
 from vllm.v1.worker.gpu.model_states import ModelState
 from vllm.v1.worker.gpu.pool.pooling_runner import PoolingRunner
 from vllm.v1.worker.gpu.pp_utils import pp_broadcast, pp_receive
@@ -127,20 +127,6 @@ def __init__(
         self.max_model_len = self.model_config.max_model_len
         self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
-        self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()
-
-        # Multimodal
-        self.mm_registry = MULTIMODAL_REGISTRY
-        self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
-            self.model_config
-        )
-        if self.supports_mm_inputs:
-            self.encoder_runner = EncoderRunner(
-                max_num_tokens=self.max_num_tokens,
-                hidden_size=self.inputs_embeds_size,
-                dtype=self.dtype,
-                device=self.device,
-            )
 
         self.use_async_scheduling = self.scheduler_config.async_scheduling
         self.output_copy_stream = torch.cuda.Stream(self.device)
@@ -162,6 +148,15 @@ def __init__(
         self.dcp_rank = get_dcp_group().rank_in_group if self.use_dcp else 0
         self.cp_interleave = self.parallel_config.cp_kv_cache_interleave_size
 
+        # Multimodal
+        self.mm_registry = MULTIMODAL_REGISTRY
+        self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
+            self.model_config
+        )
+        self.encoder_cache = None
+        if self.supports_mm_inputs and self.is_first_pp_rank:
+            self.encoder_cache = EncoderCache()
+
         self.speculator = None
         self.num_speculative_steps = 0
         self.use_aux_hidden_state_outputs = False
@@ -272,7 +267,9 @@ def load_model(self, *args, **kwargs) -> None:
             prepare_communication_buffer_for_model(self.speculator)
 
         # Initialize the components that require the model.
-        self.model_state = ModelState(self.vllm_config, self.model, self.device)
+        self.model_state = ModelState(
+            self.vllm_config, self.model, self.encoder_cache, self.device
+        )
         if self.is_pooling_model:
             self.pooling_runner = PoolingRunner(self.model)
 
@@ -435,12 +432,12 @@ def profile_run(self) -> None:
         gc.collect()
 
     def reset_mm_cache(self) -> None:
-        if self.supports_mm_inputs:
-            self.encoder_runner.reset_mm_cache()
+        if self.encoder_cache is not None:
+            self.encoder_cache.reset_mm_cache()
 
     def reset_encoder_cache(self) -> None:
-        if self.supports_mm_inputs:
-            self.encoder_runner.reset_encoder_cache()
+        if self.encoder_cache is not None:
+            self.encoder_cache.reset_encoder_cache()
 
     def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int:
         # SP is not supported yet.
@@ -469,14 +466,10 @@ def capture_model(self) -> int:
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         with self.maybe_setup_dummy_loras(self.lora_config):
-            inputs_embeds = None
-            if self.supports_mm_inputs:
-                inputs_embeds = self.encoder_runner.inputs_embeds
             self.cudagraph_manager.capture(
                 model=self.model,
                 model_state=self.model_state,
                 input_buffers=self.input_buffers,
-                inputs_embeds=inputs_embeds,
                 block_tables=self.block_tables,
                 attn_groups=self.attn_groups,
                 kv_cache_config=self.kv_cache_config,
@@ -511,15 +504,15 @@ def finish_requests(self, scheduler_output: SchedulerOutput) -> None:
             finished_req_ids = finished_req_ids.union(preempted_req_ids)
         for req_id in finished_req_ids:
             self.req_states.remove_request(req_id)
-            if self.supports_mm_inputs:
-                self.encoder_runner.remove_request(req_id)
+            if self.encoder_cache is not None:
+                self.encoder_cache.remove_request(req_id)
             self.prompt_logprobs_worker.remove_request(req_id)
             self.lora_state.remove_request(req_id)
 
     def free_states(self, scheduler_output: SchedulerOutput) -> None:
-        if self.supports_mm_inputs:
+        if self.encoder_cache is not None:
             for mm_hash in scheduler_output.free_encoder_mm_hashes:
-                self.encoder_runner.free_encoder_cache(mm_hash)
+                self.encoder_cache.free_encoder_cache(mm_hash)
 
     def add_requests(self, scheduler_output: SchedulerOutput) -> None:
         for new_req_data in scheduler_output.scheduled_new_reqs:
@@ -535,8 +528,8 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
             )
             req_index = self.req_states.req_id_to_index[req_id]
 
-            if self.supports_mm_inputs:
-                self.encoder_runner.add_request(req_id, new_req_data.mm_features)
+            if self.encoder_cache is not None:
+                self.encoder_cache.add_request(req_id, new_req_data.mm_features)
 
             self.model_state.add_request(req_index, new_req_data)
             self.block_tables.append_block_ids(
@@ -695,7 +688,6 @@ def prepare_inputs(
             dcp_local_seq_lens=dcp_local_seq_lens,
             input_ids=self.input_buffers.input_ids[:num_tokens_after_padding],
             positions=self.input_buffers.positions[:num_tokens_after_padding],
-            inputs_embeds=None,
             logits_indices=logits_indices,
             cu_num_logits=cu_num_logits,
             cu_num_logits_np=cu_num_logits_np,
@@ -724,26 +716,6 @@ def prepare_dummy_attn(
         )
         return block_tables, slot_mappings
 
-    @torch.inference_mode()
-    def get_mm_embeddings(
-        self,
-        scheduled_encoder_inputs: dict[str, list[int]],
-        input_batch: InputBatch,
-    ) -> tuple[list[torch.Tensor], torch.Tensor]:
-        mm_hashes, mm_kwargs = self.encoder_runner.prepare_mm_inputs(
-            scheduled_encoder_inputs
-        )
-        self.encoder_runner.execute_mm_encoder(self.model, mm_hashes, mm_kwargs)
-        mm_embeds, is_mm_embed = self.encoder_runner.gather_mm_embeddings(
-            input_batch.req_ids,
-            input_batch.num_tokens,
-            input_batch.num_scheduled_tokens,
-            input_batch.query_start_loc_np,
-            self.req_states.prefill_len.np[input_batch.idx_mapping_np],
-            self.req_states.num_computed_prefill_tokens[input_batch.idx_mapping_np],
-        )
-        return mm_embeds, is_mm_embed
-
     def sample(
         self,
         hidden_states: torch.Tensor,
@@ -890,18 +862,6 @@ def execute_model(
                     input_batch.num_scheduled_tokens,
                 )
                 self._set_active_loras(*lora_inputs)
-
-            # Only first PP rank prepares multimodal embeddings.
-            if self.supports_mm_inputs and self.is_first_pp_rank:
-                mm_embeds, is_mm_embed = self.get_mm_embeddings(
-                    scheduler_output.scheduled_encoder_inputs, input_batch
-                )
-                inputs_embeds = self.encoder_runner.get_inputs_embeds(
-                    self.model, input_batch.input_ids, mm_embeds, is_mm_embed
-                )
-                input_batch.inputs_embeds = inputs_embeds[
-                    : input_batch.num_tokens_after_padding
-                ]
         else:
             # No actual tokens to run. A dummy run for DP or memory profiling.
             num_reqs = min(num_tokens_after_padding, self.max_num_reqs)
@@ -934,10 +894,20 @@ def execute_model(
                 self.kv_cache_config,
             )
 
+        inputs_embeds = None
+        if self.supports_mm_inputs and self.is_first_pp_rank and not dummy_run:
+            # Run MM encoder (if needed) and get multimodal embeddings.
+            # Only first PP rank prepares multimodal embeddings.
+            inputs_embeds = self.model_state.get_mm_embeddings(
+                scheduler_output.scheduled_encoder_inputs,
+                input_batch,
+                self.req_states,
+            )
+
         model_inputs = {
             "input_ids": input_batch.input_ids,
             "positions": input_batch.positions,
-            "inputs_embeds": input_batch.inputs_embeds,
+            "inputs_embeds": inputs_embeds,
             # NOTE: Values returned by `prepare_inputs` will override the default
             # values above.
             **self.model_state.prepare_inputs(input_batch, self.req_states),
diff --git a/vllm/v1/worker/gpu/model_states.py b/vllm/v1/worker/gpu/model_states.py
index 838f177b39cd..ca4d63e6bc6f 100644
--- a/vllm/v1/worker/gpu/model_states.py
+++ b/vllm/v1/worker/gpu/model_states.py
@@ -10,22 +10,43 @@
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
 from vllm.v1.worker.gpu.input_batch import InputBatch
+from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
+from vllm.v1.worker.gpu.mm.encoder_runner import EncoderRunner
 from vllm.v1.worker.gpu.mm.mrope_utils import MRopeState
 from vllm.v1.worker.gpu.states import RequestState
 from vllm.v1.worker.utils import AttentionGroup
 
 
 class ModelState:
-    def __init__(self, vllm_config: VllmConfig, model: nn.Module, device: torch.device):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        model: nn.Module,
+        encoder_cache: EncoderCache | None,
+        device: torch.device,
+    ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.scheduler_config = vllm_config.scheduler_config
         self.model = model
         self.device = device
 
+        self.supports_mm_inputs = encoder_cache is not None
         self.max_model_len = self.model_config.max_model_len
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
+        self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()
+        self.dtype = self.model_config.dtype
+
+        if self.supports_mm_inputs:
+            assert encoder_cache is not None
+            self.encoder_runner = EncoderRunner(
+                max_num_tokens=self.max_num_tokens,
+                hidden_size=self.inputs_embeds_size,
+                encoder_cache=encoder_cache,
+                dtype=self.dtype,
+                device=self.device,
+            )
 
         self.uses_mrope = self.model_config.uses_mrope
         if self.uses_mrope:
@@ -51,6 +72,29 @@ def apply_staged_writes(self) -> None:
         if self.uses_mrope:
             self.mrope_state.apply_staged_writes()
 
+    def get_mm_embeddings(
+        self,
+        scheduled_encoder_inputs: dict[str, list[int]],
+        input_batch: InputBatch,
+        req_states: RequestState,
+    ) -> torch.Tensor:
+        mm_hashes, mm_kwargs = self.encoder_runner.prepare_mm_inputs(
+            scheduled_encoder_inputs
+        )
+        self.encoder_runner.execute_mm_encoder(self.model, mm_hashes, mm_kwargs)
+        mm_embeds, is_mm_embed = self.encoder_runner.gather_mm_embeddings(
+            input_batch.req_ids,
+            input_batch.num_tokens,
+            input_batch.num_scheduled_tokens,
+            input_batch.query_start_loc_np,
+            req_states.prefill_len.np[input_batch.idx_mapping_np],
+            req_states.num_computed_prefill_tokens[input_batch.idx_mapping_np],
+        )
+        inputs_embeds = self.encoder_runner.get_inputs_embeds(
+            self.model, input_batch.input_ids, mm_embeds, is_mm_embed
+        )
+        return inputs_embeds[: input_batch.num_tokens_after_padding]
+
     def prepare_inputs(
         self, input_batch: InputBatch, req_states: RequestState
     ) -> dict[str, torch.Tensor | None]:
@@ -73,10 +117,14 @@ def prepare_inputs(
     def prepare_dummy_inputs(
         self, num_reqs: int, num_tokens: int
     ) -> dict[str, torch.Tensor | None]:
-        if not self.uses_mrope:
-            return {}
-        mrope_positions = self.mrope_state.mrope_positions[:, :num_tokens]
-        return {"positions": mrope_positions}
+        model_inputs = {}
+        if self.supports_mm_inputs:
+            inputs_embeds = self.encoder_runner.inputs_embeds[:num_tokens]
+            model_inputs["inputs_embeds"] = inputs_embeds
+        if self.uses_mrope:
+            mrope_positions = self.mrope_state.mrope_positions[:, :num_tokens]
+            model_inputs["positions"] = mrope_positions
+        return model_inputs
 
     def prepare_attn(
         self,
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
index 0c85bf65ee9c..74172ea1853f 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
@@ -44,7 +44,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
-        self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
         self.vocab_size = self.draft_model_config.get_vocab_size()
         self.dtype = vllm_config.model_config.dtype
 

From d5b6f3ba362552d582f93c25958a6a612d28a700 Mon Sep 17 00:00:00 2001
From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>
Date: Fri, 27 Feb 2026 21:37:01 -0600
Subject: [PATCH 097/154] =?UTF-8?q?[ROCm][Quantization]=20Add=20Composable?=
 =?UTF-8?q?=20Kernel=20(CK)=20backend=20support=20for=20M=E2=80=A6=20(#343?=
 =?UTF-8?q?01)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Doug Lehr <douglehr@amd.com>
Signed-off-by: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>
Signed-off-by: Douglas Lehr <Doug.Lehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
---
 vllm/_aiter_ops.py                            | 160 ++++++++++++++++++
 .../layers/quantization/mxfp4.py              | 106 +++++++++++-
 2 files changed, 260 insertions(+), 6 deletions(-)

diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
index 8ef34bfd6dce..c8366ecce543 100644
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -87,6 +87,10 @@ def _rocm_aiter_fused_moe_impl(
     a2_scale: torch.Tensor | None = None,
     num_local_tokens: torch.Tensor | None = None,
     output_dtype: torch.dtype | None = None,
+    hidden_pad: int = 0,
+    intermediate_pad: int = 0,
+    bias1: torch.Tensor | None = None,
+    bias2: torch.Tensor | None = None,
 ) -> torch.Tensor:
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
@@ -110,6 +114,10 @@ def _rocm_aiter_fused_moe_impl(
         a2_scale,
         num_local_tokens=num_local_tokens,
         dtype=output_dtype,
+        hidden_pad=hidden_pad,
+        intermediate_pad=intermediate_pad,
+        bias1=bias1,
+        bias2=bias2,
     )
 
 
@@ -307,6 +315,28 @@ def _rocm_aiter_grouped_topk_fake(
     pass
 
 
+def _rocm_aiter_fused_topk_impl(
+    x: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    gate_up: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    from aiter.fused_moe import fused_topk
+
+    # fused_topk returns (topk_weights, topk_indices)
+    return fused_topk(x, router_logits, top_k, gate_up)
+
+
+def _rocm_aiter_fused_topk_fake(
+    x: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    gate_up: bool,
+) -> None:
+    # tuple[torch.Tensor, torch.Tensor]:
+    pass
+
+
 # Cache whether aiter supports FP8 MLA parameters
 _AITER_MLA_SUPPORTS_FP8: bool | None = None
 
@@ -994,6 +1024,70 @@ def refresh_env_variables(cls):
         cls._MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS
         cls._TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM
 
+    @staticmethod
+    def get_aiter_activation_type(activation_str: str):
+        """
+        Given an activation type as a string, returns the corresponding aiter ActivationType enum.
+        Supported activation types: "no", "none", "silu", "gelu", "swiglu".
+        Returns None if the mapping fails.
+
+        Args:
+            activation_str (str): Activation type as string.
+
+        Returns:
+            Aiter ActivationType enum value, or None if not found.
+        """
+        # Import only locally, since aiter may not always be available.
+        try:
+            from aiter import ActivationType
+        except ImportError:
+            return None
+
+        if not isinstance(activation_str, str):
+            return None
+
+        name = activation_str.strip().lower()
+        mapping = {
+            "none": ActivationType.No,
+            "no": ActivationType.No,
+            "silu": ActivationType.Silu,
+            "gelu": ActivationType.Gelu,
+            "swiglu": ActivationType.Swiglu,
+        }
+        return mapping.get(name)
+
+    @staticmethod
+    def get_aiter_quant_type(quant_type_str: str):
+        """
+        Given a quantization type as a string, returns the corresponding aiter QuantType enum.
+        Supported quantization types: "no", "per_tensor", "per_token", "per_1x32", "per_1x128", "per_128x128".
+        Returns None if the mapping fails.
+
+        Args:
+            quant_type_str (str): Quantization type as string.
+
+        Returns:
+            Aiter QuantType enum value, or None if not found.
+        """
+        try:
+            from aiter import QuantType
+        except ImportError:
+            return None
+
+        if not isinstance(quant_type_str, str):
+            return None
+
+        name = quant_type_str.strip().lower()
+        mapping = {
+            "no": QuantType.No,
+            "per_tensor": QuantType.per_Tensor,
+            "per_token": QuantType.per_Token,
+            "per_1x32": QuantType.per_1x32,
+            "per_1x128": QuantType.per_1x128,
+            "per_128x128": QuantType.per_128x128,
+        }
+        return mapping.get(name)
+
     @classmethod
     @if_aiter_supported
     def is_enabled(cls) -> bool:
@@ -1127,6 +1221,14 @@ def register_ops_once() -> None:
                 dispatch_key=current_platform.dispatch_key,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_fused_topk",
+                op_func=_rocm_aiter_fused_topk_impl,
+                mutates_args=[],
+                fake_impl=_rocm_aiter_fused_topk_fake,
+                dispatch_key=current_platform.dispatch_key,
+            )
+
             direct_register_custom_op(
                 op_name="rocm_aiter_mla_decode_fwd",
                 op_func=_rocm_aiter_mla_decode_fwd_impl,
@@ -1360,6 +1462,10 @@ def fused_moe(
         a2_scale: torch.Tensor | None = None,
         num_local_tokens: torch.Tensor | None = None,
         output_dtype: torch.dtype | None = None,
+        hidden_pad: int = 0,
+        intermediate_pad: int = 0,
+        bias1: torch.Tensor | None = None,
+        bias2: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return torch.ops.vllm.rocm_aiter_fused_moe(
             hidden_states,
@@ -1377,6 +1483,10 @@ def fused_moe(
             a2_scale,
             num_local_tokens,
             output_dtype,
+            hidden_pad,
+            intermediate_pad,
+            bias1,
+            bias2,
         )
 
     @staticmethod
@@ -1481,6 +1591,15 @@ def grouped_topk(
             routed_scaling_factor,
         )
 
+    @staticmethod
+    def fused_topk(
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        gate_up: bool,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return torch.ops.vllm.rocm_aiter_fused_topk(x, router_logits, top_k, gate_up)
+
     @staticmethod
     def mla_decode_fwd(
         q: torch.Tensor,
@@ -1701,6 +1820,47 @@ def shuffle_weight(
 
         return shuffle_weight(tensor, layout=layout)
 
+    @staticmethod
+    def shuffle_weight_a16w4(
+        tensor: "torch.Tensor",
+        nLane: int,
+        gate_up: bool,
+    ) -> "torch.Tensor":
+        """
+        Shuffles the weight tensor into (A16W4) layout for AITER kernels.
+
+        Args:
+            tensor: The input weight tensor to be shuffled.
+            layout: The block layout to use, defaults to (16, 4).
+
+        Returns:
+            torch.Tensor: The shuffled tensor.
+        """
+        from aiter.ops.shuffle import shuffle_weight_a16w4
+
+        return shuffle_weight_a16w4(tensor, nLane, gate_up)
+
+    @staticmethod
+    def shuffle_scale_a16w4(
+        tensor: "torch.Tensor",
+        num_experts: int,
+        gate_up: bool,
+    ) -> "torch.Tensor":
+        """
+        Shuffles the scale tensor into (A16W4) layout for AITER kernels.
+
+        Args:
+            tensor: The input scale tensor to be shuffled.
+            num_experts: Number of experts, needed for reshaping logic.
+            gate_up: Whether the scale is for w13 (True) or w2 (False).
+
+        Returns:
+            torch.Tensor: The shuffled scale tensor.
+        """
+        from aiter.ops.shuffle import shuffle_scale_a16w4
+
+        return shuffle_scale_a16w4(tensor, num_experts, gate_up)
+
     @staticmethod
     def shuffle_weights(
         *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 9318bedfff17..29dd0359607c 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -6,6 +6,7 @@
 from torch.nn.parameter import Parameter
 
 from vllm import envs
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention import Attention
@@ -77,6 +78,8 @@ class Mxfp4Backend(Enum):
     # Triton Backend
     TRITON = 6
 
+    CK = 7
+
 
 def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
     """
@@ -167,9 +170,15 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
     elif current_platform.is_xpu():
         logger.info_once("Using xpu backend on XPU")
         return Mxfp4Backend.MARLIN
-    elif current_platform.is_rocm() and has_triton_kernels():
-        logger.info_once("Using Triton backend")
-        return Mxfp4Backend.TRITON
+    elif current_platform.is_rocm():
+        from vllm.platforms.rocm import on_gfx950
+
+        if rocm_aiter_ops.is_enabled() and on_gfx950():
+            logger.info_once("Using CK MXFP4 MoE backend (Aiter ROCm)")
+            return Mxfp4Backend.CK
+        elif has_triton_kernels():
+            logger.info_once("Using Triton backend")
+            return Mxfp4Backend.TRITON
 
     return Mxfp4Backend.NONE
 
@@ -338,6 +347,10 @@ def create_weights(
 
         self.intermediate_size = intermediate_size_per_partition_after_pad
         self.hidden_size = hidden_size
+        self.hidden_pad = extra_weight_attrs.get("hidden_pad", 0)
+        self.intermediate_pad = (
+            intermediate_size_per_partition_after_pad - intermediate_size_per_partition
+        )
         # Fused gate_up_proj (column parallel)
         w13_weight = torch.nn.Parameter(
             torch.zeros(
@@ -784,6 +797,66 @@ def _interleave_mxfp4_cutlass_sm90(w):
                 ),
                 shared_experts=None,
             )
+        elif self.mxfp4_backend == Mxfp4Backend.CK:
+            if layer.w13_bias is not None:
+                layer.w13_bias.data = layer.w13_bias.data.to(torch.float32)
+            if layer.w2_bias.data is not None:
+                layer.w2_bias.data = layer.w2_bias.data.to(torch.float32)
+
+            e, n, k = layer.w13_weight.shape
+            layer.w13_weight.view(torch.uint8).copy_(
+                layer.w13_weight.data.view(torch.uint8)
+                .view(e, n // 2, 2, k)
+                .permute(0, 2, 1, 3)
+                .contiguous()
+                .view(e, n, k)
+            )
+            layer.w13_weight_scale.data = (
+                layer.w13_weight_scale.data.view(e, n // 2, 2, -1)
+                .permute(0, 2, 1, 3)
+                .contiguous()
+                .view(e, n, -1)
+            )
+            layer.w13_weight.data = layer.w13_weight.data.view(torch.float4_e2m1fn_x2)
+            layer.w2_weight.data = layer.w2_weight.data.view(torch.float4_e2m1fn_x2)
+
+            layer.w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(
+                layer.w13_weight, 16, True
+            )
+            shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4(
+                layer.w13_weight_scale.view(-1, layer.w13_weight_scale.shape[-1]),
+                self.num_experts,
+                True,
+            )
+
+            layer.w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(
+                layer.w2_weight, 16, False
+            )
+            shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4(
+                layer.w2_weight_scale.view(-1, layer.w2_weight_scale.shape[-1]),
+                self.num_experts,
+                False,
+            )
+
+            layer.w13_bias.data = (
+                layer.w13_bias.data.view(-1, n // 2, 2)
+                .permute(0, 2, 1)
+                .contiguous()
+                .view(-1, n)
+            )
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                shuffled_w13_scale, requires_grad=False
+            )
+            layer.w2_weight_scale = torch.nn.Parameter(
+                shuffled_w2_scale, requires_grad=False
+            )
+            # replace_parameter(layer, "w13_bias", w13_bias)
+            # replace_parameter(layer, "w13_weight_scale", w13_weight_scale)
+            # replace_parameter(layer, "w2_weight_scale", w2_weight_scale)
+            # replace_parameter(layer, "w13_weight", w13_weight)
+            # replace_parameter(layer, "w2_weight", w2_weight)
+
         elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 
@@ -792,7 +865,6 @@ def _interleave_mxfp4_cutlass_sm90(w):
 
             layer.w13_bias = Parameter(w13_bias, requires_grad=False)
             layer.w2_bias = Parameter(w2_bias, requires_grad=False)
-
             # Ideally we'd use FusedMoEModularKernel.prepare_finalize object
             # (stored in self.fused_experts) to determine if the MoE has a
             # batched activation format. As self.fused_experts is not
@@ -803,7 +875,6 @@ def _interleave_mxfp4_cutlass_sm90(w):
                 num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8
             else:
                 num_warps = 8
-
             w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
                 layer.w13_weight, layer.w13_weight_scale, num_warps
             )
@@ -817,13 +888,13 @@ def _interleave_mxfp4_cutlass_sm90(w):
             self.w2_precision_config = PrecisionConfig(
                 weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
             )
-
             self.w13_weight = w13_weight
             self.w2_weight = w2_weight
             del layer.w13_weight
             del layer.w2_weight
             layer.w13_weight = w13_weight
             layer.w2_weight = w2_weight
+
         else:
             raise ValueError(
                 f"Unsupported mxfp4_backend: {self.mxfp4_backend}: "
@@ -862,6 +933,7 @@ def get_fused_moe_quant_config(
         elif self.mxfp4_backend in [
             Mxfp4Backend.SM100_FI_MXFP4_BF16,
             Mxfp4Backend.SM90_FI_MXFP4_BF16,
+            Mxfp4Backend.CK,
         ]:
             return mxfp4_w4a16_moe_quant_config(
                 w1_bias=layer.w13_bias,
@@ -933,6 +1005,7 @@ def is_monolithic(self) -> bool:
             self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
             or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
             or self.mxfp4_backend == Mxfp4Backend.TRITON
+            or self.mxfp4_backend == Mxfp4Backend.CK
         )
 
     def apply(
@@ -1054,6 +1127,27 @@ def apply_monolithic(
                 tune_max_num_tokens=max(self.max_capture_size, 1),
             )[0]
             return trtllm_gen_output
+        elif self.mxfp4_backend == Mxfp4Backend.CK:
+            topk_weights, topk_ids = rocm_aiter_ops.fused_topk(
+                x, router_logits, layer.top_k, True
+            )
+            output = rocm_aiter_ops.fused_moe(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                activation_method=rocm_aiter_ops.get_aiter_activation_type("swiglu"),
+                quant_method=rocm_aiter_ops.get_aiter_quant_type("per_1x32"),
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                doweight_stage1=False,
+                hidden_pad=self.hidden_pad // 128 * 128,
+                intermediate_pad=self.intermediate_pad // 64 * 64 * 2,
+                bias1=layer.w13_bias,
+                bias2=layer.w2_bias,
+            )
+            return output
         elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (  # noqa: E501
                 triton_kernel_moe_forward,

From 0edf101d2b502a709e9e119c693567c31b80ef36 Mon Sep 17 00:00:00 2001
From: Micah Williamson <micah.williamson@amd.com>
Date: Fri, 27 Feb 2026 22:16:34 -0600
Subject: [PATCH 098/154] [ROCm] Add `stablelm` Head Size 80 To Supported Head
 Sizes For ROCM_ATTN (#35527)

Signed-off-by: Micah Williamson <micah.williamson@amd.com>
---
 docs/design/attention_backends.md       | 2 +-
 vllm/v1/attention/backends/rocm_attn.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index 3d0fcd6c7716..6d5c007e3ea4 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -172,7 +172,7 @@ Priority is **1 = highest** (tried first).
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
 | `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
 | `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
-| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
+| `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
 | `TREE_ATTN` |  | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
 | `TRITON_ATTN` |  | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index d4bfa764febe..b53170c98976 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -182,7 +182,7 @@ def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
-        return [32, 64, 96, 128, 160, 192, 224, 256]
+        return [32, 64, 80, 96, 128, 160, 192, 224, 256]
 
     @classmethod
     def validate_head_size(cls, head_size: int) -> None:

From fd68cd132bbd8844d733ef9069e0d3b28f11c20e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Feb 2026 12:20:55 +0800
Subject: [PATCH 099/154] [Bugfix] Fixes for SLA finder (#35537)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/benchmarking/sweeps.md        | 41 ++++++++++++++++++++----------
 vllm/benchmarks/datasets.py        |  4 ++-
 vllm/benchmarks/sweep/plot.py      |  9 +++++--
 vllm/benchmarks/sweep/serve.py     |  4 +++
 vllm/benchmarks/sweep/serve_sla.py | 28 +++++++++++++++++---
 5 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index e0a7a1b6ddc1..5571db0a5a0a 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -112,6 +112,7 @@ Example command:
 vllm bench sweep serve_sla \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100' \
+    --sla-variable max_concurrency \
     --serve-params benchmarks/serve_hparams.json \
     --bench-params benchmarks/bench_hparams.json
     -o benchmarks/results
@@ -119,8 +120,8 @@ vllm bench sweep serve_sla \
 
 The algorithm for scanning through different values of `sla_variable` can be summarized as follows:
 
-1. Run the benchmark once with `sla_variable = 1` to simulate serial inference. This results in the lowest possible latency and throughput.
-2. Run the benchmark once with `sla_variable = num_prompts` to simulate batch inference over the whole dataset. This results in the highest possible latency and throughput.
+1. Run the benchmark by sending requests one at a time (serial inference). This results in the lowest possible latency and throughput.
+2. Run the benchmark by sending all requests at once (batch inference). This results in the highest possible latency and throughput.
 3. Estimate the maximum value of `sla_variable` that can be supported by the server without oversaturating it.
 4. Run the benchmark over intermediate values of `sla_variable` uniformly using the remaining iterations.
 
@@ -129,6 +130,9 @@ You can override the number of iterations in the algorithm by setting `--sla-ite
 !!! tip
     This is our equivalent of [GuideLLM's `--profile sweep`](https://github.com/vllm-project/guidellm/blob/v0.5.3/src/guidellm/benchmark/profiles.py#L575).
 
+    In general, `--sla-variable max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
+    Nevertheless, we default to `--sla-variable request_rate` to maintain similar behavior as GuideLLM.
+
 ## Startup Benchmark
 
 `vllm bench sweep startup` runs `vllm bench startup` across parameter combinations to compare cold/warm startup time for different engine settings.
@@ -197,23 +201,32 @@ Control the variables to plot via `--var-x` and `--var-y`, optionally applying `
 Example commands for visualizing [SLA Scanner](#sla-scanner) results:
 
 ```bash
-# Latency increases as the request rate increases
-vllm bench sweep plot benchmarks/results/<timestamp> \
-    --var-x request_rate \
-    --var-y p99_ttft_ms \
-    --row-by random_input_len \
-    --col-by random_output_len \
+# Name of the directory that stores the results
+TIMESTAMP=$1
+
+# Latency increases as the workload increases
+vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+    --var-x max_concurrency \
+    --var-y median_ttft_ms \
+    --col-by _benchmark_name \
+    --curve-by max_num_seqs,max_num_batched_tokens \
+    --fig-name latency_curve
+
+# Throughput saturates as workload increases
+vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+    --var-x max_concurrency \
+    --var-y total_token_throughput \
+    --col-by _benchmark_name \
     --curve-by max_num_seqs,max_num_batched_tokens \
-    --filter-by 'request_rate<=128'
+    --fig-name throughput_curve
 
 # Tradeoff between latency and throughput
-vllm bench sweep plot benchmarks/results/<timestamp> \
-    --var-x request_throughput \
+vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+    --var-x total_token_throughput \
     --var-y median_ttft_ms \
-    --row-by random_input_len \
-    --col-by random_output_len \
+    --col-by _benchmark_name \
     --curve-by max_num_seqs,max_num_batched_tokens \
-    --filter-by 'request_rate<=128'
+    --fig-name latency_throughput
 ```
 
 !!! tip
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index a8b6b216153d..0cd76d8910f7 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -60,6 +60,8 @@
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_NUM_PROMPTS = 1000
+
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@@ -1338,7 +1340,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "--num-prompts",
         type=int,
-        default=1000,
+        default=DEFAULT_NUM_PROMPTS,
         help="Number of prompts to process.",
     )
     parser.add_argument(
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 53c7db3879ef..4f9184f9540b 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -324,6 +324,11 @@ def _plot_fig(
     df = filter_by.apply(df)
     df = bin_by.apply(df)
 
+    if len(df) == 0:
+        print(f"No data to plot. Filters: {filter_by}")
+        print("[END FIGURE]")
+        return
+
     # Sort by curve_by columns alphabetically for consistent legend ordering
     if curve_by:
         df = df.sort_values(by=curve_by)
@@ -570,13 +575,13 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         parser.add_argument(
             "--var-x",
             type=str,
-            default="request_throughput",
+            default="total_token_throughput",
             help="The variable for the x-axis.",
         )
         parser.add_argument(
             "--var-y",
             type=str,
-            default="p99_ttft_ms",
+            default="median_ttft_ms",
             help="The variable for the y-axis",
         )
         parser.add_argument(
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 7420f2518aae..4ab2dab5f077 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -138,12 +138,16 @@ def _get_comb_base_path(
     output_dir: Path,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
+    *,
+    extra_parts: tuple[str, ...] = (),
 ):
     parts = list[str]()
     if serve_comb:
         parts.extend(("SERVE-", serve_comb.name))
     if bench_comb:
         parts.extend(("BENCH-", bench_comb.name))
+    if extra_parts:
+        parts.extend(extra_parts)
 
     return output_dir / sanitize_filename("-".join(parts))
 
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py
index 89169ec15957..38d54ea42f5c 100644
--- a/vllm/benchmarks/sweep/serve_sla.py
+++ b/vllm/benchmarks/sweep/serve_sla.py
@@ -10,6 +10,7 @@
 import numpy as np
 from typing_extensions import assert_never
 
+from vllm.benchmarks.datasets import DEFAULT_NUM_PROMPTS
 from vllm.utils.import_utils import PlaceholderModule
 
 from .param_sweep import ParameterSweep, ParameterSweepItem
@@ -65,7 +66,12 @@ def run_comb_sla(
         bench_cmd,
         serve_comb=serve_comb,
         bench_comb=bench_comb_sla,
-        base_path=_get_comb_base_path(output_dir, serve_comb, bench_comb_sla),
+        base_path=_get_comb_base_path(
+            output_dir,
+            serve_comb,
+            bench_comb,
+            extra_parts=("SLA-", f"{sla_variable}={sla_value}"),
+        ),
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
@@ -93,11 +99,25 @@ def explore_sla(
     if sla_iters < 2:
         raise ValueError("`sla_iters` should be at least 2")
 
+    dataset_size = DEFAULT_NUM_PROMPTS
+    if "num_prompts" in bench_comb:
+        dataset_size = int(bench_comb["num_prompts"])  # type: ignore
+    else:
+        for i, arg in enumerate(bench_cmd):
+            if arg == "--num-prompts" and i + 1 < len(bench_cmd):
+                dataset_size = int(bench_cmd[i + 1])
+                break
+            elif arg.startswith("--num-prompts="):
+                dataset_size = int(arg.split("=", 1)[1])
+                break
+
+    print(f"Dataset size: {dataset_size}")
+
     serial_comb_data = run_comb_sla(
         server,
         bench_cmd,
         serve_comb=serve_comb,
-        bench_comb=bench_comb,
+        bench_comb=bench_comb | {"max_concurrency": 1},
         output_dir=output_dir,
         num_runs=num_runs,
         dry_run=dry_run,
@@ -109,13 +129,13 @@ def explore_sla(
         server,
         bench_cmd,
         serve_comb=serve_comb,
-        bench_comb=bench_comb,
+        bench_comb=bench_comb | {"max_concurrency": dataset_size},
         output_dir=output_dir,
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
         sla_variable=sla_variable,
-        sla_value=int(bench_comb.get("num_prompts", 1000)),  # type: ignore
+        sla_value=dataset_size,
     )
 
     if serial_comb_data is None or batch_comb_data is None:

From 2562e0271e51950da23c57b237649250bf41d9e0 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Fri, 27 Feb 2026 23:27:40 -0500
Subject: [PATCH 100/154] [MTP] Validate that MTP weights are actually loaded
 (#35548)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/model_executor/models/deepseek_mtp.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 182828c91027..c75ee1a1bbfe 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -415,6 +415,26 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                         weight_loader(param, loaded_weight)
             if not is_fusion_moe_shared_experts_layer:
                 loaded_params.add(name)
+
+        # Validate that weights were loaded for each expected MTP layer.
+        loaded_layers: set[int] = set()
+        for param_name in loaded_params:
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, param_name)
+            if spec_layer is not None:
+                loaded_layers.add(spec_layer)
+        for layer_idx in range(
+            self.model.mtp_start_layer_idx,
+            self.model.mtp_start_layer_idx + self.model.num_mtp_layers,
+        ):
+            if layer_idx not in loaded_layers:
+                raise ValueError(
+                    f"MTP speculative decoding layer {layer_idx} weights "
+                    f"missing from checkpoint. The checkpoint may have "
+                    f"been quantized without including the MTP layers. "
+                    f"Use a checkpoint that includes MTP layer weights, "
+                    f"or disable speculative decoding."
+                )
+
         return loaded_params
 
     def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:

From 90805ff46434eb81f3c609498146e2b1a2477c2d Mon Sep 17 00:00:00 2001
From: Ma Jian <jian1.ma@intel.com>
Date: Sat, 28 Feb 2026 12:35:21 +0800
Subject: [PATCH 101/154] [CI/Build] CPU release supports both of AVX2 and
 AVX512 (#35466)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: jiang1.li <jiang1.li@intel.com>
---
 cmake/cpu_extension.cmake    | 250 +++++++++++++++++------------------
 csrc/cpu/torch_bindings.cpp  |  17 +--
 setup.py                     |  11 +-
 vllm/_custom_ops.py          |   4 +-
 vllm/platforms/cpu.py        |  24 ++++
 vllm/v1/worker/cpu_worker.py |   2 +-
 6 files changed, 161 insertions(+), 147 deletions(-)

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 5a0980dcc965..dde8cc20751b 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -13,28 +13,16 @@ endif()
 #
 # Define environment variables for special configurations
 #
-set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
-set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
-set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
-set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
-set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
+set(ENABLE_X86_ISA $ENV{VLLM_CPU_X86})
 set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})
 
 include_directories("${CMAKE_SOURCE_DIR}/csrc")
 
-
 set (ENABLE_NUMA TRUE)
 
 #
 # Check the compile flags
 #
-
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
-    list(APPEND CXX_COMPILE_FLAGS
-        "-mf16c"
-    )
-endif()
-
 if(MACOSX_FOUND)
     list(APPEND CXX_COMPILE_FLAGS
         "-DVLLM_CPU_EXTENSION")
@@ -78,18 +66,6 @@ function(check_sysctl TARGET OUT)
     endif()
 endfunction()
 
-
-function (is_avx512_disabled OUT)
-    set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
-    if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
-        set(${OUT} ON PARENT_SCOPE)
-    else()
-        set(${OUT} OFF PARENT_SCOPE)
-    endif()
-endfunction()
-
-is_avx512_disabled(AVX512_DISABLED)
-
 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     message(STATUS "Apple Silicon Detected")
     set(APPLE_SILICON_FOUND TRUE)
@@ -97,8 +73,6 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     check_sysctl(hw.optional.neon ASIMD_FOUND)
     check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
 else()
-    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
-    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
     find_isa(${CPUINFO} "Power11" POWER11_FOUND)
     find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
     find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
@@ -108,77 +82,32 @@ else()
     find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
 
     # Support cross-compilation by allowing override via environment variables
-    if (ENABLE_AVX2)
-        set(AVX2_FOUND ON)
-        message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
-    endif()
-    if (ENABLE_AVX512)
-        set(AVX512_FOUND ON)
-        message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
-    endif()
     if (ENABLE_ARM_BF16)
         set(ARM_BF16_FOUND ON)
         message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
     endif()
 endif()
 
-if (AVX512_FOUND AND NOT AVX512_DISABLED)
-    list(APPEND CXX_COMPILE_FLAGS
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
+    set(ENABLE_X86_ISA ON)
+    if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3))
+        message(FATAL_ERROR "X86 backend requires gcc/g++ >= 12.3")
+    endif()
+    list(APPEND CXX_COMPILE_FLAGS "-mf16c")
+    list(APPEND CXX_COMPILE_FLAGS_AVX512 ${CXX_COMPILE_FLAGS})
+    list(APPEND CXX_COMPILE_FLAGS_AVX2 ${CXX_COMPILE_FLAGS})
+    list(APPEND CXX_COMPILE_FLAGS_AVX512
         "-mavx512f"
         "-mavx512vl"
         "-mavx512bw"
-        "-mavx512dq")
-
-    find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
-    if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
-            list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
-            set(ENABLE_AVX512BF16 ON)
-        else()
-            set(ENABLE_AVX512BF16 OFF)
-            message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
-        endif()
-    else()
-        set(ENABLE_AVX512BF16 OFF)
-        message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
-    endif()
-
-    find_isa(${CPUINFO} "avx512_vnni" AVX512VNNI_FOUND)
-    if (AVX512VNNI_FOUND OR ENABLE_AVX512VNNI)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
-            list(APPEND CXX_COMPILE_FLAGS "-mavx512vnni")
-            set(ENABLE_AVX512VNNI ON)
-        else()
-            set(ENABLE_AVX512VNNI OFF)
-            message(WARNING "Disable AVX512-VNNI ISA support, requires gcc/g++ >= 12.3")
-        endif()
-    else()
-        set(ENABLE_AVX512VNNI OFF)
-        message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
-    endif()
-
-    find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
-    if (AMXBF16_FOUND OR ENABLE_AMXBF16)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
-            list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
-            set(ENABLE_AMXBF16 ON)
-            add_compile_definitions(-DCPU_CAPABILITY_AMXBF16)
-        else()
-            set(ENABLE_AMXBF16 OFF)
-            message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
-        endif()
-    else()
-        set(ENABLE_AMXBF16 OFF)
-        message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
-    endif()
-    
-elseif (AVX2_FOUND)
-    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
-    message(WARNING "vLLM CPU backend using AVX2 ISA")
-    
+        "-mavx512dq"
+        "-mavx512bf16"
+        "-mavx512vnni"
+        "-mamx-bf16"
+        "-mamx-tile")
+    list(APPEND CXX_COMPILE_FLAGS_AVX2
+        "-mavx2")
 elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     message(STATUS "PowerPC detected")
     if (POWER9_FOUND)
@@ -219,12 +148,12 @@ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
         list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
     endif()
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
+    message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()
 
 
-# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+# Build oneDNN for GEMM kernels
+if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
     # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
     set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
@@ -329,13 +258,21 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
     set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
     set(ONEDNN_BUILD_GRAPH "OFF")
-    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+    set(ONEDNN_ENABLE_JIT_PROFILING "ON")
     set(ONEDNN_ENABLE_ITT_TASKS "OFF")
-    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
-    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(ONEDNN_VERBOSE "OFF")
+    set(ONEDNN_ENABLE_MAX_CPU_ISA "ON")
+    set(ONEDNN_ENABLE_CPU_ISA_HINTS "ON")
+    set(ONEDNN_VERBOSE "ON")
     set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
+    # TODO: Refactor this
+    if (ENABLE_X86_ISA)
+        # Note: only enable oneDNN for AVX512
+        list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
+    else()
+        list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
+    endif()
+
     set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
     set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
     FetchContent_MakeAvailable(oneDNN)
@@ -348,14 +285,20 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
         PRIVATE ${oneDNN_SOURCE_DIR}/src
     )
     target_link_libraries(dnnl_ext dnnl torch)
-    target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+    target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
     list(APPEND LIBS dnnl_ext)
     set(USE_ONEDNN ON)
 else()
     set(USE_ONEDNN OFF)
 endif()
 
-message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
+# TODO: Refactor this
+if (ENABLE_X86_ISA)
+    message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
+    message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
+else()
+    message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
+endif()
 
 if(ENABLE_NUMA)
     list(APPEND LIBS numa)
@@ -390,25 +333,6 @@ set(VLLM_EXT_SRC
     "csrc/cpu/cpu_attn.cpp"
     "csrc/cpu/torch_bindings.cpp")
 
-if (AVX512_FOUND AND NOT AVX512_DISABLED)
-    set(VLLM_EXT_SRC
-        "csrc/cpu/shm.cpp"
-        "csrc/cpu/cpu_wna16.cpp"
-        "csrc/cpu/cpu_fused_moe.cpp"
-        ${VLLM_EXT_SRC})
-    if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
-        set(VLLM_EXT_SRC
-            "csrc/cpu/sgl-kernels/gemm.cpp"
-            "csrc/cpu/sgl-kernels/gemm_int8.cpp"
-            "csrc/cpu/sgl-kernels/gemm_fp8.cpp"
-            "csrc/cpu/sgl-kernels/moe.cpp"
-            "csrc/cpu/sgl-kernels/moe_int8.cpp"
-            "csrc/cpu/sgl-kernels/moe_fp8.cpp"
-            ${VLLM_EXT_SRC})
-        add_compile_definitions(-DCPU_CAPABILITY_AVX512)
-    endif()
-endif()
-
 if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
     set(VLLM_EXT_SRC
         "csrc/cpu/shm.cpp"
@@ -421,21 +345,83 @@ if(USE_ONEDNN)
         ${VLLM_EXT_SRC})
 endif()
 
-message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
-
-#
-# Define extension targets
-#
+if (ENABLE_X86_ISA)
+    set(VLLM_EXT_SRC_AVX512
+        "csrc/cpu/sgl-kernels/gemm.cpp"
+        "csrc/cpu/sgl-kernels/gemm_int8.cpp"
+        "csrc/cpu/sgl-kernels/gemm_fp8.cpp"
+        "csrc/cpu/sgl-kernels/moe.cpp"
+        "csrc/cpu/sgl-kernels/moe_int8.cpp"
+        "csrc/cpu/sgl-kernels/moe_fp8.cpp"
+        "csrc/cpu/shm.cpp"
+        "csrc/cpu/cpu_wna16.cpp"
+        "csrc/cpu/cpu_fused_moe.cpp"
+        "csrc/cpu/utils.cpp"
+        "csrc/cpu/cpu_attn.cpp"
+        "csrc/cpu/dnnl_kernels.cpp"
+        "csrc/cpu/torch_bindings.cpp"
+        # TODO: Remove these files
+        "csrc/cpu/activation.cpp"
+        "csrc/cpu/layernorm.cpp"
+        "csrc/cpu/mla_decode.cpp"
+        "csrc/cpu/pos_encoding.cpp"
+        "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") 
+
+    set(VLLM_EXT_SRC_AVX2 
+        "csrc/cpu/utils.cpp"
+        "csrc/cpu/cpu_attn.cpp"
+        "csrc/cpu/torch_bindings.cpp"
+        # TODO: Remove these files
+        "csrc/cpu/activation.cpp"
+        "csrc/cpu/layernorm.cpp"
+        "csrc/cpu/mla_decode.cpp"
+        "csrc/cpu/pos_encoding.cpp"
+        "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") 
+
+    message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
+    message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
+
+    define_extension_target(
+        _C
+        DESTINATION vllm
+        LANGUAGE CXX
+        SOURCES ${VLLM_EXT_SRC_AVX512}
+        LIBRARIES ${LIBS}
+        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
+        USE_SABI 3
+        WITH_SOABI
+    )
 
-define_extension_target(
-    _C
-    DESTINATION vllm
-    LANGUAGE CXX
-    SOURCES ${VLLM_EXT_SRC}
-    LIBRARIES ${LIBS}
-    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-    USE_SABI 3
-    WITH_SOABI
-)
+    # For SGL kernels
+    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
+    # For AMX kernels
+    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
+
+    define_extension_target(
+        _C_AVX2
+        DESTINATION vllm
+        LANGUAGE CXX
+        SOURCES ${VLLM_EXT_SRC_AVX2}
+        LIBRARIES ${LIBS}
+        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
+        USE_SABI 3
+        WITH_SOABI
+    )
+else()
+    message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
+    #
+    # Define extension targets
+    #
+    define_extension_target(
+        _C
+        DESTINATION vllm
+        LANGUAGE CXX
+        SOURCES ${VLLM_EXT_SRC}
+        LIBRARIES ${LIBS}
+        COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
+        USE_SABI 3
+        WITH_SOABI
+    )
+endif()
 
 message(STATUS "Enabling C extension.")
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index 11e1305c6027..2ea482148d4c 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -4,6 +4,10 @@
 
 #include <torch/library.h>
 
+// Note: overwrite the external defination for sharing same name between
+// libraries use different ISAs.
+#define TORCH_EXTENSION_NAME _C
+
 std::string init_cpu_threads_env(const std::string& cpu_ids);
 
 void release_dnnl_matmul_handler(int64_t handler);
@@ -324,19 +328,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "str act, str isa) -> ()");
   ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
 #endif
-}
-
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
-  // CPU utils
-  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
-}
-
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
-  cpu_ops.def(
+  ops.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
+  ops.def(
       "mla_decode_kvcache("
       "   Tensor! out, Tensor query, Tensor kv_cache,"
       "   float scale, Tensor block_tables, Tensor seq_lens) -> ()");
-  cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
+  ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/setup.py b/setup.py
index a6f2019e5bdb..afdd4b19b2f0 100644
--- a/setup.py
+++ b/setup.py
@@ -818,7 +818,7 @@ def _is_xpu() -> bool:
 
 
 def _build_custom_ops() -> bool:
-    return _is_cuda() or _is_hip() or _is_cpu()
+    return _is_cuda() or _is_hip()
 
 
 def get_rocm_version():
@@ -987,6 +987,15 @@ def _read_requirements(filename: str) -> list[str]:
             CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
         )
 
+if _is_cpu():
+    import platform
+
+    if platform.machine() in ("x86_64", "AMD64"):
+        ext_modules.append(CMakeExtension(name="vllm._C"))
+        ext_modules.append(CMakeExtension(name="vllm._C_AVX2"))
+    else:
+        ext_modules.append(CMakeExtension(name="vllm._C"))
+
 if _build_custom_ops():
     ext_modules.append(CMakeExtension(name="vllm._C"))
 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 69f080ae20d6..46f9dfad9211 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -178,9 +178,7 @@ def mla_decode_kvcache_cpu(
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
 ) -> None:
-    torch.ops._C_cpu.mla_decode_kvcache(
-        out, query, kv_cache, scale, block_tables, seq_lens
-    )
+    torch.ops._C.mla_decode_kvcache(out, query, kv_cache, scale, block_tables, seq_lens)
 
 
 # merge attn states ops
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index b3d6b0ed64a3..421cf8797f2c 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -483,3 +483,27 @@ def opaque_attention_op(cls) -> bool:
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:
         return True
+
+    @classmethod
+    def import_kernels(cls) -> None:
+        if Platform.get_cpu_architecture() in (CpuArchEnum.X86,):
+            if torch._C._cpu._is_avx512_supported():
+                try:
+                    import vllm._C  # noqa: F401
+                except ImportError as e:
+                    logger.warning("Failed to import from vllm._C: %r", e)
+            else:
+                # Note: The lib name is _C_AVX2, but the module name is _C.
+                # This will cause a exception "dynamic module does define
+                # module export function". But the library is imported
+                # successfully. So ignore the exception for now, until we find
+                # a solution.
+                try:
+                    import vllm._C_AVX2  # noqa: F401
+                except ImportError as e:
+                    logger.warning("Failed to import from vllm._C_AVX2: %r", e)
+        else:
+            try:
+                import vllm._C  # noqa: F401
+            except ImportError as e:
+                logger.warning("Failed to import from vllm._C: %r", e)
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index 752b692f809a..d0cecda29fdb 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -85,7 +85,7 @@ def init_device(self):
             self.local_omp_cpuid = omp_cpuids_list[self.rank]
 
         if self.local_omp_cpuid != "nobind":
-            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            ret = torch.ops._C.init_cpu_threads_env(self.local_omp_cpuid)
             if ret:
                 logger.info(ret)
 

From 538b28841f1ff1fbe98db9a48d58662a125723b9 Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Tue, 24 Feb 2026 16:47:59 -0800
Subject: [PATCH 102/154] [XPU][NIXL] support GPUDirect RDMA

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 tools/install_nixl_from_source_ubuntu.py               |  5 +++--
 .../kv_transfer/kv_connector/v1/nixl_connector.py      | 10 +++++-----
 vllm/platforms/xpu.py                                  |  6 ++++++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index b8a55c615426..b2360d7720a6 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -139,7 +139,8 @@ def build_and_install_prerequisites(args):
     if not os.path.exists(UCX_DIR):
         run_command(["git", "clone", UCX_REPO_URL, UCX_DIR])
     ucx_source_path = os.path.abspath(UCX_DIR)
-    run_command(["git", "checkout", "v1.19.x"], cwd=ucx_source_path)
+    # Pin UCX to commit e5d9887 for XPU GDR support until a release includes it.
+    run_command(["git", "checkout", "e5d9887"], cwd=ucx_source_path)
     run_command(["./autogen.sh"], cwd=ucx_source_path)
     configure_command = [
         "./configure",
@@ -152,7 +153,7 @@ def build_and_install_prerequisites(args):
         "--enable-devel-headers",
         "--with-verbs",
         "--enable-mt",
-        "--with-ze=no",
+        "--with-ze=yes",
     ]
     run_command(configure_command, cwd=ucx_source_path)
     run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 87091d650b17..ff198580fb4d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -135,7 +135,10 @@
         "cpu",
     ),
     "tpu": ("cpu",),
-    "xpu": ("cpu",),
+    "xpu": (
+        "cpu",
+        "xpu",
+    ),
     "cpu": ("cpu",),
 }
 # support for oot platform by providing mapping in current_platform
@@ -945,10 +948,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         # type based on kv_buffer_device
         nixl_memory_type = current_platform.get_nixl_memory_type()
         if nixl_memory_type is None:
-            if self.kv_buffer_device == "cuda":
-                nixl_memory_type = "VRAM"
-            elif self.kv_buffer_device == "cpu":
-                nixl_memory_type = "DRAM"
+            nixl_memory_type = "DRAM" if self.kv_buffer_device == "cpu" else "VRAM"
         if nixl_memory_type is None:
             raise RuntimeError(
                 f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index c97c3297e1ba..c06afcb69937 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -221,6 +221,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
             )
 
+        # In some cases, the internal memory type cache can misdetect GPU
+        # memory as host memory, also leading to invalid memory access.
+        # This cache can be disabled by setting UCX_MEMTYPE_CACHE=n.
+        # ref. https://openucx.readthedocs.io/en/master/faq.html
+        os.environ["UCX_MEMTYPE_CACHE"] = "n"
+
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:
         return True

From cd1fbbace466aa8143a36b4810aadde50109bb68 Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Wed, 25 Feb 2026 01:55:45 -0800
Subject: [PATCH 103/154] make UCX_VERSION as parameter

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 docker/Dockerfile.xpu                    | 2 ++
 tools/install_nixl_from_source_ubuntu.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index d030b151e74e..ce1c88e557e5 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -117,6 +117,8 @@ RUN uv pip install -e tests/vllm_test_utils
 
 # install nixl from source code
 ENV NIXL_VERSION=0.7.0
+# Pin UCX to commit e5d9887 for XPU GDR support until a release includes it.
+ENV UCX_VERSION=e5d9887
 RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
 
 # FIX triton
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index b2360d7720a6..09a2e624b1f7 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -32,6 +32,7 @@ def get_latest_nixl_version():
 
 
 NIXL_VERSION = os.environ.get("NIXL_VERSION", get_latest_nixl_version())
+UCX_VERSION = os.environ.get("UCX_VERSION", "v1.19.x")
 
 
 def run_command(command, cwd=".", env=None):

From f3aaa47ebe500db5d5aa0b484b9898687f89027c Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Wed, 25 Feb 2026 01:59:20 -0800
Subject: [PATCH 104/154] make with-ze as parameter

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 docker/Dockerfile.xpu                    | 1 +
 tools/install_nixl_from_source_ubuntu.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ce1c88e557e5..33e955016cab 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -119,6 +119,7 @@ RUN uv pip install -e tests/vllm_test_utils
 ENV NIXL_VERSION=0.7.0
 # Pin UCX to commit e5d9887 for XPU GDR support until a release includes it.
 ENV UCX_VERSION=e5d9887
+ENV WITH_ZE=yes
 RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
 
 # FIX triton
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index 09a2e624b1f7..2939a2ee89f8 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -33,6 +33,7 @@ def get_latest_nixl_version():
 
 NIXL_VERSION = os.environ.get("NIXL_VERSION", get_latest_nixl_version())
 UCX_VERSION = os.environ.get("UCX_VERSION", "v1.19.x")
+WITH_ZE = os.environ.get("WITH_ZE", "no").lower()
 
 
 def run_command(command, cwd=".", env=None):
@@ -141,7 +142,7 @@ def build_and_install_prerequisites(args):
         run_command(["git", "clone", UCX_REPO_URL, UCX_DIR])
     ucx_source_path = os.path.abspath(UCX_DIR)
     # Pin UCX to commit e5d9887 for XPU GDR support until a release includes it.
-    run_command(["git", "checkout", "e5d9887"], cwd=ucx_source_path)
+    run_command(["git", "checkout", UCX_VERSION], cwd=ucx_source_path)
     run_command(["./autogen.sh"], cwd=ucx_source_path)
     configure_command = [
         "./configure",
@@ -154,7 +155,7 @@ def build_and_install_prerequisites(args):
         "--enable-devel-headers",
         "--with-verbs",
         "--enable-mt",
-        "--with-ze=yes",
+        f"--with-ze={WITH_ZE}",
     ]
     run_command(configure_command, cwd=ucx_source_path)
     run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)

From 2185c10a26438ff221be142f412d8ecd3c384bed Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Wed, 25 Feb 2026 16:33:24 -0800
Subject: [PATCH 105/154] remove unused comments

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 tools/install_nixl_from_source_ubuntu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index 2939a2ee89f8..4a2420bd45df 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -141,7 +141,6 @@ def build_and_install_prerequisites(args):
     if not os.path.exists(UCX_DIR):
         run_command(["git", "clone", UCX_REPO_URL, UCX_DIR])
     ucx_source_path = os.path.abspath(UCX_DIR)
-    # Pin UCX to commit e5d9887 for XPU GDR support until a release includes it.
     run_command(["git", "checkout", UCX_VERSION], cwd=ucx_source_path)
     run_command(["./autogen.sh"], cwd=ucx_source_path)
     configure_command = [

From ff5fcd29aee139db9b9ac229c9e4620d6be943fd Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Fri, 27 Feb 2026 19:57:33 -0800
Subject: [PATCH 106/154] update of install nixl and ucx

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 docker/Dockerfile.xpu                    | 31 +++++++++++++++++++-----
 tools/install_nixl_from_source_ubuntu.py |  6 ++---
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 33e955016cab..b33bd19e1589 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -115,12 +115,31 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # install development dependencies (for testing)
 RUN uv pip install -e tests/vllm_test_utils
 
-# install nixl from source code
-ENV NIXL_VERSION=0.7.0
-# Pin UCX to commit e5d9887 for XPU GDR support until a release includes it.
-ENV UCX_VERSION=e5d9887
-ENV WITH_ZE=yes
-RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+# install NIXL and UCX from source code
+ARG UCX_VERSION=e5d98879705239d254ede40b4a52891850cb5349
+ARG NIXL_VERSION=0.7.0
+
+RUN set -eux; \
+    git clone https://github.com/openucx/ucx /tmp/ucx_source; \
+    cd /tmp/ucx_source; \
+    git checkout "${UCX_VERSION}"; \
+    bash autogen.sh; \
+    ./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt; \
+    make CFLAGS="-Wno-error=incompatible-pointer-types"; \
+    make install; \
+    git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source; \
+    cd /tmp/nixl_source; \
+    git checkout "${NIXL_VERSION}"
+
+ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:/tmp/nixl_install/lib/pkgconfig:${PKG_CONFIG_PATH}
+ENV LD_LIBRARY_PATH=/tmp/nixl_install/lib:${LD_LIBRARY_PATH}
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /tmp/nixl_source && \
+    uv pip install --upgrade meson pybind11 patchelf && \
+    uv pip install -r requirements.txt && \
+    uv pip install . && \
+    rm -rf /tmp/ucx_source /tmp/nixl_source
 
 # FIX triton
 RUN --mount=type=cache,target=/root/.cache/uv \
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index 4a2420bd45df..b8a55c615426 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -32,8 +32,6 @@ def get_latest_nixl_version():
 
 
 NIXL_VERSION = os.environ.get("NIXL_VERSION", get_latest_nixl_version())
-UCX_VERSION = os.environ.get("UCX_VERSION", "v1.19.x")
-WITH_ZE = os.environ.get("WITH_ZE", "no").lower()
 
 
 def run_command(command, cwd=".", env=None):
@@ -141,7 +139,7 @@ def build_and_install_prerequisites(args):
     if not os.path.exists(UCX_DIR):
         run_command(["git", "clone", UCX_REPO_URL, UCX_DIR])
     ucx_source_path = os.path.abspath(UCX_DIR)
-    run_command(["git", "checkout", UCX_VERSION], cwd=ucx_source_path)
+    run_command(["git", "checkout", "v1.19.x"], cwd=ucx_source_path)
     run_command(["./autogen.sh"], cwd=ucx_source_path)
     configure_command = [
         "./configure",
@@ -154,7 +152,7 @@ def build_and_install_prerequisites(args):
         "--enable-devel-headers",
         "--with-verbs",
         "--enable-mt",
-        f"--with-ze={WITH_ZE}",
+        "--with-ze=no",
     ]
     run_command(configure_command, cwd=ucx_source_path)
     run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)

From 23743b924502c8d2322cd9ebb7370494c0201fd3 Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Fri, 27 Feb 2026 20:46:32 -0800
Subject: [PATCH 107/154] update nixl memory type

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 .../kv_transfer/kv_connector/v1/nixl_connector.py            | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index ff198580fb4d..c5a5b0450fbb 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -948,7 +948,10 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         # type based on kv_buffer_device
         nixl_memory_type = current_platform.get_nixl_memory_type()
         if nixl_memory_type is None:
-            nixl_memory_type = "DRAM" if self.kv_buffer_device == "cpu" else "VRAM"
+            if self.kv_buffer_device in ["cuda", "xpu"]:
+                nixl_memory_type = "VRAM"
+            elif self.kv_buffer_device == "cpu":
+                nixl_memory_type = "DRAM"
         if nixl_memory_type is None:
             raise RuntimeError(
                 f"{self.device_type} with {self.kv_buffer_device} kv_buffer "

From dea268336fb51b5bc342dca29189f3d4440ca2a0 Mon Sep 17 00:00:00 2001
From: Itay Alroy <75032521+itayalroy@users.noreply.github.com>
Date: Sat, 28 Feb 2026 06:46:42 +0200
Subject: [PATCH 108/154] [1/N] Elastic EP Milestone 2 (#34861)

Signed-off-by: Yongji Wu <wuyongji317@gmail.com>
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Ron Tourgeman <rtourgeman@nvidia.com>
Co-authored-by: Yongji Wu <wuyongji317@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
---
 .buildkite/test_areas/expert_parallelism.yaml |  17 +-
 .../passes/distributed/test_async_tp.py       |   8 +-
 .../distributed/test_fusion_all_reduce.py     |   2 +-
 .../distributed/test_sequence_parallelism.py  |   2 +-
 tests/conftest.py                             |  22 +-
 tests/distributed/eplb_utils.py               |   7 +-
 tests/distributed/test_elastic_ep.py          | 202 +++++++
 tests/distributed/test_eplb_execute.py        | 469 ++++++++-------
 .../test_nccl_symm_mem_allreduce.py           |   4 +-
 tests/distributed/test_pynccl.py              |   4 +-
 tests/kernels/mamba/test_mamba_mixer2.py      |   5 +-
 tests/lora/conftest.py                        |  21 +-
 tests/lora/test_fused_moe_lora_kernel.py      |   5 +-
 tests/lora/test_worker.py                     |   6 +-
 tests/models/test_vision.py                   |  14 +-
 tests/utils.py                                |  33 +-
 tests/v1/worker/test_gpu_model_runner.py      |   7 +-
 .../v1/worker/test_worker_memory_snapshot.py  |   8 +-
 vllm/compilation/wrapper.py                   |  49 ++
 vllm/config/parallel.py                       | 122 +++-
 .../device_communicators/all2all.py           |  31 +-
 .../base_device_communicator.py               |  57 +-
 .../device_communicators/cuda_communicator.py |  54 +-
 .../device_communicators/pynccl.py            |  36 +-
 vllm/distributed/elastic_ep/__init__.py       |   0
 .../distributed/elastic_ep/elastic_execute.py | 529 ++++++++++++++++
 vllm/distributed/elastic_ep/elastic_state.py  | 563 ++++++++++++++++++
 vllm/distributed/elastic_ep/standby_state.py  | 117 ++++
 vllm/distributed/eplb/async_worker.py         |   4 -
 vllm/distributed/eplb/eplb_state.py           | 312 +++-------
 vllm/distributed/eplb/rebalance_execute.py    |  80 ++-
 vllm/distributed/parallel_state.py            | 231 ++++++-
 vllm/distributed/stateless_coordinator.py     | 322 ++++++++++
 vllm/distributed/utils.py                     |  87 ++-
 vllm/engine/arg_utils.py                      |   5 +
 vllm/entrypoints/cli/serve.py                 |   6 +-
 vllm/envs.py                                  |  12 +
 vllm/model_executor/layers/fused_moe/layer.py |   5 +-
 vllm/platforms/cuda.py                        |  34 ++
 vllm/platforms/rocm.py                        |  34 ++
 vllm/v1/engine/__init__.py                    |  14 +
 vllm/v1/engine/async_llm.py                   |  41 +-
 vllm/v1/engine/coordinator.py                 |  21 +-
 vllm/v1/engine/core.py                        | 215 +++++--
 vllm/v1/engine/core_client.py                 | 319 ++++++++--
 vllm/v1/engine/utils.py                       |  60 +-
 vllm/v1/executor/multiproc_executor.py        |  21 +-
 vllm/v1/executor/ray_executor.py              |   6 +-
 vllm/v1/executor/uniproc_executor.py          |  17 +-
 vllm/v1/worker/cpu_model_runner.py            |   7 +-
 vllm/v1/worker/gpu_model_runner.py            |  79 ++-
 vllm/v1/worker/gpu_worker.py                  | 255 +-------
 vllm/v1/worker/workspace.py                   |  28 +
 53 files changed, 3603 insertions(+), 1006 deletions(-)
 create mode 100644 tests/distributed/test_elastic_ep.py
 create mode 100644 vllm/distributed/elastic_ep/__init__.py
 create mode 100644 vllm/distributed/elastic_ep/elastic_execute.py
 create mode 100644 vllm/distributed/elastic_ep/elastic_state.py
 create mode 100644 vllm/distributed/elastic_ep/standby_state.py
 create mode 100644 vllm/distributed/stateless_coordinator.py

diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
index 9a10476ed78a..1443d847eaf5 100644
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -20,4 +20,19 @@ steps:
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
-  - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
+  - pytest -v -s distributed/test_eplb_spec_decode.py
+
+- label: Elastic EP Scaling Test
+  timeout_in_minutes: 20
+  device: b200
+  optional: true
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/compilation/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_elastic_ep.py
diff --git a/tests/compile/passes/distributed/test_async_tp.py b/tests/compile/passes/distributed/test_async_tp.py
index df7747d1a1f3..abc71768c867 100644
--- a/tests/compile/passes/distributed/test_async_tp.py
+++ b/tests/compile/passes/distributed/test_async_tp.py
@@ -316,7 +316,6 @@ def async_tp_pass_on_test_model(
 
     # initialize distributed
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # configure vllm config for SequenceParallelismPass
     vllm_config = VllmConfig()
@@ -334,11 +333,10 @@ def async_tp_pass_on_test_model(
         model=model_name, trust_remote_code=True, dtype=dtype, seed=42
     )
 
-    async_tp_pass = AsyncTPPass(vllm_config)
-
-    # Set the global vllm_config for TestBackend which calls
-    # get_current_vllm_config()
     with set_current_vllm_config(vllm_config):
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+        async_tp_pass = AsyncTPPass(vllm_config)
         backend = TestBackend(async_tp_pass)
 
         assert (
diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py
index 6d5113b1e84b..4beac8c4fb53 100644
--- a/tests/compile/passes/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py
@@ -278,7 +278,6 @@ def all_reduce_fusion_pass_on_test_model(
     )
 
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     custom_ops = []
     if enable_rms_norm_custom_op:
@@ -304,6 +303,7 @@ def all_reduce_fusion_pass_on_test_model(
         model=model_name, trust_remote_code=True, dtype=dtype, seed=42
     )
     with set_current_vllm_config(vllm_config):
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
         all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
         noop_pass = NoOpEliminationPass(vllm_config)
         func_pass = FixFunctionalizationPass(vllm_config)
diff --git a/tests/compile/passes/distributed/test_sequence_parallelism.py b/tests/compile/passes/distributed/test_sequence_parallelism.py
index 46363a9a4a44..78c3cf92a067 100644
--- a/tests/compile/passes/distributed/test_sequence_parallelism.py
+++ b/tests/compile/passes/distributed/test_sequence_parallelism.py
@@ -242,7 +242,6 @@ def sequence_parallelism_pass_on_test_model(
 
     # initialize distributed
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # configure vllm config for SequenceParallelismPass
     custom_ops_list = custom_ops.split(",") if custom_ops else []
@@ -272,6 +271,7 @@ def sequence_parallelism_pass_on_test_model(
     )
 
     with set_current_vllm_config(vllm_config):
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
         noop_pass = NoOpEliminationPass(vllm_config)
         sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
         cleanup_pass = PostCleanupPass(vllm_config)
diff --git a/tests/conftest.py b/tests/conftest.py
index 22bb19f2f3d1..5a2beea89e64 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -176,16 +176,20 @@ def init_test_http_connection():
 
 @pytest.fixture
 def dist_init():
+    from tests.utils import ensure_current_vllm_config
+
     temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
-    initialize_model_parallel(1, 1)
-    yield
+
+    with ensure_current_vllm_config():
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend="nccl",
+        )
+        initialize_model_parallel(1, 1)
+        yield
     cleanup_dist_env_and_memory()
 
 
diff --git a/tests/distributed/eplb_utils.py b/tests/distributed/eplb_utils.py
index 27a63e021514..7c27347fd359 100644
--- a/tests/distributed/eplb_utils.py
+++ b/tests/distributed/eplb_utils.py
@@ -7,6 +7,7 @@
 import torch
 import torch.multiprocessing as mp
 
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.parallel_state import (
     init_distributed_environment,
 )
@@ -42,7 +43,11 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
     local_rank = os.environ["LOCAL_RANK"]
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
-    init_distributed_environment()
+
+    # Create a minimal vllm config for init_distributed_environment
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        init_distributed_environment()
 
     # Ensure each worker process has the same random seed
     random.seed(42)
diff --git a/tests/distributed/test_elastic_ep.py b/tests/distributed/test_elastic_ep.py
new file mode 100644
index 000000000000..1d0f615d6ea9
--- /dev/null
+++ b/tests/distributed/test_elastic_ep.py
@@ -0,0 +1,202 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import subprocess
+import time
+
+import pytest
+import requests
+
+from ..evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from ..utils import RemoteOpenAIServer, multi_gpu_test
+
+
+@pytest.fixture(autouse=True)
+def cleanup_ray_between_tests():
+    """Force-stop any lingering Ray processes between tests."""
+    subprocess.run(["ray", "stop", "--force"], timeout=30, capture_output=True)
+    time.sleep(5)
+    yield
+
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+
+NUM_GSM8K_QUESTIONS = 256
+EXPECTED_ACCURACY = 0.58
+ACCURACY_TOL = 0.08
+MAX_NUM_SEQS = 32
+
+
+def _send_scale_command(server: RemoteOpenAIServer, new_dp_size: int) -> bool:
+    url = server.url_for("scale_elastic_ep")
+    payload = {"new_data_parallel_size": new_dp_size}
+    headers = {"Content-Type": "application/json"}
+
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=300)
+        return response.status_code == 200
+    except requests.exceptions.RequestException:
+        return False
+
+
+def _run_gsm8k_eval(server: RemoteOpenAIServer, stage: str) -> float:
+    assert server.port is not None
+    result = evaluate_gsm8k(
+        num_questions=NUM_GSM8K_QUESTIONS,
+        host=f"http://{server.host}",
+        port=server.port,
+    )
+    accuracy = result["accuracy"]
+    print(
+        f"[{stage}] GSM8K accuracy: {accuracy:.3f} "
+        f"({result['num_questions']} questions)"
+    )
+    assert accuracy >= EXPECTED_ACCURACY, (
+        f"[{stage}] GSM8K accuracy {accuracy:.3f} is below "
+        f"expected threshold {EXPECTED_ACCURACY}"
+    )
+    return accuracy
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling():
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.8",
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "allgather_reducescatter",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "0",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--api-server-count",
+        "1",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        initial_accuracy = _run_gsm8k_eval(server, "Initial (2 GPUs)")
+
+        assert _send_scale_command(server, 4)
+        time.sleep(10)
+        scale_up_accuracy = _run_gsm8k_eval(server, "After scale up (4 GPUs)")
+
+        assert scale_up_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale up accuracy {scale_up_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        scale_down_accuracy = _run_gsm8k_eval(server, "After scale down (2 GPUs)")
+
+        assert scale_down_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale down accuracy {scale_down_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        print("\nAccuracy Summary:")
+        print(f"  Initial:    {initial_accuracy:.3f}")
+        print(
+            f"  Scale up:   {scale_up_accuracy:.3f} "
+            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
+        )
+        print(
+            f"  Scale down: {scale_down_accuracy:.3f} "
+            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
+        )
+        print(f"  Tolerance:  {ACCURACY_TOL:.3f}")
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling_uneven():
+    """Test scale up with uneven worker distribution.
+
+    This tests the case where num_new_workers % old_dp_size != 0,
+    specifically 2 -> 3 where remainder = 1 % 2 = 1.
+    This exercises the remainder handling in sender-receiver pairing.
+    """
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.8",
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "allgather_reducescatter",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "0",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--api-server-count",
+        "1",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        initial_accuracy = _run_gsm8k_eval(server, "Initial (2 GPUs)")
+
+        # Scale 2 -> 3: This has remainder = 1 % 2 = 1
+        # Tests uneven sender-receiver pairing
+        assert _send_scale_command(server, 3)
+        time.sleep(10)
+        scale_up_accuracy = _run_gsm8k_eval(server, "After scale up (3 GPUs)")
+
+        assert scale_up_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale up accuracy {scale_up_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        # Scale back down to 2
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        scale_down_accuracy = _run_gsm8k_eval(server, "After scale down (2 GPUs)")
+
+        assert scale_down_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale down accuracy {scale_down_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        print("\nAccuracy Summary (Uneven Scaling):")
+        print(f"  Initial:    {initial_accuracy:.3f}")
+        print(
+            f"  Scale up:   {scale_up_accuracy:.3f} "
+            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
+        )
+        print(
+            f"  Scale down: {scale_down_accuracy:.3f} "
+            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
+        )
+        print(f"  Tolerance:  {ACCURACY_TOL:.3f}")
diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py
index 48afc39c62ed..674a665b0626 100644
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -8,6 +8,7 @@
 import torch
 import torch.distributed
 
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.eplb.rebalance_execute import (
     move_from_buffer,
     rearrange_expert_weights_inplace,
@@ -244,91 +245,96 @@ def _test_async_transfer_layer_without_mtp_worker(
     num_logical_experts: int,
 ) -> None:
     set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )
 
-    tp_group = get_tp_group()
-    ep_group = tp_group.device_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size
 
-    total_physical_experts = world_size * num_local_experts
-    hidden_sizes = [16, 32]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
-    redundancy_config = create_redundancy_config(
-        num_logical_experts,
-        total_physical_experts,
-    )
-    old_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        redundancy_config,
-    )
+        tp_group = get_tp_group()
+        ep_group = tp_group.device_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
 
-    new_redundancy_config = create_redundancy_config(
-        num_logical_experts,
-        total_physical_experts,
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        new_redundancy_config,
-    )
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [16, 32]
 
-    expert_weights = create_expert_weights(
-        num_layers,
-        num_local_experts,
-        hidden_sizes,
-        ep_rank,
-        device,
-        old_indices,
-    )
-    old_indices_cpu = old_indices.cpu()
-    new_indices_cpu = new_indices.cpu()
+        redundancy_config = create_redundancy_config(
+            num_logical_experts,
+            total_physical_experts,
+        )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )
 
-    expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
-    cuda_stream = torch.cuda.Stream(device=device)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts,
+            total_physical_experts,
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )
 
-    for layer_idx in range(num_layers):
-        is_unchanged, is_received_locally, recv_metadata = asyncio.run(
-            transfer_layer(
-                old_layer_indices=old_indices_cpu[layer_idx],
-                new_layer_indices=new_indices_cpu[layer_idx],
+        expert_weights = create_expert_weights(
+            num_layers,
+            num_local_experts,
+            hidden_sizes,
+            ep_rank,
+            device,
+            old_indices,
+        )
+        old_indices_cpu = old_indices.cpu()
+        new_indices_cpu = new_indices.cpu()
+
+        expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
+        cuda_stream = torch.cuda.Stream(device=device)
+
+        for layer_idx in range(num_layers):
+            is_unchanged, is_received_locally, recv_metadata = asyncio.run(
+                transfer_layer(
+                    old_layer_indices=old_indices_cpu[layer_idx],
+                    new_layer_indices=new_indices_cpu[layer_idx],
+                    expert_weights=expert_weights[layer_idx],
+                    expert_weights_buffer=expert_buffer,
+                    ep_group=ep_group,
+                    cuda_stream=cuda_stream,
+                )
+            )
+            cuda_stream.synchronize()
+            move_from_buffer(
                 expert_weights=expert_weights[layer_idx],
-                expert_weights_buffer=expert_buffer,
-                ep_group=ep_group,
-                cuda_stream=cuda_stream,
+                expert_weights_buffers=expert_buffer,
+                is_unchanged=is_unchanged,
+                is_received_locally=is_received_locally,
+                recv_metadata=recv_metadata,
+                new_indices=new_indices_cpu[layer_idx].numpy(),
+                ep_rank=ep_rank,
             )
+
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
         )
-        cuda_stream.synchronize()
-        move_from_buffer(
-            expert_weights=expert_weights[layer_idx],
-            expert_weights_buffers=expert_buffer,
-            is_unchanged=is_unchanged,
-            is_received_locally=is_received_locally,
-            recv_metadata=recv_metadata,
-            new_indices=new_indices_cpu[layer_idx].numpy(),
-            ep_rank=ep_rank,
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
         )
 
-    verify_expert_weights_after_shuffle(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        ep_rank,
-        num_local_experts,
-    )
-    verify_redundant_experts_have_same_weights(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        world_size,
-        num_local_experts,
-    )
-
 
 def _test_rearrange_expert_weights_with_redundancy(
     env, world_size, num_layers, num_local_experts, num_logical_experts
@@ -336,71 +342,76 @@ def _test_rearrange_expert_weights_with_redundancy(
     # Initialize model parallel (using tensor parallel as an entrypoint
     # to expert parallel)
     set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )
 
-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size
 
-    # Test parameters
-    total_physical_experts = world_size * num_local_experts
-    hidden_sizes = [32, 64]  # Two different weight matrices
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
-    # Create old expert indices (with redundancy)
-    redundancy_config = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
 
-    old_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        redundancy_config,
-    )
+        # Test parameters
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [32, 64]  # Two different weight matrices
 
-    # Create new expert indices (with redundancy)
-    new_redundancy_config = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        new_redundancy_config,
-    )
+        # Create old expert indices (with redundancy)
+        redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
 
-    # Create expert weights
-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
-    )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )
 
-    # Execute weight rearrangement
-    rearrange_expert_weights_inplace(
-        old_indices,
-        new_indices,
-        expert_weights,
-        ep_group,
-        is_profile=False,
-    )
+        # Create new expert indices (with redundancy)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )
 
-    # Verify the rearrangement result
-    verify_expert_weights_after_shuffle(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        ep_rank,
-        num_local_experts,
-    )
+        # Create expert weights
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )
 
-    verify_redundant_experts_have_same_weights(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        world_size,
-        num_local_experts,
-    )
+        # Execute weight rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify the rearrangement result
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
+        )
+
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
+        )
 
 
 @pytest.mark.parametrize(
@@ -444,58 +455,63 @@ def test_rearrange_expert_weights_with_redundancy(
 
 def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
     set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )
 
-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size
 
-    num_layers = 2
-    num_local_experts = 2
-    total_physical_experts = world_size * num_local_experts
-    num_logical_experts = total_physical_experts // 2  # Some redundancy
-    hidden_sizes = [32, 64]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
-    # Create redundancy configuration
-    redundancy_config = [2] * num_logical_experts
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
 
-    # Same indices - no change
-    indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, redundancy_config
-    )
+        num_layers = 2
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2  # Some redundancy
+        hidden_sizes = [32, 64]
 
-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
-    )
+        # Create redundancy configuration
+        redundancy_config = [2] * num_logical_experts
 
-    # Save original weights
-    original_weights = []
-    for layer_weights in expert_weights:
-        layer_copy = []
-        for weight in layer_weights:
-            layer_copy.append(weight.clone())
-        original_weights.append(layer_copy)
-
-    # Execute rearrangement (should be no change)
-    rearrange_expert_weights_inplace(
-        indices,
-        indices,  # Same indices
-        expert_weights,
-        ep_group,
-        is_profile=False,
-    )
+        # Same indices - no change
+        indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, redundancy_config
+        )
 
-    # Verify that the weights have not changed
-    for layer in range(num_layers):
-        for weight_idx in range(len(hidden_sizes)):
-            torch.testing.assert_close(
-                expert_weights[layer][weight_idx],
-                original_weights[layer][weight_idx],
-                msg=f"""Layer {layer}, weight {weight_idx}
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
+        )
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute rearrangement (should be no change)
+        rearrange_expert_weights_inplace(
+            indices,
+            indices,  # Same indices
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify that the weights have not changed
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg=f"""Layer {layer}, weight {weight_idx}
  should remain unchanged""",
-            )
+                )
 
 
 @pytest.mark.parametrize(
@@ -538,64 +554,69 @@ def test_rearrange_expert_weights_no_change(world_size):
 
 def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
     set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )
 
-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size
 
-    num_layers = 1
-    num_local_experts = 2
-    total_physical_experts = world_size * num_local_experts
-    num_logical_experts = total_physical_experts // 2
-    hidden_sizes = [32]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )
 
-    # Create different index distributions
-    old_redundancy = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
-    new_redundancy = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
 
-    old_indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, old_redundancy
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, new_redundancy
-    )
+        num_layers = 1
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2
+        hidden_sizes = [32]
 
-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
-    )
+        # Create different index distributions
+        old_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
 
-    # Save original weights
-    original_weights = []
-    for layer_weights in expert_weights:
-        layer_copy = []
-        for weight in layer_weights:
-            layer_copy.append(weight.clone())
-        original_weights.append(layer_copy)
-
-    # Execute profile mode rearrangement
-    rearrange_expert_weights_inplace(
-        old_indices,
-        new_indices,
-        expert_weights,
-        ep_group,
-        is_profile=True,  # Profile mode
-    )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, old_redundancy
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, new_redundancy
+        )
 
-    # In profile mode, the weights should remain unchanged
-    for layer in range(num_layers):
-        for weight_idx in range(len(hidden_sizes)):
-            torch.testing.assert_close(
-                expert_weights[layer][weight_idx],
-                original_weights[layer][weight_idx],
-                msg="In profile mode, the weights should remain unchanged",
-            )
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute profile mode rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=True,  # Profile mode
+        )
+
+        # In profile mode, the weights should remain unchanged
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg="In profile mode, the weights should remain unchanged",
+                )
 
 
 @pytest.mark.parametrize("world_size", [2, 4])
diff --git a/tests/distributed/test_nccl_symm_mem_allreduce.py b/tests/distributed/test_nccl_symm_mem_allreduce.py
index eeb74bdf5357..b81624fe1a89 100644
--- a/tests/distributed/test_nccl_symm_mem_allreduce.py
+++ b/tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -10,6 +10,7 @@
 import torch.multiprocessing as mp
 
 import vllm.envs as envs
+from tests.utils import ensure_current_vllm_config
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
 from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
@@ -51,7 +52,8 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
         )
 
         init_distributed_environment()
-        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        with ensure_current_vllm_config():
+            initialize_model_parallel(tensor_model_parallel_size=world_size)
 
         cuda_communicator = typing.cast(
             CudaCommunicator, get_tp_group().device_communicator
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index c7c9d0602def..d207103350c4 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -9,6 +9,7 @@
 import torch
 import torch.distributed
 
+from tests.utils import ensure_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
@@ -112,7 +113,8 @@ def test_pynccl_multiple_allreduce():
 @worker_fn_wrapper
 def multiple_allreduce_with_vllm_worker_fn():
     device = torch.device(f"cuda:{torch.distributed.get_rank()}")
-    ensure_model_parallel_initialized(2, 2)
+    with ensure_current_vllm_config():
+        ensure_model_parallel_initialized(2, 2)
     tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
     with graph_capture(device=device):
         # two tp groups can communicate independently
diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py
index 98879ff6ed7f..322e717e921a 100644
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -6,7 +6,7 @@
 import pytest
 import torch
 
-from tests.utils import multi_gpu_test
+from tests.utils import ensure_current_vllm_config, multi_gpu_test
 from vllm.distributed.parallel_state import (
     init_distributed_environment,
     initialize_model_parallel,
@@ -87,7 +87,8 @@ def mixer2_gated_norm_tensor_parallel(
 
     # initialize distributed
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # create random weights an inputs
     weight = torch.rand((hidden_size,), dtype=dtype, device=device)
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index d0d8382acf9a..71180a2c76f8 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -45,21 +45,24 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 
 @pytest.fixture
 def dist_init():
+    from tests.utils import ensure_current_vllm_config
+
     temp_file = tempfile.mkstemp()[1]
 
     backend = "nccl"
     if current_platform.is_cpu() or current_platform.is_tpu():
         backend = "gloo"
 
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend=backend,
-    )
-    initialize_model_parallel(1, 1)
-    yield
+    with ensure_current_vllm_config():
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend=backend,
+        )
+        initialize_model_parallel(1, 1)
+        yield
     cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
index 382999bca351..b2db7968ec36 100644
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -6,7 +6,7 @@
 import pytest
 import torch
 
-from tests.utils import multi_gpu_test
+from tests.utils import ensure_current_vllm_config, multi_gpu_test
 from vllm import _custom_ops as ops
 from vllm.distributed import (
     init_distributed_environment,
@@ -631,7 +631,8 @@ def _get_shard_slice(shard_size):
         local_rank=local_rank,
         distributed_init_method=init_method,
     )
-    initialize_model_parallel(world_size, 1)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(world_size, 1)
     tp_size = get_tensor_model_parallel_world_size()
 
     input_dim = K if column_parallel else N
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 445aaf9cb7d1..274142e8d66e 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -13,6 +13,7 @@
     ParallelConfig,
     SchedulerConfig,
     VllmConfig,
+    set_current_vllm_config,
 )
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
@@ -77,8 +78,9 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
         distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
     )
 
-    worker.init_device()
-    worker.load_model()
+    with set_current_vllm_config(vllm_config):
+        worker.init_device()
+        worker.load_model()
 
     set_active_loras(worker, [])
     assert worker.list_loras() == set()
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index 24e49e9d61c8..17d82b1252b3 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -6,7 +6,7 @@
 import torch
 import torch.multiprocessing as mp
 
-from tests.utils import multi_gpu_test
+from tests.utils import ensure_current_vllm_config, multi_gpu_test
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import (
     init_distributed_environment,
@@ -117,7 +117,8 @@ def run_dp_sharded_vision_model_vs_direct(
 
     # initialize distributed
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # Create a test input tensor
     image_input = torch.randn(batch_size, 3, 224, 224)
@@ -302,7 +303,8 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
 
     # initialize distributed
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # Create test data
     grid_thw_list = []
@@ -377,7 +379,8 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
     )
 
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # Create empty inputs
     pixel_values = torch.empty((0, 768))
@@ -425,7 +428,8 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
     )
 
     init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
 
     # Create images with very different sizes
     grid_thw_list = [
diff --git a/tests/utils.py b/tests/utils.py
index 4041c261788c..d407733a358f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -895,6 +895,36 @@ def compare_all_settings(
                     )
 
 
+@contextmanager
+def ensure_current_vllm_config():
+    """Ensures a vllm config is set for the duration of the context.
+
+    If a config is already set, this is a no-op. Otherwise, it creates a default
+    VllmConfig and sets it for the duration of the context.
+
+    Used for tests that call functions which require a vllm config but don't
+    need a specific config.
+
+    Example:
+        with ensure_current_vllm_config():
+            init_distributed_environment(...)
+            ensure_model_parallel_initialized(...)
+    """
+    from vllm.config import (
+        VllmConfig,
+        get_current_vllm_config_or_none,
+        set_current_vllm_config,
+    )
+
+    if get_current_vllm_config_or_none() is not None:
+        # Config already set, just yield
+        yield
+    else:
+        # No config set, create a default one for the duration
+        with set_current_vllm_config(VllmConfig()):
+            yield
+
+
 def init_test_distributed_environment(
     tp_size: int,
     pp_size: int,
@@ -921,6 +951,7 @@ def init_test_distributed_environment(
             distributed_init_method=distributed_init_method,
             local_rank=local_rank,
         )
+        ensure_model_parallel_initialized(tp_size, pp_size)
     else:
         # No config set, create a default one for the test
         with set_current_vllm_config(VllmConfig()):
@@ -930,7 +961,7 @@ def init_test_distributed_environment(
                 distributed_init_method=distributed_init_method,
                 local_rank=local_rank,
             )
-    ensure_model_parallel_initialized(tp_size, pp_size)
+            ensure_model_parallel_initialized(tp_size, pp_size)
 
 
 def multi_process_parallel(
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 93e6822e6397..d1c43b64532b 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -789,8 +789,11 @@ def test_hybrid_attention_mamba_tensor_shapes():
             "MASTER_PORT": "12345",
         }
     )
-    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=1)
+    from tests.utils import ensure_current_vllm_config
+
+    with ensure_current_vllm_config():
+        init_distributed_environment()
+        initialize_model_parallel(tensor_model_parallel_size=1)
     torch.set_default_dtype(torch.float16)
 
     model_config = ModelConfig(
diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py
index 66330127b5ec..27a9b4a759d4 100644
--- a/tests/v1/worker/test_worker_memory_snapshot.py
+++ b/tests/v1/worker/test_worker_memory_snapshot.py
@@ -10,6 +10,7 @@
 import pytest
 import torch
 
+from vllm.config import set_current_vllm_config
 from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.mem_utils import MemorySnapshot
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
@@ -95,7 +96,12 @@ def worker_process(
             side_effect=make_operation_tracker("nccl_all_reduce", original_all_reduce),
         )
 
-        with init_patch, memory_patch, all_reduce_patch:
+        with (
+            init_patch,
+            memory_patch,
+            all_reduce_patch,
+            set_current_vllm_config(vllm_config),
+        ):
             # Initialize device (this is where we test the order)
             worker.init_device()
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 850ddae9ab97..5dff296d0c1e 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -319,3 +319,52 @@ def _dispatch_to_compiled_code(self) -> Generator[None, None, None]:
             yield
         finally:
             self.__class__.forward.__code__ = original
+
+
+def reset_compile_wrapper(model: torch.nn.Module) -> None:
+    """
+    Clean up compiled model and captured CUDA graphs for elastic EP.
+    """
+    if not isinstance(model, TorchCompileWithNoGuardsWrapper) and hasattr(
+        model, "model"
+    ):
+        model = model.model
+    if not isinstance(model, TorchCompileWithNoGuardsWrapper):
+        return
+    # model.do_not_compile is set by the @support_torch_compile decorator
+    if hasattr(model, "do_not_compile") and model.do_not_compile:
+        return
+    from vllm.compilation.counter import compilation_counter
+
+    # reset the compilation counter
+    compilation_counter.num_models_seen = 0
+    compilation_counter.num_graphs_seen = 0
+    compilation_counter.num_piecewise_graphs_seen = 0
+    compilation_counter.num_piecewise_capturable_graphs_seen = 0
+    compilation_counter.num_backend_compilations = 0
+    compilation_counter.num_gpu_runner_capture_triggers = 0
+    compilation_counter.num_cudagraph_captured = 0
+    compilation_counter.num_inductor_compiles = 0
+    compilation_counter.num_eager_compiles = 0
+    compilation_counter.num_cache_entries_updated = 0
+    compilation_counter.num_compiled_artifacts_saved = 0
+    compilation_counter.stock_torch_compile_count = 0
+
+    # Clear the AOT compiled function so the model is forced to
+    # recompile on the next call. Without this, decorators.py
+    # __call__ uses the stale aot_compiled_fn whose torchinductor
+    # kernels have old parameters (expert_map size for example)
+    # baked in as compile-time constants.
+    if hasattr(model, "aot_compiled_fn"):
+        model.aot_compiled_fn = None
+    if hasattr(model, "was_aot_compile_fn_loaded_from_disk"):
+        model.was_aot_compile_fn_loaded_from_disk = False
+
+    # Reset the cache_dir so VllmBackend recomputes the hash
+    # (data_parallel_size changed, so the config hash differs).
+    compilation_config = model.vllm_config.compilation_config
+    compilation_config.cache_dir = ""
+    compilation_config.local_cache_dir = ""
+
+    model.__class__.forward.__code__ = model.original_code_object()
+    TorchCompileWithNoGuardsWrapper.__init__(model)
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index fa4f72dccca4..59df4a214cc5 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -165,6 +165,9 @@ class ParallelConfig:
     disable_custom_all_reduce: bool = False
     """Disable the custom all-reduce kernel and fall back to NCCL."""
 
+    enable_elastic_ep: bool = False
+    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
+
     enable_dbo: bool = False
     """Enable dual batch overlap for the model executor."""
     ubatch_size: int = 0
@@ -244,6 +247,34 @@ class is dynamically inherited by the worker class. This is used to inject
     Set to be private as it's not intended to be configured by users.
     """
 
+    _stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless DP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    It is a list of list[int], with each inner list contains a set of 3 ports
+    to be used for setting up the stateless CPU/device/TCPStore groups
+    in StatelessGroupCoordinator. The number of inner lists is equal to
+    the number of DP groups, 
+    i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
+    and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
+    """
+
+    _stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size,
+    """
+
+    _stateless_eplb_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EPLB groups when enable_elastic_ep is True.
+    Same topology as EP but separate NCCL communicator to avoid deadlocks.
+    """
+
+    _stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless world group when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_world_group_port_list) == 1,
+    """
+
     decode_context_parallel_size: int = 1
     """Number of decode context parallel groups, because the world size does
     not change by dcp, it simply reuse the GPUs of TP group, and tp_size
@@ -402,7 +433,67 @@ def get_next_dp_init_port(self) -> int:
 
         return answer
 
-    def stateless_init_dp_group(self) -> ProcessGroup:
+    def allocate_elastic_ep_ports(self) -> None:
+        """Allocate all ports for elastic EP (stateless groups + DP master).
+
+        Must be called AFTER ray.init() so that ports claimed by Ray's
+        idle worker pool are already in use and won't be returned by
+        get_open_ports_list().
+        """
+        if not self.enable_elastic_ep:
+            return
+        if self._stateless_world_group_port_list:
+            return
+
+        num_world_groups = 1
+        dp_size = self.data_parallel_size
+        ep_size = self.data_parallel_size * self.world_size_across_dp
+        num_dp_groups = max(1, self.world_size_across_dp // dp_size)
+        num_ep_groups = max(1, self.world_size_across_dp // ep_size)
+        num_eplb_groups = num_ep_groups
+        total_stateless_ports = (
+            num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
+        ) * 3
+        num_dp_master_ports = 5
+
+        all_ports = get_open_ports_list(total_stateless_ports + num_dp_master_ports)
+
+        self._data_parallel_master_port_list = all_ports[-num_dp_master_ports:]
+        self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
+        all_ports = all_ports[:-num_dp_master_ports]
+
+        self._stateless_world_group_port_list = [
+            all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+        ]
+        start_idx = num_world_groups * 3
+        self._stateless_dp_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+        ]
+        start_idx += num_dp_groups * 3
+        self._stateless_ep_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+        ]
+        start_idx += num_ep_groups * 3
+        self._stateless_eplb_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
+        ]
+
+    def get_next_stateless_world_group_port(self) -> list[int]:
+        return self._stateless_world_group_port_list.pop()
+
+    def get_next_stateless_dp_group_port(self) -> list[int]:
+        return self._stateless_dp_group_port_list.pop()
+
+    def get_next_stateless_ep_group_port(self) -> list[int]:
+        return self._stateless_ep_group_port_list.pop()
+
+    def get_next_stateless_eplb_group_port(self) -> list[int]:
+        return self._stateless_eplb_group_port_list.pop()
+
+    def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
         # NOTE: In high-concurrency scenarios multiple processes
         # can pick the same (currently free) port through a race
         # condition when calling `get_open_port()`. When the first
@@ -426,7 +517,8 @@ def stateless_init_dp_group(self) -> ProcessGroup:
                     self.get_next_dp_init_port(),
                     self.data_parallel_rank,
                     self.data_parallel_size,
-                    backend=current_platform.dist_backend,
+                    backend="gloo",
+                    return_store=return_store,
                 )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
@@ -561,6 +653,21 @@ def __post_init__(self) -> None:
             logger.info("Using external launcher for distributed inference.")
             self.world_size *= self.data_parallel_size
 
+        if self.enable_elastic_ep:
+            if not self.enable_eplb:
+                raise ValueError("Elastic EP is only supported with enable_eplb=True.")
+            if self.pipeline_parallel_size > 1:
+                raise ValueError(
+                    "Elastic EP is not supported with pipeline parallelism "
+                    f"(pipeline_parallel_size={self.pipeline_parallel_size})."
+                )
+            if self.data_parallel_external_lb or self.data_parallel_hybrid_lb:
+                raise NotImplementedError(
+                    "Elastic EP is not compatible with data_parallel_external_lb "
+                    "or data_parallel_hybrid_lb. Elastic EP relies on a single API "
+                    "server and core client to coordinate scale up/down."
+                )
+
         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
             if self.distributed_executor_backend == "external_launcher":
@@ -573,9 +680,12 @@ def __post_init__(self) -> None:
                     "Set data_parallel_rank to %d automatically.",
                     self.data_parallel_rank,
                 )
-            if not self._data_parallel_master_port_list:
-                self._data_parallel_master_port_list = get_open_ports_list(5)
-            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
+            if not self.enable_elastic_ep:
+                if not self._data_parallel_master_port_list:
+                    self._data_parallel_master_port_list = get_open_ports_list(5)
+                self.data_parallel_master_port = (
+                    self._data_parallel_master_port_list.pop()
+                )
 
             if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                 raise ValueError(
@@ -602,7 +712,7 @@ def __post_init__(self) -> None:
             os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
             logger.info("Disabling V1 multiprocessing for external launcher.")
 
-        if self.distributed_executor_backend is None and self.world_size > 1:
+        if self.distributed_executor_backend is None and self.world_size_across_dp > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
 
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 4acab4e3c88e..3efcebd54a97 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -31,8 +31,8 @@ class NaiveAll2AllManager(All2AllManagerBase):
     debugging.
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def naive_multicast(
         self,
@@ -138,8 +138,8 @@ class AgRsAll2AllManager(All2AllManagerBase):
     all-gather (dispatch) and reduce-scatter (combine).
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def dispatch_router_logits(
         self,
@@ -239,12 +239,12 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_deep_ep(), (
             "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
             " to install DeepEP kernels."
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         self.handle_cache = Cache()
 
         # This is the DeepEP default. Stick to it till we can establish
@@ -282,7 +282,10 @@ def combine(
         raise NotImplementedError
 
     def destroy(self):
-        pass
+        with self.handle_cache._lock:
+            for _, handle in self.handle_cache._cache.items():
+                handle.destroy()
+            self.handle_cache._cache.clear()
 
 
 class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
@@ -290,8 +293,8 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def _make_all2all_kwargs(self) -> dict[Any, Any]:
         # Defaults for internode and intranode are taken from DeepEP tests.
@@ -314,6 +317,7 @@ def _make_all2all_kwargs(self) -> dict[Any, Any]:
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=False,
             num_qps_per_rank=num_qps_per_rank,
+            explicitly_destroy=True,
         )
 
     def get_handle(self, kwargs):
@@ -347,8 +351,8 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP Low-Latency kernels.
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def _make_all2all_kwargs(
         self,
@@ -387,6 +391,7 @@ def _make_all2all_kwargs(
             num_qps_per_rank=num_qps_per_rank,
             allow_nvlink_for_low_latency_mode=True,
             allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
+            explicitly_destroy=True,
         )
 
     def get_handle(self, kwargs):
@@ -418,11 +423,11 @@ class FlashInferAllToAllManager(All2AllManagerBase):
     rank: int
     world_size: int
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_flashinfer_all2all(), (
             "flashinfer all2all module not found. Please install/check flashinfer"
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         logger.debug(
             "Initialize for flashinfer All2All rank=%d, world size=%d",
             self.rank,
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 572bac80f916..2125f7381fe2 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -29,8 +29,9 @@ class All2AllManagerBase:
     rank: int
     world_size: int
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         self.cpu_group = cpu_group
+        self.tcp_store_group = tcp_store_group
 
         # compute some common properties
         from vllm.distributed.parallel_state import (
@@ -47,12 +48,17 @@ def __init__(self, cpu_group):
         # when we create this object
         self.dp_rank = self.dp_group.rank_in_group
         self.dp_world_size = self.dp_group.world_size
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
+        self.rank = cpu_group.rank()
+        self.world_size = cpu_group.size()
 
         # all2all communication often has separate implementations for
         # intra-node and inter-node communication
-        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        if tcp_store_group is None:
+            self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        else:
+            self.internode = not all(
+                in_the_same_node_as(tcp_store_group, source_rank=0)
+            )
 
     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
@@ -121,17 +127,36 @@ def __init__(
         device: torch.device | None = None,
         device_group: ProcessGroup | None = None,
         unique_name: str = "",
+        global_ranks: list[int] | None = None,
+        global_world_size: int | None = None,
     ):
         self.device = device or torch.device("cpu")
         self.cpu_group = cpu_group
         self.device_group = device_group
         self.unique_name = unique_name
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
-        self.ranks = dist.get_process_group_ranks(cpu_group)
-        self.global_rank = dist.get_rank()
-        self.global_world_size = dist.get_world_size()
-        self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
+
+        # Check if this is a stateless process group
+        from torch.distributed.distributed_c10d import _world
+
+        is_stateless = _world.pg_map.get(cpu_group, None) is None
+
+        if is_stateless:
+            # For stateless groups, we can't use torch.distributed methods
+            self.rank = cpu_group.rank()
+            self.world_size = cpu_group.size()
+            assert global_ranks is not None
+            assert global_world_size is not None
+            self.ranks = global_ranks
+            self.global_rank = self.ranks[self.rank]
+            self.global_world_size = global_world_size
+            self.rank_in_group = self.rank
+        else:
+            self.rank = dist.get_rank(cpu_group)
+            self.world_size = dist.get_world_size(cpu_group)
+            self.ranks = dist.get_process_group_ranks(cpu_group)
+            self.global_rank = dist.get_rank()
+            self.global_world_size = dist.get_world_size()
+            self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
 
         use_ep = False
         all2all_backend = None
@@ -145,7 +170,7 @@ def __init__(
             use_ep = config.parallel_config.data_parallel_size > 1
             all2all_backend = config.parallel_config.all2all_backend
 
-        self.is_ep_communicator = "ep" in unique_name
+        self.is_ep_communicator = unique_name.split(":")[0] == "ep"
         self.use_all2all = self.is_ep_communicator and use_ep
         self.all2all_backend = all2all_backend
         self.all2all_manager: All2AllManagerBase | None = None
@@ -275,6 +300,13 @@ def recv(
         torch.distributed.recv(tensor, self.ranks[src], self.device_group)
         return tensor
 
+    def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
+        """Broadcast a tensor from source rank to all ranks."""
+        if self.world_size == 1:
+            return tensor
+        torch.distributed.broadcast(tensor, self.ranks[src], self.device_group)
+        return tensor
+
     def destroy(self):
         pass
 
@@ -343,3 +375,6 @@ def combine(
         This is a no-op in the base class.
         """
         return hidden_states
+
+    def batch_isend_irecv(self, p2p_ops: list):
+        raise NotImplementedError
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index dd571482f5c8..5e18dbde91d2 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -16,6 +16,7 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
+from ..utils import StatelessProcessGroup
 from .base_device_communicator import DeviceCommunicatorBase
 
 logger = init_logger(__name__)
@@ -28,8 +29,18 @@ def __init__(
         device: torch.device | None = None,
         device_group: ProcessGroup | None = None,
         unique_name: str = "",
+        global_ranks: list[int] | None = None,
+        global_world_size: int | None = None,
+        tcp_store_group: StatelessProcessGroup | None = None,
     ):
-        super().__init__(cpu_group, device, device_group, unique_name)
+        super().__init__(
+            cpu_group,
+            device,
+            device_group,
+            unique_name,
+            global_ranks,
+            global_world_size,
+        )
         if "tp" not in unique_name:
             # custom allreduce or torch symm mem can be used only by tp
             use_custom_allreduce = False
@@ -62,7 +73,7 @@ def __init__(
         self.pynccl_comm: PyNcclCommunicator | None = None
         if self.world_size > 1:
             self.pynccl_comm = PyNcclCommunicator(
-                group=self.cpu_group,
+                group=self.cpu_group if tcp_store_group is None else tcp_store_group,
                 device=self.device,
             )
             if is_symmetric_memory_enabled():
@@ -107,19 +118,27 @@ def __init__(
             if self.all2all_backend == "naive":
                 from .all2all import NaiveAll2AllManager
 
-                self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
+                self.all2all_manager = NaiveAll2AllManager(
+                    self.cpu_group, tcp_store_group
+                )
             elif self.all2all_backend == "allgather_reducescatter":
                 from .all2all import AgRsAll2AllManager
 
-                self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
+                self.all2all_manager = AgRsAll2AllManager(
+                    self.cpu_group, tcp_store_group
+                )
             elif self.all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager
 
-                self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
+                self.all2all_manager = DeepEPHTAll2AllManager(
+                    self.cpu_group, tcp_store_group
+                )
             elif self.all2all_backend == "deepep_low_latency":
                 from .all2all import DeepEPLLAll2AllManager
 
-                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
+                self.all2all_manager = DeepEPLLAll2AllManager(
+                    self.cpu_group, tcp_store_group
+                )
             elif self.all2all_backend == "mori":
                 from .all2all import MoriAll2AllManager
 
@@ -127,7 +146,9 @@ def __init__(
             elif self.all2all_backend == "flashinfer_all2allv":
                 from .all2all import FlashInferAllToAllManager
 
-                self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
+                self.all2all_manager = FlashInferAllToAllManager(
+                    self.cpu_group, tcp_store_group
+                )
             else:
                 raise ValueError(f"Unknown all2all backend: {self.all2all_backend}")
 
@@ -284,6 +305,18 @@ def recv(
             torch.distributed.recv(tensor, self.ranks[src], self.device_group)
         return tensor
 
+    def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
+        """Broadcast a tensor from source rank to all ranks."""
+        if self.world_size == 1:
+            return tensor
+
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.broadcast(tensor, src)
+            return tensor
+        else:
+            raise ValueError("No PyNCCL communicator found")
+
     def destroy(self):
         if self.pynccl_comm is not None:
             self.pynccl_comm = None
@@ -403,3 +436,10 @@ def combine(
             hidden_states,
             is_sequence_parallel,
         )
+
+    def batch_isend_irecv(self, p2p_ops: list):
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.batch_isend_irecv(p2p_ops)
+        else:
+            raise ValueError("No PyNCCL communicator found")
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 2fc35e80f591..44dc113e4f55 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -312,10 +312,19 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None):
         )
         if stream is None:
             stream = current_stream()
+        if tensor.dtype in [
+            torch.float8_e5m2,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2fnuz,
+        ]:
+            nccl_dtype = ncclDataTypeEnum.from_torch(torch.uint8)
+        else:
+            nccl_dtype = ncclDataTypeEnum.from_torch(tensor.dtype)
         self.nccl.ncclSend(
             buffer_type(tensor.data_ptr()),
             tensor.numel(),
-            ncclDataTypeEnum.from_torch(tensor.dtype),
+            nccl_dtype,
             dst,
             self.comm,
             cudaStream_t(stream.cuda_stream),
@@ -330,10 +339,19 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None):
         )
         if stream is None:
             stream = current_stream()
+        if tensor.dtype in [
+            torch.float8_e5m2,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2fnuz,
+        ]:
+            nccl_dtype = ncclDataTypeEnum.from_torch(torch.uint8)
+        else:
+            nccl_dtype = ncclDataTypeEnum.from_torch(tensor.dtype)
         self.nccl.ncclRecv(
             buffer_type(tensor.data_ptr()),
             tensor.numel(),
-            ncclDataTypeEnum.from_torch(tensor.dtype),
+            nccl_dtype,
             src,
             self.comm,
             cudaStream_t(stream.cuda_stream),
@@ -384,3 +402,17 @@ def register_comm_window_raw(self, ptr: int, size: int):
 
     def deregister_comm_window(self, window):
         return self.nccl.ncclCommWindowDeregister(self.comm, window)
+
+    def batch_isend_irecv(self, p2p_ops: list, stream=None):
+        if self.disabled:
+            return
+        if stream is None:
+            stream = current_stream()
+        self.group_start()
+        for op in p2p_ops:
+            if op.op is torch.distributed.isend:
+                self.send(op.tensor, op.group_peer, stream)
+            elif op.op is torch.distributed.irecv:
+                self.recv(op.tensor, op.group_peer, stream)
+
+        self.group_end()
diff --git a/vllm/distributed/elastic_ep/__init__.py b/vllm/distributed/elastic_ep/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py
new file mode 100644
index 000000000000..22d570660043
--- /dev/null
+++ b/vllm/distributed/elastic_ep/elastic_execute.py
@@ -0,0 +1,529 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+import gc
+import weakref
+from collections.abc import Iterable, Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributed import P2POp
+
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.cuda_graph import CUDAGraphWrapper
+from vllm.compilation.wrapper import reset_compile_wrapper
+from vllm.config import (
+    CompilationMode,
+    set_current_vllm_config,
+)
+from vllm.distributed import (
+    get_dp_group,
+    get_ep_group,
+    get_pcp_group,
+    get_tp_group,
+)
+from vllm.distributed.elastic_ep.standby_state import (
+    create_standby_groups,
+    get_standby_dp_group,
+    get_standby_ep_group,
+    pop_standby_groups,
+)
+from vllm.distributed.parallel_state import (
+    _replace_active_groups,
+    prepare_communication_buffer_for_model,
+)
+from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.layer import FusedMoEParallelConfig
+from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
+from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper
+from vllm.v1.worker.workspace import lock_workspace, unlock_workspace
+
+logger = init_logger(__name__)
+
+
+def batch_transfer_weights(
+    model: nn.Module,
+    is_sender: bool,
+    peer_rank: int,
+    dp_group: StatelessGroupCoordinator,
+    expert_weights: Sequence[Iterable[torch.Tensor]],
+) -> None:
+    device_comm = dp_group.device_communicator
+    if device_comm is None:
+        raise ValueError("No device communicator found")
+
+    expert_weights_set = set()
+    for weight_group in expert_weights:
+        for weight in weight_group:
+            expert_weights_set.add(weight.data_ptr())
+
+    state_dict = model.state_dict()
+    all_params = []
+
+    for name, param in state_dict.items():
+        if name.endswith("expert_map"):
+            continue
+        if param.data_ptr() not in expert_weights_set:
+            all_params.append(param.data)
+
+    assert len(all_params) > 0
+    p2p_ops = []
+    for param in all_params:
+        op = object.__new__(P2POp)
+        if is_sender:
+            op.op = torch.distributed.isend
+            op.tensor = param
+        else:
+            op.op = torch.distributed.irecv
+            op.tensor = param
+        op.group_peer = peer_rank
+        p2p_ops.append(op)
+    device_comm.batch_isend_irecv(p2p_ops)
+
+
+def broadcast_expert_mapping(
+    physical_to_logical: torch.Tensor | None,
+    num_local_physical_experts: int | None,
+    num_logical_experts: int | None,
+    dp_group: StatelessGroupCoordinator,
+    device: torch.device,
+    src_rank: int = 0,
+) -> tuple[torch.Tensor, int, int]:
+    if dp_group.rank_in_group == src_rank:
+        assert physical_to_logical is not None
+        assert num_local_physical_experts is not None
+        assert num_logical_experts is not None
+        assert physical_to_logical.dtype == torch.int64
+        shape_tensor = torch.tensor(
+            list(physical_to_logical.shape), dtype=torch.int64, device="cpu"
+        )
+        metadata_tensor = torch.tensor(
+            [num_local_physical_experts, num_logical_experts],
+            dtype=torch.int64,
+            device="cpu",
+        )
+    else:
+        shape_tensor = torch.empty(2, dtype=torch.int64, device="cpu")
+        metadata_tensor = torch.empty(2, dtype=torch.int64, device="cpu")
+
+    shape_tensor = dp_group.tcp_store_group.broadcast(shape_tensor, src_rank)
+    metadata_tensor = dp_group.tcp_store_group.broadcast(metadata_tensor, src_rank)
+
+    if dp_group.rank_in_group != src_rank:
+        assert device is not None
+        physical_to_logical = torch.empty(
+            tuple(shape_tensor.tolist()),
+            dtype=torch.int64,
+            device=device,
+        )
+
+    assert physical_to_logical is not None
+    physical_to_logical = dp_group.broadcast(physical_to_logical, src_rank)
+    num_local_physical_experts = int(metadata_tensor[0].item())
+    num_logical_experts = int(metadata_tensor[1].item())
+
+    return physical_to_logical, num_local_physical_experts, num_logical_experts
+
+
+class ElasticEPScalingExecutor:
+    def __init__(self, worker):
+        self.worker_ref = weakref.ref(worker)
+        self.reconfig_request = None
+
+    @property
+    def worker(self):
+        worker = self.worker_ref()
+        if worker is None:
+            raise RuntimeError("Worker has been garbage collected")
+        return worker
+
+    def execute(self, execute_method: str, *args, **kwargs):
+        method = getattr(self, execute_method, None)
+        if method is None:
+            raise ValueError(f"Unknown execute method: {execute_method}")
+        return method(*args, **kwargs)
+
+    def create_standby_groups(
+        self, reconfig_request: ReconfigureDistributedRequest
+    ) -> None:
+        self.reconfig_request = reconfig_request
+        new_dp_size = reconfig_request.new_data_parallel_size
+        world_size = self.worker.vllm_config.parallel_config.world_size
+        new_world_size_across_dp = world_size * new_dp_size
+        updated_config = copy.copy(self.worker.vllm_config)
+        updated_config.parallel_config = copy.deepcopy(
+            self.worker.vllm_config.parallel_config
+        )
+        updated_config.parallel_config.data_parallel_size = new_dp_size
+        with set_current_vllm_config(updated_config):
+            create_standby_groups(
+                new_dp_size=new_dp_size,
+                new_world_size_across_dp=new_world_size_across_dp,
+                master_ip=reconfig_request.new_data_parallel_master_ip,
+                world_group_ports=reconfig_request.new_stateless_world_group_port_list,
+                dp_group_ports=reconfig_request.new_stateless_dp_group_port_list,
+                ep_group_ports=reconfig_request.new_stateless_ep_group_port_list,
+                eplb_group_ports=reconfig_request.new_stateless_eplb_group_port_list,
+            )
+        self.worker.model_runner.eep_eplb_suppressed = True
+        standby_ep_group = get_standby_ep_group()
+        assert standby_ep_group is not None
+        if standby_ep_group.rank == 0:
+            logger.info("[Elastic EP] EPLB disabled during elastic scaling transition")
+
+    def transfer_weights(self, old_dp_size: int, new_dp_size: int) -> None:
+        standby_dp_group = get_standby_dp_group()
+        assert standby_dp_group is not None
+        # Broadcast old_dp_size to all workers in standby group
+        if standby_dp_group.rank_in_group < old_dp_size:
+            old_dp_size_tensor = torch.tensor(
+                [old_dp_size], dtype=torch.int64, device="cpu"
+            )
+        else:
+            old_dp_size_tensor = torch.empty(1, dtype=torch.int64, device="cpu")
+        old_dp_size_tensor = standby_dp_group.tcp_store_group.broadcast(
+            old_dp_size_tensor, 0
+        )
+
+        num_new_workers = new_dp_size - old_dp_size
+        dp_rank = self.worker.vllm_config.parallel_config.data_parallel_rank
+
+        # Sender-receiver pairing: the first new_workers % old_dp_size
+        # senders get (k+1) contiguous receivers, the rest get k
+        # receivers.
+        num_dst_per_sender = num_new_workers // old_dp_size
+        remainder = num_new_workers % old_dp_size
+
+        if dp_rank < remainder:
+            recv_begin = dp_rank * (num_dst_per_sender + 1)
+            recv_end = recv_begin + num_dst_per_sender + 1
+        else:
+            recv_begin = (
+                remainder * (num_dst_per_sender + 1)
+                + (dp_rank - remainder) * num_dst_per_sender
+            )
+            recv_end = recv_begin + num_dst_per_sender
+
+        ranks_to_send = list(range(old_dp_size + recv_begin, old_dp_size + recv_end))
+
+        model = self.worker.model_runner.get_model()
+        for new_worker_rank in sorted(ranks_to_send):
+            batch_transfer_weights(
+                model=model,
+                is_sender=True,
+                peer_rank=new_worker_rank,
+                dp_group=standby_dp_group,
+                expert_weights=model.expert_weights,
+            )
+        torch.cuda.synchronize()
+
+    def broadcast_expert_mapping(self) -> None:
+        standby_dp_group = get_standby_dp_group()
+        assert standby_dp_group is not None
+        model_config = self.worker.model_runner.model_config
+        eplb_state = self.worker.model_runner.eplb_state
+        assert eplb_state is not None
+        eplb_model_state = eplb_state.model_states[model_config.compute_hash()]
+        physical_to_logical = eplb_model_state.physical_to_logical_map
+        num_physical_experts = physical_to_logical.shape[1]
+        num_local_physical_experts = num_physical_experts // get_ep_group().world_size
+        num_logical_experts = eplb_model_state.logical_replica_count.shape[1]
+        broadcast_expert_mapping(
+            physical_to_logical=physical_to_logical,
+            num_local_physical_experts=num_local_physical_experts,
+            num_logical_experts=num_logical_experts,
+            dp_group=standby_dp_group,
+            src_rank=0,
+            device=self.worker.device,
+        )
+
+    def switch_and_remove(self) -> None:
+        _replace_active_groups(world=None, dp=None, ep=None, eplb=None, node_count=None)
+
+    def switch_and_prepare(self) -> None:
+        old_dp_size = get_dp_group().world_size
+        old_ep_size = get_ep_group().world_size
+
+        _replace_active_groups(**pop_standby_groups())
+
+        parallel_config = self.worker.vllm_config.parallel_config
+        reconfig_request = self.reconfig_request
+        assert reconfig_request is not None
+        new_dp_size = reconfig_request.new_data_parallel_size
+        new_ep_size = get_ep_group().world_size
+
+        parallel_config.data_parallel_size = new_dp_size
+        if (
+            reconfig_request.new_data_parallel_rank
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
+        if (
+            reconfig_request.new_data_parallel_rank_local
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            parallel_config.data_parallel_rank_local = (
+                reconfig_request.new_data_parallel_rank_local
+            )
+        parallel_config.data_parallel_master_ip = (
+            reconfig_request.new_data_parallel_master_ip
+        )
+        parallel_config.data_parallel_master_port = (
+            reconfig_request.new_data_parallel_master_port
+        )
+
+        # Reconfigure MoE modules with new EP size
+        moe_modules = [
+            module
+            for module in self.worker.model_runner.model.modules()
+            if (
+                module.__class__.__name__ == "FusedMoE"
+                or module.__class__.__name__ == "SharedFusedMoE"
+            )
+        ]
+        num_local_experts = moe_modules[0].moe_config.num_local_experts
+        assert all(
+            module.moe_config.num_local_experts == num_local_experts
+            for module in moe_modules
+        ), "All MoE modules must have the same number of experts"
+        for module in moe_modules:
+            module.moe_config.num_experts = num_local_experts * new_ep_size
+            module.global_num_experts = module.moe_config.num_experts
+            tp_size = get_tp_group().world_size
+            is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+            sp_size = tp_size if is_sequence_parallel else 1
+            module.moe_parallel_config = FusedMoEParallelConfig.make(
+                tp_size_=tp_size,
+                pcp_size_=get_pcp_group().world_size,
+                dp_size_=get_dp_group().world_size,
+                sp_size_=sp_size,
+                vllm_parallel_config=parallel_config,
+            )
+            module.moe_config.moe_parallel_config = module.moe_parallel_config
+
+        # Update EPLB state
+        eplb_state = self.worker.model_runner.eplb_state
+        assert eplb_state is not None
+        model_config = self.worker.model_runner.model_config
+        eplb_model_state = eplb_state.model_states[model_config.compute_hash()]
+
+        num_physical_experts = num_local_experts * new_ep_size
+        num_logical_experts = eplb_model_state.logical_replica_count.shape[1]
+        parallel_config.eplb_config.num_redundant_experts = (
+            num_physical_experts - num_logical_experts
+        )
+        old_physical_to_logical = eplb_model_state.physical_to_logical_map
+        num_moe_layers = old_physical_to_logical.shape[0]
+        num_local_experts = eplb_model_state.expert_load_pass.shape[1] // old_ep_size
+        if new_dp_size > old_dp_size:
+            expanded_physical_to_logical = torch.full(
+                (num_moe_layers, num_local_experts * new_ep_size),
+                -1,
+                dtype=old_physical_to_logical.dtype,
+                device=old_physical_to_logical.device,
+            )
+            expanded_physical_to_logical[:, : num_local_experts * old_ep_size] = (
+                old_physical_to_logical
+            )
+            eplb_model_state.physical_to_logical_map = expanded_physical_to_logical
+
+        old_num_physical_experts = eplb_model_state.expert_load_pass.shape[1]
+        pad_size = num_physical_experts - old_num_physical_experts
+        if new_dp_size > old_dp_size:
+            assert pad_size > 0
+            expanded_expert_load_pass = F.pad(
+                eplb_model_state.expert_load_pass, (0, pad_size), value=0
+            )
+            expanded_expert_load_window = F.pad(
+                eplb_model_state.expert_load_window, (0, pad_size), value=0
+            )
+            eplb_model_state.expert_load_pass = expanded_expert_load_pass
+            eplb_model_state.expert_load_window = expanded_expert_load_window
+            eplb_state.num_valid_physical_experts = old_num_physical_experts
+        else:
+            assert pad_size < 0
+            eplb_model_state.expert_load_pass = eplb_model_state.expert_load_pass[
+                :, :num_physical_experts
+            ]
+            eplb_model_state.expert_load_window = eplb_model_state.expert_load_window[
+                :, :, :num_physical_experts
+            ]
+            eplb_state.num_valid_physical_experts = num_physical_experts
+
+        model = self.worker.model_runner.get_model()
+        model.expert_weights = []
+        with set_current_vllm_config(self.worker.vllm_config):
+            model.set_eplb_state(
+                eplb_model_state.expert_load_pass,
+                eplb_model_state.logical_to_physical_map,
+                eplb_model_state.logical_replica_count,
+            )
+            model.update_physical_experts_metadata(
+                num_physical_experts=num_physical_experts,
+                num_local_physical_experts=num_local_experts,
+            )
+            # Force re-creation of the modular kernel (and all2all manager)
+            # for the new EP size by resetting quant_method to base
+            for module in moe_modules:
+                if hasattr(module.quant_method, "old_quant_method"):
+                    module.quant_method = module.quant_method.old_quant_method
+                    module.runner = module._init_runner()
+            prepare_communication_buffer_for_model(self.worker.model_runner.model)
+        if (
+            self.worker.vllm_config.compilation_config.mode
+            == CompilationMode.STOCK_TORCH_COMPILE
+        ):
+            # NOTE(yongji): when using stock torch.compile,
+            # torch.compile is triggered during GPUModelRunner's load_model()
+            # TODO(yongji):check do we need to re-trigger torch.compile here?
+            # any changes to the tensor shapes in execution should already
+            # be handled internally by torch.compile.
+            backend = self.worker.vllm_config.compilation_config.init_backend(
+                self.worker.vllm_config
+            )
+            compilation_counter.stock_torch_compile_count += 1
+            self.worker.model_runner.model.compile(fullgraph=True, backend=backend)
+
+        # release all previously captured CUDA graphs
+        if isinstance(self.worker.model_runner.model, CUDAGraphWrapper):
+            wrapper = self.worker.model_runner.model
+            wrapper.concrete_cudagraph_entries = {}
+        elif isinstance(self.worker.model_runner.model, UBatchWrapper):
+            raise RuntimeError("DBO is not yet supported in elastic EP")
+
+        multi_block_table = self.worker.model_runner.input_batch.block_table
+        saved_block_tables: list[tuple[torch.Tensor, torch.Tensor]] = []
+        for bt in multi_block_table.block_tables:
+            saved_block_tables.append(
+                (bt.block_table.gpu.clone(), bt.block_table.cpu.clone())
+            )
+        multi_block_table.clear()
+
+        # reset the compile wrapper
+        torch.compiler.reset()
+        with set_current_vllm_config(self.worker.vllm_config):
+            reset_compile_wrapper(self.worker.model_runner.get_model())
+
+        gc.collect()
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        unlock_workspace()
+        self.worker.compile_or_warm_up_model()
+        lock_workspace()
+
+        for bt, (saved_gpu, saved_cpu) in zip(
+            multi_block_table.block_tables, saved_block_tables
+        ):
+            bt.block_table.gpu.copy_(saved_gpu)
+            bt.block_table.cpu.copy_(saved_cpu)
+
+    def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None:
+        if get_ep_group().rank == 0:
+            logger.info("[Elastic EP] Starting expert resharding...")
+
+        eplb_state = self.worker.model_runner.eplb_state
+        assert eplb_state is not None
+
+        model_config = self.worker.model_runner.model_config
+        eplb_model_state = eplb_state.model_states[model_config.compute_hash()]
+        is_async_enabled = eplb_state.is_async
+        eplb_state.is_async = False
+        if new_dp_size is None:
+            eplb_state.rearrange()
+        else:
+            # scale down
+            parallel_config = self.worker.vllm_config.parallel_config
+            tp_size = parallel_config.tensor_parallel_size
+            old_ep_size = parallel_config.data_parallel_size * tp_size
+            new_ep_size = new_dp_size * tp_size
+
+            rank_mapping = {
+                old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1
+                for old_ep_rank in range(old_ep_size)
+            }
+
+            eplb_state.rearrange(rank_mapping=rank_mapping)
+        # NOTE(yongji): check whether we need to synchronize here
+        torch.cuda.synchronize()
+        # reset expert_rearrangement_step to ensure all ranks are synchronized
+        eplb_state.expert_rearrangement_step = 0
+        eplb_state.num_valid_physical_experts = (
+            eplb_model_state.physical_to_logical_map.shape[1]
+        )
+        eplb_state.is_async = is_async_enabled
+        self.worker.model_runner.eep_eplb_suppressed = False
+        if get_ep_group().rank == 0:
+            logger.info("[Elastic EP] Expert resharding completed")
+
+    def receive_weights(self) -> None:
+        dp_group = get_dp_group()
+        assert isinstance(dp_group, StatelessGroupCoordinator)
+        new_dp_size = dp_group.world_size
+        dp_rank = self.worker.vllm_config.parallel_config.data_parallel_rank
+
+        # Receive old_dp_size broadcasted during transfer_weights
+        old_dp_size_tensor = torch.empty(1, dtype=torch.int64, device="cpu")
+        old_dp_size_tensor = dp_group.tcp_store_group.broadcast(old_dp_size_tensor, 0)
+        old_dp_size = int(old_dp_size_tensor[0].item())
+
+        # Calculate which existing worker will send to this new worker
+        num_new_workers = new_dp_size - old_dp_size
+        new_worker_idx = dp_rank - old_dp_size
+        num_dst_per_sender = num_new_workers // old_dp_size
+        remainder = num_new_workers % old_dp_size
+
+        if new_worker_idx < remainder * (num_dst_per_sender + 1):
+            sender_rank = new_worker_idx // (num_dst_per_sender + 1)
+        else:
+            sender_rank = (
+                remainder
+                + (new_worker_idx - remainder * (num_dst_per_sender + 1))
+                // num_dst_per_sender
+            )
+
+        model = self.worker.model_runner.get_model()
+        batch_transfer_weights(
+            model=model,
+            is_sender=False,
+            peer_rank=sender_rank,
+            dp_group=dp_group,
+            expert_weights=model.expert_weights,
+        )
+        torch.cuda.synchronize()
+
+    def receive_expert_mapping(self) -> tuple[torch.Tensor, int, int]:
+        dp_group = get_dp_group()
+        assert isinstance(dp_group, StatelessGroupCoordinator)
+        physical_to_logical, num_local_physical_experts, num_logical_experts = (
+            broadcast_expert_mapping(
+                physical_to_logical=None,
+                num_local_physical_experts=None,
+                num_logical_experts=None,
+                dp_group=dp_group,
+                src_rank=0,
+                device=self.worker.device,
+            )
+        )
+        num_moe_layers = physical_to_logical.shape[0]
+        new_dp_size = get_dp_group().world_size
+        tp_size = self.worker.vllm_config.parallel_config.tensor_parallel_size
+        new_ep_size = new_dp_size * tp_size
+        expanded_physical_to_logical = torch.full(
+            (num_moe_layers, num_local_physical_experts * new_ep_size),
+            -1,
+            dtype=physical_to_logical.dtype,
+            device=physical_to_logical.device,
+        )
+        old_num_physical_experts = physical_to_logical.shape[1]
+        expanded_physical_to_logical[:, :old_num_physical_experts] = physical_to_logical
+        return (
+            expanded_physical_to_logical,
+            num_logical_experts,
+            old_num_physical_experts,
+        )
+
+    def prepare_new_worker(self) -> None:
+        with set_current_vllm_config(self.worker.vllm_config):
+            prepare_communication_buffer_for_model(self.worker.model_runner.get_model())
diff --git a/vllm/distributed/elastic_ep/elastic_state.py b/vllm/distributed/elastic_ep/elastic_state.py
new file mode 100644
index 000000000000..4845a16f18fe
--- /dev/null
+++ b/vllm/distributed/elastic_ep/elastic_state.py
@@ -0,0 +1,563 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import enum
+import time
+import weakref
+from datetime import timedelta
+from typing import TYPE_CHECKING, Literal
+
+import torch.distributed
+
+from vllm.config import ParallelConfig
+from vllm.distributed import (
+    sched_yield,
+    stateless_destroy_torch_distributed_process_group,
+)
+from vllm.logger import init_logger
+from vllm.v1.engine import (
+    EEPNotificationType,
+    ReconfigureDistributedRequest,
+    ReconfigureRankType,
+)
+from vllm.v1.engine.core import DPEngineCoreProc
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.v1.executor.abstract import Executor
+
+logger = init_logger(__name__)
+
+WorkerType = Literal["existing", "new", "removing"]
+
+
+class ScaleUpExistingEngineState(enum.IntEnum):
+    WAIT_NEW_CORE_ENGINES_INIT = 0
+    CREATE_STANDBY_GROUPS = 1
+    TRANSFER_EXPERT_MAPPING = 2
+    WAIT_NEW_CORE_ENGINES_WEIGHTS_INIT = 3
+    TRANSFER_WEIGHTS = 4
+    SYNC_KV_CACHE_MEMORY_SIZE = 5
+    SWITCH_AND_PREPARE = 6
+    EPLB_RESHUFFLE = 7
+    COMPLETE = 8
+
+
+class ScaleUpNewEngineState(enum.IntEnum):
+    PREPARE = 0
+    EPLB_RESHUFFLE = 1
+    COMPLETE = 2
+
+
+class ScaleDownRemainingEngineState(enum.IntEnum):
+    PREPARE = 0
+    EPLB_RESHUFFLE = 1
+    SWITCH_AND_PREPARE = 2
+    COMPLETE = 3
+
+
+class ScaleDownRemovingEngineState(enum.IntEnum):
+    PREPARE = 0
+    EPLB_RESHUFFLE = 1
+    COMPLETE = 2
+
+
+class _BarrierTimeoutError(RuntimeError):
+    """
+    Exception raised for timeout
+    in the first stage of our two-staged
+    TCPStore based barrier to synchronize the
+    execution of all engines in the DP group.
+    """
+
+
+class ElasticEPScalingState:
+    def __init__(
+        self,
+        model_executor: "Executor",
+        engine_core: "DPEngineCoreProc",
+        vllm_config: "VllmConfig",
+        new_parallel_config: ParallelConfig,
+        worker_type: WorkerType,
+        scale_type: Literal["scale_up", "scale_down"],
+        reconfig_request: ReconfigureDistributedRequest | None = None,
+    ):
+        self.model_executor_ref = weakref.ref(model_executor)
+        self.engine_core_ref = weakref.ref(engine_core)
+        self.vllm_config = vllm_config
+        self.old_dp_group = self.engine_core.dp_group if worker_type != "new" else None
+        self.old_dp_store = self.engine_core.dp_store if worker_type != "new" else None
+        self.new_parallel_config: ParallelConfig = new_parallel_config
+        self.new_dp_group: torch.distributed.ProcessGroup | None = (
+            self.engine_core.dp_group if worker_type == "new" else None
+        )
+        self.new_dp_store = self.engine_core.dp_store if worker_type == "new" else None
+        self.worker_type = worker_type
+        self.scale_type = scale_type
+        self.reconfig_request = reconfig_request
+
+        if scale_type == "scale_up":
+            self.state = (
+                ScaleUpNewEngineState.PREPARE
+                if worker_type == "new"
+                else ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_INIT
+            )
+        else:
+            self.state = (
+                ScaleDownRemovingEngineState.PREPARE
+                if worker_type == "removing"
+                else ScaleDownRemainingEngineState.PREPARE
+            )
+
+    @property
+    def model_executor(self) -> "Executor":
+        model_executor = self.model_executor_ref()
+        if model_executor is None:
+            raise RuntimeError("Model executor has been garbage collected")
+        return model_executor
+
+    @property
+    def engine_core(self) -> "DPEngineCoreProc":
+        engine_core = self.engine_core_ref()
+        if engine_core is None:
+            raise RuntimeError("Engine core has been garbage collected")
+        return engine_core
+
+    def progress(self) -> bool:
+        if self.scale_type == "scale_up":
+            return (
+                self._progress_new_engine()
+                if self.worker_type == "new"
+                else self._progress_existing_engine()
+            )
+        return (
+            self._progress_removing_engine()
+            if self.worker_type == "removing"
+            else self._progress_remaining_engine()
+        )
+
+    def _execute_tcp_store_barrier(
+        self, dp_store, group_rank, group_size, barrier_id, timeout=None
+    ):
+        arrival_key = f"arrival_{barrier_id}_{group_rank}"
+        dp_store.set(arrival_key, b"1")
+
+        start_time = time.time()
+        processes_arrived: set[int] = set()
+
+        while len(processes_arrived) < group_size:
+            if (
+                timeout is not None
+                and time.time() - start_time > timeout.total_seconds()
+            ):
+                raise _BarrierTimeoutError(
+                    f"Barrier timed out after {timeout.total_seconds()} seconds"
+                )
+
+            for i in range(group_size):
+                if i in processes_arrived:
+                    continue
+
+                key = f"arrival_{barrier_id}_{i}"
+                present = dp_store.check([key])
+                if present:
+                    processes_arrived.add(i)
+
+            if len(processes_arrived) < group_size:
+                sched_yield()
+
+    def _staged_barrier(self, use_new_group: bool, barrier_name: str) -> bool:
+        """
+        Execute a two-staged barrier to synchronize all engines in the DP group.
+
+        Some DP EngineCores may receive the reconfiguration notifications
+        later than others, and already proceed to engine step (model forward)
+        in the busy loop.
+        In this case, EngineCores that already proceed to reconfiguration
+        should skip reconfiguration and execute model forward for one more
+        step, so in the next step, all EngineCores will be synchronized.
+        We use a two-staged barrier to achieve this. The first time each
+        EngineCore executes the barrier, if a timeout is reached before the
+        barrier completes, that means some EngineCores have already entered
+        engine step. The EngineCores that timed out will then proceed to
+        engine step, and will synchronize with the other EngineCores in the
+        next step with a barrier without timeout.
+        """
+        dp_store = self.new_dp_store if use_new_group else self.old_dp_store
+        dp_group = self.new_dp_group if use_new_group else self.old_dp_group
+        assert dp_group is not None
+
+        group_rank = dp_group.rank()
+        group_size = dp_group.size()
+        barrier_id = f"eep_barrier_{barrier_name}"
+        sync_key = f"{barrier_id}_sync"
+
+        # TODO(yongji): figure out appropriate timeout for the barrier
+        timeout = None if dp_store.check([sync_key]) else timedelta(seconds=5)
+
+        try:
+            self._execute_tcp_store_barrier(
+                dp_store, group_rank, group_size, barrier_id, timeout=timeout
+            )
+            torch.distributed.barrier(dp_group)
+            if group_rank == 0:
+                dp_store.delete_key(sync_key)
+                for i in range(group_size):
+                    dp_store.delete_key(f"arrival_{barrier_id}_{i}")
+            return True
+        except _BarrierTimeoutError as e:
+            if timeout is None:
+                raise RuntimeError("Unexpected timeout encountered") from e
+            dp_store.compare_set(sync_key, "", b"1")
+            return False
+
+    def _progress_existing_engine(self) -> bool:
+        state = self.state
+
+        if state == ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_INIT:
+            return False
+
+        elif state == ScaleUpExistingEngineState.CREATE_STANDBY_GROUPS:
+            # NOTE(yongji): wait for all existing workers to receive the request
+            if (
+                int(self.old_dp_store.get("eep_barrier_engine_count"))
+                < self.old_dp_group.size()
+            ):
+                return False
+            if not self._staged_barrier(
+                use_new_group=False, barrier_name="create_standby_groups"
+            ):
+                return False
+            if self.old_dp_group.rank() == 0:
+                self.old_dp_store.delete_key("eep_barrier_engine_count")
+            self._create_standby_groups()
+            self.state = ScaleUpExistingEngineState.TRANSFER_EXPERT_MAPPING
+            return True
+
+        elif state == ScaleUpExistingEngineState.TRANSFER_EXPERT_MAPPING:
+            self._transfer_expert_mapping()
+            self.state = ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_WEIGHTS_INIT
+            return True
+
+        elif state == ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_WEIGHTS_INIT:
+            return False
+
+        elif state == ScaleUpExistingEngineState.TRANSFER_WEIGHTS:
+            if (
+                int(self.old_dp_store.get("eep_barrier_engine_count"))
+                < self.old_dp_group.size()
+            ):
+                return False
+            if not self._staged_barrier(
+                use_new_group=False, barrier_name="transfer_weights"
+            ):
+                return False
+            if self.old_dp_group.rank() == 0:
+                self.old_dp_store.delete_key("eep_barrier_engine_count")
+            self._transfer_weights()
+            self.state = ScaleUpExistingEngineState.SYNC_KV_CACHE_MEMORY_SIZE
+            return True
+
+        elif state == ScaleUpExistingEngineState.SYNC_KV_CACHE_MEMORY_SIZE:
+            self._sync_kv_cache_memory_size()
+            self.state = ScaleUpExistingEngineState.SWITCH_AND_PREPARE
+            return True
+
+        elif state == ScaleUpExistingEngineState.SWITCH_AND_PREPARE:
+            self._switch_and_prepare()
+            self.state = ScaleUpExistingEngineState.EPLB_RESHUFFLE
+            self.new_dp_store.add("eep_barrier_engine_count", 1)
+            return True
+
+        elif state == ScaleUpExistingEngineState.EPLB_RESHUFFLE:
+            assert self.new_dp_group is not None
+            if (
+                int(self.new_dp_store.get("eep_barrier_engine_count"))
+                < self.new_dp_group.size()
+            ):
+                return False
+            if not self._staged_barrier(
+                use_new_group=True, barrier_name="eplb_reshuffle"
+            ):
+                return False
+            if self.new_dp_group.rank() == 0:
+                self.new_dp_store.delete_key("eep_barrier_engine_count")
+            self._eplb_reshuffle()
+            self.state = ScaleUpExistingEngineState.COMPLETE
+            self._update_parallel_config()
+            return True
+
+        else:
+            assert self.state == ScaleUpExistingEngineState.COMPLETE
+            return True
+
+    def _progress_new_engine(self) -> bool:
+        state = self.state
+        assert self.new_dp_group is not None
+
+        if state == ScaleUpNewEngineState.PREPARE:
+            tensor = torch.tensor([0, 0, 0], dtype=torch.int32, device="cpu")
+            torch.distributed.all_reduce(
+                tensor,
+                op=torch.distributed.ReduceOp.MAX,
+                group=self.new_dp_group,
+            )
+            data = tensor.tolist()
+            self.engine_core.engines_running = bool(data[0])
+            self.engine_core.current_wave = int(data[1])
+            self.engine_core.step_counter = int(data[2])
+            self.state = ScaleUpNewEngineState.EPLB_RESHUFFLE
+            self.new_dp_store.add("eep_barrier_engine_count", 1)
+            return True
+
+        elif state == ScaleUpNewEngineState.EPLB_RESHUFFLE:
+            if (
+                int(self.new_dp_store.get("eep_barrier_engine_count"))
+                < self.new_dp_group.size()
+            ):
+                return False
+            if not self._staged_barrier(
+                use_new_group=True, barrier_name="eplb_reshuffle"
+            ):
+                return False
+            assert self.new_dp_group.rank() > 0
+            self._eplb_reshuffle()
+            self.state = ScaleUpNewEngineState.COMPLETE
+            return True
+
+        else:
+            assert self.state == ScaleUpNewEngineState.COMPLETE
+            return True
+
+    def _progress_remaining_engine(self) -> bool:
+        state = self.state
+
+        if state == ScaleDownRemainingEngineState.PREPARE:
+            self.state = ScaleDownRemainingEngineState.EPLB_RESHUFFLE
+            self.old_dp_store.add("eep_barrier_engine_count", 1)
+            return True
+
+        elif state == ScaleDownRemainingEngineState.EPLB_RESHUFFLE:
+            if (
+                int(self.old_dp_store.get("eep_barrier_engine_count"))
+                < self.old_dp_group.size()
+            ):
+                return False
+            if not self._staged_barrier(
+                use_new_group=False, barrier_name="eplb_reshuffle"
+            ):
+                return False
+            if self.old_dp_group.rank() == 0:
+                self.old_dp_store.delete_key("eep_barrier_engine_count")
+            self._eplb_reshuffle_before_scale_down()
+            self.state = ScaleDownRemainingEngineState.SWITCH_AND_PREPARE
+            # NOTE(yongji): currently, after EPLB reshuffle
+            # that redistributes experts to remaining workers, workers
+            # to be removed will immediately initiate shutdown;
+            # existing workers can no longer execute forward steps using
+            # the old setup. In the future, we may keep
+            # the removing workers alive a bit longer,
+            # e.g., to drain in-batch requests.
+            self._create_standby_groups()
+            self._switch_and_prepare()
+            self._update_parallel_config()
+            self.state = ScaleDownRemainingEngineState.COMPLETE
+            return True
+
+        else:
+            assert self.state == ScaleDownRemainingEngineState.COMPLETE
+            return True
+
+    def _progress_removing_engine(self) -> bool:
+        state = self.state
+
+        if state == ScaleDownRemovingEngineState.PREPARE:
+            self.state = ScaleDownRemovingEngineState.EPLB_RESHUFFLE
+            self.old_dp_store.add("eep_barrier_engine_count", 1)
+            return True
+
+        if state == ScaleDownRemovingEngineState.EPLB_RESHUFFLE:
+            if (
+                int(self.old_dp_store.get("eep_barrier_engine_count"))
+                < self.old_dp_group.size()
+            ):
+                return False
+            if not self._staged_barrier(
+                use_new_group=False, barrier_name="eplb_reshuffle"
+            ):
+                return False
+            assert self.old_dp_group.rank() > 0
+            self._eplb_reshuffle_before_scale_down()
+            self._switch_and_remove()
+            self.state = ScaleDownRemovingEngineState.COMPLETE
+            self.engine_core._eep_send_engine_core_notification(
+                EEPNotificationType.SHUTDOWN_COMPLETE
+            )
+            self.engine_core.shutdown()
+            return True
+
+        else:
+            assert self.state == ScaleDownRemovingEngineState.COMPLETE
+            return True
+
+    def handle_notification(self, notification_type: EEPNotificationType):
+        assert self.worker_type != "new"
+        if (
+            notification_type == EEPNotificationType.NEW_CORE_ENGINES_INIT_READY
+            and self.state == ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_INIT
+        ):
+            self.old_dp_store.add("eep_barrier_engine_count", 1)
+            self.state = ScaleUpExistingEngineState.CREATE_STANDBY_GROUPS
+        elif (
+            notification_type == EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
+            and self.state
+            == ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_WEIGHTS_INIT
+        ):
+            self.old_dp_store.add("eep_barrier_engine_count", 1)
+            self.state = ScaleUpExistingEngineState.TRANSFER_WEIGHTS
+
+    def is_complete(self) -> bool:
+        if self.scale_type == "scale_up":
+            return (
+                self.state == ScaleUpNewEngineState.COMPLETE
+                if self.worker_type == "new"
+                else self.state == ScaleUpExistingEngineState.COMPLETE
+            )
+        return (
+            self.state == ScaleDownRemovingEngineState.COMPLETE
+            if self.worker_type == "removing"
+            else self.state == ScaleDownRemainingEngineState.COMPLETE
+        )
+
+    def _create_standby_groups(self):
+        self.new_dp_group, self.new_dp_store = (
+            self.new_parallel_config.stateless_init_dp_group(return_store=True)
+        )
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("create_standby_groups", self.reconfig_request)
+        )
+        if self.old_dp_group.rank() == 0:
+            logger.info("[Elastic EP] Created standby communication groups")
+
+    def _transfer_weights(self):
+        assert self.reconfig_request is not None
+        old_dp_size = self.old_dp_group.size()
+        new_dp_size = self.reconfig_request.new_data_parallel_size
+
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("transfer_weights", old_dp_size, new_dp_size)
+        )
+        if self.old_dp_group.rank() == 0:
+            logger.info("[Elastic EP] Transferred weights to new workers")
+
+    def _transfer_expert_mapping(self):
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("broadcast_expert_mapping",)
+        )
+        if self.old_dp_group.rank() == 0:
+            logger.info("[Elastic EP] Broadcasted expert mapping to new workers")
+
+    def _sync_kv_cache_memory_size(self):
+        assert self.engine_core.available_gpu_memory_for_kv_cache > 0
+        assert self.new_dp_group is not None
+        ParallelConfig.sync_kv_cache_memory_size(
+            self.new_dp_group,
+            self.engine_core.available_gpu_memory_for_kv_cache,
+        )
+        if self.old_dp_group.rank() == 0:
+            logger.info("[Elastic EP] Synced KV cache memory size to new workers")
+
+    def _switch_and_prepare(self):
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("switch_and_prepare",)
+        )
+        old_dp_group = self.old_dp_group
+        stateless_destroy_torch_distributed_process_group(old_dp_group)
+        assert self.new_dp_group is not None
+        new_dp_group = self.new_dp_group
+        self.engine_core.dp_group = new_dp_group
+        self.engine_core.dp_rank = new_dp_group.rank()
+        self.engine_core.dp_store = self.new_dp_store
+        engines_running = int(self.engine_core.engines_running)
+        current_wave = self.engine_core.current_wave
+        step_counter = self.engine_core.step_counter
+        tensor = torch.tensor(
+            [engines_running, current_wave, step_counter],
+            dtype=torch.int32,
+            device="cpu",
+        )
+        torch.distributed.all_reduce(
+            tensor, op=torch.distributed.ReduceOp.MAX, group=new_dp_group
+        )
+        data = tensor.tolist()
+        self.engine_core.engines_running = bool(data[0])
+        self.engine_core.current_wave = int(data[1])
+        self.engine_core.step_counter = int(data[2])
+        if new_dp_group.rank() == 0:
+            self.engine_core._eep_send_engine_core_notification(
+                EEPNotificationType.RECONFIGURE_FINISHED
+            )
+            logger.info("[Elastic EP] Switched to new setup")
+
+    def _eplb_reshuffle(self):
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("perform_eplb_reshuffle",)
+        )
+        assert self.new_dp_group is not None
+        if self.new_dp_group.rank() == 0:
+            logger.info("[Elastic EP] EPLB reshuffle completed")
+
+    def _eplb_reshuffle_before_scale_down(self):
+        assert self.reconfig_request is not None
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute",
+            args=(
+                "perform_eplb_reshuffle",
+                self.reconfig_request.new_data_parallel_size,
+            ),
+        )
+        if self.old_dp_group.rank() == 0:
+            logger.info("[Elastic EP] EPLB reshuffle completed")
+
+    def _switch_and_remove(self):
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("switch_and_remove",)
+        )
+
+    def _update_parallel_config(self):
+        assert self.reconfig_request is not None
+        reconfig_request = self.reconfig_request
+        parallel_config = self.vllm_config.parallel_config
+        parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
+        if (
+            reconfig_request.new_data_parallel_rank
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
+        if (
+            reconfig_request.new_data_parallel_rank_local
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            parallel_config.data_parallel_rank_local = (
+                reconfig_request.new_data_parallel_rank_local
+            )
+        parallel_config.data_parallel_master_ip = (
+            reconfig_request.new_data_parallel_master_ip
+        )
+        parallel_config.data_parallel_master_port = (
+            reconfig_request.new_data_parallel_master_port
+        )
+        parallel_config._data_parallel_master_port_list = (
+            reconfig_request.new_data_parallel_master_port_list
+        )
+        parallel_config._stateless_world_group_port_list = (
+            reconfig_request.new_stateless_world_group_port_list
+        )
+        parallel_config._stateless_dp_group_port_list = (
+            reconfig_request.new_stateless_dp_group_port_list
+        )
+        parallel_config._stateless_ep_group_port_list = (
+            reconfig_request.new_stateless_ep_group_port_list
+        )
+        parallel_config._stateless_eplb_group_port_list = (
+            reconfig_request.new_stateless_eplb_group_port_list
+        )
diff --git a/vllm/distributed/elastic_ep/standby_state.py b/vllm/distributed/elastic_ep/standby_state.py
new file mode 100644
index 000000000000..d11e0b550531
--- /dev/null
+++ b/vllm/distributed/elastic_ep/standby_state.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.distributed.parallel_state import (
+    _init_stateless_group,
+    _node_count,
+    get_pp_group,
+    get_tp_group,
+    get_world_group,
+)
+from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
+
+_STANDBY_WORLD: StatelessGroupCoordinator | None = None
+_STANDBY_WORLD_NODE_COUNT: int | None = None
+_STANDBY_DP: StatelessGroupCoordinator | None = None
+_STANDBY_EP: StatelessGroupCoordinator | None = None
+_STANDBY_EPLB: StatelessGroupCoordinator | None = None
+
+
+def get_standby_dp_group() -> StatelessGroupCoordinator | None:
+    return _STANDBY_DP
+
+
+def get_standby_ep_group() -> StatelessGroupCoordinator | None:
+    return _STANDBY_EP
+
+
+def get_standby_eplb_group() -> StatelessGroupCoordinator | None:
+    return _STANDBY_EPLB
+
+
+def get_standby_world_group() -> StatelessGroupCoordinator | None:
+    return _STANDBY_WORLD
+
+
+def create_standby_groups(
+    new_dp_size: int,
+    new_world_size_across_dp: int,
+    master_ip: str,
+    world_group_ports: list[list[int]],
+    dp_group_ports: list[list[int]],
+    ep_group_ports: list[list[int]],
+    eplb_group_ports: list[list[int]] | None = None,
+    backend: str | None = None,
+) -> None:
+    global \
+        _STANDBY_WORLD, \
+        _STANDBY_WORLD_NODE_COUNT, \
+        _STANDBY_DP, \
+        _STANDBY_EP, \
+        _STANDBY_EPLB
+
+    assert new_world_size_across_dp == torch.distributed.get_world_size() * new_dp_size
+    world_group = get_world_group()
+    assert isinstance(world_group, StatelessGroupCoordinator)
+    backend = backend or world_group.backend
+
+    standby_world_ranks = [list(range(new_world_size_across_dp))]
+    _STANDBY_WORLD = _init_stateless_group(
+        standby_world_ranks,
+        "world",
+        world_group_ports,
+        master_ip,
+        backend,
+        use_device_communicator=False,
+    )
+    _STANDBY_WORLD_NODE_COUNT = _node_count(_STANDBY_WORLD.tcp_store_group)
+
+    tp_size = get_tp_group().world_size
+    pp_size = get_pp_group().world_size
+
+    all_ranks = torch.arange(new_world_size_across_dp).reshape(
+        -1, new_dp_size, pp_size, tp_size
+    )
+    standby_dp_ranks = all_ranks.transpose(1, 3).reshape(-1, new_dp_size).unbind(0)
+    standby_dp_ranks = [x.tolist() for x in standby_dp_ranks]
+    _STANDBY_DP = _init_stateless_group(
+        standby_dp_ranks, "dp", dp_group_ports, master_ip, backend
+    )
+
+    standby_ep_ranks = (
+        all_ranks.transpose(1, 2).reshape(-1, new_dp_size * tp_size).unbind(0)
+    )
+    standby_ep_ranks = [x.tolist() for x in standby_ep_ranks]
+    _STANDBY_EP = _init_stateless_group(
+        standby_ep_ranks, "ep", ep_group_ports, master_ip, backend
+    )
+
+    if eplb_group_ports is not None:
+        _STANDBY_EPLB = _init_stateless_group(
+            standby_ep_ranks, "eplb", eplb_group_ports, master_ip, backend
+        )
+
+
+def pop_standby_groups() -> dict:
+    """Return all standby groups and clear the standby state."""
+    global \
+        _STANDBY_WORLD, \
+        _STANDBY_WORLD_NODE_COUNT, \
+        _STANDBY_DP, \
+        _STANDBY_EP, \
+        _STANDBY_EPLB
+
+    result = dict(
+        world=_STANDBY_WORLD,
+        dp=_STANDBY_DP,
+        ep=_STANDBY_EP,
+        eplb=_STANDBY_EPLB,
+        node_count=_STANDBY_WORLD_NODE_COUNT,
+    )
+    _STANDBY_WORLD = None
+    _STANDBY_WORLD_NODE_COUNT = None
+    _STANDBY_DP = None
+    _STANDBY_EP = None
+    _STANDBY_EPLB = None
+    return result
diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py
index b81c7fa9c02b..5dd862f36bc2 100644
--- a/vllm/distributed/eplb/async_worker.py
+++ b/vllm/distributed/eplb/async_worker.py
@@ -24,7 +24,6 @@
 
 def start_async_worker(
     state: "EplbState",
-    rank_mapping: dict[int, int] | None = None,
     is_profile: bool = False,
 ) -> threading.Thread:
     eplb_group = get_eplb_group().device_group
@@ -45,7 +44,6 @@ def thread_target() -> None:
                     eplb_group=eplb_group,
                     cuda_stream=cuda_stream,
                     is_profile=is_profile,
-                    rank_mapping=rank_mapping,
                 )
             )
         except Exception as exc:  # pragma: no cover - diagnostic path
@@ -107,7 +105,6 @@ async def transfer_run_periodically(
     eplb_group: ProcessGroup,
     cuda_stream: torch.cuda.Stream,
     is_profile: bool = False,
-    rank_mapping: dict[int, int] | None = None,
 ) -> None:
     while True:
         await asyncio.to_thread(state.rearrange_event.wait)
@@ -176,7 +173,6 @@ async def transfer_run_periodically(
                             ep_group=eplb_group,
                             is_profile=is_profile,
                             cuda_stream=cuda_stream,
-                            rank_mapping=rank_mapping,
                         )
                         event = torch.cuda.Event(blocking=False)
                         cuda_stream.record_event(event)
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 891f19cfe254..b417c2b3256a 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -40,6 +40,7 @@
     get_node_count,
     in_the_same_node_as,
 )
+from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MixtureOfExperts
@@ -302,6 +303,14 @@ def __init__(self, parallel_config: ParallelConfig, device: torch.device):
         """
         CUDA device index for the async EPLB worker thread.
         """
+        self.num_valid_physical_experts: int = 0
+        """
+        Number of valid physical experts.
+        This is the number of physical experts that are
+        actually mapped to logical experts. In elastic EP,
+        newly started EP ranks may not have physical experts
+        mapped yet.
+        """
         if self.device.type == "cuda":
             self.cuda_device_index = self.device.index
             if self.cuda_device_index is None and torch.cuda.is_available():
@@ -367,9 +376,6 @@ def add_model(
         self,
         model: MixtureOfExperts,
         model_config: ModelConfig,
-        global_expert_load: torch.Tensor | None = None,
-        old_global_expert_indices: torch.Tensor | None = None,
-        rank_mapping: dict[int, int] | None = None,
     ):
         """
         Build the initial EPLB state.
@@ -462,75 +468,15 @@ def add_model(
         )
         self.expert_rearrangement_step_interval = eplb_step_interval
 
-        # Set the policy based on the selected eplb algorithm type.
         policy_type = self.parallel_config.eplb_config.policy
         self.policy = EPLB_POLICIES[policy_type]
         logger.debug("Selected EPLB policy: %s", policy_type)
-        if global_expert_load is not None:
-            ep_group = get_ep_group().device_group
-            assert global_expert_load.shape == (
-                model.num_moe_layers,
-                model.num_logical_experts,
-            )
-            assert global_expert_load.dtype == torch.int64
-
-            num_replicas = model.num_physical_experts
-            num_groups = model.num_expert_groups
-            num_nodes = get_node_count()
-            num_gpus = ep_group.size()
-
-            if num_gpus % num_nodes != 0:
-                num_nodes = 1
-                logger.warning_once(
-                    f"num_gpus % num_nodes != 0, "
-                    "not using hierarchical rearrangement algorithm.\n"
-                    f"{num_gpus=}, {num_nodes=}"
-                )
-
-            # Get new expert mappings
-            (
-                new_physical_to_logical_map,
-                new_logical_to_physical_map,
-                new_logical_replica_count,
-            ) = self.policy.rebalance_experts(
-                global_expert_load,
-                num_replicas,
-                num_groups,
-                num_nodes,
-                num_gpus,
-            )
-
-            max_physical_slots = new_logical_to_physical_map.shape[-1]
-            assert max_physical_slots <= logical_to_physical_map.shape[-1]
-            new_logical_to_physical_map = torch.nn.functional.pad(
-                new_logical_to_physical_map,
-                (0, logical_to_physical_map.shape[-1] - max_physical_slots),
-                value=-1,
-            )
-            physical_to_logical_map = new_physical_to_logical_map.to(self.device)
-            logical_to_physical_map.copy_(new_logical_to_physical_map)
-            logical_replica_count.copy_(new_logical_replica_count)
-        else:
-            new_physical_to_logical_map = None
-
-            new_logical_to_physical_map = None
 
-            new_logical_replica_count = None
         model.set_eplb_state(
             expert_load_pass,
             logical_to_physical_map,
             logical_replica_count,
         )
-        if global_expert_load is not None:
-            rearrange_expert_weights_inplace(
-                old_global_expert_indices,
-                new_physical_to_logical_map,
-                model.expert_weights,
-                ep_group,
-                False,
-                rank_mapping,
-            )
-            self.expert_rearrangement_step = 0
 
         expert_buffer = [torch.empty_like(w) for w in model.expert_weights[0]]
 
@@ -561,11 +507,12 @@ def add_model(
                 recv_dst_rows=np.array([]),
             ),
             cuda_device_index=self.cuda_device_index,
-            new_physical_to_logical_map=new_physical_to_logical_map,
-            new_logical_to_physical_map=new_logical_to_physical_map,
-            new_logical_replica_count=new_logical_replica_count,
+            new_physical_to_logical_map=None,
+            new_logical_to_physical_map=None,
+            new_logical_replica_count=None,
         )
         self.model_states[model_config.compute_hash()] = model_state
+        self.num_valid_physical_experts = model.num_physical_experts
 
     def step(
         self,
@@ -696,8 +643,6 @@ def step(
     def rearrange(
         self,
         is_profile: bool = False,
-        execute_shuffle: bool = True,
-        global_expert_loads: list[torch.Tensor] | None = None,
         rank_mapping: dict[int, int] | None = None,
     ) -> torch.Tensor | None:
         """
@@ -707,12 +652,6 @@ def rearrange(
             is_profile (bool): If `True`, perform a dummy rearrangement.
                 This is used in `profile_run` to reserve enough memory,
                 no memory movement will be performed. Default is False.
-            execute_shuffle (bool): If `True`, execute the shuffle
-                in elastic expert parallel (EEP). Default is True.
-            global_expert_loads (list[torch.Tensor] | None): The global expert
-                loads when scaling is done in EEP.
-                List of expert loads for the main and drafter
-                (when spec decode is used) models.
             rank_mapping (dict[int, int] | None): The rank mapping
                 when scaling is done in EEP.
         """
@@ -734,67 +673,34 @@ def rearrange(
                 "(profile)" if is_profile else "",
             )
 
-        if global_expert_loads is None:
-            # Map the physical expert load to global logical experts
-            global_expert_load_windows = []
-            if not execute_shuffle:
-                num_models = torch.tensor(
-                    [len(self.model_states)], dtype=torch.int32, device="cpu"
-                )
-                torch.distributed.broadcast(
-                    num_models, group=get_ep_group().cpu_group, group_src=0
-                )
-
-            for eplb_model_state in self.model_states.values():
-                logical_expert_load_window = torch.zeros(
-                    self.expert_load_window_size,
-                    eplb_model_state.model.num_moe_layers,
-                    eplb_model_state.model.num_logical_experts,
-                    dtype=eplb_model_state.expert_load_window.dtype,
-                    device=eplb_model_state.expert_load_window.device,
-                )
-                logical_expert_load_window.scatter_add_(
-                    dim=-1,
-                    index=eplb_model_state.physical_to_logical_map.unsqueeze(0)
-                    .expand_as(eplb_model_state.expert_load_window)
-                    .long(),
-                    src=eplb_model_state.expert_load_window,
-                )
-
-                if not execute_shuffle:
-                    metadata = torch.tensor(
-                        [
-                            eplb_model_state.model.num_moe_layers,
-                            eplb_model_state.model.num_logical_experts,
-                            eplb_model_state.physical_to_logical_map.shape[1],
-                        ],
-                        dtype=torch.int32,
-                        device="cpu",
-                    )
-                    torch.distributed.broadcast(
-                        metadata, group=get_ep_group().cpu_group, group_src=0
-                    )
-
-                global_expert_load_window = logical_expert_load_window.sum(dim=0)
-                global_expert_load_windows.append(global_expert_load_window)
-            # Perform all-reduce to get the expert load across all ranks for each model
-            global_expert_load_windows = self._allreduce_list(
-                global_expert_load_windows
+        # Map the physical expert load to global logical experts
+        global_expert_load_windows = []
+        for eplb_model_state in self.model_states.values():
+            expert_load_window = eplb_model_state.expert_load_window[
+                :, :, : self.num_valid_physical_experts
+            ]
+            logical_expert_load_window = torch.zeros(
+                self.expert_load_window_size,
+                eplb_model_state.model.num_moe_layers,
+                eplb_model_state.model.num_logical_experts,
+                dtype=eplb_model_state.expert_load_window.dtype,
+                device=eplb_model_state.expert_load_window.device,
+            )
+            logical_expert_load_window.scatter_add_(
+                dim=-1,
+                index=eplb_model_state.physical_to_logical_map[
+                    :, : self.num_valid_physical_experts
+                ]
+                .unsqueeze(0)
+                .expand_as(expert_load_window)
+                .long(),
+                src=expert_load_window,
             )
-            if not execute_shuffle:
-                for eplb_model_state, global_expert_load_window in zip(
-                    self.model_states.values(), global_expert_load_windows
-                ):
-                    # (num_moe_layers, old_num_physical_experts)
-                    old_global_expert_indices = eplb_model_state.physical_to_logical_map
-                    torch.distributed.broadcast(
-                        old_global_expert_indices, group=ep_group, group_src=0
-                    )
-            if not execute_shuffle:
-                return global_expert_load_windows
-        else:
-            assert execute_shuffle
-            global_expert_load_windows = global_expert_loads
+
+            global_expert_load_window = logical_expert_load_window.sum(dim=0)
+            global_expert_load_windows.append(global_expert_load_window)
+        # Perform all-reduce to get the expert load across all ranks for each model
+        global_expert_load_windows = self._allreduce_list(global_expert_load_windows)
 
         # TODO(bowen): Treat differently for prefill and decode nodes
         eplb_model_state = next(iter(self.model_states.values()))
@@ -806,8 +712,10 @@ def rearrange(
             # NOTE(yongji): scale down, we need to rebalance the experts on
             # remaining GPUs, transfer the experts while we haven't shutdown
             # the GPUs to be released.
-            cpu_group = get_ep_group().cpu_group
-            num_nodes = _node_count_with_rank_mapping(cpu_group, rank_mapping)
+            coordinator = get_ep_group()
+            assert isinstance(coordinator, StatelessGroupCoordinator)
+            tcp_store_group = coordinator.tcp_store_group
+            num_nodes = _node_count_with_rank_mapping(tcp_store_group, rank_mapping)
             num_gpus = sum(new_rank != -1 for new_rank in rank_mapping.values())
             num_replicas = (
                 num_replicas // ep_group.size() * num_gpus
@@ -933,7 +841,6 @@ def start_async_loop(
         if self.async_worker is None:
             self.async_worker = start_async_worker(
                 self,
-                rank_mapping=rank_mapping,
                 is_profile=is_profile,
             )
 
@@ -1089,83 +996,6 @@ def post_eplb(self, model_state: EplbModelState, is_profile: bool = False) -> No
         model_state.new_logical_to_physical_map = None
         model_state.new_logical_replica_count = None
 
-    @staticmethod
-    def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
-        """
-        Receive the expert load and old placement from the master rank.
-        """
-        ep_group = get_ep_group()
-        num_models = torch.empty(1, dtype=torch.int32, device="cpu")
-        torch.distributed.broadcast(num_models, group=ep_group.cpu_group, group_src=0)
-        num_models = num_models.item()
-        global_expert_loads = []
-        old_global_expert_indices_per_model = []
-        for _ in range(num_models):
-            metadata = torch.empty(3, dtype=torch.int32, device="cpu")
-            torch.distributed.broadcast(metadata, group=ep_group.cpu_group, group_src=0)
-            num_moe_layers, num_logical_experts, num_old_physical_experts = (
-                metadata.tolist()
-            )
-            global_expert_load = torch.zeros(
-                (num_moe_layers, num_logical_experts),
-                dtype=torch.int64,
-                device=ep_group.device,
-            )
-            all_reduce(global_expert_load, group=ep_group.device_group)
-            old_global_expert_indices = torch.empty(
-                (num_moe_layers, num_old_physical_experts),
-                dtype=torch.int64,
-                device=ep_group.device,
-            )
-            torch.distributed.broadcast(
-                old_global_expert_indices,
-                group=ep_group.device_group,
-                group_src=0,
-            )
-            global_expert_loads.append(global_expert_load)
-            old_global_expert_indices_per_model.append(old_global_expert_indices)
-        return global_expert_loads, old_global_expert_indices_per_model
-
-    @classmethod
-    def get_eep_state(
-        cls, parallel_config: ParallelConfig
-    ) -> tuple[
-        list[torch.Tensor] | None,
-        list[torch.Tensor] | None,
-        dict[int, int] | None,
-    ]:
-        num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu")
-        torch.distributed.broadcast(
-            num_local_physical_experts,
-            group=get_ep_group().cpu_group,
-            group_src=0,
-        )
-        num_local_physical_experts = int(num_local_physical_experts.item())
-        new_ep_size = get_ep_group().world_size
-        global_expert_loads, old_global_expert_indices_per_model = (
-            EplbState.recv_state()
-        )
-
-        # EP configuration for all models has to be the same so as eplb config
-        num_logical_experts = global_expert_loads[0].shape[1]
-        parallel_config.eplb_config.num_redundant_experts = (
-            num_local_physical_experts * new_ep_size - num_logical_experts
-        )
-        assert (
-            old_global_expert_indices_per_model[0].shape[1] % num_local_physical_experts
-            == 0
-        )
-        old_ep_size = (
-            old_global_expert_indices_per_model[0].shape[1]
-            // num_local_physical_experts
-        )
-        rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
-        return (
-            global_expert_loads,
-            old_global_expert_indices_per_model,
-            rank_mapping,
-        )
-
     def _allreduce_list(self, tensor_list: list[torch.Tensor]) -> list[torch.Tensor]:
         """
         All-reduce a list of tensors.
@@ -1203,6 +1033,60 @@ def _sync_load_pass(self) -> list[torch.Tensor]:
             load_pass_list.append(eplb_model_state.expert_load_pass.clone())
         return self._allreduce_list(load_pass_list)
 
+    @classmethod
+    def from_mapping(
+        cls,
+        model: MixtureOfExperts,
+        model_config: ModelConfig,
+        device: torch.device,
+        parallel_config: ParallelConfig,
+        expanded_physical_to_logical: torch.Tensor,
+        num_valid_physical_experts: int,
+    ) -> "EplbState":
+        eplb_state = cls(
+            parallel_config=parallel_config,
+            device=device,
+        )
+        eplb_state.add_model(
+            model=model,
+            model_config=model_config,
+        )
+        eplb_state.num_valid_physical_experts = num_valid_physical_experts
+        num_moe_layers = expanded_physical_to_logical.shape[0]
+        num_physical_experts = expanded_physical_to_logical.shape[1]
+        eplb_model_state = eplb_state.model_states[model_config.compute_hash()]
+        eplb_model_state.physical_to_logical_map.copy_(expanded_physical_to_logical)
+
+        logical_to_physical_map = torch.full(
+            (
+                num_moe_layers,
+                model.num_logical_experts,
+                eplb_model_state.logical_to_physical_map.shape[2],
+            ),
+            -1,
+            dtype=torch.int64,
+        )
+        logical_replica_count = torch.zeros(
+            (num_moe_layers, model.num_logical_experts),
+            dtype=torch.int64,
+        )
+        expanded_physical_to_logical_numpy = expanded_physical_to_logical.cpu().numpy()
+        for layer_idx in range(num_moe_layers):
+            for phys_idx in range(num_physical_experts):
+                logical_idx = expanded_physical_to_logical_numpy[layer_idx, phys_idx]
+                if logical_idx >= 0:
+                    replica_idx = logical_replica_count[layer_idx, logical_idx]
+                    logical_to_physical_map[layer_idx, logical_idx, replica_idx] = (
+                        phys_idx
+                    )
+                    logical_replica_count[layer_idx, logical_idx] += 1
+
+        logical_to_physical_map = logical_to_physical_map.to(device)
+        logical_replica_count = logical_replica_count.to(device)
+        eplb_model_state.logical_to_physical_map.copy_(logical_to_physical_map)
+        eplb_model_state.logical_replica_count.copy_(logical_replica_count)
+        return eplb_state
+
 
 @dataclass
 class EplbLayerState:
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 1be1e24836d3..777f9c553b98 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -19,6 +19,8 @@
     get_global_rank,
 )
 
+from vllm.distributed.parallel_state import get_ep_group
+from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -249,10 +251,18 @@ def move_to_buffer(
                     b[dst].copy_(w[src_local], non_blocking=True)
 
     p2p_ops: list[P2POp] = []
+    if isinstance(get_ep_group(), StatelessGroupCoordinator):
+        ep_group = get_ep_group()
+        is_stateless = True
+    else:
+        is_stateless = False
 
-    # Pre-compute global ranks mapping
+    # Pre-compute global ranks mapping (only needed for non-stateless groups)
     ep_size = ep_group.size()
-    rank_to_global = {rank: get_global_rank(ep_group, rank) for rank in range(ep_size)}
+    if not is_stateless:
+        rank_to_global = {
+            rank: get_global_rank(ep_group, rank) for rank in range(ep_size)
+        }
 
     # 2. Post sends
     if send_count > 0:
@@ -284,15 +294,23 @@ def move_to_buffer(
             if recver_pos < len(ranks_to_recv):
                 recv_ranks.append(ranks_to_recv[recver_pos])
             for dst in recv_ranks:
-                dst_global = rank_to_global[dst]
-                p2p_ops += [
-                    P2POp(
-                        torch.distributed.isend,
-                        w[src],
-                        dst_global,
-                    )
-                    for w in expert_weights
-                ]
+                if is_stateless:
+                    for w in expert_weights:
+                        op = object.__new__(P2POp)
+                        op.op = torch.distributed.isend
+                        op.tensor = w[src]
+                        op.group_peer = dst
+                        p2p_ops.append(op)
+                else:
+                    dst_global = rank_to_global[dst]
+                    p2p_ops += [
+                        P2POp(
+                            torch.distributed.isend,
+                            w[src],
+                            dst_global,
+                        )
+                        for w in expert_weights
+                    ]
 
     # 3. Post recvs
     if recv_count > 0:
@@ -321,26 +339,40 @@ def move_to_buffer(
                 src = ranks_to_send[recver_pos // num_dst_per_sender]
             else:
                 src = ranks_to_send[recver_pos - remainder_start]
-            src_global = rank_to_global[src]
-            p2p_ops += [
-                P2POp(
-                    torch.distributed.irecv,
-                    b[dst],
-                    src_global,
-                )
-                for b in expert_weights_buffers
-            ]
+            if is_stateless:
+                for b in expert_weights_buffers:
+                    op = object.__new__(P2POp)
+                    op.op = torch.distributed.irecv
+                    op.tensor = b[dst]
+                    op.group_peer = src
+                    p2p_ops.append(op)
+            else:
+                src_global = rank_to_global[src]
+                p2p_ops += [
+                    P2POp(
+                        torch.distributed.irecv,
+                        b[dst],
+                        src_global,
+                    )
+                    for b in expert_weights_buffers
+                ]
 
     # 4. Execute the P2P operations. The real communication happens here.
     if p2p_ops and cuda_stream is not None:
         with torch.cuda.stream(cuda_stream):
+            if is_stateless:
+                ep_group.device_communicator.batch_isend_irecv(p2p_ops)
+            else:
+                reqs = batch_isend_irecv(p2p_ops)
+                for req in reqs:
+                    req.wait()
+    elif p2p_ops:
+        if is_stateless:
+            ep_group.device_communicator.batch_isend_irecv(p2p_ops)
+        else:
             reqs = batch_isend_irecv(p2p_ops)
             for req in reqs:
                 req.wait()
-    elif p2p_ops:
-        reqs = batch_isend_irecv(p2p_ops)
-        for req in reqs:
-            req.wait()
     # wait for the communication to finish
     return (
         is_unchanged,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 9994096bf132..9e6b6df081aa 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -33,7 +33,7 @@
 from dataclasses import dataclass
 from datetime import timedelta
 from multiprocessing import shared_memory
-from typing import Any, Protocol
+from typing import TYPE_CHECKING, Any, Protocol
 from unittest.mock import patch
 
 import torch
@@ -55,6 +55,9 @@
     direct_register_custom_op,
 )
 
+if TYPE_CHECKING:
+    from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
+
 
 @dataclass
 class GraphCaptureContext:
@@ -1157,6 +1160,55 @@ def init_model_parallel_group(
     )
 
 
+def _init_stateless_group(
+    group_ranks: list[list[int]],
+    group_name: str,
+    group_ports: list[list[int]],
+    host: str,
+    backend: str,
+    use_device_communicator: bool = True,
+) -> "StatelessGroupCoordinator":
+    """Create a StatelessGroupCoordinator with the given parameters."""
+    from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
+
+    world = get_world_group()
+    return StatelessGroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=world.local_rank,
+        torch_distributed_backend=backend,
+        use_device_communicator=use_device_communicator,
+        group_name=group_name,
+        host=host,
+        group_ports=group_ports,
+        global_rank=world.rank,
+        global_world_size=world.world_size,
+    )
+
+
+def _replace_active_groups(
+    *,
+    world: GroupCoordinator | None,
+    dp: GroupCoordinator | None,
+    ep: GroupCoordinator | None,
+    eplb: GroupCoordinator | None,
+    node_count: int | None,
+) -> None:
+    """Destroy the current DP/EP/WORLD/EPLB groups and replace them.
+
+    Destruction is collective — all ranks in the old groups must call this
+    function together.  Pass all-``None`` to tear down without replacement.
+    """
+    global _WORLD, _DP, _EP, _EPLB, _NODE_COUNT
+    for group in (_DP, _EP, _WORLD, _EPLB):
+        if group is not None:
+            group.destroy()
+    _WORLD = world
+    _DP = dp
+    _EP = ep
+    _EPLB = eplb
+    _NODE_COUNT = node_count
+
+
 _TP: GroupCoordinator | None = None
 
 
@@ -1254,6 +1306,39 @@ def set_custom_all_reduce(enable: bool):
     _ENABLE_CUSTOM_ALL_REDUCE = enable
 
 
+def _init_elastic_ep_world(
+    config, local_rank: int, backend: str, rank: int, world_size: int
+) -> None:
+    from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator
+
+    global _WORLD, _NODE_COUNT
+    assert _WORLD is None, "world group already initialized"
+    parallel_config = config.parallel_config
+    global_rank = parallel_config.data_parallel_rank * world_size + rank
+    global_world_size = parallel_config.world_size_across_dp
+    all_ranks = list(range(global_world_size))
+    group_ranks = [all_ranks[i : i + 1] for i in range(global_world_size)]
+    if global_rank in all_ranks:
+        group_ranks = [all_ranks]
+    group_ports = [parallel_config.get_next_stateless_world_group_port()]
+    world = StatelessGroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_device_communicator=False,
+        group_name="world",
+        host=parallel_config.data_parallel_master_ip,
+        group_ports=group_ports,
+        global_rank=global_rank,
+        global_world_size=global_world_size,
+    )
+    assert parallel_config.nnodes_within_dp == 1, (
+        "Elastic EP is not supported with multi-node TP/PP"
+    )
+    _NODE_COUNT = _node_count(world.tcp_store_group)
+    _WORLD = world
+
+
 def init_distributed_environment(
     world_size: int = -1,
     rank: int = -1,
@@ -1273,6 +1358,7 @@ def init_distributed_environment(
     from vllm.config import get_current_vllm_config_or_none
 
     config = get_current_vllm_config_or_none()
+    enable_elastic_ep = config is not None and config.parallel_config.enable_elastic_ep
     if (
         config is not None
         and config.parallel_config.distributed_executor_backend != "external_launcher"
@@ -1280,6 +1366,7 @@ def init_distributed_environment(
             config.parallel_config.nnodes > 1
             or config.parallel_config.data_parallel_size > 1
         )
+        and not enable_elastic_ep
     ):
         parallel_config = config.parallel_config
         # adjust to take into account data parallelism
@@ -1333,6 +1420,18 @@ def init_distributed_environment(
             rank=rank,
             timeout=timeout,
         )
+        if enable_elastic_ep:
+            tp_pp_cpu_group = torch.distributed.new_group(
+                backend="gloo", timeout=timeout
+            )
+            if _node_count(tp_pp_cpu_group) > 1:
+                # NOTE(yongji): StatelessGroupCoordinator uses data_parallel_master_ip
+                # to initialize all DP/EP groups, hence all ranks within TP/PP group
+                # must reside on the same node
+                raise RuntimeError(
+                    "Elastic EP is not yet supported with multi-node TP/PP"
+                )
+
     # set the local rank
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
@@ -1341,6 +1440,9 @@ def init_distributed_environment(
         # setting, where we can use rank as local rank
         local_rank = envs.LOCAL_RANK if distributed_init_method == "env://" else rank
     global _WORLD, _NODE_COUNT, _INNER_DP_WORLD
+    if enable_elastic_ep:
+        _init_elastic_ep_world(config, local_rank, backend, rank, world_size)
+        return
     if _WORLD is None:
         ranks = list(range(torch.distributed.get_world_size()))
         _WORLD = init_world_group(ranks, local_rank, backend)
@@ -1404,16 +1506,33 @@ def initialize_model_parallel(
     """
     # Get world size and rank. Ensure some consistencies.
     assert torch.distributed.is_initialized()
-    world_size: int = torch.distributed.get_world_size()
-    rank = torch.distributed.get_rank()
-    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
-
-    data_parallel_size = 1
-    from vllm.config import get_current_vllm_config_or_none
 
-    config = get_current_vllm_config_or_none()
-    if config is not None:
-        data_parallel_size = config.parallel_config.data_parallel_size
+    from vllm.config import get_current_vllm_config
+
+    config = get_current_vllm_config()
+    data_parallel_size = config.parallel_config.data_parallel_size
+    enable_elastic_ep = config.parallel_config.enable_elastic_ep
+    if enable_elastic_ep:
+        # Use stateless world group for global information
+        world_size = get_world_group().world_size
+        rank = get_world_group().rank
+        backend = backend or "nccl"
+        tp_pp_pcp_size = (
+            tensor_model_parallel_size
+            * pipeline_model_parallel_size
+            * prefill_context_model_parallel_size
+        )
+        local_all_ranks = torch.arange(tp_pp_pcp_size).reshape(
+            pipeline_model_parallel_size,
+            prefill_context_model_parallel_size,
+            tensor_model_parallel_size,
+        )
+    else:
+        world_size = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+        backend = backend or torch.distributed.get_backend(
+            get_world_group().device_group
+        )
 
     # the layout order is: ExternalDP x DP x PP x TP
     # ExternalDP is the data parallel group that is not part of the model,
@@ -1437,7 +1556,9 @@ def initialize_model_parallel(
     assert _TP is None, "tensor model parallel group is already initialized"
     group_ranks = all_ranks.view(-1, tensor_model_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
-
+    if enable_elastic_ep:
+        group_ranks = local_all_ranks.view(-1, tensor_model_parallel_size).unbind(0)
+        group_ranks = [x.tolist() for x in group_ranks]
     # message queue broadcaster is only used in tensor model parallel group
     _TP = init_model_parallel_group(
         group_ranks,
@@ -1456,6 +1577,11 @@ def initialize_model_parallel(
     # TP group into tp_size//dcp_size DCP groups.
     group_ranks = all_ranks.reshape(-1, decode_context_model_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
+    if enable_elastic_ep:
+        group_ranks = local_all_ranks.reshape(
+            -1, decode_context_model_parallel_size
+        ).unbind(0)
+        group_ranks = [x.tolist() for x in group_ranks]
     _DCP = init_model_parallel_group(
         group_ranks,
         get_world_group().local_rank,
@@ -1472,6 +1598,13 @@ def initialize_model_parallel(
         .unbind(0)
     )
     group_ranks = [x.tolist() for x in group_ranks]
+    if enable_elastic_ep:
+        group_ranks = (
+            local_all_ranks.transpose(1, 2)
+            .reshape(-1, prefill_context_model_parallel_size)
+            .unbind(0)
+        )
+        group_ranks = [x.tolist() for x in group_ranks]
     _PCP = init_model_parallel_group(
         group_ranks, get_world_group().local_rank, backend, group_name="pcp"
     )
@@ -1483,6 +1616,13 @@ def initialize_model_parallel(
         all_ranks.transpose(2, 4).reshape(-1, pipeline_model_parallel_size).unbind(0)
     )
     group_ranks = [x.tolist() for x in group_ranks]
+    if enable_elastic_ep:
+        group_ranks = (
+            local_all_ranks.transpose(0, 2)
+            .reshape(-1, pipeline_model_parallel_size)
+            .unbind(0)
+        )
+        group_ranks = [x.tolist() for x in group_ranks]
     _PP = init_model_parallel_group(
         group_ranks, get_world_group().local_rank, backend, group_name="pp"
     )
@@ -1491,14 +1631,27 @@ def initialize_model_parallel(
     assert _DP is None, "data parallel group is already initialized"
     group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
-    _DP = init_model_parallel_group(
-        group_ranks, get_world_group().local_rank, backend, group_name="dp"
-    )
+    if enable_elastic_ep:
+        parallel_config = config.parallel_config
+        dp_ports = [
+            parallel_config.get_next_stateless_dp_group_port() for _ in group_ranks
+        ]
+        _DP = _init_stateless_group(
+            group_ranks,
+            "dp",
+            dp_ports,
+            parallel_config.data_parallel_master_ip,
+            backend,
+        )
+    else:
+        _DP = init_model_parallel_group(
+            group_ranks, get_world_group().local_rank, backend, group_name="dp"
+        )
 
     global _EP
     assert _EP is None, "expert parallel group is already initialized"
     # Don't create EP group for dense models.
-    if config is None or config.model_config is None or config.model_config.is_moe:
+    if config.model_config is None or config.model_config.is_moe:
         group_ranks = (
             all_ranks.transpose(1, 2)
             .reshape(
@@ -1510,9 +1663,22 @@ def initialize_model_parallel(
             .unbind(0)
         )
         group_ranks = [x.tolist() for x in group_ranks]
-        _EP = init_model_parallel_group(
-            group_ranks, get_world_group().local_rank, backend, group_name="ep"
-        )
+        if enable_elastic_ep:
+            parallel_config = config.parallel_config
+            ep_ports = [
+                parallel_config.get_next_stateless_ep_group_port() for _ in group_ranks
+            ]
+            _EP = _init_stateless_group(
+                group_ranks,
+                "ep",
+                ep_ports,
+                parallel_config.data_parallel_master_ip,
+                backend,
+            )
+        else:
+            _EP = init_model_parallel_group(
+                group_ranks, get_world_group().local_rank, backend, group_name="ep"
+            )
 
         # Create EPLB group with the same ranks as EP if EPLB is enabled.
         # This is a separate process group to isolate EPLB communications
@@ -1525,10 +1691,25 @@ def initialize_model_parallel(
             and config.parallel_config is not None
             and config.parallel_config.enable_eplb
         ):
-            # Reuse the same group_ranks from EP
-            _EPLB = init_model_parallel_group(
-                group_ranks, get_world_group().local_rank, backend, group_name="eplb"
-            )
+            if enable_elastic_ep:
+                eplb_ports = [
+                    parallel_config.get_next_stateless_eplb_group_port()
+                    for _ in group_ranks
+                ]
+                _EPLB = _init_stateless_group(
+                    group_ranks,
+                    "eplb",
+                    eplb_ports,
+                    parallel_config.data_parallel_master_ip,
+                    backend,
+                )
+            else:
+                _EPLB = init_model_parallel_group(
+                    group_ranks,
+                    get_world_group().local_rank,
+                    backend,
+                    group_name="eplb",
+                )
     # If no EP group needed, _EP remains None
     # If no EPLB group needed, _EPLB remains None
 
@@ -1558,7 +1739,11 @@ def ensure_model_parallel_initialized(
     or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
     values if the model parallel groups are initialized.
     """
-    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+    world_group = get_world_group()
+    if hasattr(world_group, "backend"):
+        backend = backend or world_group.backend
+    else:
+        backend = backend or torch.distributed.get_backend(world_group.device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(
             tensor_model_parallel_size,
diff --git a/vllm/distributed/stateless_coordinator.py b/vllm/distributed/stateless_coordinator.py
new file mode 100644
index 000000000000..f2126fdbaa32
--- /dev/null
+++ b/vllm/distributed/stateless_coordinator.py
@@ -0,0 +1,322 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional
+
+import torch
+from torch.distributed import Backend, ProcessGroup
+
+from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
+from vllm.distributed.parallel_state import (
+    GroupCoordinator,
+    TensorMetadata,
+    _get_unique_name,
+    _register_group,
+    _split_tensor_dict,
+)
+from vllm.distributed.utils import (
+    StatelessProcessGroup,
+    stateless_destroy_torch_distributed_process_group,
+    stateless_init_torch_distributed_process_group,
+)
+from vllm.logger import init_logger
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+logger = init_logger(__name__)
+
+
+class StatelessGroupCoordinator(GroupCoordinator):
+    """
+    A stateless version of the GroupCoordinator class in parallel_state,
+    It will create CPU, device and TCPStore based communication groups
+    that are independent of PyTorch's WORLD group. Hence,
+    communication groups with a different set of participants GPUs
+    can be created without destroying the existing ones.
+    """
+
+    def __init__(
+        self,
+        group_ranks: list[list[int]],
+        local_rank: int,
+        torch_distributed_backend: str | Backend,
+        use_device_communicator: bool,
+        use_message_queue_broadcaster: bool = False,
+        group_name: str | None = None,
+        host: str = "127.0.0.1",
+        group_ports: list[list[int]] | None = None,
+        global_rank: int = 0,
+        global_world_size: int = 1,
+    ):
+        group_name = group_name or "anonymous"
+        self.unique_name = _get_unique_name(group_name)
+        _register_group(self)
+
+        self.rank = global_rank
+        self.local_rank = local_rank
+
+        self_device_group = None
+        self_cpu_group = None
+        self_tcp_store_group = None
+
+        from vllm.platforms import current_platform
+
+        backend = str(torch_distributed_backend)
+        self.backend = backend
+        assert group_ports is not None, "group_ports is not provided"
+        for idx, ranks in enumerate(group_ranks):
+            if self.rank in ranks:
+                self.ranks = ranks
+                self.world_size = len(ranks)
+                self.rank_in_group = ranks.index(self.rank)
+
+                ports = group_ports[idx]
+                device_port = ports[0]
+                cpu_port = ports[1]
+                tcp_store_port = ports[2]
+
+                device_group = stateless_init_torch_distributed_process_group(
+                    host=host,
+                    port=device_port,
+                    rank=self.rank_in_group,
+                    world_size=self.world_size,
+                    backend=backend,
+                    group_name=f"{self.unique_name}_device",
+                )
+                cpu_group = stateless_init_torch_distributed_process_group(
+                    host=host,
+                    port=cpu_port,
+                    rank=self.rank_in_group,
+                    world_size=self.world_size,
+                    backend="gloo",
+                    group_name=f"{self.unique_name}_cpu",
+                )
+                tcp_store_group = StatelessProcessGroup.create(
+                    host=host,
+                    port=tcp_store_port,
+                    rank=self.rank_in_group,
+                    world_size=self.world_size,
+                )
+
+                self_device_group = device_group
+                self_cpu_group = cpu_group
+                self_tcp_store_group = tcp_store_group
+
+        assert self_cpu_group is not None
+        assert self_device_group is not None
+        assert self_tcp_store_group is not None
+
+        self.cpu_group = self_cpu_group
+        self.device_group = self_device_group
+        self.tcp_store_group = self_tcp_store_group
+
+        if current_platform.is_cuda_alike():
+            self.device = torch.device(f"cuda:{local_rank}")
+        elif current_platform.is_xpu():
+            self.device = torch.device(f"xpu:{local_rank}")
+        elif current_platform.is_out_of_tree():
+            self.device = torch.device(f"{current_platform.device_name}:{local_rank}")
+        else:
+            self.device = torch.device("cpu")
+
+        self.use_device_communicator = use_device_communicator
+        self.device_communicator = None
+        if use_device_communicator and self.world_size > 1:
+            device_comm_cls = resolve_obj_by_qualname(
+                current_platform.get_device_communicator_cls()
+            )
+            assert device_comm_cls == CudaCommunicator
+            self.device_communicator = CudaCommunicator(
+                cpu_group=self.cpu_group,
+                device=self.device,
+                device_group=self.device_group,
+                unique_name=self.unique_name,
+                global_ranks=self.ranks,
+                global_world_size=global_world_size,
+                tcp_store_group=self.tcp_store_group,
+            )
+
+        self.mq_broadcaster = None
+
+        self.use_custom_op_call = (
+            current_platform.is_cuda_alike() or current_platform.is_tpu()
+        )
+        self.use_cpu_custom_send_recv = False
+
+    def destroy(self):
+        if self.device_communicator:
+            self.device_communicator.destroy()
+        if self.device_group:
+            stateless_destroy_torch_distributed_process_group(self.device_group)
+        if self.cpu_group:
+            stateless_destroy_torch_distributed_process_group(self.cpu_group)
+
+    def size(self) -> int:
+        """Return the world size of this group."""
+        return self.world_size
+
+    def broadcast(self, input_: torch.Tensor, src: int = 0):
+        if self.world_size == 1:
+            return input_
+
+        if self.device_communicator and input_.is_cuda:
+            return self.device_communicator.broadcast(input_, src)
+        else:
+            return self.tcp_store_group.broadcast(input_, src)
+
+    def broadcast_object(self, obj=None, src: int = 0):
+        if self.world_size == 1:
+            return obj
+        return self.tcp_store_group.broadcast_obj(obj, src)
+
+    def broadcast_object_list(
+        self, obj_list: list[Any], src: int = 0, group: ProcessGroup | None = None
+    ):
+        assert src < self.world_size
+
+        if self.world_size == 1:
+            return obj_list
+
+        if self.rank_in_group == src:
+            for obj in obj_list:
+                self.tcp_store_group.broadcast_obj(obj, src)
+        else:
+            for i in range(len(obj_list)):
+                obj_list[i] = self.tcp_store_group.broadcast_obj(None, src)
+
+        return obj_list
+
+    def broadcast_tensor_dict(
+        self,
+        tensor_dict: dict[str, torch.Tensor | Any] | None = None,
+        src: int = 0,
+        group: ProcessGroup | None = None,
+        metadata_group: ProcessGroup | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
+        if self.world_size == 1:
+            return tensor_dict
+
+        if self.rank_in_group == src:
+            assert isinstance(tensor_dict, dict), (
+                f"Expecting a dictionary, got {type(tensor_dict)}"
+            )
+            metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+        else:
+            metadata_list = None
+            tensor_list = []
+
+        recv_metadata_list: list[tuple[str, Any]] = self.tcp_store_group.broadcast_obj(
+            metadata_list, src
+        )
+
+        if self.rank_in_group != src:
+            tensor_dict = {}
+            for key, value in recv_metadata_list:
+                if isinstance(value, TensorMetadata):
+                    tensor = torch.empty(
+                        value.size, dtype=value.dtype, device=value.device
+                    )
+                    tensor_list.append(tensor)
+                    tensor_dict[key] = tensor
+                else:
+                    tensor_dict[key] = value
+
+        for tensor in tensor_list:
+            if tensor.numel() == 0:
+                continue
+            if self.device_communicator and tensor.is_cuda:
+                tensor.copy_(self.device_communicator.broadcast(tensor, src))
+            else:
+                tensor.copy_(self.tcp_store_group.broadcast(tensor, src))
+
+        return tensor_dict
+
+    def send_object(self, obj, dst: int) -> None:
+        assert dst < self.world_size
+        assert dst != self.rank_in_group
+        self.tcp_store_group.send_obj(obj, dst)
+
+    def recv_object(self, src: int):
+        assert src < self.world_size
+        assert src != self.rank_in_group
+        return self.tcp_store_group.recv_obj(src)
+
+    def send_tensor_dict(
+        self,
+        tensor_dict: dict[str, torch.Tensor | Any],
+        dst: int | None = None,
+        all_gather_group: Optional["GroupCoordinator"] = None,
+        all_gather_tensors: dict[str, bool] | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
+        if self.world_size == 1:
+            return tensor_dict
+
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+        assert dst < self.world_size
+
+        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+        self.tcp_store_group.send_obj(metadata_list, dst)
+
+        for tensor in tensor_list:
+            if tensor.numel() == 0:
+                continue
+            if self.device_communicator and tensor.is_cuda:
+                self.device_communicator.send(tensor, dst)
+            else:
+                self.tcp_store_group.send(tensor, dst)
+
+        return None
+
+    def recv_tensor_dict(
+        self,
+        src: int | None = None,
+        all_gather_group: Optional["GroupCoordinator"] = None,
+        all_gather_tensors: dict[str, bool] | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
+        if self.world_size == 1:
+            return None
+
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+        assert src < self.world_size
+
+        recv_metadata_list = self.tcp_store_group.recv_obj(src)
+        tensor_dict = {}
+        for key, value in recv_metadata_list:
+            if isinstance(value, TensorMetadata):
+                tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
+                if tensor.numel() > 0:
+                    if self.device_communicator and tensor.is_cuda:
+                        tensor = self.device_communicator.recv(
+                            tensor.size(), tensor.dtype, src
+                        )
+                    else:
+                        tensor = self.tcp_store_group.recv(tensor, src)
+                tensor_dict[key] = tensor
+            else:
+                tensor_dict[key] = value
+        return tensor_dict
+
+    def barrier(self):
+        self.tcp_store_group.barrier()
+
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
+        if self.world_size == 1:
+            return input_
+
+        if self.device_communicator is None:
+            raise ValueError("No device communicator found")
+
+        if self.rank_in_group == dst:
+            gathered_list = [torch.empty_like(input_) for _ in range(self.world_size)]
+            gathered_list[self.rank_in_group] = input_
+            for src_rank in range(self.world_size):
+                if src_rank != self.rank_in_group:
+                    gathered_list[src_rank] = self.device_communicator.recv(
+                        input_.size(), input_.dtype, src_rank
+                    )
+            return torch.cat(gathered_list, dim=dim)
+        else:
+            self.device_communicator.send(input_, dst)
+            return None
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 17375259ebff..102f2f727b75 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -18,7 +18,7 @@
 from typing import Any
 
 import torch
-from torch.distributed import ProcessGroup, TCPStore
+from torch.distributed import ProcessGroup, Store, TCPStore
 from torch.distributed.distributed_c10d import (
     Backend,
     PrefixStore,
@@ -228,6 +228,55 @@ def all_gather_obj(self, obj: Any) -> list[Any]:
                 gathered_objs.append(recv_obj)
         return gathered_objs
 
+    def broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
+        """Broadcast a tensor from source rank to all other ranks."""
+        if self.rank == src:
+            tensor_bytes = pickle.dumps(tensor)
+            self.expire_data()
+            key = f"broadcast_tensor/{src}/{self.broadcast_send_counter}"
+            self.store.set(key, tensor_bytes)
+            self.broadcast_send_counter += 1
+            self.entries.append((key, time.time()))
+            return tensor
+        else:
+            key = f"broadcast_tensor/{src}/{self.broadcast_recv_src_counter[src]}"
+            tensor = pickle.loads(self.store.get(key))
+            self.broadcast_recv_src_counter[src] += 1
+            return tensor
+
+    def send(self, tensor: torch.Tensor, dst: int):
+        """Send a tensor to a destination rank."""
+        self.expire_data()
+        key = f"send_tensor/{dst}/{self.send_dst_counter[dst]}"
+        self.store.set(key, pickle.dumps(tensor))
+        self.send_dst_counter[dst] += 1
+        self.entries.append((key, time.time()))
+
+    def recv(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
+        """Receive a tensor from a source rank."""
+        key = f"send_tensor/{self.rank}/{self.recv_src_counter[src]}"
+        received = pickle.loads(self.store.get(key))
+        self.recv_src_counter[src] += 1
+        tensor.copy_(received)
+        return tensor
+
+    def all_reduce(
+        self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM
+    ) -> torch.Tensor:
+        """All-reduce a tensor across all ranks."""
+        tensors = self.all_gather_obj(tensor)
+        result = tensors[0].clone()
+        for t in tensors[1:]:
+            if op == torch.distributed.ReduceOp.SUM:
+                result.add_(t)
+            elif op == torch.distributed.ReduceOp.PRODUCT:
+                result.mul_(t)
+            elif op == torch.distributed.ReduceOp.MAX:
+                result = torch.maximum(result, t)
+            elif op == torch.distributed.ReduceOp.MIN:
+                result = torch.minimum(result, t)
+        return result
+
     def barrier(self, timeout: float = 30.0):
         """A robust barrier to synchronize all ranks.
 
@@ -448,8 +497,14 @@ def init_gloo_process_group(
 
 
 def stateless_init_torch_distributed_process_group(
-    host: str, port: int, rank: int, world_size: int, backend: str
-) -> ProcessGroup:
+    host: str,
+    port: int,
+    rank: int,
+    world_size: int,
+    backend: str,
+    group_name: str | None = None,
+    return_store: bool = False,
+) -> ProcessGroup | tuple[ProcessGroup, Store]:
     """
     A replacement for `torch.distributed.init_process_group` that does not
     pollute the global state. The created ProcessGroup object can be used for
@@ -496,26 +551,36 @@ def stateless_init_torch_distributed_process_group(
     # Use a PrefixStore to avoid accidental overrides of keys used by
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
-    try:
-        from vllm.platforms import current_platform
 
-        return current_platform.stateless_init_device_torch_dist_pg(
-            backend=backend,
+    if backend == "gloo":
+        pg = init_gloo_process_group(
             prefix_store=prefix_store,
             group_rank=group_rank,
             group_size=group_size,
             timeout=timeout,
         )
-    except NotImplementedError:
-        # If platform doesn't implement stateless_init_device_torch_dist_pg, it
-        # will raise a NotImplementedError. In this case, we fall back to gloo.
-        return init_gloo_process_group(
+    else:
+        from vllm.platforms import current_platform
+
+        pg = current_platform.stateless_init_device_torch_dist_pg(
+            backend=backend,
             prefix_store=prefix_store,
             group_rank=group_rank,
             group_size=group_size,
             timeout=timeout,
         )
 
+    if group_name is not None:
+        from torch._C._distributed_c10d import _register_process_group
+
+        pg._set_group_name(group_name)
+        _register_process_group(group_name, pg)
+
+    if return_store:
+        return pg, store
+    else:
+        return pg
+
 
 def stateless_destroy_torch_distributed_process_group(pg: ProcessGroup) -> None:
     """
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2e9cd6634895..64b505a1d319 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -419,6 +419,7 @@ class EngineArgs:
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     moe_backend: MoEBackend = KernelConfig.moe_backend
     all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
+    enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
     enable_dbo: bool = ParallelConfig.enable_dbo
     ubatch_size: int = ParallelConfig.ubatch_size
     dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
@@ -896,6 +897,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--ubatch-size",
             **parallel_kwargs["ubatch_size"],
         )
+        parallel_group.add_argument(
+            "--enable-elastic-ep", **parallel_kwargs["enable_elastic_ep"]
+        )
         parallel_group.add_argument(
             "--dbo-decode-token-threshold",
             **parallel_kwargs["dbo_decode_token_threshold"],
@@ -1698,6 +1702,7 @@ def create_engine_config(
             is_moe_model=model_config.is_moe,
             enable_expert_parallel=self.enable_expert_parallel,
             all2all_backend=self.all2all_backend,
+            enable_elastic_ep=self.enable_elastic_ep,
             enable_dbo=self.enable_dbo,
             ubatch_size=self.ubatch_size,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index c12cc7ff2a0b..9e3988b15bee 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -246,8 +246,12 @@ def run_multi_api_server(args: argparse.Namespace):
 
     api_server_manager: APIServerProcessManager | None = None
 
+    from vllm.v1.engine.utils import get_engine_zmq_addresses
+
+    addresses = get_engine_zmq_addresses(vllm_config, num_api_servers)
+
     with launch_core_engines(
-        vllm_config, executor_class, log_stats, num_api_servers
+        vllm_config, executor_class, log_stats, addresses, num_api_servers
     ) as (local_engine_manager, coordinator, addresses):
         # Construct common args for the APIServerProcessManager up-front.
         api_server_manager_kwargs = dict(
diff --git a/vllm/envs.py b/vllm/envs.py
index 07d9f81eac99..864ea6649a49 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -243,6 +243,8 @@
     VLLM_LORA_DISABLE_PDL: bool = False
     VLLM_ENABLE_CUDA_COMPATIBILITY: bool = False
     VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
+    VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
+    VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
 
 
 def get_default_cache_root():
@@ -1617,6 +1619,16 @@ def _get_or_set_default() -> str:
     "VLLM_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get(
         "VLLM_CUDA_COMPATIBILITY_PATH", None
     ),
+    # Whether it is a scale up launch engine for elastic EP,
+    # Should only be set by EngineCoreClient.
+    "VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": lambda: bool(
+        int(os.getenv("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH", "0"))
+    ),
+    # Whether to wait for all requests to drain before sending the
+    # scaling command in elastic EP.
+    "VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
+        int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
+    ),
 }
 
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index a7dee700415e..6200477092ab 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -627,6 +627,7 @@ def _get_quant_method() -> FusedMoEMethodBase:
             moe_quant_params["intermediate_size_full"] = intermediate_size
 
         self.quant_method.create_weights(layer=self, **moe_quant_params)
+        self.base_quant_method = self.quant_method
 
         # Disable shared expert overlap if:
         #   - we are using eplb with non-default backend, because of correctness issues
@@ -683,7 +684,7 @@ def maybe_init_modular_kernel(self) -> None:
         # routing_tables only needed for round-robin expert placement with
         # DeepEP all2all backend.
         routing_tables = self._maybe_init_expert_routing_tables()
-        prepare_finalize = self.quant_method.maybe_make_prepare_finalize(
+        prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize(
             routing_tables=routing_tables
         )
         if prepare_finalize is not None:
@@ -693,7 +694,7 @@ def maybe_init_modular_kernel(self) -> None:
             self._replace_quant_method(
                 FusedMoEModularMethod.make(
                     self,
-                    self.quant_method,
+                    self.base_quant_method,
                     prepare_finalize,
                     self.shared_experts,
                     inplace=not self.moe_config.disable_inplace,
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index d3312fe1560e..af627964f38f 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -6,10 +6,13 @@
 
 import os
 from collections.abc import Callable
+from datetime import timedelta
 from functools import cache, wraps
 from typing import TYPE_CHECKING, TypeVar
 
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
+from torch.distributed.distributed_c10d import is_nccl_available
 from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
@@ -482,6 +485,37 @@ def opaque_attention_op(cls) -> bool:
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
 
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        assert is_nccl_available()
+        pg: ProcessGroup = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+        from torch.distributed.distributed_c10d import ProcessGroupNCCL
+
+        backend_options = ProcessGroupNCCL.Options()
+        backend_options._timeout = timeout
+
+        backend_class = ProcessGroupNCCL(
+            prefix_store, group_rank, group_size, backend_options
+        )
+        backend_type = ProcessGroup.BackendType.NCCL
+        device = torch.device("cuda")
+        pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
+
+        pg._register_backend(device, backend_type, backend_class)
+        return pg
+
     @classmethod
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 3808ecc6e47f..e867ebbd60ec 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -2,10 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
+from datetime import timedelta
 from functools import cache, lru_cache, wraps
 from typing import TYPE_CHECKING
 
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
+from torch.distributed.distributed_c10d import is_nccl_available
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -656,6 +659,37 @@ def is_navi(cls) -> bool:
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
 
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        assert is_nccl_available()
+        pg: ProcessGroup = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+        from torch.distributed.distributed_c10d import ProcessGroupNCCL
+
+        backend_options = ProcessGroupNCCL.Options()
+        backend_options._timeout = timeout
+
+        backend_class = ProcessGroupNCCL(
+            prefix_store, group_rank, group_size, backend_options
+        )
+        backend_type = ProcessGroup.BackendType.NCCL
+        device = torch.device("cuda")
+        pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
+
+        pg._register_backend(device, backend_type, backend_class)
+        return pg
+
     @classmethod
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 1dd9f64f877d..19413ddb430e 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -29,6 +29,15 @@
 # so form part of the external API.
 FINISH_REASON_STRINGS = ("stop", "length", "abort", "error")
 
+EEP_NOTIFICATION_CALL_ID = -1
+
+
+class EEPNotificationType(enum.Enum):
+    NEW_CORE_ENGINES_INIT_READY = "NEW_CORE_ENGINES_INIT_READY"
+    NEW_CORE_ENGINES_WEIGHTS_INIT_READY = "NEW_CORE_ENGINES_WEIGHTS_INIT_READY"
+    RECONFIGURE_FINISHED = "RECONFIGURE_FINISHED"
+    SHUTDOWN_COMPLETE = "SHUTDOWN_COMPLETE"
+
 
 class FinishReason(enum.IntEnum):
     """
@@ -235,6 +244,11 @@ class ReconfigureDistributedRequest(msgspec.Struct):
     new_data_parallel_rank_local: int
     new_data_parallel_master_ip: str
     new_data_parallel_master_port: int
+    new_data_parallel_master_port_list: list[int]
+    new_stateless_world_group_port_list: list[list[int]]
+    new_stateless_dp_group_port_list: list[list[int]]
+    new_stateless_ep_group_port_list: list[list[int]]
+    new_stateless_eplb_group_port_list: list[list[int]]
 
 
 class ReconfigureRankType(enum.IntEnum):
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index d86e1b43df59..f172d6dda02e 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -20,6 +20,7 @@
 )
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient, StreamingInput
+from vllm.entrypoints.serve.elastic_ep.middleware import set_scaling_elastic_ep
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -647,7 +648,11 @@ def _run_output_handler(self):
         engine_core = self.engine_core
         output_processor = self.output_processor
         log_stats = self.log_stats
-        logger_manager = self.logger_manager
+        # We use a mutable list for logger_manager so that it can be updated
+        # during elastic EP scaling (see scale_elastic_ep) without creating
+        # a circular reference via self.
+        self._logger_ref = [self.logger_manager]
+        logger_ref = self._logger_ref
         renderer = self.renderer
         chunk_size = envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
 
@@ -691,8 +696,8 @@ async def output_handler():
                     # 4) Logging.
                     # TODO(rob): make into a coroutine and launch it in
                     # background thread once Prometheus overhead is non-trivial.
-                    if logger_manager:
-                        logger_manager.record(
+                    if logger_ref[0]:
+                        logger_ref[0].record(
                             engine_idx=outputs.engine_index,
                             scheduler_stats=outputs.scheduler_stats,
                             iteration_stats=iteration_stats,
@@ -976,17 +981,13 @@ async def scale_elastic_ep(
                 new_data_parallel_size,
             )
             return
-        logger.info(
-            "Waiting for requests to drain before scaling up to %s engines...",
-            new_data_parallel_size,
-        )
-        await self.wait_for_requests_to_drain(drain_timeout)
-        logger.info(
-            "Requests have been drained, proceeding with scale to %s engines",
-            new_data_parallel_size,
-        )
-        await self.engine_core.scale_elastic_ep(new_data_parallel_size)
-        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
+
+        if envs.VLLM_ELASTIC_EP_DRAIN_REQUESTS:
+            logger.info(
+                "VLLM_ELASTIC_EP_DRAIN_REQUESTS is set, "
+                "waiting for requests to drain before scaling"
+            )
+            await self.wait_for_requests_to_drain(drain_timeout)
 
         # recreate stat loggers
         if new_data_parallel_size > old_data_parallel_size and self.log_stats:
@@ -999,6 +1000,18 @@ async def scale_elastic_ep(
                 engine_idxs=list(range(new_data_parallel_size)),
                 custom_stat_loggers=None,
             )
+            # Update the mutable ref so output_handler picks up the
+            # new logger without creating a circular reference via self.
+            if hasattr(self, "_logger_ref"):
+                self._logger_ref[0] = self.logger_manager
+            self.logger_manager.log_engine_initialized()
+
+        set_scaling_elastic_ep(True)
+        try:
+            await self.engine_core.scale_elastic_ep(new_data_parallel_size)
+            self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
+        finally:
+            set_scaling_elastic_ep(False)
 
     @property
     def is_running(self) -> bool:
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index 672d536a53a3..44a346350fc8 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -71,6 +71,9 @@ def __init__(
         )
 
         local_only_eng = dp_size == parallel_config.data_parallel_size_local
+        # NOTE(yongji): handling scaling from intra-node to inter-node
+        if parallel_config.enable_elastic_ep:
+            local_only_eng = False
         back_publish_address = get_engine_client_zmq_addr(local_only_eng, host)
         back_output_address = get_engine_client_zmq_addr(local_only_eng, host)
 
@@ -201,6 +204,7 @@ def process_input_socket(
 
             poller = zmq.Poller()
             poller.register(publish_front, zmq.POLLIN)
+            poller.register(publish_back, zmq.POLLIN)
             poller.register(output_back, zmq.POLLIN)
             last_publish_time = 0
             while True:
@@ -231,6 +235,22 @@ def process_input_socket(
                 events = dict(events)
                 wave_state_changed = False
 
+                if publish_back in events:
+                    buffer = publish_back.recv()
+                    if buffer == b"\x01":
+                        # NOTE(yongji): newly started engine subscribed
+                        # We need to send READY message here instead of receiving
+                        # SCALE_ELASTIC_EP notification from engine core client
+                        # as SCALE_ELASTIC_EP is only sent when
+                        # new engines finished initialization.
+                        # Subscription message, on the other hand, is sent
+                        # by each engine during initialization
+                        publish_back.send(b"READY")
+                    else:
+                        logger.error(
+                            "DP Coordinator receives unexpected message from engines"
+                        )
+
                 if publish_front in events:
                     buffer = publish_front.recv()
                     if buffer in (b"\x01", b"\x00"):
@@ -259,7 +279,6 @@ def process_input_socket(
                             # current_wave
                             # we note that 0 is the wave number for the new
                             # engine
-                            engines_running = False
                             logger.info(
                                 "DPCoordinator scaled up from %s to %s engines",
                                 current_count,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 39515cab7742..4de3e4ea7d3a 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -17,6 +17,7 @@
 import msgspec
 import zmq
 
+import vllm.envs as envs
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
 from vllm.envs import enable_envs_cache
@@ -44,6 +45,8 @@
 from vllm.v1.core.sched.interface import PauseState, SchedulerInterface
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.engine import (
+    EEP_NOTIFICATION_CALL_ID,
+    EEPNotificationType,
     EngineCoreOutput,
     EngineCoreOutputs,
     EngineCoreRequest,
@@ -110,6 +113,9 @@ def __init__(
 
         self.available_gpu_memory_for_kv_cache = -1
 
+        if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
+            self._eep_scale_up_before_kv_init()
+
         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
             vllm_config
@@ -233,12 +239,10 @@ def _initialize_kv_caches(
 
         has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
         if has_kv_cache:
-            if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
-                dp_group = getattr(self, "dp_group", None)
-                assert dp_group is not None
-                self.available_gpu_memory_for_kv_cache = (
-                    ParallelConfig.sync_kv_cache_memory_size(dp_group, -1)
-                )
+            if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
+                # NOTE(yongji): should already be set
+                # during _eep_scale_up_before_kv_init
+                assert self.available_gpu_memory_for_kv_cache > 0
                 available_gpu_memory = [self.available_gpu_memory_for_kv_cache] * len(
                     kv_cache_specs
                 )
@@ -752,11 +756,22 @@ def preprocess_add_request(self, request: EngineCoreRequest) -> tuple[Request, i
             self.structured_output_manager.grammar_init(req)
         return req, request.current_wave
 
+    def _eep_scale_up_before_kv_init(self):
+        raise NotImplementedError
+
+    def _eep_send_engine_core_notification(
+        self,
+        notification_type: EEPNotificationType,
+        vllm_config: VllmConfig | None = None,
+    ):
+        raise NotImplementedError
+
 
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
 
     ENGINE_CORE_DEAD = b"ENGINE_CORE_DEAD"
+    addresses: EngineZmqAddresses
 
     @instrument(span_name="EngineCoreProc init")
     def __init__(
@@ -807,6 +822,13 @@ def __init__(
             # and "hybrid" LB modes.
             self.publish_dp_lb_stats = internal_dp_balancing
 
+            self.addresses = addresses
+            self.process_input_queue_block = True
+            if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
+                self._eep_send_engine_core_notification(
+                    EEPNotificationType.NEW_CORE_ENGINES_INIT_READY,
+                    vllm_config=vllm_config,
+                )
             self._init_data_parallel(vllm_config)
 
             super().__init__(
@@ -1119,8 +1141,14 @@ def _process_input_queue(self):
                 if logger.isEnabledFor(DEBUG):
                     logger.debug("EngineCore waiting for work.")
                     waited = True
-            req = self.input_queue.get()
-            self._handle_client_request(*req)
+            block = self.process_input_queue_block
+            try:
+                req = self.input_queue.get(block=block)
+                self._handle_client_request(*req)
+            except queue.Empty:
+                break
+            if not block:
+                break
 
         if waited:
             logger.debug("EngineCore loop active.")
@@ -1290,6 +1318,11 @@ def process_input_sockets(
                 for input_socket, _ in poller.poll():
                     # (RequestType, RequestData)
                     type_frame, *data_frames = input_socket.recv_multipart(copy=False)
+                    # NOTE(yongji): ignore READY message sent by DP coordinator
+                    # that is used to notify newly started engines
+                    if type_frame.buffer == b"READY":
+                        assert input_socket == coord_socket
+                        continue
                     request_type = EngineCoreRequestType(bytes(type_frame.buffer))
 
                     # Deserialize the request data.
@@ -1488,6 +1521,10 @@ def __init__(
         self.current_wave = 0
         self.last_counts = (0, 0)
 
+        from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
+
+        self.eep_scaling_state: ElasticEPScalingState | None = None
+
         # Initialize the engine.
         dp_rank = vllm_config.parallel_config.data_parallel_rank
         super().__init__(
@@ -1511,7 +1548,9 @@ def _init_data_parallel(self, vllm_config: VllmConfig):
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
         self.dp_rank = dp_rank
-        self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
+        self.dp_group, self.dp_store = (
+            vllm_config.parallel_config.stateless_init_dp_group(return_store=True)
+        )
 
     def shutdown(self):
         super().shutdown()
@@ -1574,7 +1613,12 @@ def run_busy_loop(self):
             # 1) Poll the input queue until there is work to do.
             self._process_input_queue()
 
-            # 2) Step the engine core.
+            if self.eep_scaling_state is not None:
+                _ = self.eep_scaling_state.progress()
+                if self.eep_scaling_state.is_complete():
+                    self.process_input_queue_block = True
+                    self.eep_scaling_state = None
+
             executed = self._process_engine_step()
             self._maybe_publish_request_counts()
 
@@ -1624,54 +1668,129 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
     def reinitialize_distributed(
         self, reconfig_request: ReconfigureDistributedRequest
     ) -> None:
-        stateless_destroy_torch_distributed_process_group(self.dp_group)
-        self.shutdown()
-
-        parallel_config = self.vllm_config.parallel_config
-        old_dp_size = parallel_config.data_parallel_size
-        parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
-        if reconfig_request.new_data_parallel_rank != -1:
-            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
-        # local rank specifies device visibility, it should not be changed
-        assert (
-            reconfig_request.new_data_parallel_rank_local
-            == ReconfigureRankType.KEEP_CURRENT_RANK
-        )
-        parallel_config.data_parallel_master_ip = (
+        from copy import deepcopy
+
+        from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
+
+        new_parallel_config = deepcopy(self.vllm_config.parallel_config)
+        old_dp_size = new_parallel_config.data_parallel_size
+        new_parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
+        if (
+            reconfig_request.new_data_parallel_rank
+            != ReconfigureRankType.KEEP_CURRENT_RANK
+        ):
+            new_parallel_config.data_parallel_rank = (
+                reconfig_request.new_data_parallel_rank
+            )
+        new_parallel_config.data_parallel_master_ip = (
             reconfig_request.new_data_parallel_master_ip
         )
-        parallel_config.data_parallel_master_port = (
+        new_parallel_config.data_parallel_master_port = (
             reconfig_request.new_data_parallel_master_port
         )
-        if reconfig_request.new_data_parallel_rank != -2:
-            self.dp_rank = parallel_config.data_parallel_rank
-            self.dp_group = parallel_config.stateless_init_dp_group()
-        reconfig_request.new_data_parallel_master_port = (
-            parallel_config.data_parallel_master_port
+        new_parallel_config._data_parallel_master_port_list = (
+            reconfig_request.new_data_parallel_master_port_list
         )
 
-        self.model_executor.reinitialize_distributed(reconfig_request)
-        if reconfig_request.new_data_parallel_size > old_dp_size:
-            assert self.available_gpu_memory_for_kv_cache > 0
-            # pass available_gpu_memory_for_kv_cache from existing
-            # engine-cores to new engine-cores so they can directly
-            # use it in _initialize_kv_caches() rather than profiling.
-            ParallelConfig.sync_kv_cache_memory_size(
-                self.dp_group, self.available_gpu_memory_for_kv_cache
-            )
-            # NOTE(yongji): newly joined workers require dummy_run even
-            # CUDA graph is not used
-            self.model_executor.collective_rpc("compile_or_warm_up_model")
-        if (
+        is_scale_down = reconfig_request.new_data_parallel_size < old_dp_size
+        is_shutdown = (
             reconfig_request.new_data_parallel_rank
             == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
-        ):
-            self.shutdown()
-            logger.info("DPEngineCoreProc %s shutdown", self.dp_rank)
+        )
+
+        self.eep_scaling_state = ElasticEPScalingState(
+            model_executor=self.model_executor,
+            engine_core=self,
+            vllm_config=self.vllm_config,
+            new_parallel_config=new_parallel_config,
+            worker_type="removing" if is_shutdown else "existing",
+            scale_type="scale_down" if is_scale_down else "scale_up",
+            reconfig_request=reconfig_request,
+        )
+        self.process_input_queue_block = False
+        logger.info(
+            "[Elastic EP] Received reconfiguration request and starting scaling up/down"
+        )
+
+    def _eep_send_engine_core_notification(
+        self,
+        notification_type: EEPNotificationType,
+        vllm_config: VllmConfig | None = None,
+    ):
+        """
+        Send notifications to EngineCoreClient, which can then forward
+        the notifications to other engine core processes. It is used for:
+        1) In scale up: new core engines to notify exisiting core engines
+           that they are ready;
+        2) In scale down: removing core engines to notify EngineCoreClient
+           so EngineCoreClient can release their ray placement groups;
+        3) Both scale up/down: to notify EngineCoreClient that exisiting
+           core engines have already switched to the new parallel setup.
+        """
+        if vllm_config is None:
+            dp_rank = self.vllm_config.parallel_config.data_parallel_rank
         else:
-            logger.info(
-                "Distributed environment reinitialized for DP rank %s", self.dp_rank
+            dp_rank = vllm_config.parallel_config.data_parallel_rank
+        notification_data = (notification_type.value, dp_rank)
+        outputs = EngineCoreOutputs(
+            utility_output=UtilityOutput(
+                call_id=EEP_NOTIFICATION_CALL_ID,
+                result=UtilityResult(notification_data),
             )
+        )
+        outputs.engine_index = self.engine_index
+
+        if hasattr(self, "output_thread") and self.output_thread.is_alive():
+            self.output_queue.put_nowait((0, outputs))
+        else:
+            encoder = MsgpackEncoder()
+            with (
+                zmq.Context() as ctx,
+                make_zmq_socket(
+                    ctx, self.addresses.outputs[0], zmq.PUSH, linger=4000
+                ) as socket,
+            ):
+                socket.send_multipart(encoder.encode(outputs))
+
+    def eep_handle_engine_core_notification(
+        self, notification_type: str | EEPNotificationType
+    ):
+        """
+        Handle notification received from EngineCoreClient
+        (forwarded from new core engines).
+        """
+        assert self.eep_scaling_state is not None
+        if isinstance(notification_type, str):
+            notification_type = EEPNotificationType(notification_type)
+        self.eep_scaling_state.handle_notification(notification_type)
+
+    def _eep_scale_up_before_kv_init(self):
+        from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
+
+        self.eep_scaling_state = ElasticEPScalingState(
+            model_executor=self.model_executor,
+            engine_core=self,
+            vllm_config=self.vllm_config,
+            new_parallel_config=self.vllm_config.parallel_config,
+            worker_type="new",
+            scale_type="scale_up",
+            reconfig_request=None,
+        )
+        self.model_executor.collective_rpc("init_device")
+        self.model_executor.collective_rpc("load_model")
+        self._eep_send_engine_core_notification(
+            EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
+        )
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("receive_weights",)
+        )
+        self.available_gpu_memory_for_kv_cache = (
+            ParallelConfig.sync_kv_cache_memory_size(self.dp_group, -1)
+        )
+        self.model_executor.collective_rpc(
+            "elastic_ep_execute", args=("prepare_new_worker",)
+        )
+        self.process_input_queue_block = False
 
 
 class EngineCoreActorMixin:
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 777dea5ae0a5..e19b31396b9b 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -28,11 +28,12 @@
 from vllm.utils.async_utils import in_loop
 from vllm.utils.network_utils import (
     close_sockets,
-    get_open_port,
     get_open_zmq_inproc_path,
     make_zmq_socket,
 )
 from vllm.v1.engine import (
+    EEP_NOTIFICATION_CALL_ID,
+    EEPNotificationType,
     EngineCoreOutputs,
     EngineCoreRequest,
     EngineCoreRequestType,
@@ -47,6 +48,7 @@
 from vllm.v1.engine.utils import (
     CoreEngineActorManager,
     CoreEngineProcManager,
+    get_engine_zmq_addresses,
     launch_core_engines,
 )
 from vllm.v1.executor import Executor
@@ -445,6 +447,63 @@ def validate_alive(self, frames: Sequence[zmq.Frame]):
             raise EngineDeadError()
 
 
+@dataclass
+class ElasticScalingCache:
+    existing_core_engines: list[EngineIdentity]
+    num_new_core_engines: int
+    pending_notifications: dict[EEPNotificationType, set[int]]
+
+
+def allocate_stateless_group_ports(parallel_config, new_data_parallel_size: int):
+    """
+    Allocate stateless group ports for elastic EP.
+    """
+    from vllm.utils.network_utils import get_open_ports_list
+
+    assert parallel_config.enable_elastic_ep, "Elastic EP must be enabled"
+    world_size = parallel_config.world_size
+    new_world_size_across_dp = world_size * new_data_parallel_size
+    num_world_groups = 1
+    num_dp_groups = max(1, new_world_size_across_dp // new_data_parallel_size)
+    num_ep_groups = max(
+        1,
+        new_world_size_across_dp
+        // (new_data_parallel_size * parallel_config.tensor_parallel_size),
+    )
+    num_eplb_groups = num_ep_groups
+    total_ports_needed = (
+        num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
+    ) * 3 + 5
+    all_ports = get_open_ports_list(total_ports_needed)
+    new_data_parallel_master_port_list = all_ports[-5:]
+    all_ports = all_ports[:-5]
+    new_stateless_world_group_port_list = [
+        all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+    ]
+    start_idx = num_world_groups * 3
+    new_stateless_dp_group_port_list = [
+        all_ports[i : i + 3] for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+    ]
+    start_idx += num_dp_groups * 3
+    new_stateless_ep_group_port_list = [
+        all_ports[i : i + 3] for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+    ]
+    start_idx += num_ep_groups * 3
+    new_stateless_eplb_group_port_list = [
+        all_ports[i : i + 3]
+        for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
+    ]
+
+    parallel_config._stateless_world_group_port_list = (
+        new_stateless_world_group_port_list
+    )
+    parallel_config._stateless_dp_group_port_list = new_stateless_dp_group_port_list
+    parallel_config._stateless_ep_group_port_list = new_stateless_ep_group_port_list
+    parallel_config._stateless_eplb_group_port_list = new_stateless_eplb_group_port_list
+    parallel_config.data_parallel_master_port = new_data_parallel_master_port_list.pop()
+    parallel_config._data_parallel_master_port_list = new_data_parallel_master_port_list
+
+
 class MPClient(EngineCoreClient):
     """
     MPClient: base client for multi-proc EngineCore.
@@ -491,32 +550,37 @@ def __init__(
                 input_address = client_addresses["input_address"]
                 output_address = client_addresses["output_address"]
                 self.stats_update_address = client_addresses.get("stats_update_address")
+                self.input_socket = self.resources.input_socket = make_zmq_socket(
+                    self.ctx, input_address, zmq.ROUTER, bind=True
+                )
+                self.resources.output_socket = make_zmq_socket(
+                    self.ctx, output_address, zmq.PULL
+                )
             else:
                 # Engines are managed by this client.
-                with launch_core_engines(vllm_config, executor_class, log_stats) as (
-                    engine_manager,
-                    coordinator,
+                addresses = get_engine_zmq_addresses(vllm_config)
+                self.input_socket = self.resources.input_socket = make_zmq_socket(
+                    self.ctx, addresses.inputs[0], zmq.ROUTER, bind=True
+                )
+                self.resources.output_socket = make_zmq_socket(
+                    self.ctx, addresses.outputs[0], zmq.PULL
+                )
+
+                with launch_core_engines(
+                    vllm_config,
+                    executor_class,
+                    log_stats,
                     addresses,
-                ):
+                ) as (engine_manager, coordinator, addresses):
                     self.resources.coordinator = coordinator
                     self.resources.engine_manager = engine_manager
 
-                (input_address,) = addresses.inputs
-                (output_address,) = addresses.outputs
                 self.stats_update_address = addresses.frontend_stats_publish_address
                 if coordinator is not None:
                     assert self.stats_update_address == (
                         coordinator.get_stats_publish_address()
                     )
 
-            # Create input and output sockets.
-            self.input_socket = self.resources.input_socket = make_zmq_socket(
-                self.ctx, input_address, zmq.ROUTER, bind=True
-            )
-            self.resources.output_socket = make_zmq_socket(
-                self.ctx, output_address, zmq.PULL
-            )
-
             parallel_config = vllm_config.parallel_config
             dp_size = parallel_config.data_parallel_size
             dp_rank = parallel_config.data_parallel_index
@@ -877,6 +941,10 @@ def _ensure_output_queue_task(self):
         output_socket = resources.output_socket
         assert output_socket is not None
 
+        notification_callback_handler: (
+            Callable[[AsyncMPClient, Sequence[Any]], Any] | None
+        ) = getattr(self.__class__, "eep_process_engine_core_notification", None)
+
         async def process_outputs_socket():
             try:
                 while True:
@@ -884,7 +952,26 @@ async def process_outputs_socket():
                     resources.validate_alive(frames)
                     outputs: EngineCoreOutputs = decoder.decode(frames)
                     if outputs.utility_output:
-                        _process_utility_output(outputs.utility_output, utility_results)
+                        if (
+                            outputs.utility_output.call_id == EEP_NOTIFICATION_CALL_ID
+                            and notification_callback_handler is not None
+                        ):
+                            assert _self_ref is not None
+                            _self = _self_ref()
+                            if not _self:
+                                return
+                            if outputs.utility_output.result is None:
+                                continue
+                            notification_data = outputs.utility_output.result.result
+                            assert isinstance(notification_data, Sequence)
+                            assert len(notification_data) == 2
+                            asyncio.create_task(
+                                notification_callback_handler(_self, notification_data)
+                            )
+                        else:
+                            _process_utility_output(
+                                outputs.utility_output, utility_results
+                            )
                         continue
 
                     if output_handler is not None:
@@ -1081,6 +1168,8 @@ def __init__(
         # Used only by DPLBAsyncMPClient subclass.
         self.lb_engines: list[list[int]] = [[0, 0] for _ in self.core_engines]
 
+        self.eep_scaling_cache: ElasticScalingCache | None = None
+
         self.first_req_sock_addr = get_open_zmq_inproc_path()
         self.first_req_send_socket = self.resources.first_req_send_socket = (
             make_zmq_socket(self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=True)
@@ -1101,12 +1190,6 @@ def _ensure_stats_update_task(self):
         assert self.stats_update_address is not None
         stats_addr: str = self.stats_update_address
         assert len(self.engine_ranks_managed) > 0
-        # NOTE: running and waiting counts are all global from
-        # the Coordinator include all global EngineCores. This
-        # slice includes just the cores managed by this client.
-        count_slice = slice(
-            self.engine_ranks_managed[0], self.engine_ranks_managed[-1] + 1
-        )
 
         async def run_engine_stats_update_task():
             with (
@@ -1145,6 +1228,29 @@ async def run_engine_stats_update_task():
                         ):
                             # Extract new engine count from the decoded message
                             new_engine_count = decoded[1]
+                            # Update engine_ranks_managed and count_slice
+                            parallel_config = self.vllm_config.parallel_config
+                            dp_size = parallel_config.data_parallel_size
+                            dp_rank = parallel_config.data_parallel_rank
+                            assert dp_rank == 0
+                            assert dp_size == new_engine_count
+                            assert not (
+                                parallel_config.data_parallel_hybrid_lb
+                                or parallel_config.data_parallel_external_lb
+                            )
+                            num_ranks = dp_size
+                            self.engine_ranks_managed = list(
+                                range(dp_rank, dp_rank + num_ranks)
+                            )
+                            if len(self.lb_engines) < new_engine_count:
+                                self.lb_engines = self.lb_engines + [
+                                    [0, 0]
+                                    for _ in range(
+                                        new_engine_count - len(self.lb_engines)
+                                    )
+                                ]
+                            else:
+                                self.lb_engines = self.lb_engines[:new_engine_count]
                             # Send scale up notification to coordinator
                             scale_msg = msgspec.msgpack.encode(
                                 ("SCALE_ELASTIC_EP", new_engine_count)
@@ -1178,6 +1284,11 @@ async def run_engine_stats_update_task():
                     self.current_wave = wave
                     self.engines_running = running
                     if counts is not None:
+                        # Running and waiting counts are global from the
+                        # Coordinator including all EngineCores. Slice to get
+                        # just the cores managed by this client.
+                        ranks = self.engine_ranks_managed
+                        count_slice = slice(ranks[0], ranks[-1] + 1)
                         sliced_counts = counts[count_slice]
                         self.lb_engines = sliced_counts
                         logger.debug(
@@ -1287,6 +1398,67 @@ async def process_engine_outputs(
             for req_id in outputs.finished_requests:
                 self.reqs_in_flight.pop(req_id, None)
 
+    @staticmethod
+    async def eep_process_engine_core_notification(
+        self: "DPLBAsyncMPClient", notification_data: tuple[str, int]
+    ):
+        cache = self.eep_scaling_cache
+        notification_type_str, dp_rank = notification_data
+        try:
+            notification_type = EEPNotificationType(notification_type_str)
+        except ValueError as e:
+            raise ValueError(
+                f"Unknown EEP notification type: {notification_type_str}"
+            ) from e
+
+        if notification_type == EEPNotificationType.RECONFIGURE_FINISHED:
+            from vllm.v1.engine import UtilityResult
+
+            # NOTE(yongji): process a dummy UtilityOutput to resolve the future
+            # awaited in _eep_wait_for_setup_switch_complete(), signaling that
+            # all engine cores have completed reconfiguration.
+            dummy_output = UtilityOutput(
+                call_id=EEP_NOTIFICATION_CALL_ID, result=UtilityResult(None)
+            )
+            _process_utility_output(dummy_output, self.utility_results)
+            return
+        assert cache is not None
+        if notification_type not in cache.pending_notifications:
+            cache.pending_notifications[notification_type] = set()
+        if dp_rank in cache.pending_notifications[notification_type]:
+            raise ValueError(
+                f"Duplicate notification {notification_type} from dp_rank {dp_rank}"
+            )
+        cache.pending_notifications[notification_type].add(dp_rank)
+        if len(cache.pending_notifications[notification_type]) >= abs(
+            cache.num_new_core_engines
+        ):
+            if notification_type == EEPNotificationType.SHUTDOWN_COMPLETE:
+                assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
+                assert cache.num_new_core_engines < 0
+                old_dp_size = len(cache.existing_core_engines)
+                new_dp_size = old_dp_size + cache.num_new_core_engines
+                self.resources.engine_manager.scale_down_elastic_ep(
+                    old_dp_size, new_dp_size
+                )
+            else:
+                await asyncio.gather(
+                    *[
+                        self._call_utility_async(
+                            "eep_handle_engine_core_notification",
+                            notification_type,
+                            engine=engine,
+                        )
+                        for engine in cache.existing_core_engines
+                    ]
+                )
+            cache.pending_notifications[notification_type] = set()
+            if notification_type in [
+                EEPNotificationType.SHUTDOWN_COMPLETE,
+                EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY,
+            ]:
+                self.eep_scaling_cache = None
+
     async def abort_requests_async(self, request_ids: list[str]) -> None:
         if not request_ids or self.resources.engine_dead:
             return
@@ -1333,6 +1505,20 @@ async def scale_elastic_ep(self, new_data_parallel_size: int) -> None:
                 cur_data_parallel_size, new_data_parallel_size
             )
 
+    async def _eep_wait_for_setup_switch_complete(self) -> None:
+        """
+        Wait for core engines to switch to the new setup.
+
+        In eep_process_engine_core_notification(), a dummy UtilityOutput with
+        EEP_NOTIFICATION_CALL_ID will be set when RECONFIGURE_FINISHED
+        notification is received from engine 0. We create a future with
+        that call_id and wait for it to be resolved.
+        """
+        future = asyncio.get_running_loop().create_future()
+        self.utility_results[EEP_NOTIFICATION_CALL_ID] = future
+        self._ensure_output_queue_task()
+        await future
+
     async def _scale_up_elastic_ep(
         self, cur_data_parallel_size: int, new_data_parallel_size: int
     ) -> None:
@@ -1340,38 +1526,57 @@ async def _scale_up_elastic_ep(
         and reconfiguring existing ones."""
         cur_data_parallel_size = len(self.core_engines)
 
-        # Phase 1: Send reconfigure messages to all existing engines and wait
-        # for them to be sent
+        self.eep_scaling_cache = ElasticScalingCache(
+            existing_core_engines=self.core_engines.copy(),
+            num_new_core_engines=new_data_parallel_size - cur_data_parallel_size,
+            pending_notifications=dict(),
+        )
+
+        parallel_config = self.vllm_config.parallel_config
+        allocate_stateless_group_ports(parallel_config, new_data_parallel_size)
+
+        # Phase 1: Send reconfig messages to existing engines
         reconfig_futures = []
-        self.vllm_config.parallel_config.data_parallel_master_port = get_open_port()
         for engine in self.core_engines:
             reconfig_request = ReconfigureDistributedRequest(
                 new_data_parallel_size=new_data_parallel_size,
                 new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK,
                 new_data_parallel_rank_local=ReconfigureRankType.KEEP_CURRENT_RANK,
-                new_data_parallel_master_ip=self.vllm_config.parallel_config.data_parallel_master_ip,
-                new_data_parallel_master_port=self.vllm_config.parallel_config.data_parallel_master_port,
+                new_data_parallel_master_ip=parallel_config.data_parallel_master_ip,
+                new_data_parallel_master_port=parallel_config.data_parallel_master_port,
+                new_data_parallel_master_port_list=parallel_config._data_parallel_master_port_list,
+                new_stateless_world_group_port_list=parallel_config._stateless_world_group_port_list,
+                new_stateless_dp_group_port_list=parallel_config._stateless_dp_group_port_list,
+                new_stateless_ep_group_port_list=parallel_config._stateless_ep_group_port_list,
+                new_stateless_eplb_group_port_list=parallel_config._stateless_eplb_group_port_list,
             )
             coro = self._call_utility_async(
                 "reinitialize_distributed", reconfig_request, engine=engine
             )
             reconfig_futures.append(asyncio.create_task(coro))
 
-        logger.info("All reconfigure messages sent, starting engine creation")
-
-        # Phase 2: Create new engines now that reconfig messages have been sent
-        # self.resources.engine_manager is guaranteed to be
-        # CoreEngineActorManager for RayDPClient
+        # Phase 2: Create new engines
         assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
-        self.resources.engine_manager.scale_up_elastic_ep(
-            self.vllm_config, new_data_parallel_size
+        parallel_config.eplb_config.num_redundant_experts = 0
+        start_new_worker_future = asyncio.to_thread(
+            self.resources.engine_manager.scale_up_elastic_ep,
+            self.vllm_config,
+            new_data_parallel_size,
         )
+        wait_future = self._eep_wait_for_setup_switch_complete()
+
+        # Phase 3: Wait for new engines to be created
+        # and reconfig messages to be received
+        await asyncio.gather(start_new_worker_future, *reconfig_futures)
+        logger.info("[Elastic EP] Successfully started new engines")
 
         # Create new CoreEngine objects for the new engines
         new_engine_identities = set()
         for i in range(cur_data_parallel_size, new_data_parallel_size):
             new_engine = i.to_bytes(2, "little")
             self.core_engines.append(new_engine)
+            # NOTE(yongji): we don't update lb_engines here,
+            # we let run_engine_stats_update_task to update it.
             new_engine_identities.add(new_engine)
 
         # Wait for ready messages from new engines on the input socket
@@ -1387,10 +1592,11 @@ async def _scale_up_elastic_ep(
             identity, _ = sync_input_socket.recv_multipart()
             new_engine_identities.discard(identity)
 
-        # Phase 3: Wait for all existing engines to complete reconfiguration
-        logger.info("Waiting for existing engines to complete reconfiguration")
-        await asyncio.gather(*reconfig_futures)
-
+        # NOTE(yongji): Before we schedule any requests on the new workers,
+        # we should wait for them to switch to the new setup.
+        await wait_future
+        # Update the parallel config
+        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         # Notify coordinator about scale up through existing
         # stats_update_task connection
         self._ensure_stats_update_task()
@@ -1399,8 +1605,6 @@ async def _scale_up_elastic_ep(
         )
         await self.first_req_send_socket.send(scale_up_marker)
 
-        # Update the parallel config
-        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         logger.info(
             "[Elastic EP] Scale up completed, new data parallel size: %s",
             new_data_parallel_size,
@@ -1413,7 +1617,14 @@ async def _scale_down_elastic_ep(
         reconfiguring existing engine cores."""
         cur_data_parallel_size = len(self.core_engines)
 
-        self.vllm_config.parallel_config.data_parallel_master_port = get_open_port()
+        self.eep_scaling_cache = ElasticScalingCache(
+            existing_core_engines=self.core_engines.copy(),
+            num_new_core_engines=new_data_parallel_size - cur_data_parallel_size,
+            pending_notifications=dict(),
+        )
+
+        parallel_config = self.vllm_config.parallel_config
+        allocate_stateless_group_ports(parallel_config, new_data_parallel_size)
 
         reconfig_futures = []
         for cur_dp_rank, engine in enumerate(self.core_engines):
@@ -1421,8 +1632,13 @@ async def _scale_down_elastic_ep(
                 new_data_parallel_size=new_data_parallel_size,
                 new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK,
                 new_data_parallel_rank_local=ReconfigureRankType.KEEP_CURRENT_RANK,
-                new_data_parallel_master_ip=self.vllm_config.parallel_config.data_parallel_master_ip,
-                new_data_parallel_master_port=self.vllm_config.parallel_config.data_parallel_master_port,
+                new_data_parallel_master_ip=parallel_config.data_parallel_master_ip,
+                new_data_parallel_master_port=parallel_config.data_parallel_master_port,
+                new_data_parallel_master_port_list=parallel_config._data_parallel_master_port_list,
+                new_stateless_world_group_port_list=parallel_config._stateless_world_group_port_list,
+                new_stateless_dp_group_port_list=parallel_config._stateless_dp_group_port_list,
+                new_stateless_ep_group_port_list=parallel_config._stateless_ep_group_port_list,
+                new_stateless_eplb_group_port_list=parallel_config._stateless_eplb_group_port_list,
             )
             if cur_dp_rank >= new_data_parallel_size:
                 reconfig_request.new_data_parallel_rank = (
@@ -1433,23 +1649,24 @@ async def _scale_down_elastic_ep(
             )
             reconfig_futures.append(asyncio.create_task(coro))
 
-        for _ in range(new_data_parallel_size, cur_data_parallel_size):
-            self.core_engines.pop()
+        # NOTE(yongji): Immediately stop sending requests to the removing engines.
+        self.core_engines = self.core_engines[:new_data_parallel_size]
+        self.lb_engines = self.lb_engines[:new_data_parallel_size]
+        wait_future = self._eep_wait_for_setup_switch_complete()
 
         await asyncio.gather(*reconfig_futures)
 
-        assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
-        self.resources.engine_manager.scale_down_elastic_ep(
-            cur_data_parallel_size, new_data_parallel_size
-        )
-
+        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         self._ensure_stats_update_task()
         scale_down_marker = msgspec.msgpack.encode(
             ("SCALE_ELASTIC_EP", new_data_parallel_size)
         )
         await self.first_req_send_socket.send(scale_down_marker)
 
-        self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
+        # NOTE(yongji): Unlike scaling up,
+        # here we don't actually need to wait for the setup switch to complete.
+        # We may want to remove it in the future.
+        await wait_future
         logger.info(
             "[Elastic EP] Scale down completed, new data parallel size: %s",
             new_data_parallel_size,
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 6c11087a39c1..a7d3c10b5752 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -277,6 +277,8 @@ def __init__(
         else:
             ray.init()
 
+        vllm_config.parallel_config.allocate_elastic_ep_ports()
+
         if placement_groups is not None:
             assert local_dp_ranks is not None, (
                 "local_dp_ranks must be provided if placement_groups is provided"
@@ -584,6 +586,8 @@ def add_dp_placement_groups(
 
             node_ip = node.node_ip
             node_id = node.node_id
+            if device_str not in available_resources[node_id]:
+                continue
             available_gpus = int(available_resources[node_id][device_str])
 
             # Get total GPUs on this node from the node's resources
@@ -773,26 +777,15 @@ def close(self):
             ray.util.remove_placement_group(pg)
 
 
-@contextlib.contextmanager
-def launch_core_engines(
+def get_engine_zmq_addresses(
     vllm_config: VllmConfig,
-    executor_class: type[Executor],
-    log_stats: bool,
     num_api_servers: int = 1,
-) -> Iterator[
-    tuple[
-        CoreEngineProcManager | CoreEngineActorManager | None,
-        DPCoordinator | None,
-        EngineZmqAddresses,
-    ]
-]:
-    """Launch engine and DP coordinator processes as needed."""
-
+) -> EngineZmqAddresses:
+    """Allocate ZMQ addresses for engine-client communication."""
     parallel_config = vllm_config.parallel_config
-    dp_size = parallel_config.data_parallel_size
     local_engine_count = parallel_config.data_parallel_size_local
     local_start_index = parallel_config.data_parallel_rank_local
-    dp_rank = parallel_config.data_parallel_rank
+    dp_size = parallel_config.data_parallel_size
     host = parallel_config.data_parallel_master_ip
     local_engines_only = parallel_config.local_engines_only
 
@@ -806,9 +799,11 @@ def launch_core_engines(
     client_local_only = (
         offline_mode or local_engines_only or (local_engine_count == dp_size)
     )
+    # NOTE(yongji): handling scaling from intra-node to inter-node
+    if parallel_config.enable_elastic_ep:
+        client_local_only = False
 
-    # Set up input and output addresses.
-    addresses = EngineZmqAddresses(
+    return EngineZmqAddresses(
         inputs=[
             get_engine_client_zmq_addr(client_local_only, host)
             for _ in range(num_api_servers)
@@ -819,6 +814,33 @@ def launch_core_engines(
         ],
     )
 
+
+@contextlib.contextmanager
+def launch_core_engines(
+    vllm_config: VllmConfig,
+    executor_class: type[Executor],
+    log_stats: bool,
+    addresses: EngineZmqAddresses,
+    num_api_servers: int = 1,
+) -> Iterator[
+    tuple[
+        CoreEngineProcManager | CoreEngineActorManager | None,
+        DPCoordinator | None,
+        EngineZmqAddresses,
+    ]
+]:
+    """Launch engine and DP coordinator processes as needed."""
+
+    parallel_config = vllm_config.parallel_config
+    dp_size = parallel_config.data_parallel_size
+    local_engine_count = parallel_config.data_parallel_size_local
+    local_start_index = parallel_config.data_parallel_rank_local
+    dp_rank = parallel_config.data_parallel_rank
+    host = parallel_config.data_parallel_master_ip
+    local_engines_only = parallel_config.local_engines_only
+
+    offline_mode = local_start_index is not None
+
     # Run the DP Coordinator process with rank 0 when in online DP mode.
     # The coordinator is needed for:
     # 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing
@@ -885,6 +907,10 @@ def launch_core_engines(
     # will be False.
     handshake_local_only = offline_mode or local_engine_count == dp_size
 
+    # NOTE(yongji): handling scaling from intra-node to inter-node
+    if parallel_config.enable_elastic_ep:
+        handshake_local_only = False
+
     handshake_address = get_engine_client_zmq_addr(
         handshake_local_only, host, parallel_config.data_parallel_rpc_port
     )
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 9ea29df0058b..e3376ba2d1f7 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -38,6 +38,7 @@
     get_pcp_group,
     get_pp_group,
     get_tp_group,
+    model_parallel_is_initialized,
 )
 from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
@@ -580,17 +581,20 @@ def __init__(
             )
             self.async_output_copy_thread.start()
 
-        # Initialize device
-        self.worker.init_device()
-
-        # Set process title and log prefix
         self.setup_proc_title_and_log_prefix(
             enable_ep=vllm_config.parallel_config.enable_expert_parallel
         )
 
         # Load model
         self._init_message_queues(input_shm_handle, vllm_config)
-        self.worker.load_model()
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+        if not is_eep_new_worker:
+            self.worker.init_device()
+            # Update process title now that parallel groups are initialized
+            self.setup_proc_title_and_log_prefix(
+                enable_ep=vllm_config.parallel_config.enable_expert_parallel
+            )
+            self.worker.load_model()
 
         # Enable environment variable cache (e.g. assume no more
         # environment variable overrides after this point)
@@ -885,6 +889,13 @@ def worker_busy_loop(self, cancel: threading.Event | None = None):
 
     @staticmethod
     def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
+        # Check if parallel groups are initialized first
+        if not model_parallel_is_initialized():
+            # Parallel groups not yet initialized, use default process name
+            set_process_title(name="Worker")
+            decorate_logs("Worker")
+            return
+
         dp_size = get_dp_group().world_size
         dp_rank = get_dp_group().rank_in_group
         pp_size = get_pp_group().world_size
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index ad51526ae941..200de181acae 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -382,8 +382,10 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
             all_kwargs.append(kwargs)
         self.collective_rpc("init_worker", args=(all_kwargs,))
 
-        self.collective_rpc("init_device")
-        self.collective_rpc("load_model")
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+        if not is_eep_new_worker:
+            self.collective_rpc("init_device")
+            self.collective_rpc("load_model")
 
         for pp_rank in range(self.parallel_config.pipeline_parallel_size):
             self.pp_tp_workers.append([])
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index b9c7b550170b..3759c751c76d 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -14,7 +14,6 @@
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
 from vllm.v1.serial_utils import run_method
@@ -43,9 +42,11 @@ def _init_executor(self) -> None:
                 max_workers=1, thread_name_prefix="WorkerAsyncOutput"
             )
 
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
         self.driver_worker.init_worker(all_kwargs=[kwargs])
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
+        if not is_eep_new_worker:
+            self.driver_worker.init_device()
+            self.driver_worker.load_model()
 
     def _distributed_args(self) -> tuple[str, int, int]:
         """Return (distributed_init_method, rank, local_rank)."""
@@ -122,16 +123,6 @@ def check_health(self) -> None:
         # it's running.
         return
 
-    def reinitialize_distributed(
-        self, reconfig_request: ReconfigureDistributedRequest
-    ) -> None:
-        self.driver_worker.reinitialize_distributed(reconfig_request)
-        if (
-            reconfig_request.new_data_parallel_rank
-            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
-        ):
-            self.shutdown()
-
     def shutdown(self) -> None:
         if worker := self.driver_worker:
             worker.shutdown()
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 8ee758353b14..489480004821 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -53,7 +53,12 @@ def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None:
                     v.gpu = v.cpu
 
     @instrument(span_name="Loading (CPU)")
-    def load_model(self, eep_scale_up: bool = False) -> None:
+    def load_model(self, load_dummy_weights: bool = False) -> None:
+        if load_dummy_weights:
+            raise ValueError(
+                "Loading dummy weights (needed for elastic EP scale-up) "
+                "Is not supported by the CPU Model Runner."
+            )
         logger.info("Starting to load model %s...", self.model_config.model)
         self.model = get_model(vllm_config=self.vllm_config)
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5e8de1429711..59a82d4ce6ae 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -461,6 +461,8 @@ def __init__(
         self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
 
         self.eplb_state: EplbState | None = None
+        # NOTE(yongji): flag to temporarily disable EPLB during scaling up/down
+        self.eep_eplb_suppressed = False
         """
         State of the expert parallelism load balancer.
 
@@ -2702,7 +2704,7 @@ def eplb_step(self, is_dummy: bool = False, is_profile: bool = False) -> None:
         """
         Step for the EPLB (Expert Parallelism Load Balancing) state.
         """
-        if not self.parallel_config.enable_eplb:
+        if not self.parallel_config.enable_eplb or self.eep_eplb_suppressed:
             return
 
         assert self.eplb_state is not None
@@ -2714,6 +2716,23 @@ def eplb_step(self, is_dummy: bool = False, is_profile: bool = False) -> None:
             log_stats=self.parallel_config.eplb_config.log_balancedness,
         )
 
+    def setup_eplb_from_mapping(
+        self,
+        expanded_physical_to_logical: torch.Tensor,
+        old_num_physical_experts: int,
+    ) -> None:
+        model = self.get_model()
+        assert is_mixture_of_experts(model)
+
+        self.eplb_state = EplbState.from_mapping(
+            model=model,
+            model_config=self.model_config,
+            device=self.device,
+            parallel_config=self.parallel_config,
+            expanded_physical_to_logical=expanded_physical_to_logical,
+            num_valid_physical_experts=old_num_physical_experts,
+        )
+
     def _pool(
         self,
         hidden_states: torch.Tensor,
@@ -4175,21 +4194,16 @@ def update_config(self, overrides: dict[str, Any]) -> None:
             setattr(self, config_name, new_config)
 
     @instrument(span_name="Loading (GPU)")
-    def load_model(self, eep_scale_up: bool = False) -> None:
+    def load_model(self, load_dummy_weights: bool = False) -> None:
         """
         Args:
-            eep_scale_up: the model loading is for elastic EP scale up.
+            load_dummy_weights: load dummy weights instead of real weights.
         """
         logger.info_once(
             "Starting to load model %s...",
             self.model_config.model,
             scope="global",
         )
-        global_expert_loads, old_global_expert_indices_per_model, rank_mapping = (
-            EplbState.get_eep_state(self.parallel_config)
-            if eep_scale_up
-            else (None, None, None)
-        )
 
         if self.parallel_config.enable_eplb:
             self.eplb_state = EplbState(self.parallel_config, self.device)
@@ -4198,6 +4212,8 @@ def load_model(self, eep_scale_up: bool = False) -> None:
         try:
             with DeviceMemoryProfiler() as m:
                 time_before_load = time.perf_counter()
+                if load_dummy_weights:
+                    self.load_config.load_format = "dummy"
                 model_loader = get_model_loader(self.load_config)
                 self.model = model_loader.load_model(
                     vllm_config=self.vllm_config, model_config=self.model_config
@@ -4214,6 +4230,9 @@ def load_model(self, eep_scale_up: bool = False) -> None:
                         and is_mixture_of_experts(self.drafter.model)
                         and self.parallel_config.enable_eplb
                     ):
+                        assert not self.parallel_config.enable_elastic_ep, (
+                            "Elastic EP is not supported with drafter model."
+                        )
                         spec_config = self.vllm_config.speculative_config
                         assert spec_config is not None
                         assert spec_config.draft_model_config is not None
@@ -4221,17 +4240,6 @@ def load_model(self, eep_scale_up: bool = False) -> None:
                             "EPLB is enabled for drafter model %s.",
                             spec_config.draft_model_config.model,
                         )
-
-                        global_expert_load = (
-                            global_expert_loads[eplb_models]
-                            if global_expert_loads
-                            else None
-                        )
-                        old_global_expert_indices = (
-                            old_global_expert_indices_per_model[eplb_models]
-                            if old_global_expert_indices_per_model
-                            else None
-                        )
                         if self.eplb_state is None:
                             self.eplb_state = EplbState(
                                 self.parallel_config, self.device
@@ -4239,9 +4247,6 @@ def load_model(self, eep_scale_up: bool = False) -> None:
                         self.eplb_state.add_model(
                             self.drafter.model,
                             spec_config.draft_model_config,
-                            global_expert_load,
-                            old_global_expert_indices,
-                            rank_mapping,
                         )
                         eplb_models += 1
 
@@ -4283,11 +4288,12 @@ def load_model(self, eep_scale_up: bool = False) -> None:
             time_after_load - time_before_load,
             scope="local",
         )
-        prepare_communication_buffer_for_model(self.model)
-        if (drafter := getattr(self, "drafter", None)) and (
-            drafter_model := getattr(drafter, "model", None)
-        ):
-            prepare_communication_buffer_for_model(drafter_model)
+        if not load_dummy_weights:
+            prepare_communication_buffer_for_model(self.model)
+            if (drafter := getattr(self, "drafter", None)) and (
+                drafter_model := getattr(drafter, "model", None)
+            ):
+                prepare_communication_buffer_for_model(drafter_model)
         mm_config = self.model_config.multimodal_config
         self.is_multimodal_pruning_enabled = (
             supports_multimodal_pruning(self.get_model())
@@ -4295,26 +4301,19 @@ def load_model(self, eep_scale_up: bool = False) -> None:
             and mm_config.is_multimodal_pruning_enabled()
         )
 
-        if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb:
+        if (
+            is_mixture_of_experts(self.model)
+            and self.parallel_config.enable_eplb
+            and not load_dummy_weights
+        ):
             logger.info_once("EPLB is enabled for model %s.", self.model_config.model)
-            global_expert_load = (
-                global_expert_loads[eplb_models] if global_expert_loads else None
-            )
-            old_global_expert_indices = (
-                old_global_expert_indices_per_model[eplb_models]
-                if old_global_expert_indices_per_model
-                else None
-            )
             assert self.eplb_state is not None
             self.eplb_state.add_model(
                 self.model,
                 self.model_config,
-                global_expert_load,
-                old_global_expert_indices,
-                rank_mapping,
             )
             if self.eplb_state.is_async:
-                self.eplb_state.start_async_loop(rank_mapping=rank_mapping)
+                self.eplb_state.start_async_loop()
 
         if (
             self.vllm_config.compilation_config.mode
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 06410b2eb2b3..07582ad963c9 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -7,11 +7,10 @@
 from collections.abc import Callable
 from contextlib import AbstractContextManager, nullcontext
 from types import NoneType
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
-import torch.distributed
 import torch.nn as nn
 
 import vllm.envs as envs
@@ -32,14 +31,12 @@
 )
 from vllm.distributed.parallel_state import (
     Handle,
-    get_pcp_group,
     get_pp_group,
     get_tp_group,
 )
 from vllm.distributed.weight_transfer import WeightTransferEngineFactory
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
 from vllm.platforms import current_platform
 from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
@@ -49,7 +46,6 @@
 from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
 from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (
     AsyncModelRunnerOutput,
@@ -124,6 +120,10 @@ def __init__(
         precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
         torch.set_float32_matmul_precision(precision)
 
+        from vllm.distributed.elastic_ep.elastic_execute import ElasticEPScalingExecutor
+
+        self.elastic_ep_executor = ElasticEPScalingExecutor(self)
+
         # Buffers saved before sleep
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
@@ -317,12 +317,29 @@ def init_device(self):
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
     def load_model(self) -> None:
-        eep_scale_up = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
+        dummy_weights = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
+        if dummy_weights:
+            (
+                expanded_physical_to_logical,
+                num_logical_experts,
+                old_num_physical_experts,
+            ) = self.elastic_ep_executor.receive_expert_mapping()
+            num_physical_experts = expanded_physical_to_logical.shape[1]
+            self.parallel_config.eplb_config.num_redundant_experts = (
+                num_physical_experts - num_logical_experts
+            )
+
         with (
             self._maybe_get_memory_pool_context(tag="weights"),
             set_current_vllm_config(self.vllm_config),
         ):
-            self.model_runner.load_model(eep_scale_up=eep_scale_up)
+            self.model_runner.load_model(load_dummy_weights=dummy_weights)
+
+        if dummy_weights:
+            self.model_runner.setup_eplb_from_mapping(
+                expanded_physical_to_logical, old_num_physical_experts
+            )
+            self.model_runner.eep_eplb_suppressed = True
 
     def update_config(self, overrides: dict[str, Any]) -> None:
         self.model_runner.update_config(overrides)
@@ -801,227 +818,6 @@ def check_health(self) -> None:
         # worker will always be healthy as long as it's running.
         return
 
-    def _eplb_before_scale_down(self, old_ep_size: int, new_ep_size: int) -> None:
-        from vllm.distributed.parallel_state import get_ep_group
-
-        if get_ep_group().rank == 0:
-            logger.info(
-                "[Elastic EP] Starting expert resharding before scaling down..."
-            )
-        rank_mapping = {
-            old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1
-            for old_ep_rank in range(old_ep_size)
-        }
-        assert self.model_runner.eplb_state is not None
-        self.model_runner.eplb_state.rearrange(
-            execute_shuffle=True,
-            global_expert_loads=None,
-            rank_mapping=rank_mapping,
-        )
-        torch.cuda.synchronize()
-        if get_ep_group().rank == 0:
-            logger.info("[Elastic EP] Expert resharding completed!")
-
-    def _eplb_after_scale_up(
-        self,
-        old_ep_size: int,
-        new_ep_size: int,
-        global_expert_loads: list[torch.Tensor] | None,
-    ) -> None:
-        from vllm.distributed.parallel_state import get_ep_group
-
-        if get_ep_group().rank == 0:
-            logger.info("[Elastic EP] Starting expert resharding after scaling up...")
-        rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
-        assert self.model_runner.eplb_state is not None
-        self.model_runner.eplb_state.rearrange(
-            execute_shuffle=True,
-            global_expert_loads=global_expert_loads,
-            rank_mapping=rank_mapping,
-        )
-        if get_ep_group().rank == 0:
-            logger.info("[Elastic EP] Expert resharding completed!")
-
-    def _reconfigure_parallel_config(
-        self, reconfig_request: ReconfigureDistributedRequest
-    ) -> None:
-        """
-        Update parallel config with provided reconfig_request
-        """
-        parallel_config = self.vllm_config.parallel_config
-        parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
-        if (
-            reconfig_request.new_data_parallel_rank
-            != ReconfigureRankType.KEEP_CURRENT_RANK
-        ):
-            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
-        if (
-            reconfig_request.new_data_parallel_rank_local
-            != ReconfigureRankType.KEEP_CURRENT_RANK
-        ):
-            parallel_config.data_parallel_rank_local = (
-                reconfig_request.new_data_parallel_rank_local
-            )
-        parallel_config.data_parallel_master_ip = (
-            reconfig_request.new_data_parallel_master_ip
-        )
-        parallel_config.data_parallel_master_port = (
-            reconfig_request.new_data_parallel_master_port
-        )
-
-    def _reconfigure_moe(
-        self, old_ep_size: int, new_ep_size: int
-    ) -> list[torch.Tensor] | None:
-        """
-        Reconfigure MoE modules with provided reconfig_request
-
-        Return the global expert load if new_ep_size > old_ep_size,
-        otherwise None
-        """
-        from vllm.distributed.parallel_state import (
-            get_dp_group,
-            get_ep_group,
-            prepare_communication_buffer_for_model,
-        )
-        from vllm.model_executor.layers.fused_moe.layer import (
-            FusedMoE,
-            FusedMoEParallelConfig,
-        )
-
-        parallel_config = self.vllm_config.parallel_config
-
-        def get_moe_modules(model: torch.nn.Module) -> list[FusedMoE]:
-            return [
-                module
-                for module in model.modules()
-                if (
-                    module.__class__.__name__ == "FusedMoE"
-                    or module.__class__.__name__ == "SharedFusedMoE"
-                )
-            ]
-
-        def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int):
-            assert all(
-                module.moe_config.num_local_experts == num_local_experts
-                for module in moe_modules
-            ), "All MoE modules must have the same number of experts"
-            for module in moe_modules:
-                module.moe_config.num_experts = num_local_experts * new_ep_size
-                module.global_num_experts = module.moe_config.num_experts
-                tp_size = get_tp_group().world_size
-                is_sequence_parallel = parallel_config.use_sequence_parallel_moe
-                sp_size = tp_size if is_sequence_parallel else 1
-                module.moe_parallel_config = FusedMoEParallelConfig.make(
-                    tp_size_=tp_size,
-                    pcp_size_=get_pcp_group().world_size,
-                    dp_size_=get_dp_group().world_size,
-                    sp_size_=sp_size,
-                    vllm_parallel_config=parallel_config,
-                )
-                module.moe_config.moe_parallel_config = module.moe_parallel_config
-            return moe_modules
-
-        model_moe_modules = get_moe_modules(self.model_runner.model)
-        num_local_experts = model_moe_modules[0].moe_config.num_local_experts
-
-        update_moe_modules(model_moe_modules, num_local_experts)
-        drafter_model = None
-        if hasattr(self.model_runner, "drafter") and hasattr(
-            self.model_runner.drafter, "model"
-        ):
-            drafter_model = self.model_runner.drafter.model
-        if drafter_model is not None and is_mixture_of_experts(drafter_model):
-            drafter_moe_modules = get_moe_modules(drafter_model)
-            # Check if drafter and model have matching configs
-            assert (
-                drafter_moe_modules[0].moe_config.num_local_experts == num_local_experts
-            ), "Drafter and model configs should be the same"
-            update_moe_modules(drafter_moe_modules, num_local_experts)
-
-        if new_ep_size < old_ep_size:
-            num_local_physical_experts = num_local_experts
-            assert self.model_runner.eplb_state is not None
-            new_physical_experts = (
-                self.model_runner.eplb_state.physical_to_logical_map.shape[1]  # type: ignore[attr-defined]
-            )
-            parallel_config.eplb_config.num_redundant_experts = (
-                new_physical_experts
-                - self.model_runner.eplb_state.logical_replica_count.shape[1]  # type: ignore[attr-defined]
-            )
-            global_expert_loads = None
-        else:
-            num_local_physical_experts_tensor = torch.tensor(
-                [num_local_experts], dtype=torch.int32, device="cpu"
-            )
-            torch.distributed.broadcast(
-                num_local_physical_experts_tensor,
-                group=get_ep_group().cpu_group,
-                group_src=0,
-            )
-            num_local_physical_experts = int(num_local_physical_experts_tensor.item())
-            new_physical_experts = num_local_physical_experts * new_ep_size
-            assert self.model_runner.eplb_state is not None
-            global_expert_loads_any = self.model_runner.eplb_state.rearrange(
-                execute_shuffle=False
-            )
-            global_expert_loads = cast(list[torch.Tensor], global_expert_loads_any)
-            parallel_config.eplb_config.num_redundant_experts = (
-                new_physical_experts - global_expert_loads[0].shape[1]
-            )
-        prepare_communication_buffer_for_model(self.model_runner.model)
-        if drafter_model is not None:
-            prepare_communication_buffer_for_model(drafter_model)
-        self.model_runner.model.update_physical_experts_metadata(
-            num_physical_experts=new_physical_experts,
-            num_local_physical_experts=num_local_physical_experts,
-        )
-        return global_expert_loads
-
-    def reinitialize_distributed(
-        self, reconfig_request: ReconfigureDistributedRequest
-    ) -> None:
-        from vllm.config import set_current_vllm_config
-        from vllm.distributed.parallel_state import (
-            cleanup_dist_env_and_memory,
-            get_ep_group,
-        )
-
-        old_ep_size = get_ep_group().world_size
-        old_ep_rank = get_ep_group().rank
-        new_ep_size = (
-            reconfig_request.new_data_parallel_size
-            * get_tp_group().world_size
-            * get_pp_group().world_size
-        )
-        if new_ep_size < old_ep_size:
-            self._eplb_before_scale_down(old_ep_size, new_ep_size)
-
-        cleanup_dist_env_and_memory()
-
-        if (
-            reconfig_request.new_data_parallel_rank
-            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
-        ):
-            assert old_ep_rank >= new_ep_size
-            # shutdown
-            return
-
-        self._reconfigure_parallel_config(reconfig_request)
-
-        with set_current_vllm_config(self.vllm_config):
-            init_worker_distributed_environment(
-                self.vllm_config,
-                self.rank,
-                self.distributed_init_method,
-                self.local_rank,
-            )
-
-        global_expert_loads = self._reconfigure_moe(old_ep_size, new_ep_size)
-
-        if new_ep_size > old_ep_size:
-            assert global_expert_loads is not None
-            self._eplb_after_scale_up(old_ep_size, new_ep_size, global_expert_loads)
-
     def save_sharded_state(
         self,
         path: str,
@@ -1118,6 +914,9 @@ def shutdown(self) -> None:
         if weight_transfer_engine := getattr(self, "weight_transfer_engine", None):
             weight_transfer_engine.shutdown()
 
+    def elastic_ep_execute(self, execute_method: str, *args, **kwargs):
+        return self.elastic_ep_executor.execute(execute_method, *args, **kwargs)
+
 
 def init_worker_distributed_environment(
     vllm_config: VllmConfig,
diff --git a/vllm/v1/worker/workspace.py b/vllm/v1/worker/workspace.py
index ef32a32f6cff..28ba85a26248 100644
--- a/vllm/v1/worker/workspace.py
+++ b/vllm/v1/worker/workspace.py
@@ -66,6 +66,23 @@ def lock(self) -> None:
                 ],
             )
 
+    def unlock(self) -> None:
+        """Unlock the workspace to allow growth.
+
+        This is used during elastic EP scaling when the workspace size
+        needs to grow due to changes in the number of experts.
+        """
+        self._locked = False
+        if envs.VLLM_DEBUG_WORKSPACE:
+            logger.info(
+                "[WORKSPACE DEBUG] Workspace unlocked. Current sizes: %s",
+                [
+                    self._workspace_size_bytes(ws) / _MB
+                    for ws in self._current_workspaces
+                    if ws is not None
+                ],
+            )
+
     def is_locked(self) -> bool:
         """Check if workspace is locked."""
         return self._locked
@@ -242,6 +259,17 @@ def lock_workspace() -> None:
     current_workspace_manager().lock()
 
 
+def unlock_workspace() -> None:
+    """Unlock the workspace to allow growth.
+
+    This is used during elastic EP scaling when the workspace size
+    needs to grow due to changes in the number of experts.
+    After scaling operations complete, lock_workspace() should be
+    called again to prevent unexpected allocations.
+    """
+    current_workspace_manager().unlock()
+
+
 def reset_workspace_manager() -> None:
     """Reset the workspace manager to uninitialized state.
 

From 7b346ba8ed546ad274a7905ccfd9f3c0c242226a Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 27 Feb 2026 21:03:22 -0800
Subject: [PATCH 109/154] [Bugfix] Propagate compilation_time from workers to
 main process for TP>1 (#35503)

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 vllm/v1/executor/abstract.py  | 10 +++++++++-
 vllm/v1/worker/cpu_worker.py  |  3 ++-
 vllm/v1/worker/gpu_worker.py  |  4 +++-
 vllm/v1/worker/worker_base.py |  8 ++++++--
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 91bd019f8acc..8e7c48054554 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -115,7 +115,15 @@ def initialize_from_config(self, kv_cache_configs: list[KVCacheConfig]) -> None:
         underlying workers.
         """
         self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
-        self.collective_rpc("compile_or_warm_up_model")
+        compilation_times: list[float] = self.collective_rpc("compile_or_warm_up_model")
+        # Propagate compilation time from workers back to the main process.
+        # With TP>1, compilation happens in worker processes, so the main
+        # process config is never updated. Use max across workers since they
+        # compile in parallel.
+        if compilation_times:
+            self.vllm_config.compilation_config.compilation_time = max(
+                compilation_times
+            )
 
     def register_failure_callback(self, callback: FailureCallback):  # noqa: B027
         """
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index d0cecda29fdb..a72f450a7daf 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -118,11 +118,12 @@ def wake_up(self, tags: list[str] | None = None) -> None:
     def determine_available_memory(self) -> int:
         return self.cache_config.cpu_kvcache_space_bytes or 0
 
-    def compile_or_warm_up_model(self) -> None:
+    def compile_or_warm_up_model(self) -> float:
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
         self.model_runner.warming_up_model()
+        return self.compilation_config.compilation_time
 
     def _get_autobind_cpu_ids(
         self, cpu_selector: Callable[[list[LogicalCPUInfo]], list[LogicalCPUInfo]]
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 07582ad963c9..977d15ff2de5 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -480,7 +480,7 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
             self.model_runner.initialize_kv_cache(kv_cache_config)
 
     @instrument(span_name="Warmup (GPU)")
-    def compile_or_warm_up_model(self) -> None:
+    def compile_or_warm_up_model(self) -> float:
         warmup_sizes = []
 
         if self.vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
@@ -605,6 +605,8 @@ def compile_or_warm_up_model(self) -> None:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
+        return self.compilation_config.compilation_time
+
     def reset_mm_cache(self) -> None:
         self.model_runner.reset_mm_cache()
 
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index b4454589d7ed..2e8c03e15474 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -87,8 +87,12 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         """Get specifications for KV cache implementation."""
         raise NotImplementedError
 
-    def compile_or_warm_up_model(self) -> None:
-        """Prepare model for execution through compilation/warmup."""
+    def compile_or_warm_up_model(self) -> float:
+        """Prepare model for execution through compilation/warmup.
+
+        Returns:
+            The accumulated compilation time in seconds.
+        """
         raise NotImplementedError
 
     def check_health(self) -> None:

From 1d5ab5d603792e6c96584c3c6f4cd50b9925690f Mon Sep 17 00:00:00 2001
From: Umut Polat <52835619+umut-polat@users.noreply.github.com>
Date: Sat, 28 Feb 2026 08:26:19 +0300
Subject: [PATCH 110/154] [Bugfix] Move chat completion response_format
 validation to Pydantic model_validator (#35510)

Signed-off-by: umut-polat <52835619+umut-polat@users.noreply.github.com>
---
 tests/entrypoints/openai/test_chat_error.py   | 12 ++++++++
 .../openai/chat_completion/protocol.py        | 28 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py
index 7d84be218447..970945b4759f 100644
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -370,3 +370,15 @@ def test_system_message_warns_on_video(video_content):
     call_args = str(mock_logger.warning_once.call_args)
     assert "System messages should only contain text" in call_args
     assert "video_url" in call_args
+
+
+def test_json_schema_response_format_missing_schema():
+    """When response_format type is 'json_schema' but the json_schema field
+    is not provided, request construction should raise a validation error
+    so the API returns 400 instead of 500."""
+    with pytest.raises(Exception, match="json_schema.*must be provided"):
+        ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[{"role": "user", "content": "hello"}],
+            response_format={"type": "json_schema"},
+        )
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index 1bf0de53fa3c..12bbc44a0a53 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -502,6 +502,34 @@ def to_sampling_params(
             skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
+    @model_validator(mode="before")
+    @classmethod
+    def validate_response_format(cls, data):
+        response_format = data.get("response_format")
+        if response_format is None:
+            return data
+
+        rf_type = (
+            response_format.get("type")
+            if isinstance(response_format, dict)
+            else getattr(response_format, "type", None)
+        )
+
+        if rf_type == "json_schema":
+            json_schema = (
+                response_format.get("json_schema")
+                if isinstance(response_format, dict)
+                else getattr(response_format, "json_schema", None)
+            )
+            if json_schema is None:
+                raise VLLMValidationError(
+                    "When response_format type is 'json_schema', the "
+                    "'json_schema' field must be provided.",
+                    parameter="response_format",
+                )
+
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def validate_stream_options(cls, data):

From b2d8b422b2014b23c44fea703c70331eef35e7a1 Mon Sep 17 00:00:00 2001
From: Ilya Markov <markovilya197@gmail.com>
Date: Sat, 28 Feb 2026 06:47:12 +0100
Subject: [PATCH 111/154] [EPLB] Enforce sync eplb for NCCL-based all2all
 backend (#35212)

Signed-off-by: ilmarkov <markovilya197@gmail.com>
---
 vllm/config/parallel.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 59df4a214cc5..6e84cf16b203 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -774,6 +774,17 @@ def __post_init__(self) -> None:
                 "backend is mp, uni or external_launcher."
             )
 
+        if (
+            self.all2all_backend in ("allgather_reducescatter", "naive")
+            and self.eplb_config.use_async
+        ):
+            logger.warning(
+                "Async EPLB causes hangs with the '%s' all2all backend. "
+                "Forcing synchronous EPLB.",
+                self.all2all_backend,
+            )
+            self.eplb_config.use_async = False
+
     @property
     def use_ray(self) -> bool:
         return self.distributed_executor_backend == "ray" or (

From 88e8525f2ed7d234937190ae523841a523a49f99 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Fri, 27 Feb 2026 23:53:28 -0600
Subject: [PATCH 112/154] [ROCm][CI] Adding infiniband mappings for moriio
 tests (#35170)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       | 194 +++++++++++++++++-
 1 file changed, 183 insertions(+), 11 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 89736eec1273..aa84d0e8aae9 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -6,6 +6,26 @@
 # Multi-node detection: Instead of matching on fragile group names, we detect
 # multi-node jobs structurally by looking for the bracket command syntax
 # "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
+#
+###############################################################################
+# QUOTING / COMMAND PASSING
+#
+# Passing commands as positional arguments ($*) is fragile when the command
+# string itself contains double quotes, e.g.:
+#
+#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
+#
+# The outer shell resolves the nested quotes *before* this script runs, so
+# the script receives mangled input it cannot fully recover.
+#
+# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
+#
+#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
+#   bash run-amd-test.sh
+#
+# Single-quoted assignment preserves all inner double quotes verbatim.
+# The $* path is kept for backward compatibility but callers should migrate.
+###############################################################################
 set -o pipefail
 
 # Export Python path
@@ -80,25 +100,140 @@ is_multi_node() {
 }
 
 ###############################################################################
-# Pytest marker re-quoting
+# Pytest marker/keyword re-quoting
 #
 # When commands are passed through Buildkite -> shell -> $* -> bash -c,
-# quotes around pytest -m marker expressions get stripped:
+# quotes around multi-word pytest -m/-k expressions get stripped:
 #   pytest -v -s -m 'not cpu_test' v1/core
 # becomes:
 #   pytest -v -s -m not cpu_test v1/core
 #
 # pytest then interprets "cpu_test" as a file path, not part of the marker.
-# This function detects unquoted multi-word marker expressions and re-quotes
-# them so they survive the final bash -c expansion.
+#
+# This function detects unquoted expressions after -m/-k and re-quotes them
+# by collecting tokens until a recognizable boundary is reached:
+#   - test path (contains '/')
+#   - test file (ends with '.py')
+#   - another pytest flag (--xxx or -x single-char flags)
+#   - command separator (&& || ; |)
+#   - environment variable assignment (FOO=bar)
+#
+# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
+# unquoted since they have no spaces and work fine.
+#
+# Already-quoted expressions (containing literal single quotes) are passed
+# through untouched to avoid double-quoting values injected by
+# apply_rocm_test_overrides.
+#
+# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
+# double-quotes stripped by the calling shell (see header comment).
+# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
 ###############################################################################
-
 re_quote_pytest_markers() {
-  local cmds="$1"
-  # Pattern: -m not <identifier>  ->  -m 'not <identifier>'
-  # Handles the common cases: 'not cpu_test', 'not slow_test', etc.
-  cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
-  echo "$cmds"
+  local input="$1"
+  local output=""
+  local collecting=false
+  local marker_buf=""
+
+  # Flatten newlines for consistent tokenization
+  local flat="${input//$'\n'/ }"
+
+  # Disable globbing to prevent *.py etc. from expanding during read -ra
+  local restore_glob
+  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
+  set -o noglob
+  local -a words
+  read -ra words <<< "$flat"
+  eval "$restore_glob"
+
+  for word in "${words[@]}"; do
+    if $collecting; then
+      # If the token we're about to collect already contains a literal
+      # single quote, the expression was already quoted upstream.
+      # Flush and stop collecting.
+      if [[ "$word" == *"'"* ]]; then
+        if [[ -n "$marker_buf" ]]; then
+          # Should not normally happen (partial buf + quote), flush raw
+          output+="${marker_buf} "
+          marker_buf=""
+        fi
+        output+="${word} "
+        collecting=false
+        continue
+      fi
+
+      local is_boundary=false
+      case "$word" in
+        # Command separators
+        "&&"|"||"|";"|"|")
+          is_boundary=true ;;
+        # Long flags (--ignore, --shard-id, etc.)
+        --*)
+          is_boundary=true ;;
+        # Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
+        # like "not" which don't start with "-". Also skip -k/-m which
+        # would start a new marker (handled below).
+        -[a-zA-Z])
+          is_boundary=true ;;
+        # Test path (contains /)
+        */*)
+          is_boundary=true ;;
+        # Test file (ends with .py, possibly with ::method)
+        *.py|*.py::*)
+          is_boundary=true ;;
+        # Environment variable assignment preceding a command (FOO=bar)
+        *=*)
+          # Only treat as boundary if it looks like VAR=value, not
+          # pytest filter expressions like num_gpus=2 inside markers
+          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
+            is_boundary=true
+          fi
+          ;;
+      esac
+
+      if $is_boundary; then
+        # Flush the collected marker expression
+        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+          output+="'${marker_buf}' "
+        else
+          output+="${marker_buf} "
+        fi
+        collecting=false
+        marker_buf=""
+        # Check if this boundary word itself starts a new -m/-k
+        if [[ "$word" == "-m" || "$word" == "-k" ]]; then
+          output+="${word} "
+          collecting=true
+        else
+          output+="${word} "
+        fi
+      else
+        # Accumulate into marker buffer
+        if [[ -n "$marker_buf" ]]; then
+          marker_buf+=" ${word}"
+        else
+          marker_buf="${word}"
+        fi
+      fi
+    elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
+      output+="${word} "
+      collecting=true
+      marker_buf=""
+    else
+      output+="${word} "
+    fi
+  done
+
+  # Flush any trailing marker expression (marker at end of command)
+  if $collecting && [[ -n "$marker_buf" ]]; then
+    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+      output+="'${marker_buf}'"
+    else
+      output+="${marker_buf}"
+    fi
+  fi
+
+  echo "${output% }"
 }
 
 ###############################################################################
@@ -231,11 +366,35 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 
-commands="$*"
+# ---- Command source selection ----
+# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
+# Fall back to $* for backward compatibility, but warn that inner
+# double-quotes will have been stripped by the calling shell.
+if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
+  commands="${VLLM_TEST_COMMANDS}"
+  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
+else
+  commands="$*"
+  if [[ -z "$commands" ]]; then
+    echo "Error: No test commands provided." >&2
+    echo "Usage:" >&2
+    echo "  Preferred:  VLLM_TEST_COMMANDS='...' bash $0" >&2
+    echo "  Legacy:     bash $0 \"commands here\"" >&2
+    exit 1
+  fi
+  echo "Commands sourced from positional args (legacy mode)"
+  echo "WARNING: Inner double-quotes in the command string may have been"
+  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
+  echo "  export VLLM_TEST_COMMANDS='your commands here'"
+  echo "  bash $0"
+fi
+
 echo "Raw commands: $commands"
 
 # Fix quoting before ROCm overrides (so overrides see correct structure)
 commands=$(re_quote_pytest_markers "$commands")
+echo "After re-quoting: $commands"
+
 commands=$(apply_rocm_test_overrides "$commands")
 echo "Final commands: $commands"
 
@@ -248,6 +407,18 @@ if [[ -z "$render_gid" ]]; then
   exit 1
 fi
 
+# --- RDMA device passthrough (conditional) ---
+# If the host has RDMA devices, pass them through so tests like
+# test_moriio_connector can access ibverbs. On hosts without RDMA
+# hardware the tests will gracefully skip via _rdma_available().
+RDMA_FLAGS=""
+if [ -d /dev/infiniband ]; then
+  echo "RDMA devices detected on host, enabling passthrough"
+  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
+else
+  echo "No RDMA devices found on host, RDMA tests will be skipped"
+fi
+
 # --- Route: multi-node vs single-node ---
 if is_multi_node "$commands"; then
   echo "--- Multi-node job detected"
@@ -295,6 +466,7 @@ else
   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
     --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    $RDMA_FLAGS \
     --network=host \
     --shm-size=16gb \
     --group-add "$render_gid" \

From 94029ffaf02f0b73e296e11cab721c23fd5a5f97 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Fri, 27 Feb 2026 23:55:28 -0600
Subject: [PATCH 113/154] [ROCm] Derive device capability from GCN arch string
 without CUDA init (#35069)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 vllm/platforms/rocm.py     | 111 ++++++++++++++++++++++++++++++++++---
 vllm/utils/system_utils.py |  16 ++++++
 2 files changed, 120 insertions(+), 7 deletions(-)

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index e867ebbd60ec..ab4c3e0740a9 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -6,6 +6,7 @@
 from functools import cache, lru_cache, wraps
 from typing import TYPE_CHECKING
 
+import regex as re
 import torch
 from torch.distributed import PrefixStore, ProcessGroup
 from torch.distributed.distributed_c10d import is_nccl_available
@@ -64,13 +65,29 @@
     "0x744c": "AMD_Radeon_RX7900XTX",
 }
 
-# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
-if "HIP_VISIBLE_DEVICES" in os.environ:
-    val = os.environ["HIP_VISIBLE_DEVICES"]
-    if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
-        assert val == cuda_val
-    else:
-        os.environ["CUDA_VISIBLE_DEVICES"] = val
+
+def _sync_hip_cuda_env_vars():
+    """Ensure HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES are consistent.
+    Treats empty string as unset. Raises on genuine conflicts."""
+    hip_val = os.environ.get("HIP_VISIBLE_DEVICES") or None
+    cuda_val = os.environ.get("CUDA_VISIBLE_DEVICES") or None
+
+    if hip_val is not None and cuda_val is not None:
+        if hip_val != cuda_val:
+            raise ValueError(
+                f"Inconsistent GPU visibility env vars: "
+                f"HIP_VISIBLE_DEVICES='{hip_val}' vs "
+                f"CUDA_VISIBLE_DEVICES='{cuda_val}'. "
+                f"Please set only one, or ensure they match."
+            )
+    elif hip_val is not None:
+        os.environ["CUDA_VISIBLE_DEVICES"] = hip_val
+    elif cuda_val is not None:
+        os.environ["HIP_VISIBLE_DEVICES"] = cuda_val
+
+
+# Sync at import time - catches misconfigurations from process start.
+_sync_hip_cuda_env_vars()
 
 # AMDSMI utils
 # Note that NVML is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`,
@@ -134,6 +151,77 @@ def _get_gcn_arch() -> str:
 _ON_GFX950 = "gfx950" in _GCN_ARCH
 
 
+def _capability_from_gcn_arch(gcn_arch: str) -> tuple[int, int] | None:
+    """
+    Parse (major, minor) from a GCN arch string, mirroring how
+    HIP derives hipDeviceProp_t.major / .minor.
+
+    Format: gfx<MAJOR><MINOR><STEPPING>
+      - 1-digit major  (gfx9xx):  "gfx" + M + m + stepping
+      - 2-digit major  (gfx1xxx): "gfx" + MM + m + stepping
+
+    Examples:
+      gfx90a  -> (9, 0)    gfx942  -> (9, 4)    gfx950 -> (9, 5)
+      gfx1100 -> (11, 0)   gfx1101 -> (11, 0)   gfx1200 -> (12, 0)
+
+    Returns None only when the string is not gfx-prefixed at all
+    (i.e. not a ROCm arch string). Raises on any string that looks
+    like a GCN arch but does not match a known layout.
+    """
+    m = re.match(r"gfx(\d+)", gcn_arch)
+    if not m:
+        # Not a gfx string at all — caller should fall back to torch.cuda
+        return None
+
+    digits = m.group(1)
+    n = len(digits)
+
+    if n < 2:
+        raise ValueError(
+            f"GCN arch '{gcn_arch}' has too few digits ({n}) after 'gfx' "
+            f"to derive a (major, minor) capability. "
+            f"Please file a vLLM issue with your GPU model."
+        )
+
+    if n in (2, 3):
+        # 1-digit major: gfx9 family
+        # len 2: major + minor          (e.g. gfx90 from gfx90a)
+        # len 3: major + minor + step   (e.g. gfx942)
+        major = int(digits[0])
+        minor = int(digits[1])
+    elif n == 4:
+        # 2-digit major: gfx10xx, gfx11xx, gfx12xx
+        # major(2) + minor(1) + stepping(1)
+        major = int(digits[:2])
+        minor = int(digits[2])
+    elif n >= 5:
+        raise ValueError(
+            f"GCN arch '{gcn_arch}' has {n} digits after 'gfx', which "
+            f"exceeds the known 4-digit layout (MMms). Cannot determine "
+            f"major/minor split unambiguously. "
+            f"Please file a vLLM issue with your GPU model."
+        )
+
+    if major < 9:
+        raise ValueError(
+            f"Parsed unknown ROCm architecture from GCN arch '{gcn_arch}': "
+            f"major={major}, minor={minor}. "
+            f"Major version < 9 is not expected for any supported AMD GPU. "
+            f"Please file a vLLM issue with your GPU model."
+        )
+
+    if major > 12:
+        raise ValueError(
+            f"Parsed unknown ROCm architecture from GCN arch '{gcn_arch}': "
+            f"major={major}, minor={minor}. "
+            f"Major version > 12 is beyond currently known AMD generations. "
+            f"Please file a vLLM issue with your GPU model so support "
+            f"can be added."
+        )
+
+    return (major, minor)
+
+
 def on_gfx1x() -> bool:
     return _ON_GFX1X
 
@@ -444,6 +532,15 @@ def set_device(cls, device: torch.device) -> None:
     @classmethod
     @lru_cache(maxsize=8)
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
+        cap = _capability_from_gcn_arch(_GCN_ARCH)
+        if cap is not None:
+            return DeviceCapability(major=cap[0], minor=cap[1])
+
+        logger.warning_once(
+            "Could not derive device capability from GCN arch '%s', "
+            "falling back to torch.cuda (this will initialize CUDA).",
+            _GCN_ARCH,
+        )
         major, minor = torch.cuda.get_device_capability(device_id)
         return DeviceCapability(major=major, minor=minor)
 
diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py
index 840056e8bef3..4bd5388796fe 100644
--- a/vllm/utils/system_utils.py
+++ b/vllm/utils/system_utils.py
@@ -16,6 +16,7 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.platforms.interface import in_wsl
 from vllm.ray.lazy_utils import is_in_ray_actor
 
@@ -111,6 +112,17 @@ def unique_filepath(fn: Callable[[int], Path]) -> Path:
 # Process management utilities
 
 
+def _sync_visible_devices_env_vars():
+    """Sync HIP/CUDA visibility env vars before spawning (ROCm only)."""
+
+    if not current_platform.is_rocm():
+        return
+
+    from vllm.platforms.rocm import _sync_hip_cuda_env_vars
+
+    _sync_hip_cuda_env_vars()
+
+
 def _maybe_force_spawn():
     """Check if we need to force the use of the `spawn` multiprocessing start
     method.
@@ -156,6 +168,10 @@ def get_mp_context():
     VLLM_WORKER_MULTIPROC_METHOD.
     """
     _maybe_force_spawn()
+    # (ROCm): Sync GPU visibility env vars so spawned children inherit
+    # consistent values. Must run after _maybe_force_spawn and regardless
+    # of whether spawn was already set.
+    _sync_visible_devices_env_vars()
     mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
     return multiprocessing.get_context(mp_method)
 

From f5d1281c9d1b96cb4f046f1ec2c53a525f319098 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Fri, 27 Feb 2026 23:57:31 -0600
Subject: [PATCH 114/154] [ROCm][CI] Expose tests to AMD production CI and fix
 amdsmi heap corruption (#35071)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/test-amd.yaml | 94 ++++++++++++++++++++++++----------------
 tests/utils.py           | 15 ++++---
 2 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 65701b78bce6..4c15e7382046 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -156,8 +156,9 @@ steps:
 
 - label: Entrypoints Integration Test (API Server 1) # 100min
   timeout_in_minutes: 130
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -173,8 +174,9 @@ steps:
 
 - label: Entrypoints Integration Test (API Server 2)
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -192,8 +194,9 @@ steps:
 
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -207,8 +210,9 @@ steps:
 
 - label: Entrypoints Integration Test (Responses API)
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -222,8 +226,9 @@ steps:
 
 - label: Distributed Tests (4 GPUs) # 35min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -285,8 +290,9 @@ steps:
 
 - label: Distributed Tests (8 GPUs) # 4min
   timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
+  optional: true
   # grade: Blocking
   gpu: h100
   num_gpus: 8
@@ -381,10 +387,11 @@ steps:
 
 - label: V1 Test e2e + engine # 65min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
   # See discussion here: https://github.com/vllm-project/vllm/pull/31040
   agent_pool: mi325_8
+  optional: true
   # grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -408,8 +415,9 @@ steps:
 
 - label: V1 Test others # 42min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -436,8 +444,9 @@ steps:
 # TODO: Add the "V1 Test attetion (MI300)" test group
 
 - label: V1 Test attention (H100) # 10min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   timeout_in_minutes: 30
   gpu: h100
@@ -541,8 +550,9 @@ steps:
 
 - label: Samplers Test # 56min
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - vllm/model_executor/layers
@@ -554,8 +564,9 @@ steps:
 
 - label: LoRA Test %N # 20min each
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - vllm/lora
@@ -665,8 +676,9 @@ steps:
 
 - label: Kernels Quantization Test %N # 64min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - csrc/quantization/
@@ -799,8 +811,9 @@ steps:
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - csrc/
@@ -861,8 +874,9 @@ steps:
 
 - label: Basic Models Tests (Other)
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -903,8 +917,9 @@ steps:
 
 - label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -924,8 +939,9 @@ steps:
 
 - label: Language Models Tests (Hybrid) %N
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -945,7 +961,7 @@ steps:
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -961,7 +977,7 @@ steps:
 
 - label: Language Models Test (PPL)
   timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -973,7 +989,7 @@ steps:
 
 - label: Language Models Test (Extended Pooling)  # 36min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -985,7 +1001,7 @@ steps:
 
 - label: Language Models Test (MTEB)
   timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -997,7 +1013,7 @@ steps:
 
 - label: Multi-Modal Processor Test (CPU)
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   source_file_dependencies:
   - vllm/
@@ -1009,7 +1025,7 @@ steps:
 
 - label: Multi-Modal Processor Test # 44min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -1021,7 +1037,7 @@ steps:
 
 - label: Multi-Modal Models Test (Standard) # 60min
   timeout_in_minutes: 100
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -1054,7 +1070,7 @@ steps:
 
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -1069,7 +1085,7 @@ steps:
 
 - label: Multi-Modal Models Test (Extended) 2 #60min
   timeout_in_minutes: 120
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -1084,7 +1100,7 @@ steps:
 
 - label: Multi-Modal Models Test (Extended) 3 # 75min
   timeout_in_minutes: 150
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -1109,7 +1125,7 @@ steps:
     - pytest -v -s models/quantization
 
 - label: Transformers Nightly Models Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   working_dir: "/vllm-workspace/"
@@ -1264,8 +1280,9 @@ steps:
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdmultinode]
+  mirror_hardwares: [amdexperimental, amdproduction, amdmultinode]
   agent_pool: mi325_4
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -1291,8 +1308,9 @@ steps:
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -1331,8 +1349,9 @@ steps:
 
 - label: Distributed Model Tests (2 GPUs) # 37min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -1442,7 +1461,7 @@ steps:
     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
 
 - label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1486,7 +1505,7 @@ steps:
 ##### A100 test #####
 
 - label: Distributed Tests (A100) # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   gpu: a100
@@ -1509,7 +1528,7 @@ steps:
 - label: LM Eval Large Models # optional
   gpu: a100
   optional: true
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   num_gpus: 4
@@ -1525,7 +1544,7 @@ steps:
 - label: LM Eval Large Models (H100) # optional
   gpu: h100
   optional: true
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   num_gpus: 4
@@ -1540,7 +1559,7 @@ steps:
 
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   gpu: h200
@@ -1600,8 +1619,9 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 - label: ROCm LM Eval Large Models (8 Card)
-  mirror_hardwares: [amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
+  optional: true
   num_gpus: 8
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   commands:
@@ -1660,7 +1680,7 @@ steps:
 
 - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   optional: true
diff --git a/tests/utils.py b/tests/utils.py
index d407733a358f..03e5ccadb6f7 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -65,6 +65,8 @@
 FP8_DTYPE = current_platform.fp8_dtype()
 
 if current_platform.is_rocm():
+    import threading
+
     from amdsmi import (
         amdsmi_get_gpu_vram_usage,
         amdsmi_get_processor_handles,
@@ -72,13 +74,16 @@
         amdsmi_shut_down,
     )
 
+    _amdsmi_lock = threading.Lock()
+
     @contextmanager
     def _nvml():
-        try:
-            amdsmi_init()
-            yield
-        finally:
-            amdsmi_shut_down()
+        with _amdsmi_lock:
+            try:
+                amdsmi_init()
+                yield
+            finally:
+                amdsmi_shut_down()
 elif current_platform.is_cuda():
     from vllm.third_party.pynvml import (
         nvmlDeviceGetHandleByIndex,

From 06254d4cbb79e0a406fbf4e18d739293d5470114 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sat, 28 Feb 2026 14:47:43 +0800
Subject: [PATCH 115/154] [CI] add trainer_send_weights for
 MockWeightTransferEngine (#35589)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 tests/entrypoints/weight_transfer/test_weight_transfer_llm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py b/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py
index cd13aca7e7df..255bca444f9d 100644
--- a/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py
+++ b/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py
@@ -90,6 +90,10 @@ def receive_weights(
     def shutdown(self) -> None:
         MockWeightTransferEngine.shutdown_called = True
 
+    def trainer_send_weights(self, *args, **kwargs):
+        """Mock method to simulate trainer sending weights."""
+        pass
+
 
 def mock_create_engine(config, parallel_config):
     """Mock factory function that returns our mock engine."""

From 57c86c07411606eb2ec523e19ec287833b2e7f66 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sat, 28 Feb 2026 14:51:35 +0800
Subject: [PATCH 116/154] [Misc] Change logging level from info to debug for
 tool parser import (#35575)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/tool_parsers/qwen3coder_tool_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py
index a3c79f865b15..dfe790ee752e 100644
--- a/vllm/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/tool_parsers/qwen3coder_tool_parser.py
@@ -82,7 +82,7 @@ def __init__(self, tokenizer: TokenizerLike):
                 "tokens in the tokenizer!"
             )
 
-        logger.info(
+        logger.debug(
             "vLLM Successfully import tool parser %s !", self.__class__.__name__
         )
 

From 24d6ea8afdb13ceee95b36645ba61a641f9a2f7f Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Feb 2026 15:31:55 +0800
Subject: [PATCH 117/154] [Benchmark] Rename SLA Finder to Workload Explorer
 (#35586)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/benchmarking/sweeps.md                   |  31 ++--
 docs/cli/bench/sweep/serve_sla.md             |   9 -
 docs/cli/bench/sweep/serve_workload.md        |   9 +
 docs/mkdocs/hooks/generate_argparse.py        |   8 +-
 vllm/benchmarks/sweep/cli.py                  |   6 +-
 .../sweep/{serve_sla.py => serve_workload.py} | 169 ++++++++++--------
 6 files changed, 125 insertions(+), 107 deletions(-)
 delete mode 100644 docs/cli/bench/sweep/serve_sla.md
 create mode 100644 docs/cli/bench/sweep/serve_workload.md
 rename vllm/benchmarks/sweep/{serve_sla.py => serve_workload.py} (61%)

diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index 5571db0a5a0a..156b9c0c07aa 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -102,36 +102,39 @@ By default, each parameter combination is benchmarked 3 times to make the result
 !!! tip
     You can use the `--resume` option to continue the parameter sweep if an unexpected error occurs, e.g., timeout when connecting to HF Hub.
 
-### SLA Scanner
+### Workload Explorer
 
-`vllm bench sweep serve_sla` is a variant of `vllm bench sweep serve` that scans through values of request rate or concurrency (choose using `--sla-variable`) in order to find the tradeoff between latency and throughput. The results can then be [visualized](#visualization) to determine the feasible SLAs.
+`vllm bench sweep serve_workload` is a variant of `vllm bench sweep serve` that explores different workload levels in order to find the tradeoff between latency and throughput. The results can also be [visualized](#visualization) to determine the feasible SLAs.
+
+The workload can be expressed in terms of request rate or concurrency (choose using `--workload-var`).
 
 Example command:
 
 ```bash
-vllm bench sweep serve_sla \
+vllm bench sweep serve_workload \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100' \
-    --sla-variable max_concurrency \
+    --workload-var max_concurrency \
     --serve-params benchmarks/serve_hparams.json \
-    --bench-params benchmarks/bench_hparams.json
+    --bench-params benchmarks/bench_hparams.json \
+    --num-runs 1 \
     -o benchmarks/results
 ```
 
-The algorithm for scanning through different values of `sla_variable` can be summarized as follows:
+The algorithm for exploring different workload levels can be summarized as follows:
 
-1. Run the benchmark by sending requests one at a time (serial inference). This results in the lowest possible latency and throughput.
-2. Run the benchmark by sending all requests at once (batch inference). This results in the highest possible latency and throughput.
-3. Estimate the maximum value of `sla_variable` that can be supported by the server without oversaturating it.
-4. Run the benchmark over intermediate values of `sla_variable` uniformly using the remaining iterations.
+1. Run the benchmark by sending requests one at a time (serial inference, lowest workload). This results in the lowest possible latency and throughput.
+2. Run the benchmark by sending all requests at once (batch inference, highest workload). This results in the highest possible latency and throughput.
+3. Estimate the value of `workload_var` corresponding to Step 2.
+4. Run the benchmark over intermediate values of `workload_var` uniformly using the remaining iterations.
 
-You can override the number of iterations in the algorithm by setting `--sla-iters`.
+You can override the number of iterations in the algorithm by setting `--workload-iters`.
 
 !!! tip
     This is our equivalent of [GuideLLM's `--profile sweep`](https://github.com/vllm-project/guidellm/blob/v0.5.3/src/guidellm/benchmark/profiles.py#L575).
 
-    In general, `--sla-variable max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
-    Nevertheless, we default to `--sla-variable request_rate` to maintain similar behavior as GuideLLM.
+    In general, `--workload-var max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
+    Nevertheless, we default to `--workload-var request_rate` to maintain similar behavior as GuideLLM.
 
 ## Startup Benchmark
 
@@ -198,7 +201,7 @@ vllm bench sweep startup \
 
 Control the variables to plot via `--var-x` and `--var-y`, optionally applying `--filter-by` and `--bin-by` to the values. The plot is organized according to `--fig-by`, `--row-by`, `--col-by`, and `--curve-by`.
 
-Example commands for visualizing [SLA Scanner](#sla-scanner) results:
+Example commands for visualizing [Workload Explorer](#workload-explorer) results:
 
 ```bash
 # Name of the directory that stores the results
diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md
deleted file mode 100644
index 688d64f0bc24..000000000000
--- a/docs/cli/bench/sweep/serve_sla.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# vllm bench sweep serve_sla
-
-## JSON CLI Arguments
-
---8<-- "docs/cli/json_tip.inc.md"
-
-## Arguments
-
---8<-- "docs/generated/argparse/bench_sweep_serve_sla.inc.md"
diff --git a/docs/cli/bench/sweep/serve_workload.md b/docs/cli/bench/sweep/serve_workload.md
new file mode 100644
index 000000000000..8c21788e8d93
--- /dev/null
+++ b/docs/cli/bench/sweep/serve_workload.md
@@ -0,0 +1,9 @@
+# vllm bench sweep serve_workload
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/generated/argparse/bench_sweep_serve_workload.inc.md"
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 801cc8a05d15..9d87f88f5666 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -100,8 +100,8 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
     "vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs"
 )
 bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
-bench_sweep_serve_sla = auto_mock(
-    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+bench_sweep_serve_workload = auto_mock(
+    "vllm.benchmarks.sweep.serve_workload", "SweepServeWorkloadArgs"
 )
 bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
@@ -229,7 +229,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
         "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
         "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
-        "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
+        "bench_sweep_serve_workload": create_parser(
+            bench_sweep_serve_workload.add_cli_args
+        ),
         "bench_throughput": create_parser(bench_throughput.add_cli_args),
     }
 
diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py
index a752000f943d..75549105fa97 100644
--- a/vllm/benchmarks/sweep/cli.py
+++ b/vllm/benchmarks/sweep/cli.py
@@ -10,14 +10,14 @@
 from .plot_pareto import main as plot_pareto_main
 from .serve import SweepServeArgs
 from .serve import main as serve_main
-from .serve_sla import SweepServeSLAArgs
-from .serve_sla import main as serve_sla_main
+from .serve_workload import SweepServeWorkloadArgs
+from .serve_workload import main as serve_workload_main
 from .startup import SweepStartupArgs
 from .startup import main as startup_main
 
 SUBCOMMANDS = (
     (SweepServeArgs, serve_main),
-    (SweepServeSLAArgs, serve_sla_main),
+    (SweepServeWorkloadArgs, serve_workload_main),
     (SweepStartupArgs, startup_main),
     (SweepPlotArgs, plot_main),
     (SweepPlotParetoArgs, plot_pareto_main),
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_workload.py
similarity index 61%
rename from vllm/benchmarks/sweep/serve_sla.py
rename to vllm/benchmarks/sweep/serve_workload.py
index 38d54ea42f5c..3da403a8416a 100644
--- a/vllm/benchmarks/sweep/serve_sla.py
+++ b/vllm/benchmarks/sweep/serve_workload.py
@@ -28,25 +28,32 @@
     pd = PlaceholderModule("pandas")
 
 
-SLAVariable = Literal["request_rate", "max_concurrency"]
+WorkloadVariable = Literal["request_rate", "max_concurrency"]
 
 
-def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable):
+def _estimate_workload_value(
+    run_data: dict[str, object],
+    workload_var: WorkloadVariable,
+):
     request_throughput = float(run_data["request_throughput"])  # type: ignore
-    if sla_variable == "request_rate":
+    if workload_var == "request_rate":
         return request_throughput
-    if sla_variable == "max_concurrency":
+    if workload_var == "max_concurrency":
         mean_latency_ms = float(run_data["mean_e2el_ms"])  # type: ignore
         return request_throughput * mean_latency_ms / 1000
 
-    assert_never(sla_variable)
+    assert_never(workload_var)
 
 
-def _estimate_sla_avg(runs: list[dict[str, object]], sla_variable: SLAVariable):
-    return sum(_estimate_sla_value(run, sla_variable) for run in runs) / len(runs)
+def _estimate_workload_avg(
+    runs: list[dict[str, object]],
+    workload_var: WorkloadVariable,
+):
+    total = sum(_estimate_workload_value(run, workload_var) for run in runs)
+    return total / len(runs)
 
 
-def run_comb_sla(
+def run_comb_workload(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
@@ -56,21 +63,21 @@ def run_comb_sla(
     num_runs: int,
     dry_run: bool,
     link_vars: list[tuple[str, str]],
-    sla_variable: SLAVariable,
-    sla_value: int,
+    workload_var: WorkloadVariable,
+    workload_value: int,
 ) -> list[dict[str, object]] | None:
-    bench_comb_sla = bench_comb | {sla_variable: sla_value}
+    bench_comb_workload = bench_comb | {workload_var: workload_value}
 
     return run_comb(
         server,
         bench_cmd,
         serve_comb=serve_comb,
-        bench_comb=bench_comb_sla,
+        bench_comb=bench_comb_workload,
         base_path=_get_comb_base_path(
             output_dir,
             serve_comb,
             bench_comb,
-            extra_parts=("SLA-", f"{sla_variable}={sla_value}"),
+            extra_parts=("WL-", f"{workload_var}={workload_value}"),
         ),
         num_runs=num_runs,
         dry_run=dry_run,
@@ -78,26 +85,26 @@ def run_comb_sla(
     )
 
 
-def explore_sla(
+def explore_comb_workloads(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
-    sla_variable: SLAVariable,
-    sla_iters: int,
+    workload_var: WorkloadVariable,
+    workload_iters: int,
     output_dir: Path,
     num_runs: int,
     dry_run: bool,
     link_vars: list[tuple[str, str]],
 ):
-    print("[SLA START]")
+    print("[WL START]")
     print(f"Serve parameters: {serve_comb.as_text() or '(None)'}")
     print(f"Bench parameters: {bench_comb.as_text() or '(None)'}")
-    print(f"Number of SLA iterations: {sla_iters}")
+    print(f"Number of workload iterations: {workload_iters}")
 
-    if sla_iters < 2:
-        raise ValueError("`sla_iters` should be at least 2")
+    if workload_iters < 2:
+        raise ValueError("`workload_iters` should be at least 2")
 
     dataset_size = DEFAULT_NUM_PROMPTS
     if "num_prompts" in bench_comb:
@@ -113,7 +120,7 @@ def explore_sla(
 
     print(f"Dataset size: {dataset_size}")
 
-    serial_comb_data = run_comb_sla(
+    serial_workload_data = run_comb_workload(
         server,
         bench_cmd,
         serve_comb=serve_comb,
@@ -122,10 +129,10 @@ def explore_sla(
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
-        sla_variable=sla_variable,
-        sla_value=1,
+        workload_var=workload_var,
+        workload_value=1,
     )
-    batch_comb_data = run_comb_sla(
+    batch_workload_data = run_comb_workload(
         server,
         bench_cmd,
         serve_comb=serve_comb,
@@ -134,32 +141,38 @@ def explore_sla(
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
-        sla_variable=sla_variable,
-        sla_value=dataset_size,
+        workload_var=workload_var,
+        workload_value=dataset_size,
     )
 
-    if serial_comb_data is None or batch_comb_data is None:
+    if serial_workload_data is None or batch_workload_data is None:
         if dry_run:
-            print("Omitting intermediate SLA iterations.")
-            print("[SLA END]")
+            print("Omitting intermediate Workload iterations.")
+            print("[WL END]")
 
         return
 
-    serial_sla_value = math.ceil(_estimate_sla_avg(serial_comb_data, sla_variable))
-    print(f"Serial inference: {sla_variable}={serial_sla_value}")
+    serial_workload_value = math.ceil(
+        _estimate_workload_avg(serial_workload_data, workload_var)
+    )
+    print(f"Serial inference: {workload_var}={serial_workload_value}")
 
-    batch_sla_value = math.floor(_estimate_sla_avg(batch_comb_data, sla_variable))
-    print(f"Batch inference: {sla_variable}={batch_sla_value}")
+    batch_workload_value = math.floor(
+        _estimate_workload_avg(batch_workload_data, workload_var)
+    )
+    print(f"Batch inference: {workload_var}={batch_workload_value}")
 
     # Avoid duplicated runs for intermediate values if the range between
-    # `serial_sla_value` and `batch_sla_value` is small
-    inter_sla_values = np.linspace(serial_sla_value, batch_sla_value, sla_iters)[1:-1]
-    inter_sla_values = sorted(set(map(round, inter_sla_values)))
-
-    inter_combs_data: list[dict[str, object]] = []
-    for inter_sla_value in inter_sla_values:
-        print(f"Exploring: {sla_variable}={inter_sla_value}")
-        inter_comb_data = run_comb_sla(
+    # `serial_workload_value` and `batch_workload_value` is small
+    inter_workload_values = np.linspace(
+        serial_workload_value, batch_workload_value, workload_iters
+    )[1:-1]
+    inter_workload_values = sorted(set(map(round, inter_workload_values)))
+
+    inter_workloads_data: list[dict[str, object]] = []
+    for inter_workload_value in inter_workload_values:
+        print(f"Exploring: {workload_var}={inter_workload_value}")
+        inter_workload_data = run_comb_workload(
             server,
             bench_cmd,
             serve_comb=serve_comb,
@@ -168,18 +181,18 @@ def explore_sla(
             num_runs=num_runs,
             dry_run=dry_run,
             link_vars=link_vars,
-            sla_variable=sla_variable,
-            sla_value=inter_sla_value,
+            workload_var=workload_var,
+            workload_value=inter_workload_value,
         )
-        if inter_comb_data is not None:
-            inter_combs_data.extend(inter_comb_data)
+        if inter_workload_data is not None:
+            inter_workloads_data.extend(inter_workload_data)
 
-    print("[SLA END]")
+    print("[WL END]")
 
-    return serial_comb_data + inter_combs_data + batch_comb_data
+    return serial_workload_data + inter_workloads_data + batch_workload_data
 
 
-def run_slas(
+def explore_combs_workloads(
     serve_cmd: list[str],
     bench_cmd: list[str],
     after_bench_cmd: list[str],
@@ -188,17 +201,17 @@ def run_slas(
     server_ready_timeout: int,
     serve_params: ParameterSweep,
     bench_params: ParameterSweep,
-    sla_variable: SLAVariable,
-    sla_iters: int,
+    workload_var: WorkloadVariable,
+    workload_iters: int,
     output_dir: Path,
     num_runs: int,
     dry_run: bool,
     link_vars: list[tuple[str, str]],
 ):
-    if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params):
+    if any(bench_comb.has_param(workload_var) for bench_comb in bench_params):
         raise ValueError(
-            f"You should not override `{sla_variable}` in `bench_params` in SLA mode, "
-            "since it is supposed to be determined automatically."
+            f"You should not override `{workload_var}` in `bench_params` "
+            "since it is supposed to be explored automatically."
         )
 
     all_data = list[dict[str, object]]()
@@ -214,13 +227,13 @@ def run_slas(
             dry_run=dry_run,
         ) as server:
             for bench_comb in bench_params:
-                comb_data = explore_sla(
+                comb_data = explore_comb_workloads(
                     server,
                     bench_cmd,
                     serve_comb=serve_comb,
                     bench_comb=bench_comb,
-                    sla_variable=sla_variable,
-                    sla_iters=sla_iters,
+                    workload_var=workload_var,
+                    workload_iters=workload_iters,
                     output_dir=output_dir,
                     num_runs=num_runs,
                     dry_run=dry_run,
@@ -240,13 +253,13 @@ def run_slas(
 
 
 @dataclass
-class SweepServeSLAArgs(SweepServeArgs):
-    sla_variable: SLAVariable
-    sla_iters: int
+class SweepServeWorkloadArgs(SweepServeArgs):
+    workload_var: WorkloadVariable
+    workload_iters: int
 
-    parser_name: ClassVar[str] = "serve_sla"
+    parser_name: ClassVar[str] = "serve_workload"
     parser_help: ClassVar[str] = (
-        "Explore the latency-throughput space for determining SLAs."
+        "Explore the latency-throughput tradeoff for different workload levels."
     )
 
     @classmethod
@@ -256,35 +269,35 @@ def from_cli_args(cls, args: argparse.Namespace):
 
         return cls(
             **asdict(base_args),
-            sla_variable=args.sla_variable,
-            sla_iters=args.sla_iters,
+            workload_var=args.workload_var,
+            workload_iters=args.workload_iters,
         )
 
     @classmethod
     def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser = super().add_cli_args(parser)
 
-        sla_group = parser.add_argument_group("sla options")
-        sla_group.add_argument(
-            "--sla-variable",
+        workload_group = parser.add_argument_group("workload options")
+        workload_group.add_argument(
+            "--workload-var",
             type=str,
-            choices=get_args(SLAVariable),
+            choices=get_args(WorkloadVariable),
             default="request_rate",
             help="The variable to adjust in each iteration.",
         )
-        sla_group.add_argument(
-            "--sla-iters",
+        workload_group.add_argument(
+            "--workload-iters",
             type=int,
             default=10,
-            help="Number of iterations used to explore the latency-throughput space. "
+            help="Number of workload levels to explore. "
             "This includes the first two iterations used to interpolate the value of "
-            "`sla_variable` for remaining iterations.",
+            "`workload_var` for remaining iterations.",
         )
 
         return parser
 
 
-def run_main(args: SweepServeSLAArgs):
+def run_main(args: SweepServeWorkloadArgs):
     timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
     output_dir = args.output_dir / timestamp
 
@@ -292,7 +305,7 @@ def run_main(args: SweepServeSLAArgs):
         raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")
 
     try:
-        return run_slas(
+        return explore_combs_workloads(
             serve_cmd=args.serve_cmd,
             bench_cmd=args.bench_cmd,
             after_bench_cmd=args.after_bench_cmd,
@@ -300,8 +313,8 @@ def run_main(args: SweepServeSLAArgs):
             server_ready_timeout=args.server_ready_timeout,
             serve_params=args.serve_params,
             bench_params=args.bench_params,
-            sla_variable=args.sla_variable,
-            sla_iters=args.sla_iters,
+            workload_var=args.workload_var,
+            workload_iters=args.workload_iters,
             output_dir=output_dir,
             num_runs=args.num_runs,
             dry_run=args.dry_run,
@@ -315,11 +328,11 @@ def run_main(args: SweepServeSLAArgs):
 
 
 def main(args: argparse.Namespace):
-    run_main(SweepServeSLAArgs.from_cli_args(args))
+    run_main(SweepServeWorkloadArgs.from_cli_args(args))
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=SweepServeSLAArgs.parser_help)
-    SweepServeSLAArgs.add_cli_args(parser)
+    parser = argparse.ArgumentParser(description=SweepServeWorkloadArgs.parser_help)
+    SweepServeWorkloadArgs.add_cli_args(parser)
 
     main(parser.parse_args())

From 06733eccc6aea99aceac060cfc5e032b91b29b9d Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Fri, 27 Feb 2026 23:42:16 -0800
Subject: [PATCH 118/154] update dockerfile

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 docker/Dockerfile.xpu | 60 +++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 14 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index b33bd19e1589..91dffefdedfb 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -119,20 +119,52 @@ RUN uv pip install -e tests/vllm_test_utils
 ARG UCX_VERSION=e5d98879705239d254ede40b4a52891850cb5349
 ARG NIXL_VERSION=0.7.0
 
-RUN set -eux; \
-    git clone https://github.com/openucx/ucx /tmp/ucx_source; \
-    cd /tmp/ucx_source; \
-    git checkout "${UCX_VERSION}"; \
-    bash autogen.sh; \
-    ./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt; \
-    make CFLAGS="-Wno-error=incompatible-pointer-types"; \
-    make install; \
-    git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source; \
-    cd /tmp/nixl_source; \
-    git checkout "${NIXL_VERSION}"
-
-ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:/tmp/nixl_install/lib/pkgconfig:${PKG_CONFIG_PATH}
-ENV LD_LIBRARY_PATH=/tmp/nixl_install/lib:${LD_LIBRARY_PATH}
+RUN apt-get update && apt-get install -y \
+    pciutils \
+    net-tools \
+    iproute2 \
+    hwloc \
+    numactl \
+    wget \
+    curl \
+    git \
+    build-essential 
+
+RUN apt-get install -y \
+    autoconf \
+    automake \
+    libtool \
+    pkg-config
+
+RUN apt-get update && apt-get install -y \
+    rdma-core \
+    libibverbs-dev \
+    ibverbs-utils \
+    libibverbs1 \
+    ibverbs-providers \
+    librdmacm-dev \
+    librdmacm1 \
+    libibumad-dev \
+    libibumad3 \
+    libibmad-dev \
+    libibmad5 \
+    infiniband-diags \
+    perftest \
+    ibutils \
+    libmlx5-1 \
+    libmlx4-1 \
+    ibverbs-providers \
+    librdmacm1t64
+
+RUN git clone https://github.com/openucx/ucx /tmp/ucx_source && \
+    cd /tmp/ucx_source && git checkout "${UCX_VERSION}" && \
+    bash autogen.sh && \
+    ./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt && \
+    make CFLAGS="-Wno-error=incompatible-pointer-types" -j8 && make install && \
+    git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source && \
+    cd /tmp/nixl_source && git checkout "${NIXL_VERSION}"
+
+ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:${PKG_CONFIG_PATH}
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     cd /tmp/nixl_source && \

From 4292e3b807a51507f60f43b3829b5e5e918f5b87 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Feb 2026 16:36:02 +0800
Subject: [PATCH 119/154] [Benchmark] Improve UX of sweep scripts (#35600)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/benchmarking/sweeps.md             |  24 +++--
 vllm/benchmarks/sweep/plot.py           |   8 +-
 vllm/benchmarks/sweep/plot_pareto.py    |   5 +-
 vllm/benchmarks/sweep/serve.py          | 125 +++++++++++++++---------
 vllm/benchmarks/sweep/serve_workload.py |  54 +++++-----
 vllm/benchmarks/sweep/startup.py        | 103 ++++++++++++-------
 6 files changed, 191 insertions(+), 128 deletions(-)

diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index 156b9c0c07aa..41a799cf2109 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -72,7 +72,7 @@ Follow these steps to run the script:
     ]
     ```
 
-5. Determine where you want to save the results, and pass that to `--output-dir`.
+5. Set `--output-dir` and optionally `--experiment-name` to control where to save the results.
 
 Example command:
 
@@ -82,7 +82,8 @@ vllm bench sweep serve \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
     --serve-params benchmarks/serve_hparams.json \
     --bench-params benchmarks/bench_hparams.json \
-    -o benchmarks/results
+    --output-dir benchmarks/results \
+    --experiment-name demo
 ```
 
 By default, each parameter combination is benchmarked 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
@@ -118,7 +119,8 @@ vllm bench sweep serve_workload \
     --serve-params benchmarks/serve_hparams.json \
     --bench-params benchmarks/bench_hparams.json \
     --num-runs 1 \
-    -o benchmarks/results
+    --output-dir benchmarks/results \
+    --experiment-name demo
 ```
 
 The algorithm for exploring different workload levels can be summarized as follows:
@@ -186,7 +188,8 @@ vllm bench sweep startup \
     --startup-cmd 'vllm bench startup --model Qwen/Qwen3-0.6B' \
     --serve-params benchmarks/serve_hparams.json \
     --startup-params benchmarks/startup_hparams.json \
-    -o benchmarks/results
+    --output-dir benchmarks/results \
+    --experiment-name demo
 ```
 
 !!! important
@@ -204,11 +207,10 @@ Control the variables to plot via `--var-x` and `--var-y`, optionally applying `
 Example commands for visualizing [Workload Explorer](#workload-explorer) results:
 
 ```bash
-# Name of the directory that stores the results
-TIMESTAMP=$1
+EXPERIMENT_DIR=${1:-"benchmarks/results/demo"}
 
 # Latency increases as the workload increases
-vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+vllm bench sweep plot $EXPERIMENT_DIR \
     --var-x max_concurrency \
     --var-y median_ttft_ms \
     --col-by _benchmark_name \
@@ -216,7 +218,7 @@ vllm bench sweep plot benchmarks/results/$TIMESTAMP \
     --fig-name latency_curve
 
 # Throughput saturates as workload increases
-vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+vllm bench sweep plot $EXPERIMENT_DIR \
     --var-x max_concurrency \
     --var-y total_token_throughput \
     --col-by _benchmark_name \
@@ -224,7 +226,7 @@ vllm bench sweep plot benchmarks/results/$TIMESTAMP \
     --fig-name throughput_curve
 
 # Tradeoff between latency and throughput
-vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+vllm bench sweep plot $EXPERIMENT_DIR \
     --var-x total_token_throughput \
     --var-y median_ttft_ms \
     --col-by _benchmark_name \
@@ -249,7 +251,9 @@ Higher concurrency or batch size can raise GPU efficiency (per-GPU), but can add
 Example:
 
 ```bash
-vllm bench sweep plot_pareto benchmarks/results/<timestamp> \
+EXPERIMENT_DIR=${1:-"benchmarks/results/demo"}
+
+vllm bench sweep plot_pareto $EXPERIMENT_DIR \
   --label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size
 ```
 
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 4f9184f9540b..156e18f697f0 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -499,7 +499,7 @@ class SweepPlotArgs:
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        output_dir = Path(args.OUTPUT_DIR)
+        output_dir = Path(args.EXPERIMENT_DIR)
         if not output_dir.exists():
             raise ValueError(f"No parameter sweep results under {output_dir}")
 
@@ -531,11 +531,9 @@ def from_cli_args(cls, args: argparse.Namespace):
     @classmethod
     def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument(
-            "OUTPUT_DIR",
+            "EXPERIMENT_DIR",
             type=str,
-            default="results",
-            help="The directory containing the results to plot, "
-            "i.e., the `--output-dir` argument to the parameter sweep script.",
+            help="The directory containing the sweep results to plot.",
         )
         parser.add_argument(
             "--fig-dir",
diff --git a/vllm/benchmarks/sweep/plot_pareto.py b/vllm/benchmarks/sweep/plot_pareto.py
index 3d17e4741c17..365e87f757d1 100644
--- a/vllm/benchmarks/sweep/plot_pareto.py
+++ b/vllm/benchmarks/sweep/plot_pareto.py
@@ -325,7 +325,7 @@ class SweepPlotParetoArgs:
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        output_dir = Path(args.OUTPUT_DIR)
+        output_dir = Path(args.EXPERIMENT_DIR)
         if not output_dir.exists():
             raise ValueError(f"No parameter sweep results under {output_dir}")
 
@@ -342,9 +342,8 @@ def from_cli_args(cls, args: argparse.Namespace):
     @classmethod
     def add_cli_args(cls, parser: argparse.ArgumentParser):
         parser.add_argument(
-            "OUTPUT_DIR",
+            "EXPERIMENT_DIR",
             type=str,
-            default="results",
             help="The directory containing the sweep results to plot.",
         )
         parser.add_argument(
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 4ab2dab5f077..f64006ee1023 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -4,6 +4,7 @@
 import contextlib
 import json
 import shlex
+from contextlib import contextmanager
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
@@ -135,7 +136,7 @@ def run_benchmark(
 
 
 def _get_comb_base_path(
-    output_dir: Path,
+    experiment_dir: Path,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
     *,
@@ -149,7 +150,7 @@ def _get_comb_base_path(
     if extra_parts:
         parts.extend(extra_parts)
 
-    return output_dir / sanitize_filename("-".join(parts))
+    return experiment_dir / sanitize_filename("-".join(parts))
 
 
 def _get_comb_run_path(base_path: Path, run_number: int | None):
@@ -162,10 +163,10 @@ def _get_comb_run_path(base_path: Path, run_number: int | None):
 def _comb_needs_server(
     serve_comb: ParameterSweepItem,
     bench_combs: ParameterSweep,
-    output_dir: Path,
+    experiment_dir: Path,
 ):
     for bench_comb in bench_combs:
-        base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
+        base_path = _get_comb_base_path(experiment_dir, serve_comb, bench_comb)
         if not _get_comb_run_path(base_path, run_number=None).exists():
             return True
 
@@ -179,11 +180,11 @@ def server_ctx(
     show_stdout: bool,
     serve_comb: ParameterSweepItem,
     bench_params: ParameterSweep,
-    output_dir: Path,
+    experiment_dir: Path,
     dry_run: bool,
     server_ready_timeout: int = 300,
 ):
-    if not _comb_needs_server(serve_comb, bench_params, output_dir):
+    if not _comb_needs_server(serve_comb, bench_params, experiment_dir):
         return contextlib.nullcontext()
 
     return run_server(
@@ -215,10 +216,10 @@ def run_comb(
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
+    link_vars: list[tuple[str, str]],
     base_path: Path,
     num_runs: int,
     dry_run: bool,
-    link_vars: list[tuple[str, str]],
 ):
     if not _comb_is_valid(serve_comb, bench_comb, link_vars):
         return None
@@ -257,10 +258,10 @@ def run_combs(
     server_ready_timeout: int,
     serve_params: ParameterSweep,
     bench_params: ParameterSweep,
-    output_dir: Path,
+    link_vars: list[tuple[str, str]],
+    experiment_dir: Path,
     num_runs: int,
     dry_run: bool,
-    link_vars: list[tuple[str, str]],
 ):
     all_data = list[dict[str, object]]()
     for serve_comb in serve_params:
@@ -270,22 +271,22 @@ def run_combs(
             show_stdout=show_stdout,
             serve_comb=serve_comb,
             bench_params=bench_params,
-            output_dir=output_dir,
+            experiment_dir=experiment_dir,
             dry_run=dry_run,
             server_ready_timeout=server_ready_timeout,
         ) as server:
             for bench_comb in bench_params:
-                base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
+                base_path = _get_comb_base_path(experiment_dir, serve_comb, bench_comb)
 
                 comb_data = run_comb(
                     server,
                     bench_cmd,
                     serve_comb=serve_comb,
                     bench_comb=bench_comb,
+                    link_vars=link_vars,
                     base_path=base_path,
                     num_runs=num_runs,
                     dry_run=dry_run,
-                    link_vars=link_vars,
                 )
 
                 if comb_data is not None:
@@ -295,7 +296,7 @@ def run_combs(
         return None
 
     combined_df = pd.DataFrame.from_records(all_data)
-    combined_df.to_csv(output_dir / "summary.csv")
+    combined_df.to_csv(experiment_dir / "summary.csv")
 
     return combined_df
 
@@ -309,11 +310,12 @@ class SweepServeArgs:
     server_ready_timeout: int
     serve_params: ParameterSweep
     bench_params: ParameterSweep
+    link_vars: list[tuple[str, str]]
     output_dir: Path
+    experiment_name: str
     num_runs: int
     dry_run: bool
-    resume: str | None
-    link_vars: list[tuple[str, str]]
+    resume: bool
 
     parser_name: ClassVar[str] = "serve"
     parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings."
@@ -340,6 +342,11 @@ def from_cli_args(cls, args: argparse.Namespace):
 
         link_vars = cls.parse_link_vars(args.link_vars)
 
+        if args.experiment_name:
+            experiment_name = args.experiment_name
+        else:
+            experiment_name = datetime.now().strftime("%Y%m%d_%H%M%S")
+
         num_runs = args.num_runs
         if num_runs < 1:
             raise ValueError("`num_runs` should be at least 1.")
@@ -351,11 +358,12 @@ def from_cli_args(cls, args: argparse.Namespace):
             show_stdout=args.show_stdout,
             serve_params=serve_params,
             bench_params=bench_params,
+            link_vars=link_vars,
             output_dir=Path(args.output_dir),
+            experiment_name=experiment_name,
             num_runs=num_runs,
             dry_run=args.dry_run,
             resume=args.resume,
-            link_vars=link_vars,
             server_ready_timeout=args.server_ready_timeout,
         )
 
@@ -392,6 +400,7 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
             default=300,
             help="Timeout in seconds to wait for the server to become ready.",
         )
+
         parser.add_argument(
             "--serve-params",
             type=str,
@@ -402,6 +411,16 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
             "If both `serve_params` and `bench_params` are given, "
             "this script will iterate over their Cartesian product.",
         )
+        parser.add_argument(
+            "--link-vars",
+            type=str,
+            default="",
+            help=(
+                "Comma-separated list of linked variables between serve and bench, "
+                "e.g. max_num_seqs=max_concurrency,max_model_len=random_input_len"
+            ),
+        )
+
         parser.add_argument(
             "--bench-params",
             type=str,
@@ -417,7 +436,15 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
             "--output-dir",
             type=str,
             default="results",
-            help="The directory to which results are written.",
+            help="The main directory to which results are written.",
+        )
+        parser.add_argument(
+            "-e",
+            "--experiment-name",
+            type=str,
+            default=None,
+            help="The name of this experiment (defaults to current timestamp). "
+            "Results will be stored under `output_dir/experiment_name`.",
         )
         parser.add_argument(
             "--num-runs",
@@ -433,21 +460,10 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         )
         parser.add_argument(
             "--resume",
-            type=str,
-            default=None,
-            help="Set this to the name of a directory under `output_dir` (which is a "
-            "timestamp) to resume a previous execution of this script, i.e., only run "
-            "parameter combinations for which there are still no output files.",
-        )
-
-        parser.add_argument(
-            "--link-vars",
-            type=str,
-            default="",
-            help=(
-                "Comma-separated list of linked variables between serve and bench, "
-                "e.g. max_num_seqs=max_concurrency,max_model_len=random_input_len"
-            ),
+            action="store_true",
+            help="Resume a previous execution of this script, i.e., only run "
+            "parameter combinations for which there are still no output files "
+            "under `output_dir/experiment_name`.",
         )
 
         return parser
@@ -462,33 +478,52 @@ def parse_link_vars(s: str) -> list[tuple[str, str]]:
             pairs.append((a.strip(), b.strip()))
         return pairs
 
+    def resolve_experiment_dir(self) -> Path:
+        experiment_dir = self.output_dir / self.experiment_name
 
-def run_main(args: SweepServeArgs):
-    timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
-    output_dir = args.output_dir / timestamp
+        if self.resume:
+            if not experiment_dir.exists():
+                raise ValueError(f"Cannot resume from non-existent {experiment_dir=}")
+        else:
+            if experiment_dir.exists():
+                raise ValueError(f"Cannot overwrite existing {experiment_dir=}")
+
+        return experiment_dir
 
-    if args.resume and not output_dir.exists():
-        raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")
+    @contextmanager
+    def run_ctx(self, experiment_dir: Path):
+        if self.dry_run:
+            yield
+            print(f"Experiment will be saved at: {experiment_dir}")
+            return
 
-    try:
+        try:
+            yield
+            print(f"Experiment has been saved at: {experiment_dir}")
+        except BaseException as exc:
+            raise RuntimeError(
+                "The script was terminated early. Use `--resume` "
+                "to continue the script from its last checkpoint."
+            ) from exc
+
+
+def run_main(args: SweepServeArgs):
+    experiment_dir = args.resolve_experiment_dir()
+
+    with args.run_ctx(experiment_dir):
         return run_combs(
             serve_cmd=args.serve_cmd,
             bench_cmd=args.bench_cmd,
+            link_vars=args.link_vars,
             after_bench_cmd=args.after_bench_cmd,
             show_stdout=args.show_stdout,
             server_ready_timeout=args.server_ready_timeout,
             serve_params=args.serve_params,
             bench_params=args.bench_params,
-            output_dir=output_dir,
+            experiment_dir=experiment_dir,
             num_runs=args.num_runs,
             dry_run=args.dry_run,
-            link_vars=args.link_vars,
         )
-    except BaseException as exc:
-        raise RuntimeError(
-            f"The script was terminated early. Use `--resume {timestamp}` "
-            f"to continue the script from its last checkpoint."
-        ) from exc
 
 
 def main(args: argparse.Namespace):
diff --git a/vllm/benchmarks/sweep/serve_workload.py b/vllm/benchmarks/sweep/serve_workload.py
index 3da403a8416a..ca7ba09a5334 100644
--- a/vllm/benchmarks/sweep/serve_workload.py
+++ b/vllm/benchmarks/sweep/serve_workload.py
@@ -3,7 +3,6 @@
 import argparse
 import math
 from dataclasses import asdict, dataclass
-from datetime import datetime
 from pathlib import Path
 from typing import ClassVar, Literal, get_args
 
@@ -59,10 +58,10 @@ def run_comb_workload(
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
-    output_dir: Path,
+    link_vars: list[tuple[str, str]],
+    experiment_dir: Path,
     num_runs: int,
     dry_run: bool,
-    link_vars: list[tuple[str, str]],
     workload_var: WorkloadVariable,
     workload_value: int,
 ) -> list[dict[str, object]] | None:
@@ -73,15 +72,15 @@ def run_comb_workload(
         bench_cmd,
         serve_comb=serve_comb,
         bench_comb=bench_comb_workload,
+        link_vars=link_vars,
         base_path=_get_comb_base_path(
-            output_dir,
+            experiment_dir,
             serve_comb,
             bench_comb,
             extra_parts=("WL-", f"{workload_var}={workload_value}"),
         ),
         num_runs=num_runs,
         dry_run=dry_run,
-        link_vars=link_vars,
     )
 
 
@@ -91,12 +90,12 @@ def explore_comb_workloads(
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
+    link_vars: list[tuple[str, str]],
     workload_var: WorkloadVariable,
     workload_iters: int,
-    output_dir: Path,
+    experiment_dir: Path,
     num_runs: int,
     dry_run: bool,
-    link_vars: list[tuple[str, str]],
 ):
     print("[WL START]")
     print(f"Serve parameters: {serve_comb.as_text() or '(None)'}")
@@ -125,10 +124,10 @@ def explore_comb_workloads(
         bench_cmd,
         serve_comb=serve_comb,
         bench_comb=bench_comb | {"max_concurrency": 1},
-        output_dir=output_dir,
+        link_vars=link_vars,
+        experiment_dir=experiment_dir,
         num_runs=num_runs,
         dry_run=dry_run,
-        link_vars=link_vars,
         workload_var=workload_var,
         workload_value=1,
     )
@@ -137,10 +136,10 @@ def explore_comb_workloads(
         bench_cmd,
         serve_comb=serve_comb,
         bench_comb=bench_comb | {"max_concurrency": dataset_size},
-        output_dir=output_dir,
+        link_vars=link_vars,
+        experiment_dir=experiment_dir,
         num_runs=num_runs,
         dry_run=dry_run,
-        link_vars=link_vars,
         workload_var=workload_var,
         workload_value=dataset_size,
     )
@@ -177,10 +176,10 @@ def explore_comb_workloads(
             bench_cmd,
             serve_comb=serve_comb,
             bench_comb=bench_comb,
-            output_dir=output_dir,
+            link_vars=link_vars,
+            experiment_dir=experiment_dir,
             num_runs=num_runs,
             dry_run=dry_run,
-            link_vars=link_vars,
             workload_var=workload_var,
             workload_value=inter_workload_value,
         )
@@ -201,12 +200,12 @@ def explore_combs_workloads(
     server_ready_timeout: int,
     serve_params: ParameterSweep,
     bench_params: ParameterSweep,
+    link_vars: list[tuple[str, str]],
     workload_var: WorkloadVariable,
     workload_iters: int,
-    output_dir: Path,
+    experiment_dir: Path,
     num_runs: int,
     dry_run: bool,
-    link_vars: list[tuple[str, str]],
 ):
     if any(bench_comb.has_param(workload_var) for bench_comb in bench_params):
         raise ValueError(
@@ -223,7 +222,7 @@ def explore_combs_workloads(
             server_ready_timeout=server_ready_timeout,
             serve_comb=serve_comb,
             bench_params=bench_params,
-            output_dir=output_dir,
+            experiment_dir=experiment_dir,
             dry_run=dry_run,
         ) as server:
             for bench_comb in bench_params:
@@ -232,12 +231,12 @@ def explore_combs_workloads(
                     bench_cmd,
                     serve_comb=serve_comb,
                     bench_comb=bench_comb,
+                    link_vars=link_vars,
                     workload_var=workload_var,
                     workload_iters=workload_iters,
-                    output_dir=output_dir,
+                    experiment_dir=experiment_dir,
                     num_runs=num_runs,
                     dry_run=dry_run,
-                    link_vars=link_vars,
                 )
 
                 if comb_data is not None:
@@ -247,7 +246,7 @@ def explore_combs_workloads(
         return None
 
     combined_df = pd.DataFrame.from_records(all_data)
-    combined_df.to_csv(output_dir / "summary.csv")
+    combined_df.to_csv(experiment_dir / "summary.csv")
 
     return combined_df
 
@@ -298,13 +297,9 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
 
 
 def run_main(args: SweepServeWorkloadArgs):
-    timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
-    output_dir = args.output_dir / timestamp
-
-    if args.resume and not output_dir.exists():
-        raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")
+    experiment_dir = args.resolve_experiment_dir()
 
-    try:
+    with args.run_ctx(experiment_dir):
         return explore_combs_workloads(
             serve_cmd=args.serve_cmd,
             bench_cmd=args.bench_cmd,
@@ -313,18 +308,13 @@ def run_main(args: SweepServeWorkloadArgs):
             server_ready_timeout=args.server_ready_timeout,
             serve_params=args.serve_params,
             bench_params=args.bench_params,
+            link_vars=args.link_vars,
             workload_var=args.workload_var,
             workload_iters=args.workload_iters,
-            output_dir=output_dir,
+            experiment_dir=experiment_dir,
             num_runs=args.num_runs,
             dry_run=args.dry_run,
-            link_vars=args.link_vars,
         )
-    except BaseException as exc:
-        raise RuntimeError(
-            f"The script was terminated early. Use `--resume {timestamp}` "
-            f"to continue the script from its last checkpoint."
-        ) from exc
 
 
 def main(args: argparse.Namespace):
diff --git a/vllm/benchmarks/sweep/startup.py b/vllm/benchmarks/sweep/startup.py
index b4d979b1609e..6f5217ed328d 100644
--- a/vllm/benchmarks/sweep/startup.py
+++ b/vllm/benchmarks/sweep/startup.py
@@ -4,6 +4,7 @@
 import json
 import shlex
 import subprocess
+from contextlib import contextmanager
 from dataclasses import dataclass
 from datetime import datetime
 from functools import lru_cache
@@ -111,7 +112,7 @@ def _apply_output_json(cmd: list[str], output_path: Path) -> list[str]:
 
 
 def _get_comb_base_path(
-    output_dir: Path,
+    experiment_dir: Path,
     serve_comb: ParameterSweepItem,
     startup_comb: ParameterSweepItem,
 ) -> Path:
@@ -120,7 +121,8 @@ def _get_comb_base_path(
         parts.extend(("SERVE-", serve_comb.name))
     if startup_comb:
         parts.extend(("STARTUP-", startup_comb.name))
-    return output_dir / sanitize_filename("-".join(parts))
+
+    return experiment_dir / sanitize_filename("-".join(parts))
 
 
 def _get_comb_run_path(base_path: Path, run_number: int | None) -> Path:
@@ -225,7 +227,7 @@ def run_combs(
     *,
     serve_params: ParameterSweep,
     startup_params: ParameterSweep,
-    output_dir: Path,
+    experiment_dir: Path,
     num_runs: int,
     show_stdout: bool,
     dry_run: bool,
@@ -233,7 +235,7 @@ def run_combs(
     all_data = list[dict[str, object]]()
     for serve_comb in serve_params:
         for startup_comb in startup_params:
-            base_path = _get_comb_base_path(output_dir, serve_comb, startup_comb)
+            base_path = _get_comb_base_path(experiment_dir, serve_comb, startup_comb)
             comb_data = run_comb(
                 startup_cmd,
                 serve_comb=serve_comb,
@@ -250,7 +252,7 @@ def run_combs(
         return None
 
     combined_df = pd.DataFrame.from_records(all_data)
-    combined_df.to_csv(output_dir / "summary.csv")
+    combined_df.to_csv(experiment_dir / "summary.csv")
     return combined_df
 
 
@@ -260,11 +262,11 @@ class SweepStartupArgs:
     serve_params: ParameterSweep
     startup_params: ParameterSweep
     output_dir: Path
+    experiment_name: str
     num_runs: int
     show_stdout: bool
     dry_run: bool
-    resume: str | None
-    strict_params: bool
+    resume: bool
 
     parser_name: ClassVar[str] = "startup"
     parser_help: ClassVar[str] = (
@@ -286,13 +288,19 @@ def from_cli_args(cls, args: argparse.Namespace):
             startup_params = ParameterSweep.from_records([{}])
 
         supported = _get_supported_startup_keys()
+        strict_params = args.strict_params
         serve_params = _filter_params(
-            serve_params, supported=supported, strict=args.strict_params
+            serve_params, supported=supported, strict=strict_params
         )
         startup_params = _filter_params(
-            startup_params, supported=supported, strict=args.strict_params
+            startup_params, supported=supported, strict=strict_params
         )
 
+        if args.experiment_name:
+            experiment_name = args.experiment_name
+        else:
+            experiment_name = datetime.now().strftime("%Y%m%d_%H%M%S")
+
         if args.num_runs < 1:
             raise ValueError("`num_runs` should be at least 1.")
 
@@ -301,11 +309,11 @@ def from_cli_args(cls, args: argparse.Namespace):
             serve_params=serve_params,
             startup_params=startup_params,
             output_dir=Path(args.output_dir),
+            experiment_name=experiment_name,
             num_runs=args.num_runs,
             show_stdout=args.show_stdout,
             dry_run=args.dry_run,
             resume=args.resume,
-            strict_params=args.strict_params,
         )
 
     @classmethod
@@ -316,6 +324,7 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
             default="vllm bench startup",
             help="The command used to run the startup benchmark.",
         )
+
         parser.add_argument(
             "--serve-params",
             type=str,
@@ -331,12 +340,27 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
             help="Path to JSON file containing parameter combinations "
             "for the `vllm bench startup` command.",
         )
+        parser.add_argument(
+            "--strict-params",
+            action="store_true",
+            help="If set, unknown parameters in sweep files raise an error "
+            "instead of being ignored.",
+        )
+
         parser.add_argument(
             "-o",
             "--output-dir",
             type=str,
             default="results",
-            help="The directory to which results are written.",
+            help="The main directory to which results are written.",
+        )
+        parser.add_argument(
+            "-e",
+            "--experiment-name",
+            type=str,
+            default=None,
+            help="The name of this experiment (defaults to current timestamp). "
+            "Results will be stored under `output_dir/experiment_name`.",
         )
         parser.add_argument(
             "--num-runs",
@@ -357,43 +381,56 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         )
         parser.add_argument(
             "--resume",
-            type=str,
-            default=None,
-            help="Set this to the name of a directory under `output_dir` (which is a "
-            "timestamp) to resume a previous execution of this script, i.e., only run "
-            "parameter combinations for which there are still no output files.",
-        )
-        parser.add_argument(
-            "--strict-params",
             action="store_true",
-            help="If set, unknown parameters in sweep files raise an error "
-            "instead of being ignored.",
+            help="Resume a previous execution of this script, i.e., only run "
+            "parameter combinations for which there are still no output files "
+            "under `output_dir/experiment_name`.",
         )
+
         return parser
 
+    def resolve_experiment_dir(self) -> Path:
+        experiment_dir = self.output_dir / self.experiment_name
 
-def run_main(args: SweepStartupArgs):
-    timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
-    output_dir = args.output_dir / timestamp
+        if self.resume:
+            if not experiment_dir.exists():
+                raise ValueError(f"Cannot resume from non-existent {experiment_dir=}")
+        else:
+            if experiment_dir.exists():
+                raise ValueError(f"Cannot overwrite existing {experiment_dir=}")
+
+        return experiment_dir
+
+    @contextmanager
+    def run_ctx(self, experiment_dir: Path):
+        if self.dry_run:
+            yield
+            print(f"Experiment will be saved at: {experiment_dir}")
+            return
 
-    if args.resume and not output_dir.exists():
-        raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")
+        try:
+            yield
+            print(f"Experiment has been saved at: {experiment_dir}")
+        except BaseException as exc:
+            raise RuntimeError(
+                "The script was terminated early. Use `--resume` "
+                "to continue the script from its last checkpoint."
+            ) from exc
+
+
+def run_main(args: SweepStartupArgs):
+    experiment_dir = args.resolve_experiment_dir()
 
-    try:
+    with args.run_ctx(experiment_dir):
         return run_combs(
             startup_cmd=args.startup_cmd,
             serve_params=args.serve_params,
             startup_params=args.startup_params,
-            output_dir=output_dir,
+            experiment_dir=experiment_dir,
             num_runs=args.num_runs,
             show_stdout=args.show_stdout,
             dry_run=args.dry_run,
         )
-    except BaseException as exc:
-        raise RuntimeError(
-            f"The script was terminated early. Use `--resume {timestamp}` "
-            f"to continue the script from its last checkpoint."
-        ) from exc
 
 
 def main(args: argparse.Namespace):

From 1e69c048877335e92720772cac704650ad99b219 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 28 Feb 2026 02:59:26 -0600
Subject: [PATCH 120/154] [ROCm][CI] Parametrize vision score tests across
 attention backends with per-backend tolerances (#35571)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .../pooling/score/test_online_score_vision.py | 193 ++++++++++++++----
 1 file changed, 153 insertions(+), 40 deletions(-)

diff --git a/tests/entrypoints/pooling/score/test_online_score_vision.py b/tests/entrypoints/pooling/score/test_online_score_vision.py
index 9e9bc3fec881..bd53153c33fc 100644
--- a/tests/entrypoints/pooling/score/test_online_score_vision.py
+++ b/tests/entrypoints/pooling/score/test_online_score_vision.py
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import json
+
 import pytest
 import requests
 
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
 from vllm.multimodal.utils import encode_image_url, fetch_image
+from vllm.platforms import current_platform
 
 MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
 HF_OVERRIDES = {
@@ -15,6 +18,60 @@
     "is_original_qwen3_reranker": True,
 }
 
+ROCM_ATTN_BACKENDS = [
+    "ROCM_ATTN",
+    "ROCM_AITER_FA",
+    "TRITON_ATTN",
+    "FLEX_ATTENTION",
+]
+
+ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else []
+
+# Per-backend tolerance with explicit entries; "default" is the fallback
+BACKEND_TOL: dict[str, float] = {
+    "default": 0.05,  # 5% tolerance for other backends (e.g. FLASH_ATTN)
+    # Relaxed tolerances for ROCm attn
+    # See: https://github.com/vllm-project/vllm/issues/35569
+    "ROCM_ATTN": 0.09,  # gfx950:~8.45%, gfx942:~3.70%
+    "ROCM_AITER_FA": 0.045,  # gfx950:~2.00%, gfx942:~0.80%
+    "TRITON_ATTN": 0.045,  # gfx950:~3.00%, gfx942:~2.20%
+    "FLEX_ATTENTION": 0.045,  # gfx950:~3.25%, gfx942:~1.10%
+}
+
+# ROCm: disable skinny GEMM to avoid non-deterministic results from
+# atomic reductions in wvSplitKrc kernel.
+# See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3906083975
+ROCM_ENV_OVERRIDES = (
+    {"VLLM_ROCM_USE_SKINNY_GEMM": "0"} if current_platform.is_rocm() else {}
+)
+# ROCm: disable prefix caching and eliminate batch variance to reduce
+# test flakiness.
+ROCM_EXTRA_ARGS = (
+    ["--no-enable-prefix-caching", "--max-num-seqs", "1"]
+    if current_platform.is_rocm()
+    else []
+)
+
+
+def get_tol(backend: str) -> float:
+    return BACKEND_TOL.get(backend, BACKEND_TOL["default"])
+
+
+def assert_score(actual: float, expected: float, backend: str, label: str):
+    tol = get_tol(backend)
+    diff = abs(actual - expected)
+    rel_diff = diff / abs(expected) if expected != 0 else diff
+    print(
+        f"[{backend}] {label}: actual={actual:.6f} expected={expected:.6f} "
+        f"diff={diff:.6f} rel_diff={rel_diff:.4f} tol={tol}"
+    )
+    assert actual == pytest.approx(expected, rel=tol), (
+        f"[{backend}] {label}: score mismatch — "
+        f"actual={actual:.6f}, expected={expected:.6f}, "
+        f"rel_diff={rel_diff:.4f}, tol={tol}"
+    )
+
+
 query = "A cat standing in the snow."
 document = "This product was excellent and exceeded my expectations."
 image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
@@ -36,28 +93,37 @@
 TEXT_VS_TEXT = 0.10040374100208282
 TEXT_VS_IMAGE = 0.7423753142356873
 TEXT_VS_TEXT_PLUS_IMAGE = 0.5298863053321838
-TOL = 0.05
 
 
-@pytest.fixture(scope="module")
-def server():
+@pytest.fixture(scope="module", params=ATTN_BACKENDS)
+def server(request):
+    backend = request.param
+    print(f"\n=== Starting server with attention backend: {backend} ===")
     args = [
         "--enforce-eager",
         "--max-model-len",
         "8192",
         "--chat-template",
         str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"),
-    ]
+        "--attention-config",
+        json.dumps({"backend": backend}),
+    ] + ROCM_EXTRA_ARGS
+
+    env = dict(ROCM_ENV_OVERRIDES)
+    if backend != "ROCM_AITER_FA":
+        env["VLLM_ROCM_USE_AITER"] = "0"
 
     with RemoteOpenAIServer(
-        MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
+        MODEL_NAME, args, override_hf_configs=HF_OVERRIDES, env_dict=env
     ) as remote_server:
-        yield remote_server
+        print(f"=== Server ready with backend: {backend} ===")
+        yield remote_server, backend
 
 
-def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
+def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, str]):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": query,
@@ -71,12 +137,15 @@ def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
     assert score.data is not None
     assert len(score.data) == 1
     assert score.usage.prompt_tokens == 81
-    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
+    assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text")
 
 
-def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
+def test_score_api_queries_str_documents_text_content(
+    server: tuple[RemoteOpenAIServer, str],
+):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": query,
@@ -90,12 +159,15 @@ def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer
     assert score.data is not None
     assert len(score.data) == 1
     assert score.usage.prompt_tokens == 81
-    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
+    assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text")
 
 
-def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
+def test_score_api_queries_str_documents_image_url_content(
+    server: tuple[RemoteOpenAIServer, str],
+):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": query,
@@ -109,14 +181,15 @@ def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIS
     assert score.data is not None
     assert len(score.data) == 1
     assert score.usage.prompt_tokens == 98
-    assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
+    assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image")
 
 
 def test_score_api_queries_str_documents_image_base64_content(
-    server: RemoteOpenAIServer,
+    server: tuple[RemoteOpenAIServer, str],
 ):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": query,
@@ -130,14 +203,15 @@ def test_score_api_queries_str_documents_image_base64_content(
     assert score.data is not None
     assert len(score.data) == 1
     assert score.usage.prompt_tokens == 98
-    assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
+    assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image_base64")
 
 
 def test_score_api_queries_str_documents_image_url_plus_text_content(
-    server: RemoteOpenAIServer,
+    server: tuple[RemoteOpenAIServer, str],
 ):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": query,
@@ -151,12 +225,17 @@ def test_score_api_queries_str_documents_image_url_plus_text_content(
     assert score.data is not None
     assert len(score.data) == 1
     assert score.usage.prompt_tokens == 108
-    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
+    assert_score(
+        score.data[0].score, TEXT_VS_TEXT_PLUS_IMAGE, backend, "text_vs_text_plus_image"
+    )
 
 
-def test_score_api_queries_str_documents_list(server: RemoteOpenAIServer):
+def test_score_api_queries_str_documents_list(
+    server: tuple[RemoteOpenAIServer, str],
+):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": query,
@@ -175,15 +254,23 @@ def test_score_api_queries_str_documents_list(server: RemoteOpenAIServer):
     assert score.data is not None
     assert len(score.data) == 4
     assert score.usage.prompt_tokens == 368
-    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
-    assert score.data[1].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
-    assert score.data[2].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
-    assert score.data[3].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
+    assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "list[0]_text_vs_text")
+    assert_score(score.data[1].score, TEXT_VS_TEXT, backend, "list[1]_text_vs_text")
+    assert_score(score.data[2].score, TEXT_VS_IMAGE, backend, "list[2]_text_vs_image")
+    assert_score(
+        score.data[3].score,
+        TEXT_VS_TEXT_PLUS_IMAGE,
+        backend,
+        "list[3]_text_vs_text_plus_image",
+    )
 
 
-def test_rerank_api_queries_str_documents_list(server: RemoteOpenAIServer):
+def test_rerank_api_queries_str_documents_list(
+    server: tuple[RemoteOpenAIServer, str],
+):
+    remote_server, backend = server
     rerank_response = requests.post(
-        server.url_for("rerank"),
+        remote_server.url_for("rerank"),
         json={
             "model": MODEL_NAME,
             "query": query,
@@ -204,17 +291,38 @@ def test_rerank_api_queries_str_documents_list(server: RemoteOpenAIServer):
     assert len(rerank.results) == 4
 
     rerank.results.sort(key=lambda x: x.index)
-    assert rerank.results[0].relevance_score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
-    assert rerank.results[1].relevance_score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
-    assert rerank.results[2].relevance_score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
-    assert rerank.results[3].relevance_score == pytest.approx(
-        TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL
+    assert_score(
+        rerank.results[0].relevance_score,
+        TEXT_VS_TEXT,
+        backend,
+        "rerank[0]_text_vs_text",
+    )
+    assert_score(
+        rerank.results[1].relevance_score,
+        TEXT_VS_TEXT,
+        backend,
+        "rerank[1]_text_vs_text",
+    )
+    assert_score(
+        rerank.results[2].relevance_score,
+        TEXT_VS_IMAGE,
+        backend,
+        "rerank[2]_text_vs_image",
+    )
+    assert_score(
+        rerank.results[3].relevance_score,
+        TEXT_VS_TEXT_PLUS_IMAGE,
+        backend,
+        "rerank[3]_text_vs_text_plus_image",
     )
 
 
-def test_score_api_queries_list_documents_list(server: RemoteOpenAIServer):
+def test_score_api_queries_list_documents_list(
+    server: tuple[RemoteOpenAIServer, str],
+):
+    remote_server, backend = server
     score_response = requests.post(
-        server.url_for("score"),
+        remote_server.url_for("score"),
         json={
             "model": MODEL_NAME,
             "queries": [query] * 4,
@@ -233,7 +341,12 @@ def test_score_api_queries_list_documents_list(server: RemoteOpenAIServer):
     assert score.data is not None
     assert len(score.data) == 4
     assert score.usage.prompt_tokens == 368
-    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
-    assert score.data[1].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
-    assert score.data[2].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
-    assert score.data[3].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
+    assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "paired[0]_text_vs_text")
+    assert_score(score.data[1].score, TEXT_VS_TEXT, backend, "paired[1]_text_vs_text")
+    assert_score(score.data[2].score, TEXT_VS_IMAGE, backend, "paired[2]_text_vs_image")
+    assert_score(
+        score.data[3].score,
+        TEXT_VS_TEXT_PLUS_IMAGE,
+        backend,
+        "paired[3]_text_vs_text_plus_image",
+    )

From 7600642eaead7454fd977dde3513682244109e7c Mon Sep 17 00:00:00 2001
From: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com>
Date: Sat, 28 Feb 2026 01:02:05 -0800
Subject: [PATCH 121/154] Add padding support to wvSplitK solution for skinny
 GEMMs (#33762)

Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
---
 csrc/rocm/skinny_gemms.cu                     | 658 +++++++-----------
 .../quantization/test_rocm_skinny_gemms.py    |  74 +-
 vllm/model_executor/layers/utils.py           |   1 -
 3 files changed, 289 insertions(+), 444 deletions(-)

diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index 15ebcc776ad7..19bb324bdcd5 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -304,8 +304,9 @@ __device__ inline unsigned int min__(uint32_t a, uint32_t b) {
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitK_hf_sml_(const int K, const int M, const int Bx, const int By,
-                     const scalar_t* B, const scalar_t* __restrict__ A,
+    wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap, const int M,
+                     const int Bx, const int By, const scalar_t* B,
+                     const scalar_t* __restrict__ A,
                      const scalar_t* __restrict__ BIAS, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE / 2;
@@ -314,7 +315,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #else
   constexpr bool use_mfma = false;
   #endif
-
   using scalar8 =
       __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float;
   using half4 =
@@ -346,13 +346,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // - Then the WG will move to another 8 K elements
   // TODO: Logic below will only work when K is multiple of 8
   //----------------------------------------------------
-  for (uint32_t k = 0; k < min__(K * N, max_lds_len);
-       k += THRDS * WvPrGrp * A_CHUNK) {
-    uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
-
-    if (k_in >= min__(K * N, max_lds_len)) break;
-
-    *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
+  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
+       k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+  #if defined(__gfx950__)
+    __builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
+  #else
+    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
+  #endif
   }
   __syncthreads();
 
@@ -360,9 +360,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 
   uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
 
-  float sum[N][YTILE];
-  scalar8 sum4[N][YTILE];
-
   //----------------------------------------------------
   // Each wave works on a single column of weight matrix.
   // There are 16 waves per WG, and hence, each WG is
@@ -386,44 +383,20 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     // YTILE represents how many column of weight matrix
     // are being worked on by each wave.
     //----------------------------------------------------
-    for (int i = 0; i < YTILE; i++)
-      for (int n = 0; n < N; n++)
-        if constexpr (!use_mfma)
-          sum[n][i] = 0;
-        else
-          sum4[n][i] = {0, 0, 0, 0};
-
-    bigType bigA[N][UNRL];
-    bigType bigB[YTILE][UNRL];
-    //----------------------------------------------------
-    // Fetch weight matrix B in interleaved K-split!
-    // - Each thread (lane) is fetching 8 elements (A_Chunk)
-    // - Each wave will fetch 64*8=> 512 elements (1024B)
-    // - YTILE represents the number of column being serviced
-    //   by wave
-    // - Loop for fetching weight matrix (B) are unrolled
-    //
-    // Fetch activation matrix A from LDS
-    // - Loop for fetching activation matrix (A) are unrolled
-    //
-    // Finally, do the matrix multiplication in an unrolled
-    // fashion. This provides lot of food for compiler
-    // scheduling.
-    //
-    // TODO: Logic below will only work when K is multiple of 8
-    //----------------------------------------------------
-    // for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
+    float sum[N][YTILE] = {};
+    scalar8 sum4[N][YTILE] = {};
+
     for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
+      bigType bigA[N][UNRL] = {};
+      bigType bigB[YTILE][UNRL];
       // Fetch the weight matrix from memory!
   #pragma unroll
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
         uint32_t k = k1 + k2 * THRDS * A_CHUNK;
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-
-        const scalar_t* B_ = &B[(m + 0) * K + k_];
+        const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
         for (int y = 0; y < YTILE; y++)
-          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * K])));
+          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
       }
 
       // Fetch activation matrix from either just LDS or from both LDS / memory
@@ -432,33 +405,20 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         uint32_t k = k1 + k2 * THRDS * A_CHUNK;
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
         if (k_ >= K) break;
-
-        // Fetch A activation matrix in interleaved fashion from LDS or memory
-
         for (int n = 0; n < N; n++) {
-          bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
+          bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
         }
       }
 
       // Do the matrix multiplication in interleaved manner
-  #pragma unroll
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-        uint32_t k = k1 + k2 * THRDS * A_CHUNK;
-        uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-        // Do the matrix multiplication of activation and weight matrix
-        // - Remember the accumulation is happening for K-split of 64!
-  #pragma unroll
         for (uint32_t n = 0; n < N; n++) {
-  #pragma unroll
           for (int y = 0; y < YTILE; y++) {
             if constexpr (!use_mfma)
-  #pragma unroll
               for (uint32_t b = 0; b < A_CHUNK / 2; b++) {
                 DOT2C(sum[n][y], bigA[n][k2].f[b], bigB[y][k2].f[b])
               }
             else
-  #pragma unroll
               for (uint32_t b = 0; b < A_CHUNK / 4; b++)
                 sum4[n][y] = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(
                     bigA[n][k2].h4[b], bigB[y][k2].h4[b], sum4[n][y], 0, 0, 0);
@@ -466,46 +426,44 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         }
       }
     }
-
+    __builtin_amdgcn_sched_barrier(0);
     //----------------------------------------------------
     // Final reduction step using shuffle
     //----------------------------------------------------
     if constexpr (!use_mfma) {
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x118, 0xf, 0xf,
+                                                1);  // row_shr8
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x114, 0xf, 0xf,
+                                                1);  // row_shr4
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x112, 0xf, 0xf,
+                                                1);  // row_shr2
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
+                                                1);  // row_shr1
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
+                                                1);  // ROW_BCAST15
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
+                                                1);  // ROW_BCAST31
         }
       }
 
       if (threadIdx.x == 63) {
+        scalar_t biases[N][YTILE] = {};
+        if (BIAS)
+          for (int n = 0; n < N; n++) {
+            for (int y = 0; y < YTILE; y++) {
+              biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+            }
+          }
         for (int n = 0; n < N; n++) {
-          for (int i = 0; i < YTILE; i++) {
+          for (int y = 0; y < YTILE; y++) {
             if constexpr (std::is_same_v<scalar_t, half>) {
-              if (BIAS)
-                sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
+              sum[n][y] += __half2float(biases[n][y]);
             } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-              if (BIAS)
-                sum[n][i] +=
-                    __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
+              sum[n][y] += __bfloat162float(biases[n][y]);
             }
-            C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
+            C[m + y + n * M] = __float2s<scalar_t>(sum[n][y]);
           }
         }
       }
@@ -514,45 +472,43 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       for (int n = 0; n < N; n++) {
   #pragma unroll
         for (int y = 0; y < YTILE; y++) {
-          // float accm1 = 0;
-          // for (int i=0; i<64; i++)
-          //    accm1 += __shfl(sum4[n][y][i%4], i);
+          /*float accm1 = 0;
+           for (int i=0; i<64; i++)
+              accm1 += __shfl(sum4[n][y][i%4], i);
+          sum4[n][y][0] = accm1;*/
           float accm = sum4[n][y][0];
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][1]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][2]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][3]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:4 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_mov_b32 %0, %2 row_shr:15 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0"
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0"
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
+                                           1);  // row_shl1
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][2], 0x102, 0xf, 0xf,
+                                           1);  // row_shl2
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][3], 0x103, 0xf, 0xf,
+                                           1);  // row_shl3
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x104, 0xf, 0xf,
+                                           1);  // row_shl4
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x108, 0xf, 0xf,
+                                           1);  // row_shl8
+          accm = __builtin_amdgcn_mov_dpp(accm, 0x11f, 0xf, 0xf,
+                                          1);  // row_shr15
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x142, 0xf, 0xf,
+                                           1);  // ROW_BCAST15
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x143, 0xf, 0xf,
+                                           1);  // ROW_BCAST31
 
           sum4[n][y][0] = accm;
         }
       }
       if (threadIdx.x == 63) {
+        scalar_t biases[N][YTILE] = {};
+        if (BIAS)
+          for (int n = 0; n < N; n++) {
+            for (int y = 0; y < YTILE; y++) {
+              biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+            }
+          }
         for (int n = 0; n < N; n++) {
-          for (int i = 0; i < YTILE; i++) {
-            if (BIAS)
-              sum4[n][i][0] +=
-                  __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-            C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
+          for (int y = 0; y < YTILE; y++) {
+            sum4[n][y][0] += __bfloat162float(biases[n][y]);
+            C[m + y + n * M] = __float2bfloat16(sum4[n][y][0]);
           }
         }
       }
@@ -563,8 +519,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
-__global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
-                                 const int By, const scalar_t* B,
+__global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
+                                 const int M, const int Bx, const int By,
+                                 const scalar_t* B,
                                  const scalar_t* __restrict__ A,
                                  const scalar_t* __restrict__ BIAS, scalar_t* C,
                                  const int _WvPrGrp, const int CuCount) {
@@ -577,8 +534,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitK_hf_(const int K, const int M, const int Bx, const int By,
-                 const scalar_t* B, const scalar_t* __restrict__ A,
+    wvSplitK_hf_(const int K, const int Kbp, const int Kap, const int M,
+                 const int Bx, const int By, const scalar_t* B,
+                 const scalar_t* __restrict__ A,
                  const scalar_t* __restrict__ BIAS, scalar_t* C,
                  const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE / 2;
@@ -601,13 +559,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     scalar8 h8;
   };
 
-  //----------------------------------------------------
-  // Reserving 64 KB of LDS to have 1 WG / CU
-  // Goal is to bring the activation matrix A to the LDS
-  // and use it across the lifetime of the work group
-  // TODO: When activation matrix is larger than 64 KB
-  //	     then this is not going to work!
-  //----------------------------------------------------
   __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
@@ -618,12 +569,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     commitColumn[i] = 1;
   }
 
-  //----------------------------------------------------
-  // Indexing function into the column of weight matrix B
-  // Algorithm does 64 lane k-splitting / wave and uses
-  // WG ID and Thread ID to find the index.
-  //----------------------------------------------------
-  // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
   uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
 
   // Check whether there will be fragmentation!
@@ -636,91 +581,34 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m = startColumn;
   }
 
-  //----------------------------------------------------
-  // Fetch the activation matrix to LDS
-  // Loop iteration:
-  // - Each thread (lane) is fetching 8 elements (A_Chunk)
-  // - Each wave will fetch 64*8=> 512 elements
-  // - Each WG will fetch 512 * 16 => 8K elements
-  // - Then the WG will move to another 8 K elements
-  // TODO: Logic below will only work when K is multiple of 8
-  //----------------------------------------------------
-  for (uint32_t k = 0; k < min__(K * N, max_lds_len);
-       k += THRDS * WvPrGrp * A_CHUNK) {
-    uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
-
-    if (k_in >= min__(K * N, max_lds_len)) break;
-
-    *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
+  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
+       k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+  #if defined(__gfx950__)
+    __builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
+  #else
+    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
+  #endif
   }
 
   __syncthreads();
 
   if (threadIdx.y >= _WvPrGrp) return;
 
-  float sum[N][YTILE];
-  scalar8 sum4[N][YTILE];
-
-  //----------------------------------------------------
-  // Each wave works on a single column of weight matrix.
-  // There are 16 waves per WG, and hence, each WG is
-  // working on 16 columns of weight matrix. Moreover,
-  // we tile in column direction by YTILE, so when YTILE=1
-  // the above math is right, however, when YTILE=2 then
-  // each wave  will be working on 2 columns and WG will
-  // be working on 32 columns.
-  //
-  // Top level loop that makes WGs persistent!
-  // - WGs iterates across columns of weight matrix
-  // - Each wave within WG works on a given column(s)
-  // - After completing first set of columns, WGs start
-  //   working on the next set of available columns
-  //----------------------------------------------------
   while (m < M) {
-    //----------------------------------------------------
-    // 'sum' accumulates the matrix A x B computation
-    // split across 64 lanes.
-    //
-    // YTILE represents how many column of weight matrix
-    // are being worked on by each wave.
-    //----------------------------------------------------
-    for (int i = 0; i < YTILE; i++)
-      for (int n = 0; n < N; n++)
-        if constexpr (!use_mfma)
-          sum[n][i] = 0;
-        else
-          sum4[n][i] = {0, 0, 0, 0};
-
-    bigType bigA[N][UNRL];
-    bigType bigB[YTILE][UNRL];
-    //----------------------------------------------------
-    // Fetch weight matrix B in interleaved K-split!
-    // - Each thread (lane) is fetching 8 elements (A_Chunk)
-    // - Each wave will fetch 64*8=> 512 elements (1024B)
-    // - YTILE represents the number of column being serviced
-    //   by wave
-    // - Loop for fetching weight matrix (B) are unrolled
-    //
-    // Fetch activation matrix A from LDS
-    // - Loop for fetching activation matrix (A) are unrolled
-    //
-    // Finally, do the matrix multiplication in an unrolled
-    // fashion. This provides lot of food for compiler
-    // scheduling.
-    //
-    // TODO: Logic below will only work when K is multiple of 8
-    //----------------------------------------------------
+    float sum[N][YTILE] = {};
+    scalar8 sum4[N][YTILE] = {};
+
     for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
+      bigType bigA[N][UNRL] = {};
+      bigType bigB[YTILE][UNRL];
       // Fetch the weight matrix from memory!
   #pragma unroll
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
         uint32_t k = k1 + k2 * THRDS * A_CHUNK;
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-
-        const scalar_t* B_ = &B[(m + 0) * K + k_];
-        for (int b = 0; b < YTILE; b++)
-          bigB[b][k2].h8 = (loadnt((scalar8*)(&B_[b * K])));
+        const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
+        for (int y = 0; y < YTILE; y++)
+          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
       }
 
       // Fetch activation matrix from either just LDS or from both LDS / memory
@@ -729,36 +617,23 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         uint32_t k = k1 + k2 * THRDS * A_CHUNK;
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
         if (k_ >= K) break;
-
-        // Fetch A activation matrix in interleaved fashion from LDS or memory
-
         for (int n = 0; n < N; n++) {
-          if (k_ + K * n < max_lds_len)
-            bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
+          if (k_ + Kap * n < max_lds_len)
+            bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
           else
-            bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
+            bigA[n][k2] = *((const bigType*)(&(A[k_ + Kap * n])));
         }
       }
 
       // Do the matrix multiplication in interleaved manner
-  #pragma unroll
       for (uint32_t n = 0; n < N; n++) {
-  #pragma unroll
         for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-          uint32_t k = k1 + k2 * THRDS * A_CHUNK;
-          uint32_t k_ = k + threadIdx.x * A_CHUNK;
-          if (k_ >= K) break;
-          // Do the matrix multiplication of activation and weight matrix
-          // - Remember the accumulation is happening for K-split of 64!
-  #pragma unroll
           for (int y = 0; y < YTILE; y++) {
             if constexpr (!use_mfma)
-  #pragma unroll
               for (uint32_t b = 0; b < A_CHUNK / 2; b++) {
                 DOT2C(sum[n][y], bigA[n][k2].f[b], bigB[y][k2].f[b])
               }
             else
-  #pragma unroll
               for (uint32_t b = 0; b < A_CHUNK / 4; b++)
                 sum4[n][y] = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(
                     bigA[n][k2].h4[b], bigB[y][k2].h4[b], sum4[n][y], 0, 0, 0);
@@ -773,40 +648,38 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     if constexpr (!use_mfma) {
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x118, 0xf, 0xf,
+                                                1);  // row_shr8
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x114, 0xf, 0xf,
+                                                1);  // row_shr4
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x112, 0xf, 0xf,
+                                                1);  // row_shr2
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
+                                                1);  // row_shr1
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
+                                                1);  // ROW_BCAST15
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
+                                                1);  // ROW_BCAST31
         }
       }
 
       if (threadIdx.x == 63) {
+        scalar_t biases[N][YTILE] = {};
+        if (BIAS)
+          for (int n = 0; n < N; n++) {
+            for (int y = 0; y < YTILE; y++) {
+              biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+            }
+          }
         for (int n = 0; n < N; n++) {
-          for (int i = 0; i < YTILE; i++) {
-            if (commitColumn[i]) {
+          for (int y = 0; y < YTILE; y++) {
+            if (commitColumn[y]) {
               if constexpr (std::is_same_v<scalar_t, half>) {
-                if (BIAS)
-                  sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
+                sum[n][y] += __half2float(biases[n][y]);
               } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-                if (BIAS)
-                  sum[n][i] +=
-                      __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
+                sum[n][y] += __bfloat162float(biases[n][y]);
               }
-              C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
+              C[m + y + n * M] = __float2s<scalar_t>(sum[n][y]);
             }
           }
         }
@@ -819,44 +692,39 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           // float accm1 = 0;
           // for (int i=0; i<64; i++)
           //    accm1 += __shfl(sum4[n][y][i%4], i);
-
           float accm = sum4[n][y][0];
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][1]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][2]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][3]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:4 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_mov_b32 %0, %2 row_shr:15 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0"
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0"
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
+                                           1);  // row_shl1
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][2], 0x102, 0xf, 0xf,
+                                           1);  // row_shl2
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][3], 0x103, 0xf, 0xf,
+                                           1);  // row_shl3
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x104, 0xf, 0xf,
+                                           1);  // row_shl4
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x108, 0xf, 0xf,
+                                           1);  // row_shl8
+          accm = __builtin_amdgcn_mov_dpp(accm, 0x11f, 0xf, 0xf,
+                                          1);  // row_shr15
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x142, 0xf, 0xf,
+                                           1);  // ROW_BCAST15
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x143, 0xf, 0xf,
+                                           1);  // ROW_BCAST31
           sum4[n][y][0] = accm;
         }
       }
       if (threadIdx.x == 63) {
+        scalar_t biases[N][YTILE] = {};
+        if (BIAS)
+          for (int n = 0; n < N; n++) {
+            for (int y = 0; y < YTILE; y++) {
+              biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+            }
+          }
         for (int n = 0; n < N; n++) {
-          for (int i = 0; i < YTILE; i++) {
-            if (commitColumn[i]) {
-              if (BIAS)
-                sum4[n][i][0] +=
-                    __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-              C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
+          for (int y = 0; y < YTILE; y++) {
+            if (commitColumn[y]) {
+              sum4[n][y][0] += __bfloat162float(biases[n][y]);
+              C[m + y + n * M] = __float2bfloat16(sum4[n][y][0]);
             }
           }
         }
@@ -880,9 +748,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
-__global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
-                             const int By, const scalar_t* B,
-                             const scalar_t* __restrict__ A,
+__global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
+                             const int M, const int Bx, const int By,
+                             const scalar_t* B, const scalar_t* __restrict__ A,
                              const scalar_t* __restrict__ BIAS, scalar_t* C,
                              const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
@@ -894,8 +762,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitK_hf_big_(const int K, const int M, const int Bx, const int By,
-                     const scalar_t* B, const scalar_t* __restrict__ A,
+    wvSplitK_hf_big_(const int K, const int Kbp, const int Kap, const int M,
+                     const int Bx, const int By, const scalar_t* B,
+                     const scalar_t* __restrict__ A,
                      const scalar_t* __restrict__ BIAS, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE / 2;
@@ -966,13 +835,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   //----------------------------------------------------
   #define PCML
   #ifndef PCML
-  for (uint32_t k = 0; k < min__(K * N, max_lds_len);
-       k += THRDS * WvPrGrp * A_CHUNK) {
-    uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
-
-    if (k_in >= min__(K * N, max_lds_len)) break;
-
-    *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
+  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
+       k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+    #if defined(__gfx950__)
+    __builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
+    #else
+    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
+    #endif
   }
   __syncthreads();
   #endif
@@ -987,10 +856,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
              ? kFit
              : (kFit - kFit % TUC);  // round up to multiple of TUC
   // if (kFit == 0) kFit = TUC;
-  kFit = min__(kFit, K);
-
-  float sum[N][YTILE];
-  scalar8 sum4[N][YTILE];
+  kFit = min__(kFit, Kap);
 
   //----------------------------------------------------
   // Each wave works on a single column of weight matrix.
@@ -1021,15 +887,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     // YTILE represents how many column of weight matrix
     // are being worked on by each wave.
     //----------------------------------------------------
-    for (int i = 0; i < YTILE; i++)
-      for (int n = 0; n < N; n++)
-        if constexpr (!use_mfma)
-          sum[n][i] = 0;
-        else
-          sum4[n][i] = {0, 0, 0, 0};
-
-    bigType bigA[N][UNRL];
-    bigType bigB[YTILE][UNRL];
+    float sum[N][YTILE] = {};
+    scalar8 sum4[N][YTILE] = {};
+
     //----------------------------------------------------
     // Fetch weight matrix B in interleaved K-split!
     // - Each thread (lane) is fetching 8 elements (A_Chunk)
@@ -1048,18 +908,26 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     // TODO: Logic below will only work when K is multiple of 8
     //----------------------------------------------------
     for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
+      bigType bigA[N][UNRL] = {};
+      bigType bigB[YTILE][UNRL];
+
   #ifdef PCML
       if ((k1 == 0) || (k1 == kBase + kFit)) {  // load next chunk of A[] to LDS
         if (k1 != 0) kBase += kFit;
         __syncthreads();
         for (uint32_t k = 0; k < kFit; k += THRDS * _WvPrGrp * A_CHUNK) {
           uint32_t kOff = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
-          if (kBase + kOff >= K) break;
+          if (kBase + kOff >= Kap) break;
           if (kOff >= kFit) break;
           for (uint32_t n = 0; n < N; n++) {
-            uint32_t k_in = kBase + n * K + kOff;
+            uint32_t k_in = kBase + n * Kap + kOff;
             uint32_t k_ot = n * kFit + kOff;
+    #if defined(__gfx950__)
+            __builtin_amdgcn_global_load_lds((int*)(&A[k_in]), (int*)(&s[k_ot]),
+                                             16, 0, 0);
+    #else
             *((bigType*)(&s[k_ot])) = *((bigType*)(&A[k_in]));
+    #endif
           }
         }
         __syncthreads();
@@ -1072,11 +940,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
         uint32_t k = k1 + k2 * THRDS * A_CHUNK;
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-
-        const scalar_t* B_ = &B[(m + 0) * K + k_];
-        for (int b = 0; b < YTILE; b++)
-          bigB[b][k2].h8 = (loadnt((scalar8*)(&B_[b * K])));
+        const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
+        for (int y = 0; y < YTILE; y++)
+          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
       }
 
       // Fetch activation matrix from either just LDS or from both LDS / memory
@@ -1085,17 +951,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         uint32_t k = k1 + k2 * THRDS * A_CHUNK;
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
         if (k_ >= K) break;
-
-        // Fetch A activation matrix in interleaved fashion from LDS or memory
-
         for (int n = 0; n < N; n++) {
   #ifdef PCML
           bigA[n][k2] = *((const bigType*)(&(s[k_ - kBase + kFit * n])));
   #else
-          if (k_ + K * n < 32 * 1024)
-            bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
+          if (k_ + Kap * n < max_lds_len)
+            bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
           else
-            bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
+            bigA[n][k2] = *((const bigType*)(&(A[k_ + Kap * n])));
   #endif
         }
       }
@@ -1103,22 +966,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       // Do the matrix multiplication in interleaved manner
   #pragma unroll
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-        uint32_t k = k1 + k2 * THRDS * A_CHUNK;
-        uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-  #pragma unroll
         for (uint32_t n = 0; n < N; n++) {
-          // Do the matrix multiplication of activation and weight matrix
-          // - Remember the accumulation is happening for K-split of 64!
-  #pragma unroll
           for (int y = 0; y < YTILE; y++) {
             if constexpr (!use_mfma)
-  #pragma unroll
               for (uint32_t b = 0; b < A_CHUNK / 2; b++) {
                 DOT2C(sum[n][y], bigA[n][k2].f[b], bigB[y][k2].f[b])
               }
             else
-  #pragma unroll
               for (uint32_t b = 0; b < A_CHUNK / 4; b++)
                 sum4[n][y] = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(
                     bigA[n][k2].h4[b], bigB[y][k2].h4[b], sum4[n][y], 0, 0, 0);
@@ -1141,40 +995,38 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     if constexpr (!use_mfma) {
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0"
-              : "=v"(sum[n][y])
-              : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x118, 0xf, 0xf,
+                                                1);  // row_shr8
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x114, 0xf, 0xf,
+                                                1);  // row_shr4
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x112, 0xf, 0xf,
+                                                1);  // row_shr2
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
+                                                1);  // row_shr1
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
+                                                1);  // ROW_BCAST15
+          sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
+                                                1);  // ROW_BCAST31
         }
       }
 
       if (threadIdx.x == 63) {
+        scalar_t biases[N][YTILE] = {};
+        if (BIAS)
+          for (int n = 0; n < N; n++) {
+            for (int y = 0; y < YTILE; y++) {
+              biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+            }
+          }
         for (int n = 0; n < N; n++) {
-          for (int i = 0; i < YTILE; i++) {
-            if (commitColumn[i]) {
+          for (int y = 0; y < YTILE; y++) {
+            if (commitColumn[y]) {
               if constexpr (std::is_same_v<scalar_t, half>) {
-                if (BIAS)
-                  sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
+                sum[n][y] += __half2float(biases[n][y]);
               } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-                if (BIAS)
-                  sum[n][i] +=
-                      __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
+                sum[n][y] += __bfloat162float(biases[n][y]);
               }
-              C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
+              C[m + y + n * M] = __float2s<scalar_t>(sum[n][y]);
             }
           }
         }
@@ -1185,42 +1037,38 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #pragma unroll
         for (int y = 0; y < YTILE; y++) {
           float accm = sum4[n][y][0];
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][1]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][2]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(sum4[n][y][3]), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:4 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_mov_b32 %0, %2 row_shr:15 bound_ctrl:0 "
-              : "=v"(accm)
-              : "0"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0"
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-          asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0"
-              : "=v"(accm)
-              : "0"(accm), "v"(accm), "v"(accm));
-
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
+                                           1);  // row_shl1
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][2], 0x102, 0xf, 0xf,
+                                           1);  // row_shl2
+          accm += __builtin_amdgcn_mov_dpp(sum4[n][y][3], 0x103, 0xf, 0xf,
+                                           1);  // row_shl3
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x104, 0xf, 0xf,
+                                           1);  // row_shl4
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x108, 0xf, 0xf,
+                                           1);  // row_shl8
+          accm = __builtin_amdgcn_mov_dpp(accm, 0x11f, 0xf, 0xf,
+                                          1);  // row_shr15
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x142, 0xf, 0xf,
+                                           1);  // ROW_BCAST15
+          accm += __builtin_amdgcn_mov_dpp(accm, 0x143, 0xf, 0xf,
+                                           1);  // ROW_BCAST31
           sum4[n][y][0] = accm;
         }
       }
       if (threadIdx.x == 63) {
+        scalar_t biases[N][YTILE] = {};
+        if (BIAS)
+          for (int n = 0; n < N; n++) {
+            for (int y = 0; y < YTILE; y++) {
+              biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+            }
+          }
         for (int n = 0; n < N; n++) {
-          for (int i = 0; i < YTILE; i++) {
-            if (commitColumn[i]) {
-              if (BIAS)
-                sum4[n][i][0] +=
-                    __bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
-              C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
+          for (int y = 0; y < YTILE; y++) {
+            if (commitColumn[y]) {
+              sum4[n][y][0] += __bfloat162float(biases[n][y]);
+              C[m + y + n * M] = __float2bfloat16(sum4[n][y][0]);
             }
           }
         }
@@ -1244,8 +1092,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
-__global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
-                                 const int By, const scalar_t* B,
+__global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
+                                 const int M, const int Bx, const int By,
+                                 const scalar_t* B,
                                  const scalar_t* __restrict__ A,
                                  const scalar_t* __restrict__ BIAS, scalar_t* C,
                                  const int _WvPrGrp, const int CuCount) {
@@ -1272,6 +1121,8 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
   auto M_in = in_a.size(0);
   auto K_in = in_a.size(1);
   auto N_in = in_b.size(0);
+  auto Kap_in = in_a.stride(0);
+  auto Kbp_in = in_b.stride(0);
   auto Bx_in =
       (in_bias.has_value() && in_bias->numel() > 0)
           ? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
@@ -1296,27 +1147,30 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const int max_lds_len = get_lds_size() / 2;
 
-#define WVSPLITK(_YTILE, _UNRL, _N)                                        \
-  {                                                                        \
-    dim3 block(64, 16);                                                    \
-    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                    \
-    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))              \
-      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
-                                       biasf4, c, __wvPrGrp, CuCount);     \
-    else if (K_in * N_in <= max_lds_len * 1.2)                             \
-      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                   \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
-                                       biasf4, c, __wvPrGrp, CuCount);     \
-    else                                                                   \
-      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
-                                       biasf4, c, __wvPrGrp, CuCount);     \
+#define WVSPLITK(_YTILE, _UNRL, _N)                                           \
+  {                                                                           \
+    dim3 block(64, 16);                                                       \
+    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                       \
+    if ((Kbp_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))               \
+      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                  \
+          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
+                                       By_in, af4, bf4, biasf4, c, __wvPrGrp, \
+                                       CuCount);                              \
+    else if (Kbp_in * N_in <= max_lds_len * 1.2)                              \
+      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                      \
+          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
+                                       By_in, af4, bf4, biasf4, c, __wvPrGrp, \
+                                       CuCount);                              \
+    else                                                                      \
+      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                  \
+          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
+                                       By_in, af4, bf4, biasf4, c, __wvPrGrp, \
+                                       CuCount);                              \
   }
 
 #define WVSPLIT_TILE(_sYT, __N)                           \
   {                                                       \
-    bool fit_lds = (K_in * N_in <= max_lds_len);          \
+    bool fit_lds = (Kbp_in * N_in <= max_lds_len);        \
     if (_sYT <= 1)                                        \
       WVSPLITK(1, 4, __N)                                 \
     else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py
index e67772616bfa..1f55a597d122 100644
--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -30,15 +30,22 @@
 
 NKM_FACTORS_WVSPLITK = [
     # Different batch sizes with key dimensions
-    (1, 16, 16),
+    (1, 32, 16),
     (1, 64, 64),
     (2, 256, 256),
     (3, 1024, 1024),
     (4, 4096, 4096),
+    (4, 4096, 4096 + 1),
+    (4, 4096 + 16, 4096),
+    (4, 4096 + 16, 4096 + 1),
     # Extended K values
     (1, 9216, 512),
     (2, 10240, 1024),
     (4, 16384, 8192),
+    (4, 16384 * 2, 8192),
+    (4, 16384 * 2, 8192 + 1),
+    (4, 16384 * 2 + 16, 8192),
+    (4, 16384 * 2 + 16, 8192 + 1),
     # Minimum M constraint validation (m >= 8)
     (1, 64, 8),
     (2, 128, 8),
@@ -180,59 +187,44 @@ def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed):
     torch.testing.assert_close(out, ref_out, atol=1e-8, rtol=1e-2)
 
 
+@pytest.mark.parametrize("xnorm", [False, True])
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
-def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed):
-    torch.manual_seed(seed)
-    cu_count = num_compute_units()
-
-    A = torch.rand(n, k, dtype=dtype, device="cuda") - 0.5
-    B = torch.rand(m, k, dtype=dtype, device="cuda") - 0.5
-
-    ref_out = torch.nn.functional.linear(A, B)
-    out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count)
-
-    torch.testing.assert_close(out, ref_out, atol=1e-8, rtol=1e-2)
-
-
-@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
-def test_rocm_wvsplitk_bias1D_kernel(n, k, m, dtype, seed):
+@pytest.mark.parametrize("bias_mode", BIAS_MODES)
+@pytest.mark.parametrize("padded_a", [False, True])
+@pytest.mark.parametrize("padded_b", [False, True])
+def test_rocm_wvsplitk_kernel(
+    xnorm, n, k, m, dtype, seed, bias_mode, padded_a, padded_b
+):
     torch.manual_seed(seed)
     cu_count = num_compute_units()
 
-    xavier = math.sqrt(2 / k)  # normalize to avoid large output-bias deltas
-    A = (torch.rand(n, k, dtype=dtype, device="cuda") - 0.5) * xavier
-    B = (torch.rand(m, k, dtype=dtype, device="cuda") - 0.5) * xavier
-    BIAS = torch.rand(m, dtype=dtype, device="cuda") - 0.5
-
-    ref_out = torch.nn.functional.linear(A, B, BIAS)
-    out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
-
-    torch.testing.assert_close(out, ref_out, atol=1e-8, rtol=1e-2)
-
+    xavier = (
+        math.sqrt(2 / k) if xnorm else 1
+    )  # normalize to avoid large output-bias deltas
+    A = (torch.rand(n, k, dtype=dtype, device="cuda") * 2 - 1) * xavier
+    B = (torch.rand(m, k, dtype=dtype, device="cuda") * 2 - 1) * xavier
 
-@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm")
-def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
-    torch.manual_seed(seed)
-    cu_count = num_compute_units()
+    BIAS = None
+    if bias_mode == 1:
+        BIAS = torch.rand(m, dtype=dtype, device="cuda") * 2 - 1
+    elif bias_mode == 2:
+        BIAS = torch.rand(n, m, dtype=dtype, device="cuda") * 2 - 1
 
-    xavier = math.sqrt(2 / k)  # normalize to avoid large output-bias deltas
-    A = (torch.rand(n, k, dtype=dtype, device="cuda") - 0.5) * xavier
-    B = (torch.rand(m, k, dtype=dtype, device="cuda") - 0.5) * xavier
-    BIAS = torch.rand(n, m, dtype=dtype, device="cuda") - 0.5
+    if padded_a:
+        A = pad_fp8(A)
+    if padded_b:
+        B = pad_fp8(B)
 
     ref_out = torch.nn.functional.linear(A, B, BIAS)
     out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
 
-    torch.testing.assert_close(out, ref_out, atol=1e-8, rtol=1e-2)
+    if xnorm:
+        assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-8)
+    else:
+        assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-2)
 
 
 @pytest.mark.parametrize("xnorm", [False, True])
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index bc51b0e5e5de..79d48a203ab6 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -191,7 +191,6 @@ def rocm_unquantized_gemm_impl(
         and on_gfx9()
         and x.dtype in [torch.float16, torch.bfloat16]
         and k % 8 == 0
-        and x.is_contiguous()
     )
 
     if use_skinny is not True:

From 0892d1ab1f9b3476f31811e851d7b3705dfeaefe Mon Sep 17 00:00:00 2001
From: Mario Hong <86880754+mariohong128@users.noreply.github.com>
Date: Sat, 28 Feb 2026 17:02:33 +0800
Subject: [PATCH 122/154] [Feature]Supports Anthropic Thinking Block (#33671)

Signed-off-by: mariohong <mariohong128@gmail.com>
Co-authored-by: zetaohong <i-hongzetao@stepfun.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
---
 vllm/entrypoints/anthropic/protocol.py |  12 +-
 vllm/entrypoints/anthropic/serving.py  | 321 +++++++++++++++++--------
 2 files changed, 236 insertions(+), 97 deletions(-)

diff --git a/vllm/entrypoints/anthropic/protocol.py b/vllm/entrypoints/anthropic/protocol.py
index af9430e78475..3081e9781034 100644
--- a/vllm/entrypoints/anthropic/protocol.py
+++ b/vllm/entrypoints/anthropic/protocol.py
@@ -34,7 +34,7 @@ class AnthropicUsage(BaseModel):
 class AnthropicContentBlock(BaseModel):
     """Content block in message"""
 
-    type: Literal["text", "image", "tool_use", "tool_result"]
+    type: Literal["text", "image", "tool_use", "tool_result", "thinking"]
     text: str | None = None
     # For image content
     source: dict[str, Any] | None = None
@@ -45,6 +45,9 @@ class AnthropicContentBlock(BaseModel):
     input: dict[str, Any] | None = None
     content: str | list[dict[str, Any]] | None = None
     is_error: bool | None = None
+    # For thinking content
+    thinking: str | None = None
+    signature: str | None = None
 
 
 class AnthropicMessage(BaseModel):
@@ -118,9 +121,14 @@ def validate_max_tokens(cls, v):
 class AnthropicDelta(BaseModel):
     """Delta for streaming responses"""
 
-    type: Literal["text_delta", "input_json_delta"] | None = None
+    type: (
+        Literal["text_delta", "input_json_delta", "thinking_delta", "signature_delta"]
+        | None
+    ) = None
     text: str | None = None
+    thinking: str | None = None
     partial_json: str | None = None
+    signature: str | None = None
 
     # Message delta
     stop_reason: (
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index dc037313de33..6318f854a1e4 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -8,6 +8,7 @@
 import json
 import logging
 import time
+import uuid
 from collections.abc import AsyncGenerator
 from typing import Any
 
@@ -112,6 +113,7 @@ def _convert_anthropic_to_openai_request(
                 # Handle complex content blocks
                 content_parts: list[dict[str, Any]] = []
                 tool_calls: list[dict[str, Any]] = []
+                reasoning_parts: list[str] = []
 
                 for block in msg.content:
                     if block.type == "text" and block.text:
@@ -123,6 +125,8 @@ def _convert_anthropic_to_openai_request(
                                 "image_url": {"url": block.source.get("data", "")},
                             }
                         )
+                    elif block.type == "thinking" and block.thinking is not None:
+                        reasoning_parts.append(block.thinking)
                     elif block.type == "tool_use":
                         # Convert tool use to function call format
                         tool_call = {
@@ -157,6 +161,9 @@ def _convert_anthropic_to_openai_request(
                                 }
                             )
 
+                if reasoning_parts:
+                    openai_msg["reasoning"] = "".join(reasoning_parts)
+
                 # Add tool calls to the message if any
                 if tool_calls:
                     openai_msg["tool_calls"] = tool_calls  # type: ignore
@@ -167,7 +174,7 @@ def _convert_anthropic_to_openai_request(
                         openai_msg["content"] = content_parts[0]["text"]
                     else:
                         openai_msg["content"] = content_parts  # type: ignore
-                elif not tool_calls:
+                elif not tool_calls and not reasoning_parts:
                     continue
 
             openai_messages.append(openai_msg)
@@ -263,23 +270,32 @@ def messages_full_converter(
                 output_tokens=generator.usage.completion_tokens,
             ),
         )
-        if generator.choices[0].finish_reason == "stop":
+        choice = generator.choices[0]
+        if choice.finish_reason == "stop":
             result.stop_reason = "end_turn"
-        elif generator.choices[0].finish_reason == "length":
+        elif choice.finish_reason == "length":
             result.stop_reason = "max_tokens"
-        elif generator.choices[0].finish_reason == "tool_calls":
+        elif choice.finish_reason == "tool_calls":
             result.stop_reason = "tool_use"
 
-        content: list[AnthropicContentBlock] = [
-            AnthropicContentBlock(
-                type="text",
-                text=generator.choices[0].message.content
-                if generator.choices[0].message.content
-                else "",
+        content: list[AnthropicContentBlock] = []
+        if choice.message.reasoning:
+            content.append(
+                AnthropicContentBlock(
+                    type="thinking",
+                    thinking=choice.message.reasoning,
+                    signature=uuid.uuid4().hex,
+                )
+            )
+        if choice.message.content:
+            content.append(
+                AnthropicContentBlock(
+                    type="text",
+                    text=choice.message.content,
+                )
             )
-        ]
 
-        for tool_call in generator.choices[0].message.tool_calls:
+        for tool_call in choice.message.tool_calls:
             anthropic_tool_call = AnthropicContentBlock(
                 type="tool_use",
                 id=tool_call.id,
@@ -297,10 +313,85 @@ async def message_stream_converter(
         generator: AsyncGenerator[str, None],
     ) -> AsyncGenerator[str, None]:
         try:
+
+            class _ActiveBlockState:
+                def __init__(self) -> None:
+                    self.content_block_index = 0
+                    self.block_type: str | None = None
+                    self.block_index: int | None = None
+                    self.block_signature: str | None = None
+                    self.signature_emitted: bool = False
+                    self.tool_use_id: str | None = None
+
+                def reset(self) -> None:
+                    self.block_type = None
+                    self.block_index = None
+                    self.block_signature = None
+                    self.signature_emitted = False
+                    self.tool_use_id = None
+
+                def start(self, block: AnthropicContentBlock) -> None:
+                    self.block_type = block.type
+                    self.block_index = self.content_block_index
+                    if block.type == "thinking":
+                        self.block_signature = uuid.uuid4().hex
+                        self.signature_emitted = False
+                        self.tool_use_id = None
+                    elif block.type == "tool_use":
+                        self.block_signature = None
+                        self.signature_emitted = True
+                        self.tool_use_id = block.id
+                    else:
+                        self.block_signature = None
+                        self.signature_emitted = True
+                        self.tool_use_id = None
+
             first_item = True
             finish_reason = None
-            content_block_index = 0
-            content_block_started = False
+            state = _ActiveBlockState()
+            # Map from tool call index to tool_use_id
+            tool_index_to_id: dict[int, str] = {}
+
+            def stop_active_block():
+                events: list[str] = []
+                if state.block_type is None:
+                    return events
+                if (
+                    state.block_type == "thinking"
+                    and state.block_signature is not None
+                    and not state.signature_emitted
+                ):
+                    chunk = AnthropicStreamEvent(
+                        index=state.block_index,
+                        type="content_block_delta",
+                        delta=AnthropicDelta(
+                            type="signature_delta",
+                            signature=state.block_signature,
+                        ),
+                    )
+                    data = chunk.model_dump_json(exclude_unset=True)
+                    events.append(wrap_data_with_event(data, "content_block_delta"))
+                    state.signature_emitted = True
+                stop_chunk = AnthropicStreamEvent(
+                    index=state.block_index,
+                    type="content_block_stop",
+                )
+                data = stop_chunk.model_dump_json(exclude_unset=True)
+                events.append(wrap_data_with_event(data, "content_block_stop"))
+                state.reset()
+                state.content_block_index += 1
+                return events
+
+            def start_block(block: AnthropicContentBlock):
+                chunk = AnthropicStreamEvent(
+                    index=state.content_block_index,
+                    type="content_block_start",
+                    content_block=block,
+                )
+                data = chunk.model_dump_json(exclude_unset=True)
+                event = wrap_data_with_event(data, "content_block_start")
+                state.start(block)
+                return event
 
             async for item in generator:
                 if item.startswith("data:"):
@@ -326,6 +417,8 @@ async def message_stream_converter(
                                     id=origin_chunk.id,
                                     content=[],
                                     model=origin_chunk.model,
+                                    stop_reason=None,
+                                    stop_sequence=None,
                                     usage=AnthropicUsage(
                                         input_tokens=origin_chunk.usage.prompt_tokens
                                         if origin_chunk.usage
@@ -341,13 +434,8 @@ async def message_stream_converter(
 
                         # last chunk including usage info
                         if len(origin_chunk.choices) == 0:
-                            if content_block_started:
-                                stop_chunk = AnthropicStreamEvent(
-                                    index=content_block_index,
-                                    type="content_block_stop",
-                                )
-                                data = stop_chunk.model_dump_json(exclude_unset=True)
-                                yield wrap_data_with_event(data, "content_block_stop")
+                            for event in stop_active_block():
+                                yield event
                             stop_reason = self.stop_reason_map.get(
                                 finish_reason or "stop"
                             )
@@ -369,96 +457,139 @@ async def message_stream_converter(
 
                         if origin_chunk.choices[0].finish_reason is not None:
                             finish_reason = origin_chunk.choices[0].finish_reason
-                            continue
+                            # continue
 
-                        # content
-                        if origin_chunk.choices[0].delta.content is not None:
-                            if not content_block_started:
+                        # thinking / text content
+                        reasoning_delta = origin_chunk.choices[0].delta.reasoning
+                        if reasoning_delta is not None:
+                            if reasoning_delta == "":
+                                pass
+                            else:
+                                if state.block_type != "thinking":
+                                    for event in stop_active_block():
+                                        yield event
+                                    start_event = start_block(
+                                        AnthropicContentBlock(
+                                            type="thinking", thinking=""
+                                        )
+                                    )
+                                    yield start_event
                                 chunk = AnthropicStreamEvent(
-                                    index=content_block_index,
-                                    type="content_block_start",
-                                    content_block=AnthropicContentBlock(
-                                        type="text", text=""
+                                    index=(
+                                        state.block_index
+                                        if state.block_index is not None
+                                        else state.content_block_index
+                                    ),
+                                    type="content_block_delta",
+                                    delta=AnthropicDelta(
+                                        type="thinking_delta",
+                                        thinking=reasoning_delta,
                                     ),
                                 )
                                 data = chunk.model_dump_json(exclude_unset=True)
-                                yield wrap_data_with_event(data, "content_block_start")
-                                content_block_started = True
+                                yield wrap_data_with_event(data, "content_block_delta")
 
+                        if origin_chunk.choices[0].delta.content is not None:
                             if origin_chunk.choices[0].delta.content == "":
-                                continue
-                            chunk = AnthropicStreamEvent(
-                                index=content_block_index,
-                                type="content_block_delta",
-                                delta=AnthropicDelta(
-                                    type="text_delta",
-                                    text=origin_chunk.choices[0].delta.content,
-                                ),
-                            )
-                            data = chunk.model_dump_json(exclude_unset=True)
-                            yield wrap_data_with_event(data, "content_block_delta")
-                            continue
-
-                        # tool calls
-                        elif len(origin_chunk.choices[0].delta.tool_calls) > 0:
-                            tool_call = origin_chunk.choices[0].delta.tool_calls[0]
-                            if tool_call.id is not None:
-                                if content_block_started:
-                                    stop_chunk = AnthropicStreamEvent(
-                                        index=content_block_index,
-                                        type="content_block_stop",
-                                    )
-                                    data = stop_chunk.model_dump_json(
-                                        exclude_unset=True
-                                    )
-                                    yield wrap_data_with_event(
-                                        data, "content_block_stop"
+                                pass
+                            else:
+                                if state.block_type != "text":
+                                    for event in stop_active_block():
+                                        yield event
+                                    start_event = start_block(
+                                        AnthropicContentBlock(type="text", text="")
                                     )
-                                    content_block_started = False
-                                    content_block_index += 1
-
+                                    yield start_event
                                 chunk = AnthropicStreamEvent(
-                                    index=content_block_index,
-                                    type="content_block_start",
-                                    content_block=AnthropicContentBlock(
-                                        type="tool_use",
-                                        id=tool_call.id,
-                                        name=tool_call.function.name
-                                        if tool_call.function
-                                        else None,
-                                        input={},
+                                    index=(
+                                        state.block_index
+                                        if state.block_index is not None
+                                        else state.content_block_index
                                     ),
-                                )
-                                data = chunk.model_dump_json(exclude_unset=True)
-                                yield wrap_data_with_event(data, "content_block_start")
-                                content_block_started = True
-                                if tool_call.function and tool_call.function.arguments:
-                                    chunk = AnthropicStreamEvent(
-                                        index=content_block_index,
-                                        type="content_block_delta",
-                                        delta=AnthropicDelta(
-                                            type="input_json_delta",
-                                            partial_json=tool_call.function.arguments,
-                                        ),
-                                    )
-                                    data = chunk.model_dump_json(exclude_unset=True)
-                                    yield wrap_data_with_event(
-                                        data, "content_block_delta"
-                                    )
-
-                            else:
-                                chunk = AnthropicStreamEvent(
-                                    index=content_block_index,
                                     type="content_block_delta",
                                     delta=AnthropicDelta(
-                                        type="input_json_delta",
-                                        partial_json=tool_call.function.arguments
-                                        if tool_call.function
-                                        else None,
+                                        type="text_delta",
+                                        text=origin_chunk.choices[0].delta.content,
                                     ),
                                 )
                                 data = chunk.model_dump_json(exclude_unset=True)
                                 yield wrap_data_with_event(data, "content_block_delta")
+
+                        # tool calls - process all tool calls in the delta
+                        if len(origin_chunk.choices[0].delta.tool_calls) > 0:
+                            for tool_call in origin_chunk.choices[0].delta.tool_calls:
+                                if tool_call.id is not None:
+                                    # Update mapping for incremental updates
+                                    tool_index_to_id[tool_call.index] = tool_call.id
+                                    # Only create new block if different tool call
+                                    # AND has a name
+                                    tool_name = (
+                                        tool_call.function.name
+                                        if tool_call.function
+                                        else None
+                                    )
+                                    if (
+                                        state.tool_use_id != tool_call.id
+                                        and tool_name is not None
+                                    ):
+                                        for event in stop_active_block():
+                                            yield event
+                                        start_event = start_block(
+                                            AnthropicContentBlock(
+                                                type="tool_use",
+                                                id=tool_call.id,
+                                                name=tool_name,
+                                                input={},
+                                            )
+                                        )
+                                        yield start_event
+                                    # Handle initial arguments if present
+                                    if (
+                                        tool_call.function
+                                        and tool_call.function.arguments
+                                        and state.tool_use_id == tool_call.id
+                                    ):
+                                        chunk = AnthropicStreamEvent(
+                                            index=(
+                                                state.block_index
+                                                if state.block_index is not None
+                                                else state.content_block_index
+                                            ),
+                                            type="content_block_delta",
+                                            delta=AnthropicDelta(
+                                                type="input_json_delta",
+                                                partial_json=tool_call.function.arguments,
+                                            ),
+                                        )
+                                        data = chunk.model_dump_json(exclude_unset=True)
+                                        yield wrap_data_with_event(
+                                            data, "content_block_delta"
+                                        )
+                                else:
+                                    # Incremental update - use index to find tool_use_id
+                                    tool_use_id = tool_index_to_id.get(tool_call.index)
+                                    if (
+                                        tool_use_id is not None
+                                        and tool_call.function
+                                        and tool_call.function.arguments
+                                        and state.tool_use_id == tool_use_id
+                                    ):
+                                        chunk = AnthropicStreamEvent(
+                                            index=(
+                                                state.block_index
+                                                if state.block_index is not None
+                                                else state.content_block_index
+                                            ),
+                                            type="content_block_delta",
+                                            delta=AnthropicDelta(
+                                                type="input_json_delta",
+                                                partial_json=tool_call.function.arguments,
+                                            ),
+                                        )
+                                        data = chunk.model_dump_json(exclude_unset=True)
+                                        yield wrap_data_with_event(
+                                            data, "content_block_delta"
+                                        )
                             continue
                 else:
                     error_response = AnthropicStreamEvent(

From 8e75d885544c9d7602344e9db2c7e3cff9b73c11 Mon Sep 17 00:00:00 2001
From: Augusto Yao <augusto.yjh@antgroup.com>
Date: Sat, 28 Feb 2026 17:16:37 +0800
Subject: [PATCH 123/154] add io_process_plugin for sparse embedding (#34214)

Signed-off-by: augusto.yjh <augusto.yjh@antgroup.com>
Signed-off-by: Augusto Yao <augusto.yjh@antgroup.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .buildkite/test-amd.yaml                      |  10 +-
 .buildkite/test_areas/plugins.yaml            |   4 +
 docs/design/io_processor_plugins.md           |   7 +-
 .../bge_m3_sparse_processor/__init__.py       |   6 +
 .../sparse_embeddings_processor.py            | 135 +++++++++++
 .../bge_m3_sparse_processor/types.py          |  32 +++
 tests/plugins/bge_m3_sparse_plugin/setup.py   |  15 ++
 .../prithvi_io_processor/prithvi_processor.py |   5 +-
 ...test_bge_m3_sparse_io_processor_plugins.py | 212 ++++++++++++++++++
 .../test_io_processor_plugins.py              |   2 +-
 vllm/plugins/io_processors/__init__.py        |  18 +-
 vllm/plugins/io_processors/interface.py       |   3 +-
 vllm/v1/engine/async_llm.py                   |   1 +
 vllm/v1/engine/llm_engine.py                  |   1 +
 14 files changed, 441 insertions(+), 10 deletions(-)
 create mode 100644 tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py
 create mode 100644 tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py
 create mode 100644 tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
 create mode 100644 tests/plugins/bge_m3_sparse_plugin/setup.py
 create mode 100644 tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 4c15e7382046..6c35e0db137d 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1390,6 +1390,10 @@ steps:
   - pip install -e ./plugins/prithvi_io_processor_plugin
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
+  # test bge_m3_sparse io_processor plugin
+  - pip install -e ./plugins/bge_m3_sparse_plugin
+  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+  - pip uninstall bge_m3_sparse_plugin -y
   # end io_processor plugins test
   # begin stat_logger plugins test
   - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -2967,6 +2971,10 @@ steps:
   - pip install -e ./plugins/prithvi_io_processor_plugin
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
+  # test bge_m3_sparse io_processor plugin
+  - pip install -e ./plugins/bge_m3_sparse_plugin
+  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+  - pip uninstall bge_m3_sparse_plugin -y
   # end io_processor plugins test
   # begin stat_logger plugins test
   - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -3248,4 +3256,4 @@ steps:
   num_gpus: 4
   working_dir: "/vllm-workspace"
   commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
\ No newline at end of file
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
index ccc54b47abd4..16f9abccf6e1 100644
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -19,6 +19,10 @@ steps:
   - pip install -e ./plugins/prithvi_io_processor_plugin
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
+  # test bge_m3_sparse io_processor plugin
+  - pip install -e ./plugins/bge_m3_sparse_plugin
+  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+  - pip uninstall bge_m3_sparse_plugin -y
   # end io_processor plugins test
   # begin stat_logger plugins test
   - pip install -e ./plugins/vllm_add_dummy_stat_logger
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index c6945e443c37..68b532108672 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -13,12 +13,13 @@ IOProcessorInput = TypeVar("IOProcessorInput")
 IOProcessorOutput = TypeVar("IOProcessorOutput")
 
 class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
-    def __init__(self, vllm_config: VllmConfig):
+    """Abstract interface for pre/post-processing of engine I/O."""
+
+    def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer):
         super().__init__()
 
         self.vllm_config = vllm_config
 
-    @abstractmethod
     def parse_data(self, data: object) -> IOProcessorInput:
         raise NotImplementedError
 
@@ -32,7 +33,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
         self,
         params: PoolingParams | None = None,
     ) -> PoolingParams:
-        return params or PoolingParams()
+        return params or PoolingParams(task="plugin")
 
     @abstractmethod
     def pre_process(
diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py
new file mode 100644
index 000000000000..a428be6fc0ec
--- /dev/null
+++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+def register_bge_m3_sparse_embeddings_processor():
+    return "bge_m3_sparse_processor.sparse_embeddings_processor.BgeM3SparseEmbeddingsProcessor"  # noqa: E501
diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py
new file mode 100644
index 000000000000..4749d3e81fed
--- /dev/null
+++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from vllm.config import VllmConfig
+from vllm.entrypoints.openai.engine.protocol import UsageInfo
+from vllm.inputs.data import PromptType
+from vllm.logger import init_logger
+from vllm.outputs import PoolingRequestOutput
+from vllm.plugins.io_processors.interface import (
+    IOProcessor,
+)
+from vllm.pooling_params import PoolingParams
+from vllm.renderers import BaseRenderer
+from vllm.tokenizers.detokenizer_utils import convert_ids_list_to_tokens
+
+from .types import (
+    SparseEmbeddingCompletionRequestMixin,
+    SparseEmbeddingResponse,
+    SparseEmbeddingResponseData,
+    SparseEmbeddingTokenWeight,
+)
+
+logger = init_logger(__name__)
+
+
+class BgeM3SparseEmbeddingsProcessor(
+    IOProcessor[SparseEmbeddingCompletionRequestMixin, SparseEmbeddingResponse]
+):
+    def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer):
+        super().__init__(vllm_config, renderer)
+        self.offline_requests: list[SparseEmbeddingCompletionRequestMixin] = []
+        self.online_requests: dict[str, SparseEmbeddingCompletionRequestMixin] = {}
+        self.renderer: BaseRenderer = renderer
+
+    def merge_pooling_params(
+        self,
+        params: PoolingParams | None = None,
+    ) -> PoolingParams:
+        if params is None:
+            params = PoolingParams()
+        # refer to PoolingCompletionRequest.to_pooling_params
+        params.task = "token_classify"
+        return params
+
+    def parse_request(
+        self, request_data: object
+    ) -> SparseEmbeddingCompletionRequestMixin:
+        # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly.
+        if isinstance(request_data, dict):
+            return SparseEmbeddingCompletionRequestMixin(**request_data)
+        raise TypeError("request_data should be a dictionary")
+
+    def pre_process(
+        self,
+        prompt: SparseEmbeddingCompletionRequestMixin,
+        request_id: str | None = None,
+        **kwargs,
+    ) -> PromptType | Sequence[PromptType]:
+        if request_id is not None:
+            assert request_id not in self.online_requests, "request_id duplicated"
+            self.online_requests[request_id] = prompt
+        else:
+            self.offline_requests.append(prompt)
+        return prompt.input
+
+    def _get_sparse_embedding_request(self, request_id: str | None = None):
+        if request_id:
+            return self.online_requests.pop(request_id, None)
+        return self.offline_requests.pop()
+
+    def _build_sparse_embedding_token_weights(
+        self,
+        sparse_embedding: dict[int, float],
+        return_tokens: bool = False,
+    ) -> list[SparseEmbeddingTokenWeight]:
+        token_ids = sparse_embedding.keys()
+        token_weights = sparse_embedding.values()
+        tokens = [None] * len(token_ids)
+
+        if return_tokens and self.renderer is not None:
+            tokens = convert_ids_list_to_tokens(
+                self.renderer.get_tokenizer(), token_ids
+            )
+        sparse_embedding_output: list[SparseEmbeddingTokenWeight] = []
+        for token_id, weight, token in zip(token_ids, token_weights, tokens):
+            sparse_embedding_output.append(
+                SparseEmbeddingTokenWeight(
+                    token_id=token_id, weight=weight, token=token
+                )
+            )
+        return sparse_embedding_output
+
+    def post_process(
+        self,
+        model_output: Sequence[PoolingRequestOutput],
+        request_id: str | None = None,
+        **kwargs,
+    ) -> SparseEmbeddingResponse:
+        num_prompt_tokens = 0
+        response_data = []
+        return_tokens = self._get_sparse_embedding_request(request_id).return_tokens
+        for idx in range(len(model_output)):
+            mo = model_output[idx]
+            sparse_embedding: dict[int, float] = {}
+            num_prompt_tokens += len(mo.prompt_token_ids)
+            if len(mo.prompt_token_ids) != len(mo.outputs.data):
+                # this is the case that add_special_tokens is True,
+                # which means first token and last token are special tokens
+                mo.prompt_token_ids = mo.prompt_token_ids[1:]
+            for token_id, weight in zip(mo.prompt_token_ids, mo.outputs.data.tolist()):
+                sparse_embedding[token_id] = max(
+                    weight, sparse_embedding.get(token_id, 0.0)
+                )
+            response_data.append(
+                SparseEmbeddingResponseData(
+                    index=idx,
+                    sparse_embedding=self._build_sparse_embedding_token_weights(
+                        sparse_embedding,
+                        return_tokens,
+                    ),
+                )
+            )
+
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            total_tokens=num_prompt_tokens,
+        )
+        resp = SparseEmbeddingResponse(
+            data=response_data,
+            usage=usage,
+        )
+
+        return resp
diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
new file mode 100644
index 000000000000..1dcf30a058c9
--- /dev/null
+++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pydantic import BaseModel, Field
+
+from vllm.entrypoints.openai.engine.protocol import UsageInfo
+from vllm.entrypoints.pooling.base.protocol import CompletionRequestMixin
+
+
+class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin):
+    return_tokens: bool | None = Field(
+        default=None,
+        description="Whether to return dict shows the mapping of token_id to text."
+        "`None` or False means not return.",
+    )
+
+
+class SparseEmbeddingTokenWeight(BaseModel):
+    token_id: int
+    weight: float
+    token: str | None
+
+
+class SparseEmbeddingResponseData(BaseModel):
+    index: int
+    object: str = "sparse-embedding"
+    sparse_embedding: list[SparseEmbeddingTokenWeight]
+
+
+class SparseEmbeddingResponse(BaseModel):
+    data: list[SparseEmbeddingResponseData]
+    usage: UsageInfo
diff --git a/tests/plugins/bge_m3_sparse_plugin/setup.py b/tests/plugins/bge_m3_sparse_plugin/setup.py
new file mode 100644
index 000000000000..7bc01399f73b
--- /dev/null
+++ b/tests/plugins/bge_m3_sparse_plugin/setup.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from setuptools import setup
+
+setup(
+    name="bge-m3-sparse-plugin",
+    version="0.1",
+    packages=["bge_m3_sparse_processor"],
+    entry_points={
+        "vllm.io_processor_plugins": [
+            "bge_m3_sparse_plugin = bge_m3_sparse_processor:register_bge_m3_sparse_embeddings_processor",  # noqa: E501
+        ]
+    },
+)
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
index f9dfa0848b80..b22239fcc267 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -22,6 +22,7 @@
 from vllm.logger import init_logger
 from vllm.outputs import PoolingRequestOutput
 from vllm.plugins.io_processors.interface import IOProcessor
+from vllm.renderers import BaseRenderer
 
 from .types import DataModuleConfig, ImagePrompt, ImageRequestOutput
 
@@ -218,8 +219,8 @@ def load_image(
 class PrithviMultimodalDataProcessor(IOProcessor[ImagePrompt, ImageRequestOutput]):
     indices = [0, 1, 2, 3, 4, 5]
 
-    def __init__(self, vllm_config: VllmConfig):
-        super().__init__(vllm_config)
+    def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer):
+        super().__init__(vllm_config, renderer)
 
         self.datamodule = Sen1Floods11NonGeoDataModule(
             data_root=datamodule_config["data_root"],
diff --git a/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
new file mode 100644
index 000000000000..20c400e59795
--- /dev/null
+++ b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+import requests
+
+# Test configuration for BGE-M3 sparse plugin
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.pooling.pooling.protocol import IOProcessorResponse
+
+model_config = {
+    "model_name": "BAAI/bge-m3",
+    "plugin": "bge_m3_sparse_plugin",
+    "test_input": "What is the capital of France?",
+    "hf_overrides": json.dumps(
+        {"architectures": ["BgeM3EmbeddingModel"], "head_dtype": "float16"}
+    ),
+}
+
+
+def _float_close(expected: object, result: object):
+    assert isinstance(expected, float) and isinstance(result, float), (
+        f"{expected=}  or {result=} is not float"
+    )
+    return (expected - result) < 1e-3 or abs(expected / result - 1) < 1e-3
+
+
+def _get_attr_or_val(obj: object | dict, key: str):
+    if isinstance(obj, dict) and key in obj:
+        return obj[key]
+    return getattr(obj, key, None)
+
+
+def _check_sparse_embedding(data, check_tokens=False):
+    expected_weights = [
+        {"token_id": 32, "weight": 0.0552978515625, "token": "?"},
+        {"token_id": 70, "weight": 0.09808349609375, "token": "the"},
+        {"token_id": 83, "weight": 0.08154296875, "token": "is"},
+        {"token_id": 111, "weight": 0.11810302734375, "token": "of"},
+        {"token_id": 4865, "weight": 0.1171875, "token": "What"},
+        {"token_id": 9942, "weight": 0.292236328125, "token": "France"},
+        {"token_id": 10323, "weight": 0.2802734375, "token": "capital"},
+    ]
+    expected_embed = {x["token_id"]: x for x in expected_weights}
+
+    assert len(data) == len(expected_embed)
+    for entry in data:
+        expected_val = expected_embed[_get_attr_or_val(entry, "token_id")]
+        assert _float_close(
+            expected_val["weight"], _get_attr_or_val(entry, "weight")
+        ), f"actual embed {entry} not equal to {expected_val}"
+        if check_tokens:
+            assert expected_val["token"] == _get_attr_or_val(entry, "token"), (
+                f"actual embed {entry} not equal to {expected_val}"
+            )
+        else:
+            assert _get_attr_or_val(entry, "token") is None, (
+                f"{entry} should not return token"
+            )
+
+
+@pytest.fixture(scope="function")
+def server():
+    args = [
+        "--runner",
+        "pooling",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "32",
+        "--hf_overrides",
+        model_config["hf_overrides"],
+        "--io-processor-plugin",
+        model_config["plugin"],
+    ]
+
+    with RemoteOpenAIServer(model_config["model_name"], args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "return_tokens",
+    [True, False],
+)
+async def test_bge_m3_sparse_plugin_online(
+    server: RemoteOpenAIServer, return_tokens: bool
+):
+    """Test BGE-M3 sparse plugin in online mode via API."""
+    request_payload = {
+        "model": model_config["model_name"],
+        "task": "token_classify",
+        "data": {"input": model_config["test_input"], "return_tokens": return_tokens},
+    }
+
+    ret = requests.post(
+        server.url_for("pooling"),
+        json=request_payload,
+    )
+
+    response = ret.json()
+
+    # Verify the request response is in the correct format
+    assert (parsed_response := IOProcessorResponse(**response).data)
+
+    # Verify the output is formatted as expected for this plugin
+    assert _get_attr_or_val(parsed_response, "data")
+    assert len(_get_attr_or_val(parsed_response, "data")) > 0
+
+    data_entry = _get_attr_or_val(parsed_response, "data")[0]
+    assert _get_attr_or_val(data_entry, "object") == "sparse-embedding"
+    assert _get_attr_or_val(data_entry, "sparse_embedding")
+
+    # Verify sparse embedding format
+    sparse_embedding = _get_attr_or_val(data_entry, "sparse_embedding")
+    assert isinstance(sparse_embedding, list)
+    _check_sparse_embedding(sparse_embedding, return_tokens)
+
+    # Verify usage information
+    usage = _get_attr_or_val(parsed_response, "usage")
+    assert usage, f"usage not found for {parsed_response}"
+    assert _get_attr_or_val(usage, "prompt_tokens") > 0
+    assert _get_attr_or_val(usage, "total_tokens") == _get_attr_or_val(
+        usage, "prompt_tokens"
+    )
+
+
+@pytest.mark.parametrize(
+    "return_tokens",
+    [True, False],
+)
+def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool):
+    """Test BGE-M3 sparse plugin in offline mode."""
+    prompt = {
+        "data": {
+            "input": model_config["test_input"],
+            "return_tokens": return_tokens,
+        }
+    }
+
+    with vllm_runner(
+        model_config["model_name"],
+        runner="pooling",
+        enforce_eager=True,
+        max_num_seqs=32,
+        io_processor_plugin=model_config["plugin"],
+        hf_overrides=json.loads(model_config["hf_overrides"]),
+        default_torch_num_threads=1,
+    ) as llm_runner:
+        llm = llm_runner.get_llm()
+        pooler_output = llm.encode(prompt, pooling_task="token_classify")
+
+    outputs = pooler_output[0]
+
+    # Verify output structure
+    assert hasattr(outputs, "outputs")
+    response = outputs.outputs
+    assert hasattr(response, "data")
+    assert len(response.data) == 1
+    # Verify response data
+    for i, output in enumerate(response.data):
+        # Each output should have sparse embeddings
+        sparse_embedding = output.sparse_embedding
+        assert isinstance(sparse_embedding, list)
+        _check_sparse_embedding(sparse_embedding, return_tokens)
+
+    # Verify usage
+    assert response.usage.prompt_tokens > 0
+    assert response.usage.total_tokens == response.usage.prompt_tokens
+
+
+def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner):
+    """Test BGE-M3 sparse plugin with multiple inputs in offline mode."""
+    prompts = {
+        "data": {
+            "input": [
+                "What is the capital of France?",
+                "What is the capital of Germany?",
+                "What is the capital of Spain?",
+            ],
+            "return_tokens": True,
+        }
+    }
+
+    with vllm_runner(
+        model_config["model_name"],
+        runner="pooling",
+        enforce_eager=True,
+        max_num_seqs=32,
+        io_processor_plugin=model_config["plugin"],
+        hf_overrides=json.loads(model_config["hf_overrides"]),
+        default_torch_num_threads=1,
+    ) as llm_runner:
+        llm = llm_runner.get_llm()
+        pooler_output = llm.encode(prompts, pooling_task="token_classify")
+
+    outputs = pooler_output[0]
+
+    # Verify output structure
+    assert hasattr(outputs, "outputs")
+    response = outputs.outputs
+    assert hasattr(response, "data")
+    assert len(response.data) == 3
+    for i, output in enumerate(response.data):
+        # Each output should have sparse embeddings
+        sparse_embedding = output.sparse_embedding
+        assert isinstance(sparse_embedding, list)
+
+    # Verify usage
+    assert response.usage.prompt_tokens > 0
+    assert response.usage.total_tokens == response.usage.prompt_tokens
diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py
index 04cb19499296..f11d00316ff7 100644
--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -39,7 +39,7 @@ def _compute_image_hash(base64_data: str) -> str:
 def test_loading_missing_plugin():
     vllm_config = VllmConfig()
     with pytest.raises(ValueError):
-        get_io_processor(vllm_config, "wrong_plugin")
+        get_io_processor(vllm_config, None, "wrong_plugin")
 
 
 @pytest.fixture(scope="function")
diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py
index b3a3b548781e..86ebe41b0355 100644
--- a/vllm/plugins/io_processors/__init__.py
+++ b/vllm/plugins/io_processors/__init__.py
@@ -1,18 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import inspect
 import logging
 
 from vllm.config import VllmConfig
 from vllm.plugins import IO_PROCESSOR_PLUGINS_GROUP, load_plugins_by_group
 from vllm.plugins.io_processors.interface import IOProcessor
+from vllm.renderers import BaseRenderer
 from vllm.utils.import_utils import resolve_obj_by_qualname
 
 logger = logging.getLogger(__name__)
 
 
 def get_io_processor(
-    vllm_config: VllmConfig, plugin_from_init: str | None = None
+    vllm_config: VllmConfig,
+    renderer: BaseRenderer,
+    plugin_from_init: str | None = None,
 ) -> IOProcessor | None:
     # Input.Output processors are loaded as plugins under the
     # 'vllm.io_processor_plugins' group. Similar to platform
@@ -65,4 +69,14 @@ def get_io_processor(
 
     activated_plugin_cls = loadable_plugins[model_plugin]
 
-    return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config)
+    activated_plugin_typ = resolve_obj_by_qualname(activated_plugin_cls)
+
+    # for backward compatibility, the plugin does not have a renderer argument
+    if "renderer" not in inspect.signature(activated_plugin_typ.__init__).parameters:
+        logger.warning(
+            "The renderer argument will be required in v0.18, "
+            "please update your IOProcessor plugin: %s",
+            activated_plugin_cls,
+        )
+        return activated_plugin_typ(vllm_config)
+    return activated_plugin_typ(vllm_config, renderer)
diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py
index fa71b4ca0995..f73eb99abd73 100644
--- a/vllm/plugins/io_processors/interface.py
+++ b/vllm/plugins/io_processors/interface.py
@@ -9,6 +9,7 @@
 from vllm.inputs.data import PromptType
 from vllm.outputs import PoolingRequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.renderers import BaseRenderer
 from vllm.sampling_params import SamplingParams
 
 IOProcessorInput = TypeVar("IOProcessorInput")
@@ -18,7 +19,7 @@
 class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     """Abstract interface for pre/post-processing of engine I/O."""
 
-    def __init__(self, vllm_config: VllmConfig):
+    def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer):
         super().__init__()
 
         self.vllm_config = vllm_config
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index f172d6dda02e..6be0a07baeb2 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -135,6 +135,7 @@ def __init__(
         self.renderer = renderer = renderer_from_config(self.vllm_config)
         self.io_processor = get_io_processor(
             self.vllm_config,
+            self.renderer,
             self.model_config.io_processor_plugin,
         )
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 29a73251faa1..0d9279331d02 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -92,6 +92,7 @@ def __init__(
         self.renderer = renderer = renderer_from_config(self.vllm_config)
         self.io_processor = get_io_processor(
             self.vllm_config,
+            self.renderer,
             self.model_config.io_processor_plugin,
         )
 

From 7e08c22b8cb65a1bea6b4bf9c52ed6e71d4acc47 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sat, 28 Feb 2026 18:12:00 +0800
Subject: [PATCH 124/154] [Feat] Add CUDA torch fallbacks for
 fp8_mqa_logits/fp8_paged_mqa_logits_torch function (#35271)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 .../layers/sparse_attn_indexer.py             |  76 +++++++----
 vllm/utils/deep_gemm.py                       | 121 ++++++++++++++++++
 vllm/v1/attention/backends/mla/indexer.py     |   7 +-
 3 files changed, 176 insertions(+), 28 deletions(-)

diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py
index 826caa5d3a78..f4ce6fca8d56 100644
--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -9,8 +9,13 @@
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
-from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits
-from vllm.utils.import_utils import has_deep_gemm
+from vllm.utils.deep_gemm import (
+    fp8_mqa_logits,
+    fp8_mqa_logits_torch,
+    fp8_paged_mqa_logits,
+    fp8_paged_mqa_logits_torch,
+    is_deep_gemm_supported,
+)
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.mla.indexer import (
     DeepseekV32IndexerMetadata,
@@ -102,15 +107,23 @@ def sparse_attn_indexer(
                 chunk.block_table,
                 chunk.cu_seq_lens,
             )
-
-            logits = fp8_mqa_logits(
-                q_fp8[chunk.token_start : chunk.token_end],
-                (k_fp8, k_scale.view(torch.float32).flatten()),
-                weights[chunk.token_start : chunk.token_end],
-                chunk.cu_seqlen_ks,
-                chunk.cu_seqlen_ke,
-                clean_logits=False,
-            )
+            if is_deep_gemm_supported():
+                logits = fp8_mqa_logits(
+                    q_fp8[chunk.token_start : chunk.token_end],
+                    (k_fp8, k_scale.view(torch.float32).flatten()),
+                    weights[chunk.token_start : chunk.token_end],
+                    chunk.cu_seqlen_ks,
+                    chunk.cu_seqlen_ke,
+                    clean_logits=False,
+                )
+            else:
+                logits = fp8_mqa_logits_torch(
+                    q_fp8[chunk.token_start : chunk.token_end],
+                    (k_fp8, k_scale.view(torch.float32).flatten()),
+                    weights[chunk.token_start : chunk.token_end],
+                    chunk.cu_seqlen_ks,
+                    chunk.cu_seqlen_ke,
+                )
             num_rows = logits.shape[0]
 
             topk_indices = topk_indices_buffer[
@@ -159,18 +172,26 @@ def sparse_attn_indexer(
         next_n = padded_q_fp8_decode_tokens.shape[1]
         assert batch_size == decode_metadata.seq_lens.shape[0]
         num_padded_tokens = batch_size * next_n
-
-        logits = fp8_paged_mqa_logits(
-            padded_q_fp8_decode_tokens,
-            kv_cache,
-            weights[:num_padded_tokens],
-            decode_metadata.seq_lens,
-            decode_metadata.block_table,
-            decode_metadata.schedule_metadata,
-            max_model_len=max_model_len,
-            clean_logits=False,
-        )
-
+        if is_deep_gemm_supported():
+            logits = fp8_paged_mqa_logits(
+                padded_q_fp8_decode_tokens,
+                kv_cache,
+                weights[:num_padded_tokens],
+                decode_metadata.seq_lens,
+                decode_metadata.block_table,
+                decode_metadata.schedule_metadata,
+                max_model_len=max_model_len,
+                clean_logits=False,
+            )
+        else:
+            logits = fp8_paged_mqa_logits_torch(
+                padded_q_fp8_decode_tokens,
+                kv_cache,
+                weights[:num_padded_tokens],
+                decode_metadata.seq_lens,
+                decode_metadata.block_table,
+                max_model_len=max_model_len,
+            )
         num_rows = logits.shape[0]
         topk_indices = topk_indices_buffer[:num_padded_tokens, :topk_tokens]
 
@@ -278,9 +299,12 @@ def __init__(
         self.max_model_len = max_model_len
         self.max_total_seq_len = max_total_seq_len
         self.topk_indices_buffer = topk_indices_buffer
-        if current_platform.is_cuda() and not has_deep_gemm():
-            raise RuntimeError(
-                "Sparse Attention Indexer CUDA op requires DeepGEMM to be installed."
+        if current_platform.is_cuda() and not is_deep_gemm_supported():
+            logger.warning_once(
+                "DeepGEMM is not supported or available. SparseAttnIndexer will use a "
+                "less efficient PyTorch implementation. "
+                "Please make sure you have the required hardware and software setup "
+                "for DeepGEMM to achieve optimal performance."
             )
 
     def forward_native(
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 8f664cc7daf0..ee104a6cc75c 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -418,6 +418,125 @@ def should_use_deepgemm_for_fp8_linear(
     )
 
 
+def fp8_mqa_logits_torch(
+    q: torch.Tensor,
+    kv: tuple[torch.Tensor, torch.Tensor],
+    weights: torch.Tensor,
+    cu_seqlen_ks: torch.Tensor,
+    cu_seqlen_ke: torch.Tensor,
+) -> torch.Tensor:
+    """Compute FP8 MQA logits for a single sequence without KV paging (CUDA fallback).
+
+    This is a pure PyTorch fallback for CUDA when DeepGEMM is not available.
+
+    Args:
+        q: Query tensor of shape [M, H, D]. Casted to
+            `torch.float8_e4m3fn` by caller.
+        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
+            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or
+            [N, 1]) with dtype `torch.float32`.
+        weights: weights of shape [M, H], dtype `torch.float32`.
+        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
+            shape [M], dtype int32.
+        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
+            shape [M], dtype int32.
+
+    Returns:
+        Logits tensor of shape [M, N], dtype `torch.float32`.
+    """
+    kv_fp8, scale = kv
+    seq_len_kv = kv_fp8.shape[0]
+    k = kv_fp8.to(torch.bfloat16)
+    q = q.to(torch.bfloat16)
+
+    mask_lo = (
+        torch.arange(0, seq_len_kv, device=q.device)[None, :] >= cu_seqlen_ks[:, None]
+    )
+    mask_hi = (
+        torch.arange(0, seq_len_kv, device=q.device)[None, :] < cu_seqlen_ke[:, None]
+    )
+    mask = mask_lo & mask_hi
+
+    score = torch.einsum("mhd,nd->hmn", q, k).float() * scale
+    logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)
+    logits = logits.masked_fill(~mask, float("-inf"))
+
+    return logits
+
+
+def fp8_paged_mqa_logits_torch(
+    q: torch.Tensor,
+    kv_cache: torch.Tensor,
+    weights: torch.Tensor,
+    context_lens: torch.Tensor,
+    block_tables: torch.Tensor,
+    max_model_len: int,
+) -> torch.Tensor:
+    """Compute FP8 MQA logits using paged KV-cache (CUDA fallback).
+
+    This is a pure PyTorch fallback for CUDA when DeepGEMM is not available.
+    Handles head_dim = 132 (128 + 4 for RoPE).
+
+    Args:
+        q: Query tensor of shape [B, next_n, H, D].
+        kv_cache: Paged KV-cache in packed FP8+scale layout with shape
+            [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
+            4 bytes per (block,pos) store the `float` dequant scale.
+        weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
+        context_lens: Tensor of shape [B], dtype int32; effective context length
+            for each batch element.
+        block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
+            block indices to physical blocks in the paged cache.
+        max_model_len: Maximum sequence length used to size the logits output.
+
+    Returns:
+        Logits tensor of shape [B * next_n, max_model_len], dtype
+        `torch.float32`.
+    """
+    fp8_dtype = current_platform.fp8_dtype()
+    batch_size, next_n, heads, dim = q.size()
+    kv_cache, scale = kv_cache[..., :dim], kv_cache[..., dim:]
+    scale = scale.contiguous().view(torch.float)
+    q = q.float()
+    kv_cache = kv_cache.view(fp8_dtype).float() * scale
+    num_blocks, block_size, _, dim = kv_cache.size()
+    logits = torch.full(
+        [batch_size * next_n, max_model_len],
+        float("-inf"),
+        device=q.device,
+        dtype=torch.float32,
+    )
+    for i in range(batch_size):
+        context_len = context_lens[i].item()
+        q_offsets = torch.arange(context_len - next_n, context_len, device=q.device)
+        weight_slice = (
+            weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous()
+        )
+        for block_idx in range(cdiv(context_len, block_size)):
+            block_id = block_tables[i][block_idx]
+            qx, kx = q[i], kv_cache[block_id]
+            k_offsets = torch.arange(
+                block_idx * block_size, (block_idx + 1) * block_size, device=q.device
+            )
+            mask = (k_offsets[None, :] < context_len) & (
+                k_offsets[None, :] <= q_offsets[:, None]
+            )
+            s = torch.where(
+                mask[None, :, :],
+                (qx.transpose(0, 1) @ kx.transpose(0, 1).transpose(1, 2)).to(
+                    logits.dtype
+                ),
+                float("-inf"),
+            )
+            s = torch.relu(s) * weight_slice[..., None]
+            s = s.sum(dim=0)
+            logits[
+                i * next_n : (i + 1) * next_n,
+                block_idx * block_size : (block_idx + 1) * block_size,
+            ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s, float("-inf"))
+    return logits
+
+
 __all__ = [
     "calc_diff",
     "DeepGemmQuantScaleFMT",
@@ -425,7 +544,9 @@ def should_use_deepgemm_for_fp8_linear(
     "m_grouped_fp8_gemm_nt_contiguous",
     "fp8_m_grouped_gemm_nt_masked",
     "fp8_mqa_logits",
+    "fp8_mqa_logits_torch",
     "fp8_paged_mqa_logits",
+    "fp8_paged_mqa_logits_torch",
     "get_paged_mqa_logits_metadata",
     "per_block_cast_to_fp8",
     "is_deep_gemm_e8m0_used",
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index 3c56f9fd0d9a..7c81a4359223 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -8,7 +8,10 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, has_deep_gemm
+from vllm.utils.deep_gemm import (
+    get_paged_mqa_logits_metadata,
+    is_deep_gemm_supported,
+)
 from vllm.utils.platform_utils import num_compute_units
 from vllm.v1.attention.backend import (
     AttentionBackend,
@@ -344,7 +347,7 @@ def build(
             seq_lens = common_attn_metadata.seq_lens[:num_decodes]
 
             # DeepGEMM is required for the paged MQA logits on CUDA devices
-            if current_platform.is_cuda() and has_deep_gemm():
+            if current_platform.is_cuda() and is_deep_gemm_supported():
                 self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata(
                     seq_lens, self.kv_cache_spec.block_size, self.num_sms
                 )

From c68e69f1449cc6d84f43137fcc36c142de1c8fd3 Mon Sep 17 00:00:00 2001
From: flutist <30485581+flutist@users.noreply.github.com>
Date: Sat, 28 Feb 2026 19:49:52 +0800
Subject: [PATCH 125/154] custom dataset img support base64 (#35280)

Signed-off-by: xjx <493337577@qq.com>
---
 vllm/benchmarks/datasets.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 0cd76d8910f7..c8644ef267a6 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -305,9 +305,11 @@ def process_image(image: Any) -> Mapping[str, Any]:
        a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
        a dictionary with the image as a base64 data URL.
 
-    3. String input: - Treats the string as a URL or local file path.  -
-       Prepends "file://" if the string doesn't start with "http://" or
-       "file://".  - Returns a dictionary with the image URL.
+    3. String input: - Treats the string as a URL, local file path, or base64
+       encoded data.  - If string starts with "data:image/", treats as base64.
+       - If string starts with "http://", "https://", or "file://", treats as URL.
+       - Otherwise treats as local file path and prepends "file://".
+       - Returns a dictionary with the image URL or base64 data.
 
     Raises:
         ValueError: If the input is not a supported type.
@@ -327,14 +329,14 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, str):
         image_url = (
             image
-            if image.startswith(("http://", "https://", "file://"))
+            if image.startswith(("http://", "https://", "file://", "data:image/"))
             else f"file://{image}"
         )
         return {"type": "image_url", "image_url": {"url": image_url}}
 
     raise ValueError(
-        f"Invalid image input {image}. Must be a PIL.Image.Image"
-        " or str or dictionary with raw image bytes."
+        f"Invalid image input {image}. Must be a PIL.Image.Image, "
+        "str (URL, file path, or base64 data URL), or dictionary with raw image bytes."
     )
 
 

From 63d7972f13d1c5a9d9bd55b664017067a9abd451 Mon Sep 17 00:00:00 2001
From: cwazai <38356712+cwazai@users.noreply.github.com>
Date: Sat, 28 Feb 2026 22:50:55 +0800
Subject: [PATCH 126/154] Fix Qwen3_5MTP packed_modules_mapping for
 gate_up_proj (#35581)

---
 vllm/model_executor/models/qwen3_5_mtp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py
index a3bf02f32873..e42403213da7 100644
--- a/vllm/model_executor/models/qwen3_5_mtp.py
+++ b/vllm/model_executor/models/qwen3_5_mtp.py
@@ -339,7 +339,7 @@ class Qwen3_5MTP(nn.Module, SupportsMultiModal):
             "k_proj",
             "v_proj",
         ],
-        "gate_up_proj": ["up_proj", "down_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

From 49b9ae32e94b902b87e3d2894f5ac4a5f8dd4abb Mon Sep 17 00:00:00 2001
From: emricksini-h <emrick.birivoutin@hcompany.ai>
Date: Sat, 28 Feb 2026 17:14:29 +0100
Subject: [PATCH 127/154] [Fix] Avoid sending image input to other PP ranks
 (#35405)

Signed-off-by: emricksini-h <emrick.birivoutin@hcompany.ai>
Co-authored-by: Roger Wang <hey@rogerw.io>
---
 vllm/v1/executor/ray_utils.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 67c5a58f7ea9..1e707df7b75d 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -104,6 +104,18 @@ def execute_model_ray(
                 scheduler_output, intermediate_tensors
             )
             if self._is_intermediate_tensors(output):
+                if (
+                    self.worker.model_runner.supports_mm_inputs
+                    and get_pp_group().is_first_rank
+                ):
+                    # Strip mm_features before Ray forwards it to the next PP Stage.
+                    # PP Stage>0 only needs the intermediate tensors,
+                    # not preprocessed multimodal data.
+
+                    # scheduled_new_reqs is a required field of SchedulerOutput,
+                    # so accessing it directly will raise AttributeError if missing.
+                    for req in scheduler_output.scheduled_new_reqs:
+                        req.mm_features = []
                 return scheduler_output, grammar_output, output
 
             if isinstance(output, AsyncModelRunnerOutput):

From 1dafb29f91661778d3bcb6a83c7ff03f02c049d4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 1 Mar 2026 01:07:02 +0800
Subject: [PATCH 128/154] [Benchmark] Avoid unnecessary video download in MMVU
 (#35618)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/benchmarks/datasets.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index c8644ef267a6..21ebeb9069bb 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -31,6 +31,7 @@
 from typing import Any, cast
 
 import numpy as np
+from huggingface_hub import snapshot_download
 from PIL import Image
 from typing_extensions import deprecated
 
@@ -2680,6 +2681,14 @@ class MMVUDataset(HuggingFaceDataset):
         + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())),
     }
 
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+        self._remote_path_root = (
+            f"https://huggingface.co/datasets/{self.hf_name}/resolve/main"
+        )
+        self._local_path_root = snapshot_download(self.hf_name, repo_type="dataset")
+
     def sample(
         self,
         tokenizer: TokenizerLike,
@@ -2702,7 +2711,9 @@ def sample(
                 break
 
             prompt = parser_fn(item)
-            mm_content = process_video(item["video"])
+            mm_content = process_video(
+                item["video"].replace(self._remote_path_root, self._local_path_root)
+            )
             prompt_len = len(tokenizer.encode(prompt))
             if enable_multimodal_chat:
                 # Note: when chat is enabled the request prompt_len is no longer

From e113a301136402301381a86fb89d58da488ab55b Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sat, 28 Feb 2026 12:32:37 -0500
Subject: [PATCH 129/154] [Deprecation] Deprecate code in 0.17 as scheduled
 (#35441)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 .../entrypoints/pooling/embed/test_online.py  | 12 ++--
 vllm/entrypoints/grpc_server.py               | 15 ++++-
 vllm/entrypoints/llm.py                       | 64 +++----------------
 .../openai/chat_completion/protocol.py        |  1 -
 .../entrypoints/openai/completion/protocol.py |  1 -
 .../openai/translations/__init__.py           | 12 ----
 .../openai/translations/api_router.py         | 14 ----
 .../openai/translations/protocol.py           | 14 ----
 .../openai/translations/serving.py            | 14 ----
 .../openai/translations/speech_to_text.py     | 15 -----
 vllm/entrypoints/pooling/base/protocol.py     |  4 --
 vllm/entrypoints/pooling/classify/protocol.py |  2 -
 vllm/entrypoints/pooling/embed/protocol.py    | 19 ------
 vllm/entrypoints/pooling/pooling/protocol.py  | 19 ------
 vllm/entrypoints/pooling/score/protocol.py    |  2 -
 .../layers/mamba/mamba_utils.py               |  3 -
 vllm/model_executor/models/ovis2_5.py         |  3 -
 vllm/multimodal/processing/processor.py       | 12 +---
 vllm/multimodal/utils.py                      | 18 ------
 vllm/pooling_params.py                        |  8 +--
 vllm/sampling_params.py                       | 18 +-----
 vllm/v1/engine/input_processor.py             | 11 ----
 22 files changed, 31 insertions(+), 250 deletions(-)
 delete mode 100644 vllm/entrypoints/openai/translations/__init__.py
 delete mode 100644 vllm/entrypoints/openai/translations/api_router.py
 delete mode 100644 vllm/entrypoints/openai/translations/protocol.py
 delete mode 100644 vllm/entrypoints/openai/translations/serving.py
 delete mode 100644 vllm/entrypoints/openai/translations/speech_to_text.py

diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py
index 89341670c7ff..adec6233414f 100644
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -683,13 +683,13 @@ async def test_params_not_supported(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_normalize(server: RemoteOpenAIServer, model_name: str):
-    async def get_outputs(normalize):
+async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
+    async def get_outputs(use_activation):
         request_args = {
             "model": MODEL_NAME,
             "input": input_text,
             "encoding_format": "float",
-            "normalize": normalize,
+            "use_activation": use_activation,
         }
 
         response = requests.post(server.url_for("v1/embeddings"), json=request_args)
@@ -697,9 +697,9 @@ async def get_outputs(normalize):
 
         return torch.tensor([x["embedding"] for x in outputs["data"]])
 
-    default = await get_outputs(normalize=None)
-    w_normal = await get_outputs(normalize=True)
-    wo_normal = await get_outputs(normalize=False)
+    default = await get_outputs(use_activation=None)
+    w_normal = await get_outputs(use_activation=True)
+    wo_normal = await get_outputs(use_activation=False)
 
     assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
     assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py
index 1fc3354a41cd..ec8f4804b286 100755
--- a/vllm/entrypoints/grpc_server.py
+++ b/vllm/entrypoints/grpc_server.py
@@ -101,11 +101,15 @@ async def Generate(
             sampling_params = self._sampling_params_from_proto(
                 request.sampling_params, stream=request.stream
             )
+            tokenization_kwargs = self._tokenization_kwargs_from_proto(
+                request.sampling_params
+            )
 
             async for output in self.async_llm.generate(
                 prompt=prompt,
                 sampling_params=sampling_params,
                 request_id=request_id,
+                tokenization_kwargs=tokenization_kwargs,
             ):
                 # Convert vLLM output to protobuf
                 # For streaming, always send chunks
@@ -308,9 +312,6 @@ def _sampling_params_from_proto(
             seed=params.seed if params.HasField("seed") else None,
             include_stop_str_in_output=params.include_stop_str_in_output,
             logit_bias=dict(params.logit_bias) if params.logit_bias else None,
-            truncate_prompt_tokens=params.truncate_prompt_tokens
-            if params.HasField("truncate_prompt_tokens")
-            else None,
             structured_outputs=structured_outputs,
             # detokenize must be True if stop strings are used
             detokenize=bool(stop),
@@ -319,6 +320,14 @@ def _sampling_params_from_proto(
             else RequestOutputKind.FINAL_ONLY,
         )
 
+    @staticmethod
+    def _tokenization_kwargs_from_proto(
+        params: vllm_engine_pb2.SamplingParams,
+    ) -> dict[str, int] | None:
+        if params.HasField("truncate_prompt_tokens"):
+            return {"truncate_prompt_tokens": params.truncate_prompt_tokens}
+        return None
+
     @staticmethod
     def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
         """
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index ee78d4d4894c..b3260f9144ec 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-import warnings
 from collections.abc import Callable, Iterable, Sequence
 from typing import TYPE_CHECKING, Any
 
@@ -1030,7 +1029,6 @@ def encode(
         prompts: PromptType | Sequence[PromptType] | DataPrompt,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         *,
-        truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         pooling_task: PoolingTask | None = None,
@@ -1088,20 +1086,6 @@ def encode(
                 "pooling model."
             )
 
-        if truncate_prompt_tokens is not None:
-            warnings.warn(
-                "The `truncate_prompt_tokens` parameter in `LLM.encode()` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=truncate_prompt_tokens),
-            )
-
         if use_io_processor := (isinstance(prompts, dict) and "data" in prompts):
             if self.io_processor is None:
                 raise ValueError(
@@ -1185,7 +1169,6 @@ def embed(
         self,
         prompts: PromptType | Sequence[PromptType],
         *,
-        truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
@@ -1221,12 +1204,6 @@ def embed(
                 "Try converting the model using `--convert embed`."
             )
 
-        if truncate_prompt_tokens is not None:
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=truncate_prompt_tokens),
-            )
-
         items = self.encode(
             prompts,
             use_tqdm=use_tqdm,
@@ -1294,7 +1271,6 @@ def reward(
         /,
         *,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
-        truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
@@ -1319,13 +1295,11 @@ def reward(
             A list of `PoolingRequestOutput` objects containing the
             pooled hidden states in the same order as the input prompts.
         """
-
         return self.encode(
             prompts,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
             pooling_params=pooling_params,
-            truncate_prompt_tokens=truncate_prompt_tokens,
             pooling_task="token_classify",
             tokenization_kwargs=tokenization_kwargs,
         )
@@ -1771,23 +1745,15 @@ def _add_completion_requests(
         seq_prompts = prompt_to_seq(prompts)
         seq_params = self._params_to_seq(params, len(seq_prompts))
         seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_prompts))
-        seq_tok_kwargs = [
-            merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
-            )
-            for param in seq_params
-        ]
         seq_priority = self._priority_to_seq(priority, len(prompts))
 
         return self._render_and_add_requests(
             prompts=(
-                self._preprocess_cmpl_one(prompt, tok_kwargs)
-                for prompt, tok_kwargs in zip(
-                    maybe_tqdm(
-                        seq_prompts, use_tqdm=use_tqdm, desc="Rendering prompts"
-                    ),
-                    seq_tok_kwargs,
+                self._preprocess_cmpl_one(prompt, tokenization_kwargs)
+                for prompt in maybe_tqdm(
+                    seq_prompts,
+                    use_tqdm=use_tqdm,
+                    desc="Rendering prompts",
                 )
             ),
             params=seq_params,
@@ -1841,13 +1807,6 @@ def _run_chat(
         seq_convs = conversation_to_seq(messages)
         seq_params = self._params_to_seq(params, len(seq_convs))
         seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs))
-        seq_tok_kwargs = [
-            merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
-            )
-            for param in seq_params
-        ]
 
         return self._render_and_run_requests(
             prompts=(
@@ -1859,16 +1818,13 @@ def _run_chat(
                     add_generation_prompt=add_generation_prompt,
                     continue_final_message=continue_final_message,
                     tools=tools,
-                    tokenization_kwargs=tok_kwargs,
+                    tokenization_kwargs=tokenization_kwargs,
                     mm_processor_kwargs=mm_processor_kwargs,
                 )
-                for conversation, tok_kwargs in zip(
-                    maybe_tqdm(
-                        seq_convs,
-                        use_tqdm=use_tqdm,
-                        desc="Rendering conversations",
-                    ),
-                    seq_tok_kwargs,
+                for conversation in maybe_tqdm(
+                    seq_convs,
+                    use_tqdm=use_tqdm,
+                    desc="Rendering conversations",
                 )
             ),
             params=seq_params,
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index 12bbc44a0a53..edba28a59e01 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -490,7 +490,6 @@ def to_sampling_params(
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
             include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index 02e6e0d036ac..222640439267 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -302,7 +302,6 @@ def to_sampling_params(
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
             include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
diff --git a/vllm/entrypoints/openai/translations/__init__.py b/vllm/entrypoints/openai/translations/__init__.py
deleted file mode 100644
index cf210d50571f..000000000000
--- a/vllm/entrypoints/openai/translations/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "The 'vllm.entrypoints.openai.translations' module has been renamed to "
-    "'vllm.entrypoints.openai.speech_to_text'. Please update your imports. "
-    "This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
diff --git a/vllm/entrypoints/openai/translations/api_router.py b/vllm/entrypoints/openai/translations/api_router.py
deleted file mode 100644
index 4a43bf8b9ca4..000000000000
--- a/vllm/entrypoints/openai/translations/api_router.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.api_router' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.api_router import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/openai/translations/protocol.py b/vllm/entrypoints/openai/translations/protocol.py
deleted file mode 100644
index c8ec156d94b1..000000000000
--- a/vllm/entrypoints/openai/translations/protocol.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.protocol' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.protocol import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/openai/translations/serving.py b/vllm/entrypoints/openai/translations/serving.py
deleted file mode 100644
index 1749d6155aa3..000000000000
--- a/vllm/entrypoints/openai/translations/serving.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.serving' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.serving import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
deleted file mode 100644
index eb26c6a83079..000000000000
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
-    "your imports. This backward-compatible alias will be removed in version "
-    "0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/pooling/base/protocol.py b/vllm/entrypoints/pooling/base/protocol.py
index 86dc12cbdf14..53945108d7f3 100644
--- a/vllm/entrypoints/pooling/base/protocol.py
+++ b/vllm/entrypoints/pooling/base/protocol.py
@@ -190,10 +190,6 @@ class EmbedRequestMixin(EncodingRequestMixin):
         description="Whether to use activation for the pooler outputs. "
         "`None` uses the pooler's default, which is `True` in most cases.",
     )
-    normalize: bool | None = Field(
-        default=None,
-        description="Deprecated; please pass `use_activation` instead",
-    )
     # --8<-- [end:embed-extra-params]
 
 
diff --git a/vllm/entrypoints/pooling/classify/protocol.py b/vllm/entrypoints/pooling/classify/protocol.py
index 3c4bbd8c2c1e..bfc38ebef2a6 100644
--- a/vllm/entrypoints/pooling/classify/protocol.py
+++ b/vllm/entrypoints/pooling/classify/protocol.py
@@ -40,7 +40,6 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
     def to_pooling_params(self):
         return PoolingParams(
             task="classify",
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
 
@@ -63,7 +62,6 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
     def to_pooling_params(self):
         return PoolingParams(
             task="classify",
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
 
diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py
index 4f83105f27e7..4b47c6522e42 100644
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -14,12 +14,9 @@
     EmbedRequestMixin,
     PoolingBasicRequestMixin,
 )
-from vllm.logger import init_logger
 from vllm.renderers import TokenizeParams
 from vllm.utils import random_uuid
 
-logger = init_logger(__name__)
-
 
 def _get_max_total_output_tokens(
     model_config: ModelConfig,
@@ -60,18 +57,10 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
         )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task="embed",
             dimensions=self.dimensions,
             use_activation=self.use_activation,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
         )
 
 
@@ -97,18 +86,10 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
         )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task="embed",
             dimensions=self.dimensions,
             use_activation=self.use_activation,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
         )
 
 
diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py
index a8c1c59ff796..b99f98959abc 100644
--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -16,13 +16,10 @@
     EncodingRequestMixin,
     PoolingBasicRequestMixin,
 )
-from vllm.logger import init_logger
 from vllm.renderers import TokenizeParams
 from vllm.tasks import PoolingTask
 from vllm.utils import random_uuid
 
-logger = init_logger(__name__)
-
 
 class PoolingCompletionRequest(
     PoolingBasicRequestMixin,
@@ -45,16 +42,8 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
         )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task=self.task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
             dimensions=self.dimensions,
         )
@@ -78,16 +67,8 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
         )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task=self.task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
             dimensions=self.dimensions,
         )
diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py
index a85ed5d707d3..643eeed36ed3 100644
--- a/vllm/entrypoints/pooling/score/protocol.py
+++ b/vllm/entrypoints/pooling/score/protocol.py
@@ -37,7 +37,6 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
     def to_pooling_params(self, task: PoolingTask = "score"):
         return PoolingParams(
             task=task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
 
@@ -113,7 +112,6 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
     def to_pooling_params(self, task: PoolingTask = "score"):
         return PoolingParams(
             task=task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
 
diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py
index fc8912f8c76b..1f6751f6c8b1 100644
--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -289,9 +289,6 @@ def get_temporal_copy_spec(
     )
 
 
-get_full_copy_spec = get_temporal_copy_spec
-
-
 class MambaStateCopyFuncCalculator:
     @classmethod
     def linear_attention_state_copy_func(cls):
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 2d9385c572a0..57559ba99c11 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -43,12 +43,9 @@
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 
 IMAGE_TOKEN = "<image>"
-IMAGE_PLACEHOLDER_ID = 151669
 VIDEO_TOKEN = "<video>"
-VIDEO_PLACEHOLDER_ID = 151670
 INDICATOR_IDS = [151672, 151673, 151674, 151675]
 IMAGE_PAD_TOKEN_ID = 151655
-THINK_END_TOKEN_ID = 151668
 
 
 class Ovis2_5ImagePatchInputs(TensorSchema):
diff --git a/vllm/multimodal/processing/processor.py b/vllm/multimodal/processing/processor.py
index 67d3ab32dfad..84720a554d15 100644
--- a/vllm/multimodal/processing/processor.py
+++ b/vllm/multimodal/processing/processor.py
@@ -17,7 +17,7 @@
 
 import regex as re
 import torch
-from typing_extensions import TypeVar, assert_never, deprecated
+from typing_extensions import TypeVar, assert_never
 
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
@@ -996,16 +996,6 @@ def __init__(
 
         self.data_parser = self.info.get_data_parser()
 
-    @property
-    @deprecated("Will be removed in v0.17. Use `info.supported_mm_limits` instead.")
-    def supported_mm_limits(self):
-        return self.info.supported_mm_limits
-
-    @property
-    @deprecated("Will be removed in v0.17. Use `info.allowed_mm_limits` instead.")
-    def allowed_mm_limits(self):
-        return self.info.allowed_mm_limits
-
     def __call__(
         self,
         prompt: str,
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index d94faa67557f..886756c99da9 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import mimetypes
-import warnings
 from collections import defaultdict
 from collections.abc import Generator, Sequence
 from itertools import groupby
@@ -30,23 +29,6 @@
     torch = LazyLoader("torch", globals(), "torch")
 
 
-def __getattr__(name: str):
-    if name == "MEDIA_CONNECTOR_REGISTRY":
-        from .media import MEDIA_CONNECTOR_REGISTRY
-
-        warnings.warn(
-            "`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
-            "has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
-            "The old name will be removed in v0.17.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        return MEDIA_CONNECTOR_REGISTRY
-
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
-
-
 def encode_audio_base64(
     audio: np.ndarray,
     sampling_rate: int,
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 75d441d74cd9..487a9383933e 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
-from typing import Annotated, Any
+from typing import Any
 
 import msgspec
 
@@ -19,10 +19,6 @@ class PoolingParams(
     """API parameters for pooling models.
 
     Attributes:
-        truncate_prompt_tokens: Controls prompt truncation.
-            Set to -1 to use the model's default truncation size.
-            Set to k to keep only the last k tokens (left truncation).
-            Set to None to disable truncation.
         use_activation: Whether to apply activation function to the pooler outputs.
             `None` uses the pooler's default, which is `True` in most cases.
         dimensions: Reduce the dimensions of embeddings
@@ -30,7 +26,6 @@ class PoolingParams(
     """
 
     # --8<-- [start:common-pooling-params]
-    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
     use_activation: bool | None = None
     # --8<-- [end:common-pooling-params]
 
@@ -198,7 +193,6 @@ def __repr__(self) -> str:
             f"returned_token_ids={self.returned_token_ids}, "
             f"requires_token_ids={self.requires_token_ids}, "
             f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
             f"extra_kwargs={self.extra_kwargs})"
         )
 
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index e36a90d6c568..866202950b1f 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -7,7 +7,7 @@
 from dataclasses import field
 from enum import Enum, IntEnum
 from functools import cached_property
-from typing import Annotated, Any
+from typing import Any
 
 import msgspec
 from pydantic.dataclasses import dataclass
@@ -209,10 +209,6 @@ class SamplingParams(
     """Whether to add spaces between special tokens in the output."""
     include_stop_str_in_output: bool = False
     """Whether to include the stop strings in output text."""
-    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
-    """If set to -1, will use the truncation size supported by the model. If
-    set to an integer k, will use only the last k tokens from the prompt
-    (i.e., left truncation). If set to `None`, truncation is disabled."""
     output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE
     skip_clone: bool = False
     """Internal flag indicating that this SamplingParams instance is safe to
@@ -273,7 +269,6 @@ def from_optional(
         detokenize: bool = True,
         skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
-        truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
         output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
         structured_outputs: StructuredOutputsParams | None = None,
         logit_bias: dict[int, float] | dict[str, float] | None = None,
@@ -313,7 +308,6 @@ def from_optional(
             detokenize=detokenize,
             skip_special_tokens=skip_special_tokens,
             spaces_between_special_tokens=spaces_between_special_tokens,
-            truncate_prompt_tokens=truncate_prompt_tokens,
             output_kind=output_kind,
             structured_outputs=structured_outputs,
             logit_bias=logit_bias,
@@ -449,15 +443,6 @@ def _verify_args(self) -> None:
                 parameter="prompt_logprobs",
                 value=self.prompt_logprobs,
             )
-        if self.truncate_prompt_tokens is not None and (
-            self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1
-        ):
-            raise VLLMValidationError(
-                f"truncate_prompt_tokens must be an integer >= 1 or -1, "
-                f"got {self.truncate_prompt_tokens}",
-                parameter="truncate_prompt_tokens",
-                value=self.truncate_prompt_tokens,
-            )
         assert isinstance(self.stop_token_ids, list)
         if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
             raise ValueError(
@@ -835,7 +820,6 @@ def __repr__(self) -> str:
             f"skip_special_tokens={self.skip_special_tokens}, "
             "spaces_between_special_tokens="
             f"{self.spaces_between_special_tokens}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
             f"structured_outputs={self.structured_outputs}, "
             f"extra_args={self.extra_args})"
         )
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index b4b193abbdf3..ad70f839d9c5 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
-import warnings
 from collections.abc import Mapping
 from typing import Any, Literal
 
@@ -114,16 +113,6 @@ def _validate_params(
         supported_tasks: tuple[SupportedTask, ...],
     ) -> None:
         """Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
-        if params.truncate_prompt_tokens is not None:
-            params_type = type(params).__name__
-            warnings.warn(
-                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
-                "is deprecated and will be removed in v0.17. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
         if isinstance(params, SamplingParams):
             supported_generation_tasks = [
                 task for task in supported_tasks if task in GENERATION_TASKS

From e94b263bd6557dc54582bfc5ba74f0a631bd642d Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 1 Mar 2026 03:22:41 +0800
Subject: [PATCH 130/154] [Chore] Cleanup BNB utilization dead code (#35620)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/linear.py | 39 ----------------------------
 1 file changed, 39 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 7c228cc9083d..f0d06e179f33 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,7 +3,6 @@
 
 import itertools
 from abc import abstractmethod
-from typing import Any
 
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
@@ -133,44 +132,6 @@ def adjust_scalar_to_fused_array(
     return param_data[shard_id], loaded_weight
 
 
-# TODO(Isotr0py): We might need a more flexible structure to handle
-# bitsandbytes shard offsets.
-def left_shift_bitsandbytes_4bit_shard(
-    bnb_weight_attrs: dict[str, Any],
-) -> tuple[dict[str, Any], dict[str, Any]]:
-    """
-    Separate the BitsAndBytes 4-bit shard.
-
-    For example, given bnb weight attributes as below:
-    {
-        'bnb_shard_offsets': array([0, 4, 8, 16]),
-        'bnb_quant_state': {0: ..., 1: ..., 2: ...},
-    }
-
-    The function will return:
-    {
-        'bnb_shard_offsets': array([0, 4]),
-        'bnb_quant_state': {0: ...},
-    }
-    and
-    {
-        'bnb_shard_offsets': array([0, 4, 12]),
-        'bnb_quant_state': {0: ..., 1: ...},
-    }
-    """
-    shard_offsets = bnb_weight_attrs["bnb_shard_offsets"]
-    offset_l = shard_offsets[:2]
-    offset_r = shard_offsets[1:] - shard_offsets[1]
-    quant_state_l = {0: bnb_weight_attrs["bnb_quant_state"][0]}
-    quant_state_r = {
-        i - 1: bnb_weight_attrs["bnb_quant_state"][i]
-        for i in range(1, len(shard_offsets) - 1)
-    }
-    left = dict(bnb_shard_offsets=offset_l, bnb_quant_state=quant_state_l)
-    right = dict(bnb_shard_offsets=offset_r, bnb_quant_state=quant_state_r)
-    return left, right
-
-
 class LinearMethodBase(QuantizeMethodBase):
     """Base class for different (maybe quantized) linear methods."""
 

From 95a395dbec08e795ea4eb30494b7a86c8e906c08 Mon Sep 17 00:00:00 2001
From: Martin Vit <martin@voipmonitor.org>
Date: Sat, 28 Feb 2026 21:57:08 +0100
Subject: [PATCH 131/154] [Bugfix] Fix Anthropic API base64 image handling in
 Messages endpoint (#35557)

Signed-off-by: Martin Vit <martin@voipmonitor.org>
---
 .../test_anthropic_messages_conversion.py     | 326 ++++++++++++++++++
 vllm/entrypoints/anthropic/serving.py         |  68 +++-
 2 files changed, 389 insertions(+), 5 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_anthropic_messages_conversion.py

diff --git a/tests/entrypoints/openai/test_anthropic_messages_conversion.py b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
new file mode 100644
index 000000000000..3647c187f519
--- /dev/null
+++ b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
@@ -0,0 +1,326 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for Anthropic-to-OpenAI request conversion.
+
+Tests the image source handling and tool_result content parsing in
+AnthropicServingMessages._convert_anthropic_to_openai_request().
+"""
+
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicMessagesRequest,
+)
+from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
+
+_convert = AnthropicServingMessages._convert_anthropic_to_openai_request
+_img_url = AnthropicServingMessages._convert_image_source_to_url
+
+
+def _make_request(
+    messages: list[dict],
+    **kwargs,
+) -> AnthropicMessagesRequest:
+    return AnthropicMessagesRequest(
+        model="test-model",
+        max_tokens=128,
+        messages=messages,
+        **kwargs,
+    )
+
+
+# ======================================================================
+# _convert_image_source_to_url
+# ======================================================================
+
+
+class TestConvertImageSourceToUrl:
+    def test_base64_source(self):
+        source = {
+            "type": "base64",
+            "media_type": "image/jpeg",
+            "data": "iVBORw0KGgo=",
+        }
+        assert _img_url(source) == "data:image/jpeg;base64,iVBORw0KGgo="
+
+    def test_base64_png(self):
+        source = {
+            "type": "base64",
+            "media_type": "image/png",
+            "data": "AAAA",
+        }
+        assert _img_url(source) == "data:image/png;base64,AAAA"
+
+    def test_url_source(self):
+        source = {
+            "type": "url",
+            "url": "https://example.com/image.jpg",
+        }
+        assert _img_url(source) == "https://example.com/image.jpg"
+
+    def test_missing_type_defaults_to_base64(self):
+        """When 'type' is absent, treat as base64."""
+        source = {
+            "media_type": "image/webp",
+            "data": "UklGR",
+        }
+        assert _img_url(source) == "data:image/webp;base64,UklGR"
+
+    def test_missing_media_type_defaults_to_jpeg(self):
+        source = {"type": "base64", "data": "abc123"}
+        assert _img_url(source) == "data:image/jpeg;base64,abc123"
+
+    def test_url_source_missing_url_returns_empty(self):
+        source = {"type": "url"}
+        assert _img_url(source) == ""
+
+    def test_empty_source_returns_data_uri_shell(self):
+        source: dict = {}
+        assert _img_url(source) == "data:image/jpeg;base64,"
+
+
+# ======================================================================
+# Image blocks inside user messages
+# ======================================================================
+
+
+class TestImageContentBlocks:
+    def test_base64_image_in_user_message(self):
+        request = _make_request(
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Describe this image"},
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": "image/jpeg",
+                                "data": "iVBORw0KGgo=",
+                            },
+                        },
+                    ],
+                }
+            ]
+        )
+
+        result = _convert(request)
+        user_msg = result.messages[0]
+        assert user_msg["role"] == "user"
+
+        parts = user_msg["content"]
+        assert len(parts) == 2
+        assert parts[0] == {"type": "text", "text": "Describe this image"}
+        assert parts[1] == {
+            "type": "image_url",
+            "image_url": {"url": "data:image/jpeg;base64,iVBORw0KGgo="},
+        }
+
+    def test_url_image_in_user_message(self):
+        request = _make_request(
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is this?"},
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "url",
+                                "url": "https://example.com/cat.png",
+                            },
+                        },
+                    ],
+                }
+            ]
+        )
+
+        result = _convert(request)
+        parts = result.messages[0]["content"]
+        assert parts[1] == {
+            "type": "image_url",
+            "image_url": {"url": "https://example.com/cat.png"},
+        }
+
+
+# ======================================================================
+# tool_result content handling
+# ======================================================================
+
+
+class TestToolResultContent:
+    def _make_tool_result_request(
+        self, tool_result_content
+    ) -> AnthropicMessagesRequest:
+        """Build a request with assistant tool_use followed by user
+        tool_result."""
+        return _make_request(
+            [
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_use",
+                            "id": "call_001",
+                            "name": "read_file",
+                            "input": {"path": "/tmp/img.png"},
+                        }
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "tool_result",
+                            "tool_use_id": "call_001",
+                            "content": tool_result_content,
+                        }
+                    ],
+                },
+            ]
+        )
+
+    def test_tool_result_string_content(self):
+        request = self._make_tool_result_request("file contents here")
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "file contents here"
+        assert tool_msg[0]["tool_call_id"] == "call_001"
+
+    def test_tool_result_text_blocks(self):
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "line 1"},
+                {"type": "text", "text": "line 2"},
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "line 1\nline 2"
+
+    def test_tool_result_with_image(self):
+        """Image in tool_result should produce a follow-up user message."""
+        request = self._make_tool_result_request(
+            [
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": "AAAA",
+                    },
+                }
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == ""
+
+        # The image should be injected as a follow-up user message
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        img_parts = follow_up[0]["content"]
+        assert len(img_parts) == 1
+        assert img_parts[0] == {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,AAAA"},
+        }
+
+    def test_tool_result_with_text_and_image(self):
+        """Mixed text+image tool_result: text in tool msg, image in user
+        msg."""
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "Here is the screenshot"},
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": "QUFB",
+                    },
+                },
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "Here is the screenshot"
+
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        assert follow_up[0]["content"][0]["image_url"]["url"] == (
+            "data:image/jpeg;base64,QUFB"
+        )
+
+    def test_tool_result_with_multiple_images(self):
+        request = self._make_tool_result_request(
+            [
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": "IMG1",
+                    },
+                },
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "url",
+                        "url": "https://example.com/img2.jpg",
+                    },
+                },
+            ]
+        )
+        result = _convert(request)
+
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        urls = [p["image_url"]["url"] for p in follow_up[0]["content"]]
+        assert urls == [
+            "data:image/png;base64,IMG1",
+            "https://example.com/img2.jpg",
+        ]
+
+    def test_tool_result_none_content(self):
+        request = self._make_tool_result_request(None)
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == ""
+
+    def test_tool_result_no_follow_up_when_no_images(self):
+        """Ensure no extra user message is added when there are no images."""
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "just text"},
+            ]
+        )
+        result = _convert(request)
+
+        user_follow_ups = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(user_follow_ups) == 0
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index 6318f854a1e4..82af26476b05 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -86,8 +86,30 @@ def __init__(
             "tool_calls": "tool_use",
         }
 
+    @staticmethod
+    def _convert_image_source_to_url(source: dict[str, Any]) -> str:
+        """Convert an Anthropic image source to an OpenAI-compatible URL.
+
+        Anthropic supports two image source types:
+        - base64: {"type": "base64", "media_type": "image/jpeg", "data": "..."}
+        - url: {"type": "url", "url": "https://..."}
+
+        For base64 sources, this constructs a proper data URI that
+        downstream processors (e.g. vLLM's media connector) can handle.
+        """
+        source_type = source.get("type")
+        if source_type == "url":
+            return source.get("url", "")
+        # Default to base64 processing if type is "base64"
+        # or missing, ensuring a proper data URI is always
+        # constructed for non-URL sources.
+        media_type = source.get("media_type", "image/jpeg")
+        data = source.get("data", "")
+        return f"data:{media_type};base64,{data}"
+
+    @classmethod
     def _convert_anthropic_to_openai_request(
-        self, anthropic_request: AnthropicMessagesRequest
+        cls, anthropic_request: AnthropicMessagesRequest
     ) -> ChatCompletionRequest:
         """Convert Anthropic message format to OpenAI format"""
         openai_messages = []
@@ -119,10 +141,11 @@ def _convert_anthropic_to_openai_request(
                     if block.type == "text" and block.text:
                         content_parts.append({"type": "text", "text": block.text})
                     elif block.type == "image" and block.source:
+                        image_url = cls._convert_image_source_to_url(block.source)
                         content_parts.append(
                             {
                                 "type": "image_url",
-                                "image_url": {"url": block.source.get("data", "")},
+                                "image_url": {"url": image_url},
                             }
                         )
                     elif block.type == "thinking" and block.thinking is not None:
@@ -140,15 +163,50 @@ def _convert_anthropic_to_openai_request(
                         tool_calls.append(tool_call)
                     elif block.type == "tool_result":
                         if msg.role == "user":
+                            # Parse tool_result content which can be
+                            # a string or a list of content blocks
+                            # (text, image, etc.)
+                            tool_text = ""
+                            tool_image_urls: list[str] = []
+                            if isinstance(block.content, str):
+                                tool_text = block.content
+                            elif isinstance(block.content, list):
+                                text_parts: list[str] = []
+                                for item in block.content:
+                                    if not isinstance(item, dict):
+                                        continue
+                                    item_type = item.get("type")
+                                    if item_type == "text":
+                                        text_parts.append(item.get("text", ""))
+                                    elif item_type == "image":
+                                        source = item.get("source", {})
+                                        url = cls._convert_image_source_to_url(source)
+                                        if url:
+                                            tool_image_urls.append(url)
+                                tool_text = "\n".join(text_parts)
                             openai_messages.append(
                                 {
                                     "role": "tool",
                                     "tool_call_id": block.tool_use_id or "",
-                                    "content": str(block.content)
-                                    if block.content
-                                    else "",
+                                    "content": tool_text or "",
                                 }
                             )
+                            # OpenAI tool messages only support string
+                            # content, so inject images from tool
+                            # results as a follow-up user message
+                            if tool_image_urls:
+                                openai_messages.append(
+                                    {
+                                        "role": "user",
+                                        "content": [  # type: ignore[dict-item]
+                                            {
+                                                "type": "image_url",
+                                                "image_url": {"url": img},
+                                            }
+                                            for img in tool_image_urls
+                                        ],
+                                    }
+                                )
                         else:
                             # Assistant tool result becomes regular text
                             tool_result_text = (

From e3eb146f7ad4bc920e11e98cf88cee3839cf5f89 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sat, 28 Feb 2026 13:19:45 -0800
Subject: [PATCH 132/154] [Model Runner V2] Add ModelStateInterface [4/N]
 (#35621)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/cudagraph_utils.py         |  2 +-
 vllm/v1/worker/gpu/model_runner.py            |  4 +-
 vllm/v1/worker/gpu/model_states/__init__.py   | 18 +++++
 .../default.py}                               |  3 +-
 vllm/v1/worker/gpu/model_states/interface.py  | 67 +++++++++++++++++++
 5 files changed, 90 insertions(+), 4 deletions(-)
 create mode 100644 vllm/v1/worker/gpu/model_states/__init__.py
 rename vllm/v1/worker/gpu/{model_states.py => model_states/default.py} (98%)
 create mode 100644 vllm/v1/worker/gpu/model_states/interface.py

diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
index 6e43043bc2ff..783715cfe984 100644
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -22,7 +22,7 @@
 from vllm.v1.worker.gpu.block_table import BlockTables
 from vllm.v1.worker.gpu.dp_utils import make_num_tokens_across_dp
 from vllm.v1.worker.gpu.input_batch import InputBuffers
-from vllm.v1.worker.gpu.model_states import ModelState
+from vllm.v1.worker.gpu.model_states.interface import ModelState
 from vllm.v1.worker.utils import AttentionGroup
 
 
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 188a2694e991..ca44ad164388 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -78,7 +78,7 @@
 )
 from vllm.v1.worker.gpu.lora_utils import LoraState
 from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
-from vllm.v1.worker.gpu.model_states import ModelState
+from vllm.v1.worker.gpu.model_states import init_model_state
 from vllm.v1.worker.gpu.pool.pooling_runner import PoolingRunner
 from vllm.v1.worker.gpu.pp_utils import pp_broadcast, pp_receive
 from vllm.v1.worker.gpu.sample.output import SamplerOutput
@@ -267,7 +267,7 @@ def load_model(self, *args, **kwargs) -> None:
             prepare_communication_buffer_for_model(self.speculator)
 
         # Initialize the components that require the model.
-        self.model_state = ModelState(
+        self.model_state = init_model_state(
             self.vllm_config, self.model, self.encoder_cache, self.device
         )
         if self.is_pooling_model:
diff --git a/vllm/v1/worker/gpu/model_states/__init__.py b/vllm/v1/worker/gpu/model_states/__init__.py
new file mode 100644
index 000000000000..3ddce0fdcb0b
--- /dev/null
+++ b/vllm/v1/worker/gpu/model_states/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+
+from vllm.config import VllmConfig
+from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
+
+
+def init_model_state(
+    vllm_config: VllmConfig,
+    model: nn.Module,
+    encoder_cache: EncoderCache | None,
+    device: torch.device,
+):
+    from vllm.v1.worker.gpu.model_states.default import DefaultModelState
+
+    return DefaultModelState(vllm_config, model, encoder_cache, device)
diff --git a/vllm/v1/worker/gpu/model_states.py b/vllm/v1/worker/gpu/model_states/default.py
similarity index 98%
rename from vllm/v1/worker/gpu/model_states.py
rename to vllm/v1/worker/gpu/model_states/default.py
index ca4d63e6bc6f..d52f7d0ec960 100644
--- a/vllm/v1/worker/gpu/model_states.py
+++ b/vllm/v1/worker/gpu/model_states/default.py
@@ -13,11 +13,12 @@
 from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
 from vllm.v1.worker.gpu.mm.encoder_runner import EncoderRunner
 from vllm.v1.worker.gpu.mm.mrope_utils import MRopeState
+from vllm.v1.worker.gpu.model_states.interface import ModelState
 from vllm.v1.worker.gpu.states import RequestState
 from vllm.v1.worker.utils import AttentionGroup
 
 
-class ModelState:
+class DefaultModelState(ModelState):
     def __init__(
         self,
         vllm_config: VllmConfig,
diff --git a/vllm/v1/worker/gpu/model_states/interface.py b/vllm/v1/worker/gpu/model_states/interface.py
new file mode 100644
index 000000000000..d5a25710cb09
--- /dev/null
+++ b/vllm/v1/worker/gpu/model_states/interface.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from typing import Any
+
+import torch
+import torch.nn as nn
+
+from vllm.config import VllmConfig
+from vllm.v1.core.sched.output import NewRequestData
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.worker.gpu.input_batch import InputBatch
+from vllm.v1.worker.gpu.mm.encoder_cache import EncoderCache
+from vllm.v1.worker.gpu.states import RequestState
+from vllm.v1.worker.utils import AttentionGroup
+
+
+class ModelState(ABC):
+    @abstractmethod
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        model: nn.Module,
+        encoder_cache: EncoderCache | None,
+        device: torch.device,
+    ) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_request(self, req_index: int, new_req_data: NewRequestData) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_staged_writes(self) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_mm_embeddings(
+        self,
+        scheduled_encoder_inputs: dict[str, list[int]],
+        input_batch: InputBatch,
+        req_states: RequestState,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def prepare_inputs(
+        self, input_batch: InputBatch, req_states: RequestState
+    ) -> dict[str, torch.Tensor | None]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def prepare_dummy_inputs(
+        self, num_reqs: int, num_tokens: int
+    ) -> dict[str, torch.Tensor | None]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def prepare_attn(
+        self,
+        input_batch: InputBatch,
+        block_tables: tuple[torch.Tensor, ...],
+        slot_mappings: torch.Tensor,
+        attn_groups: list[list[AttentionGroup]],
+        kv_cache_config: KVCacheConfig,
+    ) -> dict[str, Any]:
+        raise NotImplementedError

From 795da8e37467428c224823dd7330f01d196a8772 Mon Sep 17 00:00:00 2001
From: zhenwei-intel <zhenwei.liu@intel.com>
Date: Sat, 28 Feb 2026 18:52:21 -0800
Subject: [PATCH 133/154] update dockerfile

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 docker/Dockerfile.xpu | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 91dffefdedfb..3ed6de8fc722 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -128,20 +128,15 @@ RUN apt-get update && apt-get install -y \
     wget \
     curl \
     git \
-    build-essential 
-
-RUN apt-get install -y \
+    build-essential \
     autoconf \
     automake \
     libtool \
-    pkg-config
-
-RUN apt-get update && apt-get install -y \
+    pkg-config \
     rdma-core \
     libibverbs-dev \
     ibverbs-utils \
     libibverbs1 \
-    ibverbs-providers \
     librdmacm-dev \
     librdmacm1 \
     libibumad-dev \
@@ -156,17 +151,16 @@ RUN apt-get update && apt-get install -y \
     ibverbs-providers \
     librdmacm1t64
 
-RUN git clone https://github.com/openucx/ucx /tmp/ucx_source && \
+ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:${PKG_CONFIG_PATH}
+ENV LD_LIBRARY_PATH=/tmp/ucx_install/lib:${LD_LIBRARY_PATH}
+RUN --mount=type=cache,target=/root/.cache/uv \
+    git clone https://github.com/openucx/ucx /tmp/ucx_source && \
     cd /tmp/ucx_source && git checkout "${UCX_VERSION}" && \
     bash autogen.sh && \
     ./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt && \
     make CFLAGS="-Wno-error=incompatible-pointer-types" -j8 && make install && \
     git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source && \
-    cd /tmp/nixl_source && git checkout "${NIXL_VERSION}"
-
-ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:${PKG_CONFIG_PATH}
-
-RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /tmp/nixl_source && git checkout "${NIXL_VERSION}" && \
     cd /tmp/nixl_source && \
     uv pip install --upgrade meson pybind11 patchelf && \
     uv pip install -r requirements.txt && \

From 3ecd0bf9fccc425c015f7723b6a7730c0dda2970 Mon Sep 17 00:00:00 2001
From: gnovack <gnovack@amazon.com>
Date: Sat, 28 Feb 2026 18:55:25 -0800
Subject: [PATCH 134/154] Add TMA support to fused_moe_lora kernel (#32195)

Signed-off-by: gnovack <gnovack@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_fused_moe_lora_kernel.py      |  35 ++-
 tests/lora/test_olmoe_tp.py                   |  39 +++
 vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 238 ++++++++++++++----
 vllm/lora/ops/triton_ops/utils.py             |   6 +
 vllm/triton_utils/allocation.py               |  13 +
 5 files changed, 279 insertions(+), 52 deletions(-)
 create mode 100644 vllm/triton_utils/allocation.py

diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
index b2db7968ec36..3df3a606c040 100644
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -231,17 +231,22 @@ def use_torch(
     lora_a_stacked,
     lora_b_stacked,
     top_k_num,
+    num_slices=1,
 ):
     outputs = []
     for i in range(hidden_states.shape[0]):
-        lora_idx = token_lora_mapping[i]
-        expert_ids = topk_ids[i]
-        lora_a = lora_a_stacked[0][lora_idx][expert_ids]
-        lora_b = lora_b_stacked[0][lora_idx][expert_ids]
-        tensors = [
-            hidden_states[i] @ lora_a[x].T @ lora_b[x].T for x in range(top_k_num)
-        ]
-        outputs.append(torch.stack(tensors, dim=0))
+        slice_tensors = []
+        for slice_id in range(num_slices):
+            lora_idx = token_lora_mapping[i]
+            expert_ids = topk_ids[i]
+            lora_a = lora_a_stacked[slice_id][lora_idx][expert_ids]
+            lora_b = lora_b_stacked[slice_id][lora_idx][expert_ids]
+            tensors = [
+                hidden_states[i] @ lora_a[x].T @ lora_b[x].T for x in range(top_k_num)
+            ]
+            slice_tensors.append(torch.stack(tensors, dim=0))
+
+        outputs.append(torch.concat(slice_tensors, dim=-1))
     return torch.stack(outputs, dim=0)
 
 
@@ -259,6 +264,7 @@ def use_torch(
 @pytest.mark.parametrize("K", [2048])
 @pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
 @pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_slices", [1, 2])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("seed", SEED)
@@ -271,6 +277,7 @@ def test_fused_moe_lora_kernel(
     K,
     max_lora_rank,
     block_size,
+    num_slices,
     dtype,
     device,
     seed,
@@ -295,17 +302,19 @@ def test_fused_moe_lora_kernel(
             ),
             dtype=dtype,
         )
+        for _ in range(num_slices)
     ]
     lora_b_stacked = [
         torch.rand(
             (
                 max_loras,
                 num_experts,
-                N,
+                N // num_slices,
                 max_lora_rank,
             ),
             dtype=dtype,
         )
+        for _ in range(num_slices)
     ]
     hidden_states = torch.rand(
         (
@@ -340,6 +349,7 @@ def test_fused_moe_lora_kernel(
         lora_a_stacked,
         lora_b_stacked,
         top_k_num,
+        num_slices,
     )
 
     torch.testing.assert_close(output, output2, atol=1e-2, rtol=1e-2)
@@ -434,6 +444,7 @@ def use_fused_moe_lora_kernel_naive(
 @pytest.mark.parametrize("K", [2048])
 @pytest.mark.parametrize("max_lora_rank", [16, 32])
 @pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_slices", [1, 2])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("seed", SEED)
@@ -446,6 +457,7 @@ def test_fused_moe_lora_kernel_naive_block_assignment(
     K,
     max_lora_rank,
     block_size,
+    num_slices,
     dtype,
     device,
     seed,
@@ -484,17 +496,19 @@ def test_fused_moe_lora_kernel_naive_block_assignment(
             ),
             dtype=dtype,
         )
+        for _ in range(num_slices)
     ]
     lora_b_stacked = [
         torch.rand(
             (
                 max_loras,
                 num_experts,
-                N,
+                N // num_slices,
                 max_lora_rank,
             ),
             dtype=dtype,
         )
+        for _ in range(num_slices)
     ]
     hidden_states = torch.rand(
         (
@@ -529,6 +543,7 @@ def test_fused_moe_lora_kernel_naive_block_assignment(
         lora_a_stacked,
         lora_b_stacked,
         top_k_num,
+        num_slices,
     )
 
     torch.testing.assert_close(output, output_ref, atol=1e-2, rtol=1e-2)
diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py
index e10419d244c3..5e38638b9b6f 100644
--- a/tests/lora/test_olmoe_tp.py
+++ b/tests/lora/test_olmoe_tp.py
@@ -2,7 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 
+import shutil
+
 import pytest
+import torch
+from safetensors.torch import load_file, save_file
 
 import vllm
 from vllm.lora.request import LoRARequest
@@ -122,6 +126,41 @@ def test_olmoe_lora_mixed(olmoe_lora_files):
     generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])
 
 
+def test_olmoe_lora_mixed_random(olmoe_lora_files, tmp_path):
+    # Create a dummy LoRA with random weights based on the real one
+    random_lora_path = tmp_path / "random_lora"
+    shutil.copytree(olmoe_lora_files, random_lora_path)
+
+    weights_path = random_lora_path / "adapter_model.safetensors"
+    weights = load_file(str(weights_path))
+    random_weights = {k: torch.randn_like(v) for k, v in weights.items()}
+    save_file(random_weights, str(weights_path))
+
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+
+    prompts = [
+        PROMPT_TEMPLATE.format(context="How many candidates are there?"),
+        PROMPT_TEMPLATE.format(context="Count the number of candidates."),
+    ]
+
+    lora_requests = [
+        LoRARequest("real", 1, olmoe_lora_files),
+        LoRARequest("random", 2, str(random_lora_path)),
+    ]
+
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
+    outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests)
+    assert outputs[0].outputs[0].text.strip().startswith(EXPECTED_LORA_OUTPUT[0])
+
+
 @pytest.mark.parametrize("fully_sharded_loras", [False, True])
 @multi_gpu_test(num_gpus=2)
 def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras):
diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
index c9c85c194176..8072f8769329 100644
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -8,9 +8,10 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.triton_utils import tl, triton
+from vllm.triton_utils.allocation import set_triton_allocator
 from vllm.utils.torch_utils import direct_register_custom_op
 
-from .utils import supports_pdl
+from .utils import supports_pdl, supports_tma
 
 
 @triton.jit
@@ -70,6 +71,37 @@ def _get_token_offs(
         )
 
 
+@triton.jit
+def _get_c_ptrs(
+    cur_c_ptr,
+    lora_id,
+    pid_m,
+    offs,
+    offs_token,
+    offs_cn,
+    stride_cm,
+    stride_cn,
+    EM: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    sort_c: tl.constexpr,
+):
+    # When sort_c is true, store the output in c_ptr using token order defined
+    # in sorted_token_ids_ptr; otherwise, use the original token order from the prompt
+    if sort_c:
+        offs_token_id = pid_m * BLOCK_SIZE_M + offs
+        c_ptrs = (
+            cur_c_ptr
+            + lora_id * EM * stride_cm
+            + stride_cm * offs_token_id[:, None]
+            + stride_cn * offs_cn[None, :]
+        )
+    else:
+        c_ptrs = (
+            cur_c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+        )
+    return c_ptrs
+
+
 _LORA_PTR_DICT: dict[tuple[int, ...], torch.tensor] = {}
 
 
@@ -125,7 +157,9 @@ def _adjust_kernel_inputs(
 )
 def _fused_moe_lora_kernel(
     a_ptr,
+    a_desc,
     b_ptr,
+    b_desc,
     c_ptr,
     topk_weights_ptr,
     sorted_token_ids_ptr,
@@ -177,6 +211,18 @@ def _fused_moe_lora_kernel(
     USE_GDC: tl.constexpr,
     launch_pdl: tl.constexpr,
     IS_PRIMARY: tl.constexpr,
+    USE_TMA: tl.constexpr,
+    # sort_c determines whether tokens are stored in C in the order determined
+    # by sorted_token_ids to enable later TMA loads from this tensor.
+    #
+    # When USE_TMA is enabled, the parameter combinations are:
+    #   a_desc  | b_desc  | sort_c | Use Case
+    #   --------|---------|--------|-----------------------------
+    #   yes     | yes     | False  | expand kernel (num_slices=1)
+    #   no      | yes     | True   | shrink kernel (num_slices=1)
+    #   yes     | no      | False  | expand kernel (num_slices>1)
+    #   no      | no      | True   | shrink kernel (num_slices>1)
+    sort_c: tl.constexpr,
 ):
     pid = tl.program_id(axis=0)
     slice_id = tl.program_id(axis=1)
@@ -250,58 +296,90 @@ def _fused_moe_lora_kernel(
     cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
     cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size
 
-    # remove modulo wrap-around
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)
     offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
     token_mask = offs_token < num_valid_tokens
 
-    # get a_ptrs,b_ptrs
-    a_ptrs = cur_a_ptr + (
-        offs_token[:, None] // token_mapping_factor * stride_am
-        + offs_k[None, :] * stride_ak
-    )
+    if USE_TMA and a_desc is not None:
+        # Expand path - with TMA enabled, load from A using TMA descriptor
+        offs_am = (
+            slice_id * max_loras * EM
+            + lora_id * EM
+            + pid_m * BLOCK_SIZE_M // token_mapping_factor
+        )
+        offs_ak = pid_sk * BLOCK_SIZE_K
+    else:
+        # Shrink path - load hidden states based on order defined in
+        # 'sorted_token_ids_ptr' then store them in c_ptr in this same sorted order
+        tl.static_assert(a_desc is None, "a_desc must be none")
+        a_ptrs = cur_a_ptr + (
+            offs_token[:, None] // token_mapping_factor * stride_am
+            + offs_k[None, :] * stride_ak
+        )
 
-    b_ptrs = (
-        cur_b_ptr
-        + lora_id * stride_bl
-        + expert_id * stride_be
-        + offs_k[:, None] * stride_bk
-        + offs_bn[None, :] * stride_bn
-    )
+    if USE_TMA:
+        offs_bn = pid_n * BLOCK_SIZE_N
+        offs_bk = pid_sk * BLOCK_SIZE_K
+        if b_desc is None:
+            # Note(@gnovack) - Allocation of TMA descriptors on-device
+            # can cause conflicts when running in parallel via PDL
+            if USE_GDC and not IS_PRIMARY:
+                tl.extra.cuda.gdc_wait()
+
+            b_desc = tl.make_tensor_descriptor(
+                cur_b_ptr,
+                shape=[max_loras, num_experts, N, K],
+                strides=[stride_bl, stride_be, stride_bn, stride_bk],
+                block_shape=[1, 1, BLOCK_SIZE_N, BLOCK_SIZE_K],
+            )
+    else:
+        offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)
+        b_ptrs = (
+            cur_b_ptr
+            + lora_id * stride_bl
+            + expert_id * stride_be
+            + offs_k[:, None] * stride_bk
+            + offs_bn[None, :] * stride_bn
+        )
 
     if USE_GDC and IS_PRIMARY:
         # GDC launch dependents hints the runtime system to launch dependent kernels.
         tl.extra.cuda.gdc_launch_dependents()
 
-    # accumulator
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
 
     if USE_GDC and not IS_PRIMARY:
         tl.extra.cuda.gdc_wait()
 
     for k in range(0, grid_k):
-        k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K)
-        # GDC wait waits for ALL programs in the prior kernel to complete
-        # before continuing.
+        cur_k_offset = k * (BLOCK_SIZE_K * SPLIT_K)
+        k_remaining = K - cur_k_offset
         # pre-fetch lora weight
-        # add (offs_bn < N) mask; optional .ca for B
-        b_mask = (offs_k[:, None] < k_remaining) & (offs_bn[None, :] < N)
-        if USE_B_L2_CACHE:
-            b = tl.load(b_ptrs, mask=b_mask, other=0.0, cache_modifier=".ca")
+        if b_desc is not None:
+            b = (
+                b_desc.load([lora_id, expert_id, offs_bn, offs_bk + cur_k_offset])
+                .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K)
+                .T
+            )
         else:
-            b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-
-        if USE_GDC and not IS_PRIMARY:
-            tl.extra.cuda.gdc_wait()
-        a = tl.load(
-            a_ptrs,
-            mask=token_mask[:, None] & (offs_k[None, :] < k_remaining),
-            other=0.0,
-        )
+            # add (offs_bn < N) mask; optional .ca for B
+            b_mask = (offs_k[:, None] < k_remaining) & (offs_bn[None, :] < N)
+            if USE_B_L2_CACHE:
+                b = tl.load(b_ptrs, mask=b_mask, other=0.0, cache_modifier=".ca")
+            else:
+                b = tl.load(b_ptrs, mask=b_mask, other=0.0)
+            b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk
+
+        if a_desc is not None:
+            a = a_desc.load([offs_am, offs_ak + cur_k_offset])
+        else:
+            a = tl.load(
+                a_ptrs,
+                mask=token_mask[:, None] & (offs_k[None, :] < k_remaining),
+                other=0.0,
+            )
+            a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
+
         accumulator += tl.dot(a, b)
-        # Advance the ptrs to the next K block.
-        a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
-        b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk
 
     if MUL_ROUTED_WEIGHT:
         moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0.0)
@@ -309,7 +387,19 @@ def _fused_moe_lora_kernel(
     accumulator = accumulator.to(c_ptr.dtype.element_ty)
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = cur_c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+    c_ptrs = _get_c_ptrs(
+        cur_c_ptr,
+        lora_id,
+        pid_m,
+        offs,
+        offs_token,
+        offs_cn,
+        stride_cm,
+        stride_cn,
+        EM,
+        BLOCK_SIZE_M,
+        sort_c,
+    )
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
 
     if SPLIT_K == 1:
@@ -357,6 +447,7 @@ def _fused_moe_lora_shrink(
     num_active_loras: int,
     mul_routed_weight: bool = False,
     use_gdc: bool = False,
+    use_tma: bool = False,
 ) -> None:
     w1_lora_a_stacked = lora_a_stacked[0]
     shrink_config = {
@@ -369,6 +460,7 @@ def _fused_moe_lora_shrink(
         "SPLIT_K": split_k,
         "USE_GDC": use_gdc,
         "launch_pdl": use_gdc,  # triton kernel metadata
+        "USE_TMA": use_tma,
     }
 
     b_ptr = _get_ptr(lora_a_stacked, device)
@@ -383,9 +475,20 @@ def _fused_moe_lora_shrink(
         len(lora_a_stacked),
         grid_lora_dim,
     )
+
+    a_desc = None
+    b_desc = None
+    if use_tma and num_slices == 1:
+        b_desc = triton.tools.tensor_descriptor.TensorDescriptor.from_tensor(
+            lora_a_stacked[0],
+            [1, 1, shrink_config["BLOCK_SIZE_N"], shrink_config["BLOCK_SIZE_K"]],
+        )
+
     _fused_moe_lora_kernel[grid](
         qcurr_hidden_states,
+        a_desc,
         b_ptr,
+        b_desc,
         a_intermediate_cache1,
         topk_weights,
         sorted_token_ids,
@@ -407,8 +510,8 @@ def _fused_moe_lora_shrink(
         w1_lora_a_stacked.stride(1),
         w1_lora_a_stacked.stride(3),
         w1_lora_a_stacked.stride(2),
-        a_intermediate_cache1.stride(2),
-        a_intermediate_cache1.stride(3),
+        a_intermediate_cache1.stride(-2),
+        a_intermediate_cache1.stride(-1),
         stride_tl,
         stride_el,
         slice_a_size=qcurr_hidden_states.numel(),
@@ -419,7 +522,8 @@ def _fused_moe_lora_shrink(
         naive_block_assignment=sorted_token_ids is None,
         MUL_ROUTED_WEIGHT=False,
         ADD_INPUTS=False,
-        USE_B_L2_CACHE=True,  # new
+        USE_B_L2_CACHE=True,
+        sort_c=use_tma and sorted_token_ids is not None,
         IS_PRIMARY=True,
         **shrink_config,
     )
@@ -462,6 +566,7 @@ def _fused_moe_lora_expand(
     mul_routed_weight: bool = False,
     offset: int = 0,
     use_gdc: bool = False,
+    use_tma: bool = False,
 ) -> None:
     b_ptr = _get_ptr(lora_b_stacked, device)
     K = max_lora_rank
@@ -470,7 +575,7 @@ def _fused_moe_lora_expand(
     w1_lora_b_stacked = lora_b_stacked[0]
 
     a_intermediate_cache1 = a_intermediate_cache1.view(
-        -1, a_intermediate_cache1.shape[3]
+        -1, a_intermediate_cache1.shape[-1]
     )
 
     expand_config = {
@@ -483,6 +588,7 @@ def _fused_moe_lora_expand(
         "SPLIT_K": 1,  # Set split_k = 1 for expand calls
         "USE_GDC": use_gdc,
         "launch_pdl": use_gdc,  # triton kernel metadata
+        "USE_TMA": use_tma,
     }
 
     grid_lora_dim, stride_tl, stride_el = _adjust_kernel_inputs(
@@ -498,10 +604,27 @@ def _fused_moe_lora_expand(
     # Fast path: directly accumulate into the corresponding slice interval of output.
     out_view = output[:, :, offset : offset + num_slices * N]
     slice_c_size = N * out_view.stride(2)
+    a_desc = None
+    b_desc = None
+    if use_tma:
+        if sorted_token_ids is not None:
+            a_desc = triton.tools.tensor_descriptor.TensorDescriptor.from_tensor(
+                a_intermediate_cache1,
+                [expand_config["BLOCK_SIZE_M"], expand_config["BLOCK_SIZE_K"]],
+            )
+        if num_slices == 1:
+            b_desc = triton.tools.tensor_descriptor.TensorDescriptor.from_tensor(
+                lora_b_stacked[0],
+                [1, 1, expand_config["BLOCK_SIZE_N"], expand_config["BLOCK_SIZE_K"]],
+            )
+    else:
+        b_desc = None
 
     _fused_moe_lora_kernel[grid](
         a_intermediate_cache1,
+        a_desc,
         b_ptr,
+        b_desc,
         out_view,
         topk_weights,
         sorted_token_ids,
@@ -535,7 +658,8 @@ def _fused_moe_lora_expand(
         naive_block_assignment=sorted_token_ids is None,
         MUL_ROUTED_WEIGHT=mul_routed_weight,
         ADD_INPUTS=True,
-        USE_B_L2_CACHE=True,  # new
+        USE_B_L2_CACHE=True,
+        sort_c=False,
         IS_PRIMARY=False,
         **expand_config,
     )
@@ -616,8 +740,34 @@ def _fused_moe_lora(
         else num_tokens * shrink_block_size_m
     )
 
+    # TMA is not currently compatiple with fully_sharded due to the non-determinism
+    # of token id sorting across ranks.
+    use_tma = supports_tma(device) and not fully_sharded
+
+    intermediate_cache_shape = (
+        num_slices,
+        M,
+        top_k_num,
+        max_lora_rank,
+    )
+    if use_tma:
+        if num_slices > 1:
+            # if num_slices > 1, we construct TMA descriptors for LoRA
+            # weights within the kernel, which requires us to first set an allocator
+            set_triton_allocator(device)
+
+        # When storing intermediate data in sorted order for TMA, we
+        # need an extra 'num_active_loras' dim in the cache to avoid conflicts
+        if sorted_token_ids is not None:
+            intermediate_cache_shape = (
+                num_slices,
+                sorted_token_ids.shape[0],
+                EM,
+                max_lora_rank,
+            )
+
     a_intermediate_cache1 = torch.zeros(
-        (num_slices, M, top_k_num, max_lora_rank),
+        intermediate_cache_shape,
         dtype=output.dtype,
         device=device,
     )
@@ -654,6 +804,7 @@ def _fused_moe_lora(
         num_active_loras,
         mul_routed_weight,
         use_gdc=use_gdc,
+        use_tma=use_tma,
     )
 
     if fully_sharded:
@@ -703,6 +854,7 @@ def _fused_moe_lora(
         mul_routed_weight,
         offset,
         use_gdc=use_gdc,
+        use_tma=use_tma,
     )
 
 
@@ -772,6 +924,7 @@ def _fused_moe_lora_shrink_fake(
     num_active_loras: int,
     mul_routed_weight: bool = False,
     use_gdc: bool = False,
+    use_tma: bool = False,
 ) -> None:
     return
 
@@ -809,6 +962,7 @@ def _fused_moe_lora_expand_fake(
     mul_routed_weight: bool = False,
     offset: int = 0,
     use_gdc: bool = False,
+    use_tma: bool = False,
 ) -> None:
     return
 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index c7ac5914bb80..a863b9726054 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -316,3 +316,9 @@ def supports_pdl(device: torch.device | None = None) -> bool:
         and current_platform.has_device_capability(90)
         and not envs.VLLM_LORA_DISABLE_PDL
     )
+
+
+@lru_cache
+def supports_tma(device: torch.device | None = None) -> bool:
+    # TMA requires compute capability SM90 or above
+    return current_platform.is_cuda() and current_platform.has_device_capability(90)
diff --git a/vllm/triton_utils/allocation.py b/vllm/triton_utils/allocation.py
new file mode 100644
index 000000000000..e805f80b8941
--- /dev/null
+++ b/vllm/triton_utils/allocation.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.triton_utils import triton
+
+
+def set_triton_allocator(device: torch.device):
+    def alloc_fn(size: int, alignment: int, stream: int | None):
+        return torch.empty(size, device=device, dtype=torch.int8)
+
+    triton.set_allocator(alloc_fn)

From afd089f231d714e7fd06b51e3bc7df7fe004c7f9 Mon Sep 17 00:00:00 2001
From: lailoo <ll1042668699@gmail.com>
Date: Sun, 1 Mar 2026 11:27:37 +0800
Subject: [PATCH 135/154] [Bugfix][Model] Fix Qwen3.5/Qwen3Next ignoring
 --dtype flag on older GPUs (#35617)

---
 vllm/model_executor/models/qwen3_5.py    | 2 --
 vllm/model_executor/models/qwen3_next.py | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py
index 731bf3947a7c..66d8ff8e1b0a 100644
--- a/vllm/model_executor/models/qwen3_5.py
+++ b/vllm/model_executor/models/qwen3_5.py
@@ -274,7 +274,6 @@ def __init__(
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.dtype,
                 ),
             )
             self.ffn_layer_scale = torch.nn.Parameter(
@@ -282,7 +281,6 @@ def __init__(
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.dtype,
                 ),
             )
 
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index c57265cc7944..7f1386d7be57 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -463,7 +463,6 @@ def __init__(
             group_size=None,
             norm_before_gate=True,
             device=current_platform.current_device(),
-            dtype=config.dtype,
         )
 
         self.out_proj = RowParallelLinear(
@@ -1018,7 +1017,6 @@ def __init__(
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.dtype,
                 ),
             )
             self.ffn_layer_scale = torch.nn.Parameter(
@@ -1026,7 +1024,6 @@ def __init__(
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.dtype,
                 ),
             )
 

From a9ec392c86446996087e6919eaf59023c984b8fe Mon Sep 17 00:00:00 2001
From: lin-shh <82112156+lin-shh@users.noreply.github.com>
Date: Sun, 1 Mar 2026 02:34:37 -0500
Subject: [PATCH 136/154] Fix typo: implictly -> implicitly in isaac.py
 docstring (#35646)

---
 vllm/model_executor/models/isaac.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index f4f7ce45908c..6d8b45a7a989 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -551,7 +551,7 @@ def process_vision_for_patches(
             `(num_images, height, width, channels)` for a batch. Channels are
             expected to be RGB.
         patch_size (`int`):
-            Edge length of square patches; implictly controls resize grid granularity.
+            Edge length of square patches; implicitly controls resize grid granularity.
         max_num_patches (`int`):
             Maximum number of patches allowed after resizing.
         min_num_patches (`int`, *optional*):

From 87d319c52f22d3d08ef8ee49163aad9aad08f472 Mon Sep 17 00:00:00 2001
From: Ryan Rock <ryan.rock@amd.com>
Date: Sun, 1 Mar 2026 01:58:07 -0600
Subject: [PATCH 137/154] [AMD][CI] Support Triton attention with
 ExampleConnector (#34931)

Signed-off-by: Ryan Rock <ryan.rock@amd.com>
---
 .../unit/test_example_connector.py            | 18 +++++++++++-------
 .../kv_connector/unit/test_multi_connector.py |  8 --------
 .../kv_connector/v1/example_connector.py      | 19 +++++++++++++++++--
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py
index d415608c95fa..e42f691eacd4 100644
--- a/tests/v1/kv_connector/unit/test_example_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@@ -8,7 +8,7 @@
 
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
-from vllm.config import KVTransferConfig
+from vllm.config import AttentionConfig, KVTransferConfig
 from vllm.multimodal.utils import encode_image_url
 from vllm.platforms import current_platform
 
@@ -110,14 +110,17 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
         print("-" * 50)
 
 
-@pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason=(
-        "hipErrorLaunchFailure when running this test, see issue:"
-        "https://github.com/ROCm/pytorch/issues/2822"
+@pytest.mark.parametrize(
+    "attn_backend",
+    (
+        ["FLASH_ATTN", "TRITON_ATTN"]
+        if current_platform.is_cuda()
+        else ["TRITON_ATTN"]
+        if current_platform.is_rocm()
+        else []
     ),
 )
-def test_shared_storage_connector_hashes(tmp_path):
+def test_shared_storage_connector_hashes(tmp_path, attn_backend):
     """
     Tests that ExampleConnector saves KV to the storage locations
     with proper hashes; that are unique for inputs with identical text but
@@ -138,6 +141,7 @@ def test_shared_storage_connector_hashes(tmp_path):
         max_model_len=8192,
         max_num_seqs=1,
         gpu_memory_utilization=0.4,
+        attention_config=AttentionConfig(backend=attn_backend),
         enforce_eager=True,
         kv_transfer_config=kv_transfer_config,
         limit_mm_per_prompt={"image": 2},
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index b91c9c771ef5..0541dcaa50bc 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -20,7 +20,6 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
     NixlKVConnectorStats,
 )
-from vllm.platforms import current_platform
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
@@ -97,13 +96,6 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
     return True
 
 
-@pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason=(
-        "hipErrorLaunchFailure when running this test, see issue:"
-        "https://github.com/ROCm/pytorch/issues/2822"
-    ),
-)
 def test_multi_example_connector_consistency():
     """
     Tests that MultiConnector with two ExampleConnectors saves
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
index d4a99cf0929a..14feafced5a5 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
@@ -17,6 +17,7 @@
 from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadata
 from vllm.utils.hashing import safe_hash
 from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
@@ -118,12 +119,12 @@ def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> Non
             The number of elements in kv_caches and layer_names should be
             the same.
         """
-        attn_metadata = forward_context.attn_metadata
 
         def inject_kv_into_layer(
             dst_kv_cache_layer: torch.Tensor,
             src_kv_cache: torch.Tensor,
             slot_mapping: torch.Tensor,
+            attn_metadata: AttentionMetadata,
         ) -> None:
             """Inject the KV cache into the layer.
 
@@ -145,6 +146,10 @@ def inject_kv_into_layer(
                     num_pages * page_size, -1
                 )
                 dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache
+            elif isinstance(attn_metadata, TritonAttentionMetadata):
+                block_idxs = slot_mapping // self._block_size
+                offsets = slot_mapping % self._block_size
+                dst_kv_cache_layer[block_idxs, :, offsets] = src_kv_cache
             else:
                 num_pages = dst_kv_cache_layer_shape[1]
                 page_size = dst_kv_cache_layer_shape[2]
@@ -186,7 +191,13 @@ def inject_kv_into_layer(
                     layer_name, request.token_ids, request.mm_hashes
                 )
                 kv_cache = safetensors.torch.load_file(filename)["kv_cache"].cuda()
-                inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping)
+                if isinstance(attn_metadata, dict):
+                    inject_kv_into_layer(
+                        kv_cache_layer,
+                        kv_cache,
+                        request.slot_mapping,
+                        attn_metadata[layer_name],
+                    )
 
     def wait_for_layer_load(self, layer_name: str) -> None:
         """Blocking until the KV for a specific layer is loaded into vLLM's
@@ -229,6 +240,10 @@ def extract_kv_from_layer(
             if isinstance(attn_metadata, MLACommonMetadata):
                 num_pages, page_size = layer.shape[0], layer.shape[1]
                 return layer.reshape(num_pages * page_size, -1)[slot_mapping, ...]
+            elif isinstance(attn_metadata, TritonAttentionMetadata):
+                block_idxs = slot_mapping // self._block_size
+                offsets = slot_mapping % self._block_size
+                return layer[block_idxs, :, offsets]
             num_pages, page_size = layer.shape[1], layer.shape[2]
             return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...]
 

From da543d1abe2468a1b79f230e91e8bbdc2bf6ee71 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sun, 1 Mar 2026 00:15:39 -0800
Subject: [PATCH 138/154] [Model Runner V2] Minor refactoring for EncoderRunner
 (#35628)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/mm/encoder_runner.py    | 15 ++++-----------
 vllm/v1/worker/gpu/model_states/default.py | 11 +++++++++--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py
index c0676d05d268..e62c2ef63d77 100644
--- a/vllm/v1/worker/gpu/mm/encoder_runner.py
+++ b/vllm/v1/worker/gpu/mm/encoder_runner.py
@@ -13,12 +13,14 @@
 class EncoderRunner:
     def __init__(
         self,
+        model: SupportsMultiModal,
         max_num_tokens: int,
         hidden_size: int,
         encoder_cache: EncoderCache,
         dtype: torch.dtype,
         device: torch.device,
     ):
+        self.model = model
         self.max_num_tokens = max_num_tokens
         self.hidden_size = hidden_size
         self.encoder_cache = encoder_cache
@@ -48,25 +50,17 @@ def prepare_mm_inputs(
     @torch.inference_mode()
     def execute_mm_encoder(
         self,
-        model: SupportsMultiModal,
-        mm_hashes: list[str],
         mm_kwargs: list[tuple[str, MultiModalKwargsItem]],
     ) -> list[torch.Tensor]:
-        if not mm_hashes:
-            return []
-
         encoder_outputs: list[torch.Tensor] = []
         for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
             mm_kwargs, device=self.device, pin_memory=False
         ):
-            curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
+            curr_group_outputs = self.model.embed_multimodal(**mm_kwargs_group)
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs, expected_num_items=num_items
             )
             encoder_outputs.extend(curr_group_outputs)
-
-        # Cache the encoder outputs by mm_hash
-        self.encoder_cache.encoder_outputs.update(zip(mm_hashes, encoder_outputs))
         return encoder_outputs
 
     def gather_mm_embeddings(
@@ -146,12 +140,11 @@ def gather_mm_embeddings(
     @torch.inference_mode()
     def get_inputs_embeds(
         self,
-        model: SupportsMultiModal,
         input_ids: torch.Tensor,
         mm_embeds: list[torch.Tensor],
         is_mm_embed: torch.Tensor,
     ) -> torch.Tensor:
-        x = model.embed_input_ids(
+        x = self.model.embed_input_ids(
             input_ids, multimodal_embeddings=mm_embeds, is_multimodal=is_mm_embed
         )
         # Copy to the pre-allocated buffer for CUDA graphs.
diff --git a/vllm/v1/worker/gpu/model_states/default.py b/vllm/v1/worker/gpu/model_states/default.py
index d52f7d0ec960..e27916b40663 100644
--- a/vllm/v1/worker/gpu/model_states/default.py
+++ b/vllm/v1/worker/gpu/model_states/default.py
@@ -41,7 +41,9 @@ def __init__(
 
         if self.supports_mm_inputs:
             assert encoder_cache is not None
+            self.encoder_cache = encoder_cache
             self.encoder_runner = EncoderRunner(
+                model=self.model,
                 max_num_tokens=self.max_num_tokens,
                 hidden_size=self.inputs_embeds_size,
                 encoder_cache=encoder_cache,
@@ -82,7 +84,12 @@ def get_mm_embeddings(
         mm_hashes, mm_kwargs = self.encoder_runner.prepare_mm_inputs(
             scheduled_encoder_inputs
         )
-        self.encoder_runner.execute_mm_encoder(self.model, mm_hashes, mm_kwargs)
+        if mm_kwargs:
+            # Execute the multimodal encoder.
+            encoder_outputs = self.encoder_runner.execute_mm_encoder(mm_kwargs)
+            # Cache the encoder outputs by mm_hash
+            self.encoder_cache.encoder_outputs.update(zip(mm_hashes, encoder_outputs))
+
         mm_embeds, is_mm_embed = self.encoder_runner.gather_mm_embeddings(
             input_batch.req_ids,
             input_batch.num_tokens,
@@ -92,7 +99,7 @@ def get_mm_embeddings(
             req_states.num_computed_prefill_tokens[input_batch.idx_mapping_np],
         )
         inputs_embeds = self.encoder_runner.get_inputs_embeds(
-            self.model, input_batch.input_ids, mm_embeds, is_mm_embed
+            input_batch.input_ids, mm_embeds, is_mm_embed
         )
         return inputs_embeds[: input_batch.num_tokens_after_padding]
 

From bbf81f9a9284d572b69db2c4fb002c2a8a80d507 Mon Sep 17 00:00:00 2001
From: Asaf Gardin <39553475+Josephasafg@users.noreply.github.com>
Date: Sun, 1 Mar 2026 14:40:23 +0200
Subject: [PATCH 139/154] [Mamba1] - Kernel Level Chunk Alignment for Prefix
 Caching (#34798)

Signed-off-by: Josephasafg <ajgard7@gmail.com>
---
 csrc/mamba/mamba_ssm/selective_scan.h         |   4 +-
 csrc/mamba/mamba_ssm/selective_scan_fwd.cu    | 103 ++++++++++-----
 csrc/ops.h                                    |   4 +-
 csrc/torch_bindings.cpp                       |   4 +-
 tests/kernels/mamba/test_mamba_ssm.py         |   4 +
 vllm/_custom_ops.py                           |   4 +
 .../layers/mamba/mamba_mixer.py               |   4 +
 .../layers/mamba/ops/mamba_ssm.py             |   4 +
 vllm/v1/attention/backends/mamba1_attn.py     |  33 ++++-
 vllm/v1/attention/backends/mamba2_attn.py     | 112 +---------------
 vllm/v1/attention/backends/mamba_attn.py      | 121 ++++++++++++++++++
 11 files changed, 251 insertions(+), 146 deletions(-)

diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h
index e93455a57cd8..8f33c7cfa163 100644
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -17,7 +17,7 @@
 struct SSMParamsBase {
     using index_t = size_t;
 
-    int batch, dim, seqlen, dstate, n_groups, n_chunks;
+    int batch, dim, seqlen, dstate, n_groups;
     int dim_ngroups_ratio;
     bool is_variable_B;
     bool is_variable_C;
@@ -72,6 +72,8 @@ struct SSMParamsBase {
     void *__restrict__ block_idx_first_scheduled_token_ptr;  // (batch,) - first block to write
     void *__restrict__ block_idx_last_scheduled_token_ptr;   // (batch,) - last block to write
     void *__restrict__ initial_state_idx_ptr;  // (batch,) - index of the initial state to use
+    void *__restrict__ cu_chunk_seqlen_ptr;      // (nchunks+1,) - cumulative chunk token offsets
+    void *__restrict__ last_chunk_indices_ptr;   // (batch,) - index of last chunk per sequence
 };
 
 
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index fb2a2e578999..d852a0ed4928 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -81,7 +81,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     constexpr bool kIsVariableC = Ktraits::kIsVariableC;
     constexpr bool kHasZ = Ktraits::kHasZ;
     constexpr bool kVarlen = Ktraits::kVarlen;
-    constexpr int kNThreads = Ktraits::kNThreads;
     constexpr int kNItems = Ktraits::kNItems;
     constexpr int kNRows = Ktraits::kNRows;
     constexpr bool kDirectIO = Ktraits::kDirectIO;
@@ -161,17 +160,8 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
         }
     }
 
-
-    // for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) {
-    //     smem_a[state_idx] = A[state_idx * params.A_dstate_stride];
-    //     smem_bc[state_idx] = B[state_idx * params.B_dstate_stride] * C[state_idx * params.C_dstate_stride];
-    // }
-
-    constexpr int kChunkSize = kNThreads * kNItems;
-
     // Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility
-    const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048;
-    const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size;
+    const int block_size = params.cache_enabled ? params.block_size : 2048;
 
     const int* batch_cache_indices = cache_indices != nullptr ?
                                      cache_indices + batch_id * params.cache_indices_stride : nullptr;
@@ -181,10 +171,44 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                                           reinterpret_cast<const int*>(params.block_idx_last_scheduled_token_ptr) : nullptr;
     const int* initial_state_idx = params.initial_state_idx_ptr != nullptr ?
                                    reinterpret_cast<const int*>(params.initial_state_idx_ptr) : nullptr;
+    const int* cu_chunk_seqlen = params.cu_chunk_seqlen_ptr != nullptr ?
+                                 reinterpret_cast<const int*>(params.cu_chunk_seqlen_ptr) : nullptr;
+    const int* last_chunk_indices = params.last_chunk_indices_ptr != nullptr ?
+                                    reinterpret_cast<const int*>(params.last_chunk_indices_ptr) : nullptr;
 
     const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? batch_cache_indices[initial_state_idx[batch_id]] : cache_index;
 
+    const int block_idx_first = (params.cache_enabled && block_idx_first_scheduled != nullptr) ?
+                                 block_idx_first_scheduled[batch_id] : 0;
+
+    // Determine chunk boundaries from pre-computed metadata (APC mode)
+    // or fall back to simple block_size chunking.
+    int first_chunk_idx, n_chunks;
+    int current_position;
+
+    if (cu_chunk_seqlen != nullptr && last_chunk_indices != nullptr) {
+        const int last_chunk_idx = last_chunk_indices[batch_id];
+        first_chunk_idx = (batch_id == 0) ? 0 : last_chunk_indices[batch_id - 1] + 1;
+        n_chunks = last_chunk_idx - first_chunk_idx + 1;
+        // Derive current_position: if the first chunk is partial (fills remainder
+        // of a started block), offset into the block accordingly.
+        const int first_chunk_tokens = cu_chunk_seqlen[first_chunk_idx + 1] - cu_chunk_seqlen[first_chunk_idx];
+        const int chunk_start_offset = (n_chunks > 1 && first_chunk_tokens < block_size)
+                                        ? (block_size - first_chunk_tokens) : 0;
+        current_position = block_idx_first * block_size + chunk_start_offset;
+    } else {
+        first_chunk_idx = 0;
+        n_chunks = (seqlen + block_size - 1) / block_size;
+        current_position = 0;
+    }
+
+    int tokens_processed = 0;
+
     for (int chunk = 0; chunk < n_chunks; ++chunk) {
+        const int chunk_tokens = (cu_chunk_seqlen != nullptr)
+            ? cu_chunk_seqlen[first_chunk_idx + chunk + 1] - cu_chunk_seqlen[first_chunk_idx + chunk]
+            : min(block_size, seqlen - tokens_processed);
+        if (chunk_tokens <= 0) break;
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
 
         __syncthreads();
@@ -193,12 +217,12 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, chunk_tokens);
             if constexpr (!kDirectIO) { __syncthreads(); }
-            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, chunk_tokens);
         }
-        u += kChunkSize;
-        delta += kChunkSize;
+        u += chunk_tokens;
+        delta += chunk_tokens;
     
         float delta_vals[kNRows][kNItems], delta_u_vals[kNRows][kNItems], out_vals[kNRows][kNItems];
         #pragma unroll
@@ -232,7 +256,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             weight_t B_vals[kNItems], C_vals[kNItems];
             if constexpr (kIsVariableB) {
                 load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
-                    smem_load_weight, (seqlen - chunk * kChunkSize) * (1));
+                    smem_load_weight, chunk_tokens);
                 if constexpr (!kIsVariableC) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -243,7 +267,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (kIsVariableC) {
                 auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                 load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
+                    smem_load_weight_C, chunk_tokens);
                 if constexpr (!kIsVariableB) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -266,10 +290,8 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 for (int i = 0; i < kNItems; ++i) {
                     thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                  !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-                    if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
-                        if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
-                            thread_data[i] = make_float2(1.f, 0.f);
-                        }
+                    if (threadIdx.x * kNItems + i >= chunk_tokens) {
+                        thread_data[i] = make_float2(1.f, 0.f);
                     }
                 }
                 // Initialize running total
@@ -301,14 +323,14 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 if (threadIdx.x == 0) {
                     smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix;
 
-                    // Store state at the end of each chunk when cache is enabled
+                    // Store state at the end of each aligned chunk when cache is enabled
                     if (params.cache_enabled && batch_cache_indices != nullptr) {
-
                         size_t cache_slot;
                         if (chunk == n_chunks - 1) {
                             cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]];
                         } else {
-                            cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk];
+                            const int block_idx_completed = (current_position + chunk_tokens - 1) / block_size;
+                            cache_slot = batch_cache_indices[block_idx_completed];
                         }
 
                         size_t state_offset = cache_slot * params.ssm_states_batch_stride +
@@ -331,38 +353,41 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             }
         }
         input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
-            + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
+            + dim_id * kNRows * params.out_d_stride + tokens_processed;
         __syncthreads();
         #pragma unroll
         for (int r = 0; r < kNRows; ++r) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
+            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, chunk_tokens);
         }
 
         if constexpr (kHasZ) {
             input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + sequence_start_index * params.z_batch_stride
-                + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
+                + dim_id * kNRows * params.z_d_stride + tokens_processed;
             input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride
-                + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
+                + dim_id * kNRows * params.out_z_d_stride + tokens_processed;
             #pragma unroll
             for (int r = 0; r < kNRows; ++r) {
                 input_t z_vals[kNItems];
                 __syncthreads();
-                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize);
+                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, chunk_tokens);
                 #pragma unroll
                 for (int i = 0; i < kNItems; ++i) {
                     float z_val = z_vals[i];
                     out_vals[r][i] *= z_val / (1 + expf(-z_val));
                 }
                 __syncthreads();
-                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
+                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, chunk_tokens);
             }
         }
 
-        Bvar += kChunkSize * 1;
-        Cvar += kChunkSize * 1;
+        Bvar += chunk_tokens;
+        Cvar += chunk_tokens;
+
+        tokens_processed += chunk_tokens;
+        current_position += chunk_tokens;
     }
 }
 
@@ -506,7 +531,9 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         int64_t block_size,
                         const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
                         const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
-                        const std::optional<torch::Tensor> &initial_state_idx) {
+                        const std::optional<torch::Tensor> &initial_state_idx,
+                        const std::optional<torch::Tensor> &cu_chunk_seqlen,
+                        const std::optional<torch::Tensor> &last_chunk_indices) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -548,6 +575,8 @@ void set_ssm_params_fwd(SSMParamsBase &params,
     params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr;
     params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr;
     params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr;
+    params.cu_chunk_seqlen_ptr = cu_chunk_seqlen.has_value() ? cu_chunk_seqlen.value().data_ptr() : nullptr;
+    params.last_chunk_indices_ptr = last_chunk_indices.has_value() ? last_chunk_indices.value().data_ptr() : nullptr;
 
     // All stride are in elements, not bytes.
     params.A_d_stride = A.stride(0);
@@ -633,7 +662,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                   int64_t block_size,
                   const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
                   const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
-                  const std::optional<torch::Tensor> &initial_state_idx) {
+                  const std::optional<torch::Tensor> &initial_state_idx,
+                  const std::optional<torch::Tensor> &cu_chunk_seqlen,
+                  const std::optional<torch::Tensor> &last_chunk_indices) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -778,7 +809,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                        block_size,
                        block_idx_first_scheduled_token,
                        block_idx_last_scheduled_token,
-                       initial_state_idx
+                       initial_state_idx,
+                       cu_chunk_seqlen,
+                       last_chunk_indices
                        );
 
     
diff --git a/csrc/ops.h b/csrc/ops.h
index 690342b37caa..921d6484d2d3 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -371,7 +371,9 @@ void selective_scan_fwd(
     const torch::Tensor& ssm_states, int64_t pad_slot_id, int64_t block_size,
     const std::optional<torch::Tensor>& block_idx_first_scheduled_token,
     const std::optional<torch::Tensor>& block_idx_last_scheduled_token,
-    const std::optional<torch::Tensor>& initial_state_idx);
+    const std::optional<torch::Tensor>& initial_state_idx,
+    const std::optional<torch::Tensor>& cu_chunk_seqlen,
+    const std::optional<torch::Tensor>& last_chunk_indices);
 
 torch::Tensor dynamic_4bit_int_moe_cpu(
     torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 8be30b209106..9ba18289eaa9 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -640,7 +640,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int block_size,"
       "Tensor? block_idx_first_scheduled_token,"
       "Tensor? block_idx_last_scheduled_token,"
-      "Tensor? initial_state_idx) -> ()");
+      "Tensor? initial_state_idx,"
+      "Tensor? cu_chunk_seqlen,"
+      "Tensor? last_chunk_indices) -> ()");
   ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
 
   // Hadamard transforms
diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py
index 905207109474..9a00e1d047b3 100644
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -183,6 +183,8 @@ def selective_scan_opcheck_fn(
     block_idx_first_scheduled_token=None,
     block_idx_last_scheduled_token=None,
     initial_state_idx=None,
+    cu_chunk_seqlen=None,
+    last_chunk_indices=None,
 ):
     """if return_last_state is True, returns (out, last_state)
     last_state has shape (batch, dim, dstate).
@@ -231,6 +233,8 @@ def selective_scan_opcheck_fn(
             block_idx_first_scheduled_token,
             block_idx_last_scheduled_token,
             initial_state_idx,
+            cu_chunk_seqlen,
+            last_chunk_indices,
         ),
         test_utils=["test_schema", "test_faketensor"],
     )
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 46f9dfad9211..9ed8dfa8df52 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2021,6 +2021,8 @@ def selective_scan_fwd(
     block_idx_first_scheduled_token: torch.Tensor | None = None,
     block_idx_last_scheduled_token: torch.Tensor | None = None,
     initial_state_idx: torch.Tensor | None = None,
+    cu_chunk_seqlen: torch.Tensor | None = None,
+    last_chunk_indices: torch.Tensor | None = None,
 ):
     torch.ops._C.selective_scan_fwd(
         u,
@@ -2041,6 +2043,8 @@ def selective_scan_fwd(
         block_idx_first_scheduled_token,
         block_idx_last_scheduled_token,
         initial_state_idx,
+        cu_chunk_seqlen,
+        last_chunk_indices,
     )
 
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 24e189a5c073..6a33fc7d6b1b 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -271,6 +271,8 @@ def forward_impl(self, hidden_states: torch.Tensor, output: torch.Tensor):
             conv_state = self_kv_cache[0].transpose(-1, -2)
             ssm_state = self_kv_cache[1]
             has_initial_states_p = attn_metadata.has_initial_states_p
+            cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p
+            last_chunk_indices_p = attn_metadata.last_chunk_indices_p
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
@@ -376,6 +378,8 @@ def forward_impl(self, hidden_states: torch.Tensor, output: torch.Tensor):
                 block_idx_first_scheduled_token=block_idx_first_scheduled_token_p,
                 block_idx_last_scheduled_token=block_idx_last_scheduled_token_p,
                 initial_state_idx=block_idx_last_computed_token_p,
+                cu_chunk_seqlen=cu_chunk_seqlen_p,
+                last_chunk_indices=last_chunk_indices_p,
             )
             ssm_outputs.append(scan_out_p)
 
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index a0df65f90c05..44e73dd20a1d 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -497,6 +497,8 @@ def selective_scan_fn(
     block_idx_first_scheduled_token=None,
     block_idx_last_scheduled_token=None,
     initial_state_idx=None,
+    cu_chunk_seqlen=None,
+    last_chunk_indices=None,
 ) -> torch.Tensor:
     """
     u: (dim, total_length) for varlen or (batch, dim, seqlen)
@@ -588,6 +590,8 @@ def selective_scan_fn(
         block_idx_first_scheduled_token,
         block_idx_last_scheduled_token,
         initial_state_idx,
+        cu_chunk_seqlen,
+        last_chunk_indices,
     )
 
     if z is None:
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index c7228ecea584..8903406200ca 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
+from typing import Any
 
-from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.backend import AttentionBackend, CommonAttentionMetadata
 from vllm.v1.attention.backends.mamba_attn import (
     BaseMambaAttentionMetadata,
     BaseMambaAttentionMetadataBuilder,
@@ -29,3 +30,31 @@ class Mamba1AttentionMetadataBuilder(
     BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
 ):
     metadata_cls = Mamba1AttentionMetadata
+
+    def build(
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        fast_build: bool = False,
+        **kwargs: Any,
+    ) -> Mamba1AttentionMetadata:
+        common = self._compute_common_metadata(common_attn_metadata)
+
+        if (
+            common.num_prefills > 0
+            and self.vllm_config.cache_config.mamba_cache_mode == "all"
+        ):
+            cu_chunk_seqlen_p, _, last_chunk_indices_p = (
+                self._build_chunk_metadata_tensors(
+                    self.kv_cache_spec.block_size,
+                    common,
+                    common_attn_metadata,
+                )
+            )
+            return replace(
+                common,
+                cu_chunk_seqlen_p=cu_chunk_seqlen_p,
+                last_chunk_indices_p=last_chunk_indices_p,
+            )
+
+        return common
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 94587c3d6ba7..5e8abbab565e 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -7,7 +7,6 @@
 import torch
 
 from vllm.config import VllmConfig
-from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backend import (
     AttentionBackend,
     CommonAttentionMetadata,
@@ -105,14 +104,6 @@ class Mamba2AttentionMetadata(BaseMambaAttentionMetadata):
 
     # Chunk-related metadata (only for prefill)
     seq_idx_p: torch.Tensor | None = None
-    # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for
-    # each chunk, its offsets into the varlen sequence dimension. It is defined
-    # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to
-    # cu_chunk_seqlen_p[i+1].
-    cu_chunk_seqlen_p: torch.Tensor | None = None
-    # last_chunk_indices_p is a tensor of shape (batch,) that contains the
-    # index of the last chunk for every sequence in the (prefill) batch.
-    last_chunk_indices_p: torch.Tensor | None = None
 
 
 class Mamba2AttentionMetadataBuilder(
@@ -134,68 +125,6 @@ def __init__(
         )
         self.chunk_size: int = chunk_size
 
-    def _compute_chunk_metadata(
-        self,
-        num_prefills: int,
-        num_computed_tokens_p_cpu: torch.Tensor,
-        query_start_loc_p_cpu: torch.Tensor,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Compute chunk-specific metadata for Mamba2.
-
-        The code below carefully constructs the chunks such that:
-        1. Chunks contain tokens from a *single* sequence only.
-        2. For every sequence, we are guaranteed that we can
-           retrieve the mamba state *every* chunk_size tokens.
-        Constraint (1) dramatically simplifies the mamba2 kernels.
-        Constraint (2) dramatically simplifies the implementation
-        of prefix caching for mamba2 (wip). We need to take care
-        of the interaction with chunked prefill in order to
-        satisfy constraint (2).
-        """
-        # TODO (tdoublep): This code could probably be optimized.
-        cu_chunk_seqlen = []
-        seq_idx = []
-        last_chunk_indices = []
-        seqlen_pos = 0
-
-        for req_idx in range(num_prefills):
-            this_num_computed = num_computed_tokens_p_cpu[req_idx].item()
-            this_new_tokens = (
-                query_start_loc_p_cpu[req_idx + 1].item()
-                - query_start_loc_p_cpu[req_idx].item()
-            )
-
-            # if computed tokens are not chunk-aligned, use the first
-            # chunk to finish it off
-            if this_num_computed % self.chunk_size != 0:
-                seq_idx.append(req_idx)
-                cu_chunk_seqlen.append(seqlen_pos)
-                # how many tokens to finish the chunk?
-                chunk_len = (
-                    cdiv(this_num_computed, self.chunk_size) * self.chunk_size
-                    - this_num_computed
-                )
-                # we can only use at most this_new_tokens
-                chunk_len = min(chunk_len, this_new_tokens)
-                seqlen_pos += chunk_len
-                this_new_tokens -= chunk_len
-
-            n_chunks = cdiv(this_new_tokens, self.chunk_size)
-            for chunk in range(n_chunks):
-                seq_idx.append(req_idx)
-                cu_chunk_seqlen.append(seqlen_pos)
-                chunk_len = min(self.chunk_size, this_new_tokens)
-                seqlen_pos += chunk_len
-                this_new_tokens -= chunk_len
-
-            assert this_new_tokens == 0
-            last_chunk_indices.append(len(cu_chunk_seqlen) - 1)
-
-        cu_chunk_seqlen.append(seqlen_pos)
-
-        return cu_chunk_seqlen, seq_idx, last_chunk_indices
-
     def build(
         self,
         common_prefix_len: int,
@@ -220,41 +149,12 @@ def build(
                 else False
             )
 
-            num_reqs = common.num_reqs
-            num_prefills = common.num_prefills
-            num_decode_tokens = common.num_decode_tokens
-
-            num_computed_tokens_cpu = (
-                common_attn_metadata.compute_num_computed_tokens().cpu()
-            )
-            num_computed_tokens_p_cpu = num_computed_tokens_cpu[
-                num_reqs - num_prefills : num_reqs
-            ]
-            query_start_loc_p_cpu = (
-                common_attn_metadata.query_start_loc_cpu[-num_prefills - 1 :]
-                - num_decode_tokens
-            )
-
-            cu_chunk_seqlen, seq_idx, last_chunk_indices = self._compute_chunk_metadata(
-                num_prefills,
-                num_computed_tokens_p_cpu,
-                query_start_loc_p_cpu,
-            )
-
-            seq_idx_p = torch.as_tensor(
-                seq_idx,
-                device=common_attn_metadata.query_start_loc.device,
-                dtype=torch.int32,
-            )
-            cu_chunk_seqlen_p = torch.as_tensor(
-                cu_chunk_seqlen,
-                device=common_attn_metadata.query_start_loc.device,
-                dtype=torch.int32,
-            )
-            last_chunk_indices_p = torch.as_tensor(
-                last_chunk_indices,
-                device=common_attn_metadata.query_start_loc.device,
-                dtype=torch.int32,
+            cu_chunk_seqlen_p, seq_idx_p, last_chunk_indices_p = (
+                self._build_chunk_metadata_tensors(
+                    self.chunk_size,
+                    common,
+                    common_attn_metadata,
+                )
             )
 
         return replace(
diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py
index c4ffb16f505e..27c9b85eb754 100644
--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -59,6 +59,15 @@ class BaseMambaAttentionMetadata:
     # The following tensor is only used for prefix caching in align mode
     seq_lens: torch.Tensor
 
+    # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for
+    # each chunk, its offsets into the varlen sequence dimension. It is defined
+    # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to
+    # cu_chunk_seqlen_p[i+1].
+    cu_chunk_seqlen_p: torch.Tensor | None = None
+    # last_chunk_indices_p is a tensor of shape (batch,) that contains the
+    # index of the last chunk for every sequence in the (prefill) batch.
+    last_chunk_indices_p: torch.Tensor | None = None
+
     # The following attributes are for triton implementation of causal_conv1d
     nums_dict: dict | None = None
     batch_ptr: torch.Tensor | None = None
@@ -185,6 +194,118 @@ def build(
             common_attn_metadata, num_accepted_tokens=num_accepted_tokens
         )
 
+    def _compute_chunk_metadata(
+        self,
+        chunk_size: int,
+        num_prefills: int,
+        num_computed_tokens_p_cpu: torch.Tensor,
+        query_start_loc_p_cpu: torch.Tensor,
+    ) -> tuple[list[int], list[int], list[int]]:
+        """
+        Compute chunk-specific metadata for Mamba models.
+
+        The code below carefully constructs the chunks such that:
+        1. Chunks contain tokens from a *single* sequence only.
+        2. For every sequence, we are guaranteed that we can
+           retrieve the mamba state *every* chunk_size tokens.
+        Constraint (1) dramatically simplifies the mamba kernels.
+        Constraint (2) dramatically simplifies the implementation
+        of prefix caching for mamba (wip). We need to take care
+        of the interaction with chunked prefill in order to
+        satisfy constraint (2).
+        """
+        # TODO (tdoublep): This code could probably be optimized.
+        cu_chunk_seqlen = []
+        seq_idx = []
+        last_chunk_indices = []
+        seqlen_pos = 0
+
+        for req_idx in range(num_prefills):
+            this_num_computed = num_computed_tokens_p_cpu[req_idx].item()
+            this_new_tokens = (
+                query_start_loc_p_cpu[req_idx + 1].item()
+                - query_start_loc_p_cpu[req_idx].item()
+            )
+
+            # if computed tokens are not chunk-aligned, use the first
+            # chunk to finish it off
+            if this_num_computed % chunk_size != 0:
+                seq_idx.append(req_idx)
+                cu_chunk_seqlen.append(seqlen_pos)
+                # how many tokens to finish the chunk?
+                chunk_len = (
+                    cdiv(this_num_computed, chunk_size) * chunk_size - this_num_computed
+                )
+                # we can only use at most this_new_tokens
+                chunk_len = min(chunk_len, this_new_tokens)
+                seqlen_pos += chunk_len
+                this_new_tokens -= chunk_len
+
+            n_chunks = cdiv(this_new_tokens, chunk_size)
+            for chunk in range(n_chunks):
+                seq_idx.append(req_idx)
+                cu_chunk_seqlen.append(seqlen_pos)
+                chunk_len = min(chunk_size, this_new_tokens)
+                seqlen_pos += chunk_len
+                this_new_tokens -= chunk_len
+
+            assert this_new_tokens == 0
+            last_chunk_indices.append(len(cu_chunk_seqlen) - 1)
+
+        cu_chunk_seqlen.append(seqlen_pos)
+
+        return cu_chunk_seqlen, seq_idx, last_chunk_indices
+
+    def _build_chunk_metadata_tensors(
+        self,
+        chunk_size: int,
+        common: M,
+        common_attn_metadata: CommonAttentionMetadata,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Compute chunk metadata and return as device tensors.
+        Returns (cu_chunk_seqlen_p, seq_idx_p, last_chunk_indices_p).
+        """
+        num_reqs = common.num_reqs
+        num_prefills = common.num_prefills
+        num_decode_tokens = common.num_decode_tokens
+
+        num_computed_tokens_cpu = (
+            common_attn_metadata.compute_num_computed_tokens().cpu()
+        )
+        num_computed_tokens_p_cpu = num_computed_tokens_cpu[
+            num_reqs - num_prefills : num_reqs
+        ]
+        query_start_loc_p_cpu = (
+            common_attn_metadata.query_start_loc_cpu[-num_prefills - 1 :]
+            - num_decode_tokens
+        )
+
+        cu_chunk_seqlen, seq_idx, last_chunk_indices = self._compute_chunk_metadata(
+            chunk_size,
+            num_prefills,
+            num_computed_tokens_p_cpu,
+            query_start_loc_p_cpu,
+        )
+
+        device = common_attn_metadata.query_start_loc.device
+        cu_chunk_seqlen_p = torch.as_tensor(
+            cu_chunk_seqlen,
+            device=device,
+            dtype=torch.int32,
+        )
+        seq_idx_p = torch.as_tensor(
+            seq_idx,
+            device=device,
+            dtype=torch.int32,
+        )
+        last_chunk_indices_p = torch.as_tensor(
+            last_chunk_indices,
+            device=device,
+            dtype=torch.int32,
+        )
+        return cu_chunk_seqlen_p, seq_idx_p, last_chunk_indices_p
+
     def _compute_prefix_caching_block_indices(
         self,
         common_attn_metadata: CommonAttentionMetadata,

From 59d7af9c6ced8958a2ca9d257c59dc7c22fa32c6 Mon Sep 17 00:00:00 2001
From: Taneem Ibrahim <taneem.ibrahim@gmail.com>
Date: Sun, 1 Mar 2026 08:26:44 -0600
Subject: [PATCH 140/154] [MISC] Fixing a null reference by removing
 parallel_utils from mypy EXCLUDE (#35630)

Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
---
 tools/pre_commit/mypy.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 27312ac59b11..b2f70f184207 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -34,7 +34,6 @@
 
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 EXCLUDE = [
-    "vllm/model_executor/parallel_utils",
     "vllm/model_executor/models",
     "vllm/model_executor/layers/fla/ops",
     # Ignore triton kernels in ops.

From 5a435507d877f4eb16802095037d5c56e767c589 Mon Sep 17 00:00:00 2001
From: Seungho Yoon <yoonsnowdev@gmail.com>
Date: Sun, 1 Mar 2026 23:59:30 +0900
Subject: [PATCH 141/154] fix(mxfp4): return is_monolithic=False when LoRA is
 enabled for Triton backend (#35382)

Signed-off-by: Seungho Yoon <yoonsnowdev@gmail.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 29dd0359607c..0ad1b8931270 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1001,6 +1001,8 @@ def select_gemm_impl(
 
     @property
     def is_monolithic(self) -> bool:
+        if self.moe.is_lora_enabled:
+            return False
         return (
             self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
             or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16

From 72f4d162623854786d29e1d9c6e232cfdf81d3cc Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sun, 1 Mar 2026 10:31:13 -0800
Subject: [PATCH 142/154] [Model Runner V2] Use block table apis for capture
 inputs (#35671)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/block_table.py     | 11 +++++++++++
 vllm/v1/worker/gpu/cudagraph_utils.py |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py
index 9dfdf834d243..b06a35805799 100644
--- a/vllm/v1/worker/gpu/block_table.py
+++ b/vllm/v1/worker/gpu/block_table.py
@@ -119,6 +119,10 @@ def gather_block_tables(
         return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)
 
     def get_dummy_block_tables(self, num_reqs: int) -> tuple[torch.Tensor, ...]:
+        # NOTE(woosuk): The output may be used for CUDA graph capture.
+        # Therefore, this method must return the persistent tensor
+        # with the same memory address as that used during the model's forward pass,
+        # rather than allocating a new tensor.
         return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)
 
     def compute_slot_mappings(
@@ -150,7 +154,14 @@ def compute_slot_mappings(
         return self.slot_mappings[:, :num_tokens]
 
     def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor:
+        # Fill the entire slot_mappings tensor, not just the first `num_tokens` entries.
+        # This is because the padding logic is complex and kernels may access beyond
+        # the requested range.
         self.slot_mappings.fill_(PAD_SLOT_ID)
+        # NOTE(woosuk): The output may be used for CUDA graph capture.
+        # Therefore, this method must return the persistent tensor
+        # with the same memory address as that used during the model's forward pass,
+        # rather than allocating a new tensor.
         return self.slot_mappings[:, :num_tokens]
 
 
diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
index 783715cfe984..c9ae28abf52b 100644
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -420,8 +420,8 @@ def prepare_inputs_to_capture(
     input_buffers.dcp_local_seq_lens[:num_reqs] = num_tokens
     input_buffers.dcp_local_seq_lens[num_reqs:] = 0
 
-    input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables]
-    slot_mappings = block_tables.slot_mappings[:, :num_tokens]
+    input_block_tables = block_tables.get_dummy_block_tables(num_reqs)
+    slot_mappings = block_tables.get_dummy_slot_mappings(num_tokens)
     slot_mappings_by_layer = build_slot_mappings_by_layer(
         slot_mappings, kv_cache_config
     )

From 6290470843c131681e3e1318ae71070a34f33225 Mon Sep 17 00:00:00 2001
From: haosdent <haosdent@gmail.com>
Date: Mon, 2 Mar 2026 04:14:46 +0800
Subject: [PATCH 143/154] [Bugfix] Fix dtype mismatch in
 RMSNormGated.forward_native() during torch.compile (#35256)

Signed-off-by: haosdent <haosdent@gmail.com>
---
 tests/kernels/test_fla_layernorm_guard.py | 64 ++++++++++++++++++++++-
 vllm/model_executor/layers/layernorm.py   | 11 ++--
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py
index 2ece5497cb06..4858ff2d7fe4 100644
--- a/tests/kernels/test_fla_layernorm_guard.py
+++ b/tests/kernels/test_fla_layernorm_guard.py
@@ -74,7 +74,7 @@ def layer_norm_ref(
     return out.to(dtype)
 
 
-DTYPES = [torch.bfloat16, torch.float32]
+DTYPES = [torch.float16, torch.bfloat16, torch.float32]
 # Test various M sizes to ensure rows_per_block logic works correctly
 NUM_TOKENS = [
     1,
@@ -380,6 +380,68 @@ def test_multidimensional_input(
     torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
 
 
+@pytest.mark.parametrize("num_tokens", [1, 128, 1024])
+@pytest.mark.parametrize("hidden_size", [64, 256, 1024])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+@pytest.mark.parametrize("has_gate", [True, False])
+@pytest.mark.parametrize("group_size", [None, 64])
+@pytest.mark.parametrize("norm_before_gate", [True, False])
+@torch.inference_mode()
+def test_rmsnorm_gated_forward_native_dtype(
+    default_vllm_config,
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    has_gate: bool,
+    group_size: int | None,
+    norm_before_gate: bool,
+):
+    """Test that RMSNormGated.forward_native preserves input dtype."""
+    if group_size is not None and hidden_size % group_size != 0:
+        pytest.skip(
+            f"hidden_size {hidden_size} not divisible by group_size {group_size}"
+        )
+
+    from vllm.model_executor.layers.layernorm import RMSNormGated
+
+    device = torch.device("cuda:0")
+    set_random_seed(42)
+
+    layer = RMSNormGated(
+        hidden_size,
+        eps=1e-5,
+        group_size=group_size,
+        norm_before_gate=norm_before_gate,
+        device=device,
+        dtype=dtype,
+    )
+
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    z = (
+        torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+        if has_gate
+        else None
+    )
+
+    out = layer.forward_native(x, z)
+
+    # Verify dtype preservation
+    assert out.dtype == dtype, f"Expected {dtype}, got {out.dtype}"
+
+    # Verify numerical correctness against reference
+    ref_out = rms_norm_ref(
+        x,
+        layer.weight,
+        layer.bias,
+        z=z,
+        eps=1e-5,
+        group_size=group_size,
+        norm_before_gate=norm_before_gate,
+        upcast=True,
+    )
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
 if __name__ == "__main__":
     # Run a quick smoke test
     test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False)
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 72f42de06ee5..2a1180dd6255 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -557,6 +557,11 @@ def forward_native(
             - norm_before_gate=True: out = norm(x) * silu(z)
             - norm_before_gate=False: out = norm(x * silu(z))
         """
+        orig_dtype = x.dtype
+        x = x.float()
+        weight = self.weight.float()
+        z = z.float() if z is not None else None
+
         # Apply gating before normalization if needed
         if z is not None and not self.norm_before_gate:
             x = x * F.silu(z)
@@ -566,7 +571,7 @@ def forward_native(
             # Standard RMS norm across the last dimension
             variance = x.pow(2).mean(dim=-1, keepdim=True)
             x_normed = x * torch.rsqrt(variance + self.eps)
-            out = x_normed * self.weight
+            out = x_normed * weight
         else:
             # Group RMS norm
             from einops import rearrange
@@ -574,13 +579,13 @@ def forward_native(
             x_group = rearrange(x, "... (g d) -> ... g d", d=self.group_size)
             variance = x_group.pow(2).mean(dim=-1, keepdim=True)
             x_normed = x_group * torch.rsqrt(variance + self.eps)
-            out = rearrange(x_normed, "... g d -> ... (g d)") * self.weight
+            out = rearrange(x_normed, "... g d -> ... (g d)") * weight
 
         # Apply gating after normalization if needed
         if z is not None and self.norm_before_gate:
             out = out * F.silu(z)
 
-        return out.to(x.dtype)
+        return out.to(orig_dtype)
 
     def forward_cuda(
         self, x: torch.Tensor, z: torch.Tensor | None = None

From e82fbeec7b360af4fb908bf67a659b22f93266d3 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Sun, 1 Mar 2026 16:44:22 -0500
Subject: [PATCH 144/154] [torch.compile] Undo the fast_moe_cold_start hack in
 torch>=2.11 (#35475)

Signed-off-by: Richard Zou <zou3519@gmail.com>
---
 vllm/config/vllm.py                           |  8 +++-
 vllm/env_override.py                          | 41 +++++++++++++++++++
 .../fused_moe/runner/default_moe_runner.py    | 35 ++++++++++++----
 vllm/utils/torch_utils.py                     | 35 ++++++++++++++++
 4 files changed, 109 insertions(+), 10 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 7f7b21316c7b..d781d778e604 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -883,7 +883,13 @@ def has_blocked_weights():
                     self.compilation_config.pass_config.enable_sp = False
                     self.compilation_config.pass_config.fuse_gemm_comms = False
 
-        if self.compilation_config.fast_moe_cold_start is None:
+        from vllm.utils.torch_utils import HAS_OPAQUE_TYPE
+
+        if HAS_OPAQUE_TYPE:
+            # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
+            # fast_moe_cold_start, so force it off.
+            self.compilation_config.fast_moe_cold_start = False
+        elif self.compilation_config.fast_moe_cold_start is None:
             # resolve default behavior: try to be as safe as possible
             # this config is unsafe if any spec decoding draft model has a MOE.
             # We'll conservatively turn it off if we see spec decoding.
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 181d000a68a7..27992218f9e8 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -482,3 +482,44 @@ def _patch_get_raw_stream_if_needed():
 
     PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched
+
+# ===================================================
+# torch 2.11 Inductor constrain_to_fx_strides monkeypatch
+# ===================================================
+# Patch the inductor's `constrain_to_fx_strides` to handle opaque
+# (non-tensor) arguments.  The original calls `.stride()` on every FX
+# arg's meta value, which crashes on FakeScriptObject (the compile-time
+# proxy for hoisted opaque types).  The patched version skips args
+# whose meta value is not a torch.Tensor.
+# Upstream issue: https://github.com/pytorch/pytorch/issues/175973
+
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+if is_torch_equal_or_newer("2.11.0.dev"):
+    import torch._inductor.ir as _ir
+    import torch._inductor.lowering as _lowering
+    from torch._inductor.virtualized import V as _V
+
+    _orig_constrain = _lowering.constrain_to_fx_strides
+
+    def _patched_constrain_to_fx_strides(fx_node, *args, **kwargs):
+        def apply_constraint(arg, fx_arg):
+            if isinstance(arg, _ir.IRNode):
+                meta_val = fx_arg.meta.get("val")
+                if isinstance(meta_val, torch.Tensor):
+                    stride_order = _ir.get_stride_order(
+                        meta_val.stride(), _V.graph.sizevars.shape_env
+                    )
+                    return _ir.ExternKernel.require_stride_order(arg, stride_order)
+                return arg
+            if isinstance(arg, dict):
+                return {key: apply_constraint(arg[key], fx_arg[key]) for key in arg}
+            return arg
+
+        args = tuple(
+            apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args)
+        )
+        kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()}
+        return args, kwargs
+
+    _lowering.constrain_to_fx_strides = _patched_constrain_to_fx_strides
diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
index 9c2adf7996ec..274929c071ac 100644
--- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import nullcontext
+from typing import TYPE_CHECKING
 
 import torch
 import torch.nn.functional as F
@@ -30,6 +31,8 @@
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import (
+    HAS_OPAQUE_TYPE,
+    ModuleName,
     aux_stream,
     current_stream,
     direct_register_custom_op,
@@ -56,13 +59,27 @@ def get_layer_from_name(layer_name: str) -> torch.nn.Module:
     return forward_context.no_compile_layers[layer_name]
 
 
+# On torch >= 2.11, layer_name is a hoisted ModuleName opaque object;
+# on older versions it remains a plain str.
+if TYPE_CHECKING:
+    from typing import TypeAlias
+
+    _layer_name_type: TypeAlias = str | ModuleName
+else:
+    _layer_name_type = ModuleName if HAS_OPAQUE_TYPE else str
+
+
+def _resolve_layer_name(layer_name: str | ModuleName) -> str:
+    return layer_name.value if isinstance(layer_name, ModuleName) else layer_name
+
+
 def _moe_forward(
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
     shared_experts_input: torch.Tensor | None,
-    layer_name: str,
+    layer_name: _layer_name_type,
 ) -> torch.Tensor:
-    layer = get_layer_from_name(layer_name)
+    layer = get_layer_from_name(_resolve_layer_name(layer_name))
     # TODO(bnell): this can be removed after MK migration is complete.
     layer.ensure_moe_quant_config_init()
     return layer.runner.forward_impl(
@@ -74,7 +91,7 @@ def _moe_forward_fake(
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
     shared_experts_input: torch.Tensor | None,
-    layer_name: str,
+    layer_name: _layer_name_type,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
@@ -83,9 +100,9 @@ def _moe_forward_shared(
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
     shared_experts_input: torch.Tensor | None,
-    layer_name: str,
+    layer_name: _layer_name_type,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    layer = get_layer_from_name(layer_name)
+    layer = get_layer_from_name(_resolve_layer_name(layer_name))
     # TODO(bnell): this can be removed after MK migration is complete.
     layer.ensure_moe_quant_config_init()
     return layer.runner.forward_impl(
@@ -97,7 +114,7 @@ def _moe_forward_shared_fake(
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
     shared_experts_input: torch.Tensor | None,
-    layer_name: str,
+    layer_name: _layer_name_type,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     # Output shapes:
     # - fused_out: same as hidden_states (routed experts use transformed size)
@@ -105,12 +122,10 @@ def _moe_forward_shared_fake(
     #               hidden_states
     # (For latent MoE: shared experts use original hidden_size, not latent size)
     fused_out = torch.empty_like(hidden_states)
-
     if shared_experts_input is not None:
         shared_out = torch.empty_like(shared_experts_input)
     else:
         shared_out = torch.empty_like(hidden_states)
-
     return shared_out, fused_out
 
 
@@ -367,7 +382,9 @@ def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor:
             assert len(trunc_sizes) == 1
             return func(states, trunc_sizes[0])
 
-    def _encode_layer_name(self) -> str:
+    def _encode_layer_name(self) -> str | ModuleName:
+        if HAS_OPAQUE_TYPE:
+            return ModuleName(self.layer_name)
         # Can be unavailable or None in unittests
         if (
             is_forward_context_available()
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index e834108ca19c..e4aa4fe61beb 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -740,6 +740,41 @@ def is_torch_equal(target: str) -> bool:
         return Version(importlib.metadata.version("torch")) == Version(target)
 
 
+HAS_OPAQUE_TYPE = is_torch_equal_or_newer("2.11.0.dev")
+
+if HAS_OPAQUE_TYPE:
+    from torch._opaque_base import OpaqueBase
+else:
+    OpaqueBase = object  # type: ignore[misc, assignment]
+
+
+class ModuleName(OpaqueBase):  # type: ignore[misc]
+    """Wraps a module name string for use as a torch opaque type.
+
+    When torch >= 2.11, this is registered as a hoisted value-type opaque
+    object so that torch.compile lifts it as a graph input instead of baking
+    it as a constant.  This avoids per-layer recompilation for MOE ops.
+    """
+
+    def __init__(self, value: str):
+        self.value = value
+
+    def __eq__(self, other):
+        return isinstance(other, ModuleName) and self.value == other.value
+
+    def __hash__(self):
+        return hash(self.value)
+
+    def __fx_repr__(self):
+        return (f"ModuleName({self.value!r})", {ModuleName})
+
+
+if HAS_OPAQUE_TYPE:
+    from torch._library.opaque_object import register_opaque_type
+
+    register_opaque_type(ModuleName, typ="value", hoist=True)
+
+
 # Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform
 def supports_xccl() -> bool:
     return torch.distributed.is_xccl_available()

From 57a96e26c913cb9fae96c9e600fa4ff10dc40a1a Mon Sep 17 00:00:00 2001
From: zhanqiuhu <49648934+ZhanqiuHu@users.noreply.github.com>
Date: Sun, 1 Mar 2026 17:32:37 -0500
Subject: [PATCH 145/154] Revert "[Bugfix] Disable TRTLLM attention with KV
 transfer enabled (#33192)" (#34832)

Signed-off-by: Zhanqiu Hu <zh338@cornell.edu>
---
 vllm/v1/attention/backends/flashinfer.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 5300cf56c5e7..233251d07667 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -575,20 +575,6 @@ def __init__(
         # try to use fp8 q if kv cache is fp8, and will fall back to model dtype
         # if TRTLLM attention kernel is not used when building attn metadata
         can_use_trtllm = can_use_trtllm_attention(self.num_qo_heads, self.num_kv_heads)
-
-        # TRTLLM attention requires strictly contiguous KV cache tensors.
-        # When KV transfer (P/D disaggregation) is enabled, the KV cache may be
-        # permuted into non-contiguous views, which causes assertion failures.
-        self._kv_transfer_enabled = vllm_config.kv_transfer_config is not None
-        if can_use_trtllm and self._kv_transfer_enabled:
-            logger.info_once(
-                "TRTLLM attention is disabled because KV transfer "
-                "(P/D disaggregation) is enabled. TRTLLM attention requires "
-                "strictly contiguous KV cache tensors which may not be "
-                "guaranteed with KV transfer."
-            )
-            can_use_trtllm = False
-
         if (
             can_use_trtllm
             and not vllm_config.attention_config.disable_flashinfer_q_quantization
@@ -865,9 +851,6 @@ def build(
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
         )
-        # KV transfer requires non-contiguous KV cache views, incompatible with TRTLLM
-        if self._kv_transfer_enabled:
-            prefill_use_trtllm = False
         decode_use_trtllm = (
             self.use_trtllm_decode_attention and self.dcp_world_size <= 1
         )

From 8b5014d3dd343736ccf3e26cd44a0bb7700d205c Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Sun, 1 Mar 2026 18:44:57 -0500
Subject: [PATCH 146/154] [Attention] FA4 integration (#32974)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .buildkite/test_areas/misc.yaml               |   1 +
 .gitignore                                    |   2 +
 cmake/external_projects/vllm_flash_attn.cmake |  85 ++-
 docs/design/attention_backends.md             |   3 +-
 requirements/cuda.txt                         |   4 +
 setup.py                                      |   5 +
 .../generate_attention_backend_docs.py        | 108 +++-
 vllm/config/attention.py                      |   4 +-
 .../layers/attention/mla_attention.py         |   4 +-
 .../layers/attention/mm_encoder_attention.py  |   4 +-
 vllm/v1/attention/backends/fa_utils.py        |  48 +-
 vllm/v1/attention/backends/flash_attn.py      |  10 +-
 vllm/v1/cudagraph_dispatcher.py               |   2 +-
 vllm/vllm_flash_attn/__init__.py              |  24 +
 vllm/vllm_flash_attn/flash_attn_interface.py  | 567 ++++++++++++++++++
 15 files changed, 817 insertions(+), 54 deletions(-)
 create mode 100644 vllm/vllm_flash_attn/__init__.py
 create mode 100644 vllm/vllm_flash_attn/flash_attn_interface.py

diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index 69390cd6d373..d8957c217755 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -9,6 +9,7 @@ steps:
     - tests/v1
   commands:
     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     # split the test to avoid interference
     - pytest -v -s -m 'not cpu_test' v1/core
     - pytest -v -s v1/executor
diff --git a/.gitignore b/.gitignore
index 8e864d090c9d..795071bd77f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
 
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/__init__.py
+!vllm/vllm_flash_attn/flash_attn_interface.py
 
 # OpenAI triton kernels copied from source
 vllm/third_party/triton_kernels/*
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 41c4e308d0be..c206b9c39ee1 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -17,7 +17,8 @@ endif()
 # They should be identical but if they aren't, this is a massive footgun.
 #
 # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
+# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2), --component _vllm_fa3_C (for FA3),
+# or --component _vllm_fa4_cutedsl_C (for FA4 CuteDSL Python files).
 # If no component is specified, vllm-flash-attn is still installed.
 
 # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
@@ -38,7 +39,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 5824e6e2008271063c3229ab3e7032bd74abbbc6
+          GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -46,38 +47,62 @@ else()
 endif()
 
 
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-# This is here to support installing all components under the same prefix with cmake --install.
-# setup.py installs every component separately but uses the same prefix for all.
-# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
-# and these statements don't hurt when installing neither component.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
+# Install rules for FA components need the install prefix nested under vllm/
+# These run at install time, before the FA library's own install rules
+foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C)
+  install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT ${_FA_COMPONENT})
+  install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT ${_FA_COMPONENT})
+  install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT ${_FA_COMPONENT})
+endforeach()
 
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 
-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+# Restore the install prefix after FA's install rules
+foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C)
+  install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT ${_FA_COMPONENT})
+  install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT ${_FA_COMPONENT})
+endforeach()
+
+# Install shared Python files for both FA2 and FA3 components
+foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C)
+  # Ensure the vllm/vllm_flash_attn directory exists before installation
+  install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")"
+    COMPONENT ${_FA_COMPONENT})
+
+  # Copy vllm_flash_attn python files (except __init__.py and flash_attn_interface.py
+  # which are source-controlled in vllm)
+  install(
+    DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+    DESTINATION vllm/vllm_flash_attn
+    COMPONENT ${_FA_COMPONENT}
+    FILES_MATCHING PATTERN "*.py"
+    PATTERN "__init__.py" EXCLUDE
+    PATTERN "flash_attn_interface.py" EXCLUDE
+  )
+
+endforeach()
 
-# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
-# case only one is built, in the case both are built redundant work is done)
-install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
-  COMPONENT _vllm_fa2_C
-  FILES_MATCHING PATTERN "*.py"
-)
+#
+# FA4 CuteDSL component
+# This is a Python-only component that copies the flash_attn/cute directory
+# and transforms imports to match our package structure.
+#
+add_custom_target(_vllm_fa4_cutedsl_C)
 
-install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
-  COMPONENT _vllm_fa3_C
-  FILES_MATCHING PATTERN "*.py"
-)
+# Copy flash_attn/cute directory (needed for FA4) and transform imports
+# The cute directory uses flash_attn.cute imports internally, which we replace
+# with vllm.vllm_flash_attn.cute to match our package structure.
+install(CODE "
+  file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
+  foreach(SRC_FILE \${CUTE_PY_FILES})
+    file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
+    set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\")
+    get_filename_component(DST_DIR \${DST_FILE} DIRECTORY)
+    file(MAKE_DIRECTORY \${DST_DIR})
+    file(READ \${SRC_FILE} FILE_CONTENTS)
+    string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
+    file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
+  endforeach()
+" COMPONENT _vllm_fa4_cutedsl_C)
diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md
index 6d5c007e3ea4..e726d99256f5 100644
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -168,6 +168,7 @@ Priority is **1 = highest** (tried first).
 | `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
 | `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
 | `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
+| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
 | `FLASH_ATTN_DIFFKV` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
 | `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
@@ -178,7 +179,7 @@ Priority is **1 = highest** (tried first).
 
 > **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`.
 >
-> **\*** Specify the FlashAttention version via `--attention-config.flash_attn_version=2` or `3`. Default is FA3 on SM90, FA2 otherwise.
+> **\*** Specify the FlashAttention version via `--attention-config.flash_attn_version=2`, `3`, or `4`. Default is FA4 on SM100+ (Blackwell), FA3 on SM90 (Hopper), FA2 otherwise.
 
 ## MLA (Multi-head Latent Attention) Backends
 
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 84fe347306cd..22477dc821b7 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -11,3 +11,7 @@ torchaudio==2.10.0
 torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.6.4
+
+# QuACK and Cutlass DSL for FA4 (cute-DSL implementation)
+nvidia-cutlass-dsl>=4.4.0.dev1
+quack-kernels>=0.2.7
diff --git a/setup.py b/setup.py
index afdd4b19b2f0..556a511a3429 100644
--- a/setup.py
+++ b/setup.py
@@ -976,6 +976,11 @@ def _read_requirements(filename: str) -> list[str]:
     ):
         # FA3 requires CUDA 12.3 or later
         ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+    # FA4 CuteDSL - Python-only component for FA4's cute DSL support
+    # Optional since this doesn't produce a .so file, just copies Python files
+    ext_modules.append(
+        CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True)
+    )
     if envs.VLLM_USE_PRECOMPILED or (
         CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9")
     ):
diff --git a/tools/pre_commit/generate_attention_backend_docs.py b/tools/pre_commit/generate_attention_backend_docs.py
index 3aca49f94599..628656f0df1a 100644
--- a/tools/pre_commit/generate_attention_backend_docs.py
+++ b/tools/pre_commit/generate_attention_backend_docs.py
@@ -563,14 +563,53 @@ def analyze_backend(backend_name: str, class_path: str) -> dict[str, Any] | None
 
 
 # ---------------------------------------------------------------------------
-# Special backend variant parsers (FA2/FA3, FlashInfer TRTLLM, MLA prefill)
+# Special backend variant parsers (FA2/FA3/FA4, FlashInfer TRTLLM, MLA prefill)
 # ---------------------------------------------------------------------------
 
 
+def _parse_fa4_supported_caps() -> str | None:
+    """Parse flash_attn_interface.py for FA4 supported compute capabilities.
+
+    Looks for `cc not in [9, 10, 11]` pattern in _is_fa4_supported().
+    """
+    fa_interface_file = (
+        REPO_ROOT / "vllm" / "vllm_flash_attn" / "flash_attn_interface.py"
+    )
+    if not fa_interface_file.exists():
+        return None
+
+    try:
+        tree = ast.parse(fa_interface_file.read_text())
+    except Exception:
+        return None
+
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.FunctionDef) or node.name != "_is_fa4_supported":
+            continue
+        for n in ast.walk(node):
+            if not (
+                isinstance(n, ast.Compare)
+                and len(n.ops) == 1
+                and isinstance(n.ops[0], ast.NotIn)
+                and isinstance(n.comparators[0], ast.List)
+            ):
+                continue
+            caps: list[int] = [
+                e.value
+                for e in n.comparators[0].elts
+                if isinstance(e, ast.Constant) and isinstance(e.value, int)
+            ]
+            if caps:
+                caps.sort()
+                return f"{caps[0]}.x-{caps[-1]}.x"
+
+    return None
+
+
 def parse_flash_attn_features() -> dict[str, dict[str, Any]]:
-    """Parse fa_utils.py to detect FA2 vs FA3 feature differences.
+    """Parse fa_utils.py to detect FA2 vs FA3 vs FA4 feature differences.
 
-    Returns a dict with 'fa2' and 'fa3' keys containing their respective
+    Returns a dict with 'fa2', 'fa3', and 'fa4' keys containing their respective
     feature overrides for compute capability, KV cache dtypes, and sink support.
     """
     if not FA_UTILS_FILE.exists():
@@ -585,6 +624,7 @@ def parse_flash_attn_features() -> dict[str, dict[str, Any]]:
     fa3_supports_fp8 = False
     fa3_supports_sinks = False
     fa3_compute_cap: str | None = None
+    fa4_compute_cap: str | None = None
 
     for node in ast.walk(tree):
         if not isinstance(node, ast.FunctionDef):
@@ -614,14 +654,12 @@ def parse_flash_attn_features() -> dict[str, dict[str, Any]]:
                     fa3_supports_sinks = True
                     break
 
-        # Check get_flash_attn_version for FA3 compute capability
-        # Look for the ternary: 3 if (device_capability.major == 9 ...) else 2
+        # Check get_flash_attn_version for FA3/FA4 compute capability
         if node.name == "get_flash_attn_version":
             for n in ast.walk(node):
-                # Look for IfExp (ternary) with `device_capability.major == 9`
+                # Handle IfExp (ternary) with `device_capability.major == 9`
                 if isinstance(n, ast.IfExp):
                     test = n.test
-                    # Check if test is a BoolOp (and) containing the major check
                     if isinstance(test, ast.BoolOp):
                         for val in test.values:
                             if (
@@ -634,6 +672,38 @@ def parse_flash_attn_features() -> dict[str, dict[str, Any]]:
                                 fa3_compute_cap = f"{val.comparators[0].value}.x"
                                 break
 
+                # Handle If statements for FA3/FA4 detection
+                # e.g. `if device_capability.major == 9` -> FA3
+                #      `elif device_capability.major >= 10` -> FA4
+                if isinstance(n, ast.If):
+                    test = n.test
+                    comparisons = (
+                        [v for v in test.values if isinstance(v, ast.Compare)]
+                        if isinstance(test, ast.BoolOp)
+                        else [test]
+                        if isinstance(test, ast.Compare)
+                        else []
+                    )
+                    for comp in comparisons:
+                        if not (
+                            isinstance(comp.left, ast.Attribute)
+                            and comp.left.attr == "major"
+                            and comp.comparators
+                            and isinstance(comp.comparators[0], ast.Constant)
+                            and isinstance(comp.comparators[0].value, int)
+                        ):
+                            continue
+                        op = comp.ops[0]
+                        val = comp.comparators[0].value
+                        if isinstance(op, ast.Eq) and fa3_compute_cap is None:
+                            fa3_compute_cap = f"{val}.x"
+                        elif isinstance(op, ast.GtE) and fa4_compute_cap is None:
+                            fa4_compute_cap = f"≥{val}.0"
+
+    # Fallback: try to parse FA4 compute caps from flash_attn_interface.py
+    if fa4_compute_cap is None:
+        fa4_compute_cap = _parse_fa4_supported_caps()
+
     return {
         "fa2": {
             "supports_fp8": False,
@@ -644,6 +714,11 @@ def parse_flash_attn_features() -> dict[str, dict[str, Any]]:
             "supports_fp8": fa3_supports_fp8,
             "supports_sink": fa3_supports_sinks,
         },
+        "fa4": {
+            "compute_capability": fa4_compute_cap,
+            "supports_fp8": False,
+            "supports_sink": False,
+        },
     }
 
 
@@ -760,7 +835,7 @@ def parse_mla_prefill_backends() -> list[dict[str, Any]]:
 
 
 # ---------------------------------------------------------------------------
-# Backend variant expansion (FA2/FA3, FlashInfer native/TRTLLM)
+# Backend variant expansion (FA2/FA3/FA4, FlashInfer native/TRTLLM)
 # ---------------------------------------------------------------------------
 
 
@@ -768,7 +843,7 @@ def _expand_flash_attn_variants(
     all_backends: list[dict[str, Any]],
     fa_features: dict[str, dict[str, Any]],
 ) -> list[dict[str, Any]]:
-    """Expand FLASH_ATTN into FA2 and FA3 variants with different capabilities."""
+    """Expand FLASH_ATTN into FA2, FA3, and FA4 variants."""
     expanded = []
     for backend in all_backends:
         if backend["name"] != "FLASH_ATTN":
@@ -801,6 +876,18 @@ def _expand_flash_attn_variants(
 
         expanded.append(fa2)
         expanded.append(fa3)
+
+        # Create FA4 entry if FA4 features are available
+        if "fa4" in fa_features:
+            fa4 = backend.copy()
+            fa4["version"] = "FA4*"
+            fa4["_sort_key"] = "FLASH_ATTN"
+            fa4["_sort_order"] = 2
+            if fa_features["fa4"].get("compute_capability"):
+                fa4["compute_capability"] = fa_features["fa4"]["compute_capability"]
+            fa4["supports_sink"] = fa_features["fa4"]["supports_sink"]
+            expanded.append(fa4)
+
     return expanded
 
 
@@ -1360,7 +1447,8 @@ def generate_docs() -> str:
     if fa_features:
         footnotes.append(
             "> **\\*** Specify the FlashAttention version via "
-            "`--attention-config.flash_attn_version=2` or `3`. Default is FA3 on SM90, "
+            "`--attention-config.flash_attn_version=2`, `3`, or `4`. "
+            "Default is FA4 on SM100+ (Blackwell), FA3 on SM90 (Hopper), "
             "FA2 otherwise."
         )
     if footnotes:
diff --git a/vllm/config/attention.py b/vllm/config/attention.py
index 97a139c79a39..74bb3d68fd15 100644
--- a/vllm/config/attention.py
+++ b/vllm/config/attention.py
@@ -16,8 +16,8 @@ class AttentionConfig:
     backend: AttentionBackendEnum | None = None
     """Attention backend to use. If None, will be selected automatically."""
 
-    flash_attn_version: Literal[2, 3] | None = None
-    """Force vllm to use a specific flash-attention version (2 or 3).
+    flash_attn_version: Literal[2, 3, 4] | None = None
+    """Force vllm to use a specific flash-attention version (2, 3, or 4).
     Only valid when using the flash-attention backend."""
 
     use_prefill_decode_attention: bool = False
diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index d444e20dad9e..f6e7ab85d14e 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -2014,7 +2014,9 @@ def __init__(
             # RoCM and the latter has an additional parameter to control
             # FA2 vs FA3
             self.flash_attn_varlen_func = flash_attn_varlen_func
-            self.vllm_flash_attn_version = get_flash_attn_version()
+            self.vllm_flash_attn_version = get_flash_attn_version(
+                head_size=self.qk_head_dim
+            )
             if self.vllm_flash_attn_version is not None:
                 self.flash_attn_varlen_func = functools.partial(
                     flash_attn_varlen_func, fa_version=self.vllm_flash_attn_version
diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py
index d89366bbdfe3..d902f2ebceba 100644
--- a/vllm/model_executor/layers/attention/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -204,7 +204,9 @@ def __init__(
         }
 
         self._fa_version = (
-            get_flash_attn_version() if self.is_flash_attn_backend else None
+            get_flash_attn_version(head_size=head_size)
+            if self.is_flash_attn_backend
+            else None
         )
 
         if self.attn_backend == AttentionBackendEnum.FLASHINFER:
diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
index 3150ad9a5505..9658a7e3c175 100644
--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -52,7 +52,9 @@ def get_scheduler_metadata(*args: Any, **kwargs: Any) -> None:  # type: ignore[m
     reshape_and_cache_flash = ops.reshape_and_cache_flash
 
 
-def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
+def get_flash_attn_version(
+    requires_alibi: bool = False, head_size: int | None = None
+) -> int | None:
     # import here to avoid circular dependencies
     from vllm.platforms import current_platform
 
@@ -72,9 +74,15 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
         assert device_capability is not None
 
         # 1. default version depending on platform
-        fa_version = (
-            3 if (device_capability.major == 9 and is_fa_version_supported(3)) else 2
-        )
+        if device_capability.major == 9 and is_fa_version_supported(3):
+            # Hopper (SM90): prefer FA3
+            fa_version = 3
+        elif device_capability.major == 10 and is_fa_version_supported(4):
+            # Blackwell (SM100+, restrict to SM100 for now): prefer FA4
+            fa_version = 4
+        else:
+            # Fallback to FA2
+            fa_version = 2
 
         # 2. override if passed by environment or config
         from vllm.config import get_current_vllm_config_or_none
@@ -87,12 +95,12 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
             fa_version = vllm_config.attention_config.flash_attn_version
 
         # 3. fallback for unsupported combinations
-        if device_capability.major == 10 and fa_version == 3:
+        if device_capability.major >= 10 and fa_version == 3:
             logger.warning_once(
                 "Cannot use FA version 3 on Blackwell platform, "
-                "defaulting to FA version 2."
+                "defaulting to FA version 4 if supported, otherwise FA2."
             )
-            fa_version = 2
+            fa_version = 4 if is_fa_version_supported(4) else 2
 
         if requires_alibi and fa_version == 3:
             logger.warning_once(
@@ -100,6 +108,28 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
             )
             fa_version = 2
 
+        if requires_alibi and fa_version == 4:
+            logger.warning_once(
+                "Cannot use FA version 4 with ALiBi, defaulting to FA version 2."
+            )
+            fa_version = 2
+
+        # FA4 on SM100 (Blackwell) has TMEM capacity limits that restrict
+        # supported head dimensions.
+        # See: https://github.com/Dao-AILab/flash-attention/issues/1959
+        if (
+            fa_version == 4
+            and device_capability.major >= 10
+            and head_size is not None
+            and head_size > 128
+        ):
+            logger.warning_once(
+                "FA4 on Blackwell does not support head_size=%d due to TMEM "
+                "capacity limits, defaulting to FA version 2.",
+                head_size,
+            )
+            fa_version = 2
+
         if not is_fa_version_supported(fa_version):
             logger.error(
                 "Cannot use FA version %d is not supported due to %s",
@@ -139,6 +169,10 @@ def flash_attn_supports_mla():
             return is_fa_version_supported(
                 3
             ) and current_platform.is_device_capability_family(90)
+
+            # NOTE(Lucas): FA4 CuteDSL does NOT currently support MLA's non-standard
+            # head dimensions (576 for qk, 512 for v) due to TMEM capacity limits.
+
         except (ImportError, AssertionError):
             pass
     return False
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 940dc7515491..91c49c55c147 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -580,7 +580,15 @@ def __init__(
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
         self.attn_type = attn_type
-        self.vllm_flash_attn_version = get_flash_attn_version()
+        self.vllm_flash_attn_version = get_flash_attn_version(
+            requires_alibi=alibi_slopes is not None,
+            head_size=head_size,
+        )
+        logger.info_once(
+            "Using FlashAttention version %s",
+            self.vllm_flash_attn_version,
+            scope="local",
+        )
         # Cache the batch invariant result for use in forward passes
         self.batch_invariant_enabled = vllm_is_batch_invariant()
 
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 1578209e612e..be459cd296a4 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -137,7 +137,7 @@ def _create_padded_batch_descriptor(
         num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
 
         if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL):
-            num_reqs = num_tokens_padded // uniform_decode_query_len
+            num_reqs = min(num_tokens_padded // uniform_decode_query_len, max_num_seqs)
             assert num_tokens_padded % uniform_decode_query_len == 0
         else:
             uniform_decode = False
diff --git a/vllm/vllm_flash_attn/__init__.py b/vllm/vllm_flash_attn/__init__.py
new file mode 100644
index 000000000000..3507defabaea
--- /dev/null
+++ b/vllm/vllm_flash_attn/__init__.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.vllm_flash_attn.flash_attn_interface import (
+    FA2_AVAILABLE,
+    FA3_AVAILABLE,
+    fa_version_unsupported_reason,
+    flash_attn_varlen_func,
+    get_scheduler_metadata,
+    is_fa_version_supported,
+)
+
+if not (FA2_AVAILABLE or FA3_AVAILABLE):
+    raise ImportError(
+        "vllm.vllm_flash_attn requires the CUDA flash attention extensions "
+        "(_vllm_fa2_C or _vllm_fa3_C). On ROCm, use upstream flash_attn."
+    )
+
+__all__ = [
+    "fa_version_unsupported_reason",
+    "flash_attn_varlen_func",
+    "get_scheduler_metadata",
+    "is_fa_version_supported",
+]
diff --git a/vllm/vllm_flash_attn/flash_attn_interface.py b/vllm/vllm_flash_attn/flash_attn_interface.py
new file mode 100644
index 000000000000..9d9a9be2f316
--- /dev/null
+++ b/vllm/vllm_flash_attn/flash_attn_interface.py
@@ -0,0 +1,567 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2023, Tri Dao.
+# ruff: noqa: E501
+
+
+import torch
+
+# isort: off
+# We need to import the CUDA kernels after importing torch
+# Use relative import to support build-from-source installation in vLLM
+
+try:
+    from . import _vllm_fa2_C  # type: ignore[attr-defined]  # noqa: F401
+
+    FA2_UNAVAILABLE_REASON = None
+    FA2_AVAILABLE = True
+except ImportError as e:
+    FA2_UNAVAILABLE_REASON = str(e)
+    FA2_AVAILABLE = False
+
+try:
+    from . import _vllm_fa3_C  # type: ignore[attr-defined]  # noqa: F401
+
+    FA3_UNAVAILABLE_REASON = None
+    FA3_AVAILABLE = True
+except ImportError as e:
+    FA3_UNAVAILABLE_REASON = str(e)
+    FA3_AVAILABLE = False
+
+
+try:
+    import os
+
+    _cute_interface_path = os.path.join(
+        os.path.dirname(__file__), "cute", "interface.py"
+    )
+    if not os.path.exists(_cute_interface_path):
+        raise ImportError("vllm.vllm_flash_attn.cute.interface not found")
+
+    FA4_UNAVAILABLE_REASON = None
+    FA4_AVAILABLE = True
+except (ImportError, ModuleNotFoundError) as e:
+    FA4_UNAVAILABLE_REASON = str(e)
+    FA4_AVAILABLE = False
+
+# isort: on
+
+DEFAULT_FA_VERSION = 2
+
+
+def _is_fa2_supported() -> tuple[bool, str | None]:
+    if not FA2_AVAILABLE:
+        return False, f"FA2 is unavailable due to: {FA2_UNAVAILABLE_REASON}"
+    from vllm.platforms import current_platform
+
+    if not current_platform.has_device_capability(80):
+        return False, "FA2 is only supported on devices with compute capability >= 8"
+    return True, None
+
+
+def _is_fa3_supported() -> tuple[bool, str | None]:
+    if not FA3_AVAILABLE:
+        return False, f"FA3 is unavailable due to: {FA3_UNAVAILABLE_REASON}"
+    from vllm.platforms import current_platform
+
+    if not current_platform.is_device_capability_family(90):
+        return False, "FA3 is only supported on devices with compute capability 9.x"
+    return True, None
+
+
+def _is_fa4_supported() -> tuple[bool, str | None]:
+    if not FA4_AVAILABLE:
+        return False, f"FA4 is unavailable due to: {FA4_UNAVAILABLE_REASON}"
+    from vllm.platforms import current_platform
+
+    if not (
+        current_platform.is_device_capability_family(90)
+        or current_platform.is_device_capability_family(100)
+        or current_platform.is_device_capability_family(110)
+    ):
+        return (
+            False,
+            "FA4 is only supported on devices with compute capability 9.x, 10.x, or 11.x",
+        )
+    return True, None
+
+
+def is_fa_version_supported(fa_version: int) -> bool:
+    if fa_version == 2:
+        return _is_fa2_supported()[0]
+    elif fa_version == 3:
+        return _is_fa3_supported()[0]
+    elif fa_version == 4:
+        return _is_fa4_supported()[0]
+    else:
+        raise ValueError(f"Unsupported FA version: {fa_version}")
+
+
+def fa_version_unsupported_reason(fa_version: int) -> str | None:
+    if fa_version == 2:
+        return _is_fa2_supported()[1]
+    elif fa_version == 3:
+        return _is_fa3_supported()[1]
+    elif fa_version == 4:
+        return _is_fa4_supported()[1]
+    else:
+        raise ValueError(f"Unsupported FA version: {fa_version}")
+
+
+#
+#  For vLLM we only care about `flash_attn_varlen_func` and
+#   `flash_attn_with_kvcache` so we only maintain wrappers for these two.
+#
+
+
+def maybe_contiguous(x):
+    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
+
+
+# NOTE only used in FA3
+def get_scheduler_metadata(
+    batch_size,
+    max_seqlen_q,
+    max_seqlen_k,
+    num_heads_q,
+    num_heads_kv,
+    headdim,
+    cache_seqlens: torch.Tensor,
+    qkv_dtype=torch.bfloat16,
+    headdim_v=None,
+    cu_seqlens_q: torch.Tensor | None = None,
+    cu_seqlens_k_new: torch.Tensor | None = None,
+    cache_leftpad: torch.Tensor | None = None,
+    page_size: int | None = None,
+    max_seqlen_k_new=0,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite context window
+    has_softcap=False,
+    num_splits=0,  # Can be tuned for speed
+    pack_gqa=None,  # Can be tuned for speed
+    sm_margin=0,  # Can be tuned if some SMs are used for communication
+):
+    cache_seqlens = maybe_contiguous(cache_seqlens)
+    if headdim_v is None:
+        headdim_v = headdim
+    scheduler_metadata = torch.ops._vllm_fa3_C.get_scheduler_metadata(
+        batch_size,
+        max_seqlen_q,
+        max_seqlen_k,
+        num_heads_q,
+        num_heads_kv,
+        headdim,
+        headdim_v,
+        qkv_dtype,
+        cache_seqlens,
+        cu_seqlens_q,
+        None,  # cu_seqlens_k
+        cu_seqlens_k_new,
+        None,  # seqused_q
+        cache_leftpad,
+        page_size,
+        max_seqlen_k_new,
+        causal,
+        window_size[0],
+        window_size[1],
+        has_softcap,
+        num_splits,
+        pack_gqa,
+        sm_margin,
+    )
+
+    return scheduler_metadata
+
+
+def flash_attn_varlen_func(
+    q,
+    k,
+    v,
+    max_seqlen_q,
+    cu_seqlens_q,
+    max_seqlen_k,
+    cu_seqlens_k=None,  # only used for non-paged prefill
+    seqused_k=None,
+    q_v=None,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size: list[int] | None = None,
+    softcap=0.0,  # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+    block_table=None,
+    return_softmax_lse=False,
+    out=None,
+    # FA3 Only
+    scheduler_metadata=None,
+    q_descale=None,
+    k_descale=None,
+    v_descale=None,
+    num_splits: int = 0,
+    # Version selector
+    fa_version: int = DEFAULT_FA_VERSION,
+    s_aux=None,
+    cp_world_size=1,
+    cp_rank=0,
+    cp_tot_seqused_k=None,
+):
+    """dropout_p should be set to 0.0 during evaluation
+    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
+    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
+    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
+    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.
+
+    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
+    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
+        1 1 1 1 0
+        1 1 1 1 1
+    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
+        0 0
+        0 0
+        0 0
+        1 0
+        1 1
+    If the row of the mask is all zero, the output will be zero.
+
+    If window_size != (-1, -1), implements sliding window local attention. Query at position i
+    will only attend to keys between
+    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
+
+    Arguments:
+        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
+        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into q.
+        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into kv.
+        max_seqlen_q: int. Maximum query sequence length in the batch.
+        max_seqlen_k: int. Maximum key sequence length in the batch.
+        dropout_p: float. Dropout probability.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+        softcap: float. Anything > 0 activates softcapping attention.
+        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+            is added to the attention score of query i and key j.
+        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+            which is slightly slower and uses more memory. The forward pass is always deterministic.
+        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
+           testing only. The returned probabilities are not guaranteed to be correct
+           (they might not have the right scaling).
+    Return:
+        out: (total, nheads, headdim).
+        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
+    assert cu_seqlens_k is not None or seqused_k is not None, (
+        "cu_seqlens_k or seqused_k must be provided"
+    )
+    assert cu_seqlens_k is None or seqused_k is None, (
+        "cu_seqlens_k and seqused_k cannot be provided at the same time"
+    )
+    assert block_table is None or seqused_k is not None, (
+        "seqused_k must be provided if block_table is provided"
+    )
+
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+    # custom op does not support non-tuple input
+    real_window_size: tuple[int, int]
+    if window_size is None:
+        real_window_size = (-1, -1)
+    else:
+        assert len(window_size) == 2
+        real_window_size = (window_size[0], window_size[1])
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+
+    dummy_cu_seqlens_k = torch.empty_like(cu_seqlens_q)
+
+    if fa_version == 2:
+        if (
+            scheduler_metadata is not None
+            and q_descale is not None
+            and k_descale is not None
+            and v_descale is not None
+        ):
+            raise NotImplementedError(
+                "FA2 does not support scheduler_metadata, q_descale, "
+                "k_descale, v_descale"
+            )
+        if s_aux is not None:
+            raise NotImplementedError("FA2 does not support s_aux")
+        if num_splits > 1:
+            raise NotImplementedError("FA2 does not support num_splits > 1")
+        out, softmax_lse = torch.ops._vllm_fa2_C.varlen_fwd(
+            q,
+            k,
+            v,
+            out,
+            cu_seqlens_q,
+            # cu_seqlens_k not used since we use seqused_k, but flash_api.cpp
+            # still wants it so we pass all zeros
+            dummy_cu_seqlens_k if cu_seqlens_k is None else cu_seqlens_k,
+            seqused_k,
+            None,
+            block_table,
+            alibi_slopes,
+            max_seqlen_q,
+            max_seqlen_k,
+            dropout_p,
+            softmax_scale,
+            False,
+            causal,
+            real_window_size[0],
+            real_window_size[1],
+            softcap,
+            return_softmax_lse and dropout_p > 0,
+            num_splits,
+            None,
+        )
+    elif fa_version == 3:
+        assert alibi_slopes is None, "Alibi is not supported in FA3"
+        out, softmax_lse, _, _ = torch.ops._vllm_fa3_C.fwd(
+            q,
+            k,
+            v,
+            None,
+            None,  # k_new, v_new
+            q_v,
+            out,
+            cu_seqlens_q,
+            cu_seqlens_k,  # cu_seqlens_k
+            None,  # cu_seqlens_k_new
+            None,
+            seqused_k,  # seqused_q, seqused_k
+            max_seqlen_q,
+            max_seqlen_k,
+            block_table,
+            None,  # kv_batch_idx
+            None,  # leftpad_k
+            None,
+            None,
+            None,  # rotary_cos, rotary_sin, seqlens_rotary
+            q_descale,
+            k_descale,
+            v_descale,
+            softmax_scale,
+            causal,
+            real_window_size[0],
+            real_window_size[1],
+            softcap,
+            True,  # rotary_interleaved
+            scheduler_metadata,
+            num_splits,
+            None,  # pack_gqa
+            0,  # sm_margin
+            s_aux,  # s_aux
+            cp_world_size,
+            cp_rank,
+            cp_tot_seqused_k,
+        )
+    elif fa_version == 4:
+        assert alibi_slopes is None, "Alibi is not supported in FA4"
+        # FA4 on SM90 doesn't support paged KV; SM100+ does
+        from vllm.platforms import current_platform
+
+        if block_table is not None and current_platform.is_device_capability_family(90):
+            raise NotImplementedError(
+                "FA4 with paged KV is not supported on SM90 (Hopper). "
+                "Use FA3 or upgrade to Blackwell (SM100+)."
+            )
+        from vllm.vllm_flash_attn.cute.interface import _flash_attn_fwd
+
+        out, softmax_lse = _flash_attn_fwd(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            seqused_k=seqused_k,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
+            page_table=block_table,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            softcap=softcap,
+            window_size_left=real_window_size[0] if real_window_size[0] >= 0 else None,
+            window_size_right=real_window_size[1] if real_window_size[1] >= 0 else None,
+            num_splits=num_splits,
+            return_lse=return_softmax_lse,
+            out=out,
+        )
+    else:
+        raise ValueError(f"Unsupported FA version: {fa_version}")
+    return (out, softmax_lse) if return_softmax_lse else out
+
+
+def sparse_attn_func(
+    q,
+    k,
+    v,
+    block_count,
+    block_offset,
+    column_count,
+    column_index,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    softcap=0.0,  # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+    *,
+    return_softmax_lse=False,
+    out=None,
+):
+    """Compute attention with vertical and slash sparsity patterns.
+    Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
+    block_count and block_offset for slash sparsity patterns, and
+    column_count and column_index for vertical sparsity patterns.
+    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.
+
+    Arguments:
+        q: (batch_size, seqlen, nheads, headdim)
+        k: (batch_size, seqlen, nheads_k, headdim)
+        v: (batch_size, seqlen, nheads_k, headdim)
+        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
+        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
+        dropout_p: float. Dropout probability.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+            is added to the attention score of query i and key j.
+        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+            which is slightly slower and uses more memory. The forward pass is always deterministic.
+        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
+           testing only. The returned probabilities are not guaranteed to be correct
+           (they might not have the right scaling).
+    Return:
+        out: (batch_size, seqlen, nheads, headdim).
+        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    out, softmax_lse = torch.ops._vllm_fa2_C.fwd_sparse(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        out,
+        alibi_slopes,
+        dropout_p,
+        softmax_scale,
+        causal,
+        softcap,
+        return_attn_probs and dropout_p > 0,
+        None,
+    )
+    return (out, softmax_lse) if return_softmax_lse else out
+
+
+def sparse_attn_varlen_func(
+    q,
+    k,
+    v,
+    block_count,
+    block_offset,
+    column_count,
+    column_index,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    softcap=0.0,  # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+    *,
+    return_softmax_lse=False,
+    out=None,
+):
+    """Compute attention with vertical and slash sparsity patterns.
+    Most Arguments are the same with the flash_attn_varlen_func interface, except for 4 extra args:
+    block_count and block_offset for slash sparsity patterns, and
+    column_count and column_index for vertical sparsity patterns.
+    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.
+
+    Arguments:
+        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
+        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
+        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
+        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into q.
+        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into kv.
+        max_seqlen_q: int. Maximum query sequence length in the batch.
+        max_seqlen_k: int. Maximum key sequence length in the batch.
+        dropout_p: float. Dropout probability.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+        softcap: float. Anything > 0 activates softcapping attention.
+        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+            is added to the attention score of query i and key j.
+        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+            which is slightly slower and uses more memory. The forward pass is always deterministic.
+        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
+           testing only. The returned probabilities are not guaranteed to be correct
+           (they might not have the right scaling).
+    Return:
+        out: (total, nheads, headdim).
+        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    out, softmax_lse = torch.ops._vllm_fa2_C.varlen_fwd_sparse(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        None,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale,
+        False,
+        causal,
+        softcap,
+        return_attn_probs and dropout_p > 0,
+        None,
+    )
+    return (out, softmax_lse) if return_softmax_lse else out

From a60985b07eaf0fefa4c353ad2309b745707591ed Mon Sep 17 00:00:00 2001
From: Jesse Cai <jessecai@fb.com>
Date: Sun, 1 Mar 2026 17:32:03 -0800
Subject: [PATCH 147/154] Fix deprecated v1 config tests (#35327)

Signed-off-by: Jesse Cai <jessecai@fb.com>
---
 tests/quantization/test_torchao.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index c859f890bddf..fb794baa53f0 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -20,7 +20,7 @@
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
     with vllm_runner(
-        "drisspg/fp8-opt-125m",
+        "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.15.0",
         quantization="torchao",
         dtype="bfloat16",
         enforce_eager=True,
@@ -52,22 +52,6 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_loca
         assert output
 
 
-@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
-def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
-    torch._dynamo.reset()
-    model_name = "jerryzh168/opt-125m-int4wo-per-module"
-    with vllm_runner(
-        model_name=model_name,
-        quantization="torchao",
-        dtype="bfloat16",
-        pt_load_map_location="cuda:0",
-        enforce_eager=True,
-    ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
-
-        assert output
-
-
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
     torch._dynamo.reset()

From 92f5d0f070ec9e0ca5fe72af672138a9bbd1cb68 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Mon, 2 Mar 2026 11:48:39 +0800
Subject: [PATCH 148/154] [XPU] fix mxfp4 activation type (#35691)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 0ad1b8931270..8856eb1e2e49 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1258,7 +1258,7 @@ def apply_monolithic(
             topk_weights=routing_weights,
             topk_ids=selected_experts,
             n_experts_per_token=layer.top_k,
-            activation=layer.activation,
+            activation=layer.activation.value,
             num_experts=layer.local_num_experts,
             is_mxfp4=True,
         )

From f26650d649aac25cb3b7a6b49863e1929da5df32 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Mon, 2 Mar 2026 01:02:43 -0500
Subject: [PATCH 149/154] [ROCm] add amd-quark package in requirements for rocm
 to use quantized models (#35658)

Signed-off-by: Hongxia Yang <hongxiay.yang@amd.com>
Co-authored-by: Hongxia Yang <hongxiay.yang@amd.com>
---
 requirements/rocm.txt            |  5 ++++-
 tests/quantization/test_quark.py | 25 ++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 9f2b39199350..fcc67e463e15 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -19,4 +19,7 @@ setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 runai-model-streamer[s3,gcs]==0.15.3
 conch-triton-kernels==1.2.1
-timm>=1.0.17
\ No newline at end of file
+timm>=1.0.17
+# amd-quark: required for Quark quantization on ROCm 
+# To be consistent with test_quark.py
+amd-quark>=0.8.99
\ No newline at end of file
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index 0ff6e8407ce6..a560494a4e75 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -26,9 +26,12 @@
 
 from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
 
+# Minimum amd-quark version for MXFP4/OCP_MX tests (single source of truth).
+QUARK_MXFP4_MIN_VERSION = "0.8.99"
+
 QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
     importlib.metadata.version("amd-quark")
-) >= version.parse("0.8.99")
+) >= version.parse(QUARK_MXFP4_MIN_VERSION)
 
 if QUARK_MXFP4_AVAILABLE:
     from quark.torch.export.nn.modules.realquantizer import StaticScaledRealQuantizer
@@ -200,7 +203,10 @@ def get_model_args(
 ]
 
 
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(
+    not QUARK_MXFP4_AVAILABLE,
+    reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
+)
 @pytest.mark.parametrize("config", WIKITEXT_ACCURACY_CONFIGS)
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
@@ -231,7 +237,10 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
 
 
 @pytest.mark.parametrize("config", GSM8K_ACCURACY_CONFIGS)
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(
+    not QUARK_MXFP4_AVAILABLE,
+    reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
+)
 @pytest.mark.skipif(
     not HF_HUB_AMD_ORG_ACCESS,
     reason="Read access to huggingface.co/amd is required for this test.",
@@ -261,7 +270,10 @@ def test_mxfp4_gsm8k_correctness(config: AccuracyTestConfig):
     ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
 
 
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(
+    not QUARK_MXFP4_AVAILABLE,
+    reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
+)
 @pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
 def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype, scalings: list[int]):
@@ -289,7 +301,10 @@ def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype, scalings: list[in
         )
 
 
-@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
+@pytest.mark.skipif(
+    not QUARK_MXFP4_AVAILABLE,
+    reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
+)
 @pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
 def test_mxfp4_dequant_kernel_match_quark(

From c34963f13873d0919f9431ca5e7b21d03ae1c1e6 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Mon, 2 Mar 2026 01:04:18 -0600
Subject: [PATCH 150/154] [ROCm][CI] Disable skinny GEMMs in language model
 standard tests to fix non-determinism (#35152)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 tests/models/language/generation/conftest.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/models/language/generation/conftest.py b/tests/models/language/generation/conftest.py
index f423b656b2f2..aeb13bde4602 100644
--- a/tests/models/language/generation/conftest.py
+++ b/tests/models/language/generation/conftest.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM language generation tests."""
 
+import os
 import warnings
 
 import torch
@@ -9,6 +10,23 @@
 from vllm.platforms import current_platform
 
 
+def pytest_configure(config):
+    """Early ROCm configuration that must happen before test collection."""
+    if not current_platform.is_rocm():
+        return
+
+    # Disable skinny GEMM on ROCm to avoid non-deterministic results
+    # from atomic reductions in wvSplitKrc kernel.
+    # See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3906083975
+    os.environ["VLLM_ROCM_USE_SKINNY_GEMM"] = "0"
+    warnings.warn(
+        "ROCm: Set VLLM_ROCM_USE_SKINNY_GEMM=0 to avoid non-deterministic "
+        "results from skinny GEMM atomic reductions",
+        UserWarning,
+        stacklevel=1,
+    )
+
+
 def pytest_sessionstart(session):
     """Configure ROCm-specific settings before test session starts."""
     if not current_platform.is_rocm():

From cb21972a976b01e5ee1d899ab744207826435684 Mon Sep 17 00:00:00 2001
From: EdalatiAli <aliedalati@cohere.com>
Date: Mon, 2 Mar 2026 02:31:19 -0500
Subject: [PATCH 151/154] [Kernel] Integrate SM100 MXFP8 blockscaled grouped MM
 and quant kernels (#34448)

Signed-off-by: EdalatiAli <aliedalati@cohere.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 CMakeLists.txt                                |  27 ++
 .../moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu |  60 +++
 .../cutlass_mxfp8_grouped_mm_functor.cuh      | 141 ++++++
 .../cutlass_mxfp8_grouped_mm_launcher.cuh     | 179 ++++++++
 .../cutlass_mxfp8_grouped_mm_traits.cuh       | 127 ++++++
 csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu     |  60 +++
 csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh    | 414 ++++++++++++++++++
 csrc/torch_bindings.cpp                       |  16 +
 .../moe/test_cutlass_mxfp8_grouped_mm.py      | 237 ++++++++++
 vllm/_custom_ops.py                           |  70 +++
 10 files changed, 1331 insertions(+)
 create mode 100644 csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu
 create mode 100644 csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh
 create mode 100644 csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh
 create mode 100644 csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh
 create mode 100644 csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu
 create mode 100644 csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh
 create mode 100644 tests/kernels/moe/test_cutlass_mxfp8_grouped_mm.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 479d6db1eebc..65df275cd314 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -771,6 +771,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
+    set(SRCS
+      "csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
+      "csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
+    message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
+        AND ES_MXFP8_GROUPED_MM_ARCHS)
+      message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
+                     "not >= 12.8.")
+    else()
+      message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
+                     "in CUDA target architectures.")
+    endif()
+  endif()
+
   # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
     cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
diff --git a/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu
new file mode 100644
index 000000000000..f507f9299b03
--- /dev/null
+++ b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+// Adapted from SGLang:
+// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled.cu
+
+#include <torch/all.h>
+
+#include "cutlass_mxfp8_grouped_mm_launcher.cuh"
+
+void cutlass_mxfp8_grouped_mm(const torch::Tensor& a, const torch::Tensor& b,
+                              const torch::Tensor& sfa,
+                              const torch::Tensor& sfb, torch::Tensor& d,
+                              const torch::Tensor& problem_sizes,
+                              const torch::Tensor& expert_offsets,
+                              const torch::Tensor& blockscale_offsets) {
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
+  TORCH_CHECK(problem_sizes.size(1) == 3,
+              "problem_sizes must have shape (num_experts, 3)");
+  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
+              "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
+              "problem_sizes must be int32");
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt32,
+              "expert_offsets must be int32");
+  TORCH_CHECK(blockscale_offsets.dtype() == torch::kInt32,
+              "blockscale_offsets must be int32");
+  TORCH_CHECK(a.dim() == 2, "a must be a 2D tensor of shape (num_tokens, k)");
+  TORCH_CHECK(b.dim() == 3,
+              "b must be a 3D tensor of shape (num_experts, k, n)");
+  TORCH_CHECK(a.size(1) == b.size(1) && a.size(1) % 128 == 0,
+              "k should align 128");
+  TORCH_CHECK(b.size(2) % 128 == 0, "n should align 128");
+  TORCH_CHECK(a.strides()[1] == 1, "a must be row major");
+  TORCH_CHECK(b.strides()[1] == 1, "b must be column major");
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  if (d.dtype() == torch::kBFloat16) {
+    expert_specialization::cutlass_mxfp8_grouped_mm_dispatch_out_dtype<
+        cutlass::bfloat16_t>(a, b, sfa, sfb, d, problem_sizes, expert_offsets,
+                             blockscale_offsets, stream);
+  } else if (d.dtype() == torch::kFloat16) {
+    expert_specialization::cutlass_mxfp8_grouped_mm_dispatch_out_dtype<
+        cutlass::half_t>(a, b, sfa, sfb, d, problem_sizes, expert_offsets,
+                         blockscale_offsets, stream);
+  } else {
+    TORCH_CHECK(false, "dtype must be kFloat16 or kBFloat16");
+  }
+#else
+  TORCH_CHECK(false,
+              "No implemented cutlass_mxfp8_grouped_mm for "
+              "current device");
+#endif
+}
+
+#include "core/registration.h"
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_mxfp8_grouped_mm", cutlass_mxfp8_grouped_mm);
+}
\ No newline at end of file
diff --git a/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh
new file mode 100644
index 000000000000..9fb1dbf8eef5
--- /dev/null
+++ b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+// Adapted from SGLang:
+// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_functor.cuh
+
+#pragma once
+#include <cuda.h>
+
+#include "cute/tensor.hpp"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass_mxfp8_grouped_mm_traits.cuh"
+
+namespace expert_specialization {
+
+using namespace cute;
+
+template <typename GemmTraits>
+struct CutlassMxfp8GroupedMmOffsetFunctor {
+  using Gemm = typename GemmTraits::Gemm;
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementSF = typename GemmTraits::ElementSF;
+  using ElementD = typename GemmTraits::ElementOutput;
+  // Input
+  int* expert_offsets{nullptr};
+  int* blockscale_offsets{nullptr};
+  // Output
+  ElementA* a_base{nullptr};
+  ElementB* b_base{nullptr};
+  ElementSF* sfa_base{nullptr};
+  ElementSF* sfb_base{nullptr};
+  ElementD* d_base{nullptr};
+  ElementA** a_offsets{nullptr};
+  ElementB** b_offsets{nullptr};
+  ElementSF** sfa_offsets{nullptr};
+  ElementSF** sfb_offsets{nullptr};
+  ElementD** d_offsets{nullptr};
+
+  CutlassMxfp8GroupedMmOffsetFunctor() = default;
+  CutlassMxfp8GroupedMmOffsetFunctor(
+      int* _expert_offsets, int* _blockscale_offsets, ElementA* _a_base,
+      ElementB* _b_base, ElementSF* _sfa_base, ElementSF* _sfb_base,
+      ElementD* _d_base, ElementA** _a_offsets, ElementB** _b_offsets,
+      ElementSF** _sfa_offsets, ElementSF** _sfb_offsets, ElementD** _d_offsets)
+      : expert_offsets{_expert_offsets},
+        blockscale_offsets{_blockscale_offsets},
+        a_base(_a_base),
+        b_base(_b_base),
+        sfa_base(_sfa_base),
+        sfb_base(_sfb_base),
+        d_base(_d_base),
+        a_offsets(_a_offsets),
+        b_offsets(_b_offsets),
+        sfa_offsets(_sfa_offsets),
+        sfb_offsets(_sfb_offsets),
+        d_offsets(_d_offsets) {}
+
+  void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
+    int64_t expert_offset = static_cast<int64_t>(expert_offsets[expert_id]);
+    int64_t blockscale_offset =
+        static_cast<int64_t>(blockscale_offsets[expert_id]);
+    int64_t a_stride = expert_offset * k;
+    int64_t b_stride = expert_id * k * n;
+    int64_t d_stride = expert_offset * n;
+    int64_t sfa_stride = blockscale_offset * (k / 32);
+    int64_t sfb_stride = expert_id * n * (k / 32);
+
+    a_offsets[expert_id] = a_base + a_stride;
+    b_offsets[expert_id] = b_base + b_stride;
+    sfa_offsets[expert_id] = sfa_base + sfa_stride;
+    sfb_offsets[expert_id] = sfb_base + sfb_stride;
+    d_offsets[expert_id] = d_base + d_stride;
+  }
+};
+
+template <typename GemmTraits>
+struct CutlassMxfp8GroupedMmLayoutFunctor {
+  using Sm1xxBlkScaledConfig = typename GemmTraits::Sm1xxBlkScaledConfig;
+  using LayoutSFA = typename GemmTraits::LayoutSFA;
+  using LayoutSFB = typename GemmTraits::LayoutSFB;
+  LayoutSFA* layout_sfa_base{nullptr};
+  LayoutSFB* layout_sfb_base{nullptr};
+
+  CutlassMxfp8GroupedMmLayoutFunctor() = default;
+  CutlassMxfp8GroupedMmLayoutFunctor(LayoutSFA* _layout_sfa_base,
+                                     LayoutSFB* _layout_sfb_base)
+      : layout_sfa_base(_layout_sfa_base), layout_sfb_base(_layout_sfb_base) {}
+
+  void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
+    LayoutSFA* layout_sfa_ptr = layout_sfa_base + expert_id;
+    LayoutSFB* layout_sfb_ptr = layout_sfb_base + expert_id;
+    *layout_sfa_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(
+        cute::make_shape(m, n, k, 1));
+    *layout_sfb_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(
+        cute::make_shape(m, n, k, 1));
+  }
+};
+
+template <typename GemmTraits>
+struct CutlassMxfp8GroupedMmStrideFunctor {
+  using StrideA = typename GemmTraits::StrideA;
+  using StrideB = typename GemmTraits::StrideB;
+  using StrideD = typename GemmTraits::StrideD;
+  StrideA* stride_A_base{nullptr};
+  StrideB* stride_B_base{nullptr};
+  StrideD* stride_D_base{nullptr};
+
+  CutlassMxfp8GroupedMmStrideFunctor() = default;
+  CutlassMxfp8GroupedMmStrideFunctor(StrideA* _stride_A_base,
+                                     StrideB* _stride_B_base,
+                                     StrideD* _stride_D_base)
+      : stride_A_base(_stride_A_base),
+        stride_B_base(_stride_B_base),
+        stride_D_base(_stride_D_base) {}
+
+  void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
+    StrideA* stride_A = stride_A_base + expert_id;
+    StrideB* stride_B = stride_B_base + expert_id;
+    StrideD* stride_D = stride_D_base + expert_id;
+    *stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
+    *stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
+    *stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
+  }
+};
+
+template <typename OffsetFunctor, typename LayoutFunctor,
+          typename StrideFunctor>
+__global__ void cutlassMxfp8GroupedMmPreComputeKernel(
+    int* problem_sizes, OffsetFunctor offset_functor,
+    LayoutFunctor layout_functor, StrideFunctor stride_functor) {
+  int64_t expert_id = static_cast<int64_t>(threadIdx.x);
+  int m = problem_sizes[expert_id * 3 + 0];
+  int n = problem_sizes[expert_id * 3 + 1];
+  int k = problem_sizes[expert_id * 3 + 2];
+
+  offset_functor(expert_id, m, n, k);
+  layout_functor(expert_id, m, n, k);
+  stride_functor(expert_id, m, n, k);
+}
+
+}  // namespace expert_specialization
\ No newline at end of file
diff --git a/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh
new file mode 100644
index 000000000000..2c46e1fa7252
--- /dev/null
+++ b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+// Adapted from SGLang:
+// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_launcher.cuh
+
+#pragma once
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include <cassert>
+#include <iostream>
+#include <string>
+
+#include "cute/tensor.hpp"
+#include "cutlass_mxfp8_grouped_mm_functor.cuh"
+#include "cutlass_mxfp8_grouped_mm_traits.cuh"
+
+namespace expert_specialization {
+
+template <typename GemmTraits>
+void cutlass_mxfp8_grouped_mm_pre_compute(
+    torch::Tensor& a_ptrs, torch::Tensor& b_ptrs, torch::Tensor& sfa_ptrs,
+    torch::Tensor& sfb_ptrs, torch::Tensor& d_ptrs, torch::Tensor& stride_a,
+    torch::Tensor& stride_b, torch::Tensor& stride_d, torch::Tensor& layout_sfa,
+    torch::Tensor& layout_sfb, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& sfa, const torch::Tensor& sfb, const torch::Tensor& d,
+    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
+    const torch::Tensor& blockscale_offsets, cudaStream_t stream) {
+  using OffsetFunctor = CutlassMxfp8GroupedMmOffsetFunctor<GemmTraits>;
+  using ElementA = typename OffsetFunctor::ElementA;
+  using ElementB = typename OffsetFunctor::ElementB;
+  using ElementSF = typename OffsetFunctor::ElementSF;
+  using ElementD = typename OffsetFunctor::ElementD;
+
+  using LayoutFunctor = CutlassMxfp8GroupedMmLayoutFunctor<GemmTraits>;
+  using LayoutSFA = typename LayoutFunctor::LayoutSFA;
+  using LayoutSFB = typename LayoutFunctor::LayoutSFB;
+
+  using StrideFunctor = CutlassMxfp8GroupedMmStrideFunctor<GemmTraits>;
+  using StrideA = typename StrideFunctor::StrideA;
+  using StrideB = typename StrideFunctor::StrideB;
+  using StrideD = typename StrideFunctor::StrideD;
+
+  int num_experts = (int)expert_offsets.size(0);
+  TORCH_CHECK(num_experts <= 1024,
+              "Number of experts cannot exceed 1024, the maximum number of "
+              "threads per block.");
+
+  OffsetFunctor offset_functor(
+      reinterpret_cast<int*>(expert_offsets.data_ptr()),
+      reinterpret_cast<int*>(blockscale_offsets.data_ptr()),
+      reinterpret_cast<ElementA*>(a.data_ptr()),
+      reinterpret_cast<ElementB*>(b.data_ptr()),
+      reinterpret_cast<ElementSF*>(sfa.data_ptr()),
+      reinterpret_cast<ElementSF*>(sfb.data_ptr()),
+      reinterpret_cast<ElementD*>(d.data_ptr()),
+      reinterpret_cast<ElementA**>(a_ptrs.data_ptr()),
+      reinterpret_cast<ElementB**>(b_ptrs.data_ptr()),
+      reinterpret_cast<ElementSF**>(sfa_ptrs.data_ptr()),
+      reinterpret_cast<ElementSF**>(sfb_ptrs.data_ptr()),
+      reinterpret_cast<ElementD**>(d_ptrs.data_ptr()));
+  LayoutFunctor layout_functor(
+      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
+      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()));
+  StrideFunctor stride_functor(reinterpret_cast<StrideA*>(stride_a.data_ptr()),
+                               reinterpret_cast<StrideB*>(stride_b.data_ptr()),
+                               reinterpret_cast<StrideD*>(stride_d.data_ptr()));
+  cutlassMxfp8GroupedMmPreComputeKernel<<<1, num_experts, 0, stream>>>(
+      static_cast<int*>(problem_sizes.data_ptr()), offset_functor,
+      layout_functor, stride_functor);
+}
+
+template <typename GemmTraits>
+void cutlass_mxfp8_grouped_mm(
+    const torch::Tensor& a_ptrs, const torch::Tensor& b_ptrs,
+    const torch::Tensor& sfa_ptrs, const torch::Tensor& sfb_ptrs,
+    const torch::Tensor& d_ptrs, const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b, const torch::Tensor& stride_d,
+    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes, cudaStream_t stream) {
+  using Gemm = typename GemmTraits::Gemm;
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementSF = typename GemmTraits::ElementSF;
+  using ElementD = typename GemmTraits::ElementOutput;
+  using StrideA = typename GemmTraits::StrideA;
+  using StrideB = typename GemmTraits::StrideB;
+  using StrideD = typename GemmTraits::StrideD;
+  using LayoutSFA = typename GemmTraits::LayoutSFA;
+  using LayoutSFB = typename GemmTraits::LayoutSFB;
+  using UnderlyingProblemShape =
+      typename GemmTraits::ProblemShape::UnderlyingProblemShape;
+
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = c10::cuda::current_device();
+  hw_info.sm_count =
+      at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  hw_info.cluster_shape = GemmTraits::MMAConfig::preferred_cluster;
+  hw_info.cluster_shape_fallback = GemmTraits::MMAConfig::fallback_cluster;
+
+  int num_experts = (int)problem_sizes.size(0);
+
+  UnderlyingProblemShape* underlying_problem_shape =
+      reinterpret_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+
+  typename Gemm::Arguments arguments = {
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, underlying_problem_shape, nullptr},
+      {reinterpret_cast<const ElementA**>(a_ptrs.data_ptr()),
+       reinterpret_cast<StrideA*>(stride_a.data_ptr()),
+       reinterpret_cast<const ElementB**>(b_ptrs.data_ptr()),
+       reinterpret_cast<StrideB*>(stride_b.data_ptr()),
+       reinterpret_cast<const ElementSF**>(sfa_ptrs.data_ptr()),
+       reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
+       reinterpret_cast<const ElementSF**>(sfb_ptrs.data_ptr()),
+       reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())},
+      {{},
+       nullptr,
+       nullptr,
+       reinterpret_cast<ElementD**>(d_ptrs.data_ptr()),
+       reinterpret_cast<StrideD*>(stride_d.data_ptr())},
+      hw_info,
+      {}  // Scheduler
+  };
+
+  Gemm gemm;
+
+  auto can_implement_status = gemm.can_implement(arguments);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
+              "Failed to implement GEMM");
+
+  torch::TensorOptions options_uint8 =
+      torch::TensorOptions().dtype(torch::kUInt8).device(d_ptrs.device());
+  size_t workspace_size = gemm.get_workspace_size(arguments);
+  torch::Tensor workspace = torch::empty(workspace_size, options_uint8);
+
+  auto status = gemm.initialize(arguments, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
+
+  status = gemm.run(stream, nullptr, true);  // Enable PDL
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+template <typename OutType>
+void cutlass_mxfp8_grouped_mm_dispatch_out_dtype(
+    const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& sfa,
+    const torch::Tensor& sfb, torch::Tensor& d,
+    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
+    const torch::Tensor& blockscale_offsets, cudaStream_t stream) {
+  int num_experts = (int)problem_sizes.size(0);
+  torch::TensorOptions options_int64 =
+      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+  torch::TensorOptions options_int32 =
+      torch::TensorOptions().dtype(torch::kInt32).device(a.device());
+
+  torch::Tensor a_ptrs = torch::empty(num_experts, options_int64);
+  torch::Tensor b_ptrs = torch::empty(num_experts, options_int64);
+  torch::Tensor sfa_ptrs = torch::empty(num_experts, options_int64);
+  torch::Tensor sfb_ptrs = torch::empty(num_experts, options_int64);
+  torch::Tensor d_ptrs = torch::empty(num_experts, options_int64);
+
+  torch::Tensor stride_a = torch::empty(num_experts, options_int64);
+  torch::Tensor stride_b = torch::empty(num_experts, options_int64);
+  torch::Tensor stride_d = torch::empty(num_experts, options_int64);
+  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int32);
+  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int32);
+
+  using GemmTraits = CutlassMxfp8GroupedMmGemmTraits<MMA1SMConfig, OutType>;
+  cutlass_mxfp8_grouped_mm_pre_compute<GemmTraits>(
+      a_ptrs, b_ptrs, sfa_ptrs, sfb_ptrs, d_ptrs, stride_a, stride_b, stride_d,
+      layout_sfa, layout_sfb, a, b, sfa, sfb, d, problem_sizes, expert_offsets,
+      blockscale_offsets, stream);
+  cutlass_mxfp8_grouped_mm<GemmTraits>(
+      a_ptrs, b_ptrs, sfa_ptrs, sfb_ptrs, d_ptrs, stride_a, stride_b, stride_d,
+      layout_sfa, layout_sfb, problem_sizes, stream);
+}
+
+}  // namespace expert_specialization
\ No newline at end of file
diff --git a/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh
new file mode 100644
index 000000000000..ed8cd7ce0658
--- /dev/null
+++ b/csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+// Adapted from SGLang:
+// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_traits.cuh
+
+#pragma once
+
+// Misc
+#include "cute/tensor.hpp"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/detail/sm100_blockscaled_layout.hpp"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/layout/layout.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_size.h"
+
+// Collective Builder
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+// Integration
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+namespace expert_specialization {
+
+using namespace cute;
+
+// Different configs for 1SM and 2SM MMA kernel
+struct MMA1SMConfig {
+  using MmaTileShape = Shape<_128, _128, _128>;
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
+  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
+  const static dim3 preferred_cluster;
+  const static dim3 fallback_cluster;
+};
+const dim3 MMA1SMConfig::preferred_cluster(1, 4, 1);
+const dim3 MMA1SMConfig::fallback_cluster(1, 2, 1);
+
+template <typename _MMAConfig, typename OutputDtype>
+struct CutlassMxfp8GroupedMmGemmTraits {
+  using MMAConfig = _MMAConfig;
+  using ElementInput = cutlass::float_e4m3_t;
+  using ElementOutput = OutputDtype;
+  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
+
+  // A matrix configuration
+  using ElementA = cutlass::mx_float8_t<ElementInput>;
+  using LayoutA = cutlass::layout::RowMajor;
+  constexpr static int AlignmentA = 32;
+
+  // B matrix configuration
+  using ElementB = cutlass::mx_float8_t<ElementInput>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  constexpr static int AlignmentB = 32;
+
+  // C/D matrix configuration
+  using ElementC = void;
+  using ElementD = ElementOutput;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = cutlass::layout::RowMajor;
+  constexpr static int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
+  constexpr static int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  using ElementAccumulator = float;
+
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using CustomEVTIdentity =  // acc
+      cutlass::epilogue::fusion::Sm90EVT<
+          cutlass::epilogue::fusion::Sm90Compute<
+              cutlass::epilogue::thread::Identity, ElementD, ElementAccumulator,
+              RoundStyle>,
+          cutlass::epilogue::fusion::Sm90AccFetch>;
+
+  // Core kernel configurations
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+  using StageCountType = cutlass::gemm::collective::StageCountAuto;
+
+  // Runtime Cluster Shape
+  using ClusterShape = Shape<int32_t, int32_t, _1>;
+
+  // Define Epilogue
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, typename MMAConfig::MmaTileShape,
+          ClusterShape, Shape<_64, _64>, ElementAccumulator, ElementAccumulator,
+          ElementC, LayoutC*, AlignmentC, ElementD, LayoutD*, AlignmentD,
+          typename MMAConfig::EpilogueSchedule,
+          CustomEVTIdentity>::CollectiveOp;
+
+  // Define Mainloop
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
+          LayoutB*, AlignmentB, ElementAccumulator,
+          typename MMAConfig::MmaTileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          typename MMAConfig::KernelSchedule>::CollectiveOp;
+
+  // Define GemmKernel
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
+                                           CollectiveEpilogue>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  using ElementSF = typename Gemm::GemmKernel::ElementSF;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+  using LayoutSFA =
+      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
+  using LayoutSFB =
+      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
+  using Sm1xxBlkScaledConfig =
+      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+};
+
+}  // namespace expert_specialization
\ No newline at end of file
diff --git a/csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu b/csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu
new file mode 100644
index 000000000000..2a93ab94d5ca
--- /dev/null
+++ b/csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+// Adapted from SGLang:
+// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_group_quant.cu
+
+#include <torch/all.h>
+
+#include "mxfp8_experts_quant.cuh"
+
+void mxfp8_experts_quant(const torch::Tensor& input,
+                         const torch::Tensor& problem_sizes,
+                         const torch::Tensor& expert_offsets,
+                         const torch::Tensor& blockscale_offsets,
+                         torch::Tensor& quant_output,
+                         torch::Tensor& scale_factor) {
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+  TORCH_CHECK(input.dim() == 2, "input must be 2D tensor");
+  TORCH_CHECK(input.size(1) % 128 == 0, "k must align to 128");
+  TORCH_CHECK(input.strides()[1] == 1, "input must be row major");
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
+              "problem_sizes must be int32");
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt32,
+              "expert_offsets must be int32");
+  TORCH_CHECK(blockscale_offsets.dtype() == torch::kInt32,
+              "blockscale_offsets must be int32");
+
+  auto groups = problem_sizes.size(0);
+  TORCH_CHECK(
+      expert_offsets.dim() == 1 && expert_offsets.size(0) == groups,
+      "expert_offsets must be 1D and have size equal to the number of groups");
+  TORCH_CHECK(
+      blockscale_offsets.dim() == 1 && blockscale_offsets.size(0) == groups,
+      "blockscale_offsets must be 1D and have size equal to the number of "
+      "groups");
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  if (input.dtype() == torch::kBFloat16) {
+    expert_specialization::launch_mxfp8_experts_quant<__nv_bfloat16>(
+        input, problem_sizes, expert_offsets, blockscale_offsets, quant_output,
+        scale_factor);
+  } else if (input.dtype() == torch::kFloat16) {
+    expert_specialization::launch_mxfp8_experts_quant<__half>(
+        input, problem_sizes, expert_offsets, blockscale_offsets, quant_output,
+        scale_factor);
+  } else {
+    TORCH_CHECK(false, "dtype must be kFloat16 or kBFloat16");
+  }
+#else
+  TORCH_CHECK(false,
+              "No implemented mxfp8_experts_quant for "
+              "current device");
+#endif
+}
+
+#include "core/registration.h"
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("mxfp8_experts_quant", mxfp8_experts_quant);
+}
\ No newline at end of file
diff --git a/csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh b/csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh
new file mode 100644
index 000000000000..9a85852080fb
--- /dev/null
+++ b/csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+// Adapted from SGLang:
+// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_group_quant.cuh
+
+#pragma once
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <torch/all.h>
+
+#include <cuda/ptx>
+
+#include "cute/tensor.hpp"
+
+namespace expert_specialization {
+
+using namespace cute;
+
+constexpr uint32_t THREAD_BLOCK_SIZE = 128;
+constexpr uint32_t WARP_SIZE = 32;
+constexpr int BLOCK_M = 128;
+constexpr int BLOCK_K = 128;
+using ThrLayout = Layout<Shape<_16, _8>, Stride<_8, _1>>;
+using ValLayout = Layout<Shape<_1, _16>>;
+using SfR2SThrLayout = Layout<Shape<_16, _4>, Stride<_4, _1>>;
+using SfR2SValLayout = Layout<Shape<_1, _1>>;
+using ScaleFactorTileLayout =
+    Layout<Shape<Shape<_32, _4>, _4>, Stride<Stride<_16, _4>, _1>>;
+
+// Fast reciprocal.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float b;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  return b;
+}
+
+// Some code references TRT-LLM:
+// https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/quantization.cuh
+template <typename FragmentS, typename FragmentD>
+__inline__ __device__ uint8_t cvt_warp_fp16_to_mxfp8(FragmentS& fragment_s,
+                                                     FragmentD& fragment_d) {
+  using FragmentSLayout = typename FragmentS::layout_type;
+  using FragmentDLayout = typename FragmentD::layout_type;
+  FragmentSLayout fragment_s_layout;
+  FragmentDLayout fragment_d_layout;
+  static_assert(is_static<FragmentSLayout>::value &&
+                size(fragment_s_layout) == 16);
+  static_assert(is_static<FragmentDLayout>::value &&
+                size(fragment_d_layout) == 16);
+
+  constexpr int eles_per_thr = 16;
+  using ValType = typename FragmentS::element_type;
+  using VecType = std::conditional_t<std::is_same_v<ValType, __nv_bfloat16>,
+                                     __nv_bfloat162, __half2>;
+  VecType vec[8];
+  // Assign vals
+  vec[0].x = fragment_s(Int<0>{});
+  vec[0].y = fragment_s(Int<1>{});
+  vec[1].x = fragment_s(Int<2>{});
+  vec[1].y = fragment_s(Int<3>{});
+  vec[2].x = fragment_s(Int<4>{});
+  vec[2].y = fragment_s(Int<5>{});
+  vec[3].x = fragment_s(Int<6>{});
+  vec[3].y = fragment_s(Int<7>{});
+  vec[4].x = fragment_s(Int<8>{});
+  vec[4].y = fragment_s(Int<9>{});
+  vec[5].x = fragment_s(Int<10>{});
+  vec[5].y = fragment_s(Int<11>{});
+  vec[6].x = fragment_s(Int<12>{});
+  vec[6].y = fragment_s(Int<13>{});
+  vec[7].x = fragment_s(Int<14>{});
+  vec[7].y = fragment_s(Int<15>{});
+
+  auto local_max = __habs2(vec[0]);
+  for (int i = 1; i < eles_per_thr / 2; i++) {
+    local_max = __hmax2(__habs2(vec[i]), local_max);
+  }
+  local_max = __hmax2(__shfl_xor_sync(uint32_t(-1), local_max, 1), local_max);
+
+  // Get the final absolute maximum values.
+  float block_max(0.0f);
+  if constexpr (std::is_same_v<ValType, __nv_bfloat16>) {
+    block_max = __bfloat162float(__hmax(local_max.x, local_max.y));
+  } else {
+    block_max = __half2float(__hmax(local_max.x, local_max.y));
+  }
+  // Get the SF (max value of the vector / max value of mxfp8).
+  float sf_val = block_max * reciprocal_approximate_ftz(448.0f);
+  // 8 bits representation of the SF.
+  uint8_t fp8_sf_val;
+
+  __nv_fp8_e8m0 tmp_sf_val;
+  tmp_sf_val.__x =
+      __nv_cvt_float_to_e8m0(sf_val, __NV_SATFINITE, cudaRoundPosInf);
+  sf_val = static_cast<float>(tmp_sf_val);
+  fp8_sf_val = tmp_sf_val.__x;
+  // Get the output scale (reciprocal of the SFValue).
+  float output_scale =
+      block_max != 0.f ? reciprocal_approximate_ftz(sf_val) : 0.0f;
+
+  // Convert the input to float.
+  float2 fp2_vals[eles_per_thr / 2];
+
+#pragma unroll
+  for (int i = 0; i < eles_per_thr / 2; i++) {
+    if constexpr (std::is_same_v<ValType, __half>) {
+      fp2_vals[i] = __half22float2(vec[i]);
+    } else {
+      fp2_vals[i] = __bfloat1622float2(vec[i]);
+    }
+    fp2_vals[i].x *= output_scale;
+    fp2_vals[i].y *= output_scale;
+  }
+  union {
+    uint8_t bytes[16];
+    __nv_fp8x2_e4m3 elts[8];
+  } u;
+  u.elts[0] = __nv_fp8x2_e4m3(fp2_vals[0]);
+  u.elts[1] = __nv_fp8x2_e4m3(fp2_vals[1]);
+  u.elts[2] = __nv_fp8x2_e4m3(fp2_vals[2]);
+  u.elts[3] = __nv_fp8x2_e4m3(fp2_vals[3]);
+  u.elts[4] = __nv_fp8x2_e4m3(fp2_vals[4]);
+  u.elts[5] = __nv_fp8x2_e4m3(fp2_vals[5]);
+  u.elts[6] = __nv_fp8x2_e4m3(fp2_vals[6]);
+  u.elts[7] = __nv_fp8x2_e4m3(fp2_vals[7]);
+  fragment_d(Int<0>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[0]);
+  fragment_d(Int<1>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[1]);
+  fragment_d(Int<2>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[2]);
+  fragment_d(Int<3>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[3]);
+  fragment_d(Int<4>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[4]);
+  fragment_d(Int<5>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[5]);
+  fragment_d(Int<6>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[6]);
+  fragment_d(Int<7>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[7]);
+  fragment_d(Int<8>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[8]);
+  fragment_d(Int<9>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[9]);
+  fragment_d(Int<10>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[10]);
+  fragment_d(Int<11>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[11]);
+  fragment_d(Int<12>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[12]);
+  fragment_d(Int<13>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[13]);
+  fragment_d(Int<14>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[14]);
+  fragment_d(Int<15>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[15]);
+  return fp8_sf_val;
+}
+
+template <typename TensorS, typename TensorP, typename TensorD,
+          typename TensorSharedSF, typename TensorSF, typename TiledCopyG2R,
+          typename TiledCopyR2G, typename TiledCopyR2S>
+__inline__ __device__ void mxfp8_experts_quant_tile(
+    TensorS& tensor_s, TensorP& tensor_p, TensorD& tensor_d,
+    TensorSharedSF& tensor_shared_sf, TensorSF& tensor_sf, int m,
+    TiledCopyG2R& tiled_copy_g2r, TiledCopyR2G& tiled_copy_r2g,
+    TiledCopyR2S& tiled_copy_r2s) {
+  static_assert(size(get<0>(typename TensorS::layout_type{})) == 128 &&
+                size(get<1>(typename TensorS::layout_type{})) == 128 &&
+                stride(get<1>(typename TensorS::layout_type{})) == 1);
+  static_assert(size(get<0>(typename TensorD::layout_type{})) == 128 &&
+                size(get<1>(typename TensorD::layout_type{})) == 128 &&
+                stride(get<1>(typename TensorD::layout_type{})) == 1);
+  static_assert(size(get<0>(typename TensorP::layout_type{})) == 128 &&
+                size(get<1>(typename TensorP::layout_type{})) == 128);
+  static_assert(size(get<0>(typename TensorSharedSF::layout_type{})) == 128 &&
+                size(get<1>(typename TensorSharedSF::layout_type{})) == 4);
+  static_assert(size(get<0>(typename TensorSF::layout_type{})) == 128 &&
+                size(get<1>(typename TensorSF::layout_type{})) == 4);
+
+  using Tiler_MN = typename TiledCopyG2R::Tiler_MN;
+  auto tiler_mn = Tiler_MN{};
+  static_assert(size<0>(tiler_mn) == 16 && size<1>(tiler_mn) == 128);
+
+  auto tiled_tensor_s = tiled_divide(tensor_s, tiler_mn);
+  auto tiled_tensor_p = tiled_divide(tensor_p, tiler_mn);
+  auto tiled_tensor_d = tiled_divide(tensor_d, tiler_mn);
+  static_assert(size<2>(tiled_tensor_s) == 1);
+  static_assert(size<2>(tiled_tensor_p) == 1);
+  static_assert(size<2>(tiled_tensor_d) == 1);
+  auto squeeze_tiled_tensor_s = take<0, 2>(tiled_tensor_s);
+  auto squeeze_tiled_tensor_p = take<0, 2>(tiled_tensor_p);
+  auto squeeze_tiled_tensor_d = take<0, 2>(tiled_tensor_d);
+
+  using SF_Tiler_MN = typename TiledCopyR2S::Tiler_MN;
+  auto sf_tiler_mn = SF_Tiler_MN{};
+  static_assert(size<0>(sf_tiler_mn) == 16 && size<1>(sf_tiler_mn) == 4);
+
+  auto tiled_tensor_sf = tiled_divide(tensor_sf, sf_tiler_mn);
+  auto tiled_tensor_shared_sf = tiled_divide(tensor_shared_sf, sf_tiler_mn);
+  auto squeeze_tiled_tensor_sf = take<0, 2>(tiled_tensor_sf);
+  auto squeeze_tiled_tensor_shared_sf = take<0, 2>(tiled_tensor_shared_sf);
+
+  constexpr int tile_loop_count = size<1>(tiled_tensor_s);
+  constexpr int rows_in_tile = 16;
+  // We don't need to clear shared memory
+  // clear(squeeze_tiled_tensor_shared_sf);
+#pragma unroll 4
+  for (int t = 0; t < tile_loop_count; t++) {
+    if (t * rows_in_tile >= m) {
+      break;
+    }
+    auto current_copy_tile_s = tensor<0>(squeeze_tiled_tensor_s(_, t));
+    auto current_copy_tile_p = tensor<0>(squeeze_tiled_tensor_p(_, t));
+    auto current_copy_tile_d = tensor<0>(squeeze_tiled_tensor_d(_, t));
+    auto current_copy_tile_sf = tensor<0>(squeeze_tiled_tensor_sf(_, t));
+    auto current_copy_tile_shared_sf =
+        tensor<0>(squeeze_tiled_tensor_shared_sf(_, t));
+
+    // Global to Register copy
+    auto thr_copy_g2r = tiled_copy_g2r.get_thread_slice(threadIdx.x);
+    auto thr_tile_g2r_s = thr_copy_g2r.partition_S(current_copy_tile_s);
+    auto thr_tile_g2r_p = thr_copy_g2r.partition_S(current_copy_tile_p);
+    auto input_fragment = make_fragment_like(thr_tile_g2r_s);
+
+    // Register to Global copy
+    auto thr_copy_r2g = tiled_copy_r2g.get_thread_slice(threadIdx.x);
+    auto thr_tile_r2g_d = thr_copy_r2g.partition_D(current_copy_tile_d);
+    auto thr_tile_r2g_p = thr_copy_r2g.partition_D(current_copy_tile_p);
+    auto output_fragment = make_fragment_like(thr_tile_r2g_d);
+
+    // Register to Shared copy
+    auto thr_copy_r2s = tiled_copy_r2s.get_thread_slice(threadIdx.x / 2);
+    auto thr_tile_r2s_shared_sf =
+        thr_copy_r2s.partition_D(current_copy_tile_shared_sf);
+    auto shared_sf_fragment = make_fragment_like(thr_tile_r2s_shared_sf);
+
+    // CopyG2R & convert & CopyR2G
+    copy_if(tiled_copy_g2r, thr_tile_g2r_p, thr_tile_g2r_s, input_fragment);
+    uint8_t fp8_sf_val =
+        cvt_warp_fp16_to_mxfp8(input_fragment, output_fragment);
+    copy_if(tiled_copy_r2g, thr_tile_r2g_p, output_fragment, thr_tile_r2g_d);
+    shared_sf_fragment[0] = fp8_sf_val;
+
+    // Before first copy r2s, clear shared memory and wait previous group
+    if (t == 0 && threadIdx.x == 0) {
+      // Wait for the group to have completed reading from shared memory.
+      cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>());
+    }
+    __syncthreads();
+
+    if (threadIdx.x % 2 == 0) {
+      copy(tiled_copy_r2s, shared_sf_fragment, thr_tile_r2s_shared_sf);
+    }
+    __syncthreads();
+  }
+
+  // Wait for shared memory writes to be visible to TMA engine.
+  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);  // b)
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    cuda::ptx::cp_async_bulk(cuda::ptx::space_global, cuda::ptx::space_shared,
+                             squeeze_tiled_tensor_sf.data().get(),
+                             squeeze_tiled_tensor_shared_sf.data().get(), 512);
+    // Wait for TMA transfer to have finished reading shared memory.
+    // Create a "bulk async-group" out of the previous bulk copy operation.
+    cuda::ptx::cp_async_bulk_commit_group();
+  }
+  __syncthreads();
+}
+
+template <typename T_IN, typename TiledCopyG2R, typename TiledCopyR2G,
+          typename TiledCopyR2S>
+__global__ void mxfp8_experts_quant_kernel(
+    const T_IN* input, const int* problem_sizes, const int* expert_offsets,
+    const int* blockscale_offsets, cutlass::float_e4m3_t* quant_output,
+    uint8_t* scale_factor, int groups, TiledCopyG2R tiled_copy_g2r,
+    TiledCopyR2G tiled_copy_r2g, TiledCopyR2S tiled_copy_r2s) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
+  __shared__ __align__(512) uint8_t shared_memory[512];
+  ScaleFactorTileLayout scale_factor_tile_layout{};
+  auto scale_factor_shared =
+      make_tensor(make_smem_ptr(shared_memory),
+                  scale_factor_tile_layout);  // ((_32,_4), _4):((_16,_4), _1)
+  // TODO: Transform Groupwise Schedule into a more efficient Schedule
+  for (int g = 0; g < groups; g++) {
+    int m = problem_sizes[g * 3 + 0];
+    int k = problem_sizes[g * 3 + 2];
+    int64_t expert_offset = static_cast<int64_t>(expert_offsets[g]);
+    int64_t blockscale_offset = static_cast<int64_t>(blockscale_offsets[g]);
+
+    auto input_tensor = make_tensor(
+        make_gmem_ptr(input + expert_offset * k),
+        make_layout(make_shape(m, k),
+                    LayoutRight{}));  // (M, K):(K, 1) half_t/bfloat16_t
+
+    auto quant_output_tensor = make_tensor(
+        make_gmem_ptr(quant_output + expert_offset * k),
+        make_layout(make_shape(m, k),
+                    LayoutRight{}));  // (M, K):(K, 1) cutlass::float_e4m3_t
+
+    auto scale_factor_shape = make_shape(ceil_div(m, 128) * 128, k / 32);
+    auto scale_factor_layout = tile_to_shape(scale_factor_tile_layout,
+                                             scale_factor_shape, LayoutRight{});
+    // layout<0>(layout<0>(scale_factor_layout))  (_32,_4):(_16,_4) -- static
+    // layout<1>(layout<0>(scale_factor_layout))  M_align_128 / 128 -- dynamic
+    // shape dynamic stride layout<0>(layout<1>(scale_factor_layout))  _4:_1 --
+    // static layout<1>(layout<1>(scale_factor_layout))  (K / 32) / 4 : _512 --
+    // dynamic shape static stride
+
+    // Reshape to zipped layout for 1D indexing
+    auto zipped_scale_factor_layout = make_layout(
+        make_layout(layout<0>(layout<0>(scale_factor_layout)),
+                    layout<0>(layout<1>(scale_factor_layout))),
+        make_layout(
+            layout<1>(layout<0>(scale_factor_layout)),
+            layout<1>(layout<1>(
+                scale_factor_layout))));  // (((_32,_4),_4),(M_align_128 /
+                                          // 128,(K / 32) /
+                                          // 4)):(((_16,_4),_1),(?,_512))
+
+    auto scale_factor_tensor =
+        make_tensor(make_gmem_ptr(scale_factor + blockscale_offset * (k / 32)),
+                    zipped_scale_factor_layout);
+
+    // Used for cases where M is not divisible by 128 (most scenarios).
+    auto input_shape = shape(input_tensor);  // (M, K):(K, 1)
+    auto identity_tensor = make_identity_tensor(input_shape);
+    auto predict_tensor = cute::lazy::transform(
+        identity_tensor, [&](auto c) { return elem_less(c, input_shape); });
+
+    // (_128, _128)
+    auto tiler = make_shape(Int<BLOCK_M>{}, Int<BLOCK_K>{});
+
+    auto tiled_input_tensor = zipped_divide(
+        input_tensor, tiler);  // ((128, 128), (cdiv(M, 128), cdiv(K, 128)))
+    auto tiled_quant_output_tensor =
+        zipped_divide(quant_output_tensor,
+                      tiler);  // ((128, 128), (cdiv(M, 128), cdiv(K, 128)))
+    auto tiled_predict_tensor = zipped_divide(
+        predict_tensor, tiler);  // ((128, 128), (cdiv(M, 128), cdiv(K, 128)))
+
+    auto total_tiles =
+        size<1>(tiled_input_tensor);  // cdiv(M, 128) * cdiv(K, 128)
+    decltype(total_tiles) blk_offset = blockIdx.x;
+    while (blk_offset < total_tiles) {
+      auto current_input_tile = tensor<0>(tiled_input_tensor(_, blk_offset));
+      auto current_quant_output_tile =
+          tensor<0>(tiled_quant_output_tensor(_, blk_offset));
+      auto current_predict_tile =
+          tensor<0>(tiled_predict_tensor(_, blk_offset));
+      auto current_scale_factor_tile =
+          tensor<0>(scale_factor_tensor(_, blk_offset));
+
+      mxfp8_experts_quant_tile<
+          decltype(current_input_tile), decltype(current_predict_tile),
+          decltype(current_quant_output_tile), decltype(scale_factor_shared),
+          decltype(current_scale_factor_tile), TiledCopyG2R, TiledCopyR2G,
+          TiledCopyR2S>(current_input_tile, current_predict_tile,
+                        current_quant_output_tile, scale_factor_shared,
+                        current_scale_factor_tile, m, tiled_copy_g2r,
+                        tiled_copy_r2g, tiled_copy_r2s);
+      blk_offset += gridDim.x;
+    }
+  }
+#endif
+}
+
+template <typename T_IN>
+void launch_mxfp8_experts_quant(const torch::Tensor& input,
+                                const torch::Tensor& problem_sizes,
+                                const torch::Tensor& expert_offsets,
+                                const torch::Tensor& blockscale_offsets,
+                                torch::Tensor& quant_output,
+                                torch::Tensor& scale_factor) {
+  ThrLayout thr_layout{};
+  ValLayout val_layout{};
+  SfR2SThrLayout r2s_thr_layout{};
+  SfR2SValLayout r2s_val_layout{};
+
+  using CopyOpG2R =
+      UniversalCopy<cutlass::AlignedArray<T_IN, size(val_layout)>>;
+  using CopyAtomG2R = cute::Copy_Atom<CopyOpG2R, T_IN>;
+  auto tiled_copy_g2r = cute::make_tiled_copy(
+      CopyAtomG2R{}, thr_layout, val_layout);  // Tiler_MN: (16, 128)
+
+  using CopyOpR2G = UniversalCopy<
+      cutlass::AlignedArray<cutlass::float_e4m3_t, size(val_layout)>>;
+  using CopyAtomR2G = cute::Copy_Atom<CopyOpR2G, cutlass::float_e4m3_t>;
+  auto tiled_copy_r2g = cute::make_tiled_copy(
+      CopyAtomR2G{}, thr_layout, val_layout);  // Tiler_MN: (16, 128)
+
+  using CopyOpR2S =
+      UniversalCopy<cutlass::AlignedArray<uint8_t, size(r2s_val_layout)>>;
+  using CopyAtomR2S = cute::Copy_Atom<CopyOpR2S, uint8_t>;
+  auto tiled_copy_r2s = cute::make_tiled_copy(
+      CopyAtomR2S{}, r2s_thr_layout, r2s_val_layout);  // Tiler_MN: (16, 4)
+
+  int max_active_blocks_per_sm = -1;
+  AT_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_active_blocks_per_sm,
+      mxfp8_experts_quant_kernel<T_IN, decltype(tiled_copy_g2r),
+                                 decltype(tiled_copy_r2g),
+                                 decltype(tiled_copy_r2s)>,
+      THREAD_BLOCK_SIZE, 0));
+
+  dim3 grid(at::cuda::getCurrentDeviceProperties()->multiProcessorCount *
+                max_active_blocks_per_sm,
+            1, 1);
+  dim3 block(THREAD_BLOCK_SIZE, 1, 1);
+  int num_experts = (int)problem_sizes.size(0);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  mxfp8_experts_quant_kernel<T_IN, decltype(tiled_copy_g2r),
+                             decltype(tiled_copy_r2g), decltype(tiled_copy_r2s)>
+      <<<grid, block, 0, stream>>>(
+          reinterpret_cast<const T_IN*>(input.data_ptr()),
+          reinterpret_cast<const int*>(problem_sizes.data_ptr()),
+          reinterpret_cast<const int*>(expert_offsets.data_ptr()),
+          reinterpret_cast<const int*>(blockscale_offsets.data_ptr()),
+          reinterpret_cast<cutlass::float_e4m3_t*>(quant_output.data_ptr()),
+          reinterpret_cast<uint8_t*>(scale_factor.data_ptr()), num_experts,
+          tiled_copy_g2r, tiled_copy_r2g, tiled_copy_r2s);
+}
+
+}  // namespace expert_specialization
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 9ba18289eaa9..f7ea8c788dd0 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -426,6 +426,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()");
   // conditionally compiled so impl registration is in source file
 
+  // Expert-specialization mxfp8 blockscaled grouped quantization (SM100+).
+  ops.def(
+      "mxfp8_experts_quant("
+      " Tensor input, Tensor problem_sizes, Tensor expert_offsets,"
+      " Tensor blockscale_offsets, Tensor! quant_output, Tensor! scale_factor)"
+      " -> ()");
+  // conditionally compiled so impl registration is in source file
+
+  // Expert-specialization mxfp8 blockscaled grouped GEMM (SM100+).
+  ops.def(
+      "cutlass_mxfp8_grouped_mm("
+      " Tensor a, Tensor b, Tensor sfa, Tensor sfb, Tensor! out,"
+      " Tensor problem_sizes, Tensor expert_offsets, Tensor blockscale_offsets)"
+      " -> ()");
+  // conditionally compiled so impl registration is in source file
+
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization, as well as bias
   ops.def(
diff --git a/tests/kernels/moe/test_cutlass_mxfp8_grouped_mm.py b/tests/kernels/moe/test_cutlass_mxfp8_grouped_mm.py
new file mode 100644
index 000000000000..3a154fbb84cd
--- /dev/null
+++ b/tests/kernels/moe/test_cutlass_mxfp8_grouped_mm.py
@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from SGLang:
+# https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/tests/test_es_fp8_blockwise_moe.py
+
+"""Tests for SM100 CUTLASS MXFP8 grouped MoE kernels."""
+
+import random
+
+import pytest
+import torch
+
+from tests.kernels.utils import torch_moe_single
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
+
+random.seed(42)
+set_random_seed(42)
+
+
+def align(val: int, alignment: int = 128) -> int:
+    return int((val + alignment - 1) // alignment * alignment)
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+def is_sm100_supported() -> bool:
+    return current_platform.is_cuda() and current_platform.is_device_capability_family(
+        100
+    )
+
+
+def compute_ref_output(
+    input_tensor: torch.Tensor,
+    weight_list: list[torch.Tensor],
+    expert_offsets: list[int],
+    expert_offset: int,
+    num_experts: int,
+) -> torch.Tensor:
+    # Build a top-1 routing score so each token maps to its owning expert.
+    score = torch.full(
+        (expert_offset, num_experts),
+        -1e9,
+        device=input_tensor.device,
+        dtype=torch.float32,
+    )
+    for g in range(num_experts):
+        start = expert_offsets[g]
+        end = expert_offsets[g + 1] if g + 1 < num_experts else expert_offset
+        score[start:end, g] = 0.0
+
+    return torch_moe_single(
+        input_tensor, torch.stack(weight_list, dim=0), score, topk=1
+    )
+
+
+def compute_kernel_output(
+    input_tensor: torch.Tensor,
+    weight_tensor: torch.Tensor,
+    problem_sizes: list[list[int]],
+    aux_problem_sizes: list[list[int]],
+    expert_offsets: list[int],
+    aux_expert_offsets: list[int],
+    input_blockscale_offsets: list[int],
+    weight_blockscale_offsets: list[int],
+    input_blockscale_offset: int,
+    n_g: int,
+    k_g: int,
+    num_experts: int,
+    expert_offset: int,
+    out_dtype: torch.dtype,
+) -> torch.Tensor:
+    device = input_tensor.device
+    _problem_sizes = torch.tensor(problem_sizes).to(device=device, dtype=torch.int32)
+    _aux_problem_sizes = torch.tensor(aux_problem_sizes).to(
+        device=device, dtype=torch.int32
+    )
+    _expert_offsets = torch.tensor(expert_offsets).to(device=device, dtype=torch.int32)
+    _aux_expert_offsets = torch.tensor(aux_expert_offsets).to(
+        device=device, dtype=torch.int32
+    )
+    _input_blockscale_offsets = torch.tensor(input_blockscale_offsets).to(
+        device=device, dtype=torch.int32
+    )
+    _weight_blockscale_offsets = torch.tensor(weight_blockscale_offsets).to(
+        device=device, dtype=torch.int32
+    )
+
+    input_quant = torch.zeros_like(
+        input_tensor, dtype=torch.float8_e4m3fn, device=device
+    )
+    input_scale_factor = torch.zeros(
+        (input_blockscale_offset, k_g // 32), dtype=torch.uint8, device=device
+    )
+
+    weight_quant = torch.zeros_like(
+        weight_tensor, dtype=torch.float8_e4m3fn, device=device
+    )
+    weight_scale_factor = torch.zeros(
+        (num_experts, n_g, k_g // 32), dtype=torch.uint8, device=device
+    )
+
+    ops.mxfp8_experts_quant(
+        input_tensor,
+        _problem_sizes,
+        _expert_offsets,
+        _input_blockscale_offsets,
+        input_quant,
+        input_scale_factor,
+    )
+
+    ops.mxfp8_experts_quant(
+        weight_tensor,
+        _aux_problem_sizes,
+        _aux_expert_offsets,
+        _weight_blockscale_offsets,
+        weight_quant,
+        weight_scale_factor,
+    )
+    weight_quant = weight_quant.view(num_experts, n_g, k_g).transpose(1, 2)
+    weight_scale_factor = weight_scale_factor.view(
+        num_experts, n_g, k_g // 32
+    ).transpose(1, 2)
+
+    output = torch.empty((expert_offset, n_g), device=device, dtype=out_dtype)
+    ops.cutlass_mxfp8_grouped_mm(
+        input_quant,
+        weight_quant,
+        input_scale_factor,
+        weight_scale_factor,
+        output,
+        _problem_sizes,
+        _expert_offsets,
+        _input_blockscale_offsets,
+    )
+    return output
+
+
+@pytest.mark.skipif(
+    not is_sm100_supported(),
+    reason=(
+        "cutlass_mxfp8_grouped_mm and mxfp8_experts_quant "
+        "are only supported on CUDA SM100"
+    ),
+)
+@pytest.mark.parametrize("num_experts", [8, 16, 32, 64])
+@pytest.mark.parametrize("out_dtype", [torch.half, torch.bfloat16])
+def test_cutlass_mxfp8_grouped_mm(num_experts, out_dtype):
+    device = "cuda"
+    alignment = 128
+    n_g = random.randint(1, 64) * alignment
+    k_g = random.randint(1, 64) * alignment
+
+    expert_offset = 0
+    expert_offsets = []
+    aux_expert_offset = 0
+    aux_expert_offsets = []
+    input_blockscale_offset = 0
+    input_blockscale_offsets = []
+    weight_blockscale_offset = 0
+    weight_blockscale_offsets = []
+    problem_sizes = []
+    aux_problem_sizes = []
+    input_list = []
+    weight_list = []
+
+    for g in range(num_experts):
+        m_g = random.randint(1, 512)
+        expert_offsets.append(expert_offset)
+        expert_offset += m_g
+        aux_expert_offsets.append(aux_expert_offset)
+        aux_expert_offset += n_g
+        input_blockscale_offsets.append(input_blockscale_offset)
+        input_blockscale_offset += align(m_g, 128)
+        weight_blockscale_offsets.append(weight_blockscale_offset)
+        weight_blockscale_offset += n_g  # n_g already align to 128
+        problem_sizes.append([m_g, n_g, k_g])
+        aux_problem_sizes.append([n_g, m_g, k_g])
+
+        input_tensor = torch.normal(
+            0.0, std=1.0, size=(m_g, k_g), device=device, dtype=out_dtype
+        )  # (M, K):(K, 1)
+        weight_tensor = torch.normal(
+            0.0, std=1.0, size=(n_g, k_g), device=device, dtype=out_dtype
+        )  # (N, K):(K, 1)
+
+        input_list.append(input_tensor)
+        weight_list.append(weight_tensor)
+    input_tensor = torch.concat(input_list, dim=0)
+    weight_tensor = torch.concat(weight_list, dim=0)
+
+    ref_output = compute_ref_output(
+        input_tensor=input_tensor,
+        weight_list=weight_list,
+        expert_offsets=expert_offsets,
+        expert_offset=expert_offset,
+        num_experts=num_experts,
+    )
+    output = compute_kernel_output(
+        input_tensor=input_tensor,
+        weight_tensor=weight_tensor,
+        problem_sizes=problem_sizes,
+        aux_problem_sizes=aux_problem_sizes,
+        expert_offsets=expert_offsets,
+        aux_expert_offsets=aux_expert_offsets,
+        input_blockscale_offsets=input_blockscale_offsets,
+        weight_blockscale_offsets=weight_blockscale_offsets,
+        input_blockscale_offset=input_blockscale_offset,
+        n_g=n_g,
+        k_g=k_g,
+        num_experts=num_experts,
+        expert_offset=expert_offset,
+        out_dtype=out_dtype,
+    )
+
+    for g in range(num_experts):
+        baseline = ref_output[
+            expert_offsets[g] : (expert_offsets[g] + problem_sizes[g][0])
+        ]
+        actual = output[expert_offsets[g] : (expert_offsets[g] + problem_sizes[g][0])]
+        diff = calc_diff(actual, baseline)
+        assert diff < 0.001
+        print(
+            f"m_g={baseline.shape[0]} n_g={n_g} k_g={k_g} num_experts={num_experts}, "
+            f"out_dtype={out_dtype}, diff={diff:.5f}: OK"
+        )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 9ed8dfa8df52..45e016d1abc9 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1102,6 +1102,76 @@ def cutlass_fp4_moe_mm(
     )
 
 
+def mxfp8_experts_quant(
+    input_tensor: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    blockscale_offsets: torch.Tensor,
+    quant_output: torch.Tensor,
+    scale_factor: torch.Tensor,
+) -> None:
+    torch.ops._C.mxfp8_experts_quant(
+        input_tensor,
+        problem_sizes,
+        expert_offsets,
+        blockscale_offsets,
+        quant_output,
+        scale_factor,
+    )
+
+
+def cutlass_mxfp8_grouped_mm(
+    a_tensors: torch.Tensor,
+    b_tensors: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    out_tensors: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    blockscale_offsets: torch.Tensor,
+) -> None:
+    torch.ops._C.cutlass_mxfp8_grouped_mm(
+        a_tensors,
+        b_tensors,
+        a_scales,
+        b_scales,
+        out_tensors,
+        problem_sizes,
+        expert_offsets,
+        blockscale_offsets,
+    )
+
+
+if hasattr(torch.ops._C, "mxfp8_experts_quant"):
+
+    @register_fake("_C::mxfp8_experts_quant")
+    def _mxfp8_experts_quant_fake(
+        input_tensor: torch.Tensor,
+        problem_sizes: torch.Tensor,
+        expert_offsets: torch.Tensor,
+        blockscale_offsets: torch.Tensor,
+        quant_output: torch.Tensor,
+        scale_factor: torch.Tensor,
+    ) -> None:
+        return None
+
+
+if hasattr(torch.ops._C, "cutlass_mxfp8_grouped_mm"):
+
+    @register_fake("_C::cutlass_mxfp8_grouped_mm")
+    def _cutlass_mxfp8_grouped_mm_fake(
+        a_tensors: torch.Tensor,
+        b_tensors: torch.Tensor,
+        a_scales: torch.Tensor,
+        b_scales: torch.Tensor,
+        out_tensors: torch.Tensor,
+        problem_sizes: torch.Tensor,
+        expert_offsets: torch.Tensor,
+        blockscale_offsets: torch.Tensor,
+    ) -> None:
+        return None
+
+
 # gptq_marlin
 def gptq_marlin_repack(
     b_q_weight: torch.Tensor,

From 3fd1d4ec2c992689594f9c7ee0ac79597f74a2ef Mon Sep 17 00:00:00 2001
From: Charlie Fu <charlifu@amd.com>
Date: Mon, 2 Mar 2026 01:43:38 -0600
Subject: [PATCH 152/154] [Rocm][CI] Fix LM Eval Large Models (H100) test group
 (#34750)

Signed-off-by: charlifu <charlifu@amd.com>
---
 .buildkite/lm-eval-harness/configs/models-large-rocm.txt | 1 +
 .buildkite/test-amd.yaml                                 | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
index 4fb0b84bc4d8..a9a60f348d6a 100644
--- a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -1 +1,2 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 6c35e0db137d..ab8bf9d23c0d 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1544,8 +1544,8 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-##### H100 test #####
-- label: LM Eval Large Models (H100) # optional
+##### FP8 test #####
+- label: LM Eval Large Models (H100) # optional, still use H100 for consistency
   gpu: h100
   optional: true
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -1557,8 +1557,8 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+    - export VLLM_USE_DEEP_GEMM=0 
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4
 
 
 ##### H200 test #####

From ec27b36b4b17ab51fe5a9fed4b0fbd4b39123058 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Mon, 2 Mar 2026 02:10:54 -0600
Subject: [PATCH 153/154] [CI] Defining extended V1 e2e + engine tests (#35580)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/test-amd.yaml          | 32 ++++++++++++++++++++++++++---
 .buildkite/test_areas/engine.yaml | 34 ++++++++++++++++++++++++++++++-
 tests/v1/e2e/test_spec_decode.py  |  2 +-
 3 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ab8bf9d23c0d..c5db1ca83634 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -388,9 +388,7 @@ steps:
 - label: V1 Test e2e + engine # 65min
   timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental, amdproduction]
-  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
-  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
-  agent_pool: mi325_8
+  agent_pool: mi325_1
   optional: true
   # grade: Blocking
   source_file_dependencies:
@@ -402,6 +400,34 @@ steps:
     - pytest -v -s v1/e2e
     - pytest -v -s v1/engine
 
+- label: V1 Test e2e (2 GPUs) # 65min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  optional: true
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # Only run tests that need exactly 2 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+
+- label: V1 Test e2e (4 GPUs) # 65min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental, amdproduction]
+  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
+  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
+  agent_pool: mi325_4
+  optional: true
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # Only run tests that need 4 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index 19cd91370e64..b5b3eeb6d728 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -14,7 +14,7 @@ steps:
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
 
-- label: V1 e2e + engine
+- label: V1 e2e + engine (1 GPU)
   timeout_in_minutes: 45
   source_file_dependencies:
     - vllm/
@@ -36,3 +36,35 @@ steps:
       commands:
       - pytest -v -s v1/e2e
       - pytest -v -s v1/engine
+
+- label: V1 e2e (2 GPUs)
+  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/e2e
+  commands:
+    # Only run tests that need exactly 2 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+  mirror:
+    amd:
+      device: mi325_2
+      depends_on:
+      - image-build-amd
+
+- label: V1 e2e (4 GPUs)
+  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
+  optional: true
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/e2e
+  commands:
+    # Only run tests that need 4 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+  mirror:
+    amd:
+      device: mi325_4
+      depends_on:
+      - image-build-amd
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 7f2db19a0750..4c90df5f471f 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -630,7 +630,7 @@ def test_eagle_correctness_medium(
             False,
             "auto",
             0.8,
-            marks=multi_gpu_marks(num_gpus=4),
+            marks=[*multi_gpu_marks(num_gpus=4), large_gpu_mark(min_gb=40)],
             id="llama4_eagle",
         ),
         pytest.param(

From c212202d936fd772f3c08e1c176f5145e8d37718 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Mon, 2 Mar 2026 09:57:07 +0100
Subject: [PATCH 154/154] [Misc] Bound NIXL upper bound version (#35495)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 requirements/kv_connectors.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt
index bd454f1ab415..1164720e0dd6 100644
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@@ -1,3 +1,3 @@
 lmcache >= 0.3.9
-nixl >= 0.7.1 # Required for disaggregated prefill
+nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
 mooncake-transfer-engine >= 0.3.8