From 9bbcb88bc104f471775bc39545f3915d5fa2af32 Mon Sep 17 00:00:00 2001
From: AlpinDale <alpindale@gmail.com>
Date: Sat, 2 May 2026 09:08:48 +0430
Subject: [PATCH] chore: sync to upstream
 985961345a13f3e3bb15d29c94b011ba9a6b858b

---
 CMakeLists.txt                                |    1 -
 aphrodite/_aiter_ops.py                       |  124 +-
 aphrodite/_custom_ops.py                      |   27 +-
 aphrodite/compilation/backends.py             |    3 +-
 aphrodite/compilation/cuda_graph.py           |    9 +-
 aphrodite/compilation/decorators.py           |   89 +-
 .../passes/fusion/act_quant_fusion.py         |    1 +
 .../passes/fusion/allreduce_rms_fusion.py     |  198 ++-
 .../passes/fusion/collective_fusion.py        |  319 +++-
 .../passes/fusion/sequence_parallelism.py     |   10 +-
 aphrodite/compilation/passes/pass_manager.py  |    8 +-
 aphrodite/compilation/wrapper.py              |   21 +-
 aphrodite/config/aphrodite.py                 |   37 +-
 aphrodite/config/attention.py                 |   62 +-
 aphrodite/config/model.py                     |    2 +-
 aphrodite/config/parallel.py                  |   28 +
 aphrodite/config/speculative.py               |   59 +-
 .../device_communicators/all2all.py           |   25 +-
 .../distributed/eplb/eplb_communicator.py     |  386 ++---
 .../distributed/eplb/rebalance_execute.py     |   13 +-
 .../kv_connector/v1/multi_connector.py        |   44 +-
 .../kv_connector/v1/nixl/scheduler.py         |   87 +-
 .../kv_connector/v1/nixl/worker.py            |    2 +-
 .../kv_connector/v1/offloading/common.py      |   51 +-
 .../kv_connector/v1/offloading/scheduler.py   |  508 ++++--
 .../kv_connector/v1/offloading/worker.py      |  108 +-
 .../kv_connector/v1/offloading_connector.py   |   19 +-
 aphrodite/distributed/parallel_state.py       |   10 +-
 aphrodite/engine/protocol.py                  |    1 +
 aphrodite/entrypoints/anthropic/protocol.py   |    4 +
 aphrodite/entrypoints/anthropic/serving.py    |   19 +
 aphrodite/entrypoints/chat_utils.py           |  256 ++-
 aphrodite/entrypoints/llm.py                  |    2 +-
 .../openai/chat_completion/batch_serving.py   |    1 +
 .../openai/chat_completion/protocol.py        |   31 +-
 .../openai/chat_completion/serving.py         |  204 +--
 aphrodite/entrypoints/openai/cli_args.py      |   13 +-
 .../entrypoints/openai/completion/protocol.py |    3 +
 .../entrypoints/openai/completion/serving.py  |    9 +-
 .../entrypoints/openai/engine/protocol.py     |    9 +
 .../entrypoints/openai/engine/serving.py      |   13 +
 aphrodite/entrypoints/openai/fingerprint.py   |   81 +
 .../entrypoints/openai/generate/api_router.py |    8 +
 .../openai/parser/harmony_utils.py            |    9 +-
 .../entrypoints/openai/responses/serving.py   |  612 +------
 .../openai/responses/streaming_events.py      |  450 +++++-
 aphrodite/entrypoints/serve/render/serving.py |    2 +-
 aphrodite/env_override.py                     |  125 +-
 aphrodite/envs.py                             |   47 +-
 aphrodite/inputs/engine.py                    |   19 +
 aphrodite/inputs/llm.py                       |   11 +
 aphrodite/lora/worker_manager.py              |   34 +-
 .../kernels/linear/scaled_mm/pytorch.py       |   10 +-
 .../layers/attention/mla_attention.py         |  594 +------
 .../model_executor/layers/batch_invariant.py  |   20 +-
 .../layers/deepseek_compressor.py             |    9 +-
 .../layers/deepseek_v4_attention.py           |  152 +-
 .../model_executor/layers/fla/ops/kda.py      |   12 +-
 .../layers/fused_moe/__init__.py              |   18 +-
 .../layers/fused_moe/all2all_utils.py         |   30 +-
 .../model_executor/layers/fused_moe/config.py |   40 +-
 ...880,device_name=NVIDIA_H100_80GB_HBM3.json |  147 ++
 .../experts/gpt_oss_triton_kernels_moe.py     |    5 +-
 .../fused_moe/experts/trtllm_fp8_moe.py       |   52 +-
 .../fused_moe/experts/trtllm_mxfp4_moe.py     |   47 +-
 .../fused_moe/experts/trtllm_nvfp4_moe.py     |   49 +-
 .../fused_moe/flashinfer_cutlass_moe.py       |    2 +-
 .../layers/fused_moe/fused_batched_moe.py     |    4 +-
 .../layers/fused_moe/fused_humming_moe.py     |    4 +-
 .../layers/fused_moe/fused_marlin_moe.py      |    4 +-
 .../layers/fused_moe/fused_moe.py             |    4 +-
 .../layers/fused_moe/fused_moe_method_base.py |    4 -
 .../model_executor/layers/fused_moe/layer.py  |   24 +-
 .../layers/fused_moe/oracle/int_wna16.py      |    4 +-
 .../layers/fused_moe/oracle/mxfp4.py          |   25 +-
 .../layers/fused_moe/oracle/nvfp4.py          |    8 +-
 .../fused_moe/prepare_finalize/deepep_ll.py   |   64 +-
 .../flashinfer_nvlink_one_sided.py            |   31 +-
 .../flashinfer_nvlink_two_sided.py            |    1 +
 .../fused_moe/prepare_finalize/naive_dp_ep.py |    1 +
 .../fused_moe/prepare_finalize/no_dp_ep.py    |    1 +
 .../layers/fused_moe/rocm_aiter_fused_moe.py  |    3 +-
 .../fused_moe/routed_experts_capturer.py      |   21 +-
 .../fused_moe/router/custom_routing_router.py |    4 +
 .../router/fused_topk_bias_router.py          |    2 -
 .../layers/fused_moe/router/gate_linear.py    |    2 +-
 .../layers/fused_moe/runner/moe_runner.py     |   16 +-
 .../fused_moe/runner/moe_runner_interface.py  |    3 +-
 .../layers/fused_moe/runner/shared_experts.py |   16 +-
 .../layers/fused_moe/triton_cutlass_moe.py    |    4 +-
 .../fused_moe/unquantized_fused_moe_method.py |    8 +-
 .../model_executor/layers/fused_moe/utils.py  |   10 +-
 aphrodite/model_executor/layers/linear.py     |    5 +-
 .../layers/mamba/linear_attn.py               |   37 +-
 .../layers/mamba/mamba_utils.py               |    3 -
 aphrodite/model_executor/layers/mhc.py        |  130 ++
 aphrodite/model_executor/layers/mla.py        |   11 +-
 .../layers/pooler/tokwise/methods.py          |   20 +-
 .../layers/quantization/modelopt.py           |   14 +-
 .../layers/quantization/quark/quark.py        |   26 +-
 .../layers/quantization/quark/quark_moe.py    |   18 +-
 .../quark/schemes/quark_ocp_mx.py             |   33 +-
 .../layers/quantization/utils/mxfp8_utils.py  |   24 +-
 .../utils/nvfp4_emulation_utils.py            |  294 +++-
 .../rotary_embedding/deepseek_scaling_rope.py |   15 +-
 aphrodite/model_executor/layers/utils.py      |   32 +-
 .../model_loader/base_loader.py               |    2 +-
 .../model_loader/default_loader.py            |   33 +-
 .../model_loader/reload/layerwise.py          |   39 +-
 .../model_loader/reload/utils.py              |   31 +-
 .../models/bailing_moe_linear.py              |   30 +-
 .../model_executor/models/cohere2_vision.py   |   15 +-
 aphrodite/model_executor/models/cohere_asr.py |   77 +-
 aphrodite/model_executor/models/cohere_moe.py |  485 ++++++
 .../model_executor/models/deepseek_v4.py      |  187 ++-
 .../model_executor/models/deepseek_v4_mtp.py  |   32 +-
 aphrodite/model_executor/models/gemma4.py     |   31 +-
 .../model_executor/models/granite4_vision.py  |    3 +-
 aphrodite/model_executor/models/laguna.py     |  827 ++++++++++
 aphrodite/model_executor/models/llama.py      |   15 +-
 .../model_executor/models/longcat_flash.py    |  139 +-
 aphrodite/model_executor/models/mimo_audio.py | 1269 +++++++++++++++
 .../models/{mimo_v2_flash.py => mimo_v2.py}   |   24 +-
 .../model_executor/models/mimo_v2_mtp.py      |  346 ++++
 .../model_executor/models/mimo_v2_omni.py     | 1417 +++++++++++++++++
 aphrodite/model_executor/models/minimax_m2.py |   14 +-
 .../model_executor/models/mistral_eagle.py    |  162 ++
 aphrodite/model_executor/models/moondream3.py | 1370 ++++++++++++++++
 aphrodite/model_executor/models/qwen2.py      |   39 +-
 aphrodite/model_executor/models/registry.py   |   11 +-
 aphrodite/multimodal/cache.py                 |   24 +-
 aphrodite/multimodal/registry.py              |    3 +-
 aphrodite/parser/abstract_parser.py           |   86 +-
 aphrodite/platforms/cpu.py                    |   13 +-
 aphrodite/platforms/interface.py              |    3 +
 aphrodite/platforms/rocm.py                   |   13 +-
 aphrodite/reasoning/__init__.py               |   12 +
 .../cohere_command_reasoning_parser.py        |  519 ++++++
 aphrodite/reasoning/olmo3_reasoning_parser.py |   38 +-
 .../reasoning/poolside_v1_reasoning_parser.py |   68 +
 aphrodite/renderers/base.py                   |    4 +-
 aphrodite/renderers/embed_utils.py            |   50 +-
 aphrodite/renderers/hf.py                     |  432 ++++-
 aphrodite/sampling_params.py                  |   25 +
 aphrodite/tokenizers/deepseek_v4.py           |   10 +-
 aphrodite/tool_parsers/__init__.py            |   12 +
 .../cohere_command_tool_parser.py             |  125 ++
 .../tool_parsers/deepseekv32_tool_parser.py   |   96 +-
 .../tool_parsers/poolside_v1_tool_parser.py   |  554 +++++++
 aphrodite/tool_parsers/streaming.py           |  189 +++
 aphrodite/transformers_utils/config.py        |   49 +-
 .../transformers_utils/configs/__init__.py    |    8 +
 .../transformers_utils/configs/laguna.py      |  120 ++
 .../configs/mimo_v2_omni.py                   |   61 +
 .../transformers_utils/configs/moondream3.py  |  152 ++
 .../model_arch_config_convertor.py            |   39 +
 .../transformers_utils/processors/__init__.py |    4 +
 .../processors/mimo_v2_omni.py                | 1181 ++++++++++++++
 .../processors/moondream3.py                  |  522 ++++++
 aphrodite/utils/flashinfer.py                 |   33 +
 aphrodite/utils/multi_stream_utils.py         |   66 +
 aphrodite/v1/attention/backends/cpu_attn.py   |   41 +-
 aphrodite/v1/attention/backends/flashinfer.py |  115 +-
 .../v1/attention/backends/flex_attention.py   |   20 +-
 .../v1/attention/backends/mla/indexer.py      |    7 +-
 .../backends/mla/prefill/__init__.py          |   11 +
 .../v1/attention/backends/mla/prefill/base.py |  124 ++
 .../backends/mla/prefill/flash_attn.py        |  174 ++
 .../backends/mla/prefill/flashinfer.py        |  204 +++
 .../backends/mla/prefill/registry.py          |   43 +
 .../backends/mla/prefill/selector.py          |  170 ++
 .../backends/mla/prefill/trtllm_ragged.py     |  172 ++
 .../attention/backends/mla/rocm_aiter_mla.py  |    2 +
 .../v1/attention/backends/mla/triton_mla.py   |   10 -
 aphrodite/v1/attention/ops/dcp_alltoall.py    |  374 +++--
 .../fused_compress_quant_cache.py             |   10 +-
 .../ops/deepseek_v4_ops/fused_indexer_q.py    |   55 +-
 .../fused_inv_rope_fp8_quant.py               |  144 +-
 .../v1/attention/ops/rocm_aiter_mla_sparse.py |   39 +-
 aphrodite/v1/attention/selector.py            |    5 +-
 aphrodite/v1/core/kv_cache_coordinator.py     |   13 +-
 aphrodite/v1/core/kv_cache_manager.py         |   64 +-
 aphrodite/v1/core/kv_cache_utils.py           |  105 +-
 aphrodite/v1/core/sched/output.py             |    2 +
 aphrodite/v1/core/sched/scheduler.py          |   14 +-
 .../v1/core/single_type_kv_cache_manager.py   |    8 +-
 aphrodite/v1/engine/__init__.py               |    7 +
 aphrodite/v1/engine/async_llm.py              |    7 +-
 aphrodite/v1/engine/core.py                   |   88 +-
 aphrodite/v1/engine/input_processor.py        |    5 +-
 aphrodite/v1/engine/logprobs.py               |    2 +-
 aphrodite/v1/kv_cache_interface.py            |    4 +
 aphrodite/v1/kv_offload/abstract.py           |  197 ---
 aphrodite/v1/kv_offload/base.py               |  371 +++++
 aphrodite/v1/kv_offload/cpu/common.py         |   13 +
 .../{worker/cpu_gpu.py => cpu/gpu_worker.py}  |    8 +-
 aphrodite/v1/kv_offload/cpu/manager.py        |   24 +-
 aphrodite/v1/kv_offload/cpu/policies/arc.py   |    4 +-
 .../cpu/policies/{abstract.py => base.py}     |    2 +-
 aphrodite/v1/kv_offload/cpu/policies/lru.py   |    4 +-
 aphrodite/v1/kv_offload/cpu/spec.py           |   13 +-
 aphrodite/v1/kv_offload/factory.py            |    2 +-
 aphrodite/v1/kv_offload/mediums.py            |   68 -
 aphrodite/v1/kv_offload/reuse_manager.py      |   15 +-
 aphrodite/v1/kv_offload/spec.py               |  141 --
 aphrodite/v1/kv_offload/worker/worker.py      |    2 +-
 aphrodite/v1/metrics/ray_wrappers.py          |   31 +-
 aphrodite/v1/request.py                       |    9 +
 .../v1/sample/logits_processor/__init__.py    |    3 -
 .../v1/sample/logits_processor/builtin.py     |  232 +--
 aphrodite/v1/sample/metadata.py               |    6 +
 aphrodite/v1/sample/ops/topk_topp_sampler.py  |   44 +-
 aphrodite/v1/sample/rejection_sampler.py      |   15 +-
 aphrodite/v1/sample/sampler.py                |   58 +
 aphrodite/v1/sample/thinking_budget_state.py  |  477 ++++++
 aphrodite/v1/spec_decode/dflash.py            |   13 +-
 aphrodite/v1/spec_decode/llm_base_proposer.py |   34 +-
 aphrodite/v1/structured_output/__init__.py    |   33 +-
 aphrodite/v1/structured_output/request.py     |    9 +-
 aphrodite/v1/worker/cpu_model_runner.py       |   11 +
 aphrodite/v1/worker/gpu/block_table.py        |   25 +-
 aphrodite/v1/worker/gpu/cudagraph_utils.py    |   38 +-
 aphrodite/v1/worker/gpu/kv_connector.py       |    2 +-
 aphrodite/v1/worker/gpu/mm/rope.py            |    2 +-
 aphrodite/v1/worker/gpu/model_runner.py       |   20 +-
 .../v1/worker/gpu/model_states/default.py     |    4 +-
 aphrodite/v1/worker/gpu/sample/logprob.py     |  140 +-
 aphrodite/v1/worker/gpu/sample/sampler.py     |   22 +-
 .../worker/gpu/spec_decode/eagle/cudagraph.py |   53 +-
 .../gpu/spec_decode/eagle/speculator.py       |   33 +-
 aphrodite/v1/worker/gpu_input_batch.py        |   36 +-
 aphrodite/v1/worker/gpu_model_runner.py       |   56 +-
 aphrodite/v1/worker/gpu_ubatch_wrapper.py     |    4 +-
 aphrodite/v1/worker/gpu_worker.py             |   40 +-
 cmake/external_projects/deepgemm.cmake        |   27 +-
 csrc/cpu/cpu_attn.cpp                         |  103 +-
 csrc/cpu/cpu_attn_amx.hpp                     |  217 ++-
 csrc/cpu/cpu_attn_fp8.hpp                     |  214 +++
 csrc/cpu/cpu_attn_impl.hpp                    |   38 +-
 csrc/cpu/cpu_attn_neon.hpp                    |    9 +-
 csrc/cpu/cpu_attn_neon_bfmmla.hpp             |    3 +-
 csrc/cpu/cpu_attn_vec.hpp                     |  133 +-
 csrc/cpu/cpu_attn_vec16.hpp                   |    6 +-
 csrc/cpu/cpu_attn_vxe.hpp                     |    7 +-
 csrc/cpu/cpu_types_arm.hpp                    |    6 +
 csrc/cpu/cpu_types_vxe.hpp                    |    6 +
 csrc/cpu/cpu_types_x86.hpp                    |  139 ++
 csrc/cpu/generate_cpu_attn_dispatch.py        |  262 +--
 csrc/cpu/torch_bindings.cpp                   |   16 +-
 csrc/cutlass_extensions/common.hpp            |   45 +-
 .../w8a8/cutlass/c3x/scaled_mm.cuh            |    2 +-
 ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh |    2 +-
 .../c3x/scaled_mm_sm100_fp8_dispatch.cuh      |    2 +-
 .../w8a8/fp8/per_token_group_quant.cu         |  258 ++-
 .../w8a8/per_token_group_quant_8bit.h         |   10 +
 csrc/libtorch_stable/torch_bindings.cpp       |    8 +
 csrc/moe/moe_ops.h                            |    3 -
 csrc/moe/router_gemm.cu                       |   52 -
 csrc/moe/torch_bindings.cpp                   |    4 -
 csrc/persistent_topk.cuh                      |   25 +-
 csrc/pos_encoding_kernels.cu                  |   76 +-
 csrc/topk.cu                                  |   86 +-
 tools/report_build_time_ninja.py              |    4 +-
 263 files changed, 19152 insertions(+), 4404 deletions(-)
 create mode 100644 aphrodite/entrypoints/openai/fingerprint.py
 create mode 100644 aphrodite/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json
 create mode 100644 aphrodite/model_executor/models/cohere_moe.py
 create mode 100644 aphrodite/model_executor/models/laguna.py
 create mode 100644 aphrodite/model_executor/models/mimo_audio.py
 rename aphrodite/model_executor/models/{mimo_v2_flash.py => mimo_v2.py} (96%)
 create mode 100644 aphrodite/model_executor/models/mimo_v2_mtp.py
 create mode 100644 aphrodite/model_executor/models/mimo_v2_omni.py
 create mode 100644 aphrodite/model_executor/models/mistral_eagle.py
 create mode 100644 aphrodite/model_executor/models/moondream3.py
 create mode 100644 aphrodite/reasoning/cohere_command_reasoning_parser.py
 create mode 100644 aphrodite/reasoning/poolside_v1_reasoning_parser.py
 create mode 100644 aphrodite/tool_parsers/cohere_command_tool_parser.py
 create mode 100644 aphrodite/tool_parsers/poolside_v1_tool_parser.py
 create mode 100644 aphrodite/tool_parsers/streaming.py
 create mode 100644 aphrodite/transformers_utils/configs/laguna.py
 create mode 100644 aphrodite/transformers_utils/configs/mimo_v2_omni.py
 create mode 100644 aphrodite/transformers_utils/configs/moondream3.py
 create mode 100644 aphrodite/transformers_utils/processors/mimo_v2_omni.py
 create mode 100644 aphrodite/transformers_utils/processors/moondream3.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/__init__.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/base.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/flash_attn.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/flashinfer.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/registry.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/selector.py
 create mode 100644 aphrodite/v1/attention/backends/mla/prefill/trtllm_ragged.py
 delete mode 100644 aphrodite/v1/kv_offload/abstract.py
 create mode 100644 aphrodite/v1/kv_offload/base.py
 create mode 100644 aphrodite/v1/kv_offload/cpu/common.py
 rename aphrodite/v1/kv_offload/{worker/cpu_gpu.py => cpu/gpu_worker.py} (98%)
 rename aphrodite/v1/kv_offload/cpu/policies/{abstract.py => base.py} (97%)
 delete mode 100644 aphrodite/v1/kv_offload/mediums.py
 delete mode 100644 aphrodite/v1/kv_offload/spec.py
 create mode 100644 aphrodite/v1/sample/thinking_budget_state.py
 create mode 100644 csrc/cpu/cpu_attn_fp8.hpp
 delete mode 100644 csrc/moe/router_gemm.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8922863cbf..9b41fbb183 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -948,7 +948,6 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
   list(APPEND APHRODITE_MOE_EXT_SRC
     "csrc/moe/moe_wna16.cu"
     "csrc/moe/grouped_topk_kernels.cu"
-    "csrc/moe/router_gemm.cu"
     "csrc/moe/topk_softplus_sqrt_kernels.cu")
 endif()
 
diff --git a/aphrodite/_aiter_ops.py b/aphrodite/_aiter_ops.py
index 5135800f5c..1b56b8fb57 100644
--- a/aphrodite/_aiter_ops.py
+++ b/aphrodite/_aiter_ops.py
@@ -2,9 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from collections.abc import Callable
+from contextlib import contextmanager
+from typing import Protocol
 
 import torch
 from torch._ops import OpOverload
+from torch.distributed import ProcessGroup
 
 import aphrodite.envs as envs
 from aphrodite.platforms import current_platform
@@ -39,6 +42,27 @@ def is_aiter_found() -> bool:
 IS_AITER_FOUND = is_aiter_found()
 
 
+class AiterCustomAllreduceProto(Protocol):
+    max_size: int
+    world_size: int
+    fully_connected: bool
+
+    @contextmanager
+    def capture(self): ...
+    def close(self) -> None: ...
+    def fused_ar_rms(
+        self,
+        inp: torch.Tensor,
+        res_inp: torch.Tensor,
+        *,
+        w: torch.Tensor,
+        eps: float,
+        registered: bool = False,
+        use_1stage: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]: ...
+    def should_custom_ar(self, inp: torch.Tensor) -> bool: ...
+
+
 def is_aiter_found_and_supported() -> bool:
     """Check if AITER library is available and platform supports it.
 
@@ -731,6 +755,55 @@ def _rocm_aiter_per_tensor_quant_impl(
     return per_tensor_quant_hip(x, scale, quant_dtype)
 
 
+def _rocm_aiter_fused_allreduce_rmsnorm_impl(
+    input_: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    aiter_ar = rocm_aiter_ops.get_aiter_allreduce()
+    assert aiter_ar is not None, "aiter allreduce must be initialized"
+
+    total_bytes = input_.numel() * input_.element_size()
+    hidden_dim = input_.shape[-1]
+    token_num = input_.shape[0]
+    hidden_ok = hidden_dim in (512, 1024, 2048, 4096, 7168)
+    token_ok = token_num <= 80
+    world_size = aiter_ar.world_size
+    full_nvlink = aiter_ar.fully_connected
+
+    if world_size == 2:
+        size_ok = True
+    elif full_nvlink and world_size <= 4:
+        size_ok = total_bytes < 256 * 1024
+    elif full_nvlink and world_size <= 8:
+        size_ok = total_bytes < 128 * 1024
+    else:
+        size_ok = False
+
+    use_1stage = hidden_ok and token_ok and size_ok
+
+    result = aiter_ar.fused_ar_rms(
+        input_,
+        residual,
+        w=weight,
+        eps=epsilon,
+        registered=torch.cuda.is_current_stream_capturing(),
+        use_1stage=use_1stage,
+    )
+    assert result is not None
+    return result[0], result[1]
+
+
+def _rocm_aiter_fused_allreduce_rmsnorm_fake(
+    input_: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty_like(input_), torch.empty_like(residual)
+
+
 def _rocm_aiter_per_tensor_quant_fake(
     x: torch.Tensor,
     quant_dtype: torch.dtype,
@@ -747,7 +820,7 @@ def _rocm_aiter_per_token_quant_impl(
     assert quant_dtype in [torch.int8, FP8_DTYPE]
 
     out_shape = x.shape
-    out = torch.empty(x.shape, dtype=FP8_DTYPE, device=x.device)
+    out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
     if scale is None:
         scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device)
     dynamic_per_token_scaled_quant(
@@ -767,7 +840,7 @@ def _rocm_aiter_per_token_quant_fake(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     out_shape = x.shape
     return (
-        torch.empty(x.shape, dtype=FP8_DTYPE, device=x.device),
+        torch.empty(x.shape, dtype=quant_dtype, device=x.device),
         torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device),
     )
 
@@ -1157,6 +1230,9 @@ class rocm_aiter_ops:
     # TODO: Consolidate under _LINEAR_ENABLED
     _TRITON_UNQUANT_GEMM = envs.APHRODITE_ROCM_USE_AITER_TRITON_GEMM
 
+    _ALL_REDUCE_MAX_SIZE: int = 8192 * 1024 * 8 * 2
+    _CUSTOM_ALL_REDUCE: AiterCustomAllreduceProto | None = None
+
     @classmethod
     def refresh_env_variables(cls):
         """
@@ -1324,6 +1400,40 @@ def is_triton_rotary_embed_enabled(cls) -> bool:
     def is_triton_gemm_enabled(cls) -> bool:
         return cls._AITER_ENABLED and cls._TRITON_UNQUANT_GEMM
 
+    @classmethod
+    @if_aiter_supported
+    def is_tgemm_enabled(cls) -> bool:
+        from aphrodite.platforms.rocm import on_gfx950
+
+        return cls.is_linear_enabled() and on_gfx950()
+
+    @classmethod
+    def initialize_aiter_allreduce(cls, group: ProcessGroup, device: torch.device) -> None:
+        try:
+            from aiter.dist.device_communicators.custom_all_reduce import (
+                CustomAllreduce as AiterCustomAllreduce,
+            )
+
+            cls._CUSTOM_ALL_REDUCE = AiterCustomAllreduce(group, device)
+        except Exception:
+            cls._CUSTOM_ALL_REDUCE = None
+
+    @classmethod
+    def get_aiter_allreduce(cls) -> AiterCustomAllreduceProto | None:
+        return cls._CUSTOM_ALL_REDUCE
+
+    @classmethod
+    def destroy_aiter_allreduce(cls) -> None:
+        if cls._CUSTOM_ALL_REDUCE is not None:
+            cls._CUSTOM_ALL_REDUCE.close()
+            cls._CUSTOM_ALL_REDUCE = None
+
+    @classmethod
+    def get_aiter_allreduce_max_size(cls) -> int:
+        # Effective max input size is half of _ALL_REDUCE_MAX_SIZE (per upstream aiter v0.1.10.post3):
+        # https://github.com/ROCm/aiter/blob/6a0e7b26ccf33164785531212cc2ec2cde0b9243/aiter/dist/device_communicators/custom_all_reduce.py#L272-L273
+        return cls._ALL_REDUCE_MAX_SIZE // 2
+
     @staticmethod
     @if_aiter_supported
     def register_ops_once() -> None:
@@ -1514,6 +1624,12 @@ def register_ops_once() -> None:
                 fake_impl=_triton_rotary_embedding_fake,
             )
 
+            direct_register_custom_op(
+                op_name="rocm_aiter_fused_allreduce_rmsnorm",
+                op_func=_rocm_aiter_fused_allreduce_rmsnorm_impl,
+                fake_impl=_rocm_aiter_fused_allreduce_rmsnorm_fake,
+            )
+
             direct_register_custom_op(
                 op_name="fused_mla_dual_rms_norm",
                 op_func=_fused_mla_dual_rms_norm_impl,
@@ -1567,6 +1683,10 @@ def get_triton_add_rmsnorm_pad_op() -> OpOverload:
     def get_triton_rotary_embedding_op() -> OpOverload:
         return torch.ops.aphrodite.rocm_aiter_triton_rotary_embedding.default
 
+    @staticmethod
+    def get_fused_allreduce_rmsnorm_op() -> OpOverload:
+        return torch.ops.aphrodite.rocm_aiter_fused_allreduce_rmsnorm.default
+
     @staticmethod
     def get_fused_mla_dual_rms_norm_op() -> OpOverload:
         return torch.ops.aphrodite.fused_mla_dual_rms_norm.default
diff --git a/aphrodite/_custom_ops.py b/aphrodite/_custom_ops.py
index 982b26ed00..f7c201fa70 100644
--- a/aphrodite/_custom_ops.py
+++ b/aphrodite/_custom_ops.py
@@ -2632,21 +2632,6 @@ def moe_wna16_gemm(
     )
 
 
-def router_gemm_bf16_fp32(input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
-    """bf16 x bf16 -> fp32 GEMM via cuBLAS. weight shape: (N, K)."""
-    return torch.ops._moe_C.router_gemm_bf16_fp32(input, weight)
-
-
-if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "router_gemm_bf16_fp32"):
-
-    @register_fake("_moe_C::router_gemm_bf16_fp32")
-    def router_gemm_bf16_fp32_fake(
-        input: torch.Tensor,
-        weight: torch.Tensor,
-    ) -> torch.Tensor:
-        return torch.empty(input.shape[0], weight.shape[0], dtype=torch.float32, device=input.device)
-
-
 def dsv3_router_gemm(
     hidden_states: torch.Tensor,
     router_weight: torch.Tensor,
@@ -3552,6 +3537,9 @@ def cpu_attn_reshape_and_cache(
     value_cache: torch.Tensor,
     slot_mapping: torch.Tensor,
     isa: str,
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
+    kv_cache_dtype: str = "auto",
 ) -> None:
     torch.ops._C.cpu_attn_reshape_and_cache(
         key,
@@ -3560,6 +3548,9 @@ def cpu_attn_reshape_and_cache(
         value_cache,
         slot_mapping,
         isa,
+        k_scale,
+        v_scale,
+        kv_cache_dtype,
     )
 
 
@@ -3578,6 +3569,9 @@ def cpu_attention_with_kv_cache(
     softcap: float,
     scheduler_metadata: torch.Tensor,
     s_aux: torch.Tensor | None,
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
+    kv_cache_dtype: str = "auto",
 ) -> None:
     torch.ops._C.cpu_attention_with_kv_cache(
         query,
@@ -3595,6 +3589,9 @@ def cpu_attention_with_kv_cache(
         softcap,
         scheduler_metadata,
         s_aux,
+        k_scale,
+        v_scale,
+        kv_cache_dtype,
     )
 
 
diff --git a/aphrodite/compilation/backends.py b/aphrodite/compilation/backends.py
index 69b77b5122..2d6a490c96 100644
--- a/aphrodite/compilation/backends.py
+++ b/aphrodite/compilation/backends.py
@@ -265,6 +265,7 @@ def compile(
         compilation_counter.num_backend_compilations += 1
 
         compiled_graph = None
+        handle = None
 
         # try to load from the cache
         compiled_graph = self.load(graph, example_inputs, graph_index, compile_range)
@@ -342,7 +343,7 @@ def autograd_cache_key(*args, **kwargs):
                     )
                 except StopCompiling:
                     assert cache_key is not None
-                    return self.loaded_artifacts[cache_key]
+                    compiled_graph = self.loaded_artifacts[cache_key]
             if cache_key is not None and compiled_graph is not None:
                 self.loaded_artifacts[cache_key] = compiled_graph
 
diff --git a/aphrodite/compilation/cuda_graph.py b/aphrodite/compilation/cuda_graph.py
index a9e7d56912..317e312c4e 100644
--- a/aphrodite/compilation/cuda_graph.py
+++ b/aphrodite/compilation/cuda_graph.py
@@ -268,8 +268,13 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
                     # across layers will make the cudagraph capture very slow.
                     # therefore, we only run gc for the first graph,
                     # and disable gc for the rest of the graphs.
-                    stack.enter_context(patch("gc.collect", lambda: None))
-                    stack.enter_context(patch("torch.accelerator.empty_cache", lambda: None))
+                    stack.enter_context(patch("gc.collect", lambda *args, **kwargs: None))
+                    stack.enter_context(
+                        patch(
+                            "torch.accelerator.empty_cache",
+                            lambda *args, **kwargs: None,
+                        )
+                    )
 
                 if self.graph_pool is not None:
                     set_graph_pool_id(self.graph_pool)
diff --git a/aphrodite/compilation/decorators.py b/aphrodite/compilation/decorators.py
index 23edabddc2..3f0d0e3c02 100644
--- a/aphrodite/compilation/decorators.py
+++ b/aphrodite/compilation/decorators.py
@@ -32,6 +32,9 @@
 
 from .monitor import monitor_profiling_run, monitor_torch_compile
 
+# shape_id parameter was added to mark_unbacked in PyTorch 2.11.0
+_SUPPORTS_SHAPE_ID = is_torch_equal_or_newer("2.11.0")
+
 if TYPE_CHECKING:
     # Only added on nightly/2.10 so wrap
     try:
@@ -89,7 +92,7 @@ def support_torch_compile(
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: dict[str, int | list[int]] | None,
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]] | None,
 ) -> Callable[[type[_T]], type[_T]]: ...
 
 
@@ -103,7 +106,7 @@ def support_torch_compile(
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: dict[str, int | list[int]] | None,
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]] | None,
     mark_unbacked_dims: dict[str, int | list[int]] | None,
 ) -> Callable[[type[_T]], type[_T]]: ...
 
@@ -115,11 +118,10 @@ def support_torch_compile(cls: type[_T]) -> type[_T]: ...
 def support_torch_compile(
     cls: type[_T] | None = None,
     *,
-    dynamic_arg_dims: dict[str, int | list[int]] | None = None,
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]] | None = None,
     mark_unbacked_dims: dict[str, int | list[int]] | None = None,
     enable_if: Callable[[AphroditeConfig], bool] | None = None,
     is_encoder: bool = False,
-    shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
 ) -> Callable[[type[_T]], type[_T]] | type[_T]:
     """
     A decorator to add support for compiling the forward method of a class.
@@ -141,8 +143,12 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
     ```
 
     `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic
-    dimensions of the argument. The dynamic dimensions can be either a single
-    integer or a list of integers.
+    dimensions of the argument. The value can be:
+    - int: a single dimension index (e.g., 0)
+    - list[int]: multiple dimension indices (e.g., [0, 1])
+    - dict[int, str]: dimension to shape_id mapping for shape relations
+      (e.g., {0: "b"}). Dimensions with the same shape_id share the same
+      unbacked symbol.
 
     if `dynamic_arg_dims` is `None`, it is inferred from the type annotation
     of the `forward` method, based on the following default rules:
@@ -189,7 +195,7 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
             torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
     This enforces constraints on the symbolic shapes without hardcoding
     specific values. It is needed for some models to avoid data dependent
-    errors.
+    errors and maximize perf when unbacked shapes are used.
     """
 
     def cls_decorator_helper(cls: type[_T]) -> type[_T]:
@@ -233,7 +239,6 @@ def cls_decorator_helper(cls: type[_T]) -> type[_T]:
             mark_unbacked_dims,
             enable_if,
             is_encoder,
-            shape_invariants,
         )
 
     if cls is not None:
@@ -314,15 +319,13 @@ def _try_load_aot_compiled_fn(
 
 def _support_torch_compile(
     cls: type[_T],
-    dynamic_arg_dims: dict[str, int | list[int]],
+    dynamic_arg_dims: dict[str, int | list[int] | dict[int, str]],
     mark_unbacked_dims: dict[str, int | list[int]] | None = None,
     enable_if: Callable[[AphroditeConfig], bool] | None = None,
     is_encoder: bool = False,
-    shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
 ) -> type[_T]:
-    """
-    A decorator to add support for compiling the forward method of a class.
-    """
+    """Internal implementation of support_torch_compile decorator."""
+
     if TorchCompileWithNoGuardsWrapper in cls.__bases__:
         # support decorating multiple times
         return cls
@@ -381,7 +384,8 @@ def __init__(
         if self.do_not_compile:
             return
 
-        self._check_shape_invariants = shape_invariants
+        self._dynamic_arg_dims = dynamic_arg_dims
+
         self.was_aot_compile_fn_loaded_from_disk = False
         compilation_counter.num_models_seen += 1
         self.compiled = False
@@ -396,43 +400,70 @@ def __init__(
     cls.__init__ = __init__
 
     def _mark_dynamic_inputs(mod: type[_T], ds_type: DynamicShapesType, *args: Any, **kwargs: Any) -> None:
-        def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
+        def mark_dynamic(arg: torch.Tensor, dim_shape_pairs: list[tuple[int, str | None]]) -> None:
             if ds_type == DynamicShapesType.UNBACKED:
                 if is_torch_equal_or_newer("2.10.0"):
-                    for dim in dims:
-                        torch._dynamo.decorators.mark_unbacked(arg, dim, hint_override=arg.size()[dim])
+                    for dim, shape_id in dim_shape_pairs:
+                        if shape_id is not None:
+                            if not _SUPPORTS_SHAPE_ID:
+                                raise RuntimeError(f"shape_id='{shape_id}' requires PyTorch >= 2.11.0")
+                            torch._dynamo.decorators.mark_unbacked(
+                                arg,
+                                dim,
+                                hint_override=arg.size()[dim],
+                                shape_id=shape_id,
+                            )
+                        else:
+                            torch._dynamo.decorators.mark_unbacked(
+                                arg,
+                                dim,
+                                hint_override=arg.size()[dim],
+                            )
                 else:
+                    # For older versions, we can't use hint_override or shape_id
+                    dims = [dim for dim, _ in dim_shape_pairs]
                     torch._dynamo.decorators.mark_unbacked(arg, dims)
             else:
+                dims = [dim for dim, _ in dim_shape_pairs]
                 torch._dynamo.mark_dynamic(arg, dims)
 
         sig = inspect.signature(mod.__class__.forward)  # type: ignore[attr-defined]
         bound_args = sig.bind(mod, *args, **kwargs)
         bound_args.apply_defaults()
-        for k, dims in dynamic_arg_dims.items():
+
+        # Normalize dynamic_arg_dims to dict[str, dict[int, str | None]]
+        normalized_dims: dict[str, dict[int, str | None]] = {}
+        for k, v in dynamic_arg_dims.items():
+            if isinstance(v, dict):
+                normalized_dims[k] = {dim: shape_id for dim, shape_id in v.items()}
+            elif isinstance(v, int):
+                normalized_dims[k] = {v: None}
+            else:
+                normalized_dims[k] = {d: None for d in v}
+
+        for k, dim_to_shape_id in normalized_dims.items():
             arg = bound_args.arguments.get(k)
 
             if arg is not None:
-                dims = [dims] if isinstance(dims, int) else dims
+                dims = list(dim_to_shape_id.keys())
+
                 if isinstance(arg, torch.Tensor):
-                    # In case dims is specified with negative indexing
-                    dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
-                    mark_dynamic(arg, dims)
+                    dim_shape_pairs = [(arg.ndim + d if d < 0 else d, dim_to_shape_id.get(d)) for d in dims]
+                    mark_dynamic(arg, dim_shape_pairs)
                 elif isinstance(arg, IntermediateTensors):
                     for tensor in arg.tensors.values():
-                        # In case dims is specified with negative indexing
-                        dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims]
-                        mark_dynamic(tensor, dims)
+                        dim_shape_pairs = [(tensor.ndim + d if d < 0 else d, dim_to_shape_id.get(d)) for d in dims]
+                        mark_dynamic(tensor, dim_shape_pairs)
                 else:
                     raise ValueError(f"Unsupported dynamic dimensions {dims} for argument {k} with type {type(arg)}.")
+
         if mark_unbacked_dims:
-            for k, dims in mark_unbacked_dims.items():
+            for k, dims_val in mark_unbacked_dims.items():
                 arg = bound_args.arguments.get(k)
                 if arg is not None:
-                    dims = [dims] if isinstance(dims, int) else dims
+                    dims = [dims_val] if isinstance(dims_val, int) else list(dims_val)
                     if isinstance(arg, torch.Tensor):
-                        # In case dims is specified with negative indexing
-                        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
+                        dims = [arg.ndim + d if d < 0 else d for d in dims]
                         if is_torch_equal_or_newer("2.10.0"):
                             for dim in dims:
                                 torch._dynamo.decorators.mark_unbacked(arg, dim, hint_override=arg.size()[dim])
diff --git a/aphrodite/compilation/passes/fusion/act_quant_fusion.py b/aphrodite/compilation/passes/fusion/act_quant_fusion.py
index 247efc2296..56594d3a4f 100644
--- a/aphrodite/compilation/passes/fusion/act_quant_fusion.py
+++ b/aphrodite/compilation/passes/fusion/act_quant_fusion.py
@@ -183,6 +183,7 @@ def __init__(
         is_scale_transposed: bool = False,
         is_e8m0: bool = False,
         is_tma_aligned: bool = False,
+        match_aiter: bool = False,
     ) -> None:
         super().__init__(quant_key)
         self.quant_matcher = MatcherQuantFP8(
diff --git a/aphrodite/compilation/passes/fusion/allreduce_rms_fusion.py b/aphrodite/compilation/passes/fusion/allreduce_rms_fusion.py
index c8c74412d4..c70b14d8b2 100644
--- a/aphrodite/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/aphrodite/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -12,12 +12,14 @@
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
 import aphrodite.ir.ops
+from aphrodite._aiter_ops import rocm_aiter_ops
 from aphrodite.compilation.passes.fusion.rms_quant_fusion import (
     _rms_input_weight_dtype_match,
 )
 from aphrodite.config import AphroditeConfig
 from aphrodite.config.utils import Range
 from aphrodite.distributed import get_tp_group, tensor_model_parallel_all_reduce
+from aphrodite.distributed.device_communicators.custom_all_reduce import CustomAllreduce
 from aphrodite.distributed.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -31,7 +33,12 @@
     direct_register_custom_op,
 )
 
-from ..aphrodite_inductor_pass import AphroditeInductorPass, AphroditePatternMatcherPass
+from ..aphrodite_inductor_pass import (
+    AphroditeFusionPatternMatcherPass,
+    AphroditeInductorPass,
+    AphroditePatternMatcherPass,
+    AphroditePatternReplacement,
+)
 from ..inductor_pass import enable_fake_mode
 from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8
 
@@ -845,3 +852,192 @@ def __del__(self) -> None:
             return
         with contextlib.suppress(Exception):
             destroy_fi_ar_workspace()
+
+
+# TODO: make BasePattern inherit from AphroditePatternReplacement
+class AiterAllreduceFusedRMSNormPattern(BasePattern, AphroditePatternReplacement):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str | None,
+        use_aiter_rmsnorm: bool = True,  # accepted but not read by this pattern
+    ) -> None:
+        super().__init__(dtype, device)
+        self.dtype = dtype
+        self.epsilon = epsilon
+        self.FUSED_AR_RMSNORM_OP = rocm_aiter_ops.get_fused_allreduce_rmsnorm_op()  # op used by `replacement`
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        return [self.empty(5, 16), self.empty(16)]  # presumably sample `input` and `weight`; confirm
+
+    @property
+    def pattern(self):  # subgraph to match: TP all-reduce followed by rms_norm
+        def _pattern(input: torch.Tensor, weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            allreduce_output = tensor_model_parallel_all_reduce(input)
+            rms = aphrodite.ir.ops.rms_norm(allreduce_output, weight, self.epsilon)
+
+            return rms, allreduce_output  # both the normed and the raw all-reduced tensor are outputs
+
+        return _pattern
+
+    @property
+    def replacement(self):  # rewrite: one call to the fused AITER all-reduce + RMSNorm op
+        def _replacement(input: torch.Tensor, weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            residual = torch.empty_like(input)  # uninitialized; only passed as the op's `residual` argument
+            allreduce = self.FUSED_AR_RMSNORM_OP(
+                input_=input,
+                residual=residual,
+                weight=weight,
+                epsilon=self.epsilon,
+            )
+            return allreduce[0], allreduce[1]  # stands in for (rms, allreduce_output)
+
+        return _replacement
+
+
+class AiterAllreduceFusedAddRMSNormPattern(BasePattern, AphroditePatternReplacement):
+    def __init__(
+        self,
+        epsilon: float,
+        dtype: torch.dtype,
+        device: str | None,
+        use_aiter_rmsnorm: bool = True,  # forwarded as the matcher's match_rocm_aiter flag
+    ) -> None:
+        super().__init__(dtype, device)
+        self.epsilon = epsilon
+        self.dtype = dtype
+        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon, match_rocm_aiter=use_aiter_rmsnorm)
+        self.FUSED_AR_RMSNORM_OP = rocm_aiter_ops.get_fused_allreduce_rmsnorm_op()  # op used by `replacement`
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        input, residual, weight = self.rmsnorm_matcher.inputs()
+
+        return [residual, input.to(self.dtype), weight]  # ordered like _pattern's parameters
+
+    @property
+    def pattern(self):  # subgraph to match: TP all-reduce feeding the fused-add RMSNorm matcher
+        def _pattern(
+            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            allreduce_output = tensor_model_parallel_all_reduce(input)
+            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
+
+            return rms, residual
+
+        return _pattern
+
+    @property
+    def replacement(self):  # rewrite: one call to the fused AITER all-reduce + RMSNorm op
+        def _replacement(
+            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            allreduce = self.FUSED_AR_RMSNORM_OP(
+                input_=input,
+                residual=residual,
+                weight=weight,
+                epsilon=self.epsilon,
+            )
+            return allreduce[0], allreduce[1]  # stands in for (rms, residual)
+
+        return _replacement
+
+
+class RocmAiterAllReduceFusionPass(AphroditeFusionPatternMatcherPass):
+    def __init__(self, config: AphroditeConfig) -> None:
+        super().__init__(config, "rocm_aiter_allreduce_fusion_pass")
+        self.disabled = True  # stays True on every early return below
+        self.tp_size = get_tensor_model_parallel_world_size()
+        if self.tp_size <= 1:
+            logger.warning_once("AllReduce fusion pass is disabled for tp_size <= 1.")
+            return
+
+        if config.model_config is None:
+            logger.warning_once("AllReduce fusion pass is disabled for missing model_config.")
+            return
+
+        device_comm = get_tp_group().device_communicator
+        if device_comm is None:
+            logger.warning_once("Device communicator is required.")
+            return
+
+        ca_comm = getattr(device_comm, "ca_comm", None)  # communicator may not expose `ca_comm` at all
+        if ca_comm is None:
+            logger.warning_once("Custom Allreduce is required.")
+            return
+        self.ca_comm = ca_comm
+
+        assert isinstance(ca_comm, CustomAllreduce)
+
+        group = get_tp_group().cpu_group
+        rocm_aiter_ops.initialize_aiter_allreduce(group, self.device)
+        hidden_dim = config.model_config.get_hidden_size()
+        element_size = torch.tensor([], dtype=self.model_dtype).element_size()  # bytes per model-dtype element
+        max_size = rocm_aiter_ops.get_aiter_allreduce_max_size()
+        if max_size is None:
+            logger.warning("AITER allreduce fusion must be initialized")
+            return
+
+        # Aiter's fused_allreduce_rmsnorm kernel dispatches on hidden_dim.
+        # Before aiter v0.1.12 the launcher was template-specialized on HIDDEN_DIM
+        # and silently no-op'd for sizes outside {512, 1024, 2048, 4096}. From v0.1.12
+        # hidden_dim is a runtime argument. Detect the older API via the missing
+        # `_pool` attribute and skip fusion for unsupported sizes.
+        # Ref (old kernel): https://github.com/ROCm/aiter/blob/6a0e7b26ccf33164785531212cc2ec2cde0b9243/csrc/include/custom_all_reduce.cuh#L2590
+        aiter_ar = rocm_aiter_ops.get_aiter_allreduce()
+        _AITER_OLD_FUSED_AR_RMS_HIDDEN = (512, 1024, 2048, 4096)
+        if aiter_ar is not None and not hasattr(aiter_ar, "_pool") and hidden_dim not in _AITER_OLD_FUSED_AR_RMS_HIDDEN:
+            logger.warning_once(
+                "AITER allreduce-rmsnorm fusion disabled: aiter<0.1.12 "
+                "only supports hidden_dim in %s; got %d. Upgrade aiter to "
+                ">=0.1.12 to enable fusion for this model.",
+                _AITER_OLD_FUSED_AR_RMS_HIDDEN,
+                hidden_dim,
+            )
+            # Tear down aiter's custom-allreduce so its IPC handles don't
+            # race with aphrodite's ca_comm on the unfused fallback path.
+            with contextlib.suppress(Exception):
+                rocm_aiter_ops.destroy_aiter_allreduce()
+            return
+
+        max_token_num = max_size // (hidden_dim * element_size)  # tokens that fit in max_size (presumably bytes)
+        self.max_token_num = min(
+            max_token_num,
+            config.scheduler_config.max_num_batched_tokens,
+        )
+
+        for epsilon in [1e-5, 1e-6]:  # one pattern pair is registered per epsilon value
+            self.register(
+                AiterAllreduceFusedRMSNormPattern(
+                    epsilon,
+                    self.model_dtype,
+                    self.device,
+                )
+            )
+            self.register(
+                AiterAllreduceFusedAddRMSNormPattern(
+                    epsilon,
+                    self.model_dtype,
+                    self.device,
+                )
+            )
+
+            # WARNING: This is a hack to clear the pattern matcher cache
+            # and allow multiple values of epsilon.
+            torch._inductor.pattern_matcher._seen_patterns.clear()
+
+        self.disabled = False  # every check passed and the patterns are registered
+
+        self.dump_patterns(config, self.pm_pass)
+
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
+        if self.disabled:
+            logger.warning_once("AllReduce fusion pass is disabled.")
+            return False
+        return bool(compile_range.end <= self.max_token_num)  # range's upper bound must fit the token cap
+
+    def __del__(self) -> None:
+        if getattr(self, "disabled", True):  # default True also covers a partially constructed instance
+            return
+        with contextlib.suppress(Exception):
+            rocm_aiter_ops.destroy_aiter_allreduce()
diff --git a/aphrodite/compilation/passes/fusion/collective_fusion.py b/aphrodite/compilation/passes/fusion/collective_fusion.py
index cb5a6411a7..8eff40bfb5 100644
--- a/aphrodite/compilation/passes/fusion/collective_fusion.py
+++ b/aphrodite/compilation/passes/fusion/collective_fusion.py
@@ -1,8 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
+from contextlib import suppress
+
 import torch
 import torch._inductor.pattern_matcher as pm
+import torch.distributed.distributed_c10d as c10d
 import torch.fx as fx
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch.distributed._symmetric_memory import enable_symm_mem_for_group
@@ -15,8 +19,14 @@
 )
 from aphrodite.logger import init_logger
 from aphrodite.platforms import current_platform
+from aphrodite.utils.torch_utils import direct_register_custom_op
 
-from ..aphrodite_inductor_pass import AphroditeInductorPass, AphroditePatternMatcherPass
+from ..aphrodite_inductor_pass import (
+    AphroditeFusionPatternMatcherPass,
+    AphroditeInductorPass,
+    AphroditePatternMatcherPass,
+    AphroditePatternReplacement,
+)
 from ..inductor_pass import enable_fake_mode
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -24,6 +34,172 @@
 logger = init_logger(__name__)
 
 
+def _flashinfer_scaled_mm_out(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    *,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out: torch.Tensor,
+    bias: torch.Tensor | None = None,  # must stay None (asserted below)
+    scale_result: torch.Tensor | None = None,  # must stay None (asserted below)
+    out_dtype: torch.dtype | None = None,
+    use_fast_accum: bool = False,  # must stay False (asserted below)
+) -> None:  # returns nothing; `out` is passed through as the out= buffer
+    # Import lazily to avoid a circular import during module initialization
+    # when docs or other tooling import the pass without FlashInfer.
+    from aphrodite.utils.flashinfer import flashinfer_scaled_fp8_mm_out
+
+    assert bias is None, "FlashInfer symm_mem adapter does not support bias"
+    assert scale_result is None, "FlashInfer symm_mem adapter does not support result scaling"
+    assert not use_fast_accum, "FlashInfer symm_mem adapter does not support use_fast_accum"
+    assert A.ndim == 2 and B.ndim == 2 and out.ndim == 2, "FlashInfer symm_mem adapter expects 2D inputs and output"
+    assert scale_a.numel() == 1 and scale_b.numel() == 1, (
+        "FlashInfer symm_mem adapter only supports tensor-wise FP8 scales"
+    )
+
+    flashinfer_scaled_fp8_mm_out(
+        A,
+        B,
+        scale_a,
+        scale_b,
+        out=out,
+        out_dtype=out_dtype or out.dtype,  # fall back to the output buffer's dtype
+    )
+
+
+def fused_flashinfer_scaled_matmul_reduce_scatter_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    world_size = c10d._resolve_process_group(group_name).size()
+    result_shape = list(output_shape)  # copy so the caller's list is not mutated
+    result_shape[orig_scatter_dim] //= world_size  # scattered dim shrinks by the group size
+    return torch.empty(  # shape/dtype placeholder only; no matmul or communication happens here
+        result_shape,
+        dtype=out_dtype or torch.bfloat16,  # bfloat16 when out_dtype is not given
+        device=A.device,
+    )
+
+
+def fused_flashinfer_scaled_matmul_reduce_scatter(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    assert orig_scatter_dim == 0 and scatter_dim_after_maybe_reshape == 0, (
+        "FlashInfer symm_mem adapter currently only supports scatter_dim=0"
+    )
+    world_size = c10d._resolve_process_group(group_name).size()
+    assert A.ndim == 2 and B.ndim == 2, "FlashInfer symm_mem adapter expects 2D inputs"
+    assert A.is_contiguous(), "FlashInfer symm_mem adapter expects contiguous A"
+    assert A_scale.numel() == 1 and B_scale.numel() == 1, (
+        "FlashInfer symm_mem adapter only supports tensor-wise FP8 scales"
+    )
+    assert A.shape[0] % world_size == 0, "FlashInfer symm_mem adapter expects M divisible by world size"
+
+    kwargs = {  # NOTE(review): presumably forwarded to mm_out_op as keyword arguments; confirm
+        "scale_b": B_scale,
+        "bias": None,
+        "scale_result": None,
+        "out_dtype": out_dtype,
+        "use_fast_accum": False,
+    }
+    return torch.distributed._symmetric_memory._fused_scaled_matmul_reduce_scatter_impl(  # private torch API
+        mm_out_op=_flashinfer_scaled_mm_out,  # FlashInfer adapter defined in this module
+        A=A,
+        B=B,
+        A_scale=A_scale,
+        kwargs=kwargs,
+        out_dtype=out_dtype,
+        reduce_op=reduce_op,
+        orig_scatter_dim=orig_scatter_dim,
+        scatter_dim_after_maybe_reshape=scatter_dim_after_maybe_reshape,
+        group_name=group_name,
+        output_shape=output_shape,
+    )
+
+
+def fused_all_gather_flashinfer_scaled_matmul_fake(
+    A_shard: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    gather_dim: int,
+    group_name: str,
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    world_size = c10d._resolve_process_group(group_name).size()
+    output_shape = list(A_shard.shape)
+    output_shape[gather_dim] *= world_size  # gathered dim grows by the group size
+    output_shape[-1] = B.shape[1]  # last dim becomes B's second dimension
+    return torch.empty(  # shape/dtype placeholder only; no matmul or communication happens here
+        output_shape,
+        dtype=out_dtype or torch.bfloat16,  # bfloat16 when out_dtype is not given
+        device=A_shard.device,
+    )
+
+
+def fused_all_gather_flashinfer_scaled_matmul(
+    A_shard: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    gather_dim: int,
+    group_name: str,
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    assert gather_dim == 0, "FlashInfer symm_mem adapter currently only supports gather_dim=0"
+    _, outputs = torch.distributed._symmetric_memory._fused_all_gather_matmul_impl(  # first result unused
+        mm_out_op=_flashinfer_scaled_mm_out,  # FlashInfer adapter defined in this module
+        A_shard=A_shard,
+        Bs=[B],  # a single B, so every per-matmul list below has one entry
+        A_scale=A_scale,
+        kwargs_list=[
+            {
+                "scale_b": B_scale,
+                "bias": None,
+                "scale_result": None,
+                "out_dtype": out_dtype,
+                "use_fast_accum": False,
+            }
+        ],
+        out_dtypes=[out_dtype],
+        gather_dim=gather_dim,
+        group_name=group_name,
+        return_A=False,
+    )
+    return outputs[0]  # the only matmul result
+
+
+direct_register_custom_op(
+    op_name="fused_flashinfer_scaled_matmul_reduce_scatter",  # called as torch.ops.aphrodite.<op_name>.default
+    op_func=fused_flashinfer_scaled_matmul_reduce_scatter,
+    fake_impl=fused_flashinfer_scaled_matmul_reduce_scatter_fake,  # returns an empty tensor of the result shape
+)
+
+direct_register_custom_op(
+    op_name="fused_all_gather_flashinfer_scaled_matmul",  # called as torch.ops.aphrodite.<op_name>.default
+    op_func=fused_all_gather_flashinfer_scaled_matmul,
+    fake_impl=fused_all_gather_flashinfer_scaled_matmul_fake,  # returns an empty tensor of the result shape
+)
+
+
 class BasePattern:
     def __init__(self, dtype: torch.dtype, device: str | None) -> None:
         self.dtype = dtype
@@ -343,29 +519,145 @@ def replacement(
         pm.register_replacement(pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass)
 
 
-class AsyncTPPass(AphroditePatternMatcherPass):
+class FlashInferBMMFP8ReduceScatterPattern(BasePattern, AphroditePatternReplacement[..., torch.Tensor]):
+    def get_inputs(self) -> list[torch.Tensor]:
+        a_2d = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE)
+        b_2d = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+        a_scale = torch.empty([1], device=self.device, dtype=torch.float32)  # one-element scale
+        b_scale = torch.empty([1], device=self.device, dtype=torch.float32)  # one-element scale
+        return [a_2d, b_2d, a_scale, b_scale]  # ordered like _pattern's parameters
+
+    @property
+    def pattern(self) -> Callable[..., torch.Tensor]:  # match: bmm_fp8 on unsqueezed 2D inputs -> reduce_scatter
+        def _pattern(
+            a_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            bmm = torch.ops.aphrodite.bmm_fp8.default(
+                torch.ops.aten.unsqueeze.default(a_2d, 0),  # add a leading dim of size 1
+                torch.ops.aten.unsqueeze.default(b_2d, 0),  # add a leading dim of size 1
+                a_scale,
+                b_scale,
+                self.dtype,
+                "auto",
+            )
+            output = torch.ops.aten.reshape.default(bmm, list(bmm.shape[1:]))  # drop the leading dim again
+            return torch.ops.aphrodite.reduce_scatter.default(
+                output,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name,
+            )
+
+        return _pattern
+
+    @property
+    def replacement(self) -> Callable[..., torch.Tensor]:  # rewrite: one fused matmul + reduce-scatter custom op
+        def _replacement(
+            a_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            return torch.ops.aphrodite.fused_flashinfer_scaled_matmul_reduce_scatter.default(
+                a_2d,
+                b_2d,
+                a_scale,
+                b_scale,
+                "sum",  # reduce_op
+                0,  # orig_scatter_dim
+                0,  # scatter_dim_after_maybe_reshape
+                self.tp.device_group.group_name,  # group_name
+                [a_2d.shape[0], b_2d.shape[1]],  # output_shape of the full matmul, before scattering
+                self.dtype,  # out_dtype
+            )
+
+        return _replacement
+
+
+class FlashInferAllGatherBMMFP8Pattern(BasePattern, AphroditePatternReplacement[..., torch.Tensor]):
+    def get_inputs(self) -> list[torch.Tensor]:
+        a_shard_2d = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
+        b_2d = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+        a_scale = torch.empty([1], device=self.device, dtype=torch.float32)  # one-element scale
+        b_scale = torch.empty([1], device=self.device, dtype=torch.float32)  # one-element scale
+        return [a_shard_2d, b_2d, a_scale, b_scale]  # ordered like _pattern's parameters
+
+    @property
+    def pattern(self) -> Callable[..., torch.Tensor]:  # match: all_gather on dim 0 feeding bmm_fp8
+        def _pattern(
+            a_shard_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            all_gather = torch.ops.aphrodite.all_gather.default(
+                a_shard_2d,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name,
+            )
+            return torch.ops.aphrodite.bmm_fp8.default(
+                torch.ops.aten.unsqueeze.default(all_gather, 0),  # add a leading dim of size 1
+                torch.ops.aten.unsqueeze.default(b_2d, 0),  # add a leading dim of size 1
+                a_scale,
+                b_scale,
+                self.dtype,
+                "auto",
+            )
+
+        return _pattern
+
+    @property
+    def replacement(self) -> Callable[..., torch.Tensor]:  # rewrite: one fused all-gather + matmul custom op
+        def _replacement(
+            a_shard_2d: torch.Tensor,
+            b_2d: torch.Tensor,
+            a_scale: torch.Tensor,
+            b_scale: torch.Tensor,
+        ) -> torch.Tensor:
+            fused = torch.ops.aphrodite.fused_all_gather_flashinfer_scaled_matmul.default(
+                a_shard_2d,
+                b_2d,
+                a_scale,
+                b_scale,
+                0,  # gather_dim
+                self.tp.device_group.group_name,  # group_name
+                self.dtype,  # out_dtype
+            )
+            return torch.ops.aten.unsqueeze.default(fused, 0)  # re-add the leading dim the matched output has
+
+        return _replacement
+
+
+class AsyncTPPass(AphroditeFusionPatternMatcherPass):
     @enable_fake_mode
     def __init__(self, config: AphroditeConfig) -> None:
-        super().__init__(config)
+        super().__init__(config, pass_name="async_tp_pass")
 
-        # Enable symmetric memory for the TP process group
         enable_symm_mem_for_group(get_tp_group().device_group.group_name)
-        self.patterns: PatternMatcherPass = PatternMatcherPass(pass_name="async_tp_pass")
-        GEMMReduceScatterPattern(self.model_dtype, self.device).register(self.patterns)
+        GEMMReduceScatterPattern(self.model_dtype, self.device).register(self.pm_pass)
 
-        AllGatherGEMMPattern(self.model_dtype, self.device).register(self.patterns)
+        AllGatherGEMMPattern(self.model_dtype, self.device).register(self.pm_pass)
 
         # These fusions are enabled only for bfloat16 models because
         # `scaled_mm` or `cutlass_scaled_mm` with per-token (row-wise) scaling
         # only supports bfloat16 as the output dtype.
         if self.model_dtype == torch.bfloat16:
-            ScaledMMReduceScatterPattern(self.model_dtype, self.device).register(self.patterns)
-            AllGatherScaledMMPattern(self.model_dtype, self.device).register(self.patterns)
+            ScaledMMReduceScatterPattern(self.model_dtype, self.device).register(self.pm_pass)
+            AllGatherScaledMMPattern(self.model_dtype, self.device).register(self.pm_pass)
 
-            CutlassScaledMMReduceScatterPattern(self.model_dtype, self.device).register(self.patterns)
-            AllGatherCutlassScaledMMPattern(self.model_dtype, self.device).register(self.patterns)
+            CutlassScaledMMReduceScatterPattern(self.model_dtype, self.device).register(self.pm_pass)
+            AllGatherCutlassScaledMMPattern(self.model_dtype, self.device).register(self.pm_pass)
+            with suppress(ImportError):
+                import aphrodite.utils.flashinfer  # noqa: F401
+            if hasattr(torch.ops.aphrodite, "bmm_fp8"):
+                self.register(FlashInferAllGatherBMMFP8Pattern(self.model_dtype, self.device))
+                self.register(FlashInferBMMFP8ReduceScatterPattern(self.model_dtype, self.device))
 
-        self.dump_patterns(config, self.patterns)
+        self.dump_patterns(config, self.pm_pass)
 
     def is_applicable_for_range(self, compile_range: Range) -> bool:
         # This pass is applied on top of the sequence parallelism pass,
@@ -377,5 +669,6 @@ def is_applicable_for_range(self, compile_range: Range) -> bool:
 
     @AphroditeInductorPass.time_and_log
     def __call__(self, graph: fx.Graph) -> None:
-        self.matched_count = self.patterns.apply(graph)
+        self.matched_count = self.pm_pass.apply(graph)
+        AphroditePatternMatcherPass.match_table[self.pass_name] += self.matched_count
         logger.debug("Replaced %s patterns", self.matched_count)
diff --git a/aphrodite/compilation/passes/fusion/sequence_parallelism.py b/aphrodite/compilation/passes/fusion/sequence_parallelism.py
index 669f91020b..6b908a12ec 100644
--- a/aphrodite/compilation/passes/fusion/sequence_parallelism.py
+++ b/aphrodite/compilation/passes/fusion/sequence_parallelism.py
@@ -31,6 +31,7 @@
 # Only apply sequence parallelism for models with hidden_size >= threshold
 SP_MIN_HIDDEN_SIZE: dict[int, int] = {
     90: 8192,  # H100: only for models with hidden_size >= 8192
+    100: 8192,  # Blackwell family: only for models with hidden_size >= 8192
 }
 
 # Min size per GPU per device capability for sequence parallelism
@@ -38,6 +39,8 @@
 # This ensures the threshold scales appropriately with tensor parallelism
 SP_MIN_PER_GPU_SIZE_MB: dict[int, float] = {
     90: 8,  # 8MB per GPU for H100
+    # Use a more conservative threshold on Blackwell so TP8 starts later.
+    100: 32,
 }
 
 
@@ -67,7 +70,12 @@ def get_sequence_parallelism_threshold(
     capability = current_platform.get_device_capability()
     if capability is None:
         return None
-    device_capability = capability.to_int()
+
+    # Collapse Blackwell variants (sm100/sm103/...) into one policy bucket.
+    if current_platform.is_device_capability_family(100):
+        device_capability = 100
+    else:
+        device_capability = capability.to_int()
 
     # Check if device has configured thresholds
     min_hidden_size = SP_MIN_HIDDEN_SIZE.get(device_capability)
diff --git a/aphrodite/compilation/passes/pass_manager.py b/aphrodite/compilation/passes/pass_manager.py
index da4900e48b..b69128386e 100644
--- a/aphrodite/compilation/passes/pass_manager.py
+++ b/aphrodite/compilation/passes/pass_manager.py
@@ -18,6 +18,9 @@
 from .ir.lowering_pass import AphroditeIRLoweringPass
 
 if rocm_aiter_ops.is_enabled():
+    from .fusion.allreduce_rms_fusion import (
+        RocmAiterAllReduceFusionPass,
+    )
     from .fusion.rocm_aiter_fusion import (
         MLADualRMSNormFusionPass,
         RocmAiterRMSNormQuantFusionPass,
@@ -137,7 +140,10 @@ def configure(self, config: AphroditeConfig) -> None:
                     self.passes += [AsyncTPPass(config)]
 
             if self.pass_config.fuse_allreduce_rms:
-                self.passes += [AllReduceFusionPass(config)]
+                if rocm_aiter_ops.is_enabled():
+                    self.passes += [RocmAiterAllReduceFusionPass(config)]
+                else:
+                    self.passes += [AllReduceFusionPass(config)]
 
             if self.pass_config.fuse_minimax_qk_norm:
                 self.passes += [MiniMaxQKNormPass(config)]
diff --git a/aphrodite/compilation/wrapper.py b/aphrodite/compilation/wrapper.py
index 87810e33da..8deda0fa8f 100644
--- a/aphrodite/compilation/wrapper.py
+++ b/aphrodite/compilation/wrapper.py
@@ -53,12 +53,6 @@ class TorchCompileWithNoGuardsWrapper:
     since we drop all guards.
     """
 
-    def check_invariants_and_forward(self, *args: Any, **kwargs: Any) -> Any:
-        assert hasattr(self, "_check_shape_invariants")
-        self._check_shape_invariants(*args, **kwargs)
-
-        return self.forward(*args, **kwargs)
-
     def _call_with_optional_nvtx_range(self, callable_fn: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> Any:
         if self.layerwise_nvtx_tracing_enabled:
             args_list = list(args)
@@ -109,6 +103,8 @@ def __init__(
                     "compilation_config.dynamic_shapes_config.evaluate_guards requires APHRODITE_USE_BYTECODE_HOOK=0. "
                 )
 
+                assert ds_type != DynamicShapesType.UNBACKED, "UNBACKED dynamic shapes do not add guards"
+
                 options["guard_filter_fn"] = lambda x: [entry.guard_type == "SHAPE_ENV" for entry in x]
             else:
                 if hasattr(torch.compiler, "skip_all_guards_unsafe"):
@@ -121,19 +117,6 @@ def __init__(
         compiled_ptr: Any = self.forward
         # Validate that unbacked dynamic shapes require APHRODITE_USE_BYTECODE_HOOK=False
 
-        if ds_type == DynamicShapesType.UNBACKED:
-            # reason is that bytecode does torch._dynamo.eval_frame.
-            # remove_from_cache(self.original_code_object()) to force a new
-            # re-compilation. And if we use
-            # compiled_ptr = self.check_invariants_and_forward
-            # it will reset all entries.
-            assert not envs.APHRODITE_USE_BYTECODE_HOOK, (
-                "UNBACKED dynamic shapes requires APHRODITE_USE_BYTECODE_HOOK=0. "
-            )
-            assert not self.evaluate_guards, "UNBACKED dynamic shapes do not add guards"
-
-            compiled_ptr = self.check_invariants_and_forward
-
         # Apply the constrain_to_fx_strides patch before first compilation.
         # This covers STOCK_TORCH_COMPILE and DYNAMO_ONCE paths. The APHRODITE
         # compile paths call this from their own compile() methods too.
diff --git a/aphrodite/config/aphrodite.py b/aphrodite/config/aphrodite.py
index 306a73b4d5..b8b6b165f2 100644
--- a/aphrodite/config/aphrodite.py
+++ b/aphrodite/config/aphrodite.py
@@ -123,6 +123,15 @@ def enable_allreduce_rms_fusion(cfg: "AphroditeConfig") -> bool:
     from aphrodite.platforms import current_platform
     from aphrodite.utils.flashinfer import has_flashinfer
 
+    if current_platform.is_rocm():
+        from aphrodite._aiter_ops import rocm_aiter_ops
+
+        return (
+            rocm_aiter_ops.is_enabled()
+            and rocm_aiter_ops.is_rmsnorm_enabled()
+            and cfg.parallel_config.tensor_parallel_size > 1
+        )
+
     return (
         cfg.parallel_config.tensor_parallel_size > 1
         and current_platform.is_cuda()
@@ -1331,6 +1340,10 @@ def _set_cudagraph_sizes(self):
         cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
             range(256, max_graph_size + 1, 16))
 
+        `max_num_batched_tokens` is also appended to the list if it fits
+        within `max_cudagraph_capture_size`, so the max batch size is captured
+        even when off-stride.
+
         In the end, `aphrodite_config.compilation_config.cudagraph_capture_sizes`
         will be the final sizes to capture cudagraph (in ascending order).
 
@@ -1402,6 +1415,9 @@ def _set_cudagraph_sizes(self):
                 if max_cudagraph_capture_size >= 256:
                     # Step size 16 for larger batch sizes
                     cudagraph_capture_sizes += list(range(256, max_cudagraph_capture_size + 1, 16))
+                # ensure max_num_tokens is captured if within max capture size
+                if max_num_tokens <= max_cudagraph_capture_size and max_num_tokens not in cudagraph_capture_sizes:
+                    cudagraph_capture_sizes.append(max_num_tokens)
                 # de-duplicate and sort the sizes
                 cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))
 
@@ -1466,10 +1482,15 @@ def _set_compile_ranges(self):
         if compile_range_end is not None:
             computed_compile_ranges_endpoints.append(compile_range_end)
 
-        # Add the compile ranges for flashinfer
+        # Add the compile ranges for flashinfer/aiter.
         if compilation_config.pass_config.fuse_allreduce_rms:
             tp_size = self.parallel_config.tensor_parallel_size
-            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+            from aphrodite._aiter_ops import rocm_aiter_ops
+
+            if rocm_aiter_ops.is_enabled():
+                max_size = rocm_aiter_ops.get_aiter_allreduce_max_size()
+            else:
+                max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
             if max_size is not None:
                 assert isinstance(self.model_config.dtype, torch.dtype)
                 max_token_num = max_size // (self.model_config.get_hidden_size() * self.model_config.dtype.itemsize)
@@ -1718,6 +1739,18 @@ def validate_block_size(self) -> None:
                 "in the middle of a mm input"
             )
 
+    @model_validator(mode="after")
+    def validate_nvfp4_kv_cache_with_mla(self) -> "AphroditeConfig":
+        if self.model_config is None:
+            return self
+        if self.cache_config.cache_dtype == "nvfp4" and self.model_config.use_mla:
+            raise ValueError(
+                "nvfp4 KV cache is not supported with MLA (Multi-head Latent "
+                "Attention) backends. Please use a different --kv-cache-dtype "
+                "(e.g., 'fp8' or 'auto') for MLA models such as DeepSeek."
+            )
+        return self
+
     @model_validator(mode="after")
     def validate_mamba_block_size(self) -> "AphroditeConfig":
         if self.model_config is None:
diff --git a/aphrodite/config/attention.py b/aphrodite/config/attention.py
index d131e3bd9f..48bbe88056 100644
--- a/aphrodite/config/attention.py
+++ b/aphrodite/config/attention.py
@@ -6,8 +6,12 @@
 from pydantic import field_validator
 
 from aphrodite.config.utils import config
+from aphrodite.logger import init_logger
+from aphrodite.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
 from aphrodite.v1.attention.backends.registry import AttentionBackendEnum
 
+logger = init_logger(__name__)
+
 
 @config
 class AttentionConfig:
@@ -33,7 +37,7 @@ class AttentionConfig:
     and buffers can be pre-allocated to avoid inflating the memory estimate."""
 
     use_cudnn_prefill: bool = False
-    """Whether to use cudnn prefill."""
+    """Deprecated: cuDNN prefill backend has been removed."""
 
     use_trtllm_ragged_deepseek_prefill: bool = False
     """Whether to use TRTLLM ragged deepseek prefill."""
@@ -42,18 +46,27 @@ class AttentionConfig:
     """If set to True/False, use or don't use the TRTLLM attention backend
     in flashinfer. If None, auto-detect the attention backend in flashinfer."""
 
-    disable_flashinfer_prefill: bool = True
+    disable_flashinfer_prefill: bool | None = None
     """Whether to disable flashinfer prefill."""
 
     disable_flashinfer_q_quantization: bool = False
     """If set, when using fp8 kv, do not quantize Q to fp8."""
 
+    mla_prefill_backend: MLAPrefillBackendEnum | None = None
+    """MLA prefill backend to use. If None, will be selected automatically.
+    Valid options: FLASH_ATTN (FA3/FA4), FLASHINFER, TRTLLM_RAGGED.
+    This option supersedes use_trtllm_ragged_deepseek_prefill
+    and disable_flashinfer_prefill which are deprecated."""
+
     use_prefill_query_quantization: bool = False
     """If set, quantize query for attention in prefill."""
 
     use_fp4_indexer_cache: bool = False
     """If set, use fp4 indexer cache for dsv32 family model (not support yet)"""
 
+    use_non_causal: bool = False
+    """Whether to use non-causal (bidirectional) attention."""
+
     def compute_hash(self) -> str:
         """
         Provide a hash that uniquely identifies all the configs
@@ -81,3 +94,48 @@ def validate_backend_before(cls, value: Any) -> Any:
                 return None
             return AttentionBackendEnum[value.upper()]
         return value
+
+    @field_validator("mla_prefill_backend", mode="before")
+    @classmethod
+    def validate_mla_prefill_backend_before(cls, value: Any) -> Any:
+        """Enable parsing of the `mla_prefill_backend` enum type from string."""
+        if isinstance(value, str):
+            return MLAPrefillBackendEnum[value.upper()]
+        return value
+
+    def __post_init__(self) -> None:
+        self._migrate_deprecated_mla_prefill_flags()
+
+    def _migrate_deprecated_mla_prefill_flags(self) -> None:
+        """Migrate deprecated MLA prefill flags to mla_prefill_backend."""
+        # If the new option is already set, it takes precedence and the deprecated flags are ignored
+        if self.mla_prefill_backend is not None:
+            return
+
+        # Check for deprecated flags and migrate them.
+        # Only the first flag encountered sets the backend.
+        if self.use_cudnn_prefill:
+            raise ValueError(
+                "The cuDNN MLA prefill backend has been removed. "
+                "Use --attention-config.mla_prefill_backend=FLASH_ATTN or "
+                "FLASHINFER or TRTLLM_RAGGED instead."
+            )
+
+        if self.use_trtllm_ragged_deepseek_prefill:
+            if self.mla_prefill_backend is None:
+                self.mla_prefill_backend = MLAPrefillBackendEnum.TRTLLM_RAGGED
+            logger.warning_once(
+                "use_trtllm_ragged_deepseek_prefill is deprecated and "
+                "will be removed in v0.22. Use "
+                "--attention-config.mla_prefill_backend=TRTLLM_RAGGED "
+                "instead."
+            )
+
+        if self.disable_flashinfer_prefill:
+            if self.mla_prefill_backend is None:
+                self.mla_prefill_backend = MLAPrefillBackendEnum.FLASH_ATTN
+            logger.warning_once(
+                "disable_flashinfer_prefill is deprecated and will be removed "
+                "in v0.22. Use --attention-config.mla_prefill_backend="
+                "FLASH_ATTN instead."
+            )
diff --git a/aphrodite/config/model.py b/aphrodite/config/model.py
index db056ba48f..61b1e558af 100644
--- a/aphrodite/config/model.py
+++ b/aphrodite/config/model.py
@@ -515,12 +515,12 @@ def __post_init__(
         if dict_overrides:
             self._apply_dict_overrides(hf_config, dict_overrides)
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.model_arch_config = self.get_model_arch_config()
         self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None)
         self.encoder_config = self._get_encoder_config()
         self.hf_image_processor_config = get_hf_image_processor_config(
             self.model, hf_token=self.hf_token, revision=self.revision
         )
-        self.model_arch_config = self.get_model_arch_config()
 
         architectures = self.architectures
         registry = self.registry
diff --git a/aphrodite/config/parallel.py b/aphrodite/config/parallel.py
index 5d07853d8e..deb193749c 100644
--- a/aphrodite/config/parallel.py
+++ b/aphrodite/config/parallel.py
@@ -636,6 +636,26 @@ def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
         aggregated_has_unfinished = bool(tensor.item())
         return aggregated_has_unfinished
 
+    @staticmethod
+    def sync_dp_state(dp_group: ProcessGroup, has_unfinished: bool, pending_pause: bool) -> tuple[bool, bool]:
+        """Combined all-reduce for DP state synchronization.
+        Uses a single SUM all-reduce on a 2-element tensor:
+          [0] = 1 if this rank has unfinished work, else 0.
+                SUM > 0 ≡ logical OR across ranks → any rank has work.
+          [1] = 1 if this rank has a pending pause request, else 0.
+                SUM == dp_size ≡ all ranks reached pause consensus.
+        has_unfinished_global is True if any rank has unfinished work, or if
+        some but not all ranks have a pending pause (consensus not yet reached).
+        Returns:
+            (has_unfinished_global, pause_consensus)
+        """
+        tensor = torch.tensor([int(has_unfinished), int(pending_pause)], dtype=torch.int32, device="cpu")
+        torch.distributed.all_reduce(tensor, op=ReduceOp.SUM, group=dp_group)
+        dp_size = dp_group.size()
+        pause_count = tensor[1].item()
+        has_unfinished_global = tensor[0].item() > 0 or pause_count % dp_size != 0
+        return has_unfinished_global, pause_count == dp_size
+
     @staticmethod
     def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
         if kv_cache_memory == -1:
@@ -686,6 +706,14 @@ def compute_hash(self):
             "worker_extension_cls",
             "_api_process_count",
             "_api_process_rank",
+            # NUMA binding is per-rank host-side memory locality; it does
+            # not affect collective-communication semantics. When numa_bind
+            # is enabled with auto-detection, each DP rank stores its own
+            # NUMA node in numa_bind_nodes (see aphrodite/utils/numa_utils.py
+            # `_get_numa_node`), which would otherwise diverge the DP hash.
+            "numa_bind",
+            "numa_bind_nodes",
+            "numa_bind_cpus",
         }
 
         from aphrodite.config.utils import get_hash_factors, hash_factors
diff --git a/aphrodite/config/speculative.py b/aphrodite/config/speculative.py
index 3f63c0e14d..9a4d513cba 100644
--- a/aphrodite/config/speculative.py
+++ b/aphrodite/config/speculative.py
@@ -5,7 +5,7 @@
 import copy
 from typing import TYPE_CHECKING, Any, Literal, get_args
 
-from pydantic import Field, SkipValidation, model_validator
+from pydantic import Field, SkipValidation, field_validator, model_validator
 from typing_extensions import Self
 
 from aphrodite.config.kernel import MoEBackend
@@ -17,6 +17,7 @@
 from aphrodite.transformers_utils.config import get_hf_text_config
 from aphrodite.utils.hashing import safe_hash
 from aphrodite.utils.import_utils import LazyLoader, has_arctic_inference
+from aphrodite.v1.attention.backends.registry import AttentionBackendEnum
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -32,6 +33,7 @@
 MTPModelTypes = Literal[
     "deepseek_mtp",
     "mimo_mtp",
+    "mimo_v2_mtp",
     "glm4_moe_mtp",
     "glm4_moe_lite_mtp",
     "glm_ocr_mtp",
@@ -101,6 +103,10 @@ class SpeculativeConfig:
     inherits the target model's `--moe-backend` setting. Useful when the
     drafter and generator require different MoE kernels (e.g. quantized
     generator with unquantized drafter)."""
+    attention_backend: AttentionBackendEnum | None = None
+    """Attention backend to use for the draft model. When `None`, the backend is
+    automatically selected. Useful when the drafter requires a different attention
+    backend (e.g. DFlash needs a non-causal-capable backend like FLASH_ATTN)."""
     max_model_len: int | None = Field(default=None, ge=1)
     """The maximum model length of the draft model. Used when testing the
     ability to skip speculation for some sequences."""
@@ -311,6 +317,48 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
                 }
             )
 
+        if (arch := hf_config.architectures[0]) in (
+            "MiMoV2ForCausalLM",
+            "MiMoV2OmniForCausalLM",
+        ):
+            from aphrodite.model_executor.models.mimo_v2_mtp import (
+                _MIMO_V2_PRO_NUM_MTP_LAYERS,
+            )
+
+            mtp_arch_maps = {
+                "MiMoV2ForCausalLM": "MiMoV2MTPModel",
+                "MiMoV2OmniForCausalLM": "MiMoV2OmniMTPModel",
+            }
+
+            hf_config.model_type = "mimo_v2_mtp"
+            # Aphrodite currently supports only the first MiMo-V2 MTP layer.
+            n_predict = _MIMO_V2_PRO_NUM_MTP_LAYERS
+            hf_config.update(
+                {
+                    "num_hidden_layers": 0,
+                    "n_predict": n_predict,
+                    "num_nextn_predict_layers": n_predict,
+                    "architectures": [mtp_arch_maps[arch]],
+                }
+            )
+
+        if hf_config.architectures[0] == "MiMoV2FlashForCausalLM":
+            from aphrodite.model_executor.models.mimo_v2_mtp import (
+                _MIMO_V2_FLASH_NUM_MTP_LAYERS,
+            )
+
+            hf_config.model_type = "mimo_v2_mtp"
+            # Aphrodite currently supports only the first MiMo-V2 MTP layer.
+            n_predict = _MIMO_V2_FLASH_NUM_MTP_LAYERS
+            hf_config.update(
+                {
+                    "num_hidden_layers": 0,
+                    "n_predict": n_predict,
+                    "num_nextn_predict_layers": n_predict,
+                    "architectures": ["MiMoV2MTPModel"],
+                }
+            )
+
         if hf_config.architectures[0] == "Glm4MoeForCausalLM":
             hf_config.model_type = "glm4_moe_mtp"
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
@@ -775,6 +823,15 @@ def create_draft_parallel_config(
 
         return draft_parallel_config
 
+    @field_validator("attention_backend", mode="before")
+    @classmethod
+    def _parse_attention_backend(cls, value: Any) -> Any:
+        if isinstance(value, str):
+            if value.lower() == "auto":
+                return None
+            return AttentionBackendEnum[value.upper()]
+        return value
+
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if self.tensor_parallel_size is not None:
diff --git a/aphrodite/distributed/device_communicators/all2all.py b/aphrodite/distributed/device_communicators/all2all.py
index ba4cf8740d..ec7bf9daee 100644
--- a/aphrodite/distributed/device_communicators/all2all.py
+++ b/aphrodite/distributed/device_communicators/all2all.py
@@ -10,7 +10,6 @@
 from aphrodite.distributed import get_dp_group, get_ep_group
 from aphrodite.forward_context import get_forward_context
 from aphrodite.logger import init_logger
-from aphrodite.platforms import current_platform
 from aphrodite.utils.flashinfer import (
     has_flashinfer_nvlink_one_sided,
     has_flashinfer_nvlink_two_sided,
@@ -218,11 +217,8 @@ def _make_all2all_kwargs(self) -> dict[Any, Any]:
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=False,
             num_qps_per_rank=num_qps_per_rank,
+            explicitly_destroy=True,
         )
-        if not current_platform.is_rocm():
-            kwargs.update(
-                explicitly_destroy=True,
-            )
         return kwargs
 
     def get_handle(self, kwargs):
@@ -293,13 +289,10 @@ def _make_all2all_kwargs(
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=True,
             num_qps_per_rank=num_qps_per_rank,
+            allow_nvlink_for_low_latency_mode=True,
+            allow_mnnvl=envs.APHRODITE_DEEPEP_LOW_LATENCY_USE_MNNVL,
+            explicitly_destroy=True,
         )
-        if not current_platform.is_rocm():
-            kwargs.update(
-                allow_nvlink_for_low_latency_mode=True,
-                allow_mnnvl=envs.APHRODITE_DEEPEP_LOW_LATENCY_USE_MNNVL,
-                explicitly_destroy=True,
-            )
         return kwargs
 
     def get_handle(self, kwargs):
@@ -552,6 +545,8 @@ def initialize(
         top_k: int,
         num_experts: int,
         hidden_size: int,
+        dispatch_dtype_bytes_per_elem: int = 0,
+        dispatch_scale_bytes_per_token: int = 0,
     ):
         """Initialize the MoeAlltoAll workspace."""
         if self.initialized:
@@ -582,9 +577,13 @@ def initialize(
         ep_config = MnnvlConfig(
             comm_backend=CustomCommunicator(self.cpu_group),
         )
+        if dispatch_dtype_bytes_per_elem == 0:
+            hidden_bytes = hidden_size // 2
+        else:
+            hidden_bytes = hidden_size * dispatch_dtype_bytes_per_elem
         total_dispatch_payload_size_per_token = (
-            hidden_size // 2  # nvfp4 hidden states
-            + hidden_size // 16  # fp8 scaling factors
+            hidden_bytes
+            + dispatch_scale_bytes_per_token
             + top_k * 4  # int32 topks ids
             + top_k * 4  # float32 topk weights
         )
diff --git a/aphrodite/distributed/eplb/eplb_communicator.py b/aphrodite/distributed/eplb/eplb_communicator.py
index 982908d724..372cce10ce 100644
--- a/aphrodite/distributed/eplb/eplb_communicator.py
+++ b/aphrodite/distributed/eplb/eplb_communicator.py
@@ -11,6 +11,7 @@
 from collections.abc import Sequence
 from datetime import timedelta
 
+import numpy as np
 import torch
 from torch.distributed import (
     P2POp,
@@ -47,15 +48,25 @@ class EplbCommunicator(ABC):
     """Abstract EPLB communicator for expert weight transfers."""
 
     @abstractmethod
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,
+    ) -> None:
         pass
 
     @abstractmethod
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,
+    ) -> None:
         pass
 
     @abstractmethod
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         pass
 
     @property
@@ -85,27 +96,39 @@ def __init__(
         self._p2p_ops: list[P2POp] = []
         self._log_initialized()
 
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
-        self._p2p_ops.append(
-            P2POp(
-                torch.distributed.isend,
-                tensor,
-                dst_rank,
-                self._ep_group,
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._p2p_ops.append(
+                P2POp(
+                    torch.distributed.isend,
+                    tensor,
+                    dst_rank,
+                    self._ep_group,
+                )
             )
-        )
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
-        self._p2p_ops.append(
-            P2POp(
-                torch.distributed.irecv,
-                tensor,
-                src_rank,
-                self._ep_group,
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._p2p_ops.append(
+                P2POp(
+                    torch.distributed.irecv,
+                    tensor,
+                    src_rank,
+                    self._ep_group,
+                )
             )
-        )
 
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         if not self._p2p_ops:
             return
         try:
@@ -130,13 +153,25 @@ def __init__(
         self._ops: list[tuple[str, torch.Tensor, int]] = []
         self._log_initialized()
 
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
-        self._ops.append(("send", tensor, dst_rank))
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._ops.append(("send", tensor, dst_rank))
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
-        self._ops.append(("recv", tensor, src_rank))
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
+        for tensor in tensors:
+            self._ops.append(("recv", tensor, src_rank))
 
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         if not self._ops:
             return
 
@@ -207,30 +242,29 @@ def __init__(
         self._cuda_stream = cuda_stream
         self._world_size = cpu_group.size()
         self._rank = cpu_group.rank()
-        self._send_tensors: dict[torch.dtype, list[list[torch.Tensor]]] = {}
-        self._recv_tensors: dict[torch.dtype, list[list[torch.Tensor]]] = {}
-        self._dtypes: list[torch.dtype] = []
+        # expert_id -> weight tensors to pack into the send buffer.
+        self._expert_send_map: dict[int, list[torch.Tensor]] = {}
+        # src_rank -> expert_id -> weight tensors to unpack after transfer.
+        self._recv_map: dict[int, dict[int, list[torch.Tensor]]] = {}
+        self._num_local_experts: int = expert_weights[0].shape[0]
         self._device = expert_weights[0].device
         for tensor in expert_weights:
             assert tensor.device == self._device, (
                 "All local EPLB tensors are expected to be on the same device: "
                 f"expected={self._device}, got={tensor.device}"
             )
-            if tensor.dtype not in self._dtypes:
-                self._dtypes.append(tensor.dtype)
 
         config = nixl_agent_config(capture_telemetry=False) if nixl_agent_config is not None else None
         self._nixl_wrapper = NixlWrapper(self._make_agent_name(), config)
         self._nixl_memory_type = "VRAM"
         self._registered_desc: object | None = None
         self._remote_agents: dict[int, str] = {}
-        self._remote_send_meta: dict[int, tuple[int, int, int]] = {}
+        self._remote_send_meta: dict[int, tuple[int, int]] = {}
         self._send_buffer: torch.Tensor = torch.empty(0)
         self._recv_buffer: torch.Tensor = torch.empty(0)
-        self._peer_partition_bytes: int = 0
-        self._dtype_max_bytes: dict[torch.dtype, int] = {}
+        self._expert_bytes: int = 0
+
         self._cuda_device_id = int(self._device.index or 0)
-        self._xfer_cache: dict[tuple[int, int, int], tuple[int, int, int]] = {}
         self._init_step("buffers", self._init_registered_buffers, expert_weights)
         self._init_step("agents", self._init_remote_agents)
         self._init_step("send meta", self._exchange_remote_send_meta)
@@ -254,28 +288,31 @@ def _make_agent_name(self) -> str:
         uid = uuid.uuid4().hex[:8]
         return f"eplb-{self._rank}{pp_suffix}-{uid}"
 
-    def _get_peer_buckets(
+    def add_send(
         self,
-        bucket_map: dict[torch.dtype, list[list[torch.Tensor]]],
-        dtype: torch.dtype,
-    ) -> list[list[torch.Tensor]]:
-        peer_buckets = bucket_map.get(dtype)
-        if peer_buckets is None:
-            peer_buckets = [[] for _ in range(self._world_size)]
-            bucket_map[dtype] = peer_buckets
-        return peer_buckets
-
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,
+    ) -> None:
         assert dst_rank != self._rank, (
             f"EPLB communicator should not enqueue same-rank sends: rank={self._rank}, dst_rank={dst_rank}"
         )
-        self._get_peer_buckets(self._send_tensors, tensor.dtype)[dst_rank].append(tensor)
+        # An expert sent to multiple peers is packed only once; skip duplicates.
+        if expert_id not in self._expert_send_map:
+            self._expert_send_map[expert_id] = tensors
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,
+    ) -> None:
         assert src_rank != self._rank, (
             f"EPLB communicator should not enqueue same-rank recvs: rank={self._rank}, src_rank={src_rank}"
         )
-        self._get_peer_buckets(self._recv_tensors, tensor.dtype)[src_rank].append(tensor)
+        recv_experts = self._recv_map.setdefault(src_rank, {})
+        if expert_id not in recv_experts:
+            recv_experts[expert_id] = tensors
 
     def _init_remote_agents(self) -> None:
         local_metadata = self._nixl_wrapper.get_agent_metadata()
@@ -289,25 +326,15 @@ def _init_remote_agents(self) -> None:
             self._remote_agents[peer] = self._nixl_wrapper.add_remote_agent(peer_metadata)
 
     def _init_registered_buffers(self, expert_weights: Sequence[torch.Tensor]) -> None:
-        total_max_bytes = 0
-        for dtype in self._dtypes:
-            max_numel = max(sum(t.numel() for t in expert_weights if t.dtype == dtype), 1)
-            max_bytes = max_numel * dtype.itemsize
-            self._dtype_max_bytes[dtype] = max_bytes
-            total_max_bytes += max_bytes
-
-        self._peer_partition_bytes = total_max_bytes
-
-        # The send buffer needs world_size partitions because remote peers
-        # READ from fixed offsets (rank * partition_bytes).
-        # This allocates world_size * partition_bytes
-        # which can cause OOM on large models.
-        # TODO(ilmarkov): shrink to const * partition_bytes and execute
-        # communication in multiple steps dealing with the worst case.
-        send_total_bytes = self._peer_partition_bytes * self._world_size
-
-        self._send_buffer = torch.empty(send_total_bytes, device=self._device, dtype=torch.uint8)
-        self._recv_buffer = torch.empty(self._peer_partition_bytes, device=self._device, dtype=torch.uint8)
+        total_bytes = max(sum(t.nbytes for t in expert_weights), 1)
+        assert total_bytes % self._num_local_experts == 0, (
+            f"Number of bytes in moe layer {total_bytes} is not divisible "
+            f"by number of local experts {self._num_local_experts}"
+        )
+        self._expert_bytes = total_bytes // self._num_local_experts
+
+        self._send_buffer = torch.empty(total_bytes, device=self._device, dtype=torch.uint8)
+        self._recv_buffer = torch.empty(total_bytes, device=self._device, dtype=torch.uint8)
 
         descs = self._nixl_wrapper.get_reg_descs([self._send_buffer, self._recv_buffer])
         self._nixl_wrapper.register_memory(descs)
@@ -316,12 +343,11 @@ def _init_registered_buffers(self, expert_weights: Sequence[torch.Tensor]) -> No
     def _exchange_remote_send_meta(self) -> None:
         """Exchange send-buffer metadata so each rank can build dynamic
         descriptors at execute time."""
-        local_meta: tuple[int, int, int] = (
+        local_meta: tuple[int, int] = (
             self._send_buffer.data_ptr(),
-            self._peer_partition_bytes,
             self._cuda_device_id,
         )
-        gathered_meta: list[tuple[int, int, int] | None] = [None] * self._world_size
+        gathered_meta: list[tuple[int, int] | None] = [None] * self._world_size
         torch.distributed.all_gather_object(gathered_meta, local_meta, group=self._cpu_group)
 
         for peer in self._remote_agents:
@@ -331,31 +357,24 @@ def _exchange_remote_send_meta(self) -> None:
 
     @staticmethod
     def _pack_send_buffer(
-        peer_tensors: list[torch.Tensor],
+        in_tensors: list[torch.Tensor],
         send_buffer: torch.Tensor,
         byte_offset: int,
-    ) -> int:
-        """
-        Returns the byte offset after the last written byte.
-        """
-        for tensor in peer_tensors:
+    ) -> None:
+        for tensor in in_tensors:
             raw = tensor.reshape(-1).view(torch.uint8)
             if raw.numel() == 0:
                 continue
             send_buffer[byte_offset : byte_offset + raw.numel()].copy_(raw, non_blocking=True)
             byte_offset += raw.numel()
-        return byte_offset
 
     @staticmethod
     def _unpack_recv_buffer(
         recv_buffer: torch.Tensor,
-        peer_tensors: list[torch.Tensor],
+        out_tensors: list[torch.Tensor],
         byte_offset: int,
-    ) -> int:
-        """
-        Returns the byte offset after the last read byte.
-        """
-        for tensor in peer_tensors:
+    ) -> None:
+        for tensor in out_tensors:
             num_bytes = tensor.numel() * tensor.element_size()
             if num_bytes == 0:
                 continue
@@ -364,19 +383,6 @@ def _unpack_recv_buffer(
                 non_blocking=True,
             )
             byte_offset += num_bytes
-        return byte_offset
-
-    def _release_all_cached_handles(self) -> None:
-        """Best-effort release of every cached dlist and xfer handle."""
-        for local_dlist, remote_dlist, xfer in self._xfer_cache.values():
-            for release_fn, handle in (
-                (self._nixl_wrapper.release_xfer_handle, xfer),
-                (self._nixl_wrapper.release_dlist_handle, local_dlist),
-                (self._nixl_wrapper.release_dlist_handle, remote_dlist),
-            ):
-                with contextlib.suppress(Exception):
-                    release_fn(handle)
-        self._xfer_cache.clear()
 
     def _wait_for_all_transfers(self, handles: list[int]) -> None:
         pending = set(handles)
@@ -394,78 +400,59 @@ def _wait_for_all_transfers(self, handles: list[int]) -> None:
             if pending:
                 time.sleep(0.0005)
 
-    def _get_or_create_xfer(self, src: int, total_bytes: int, recv_offset: int) -> int:
-        """Return a cached xfer handle or create and cache a new one."""
-        key = (src, total_bytes, recv_offset)
-        cached = self._xfer_cache.get(key)
-        if cached is not None:
-            return cached[2]
-
-        recv_base = self._recv_buffer.data_ptr()
-        local_desc = self._nixl_wrapper.get_xfer_descs(
-            [
-                (
-                    recv_base + recv_offset,
-                    total_bytes,
-                    self._cuda_device_id,
-                )
-            ],
-            self._nixl_memory_type,
-        )
+    def _create_peer_xfer(
+        self,
+        src: int,
+        local_descs: list[tuple[int, int, int]],
+        remote_descs: list[tuple[int, int, int]],
+    ) -> tuple[int, int, int]:
+        """Create a batched xfer for multiple descriptors from one peer.
+
+        Each element in *local_descs* / *remote_descs* is an
+        ``(address, size, device_id)`` tuple.
+
+        Returns ``(local_dlist, remote_dlist, xfer_handle)``.
+        """
+        local_desc = self._nixl_wrapper.get_xfer_descs(local_descs, self._nixl_memory_type)
         local_handle = self._nixl_wrapper.prep_xfer_dlist(
             "NIXL_INIT_AGENT",
             local_desc,
         )
 
-        remote_base, remote_part_bytes, remote_dev = self._remote_send_meta[src]
-        agent_name = self._remote_agents[src]
-        remote_desc = self._nixl_wrapper.get_xfer_descs(
-            [
-                (
-                    remote_base + self._rank * remote_part_bytes,
-                    total_bytes,
-                    remote_dev,
-                )
-            ],
-            self._nixl_memory_type,
-        )
+        remote_desc = self._nixl_wrapper.get_xfer_descs(remote_descs, self._nixl_memory_type)
         remote_handle = self._nixl_wrapper.prep_xfer_dlist(
-            agent_name,
+            self._remote_agents[src],
             remote_desc,
         )
 
+        indices = list(range(len(local_descs)))
         xfer_handle = self._nixl_wrapper.make_prepped_xfer(
             "READ",
             local_handle,
-            [0],
+            indices,
             remote_handle,
-            [0],
+            indices,
         )
-        self._xfer_cache[key] = (local_handle, remote_handle, xfer_handle)
-        return xfer_handle
+        return (local_handle, remote_handle, xfer_handle)
+
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
+        assert old_indices is not None, "NixlEplbCommunicator.execute requires old_indices"
 
-    def execute(self) -> None:
-        xfer_handles: list[int] = []
+        xfer_entries: list[tuple[int, int, int]] = []
         try:
-            # Phase 1: pack send buffers.
+            n = self._num_local_experts
+            rank_experts = old_indices[: self._world_size * n].reshape(self._world_size, n)
+            # Build expert_id -> send slot mapping per rank.
+            expert_to_send_slot: list[dict[int, int]] = [
+                {int(eid): i for i, eid in enumerate(row) if eid != -1} for row in rank_experts
+            ]
+
+            # Phase 1: pack each expert at its slot offset in the send buffer.
             with torch.cuda.stream(self._cuda_stream):
-                for dst in range(self._world_size):
-                    byte_offset = dst * self._peer_partition_bytes
-                    for dtype in self._dtypes:
-                        peer_tensors = self._send_tensors.get(dtype, [[] for _ in range(self._world_size)])[dst]
-                        actual_bytes = sum(t.numel() * t.element_size() for t in peer_tensors)
-                        if actual_bytes > self._dtype_max_bytes[dtype]:
-                            raise RuntimeError(
-                                "NIXL EPLB send overflow for dtype "
-                                f"{dtype}: peer={dst}, "
-                                f"required={actual_bytes}, "
-                                f"capacity={self._dtype_max_bytes[dtype]}"
-                            )
-                        byte_offset = self._pack_send_buffer(
-                            peer_tensors,
-                            self._send_buffer,
-                            byte_offset,
-                        )
+                for expert_id, tensors in self._expert_send_map.items():
+                    slot = expert_to_send_slot[self._rank][expert_id]
+                    byte_offset = slot * self._expert_bytes
+                    self._pack_send_buffer(tensors, self._send_buffer, byte_offset)
 
             # Ensure all packed data is visible in device memory before pulls.
             if self._cuda_stream is not None:
@@ -480,50 +467,61 @@ def execute(self) -> None:
                 timeout=timedelta(minutes=5),
             )
 
-            # Phase 2: look up or create descriptors and issue all READs.
-            # Data from all peers is packed sequentially into the single
-            # partition-sized recv buffer at running offsets.
-            recv_offsets: dict[int, int] = {}
+            # Phase 2: issue one batched READ per peer.
+            recv_offsets: dict[tuple[int, int], int] = {}
             recv_offset = 0
+            recv_base = self._recv_buffer.data_ptr()
             for src in range(self._world_size):
                 if src == self._rank:
                     continue
-                actual_total_bytes = 0
-                for dtype in self._dtypes:
-                    peer_tensors = self._recv_tensors.get(dtype, [[] for _ in range(self._world_size)])[src]
-                    actual_total_bytes += sum(t.numel() * t.element_size() for t in peer_tensors)
-                if actual_total_bytes == 0:
+                recv_experts = self._recv_map.get(src)
+                if not recv_experts:
                     continue
+                expert_ids = list(recv_experts.keys())
+                remote_base, remote_dev = self._remote_send_meta[src]
+                local_descs: list[tuple[int, int, int]] = []
+                remote_descs: list[tuple[int, int, int]] = []
+                for expert_id in expert_ids:
+                    slot = expert_to_send_slot[src][expert_id]
+                    remote_off = slot * self._expert_bytes
+                    recv_offsets[(src, expert_id)] = recv_offset
+                    local_descs.append(
+                        (
+                            recv_base + recv_offset,
+                            self._expert_bytes,
+                            self._cuda_device_id,
+                        )
+                    )
+                    remote_descs.append((remote_base + remote_off, self._expert_bytes, remote_dev))
+                    recv_offset += self._expert_bytes
+                    assert recv_offset <= self._recv_buffer.nbytes
+                local_h, remote_h, xfer_h = self._create_peer_xfer(src, local_descs, remote_descs)
+                self._nixl_wrapper.transfer(xfer_h)
+                xfer_entries.append((local_h, remote_h, xfer_h))
 
-                recv_offsets[src] = recv_offset
-                xfer_handle = self._get_or_create_xfer(src, actual_total_bytes, recv_offset)
-                self._nixl_wrapper.transfer(xfer_handle)
-                xfer_handles.append(xfer_handle)
-                recv_offset += actual_total_bytes
-
-            # Phase 3: single wait for all in-flight transfers, then unpack.
-            self._wait_for_all_transfers(xfer_handles)
+            # Phase 3: wait for all in-flight transfers, then unpack.
+            self._wait_for_all_transfers([x[2] for x in xfer_entries])
 
             with torch.cuda.stream(self._cuda_stream):
-                for src, offset in recv_offsets.items():
-                    byte_offset = offset
-                    for dtype in self._dtypes:
-                        peer_tensors = self._recv_tensors.get(dtype, [[] for _ in range(self._world_size)])[src]
-                        byte_offset = self._unpack_recv_buffer(
-                            self._recv_buffer,
-                            peer_tensors,
-                            byte_offset,
-                        )
-        except Exception:
-            self._release_all_cached_handles()
-            raise
+                for (src, expert_id), offset in recv_offsets.items():
+                    self._unpack_recv_buffer(
+                        self._recv_buffer,
+                        self._recv_map[src][expert_id],
+                        offset,
+                    )
         finally:
-            self._send_tensors.clear()
-            self._recv_tensors.clear()
+            for local_h, remote_h, xfer_h in xfer_entries:
+                with contextlib.suppress(Exception):
+                    self._nixl_wrapper.release_xfer_handle(xfer_h)
+                with contextlib.suppress(Exception):
+                    self._nixl_wrapper.release_dlist_handle(local_h)
+                with contextlib.suppress(Exception):
+                    self._nixl_wrapper.release_dlist_handle(remote_h)
+            self._expert_send_map.clear()
+            self._recv_map.clear()
 
     def __del__(self) -> None:
         try:
-            self._release_all_cached_handles()
             if self._registered_desc is not None:
                 self._nixl_wrapper.deregister_memory(self._registered_desc)
                 self._registered_desc = None
@@ -552,15 +550,27 @@ def _ensure_group_started(self) -> None:
             self._pynccl_comm.group_start()
             self._group_started = True
 
-    def add_send(self, tensor: torch.Tensor, dst_rank: int) -> None:
+    def add_send(
+        self,
+        tensors: list[torch.Tensor],
+        dst_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
         self._ensure_group_started()
-        self._pynccl_comm.send(tensor, dst_rank, stream=self._cuda_stream)
+        for tensor in tensors:
+            self._pynccl_comm.send(tensor, dst_rank, stream=self._cuda_stream)
 
-    def add_recv(self, tensor: torch.Tensor, src_rank: int) -> None:
+    def add_recv(
+        self,
+        tensors: list[torch.Tensor],
+        src_rank: int,
+        expert_id: int,  # unused by this backend
+    ) -> None:
         self._ensure_group_started()
-        self._pynccl_comm.recv(tensor, src_rank, stream=self._cuda_stream)
+        for tensor in tensors:
+            self._pynccl_comm.recv(tensor, src_rank, stream=self._cuda_stream)
 
-    def execute(self) -> None:
+    def execute(self, old_indices: np.ndarray | None = None) -> None:
         if self._group_started:
             self._pynccl_comm.group_end()
             self._group_started = False
diff --git a/aphrodite/distributed/eplb/rebalance_execute.py b/aphrodite/distributed/eplb/rebalance_execute.py
index 14c58e460f..2313ac2988 100644
--- a/aphrodite/distributed/eplb/rebalance_execute.py
+++ b/aphrodite/distributed/eplb/rebalance_execute.py
@@ -280,9 +280,9 @@ def move_to_buffer(
             recver_pos = remainder_start + sender_pos
             if recver_pos < len(ranks_to_recv):
                 recv_ranks.append(ranks_to_recv[recver_pos])
+            expert_tensors = [w[src] for w in expert_weights]
             for dst in recv_ranks:
-                for w in expert_weights:
-                    communicator.add_send(w[src], dst)
+                communicator.add_send(expert_tensors, dst, expert_id=int(expert))
 
     # 3. Post recvs
     if recv_count > 0:
@@ -311,11 +311,14 @@ def move_to_buffer(
                 src = ranks_to_send[recver_pos // num_dst_per_sender]
             else:
                 src = ranks_to_send[recver_pos - remainder_start]
-            for b in expert_weights_buffers:
-                communicator.add_recv(b[dst], src)
+            communicator.add_recv(
+                [b[dst] for b in expert_weights_buffers],
+                src,
+                expert_id=int(expert),
+            )
 
     # 4. Execute the P2P operations. The real communication happens here.
-    communicator.execute()
+    communicator.execute(old_indices=old_indices)
     # wait for the communication to finish
     return TransferMetadata(
         is_unchanged=is_unchanged,
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 258efb72fa..00da7d07ff 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import torch
 
@@ -18,6 +18,8 @@
     KVConnectorMetadata,
     KVConnectorRole,
     KVConnectorWorkerMetadata,
+    SupportsHMA,
+    supports_hma,
 )
 from aphrodite.distributed.kv_transfer.kv_connector.v1.metrics import (
     KVConnectorPromMetrics,
@@ -121,7 +123,7 @@ def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
             self._prom_metrics[connector_id].observe(stats_data["data"], engine_idx)
 
 
-class MultiConnector(KVConnectorBase_V1):
+class MultiConnector(KVConnectorBase_V1, SupportsHMA):
     """
     A wrapper for using multiple KVConnectors at the same time.
 
@@ -160,6 +162,11 @@ def __init__(
             self._connectors.append(connector_cls(temp_config, role, kv_cache_config))
             self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
 
+        self._all_support_hma = all(supports_hma(c) for c in self._connectors)
+        assert aphrodite_config.scheduler_config.disable_hybrid_kv_cache_manager or self._all_support_hma, (
+            "HMA should not be enabled unless all sub-connectors support it"
+        )
+
         # A mapping from request id to the index of the connector chosen to
         # load the request from (if any).
         self._requests_to_connector: dict[str, int] = {}
@@ -406,15 +413,15 @@ def set_xfer_handshake_metadata(self, metadata: dict[int, KVConnectorHandshakeMe
         for c in self._connectors:
             c.set_xfer_handshake_metadata(metadata)
 
-    def request_finished(
+    def _aggregate_request_finished(
         self,
         request: "Request",
-        blocks: list[int],
+        per_connector_fn: Callable[[KVConnectorBase_V1], tuple[bool, dict[str, Any] | None]],
     ) -> tuple[bool, dict[str, Any] | None]:
         async_saves = 0
         kv_txfer_params = None
         for c in self._connectors:
-            async_save, txfer_params = c.request_finished(request, blocks)
+            async_save, txfer_params = per_connector_fn(c)
             if async_save:
                 async_saves += 1
             if txfer_params is not None:
@@ -426,11 +433,34 @@ def request_finished(
         if async_saves > 1:
             self._extra_async_saves[request.request_id] = async_saves - 1
 
-        # Clean up other state for this request.
         self._requests_to_connector.pop(request.request_id, None)
 
         return async_saves > 0, kv_txfer_params
 
+    def request_finished(
+        self,
+        request: "Request",
+        blocks: list[int],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        return self._aggregate_request_finished(
+            request,
+            lambda c: c.request_finished(request, blocks),
+        )
+
+    def request_finished_all_groups(
+        self,
+        request: "Request",
+        block_ids: tuple[list[int], ...],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        if not self._all_support_hma:
+            assert len(block_ids) == 1, "HMA with multiple kv_cache_groups requires all sub-connectors to support HMA"
+            return self.request_finished(request, block_ids[0])
+
+        return self._aggregate_request_finished(
+            request,
+            lambda c: cast(SupportsHMA, c).request_finished_all_groups(request, block_ids),
+        )
+
     def take_events(self) -> Iterable["KVCacheEvent"]:
         for c in self._connectors:
             yield from c.take_events()
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
index a890fa8a04..3e6057bf89 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/scheduler.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Copyright contributors to the aphrodite project
 """Scheduler-side logic for the NIXL connector."""
 
 import threading
@@ -109,6 +110,25 @@ def __init__(
             cdiv(n_tokens, block_size) + 1 if n_tokens else 0 for n_tokens, block_size in sw_sizes_tokens
         ]
 
+        # Threshold for deciding whether to compute the KV cache locally
+        # or pull it from a remote node: the minimum number of remote
+        # tokens needed to amortize the transfer latency.
+        self.kv_recompute_threshold: int = int(
+            aphrodite_config.kv_transfer_config.get_from_extra_config("kv_recompute_threshold", 64)
+        )
+
+        # The bidirectional KV transfer feature supports KV block
+        # transfers from the D node to the P node.
+        self.is_bidirectional_kv_xfer_enabled = aphrodite_config.kv_transfer_config.get_from_extra_config(
+            "bidirectional_kv_xfer", False
+        )
+
+        if self.is_bidirectional_kv_xfer_enabled and self.kv_recompute_threshold > 0:
+            logger.info(
+                "Bidirectional KV transfer is enabled and the kv recompute threshold is set to %d tokens",
+                self.kv_recompute_threshold,
+            )
+
     def shutdown(self):
         self._stop_event.set()
         if self._nixl_handshake_listener_t is not None:
@@ -276,6 +296,39 @@ def get_num_new_matched_tokens(self, request: "Request", num_computed_tokens: in
         if params is not None and params.get("do_remote_decode") and self._has_mamba:
             self._truncate_mamba_request_for_prefill(request)
 
+        if (
+            params is not None
+            and params.get("do_remote_decode")
+            and params.get("remote_block_ids")
+            and all(
+                p in params
+                for p in (
+                    "remote_engine_id",
+                    "remote_request_id",
+                    "remote_host",
+                    "remote_port",
+                )
+            )
+        ):
+            # The decode node already holds KV blocks for part of this prefill
+            # request, so report them to the scheduler as an external token count.
+            # The tokens are loaded only if they are not already present in the
+            # prefill node's local cache.
+            remote_num_tokens = params.get("remote_num_tokens") or 0
+            count = min(remote_num_tokens, request.num_prompt_tokens) - num_computed_tokens
+            if count > 0:
+                # Check kv_recompute_threshold: skip pull if
+                # remote tokens are below the threshold.
+                if self.kv_recompute_threshold > 0 and count < self.kv_recompute_threshold:
+                    logger.debug(
+                        "Skipping remote pull for %s: %d remote tokens < threshold %d",
+                        request.request_id,
+                        count,
+                        self.kv_recompute_threshold,
+                    )
+                    return 0, False
+                return count, True
+
         # No remote prefill for this request.
         return 0, False
 
@@ -290,13 +343,19 @@ def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks",
         if not params:
             return
 
-        if params.get("do_remote_decode"):
+        if params.get("do_remote_decode") or (
+            params.get("do_remote_prefill") and self.is_bidirectional_kv_xfer_enabled
+        ):
             self._reqs_in_batch.add(request.request_id)
         if self.use_host_buffer and params.get("do_remote_decode"):
             # NOTE: when accelerator is not directly supported by Nixl,
             # prefilled blocks need to be saved to host memory before transfer.
             self._reqs_need_save[request.request_id] = request
-        elif params.get("do_remote_prefill"):
+        elif params.get("do_remote_prefill") or (
+            params.get("do_remote_decode")
+            and self.is_bidirectional_kv_xfer_enabled
+            and not params.get("_remote_blocks_processed")
+        ):
             if params.get("remote_block_ids"):
                 if all(
                     p in params
@@ -308,8 +367,8 @@ def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks",
                     )
                 ):
                     # If remote_blocks and num_external_tokens = 0, we have
-                    # a full prefix cache hit on the D worker. We need to call
-                    # send_notif in _read_blocks to free the memory on the P.
+                    # a full prefix cache hit on the local node. We need to call
+                    # send_notif in _read_blocks to free the memory on the remote node.
 
                     unhashed_local_block_ids: BlockIds = (
                         blocks.get_unhashed_block_ids_all_groups() if num_external_tokens > 0 else ()
@@ -332,6 +391,7 @@ def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks",
                 assert num_external_tokens == 0
             # Only trigger 1 KV transfer per request.
             params["do_remote_prefill"] = False
+            params["_remote_blocks_processed"] = True
 
     def _build_save_meta(
         self,
@@ -417,6 +477,9 @@ def request_finished(
         if not params:
             return False, None
 
+        is_p_node = bool(params.get("do_remote_decode"))
+        is_d_node = not is_p_node
+
         if params.get("do_remote_prefill"):
             # If do_remote_prefill is still True when the request is finished,
             # update_state_after_alloc must not have been called (the request
@@ -428,9 +491,13 @@ def request_finished(
             params["do_remote_prefill"] = False
             return False, None
 
-        if not params.get("do_remote_decode"):
+        if is_d_node and not self.is_bidirectional_kv_xfer_enabled:
             return False, None
-        if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
+
+        if request.status not in (
+            RequestStatus.FINISHED_LENGTH_CAPPED,
+            RequestStatus.FINISHED_STOPPED,
+        ):
             # Also include the case of a P/D Prefill request with immediate
             # block free (eg abort). Stop tracking this request.
             self._reqs_not_processed.add(request.request_id)
@@ -441,6 +508,7 @@ def request_finished(
         # TODO: check whether block_ids actually ever be 0. If not we could
         # remove the conditional below
         delay_free_blocks = any(len(group) > 0 for group in block_ids)
+        remote_num_tokens = 0
 
         if delay_free_blocks:
             # Prefill request on remote. It will be read from D upon completion
@@ -456,13 +524,16 @@ def request_finished(
             # Here we "unpad" blocks to send the actual remote blocks to be read.
             block_ids = self.get_sw_clipped_blocks(block_ids)
 
+            remote_num_tokens = request.num_computed_tokens
+
         return delay_free_blocks, dict(
-            do_remote_prefill=True,
-            do_remote_decode=False,
+            do_remote_prefill=is_p_node,
+            do_remote_decode=is_d_node,
             remote_block_ids=block_ids,
             remote_engine_id=self.engine_id,
             remote_request_id=request.request_id,
             remote_host=self.side_channel_host,
             remote_port=self.side_channel_port,
             tp_size=self.aphrodite_config.parallel_config.tensor_parallel_size,
+            remote_num_tokens=remote_num_tokens,
         )
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/worker.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
index f2e7c5f98b..064b5a0022 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/nixl/worker.py
@@ -1746,7 +1746,7 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
             if self.use_mla and tp_ratio < 0:
                 # ..but we still need to notify the other remote ranks that we
                 # have the blocks we need so they can update the request state.
-                notif_id = f"{req_id}:{self.world_size}".encode()
+                notif_id = f"{meta.remote.request_id}:{self.world_size}".encode()
                 remote_agents = self._remote_agents[meta.remote.engine_id]
                 for rank_to_notify, agent in remote_agents.items():
                     if rank_to_notify != remote_rank:
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/common.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/common.py
index 3e8567a37b..601f22c44b 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/common.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/common.py
@@ -1,15 +1,56 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
-from aphrodite.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
+from aphrodite.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorMetadata,
+    KVConnectorWorkerMetadata,
+)
 from aphrodite.v1.kv_offload.worker.worker import TransferSpec
 
 ReqId = str
 
 
+@dataclass
+class TransferJob:
+    """A transfer job bundling request context with transfer spec.
+    Used for both loads and stores, keyed by scheduler-assigned job ID.
+    The worker reports the job ID back when the transfer finishes,
+    and the scheduler processes the completion.
+    """
+
+    req_id: ReqId
+    transfer_spec: TransferSpec
+
+
 @dataclass
 class OffloadingConnectorMetadata(KVConnectorMetadata):
-    reqs_to_load: dict[ReqId, TransferSpec]
-    reqs_to_store: dict[ReqId, TransferSpec]
-    reqs_to_flush: set[str] | None = None
+    # Keyed by scheduler-assigned job IDs.
+    load_jobs: dict[int, TransferJob]
+    store_jobs: dict[int, TransferJob]
+    jobs_to_flush: set[int] | None = None
+
+
+@dataclass
+class OffloadingWorkerMetadata(KVConnectorWorkerMetadata):
+    """Worker -> Scheduler metadata for completed transfer jobs.
+    Each worker reports {job_id: 1} for newly completed transfer jobs
+    (load or store). aggregate() sums counts across workers within a step.
+    The scheduler accumulates across steps and processes
+    a transfer completion only when count reaches num_workers.
+    """
+
+    completed_jobs: dict[int, int] = field(default_factory=dict)
+
+    def mark_completed(self, job_id: int) -> None:
+        """Record a transfer job completion from this worker."""
+        self.completed_jobs[job_id] = 1
+
+    def aggregate(self, other: "KVConnectorWorkerMetadata") -> "KVConnectorWorkerMetadata":
+        assert isinstance(other, OffloadingWorkerMetadata)
+
+        merged = dict(self.completed_jobs)
+        for job_id, v in other.completed_jobs.items():
+            merged[job_id] = merged.get(job_id, 0) + v
+
+        return OffloadingWorkerMetadata(completed_jobs=merged)
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
index bc29bc5959..f2cdfaa642 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections import defaultdict
 from collections.abc import Iterable, Sequence
 from dataclasses import dataclass, field
 from itertools import islice
@@ -11,48 +10,95 @@
 from aphrodite.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from aphrodite.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
     ReqId,
+    TransferJob,
 )
 from aphrodite.logger import init_logger
 from aphrodite.utils.math_utils import cdiv
 from aphrodite.v1.core.kv_cache_manager import KVCacheBlocks
 from aphrodite.v1.core.sched.output import SchedulerOutput
-from aphrodite.v1.kv_offload.abstract import (
+from aphrodite.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    KVCacheSpec,
+    MambaSpec,
+    SlidingWindowSpec,
+)
+from aphrodite.v1.kv_offload.base import (
+    GPULoadStoreSpec,
     OffloadingManager,
+    OffloadingSpec,
     OffloadKey,
     ReqContext,
     get_offload_block_hash,
     make_offload_key,
 )
-from aphrodite.v1.kv_offload.mediums import GPULoadStoreSpec
-from aphrodite.v1.kv_offload.spec import OffloadingSpec
-from aphrodite.v1.kv_offload.worker.worker import TransferSpec
 from aphrodite.v1.outputs import KVConnectorOutput
 from aphrodite.v1.request import Request
 
 logger = init_logger(__name__)
 
 
+@dataclass(slots=True)
+class TransferJobStatus:
+    """Tracks scheduler-side state for a single transfer job."""
+
+    req_id: ReqId
+    # Number of workers still pending. Starts at num_workers,
+    # decremented as each worker reports completion. Job is done at 0.
+    pending_count: int
+    # Offload keys this job covers; passed to manager.complete_*().
+    keys: set[OffloadKey]
+    is_store: bool
+    # Store src block IDs whose ref_cnt protects them while the request
+    # runs. Only registered in _block_id_to_pending_jobs on request_finished.
+    non_sliding_window_block_ids: list[int] | None = None
+    # Store src block IDs that may be freed before the request finishes.
+    # Registered in _block_id_to_pending_jobs at store creation time.
+    sliding_window_block_ids: list[int] | None = None
+
+
 class GroupOffloadConfig(NamedTuple):
     group_idx: int
     gpu_block_size: int
     offloaded_block_size: int
     hash_block_size_factor: int
+    # None below means full attention
+    sliding_window_size_in_blocks: int | None
+
+
+def get_sliding_window_size_in_blocks(kv_cache_spec: KVCacheSpec, offloaded_block_size: int) -> int | None:
+    if isinstance(kv_cache_spec, SlidingWindowSpec):
+        assert kv_cache_spec.sliding_window > 0
+        return cdiv(kv_cache_spec.sliding_window, offloaded_block_size)
+
+    if isinstance(kv_cache_spec, MambaSpec):
+        # Mamba depends on a single state
+        return 1
+
+    assert isinstance(kv_cache_spec, FullAttentionSpec)
+    return None
 
 
 class SchedulerOffloadConfig(NamedTuple):
     kv_group_configs: tuple[GroupOffloadConfig, ...]
     block_size_factor: int
+    num_workers: int
 
     @classmethod
     def from_spec(cls, spec: OffloadingSpec) -> "SchedulerOffloadConfig":
         return cls(
+            num_workers=spec.aphrodite_config.parallel_config.world_size,
             kv_group_configs=tuple(
                 GroupOffloadConfig(
                     group_idx=idx,
                     gpu_block_size=gpu_block_size,
                     offloaded_block_size=gpu_block_size * spec.block_size_factor,
                     hash_block_size_factor=((gpu_block_size * spec.block_size_factor) // spec.hash_block_size),
+                    sliding_window_size_in_blocks=get_sliding_window_size_in_blocks(
+                        spec.kv_cache_config.kv_cache_groups[idx].kv_cache_spec,
+                        gpu_block_size * spec.block_size_factor,
+                    ),
                 )
                 for idx, gpu_block_size in enumerate(spec.gpu_block_size)
             ),
@@ -66,6 +112,9 @@ class RequestGroupState:
     block_ids: list[int] = field(default_factory=list)
     # index of next block (of size offloaded_block_size) to offload
     next_stored_block_idx: int = 0
+    # number of offloaded blocks hit (including GPU prefix cache)
+    # when the request first started
+    num_hit_blocks: int = 0
 
 
 @dataclass(slots=True)
@@ -76,6 +125,9 @@ class RequestOffloadState:
     req_context: ReqContext = field(init=False)
     # number of hits in the GPU cache
     num_locally_computed_tokens: int = 0
+    # In-flight job IDs. Per the connector's invariant, at any given time
+    # this contains either a single load job, or one or more store jobs.
+    transfer_jobs: set[int] = field(default_factory=set)
 
     def __post_init__(self) -> None:
         self.group_states = tuple(RequestGroupState() for _ in self.config.kv_group_configs)
@@ -106,6 +158,10 @@ def advance_stored_idx(self, num_offloadable_tokens: int) -> None:
             num_blocks = num_offloadable_tokens // group_config.offloaded_block_size
             group_state.next_stored_block_idx = num_blocks
 
+    def update_num_hit_blocks(self, num_cached_tokens: int) -> None:
+        for group_config, group_state in zip(self.config.kv_group_configs, self.group_states):
+            group_state.num_hit_blocks = num_cached_tokens // group_config.offloaded_block_size
+
 
 class OffloadingConnectorScheduler:
     """Implementation of Scheduler side methods"""
@@ -114,28 +170,61 @@ def __init__(self, spec: OffloadingSpec):
         self.config = SchedulerOffloadConfig.from_spec(spec)
         self.manager: OffloadingManager = spec.get_manager()
 
-        attention_groups: list[int] = []
-        for idx, _ in enumerate(spec.kv_cache_config.kv_cache_groups):
-            # currently treat all groups as full attention
-            attention_groups.append(idx)
+        full_attention_groups: list[int] = []
+        sliding_window_groups: list[int] = []
+        for group_config in self.config.kv_group_configs:
+            if group_config.sliding_window_size_in_blocks is None:
+                full_attention_groups.append(group_config.group_idx)
+            else:
+                sliding_window_groups.append(group_config.group_idx)
+
+        # sort sliding window groups by window size in decreasing order
+        def _sliding_window_sort_key(i: int) -> int:
+            val = self.config.kv_group_configs[i].sliding_window_size_in_blocks
+            assert val is not None
+            return val
+
+        sliding_window_groups.sort(key=_sliding_window_sort_key, reverse=True)
 
-        self.lookup_groups = attention_groups
+        # used by _lookup
+        self._sliding_window_groups: tuple[int, ...] = tuple(sliding_window_groups)
+        self._lookup_groups = tuple(full_attention_groups) + self._sliding_window_groups
 
         self._req_status: dict[ReqId, RequestOffloadState] = {}
-        # requests to load for the current scheduler step
-        self._reqs_to_load: dict[ReqId, TransferSpec] = {}
+        self._current_batch_load_jobs: dict[int, TransferJob] = {}
+        self._current_batch_jobs_to_flush: set[int] = set()
         # if GPU prefix caching is enabled,
         # track loaded blocks to avoid redundant loads
         self._blocks_being_loaded: set[OffloadKey] | None = (
             set() if spec.aphrodite_config.cache_config.enable_prefix_caching else None
         )
 
-        # request ID -> set(offload keys being stored/loaded)
-        self._reqs_being_stored = defaultdict[ReqId, set[OffloadKey]](set)
-        self._reqs_being_loaded = defaultdict[ReqId, set[OffloadKey]](set)
+        # Job ID counter shared by loads and stores.
+        self._job_counter: int = 0
+        self._jobs: dict[int, TransferJobStatus] = {}
+
+        # block_id -> pending store job_ids. Used to track jobs that need
+        # flushing in case a block is re-allocated by the KV cache manager.
+        # Populated only for finished requests (running-request blocks are
+        # protected by their ref_cnt) and for sliding window blocks (which can
+        # be freed before a request finishes).
+        self._block_id_to_pending_jobs: dict[int, set[int]] = {}
+
+    def _generate_job_id(self) -> int:
+        job_id = self._job_counter
+        self._job_counter += 1
+        return job_id
+
+    def _remove_pending_job(self, job_id: int, block_ids: list[int] | None) -> None:
+        for bid in block_ids or ():
+            pending = self._block_id_to_pending_jobs[bid]
+            pending.remove(job_id)
+            if not pending:
+                del self._block_id_to_pending_jobs[bid]
 
     def _maximal_prefix_lookup(self, keys: Iterable[OffloadKey], req_context: ReqContext) -> int | None:
-        """Find the length of the maximal prefix of offloaded blocks."""
+        """Return the number of consecutive offloaded blocks from the start,
+        or None if the backend deferred a lookup."""
         hit_count = 0
         defer_lookup = False
         for key in keys:
@@ -156,8 +245,9 @@ def _sliding_window_lookup(
         sliding_window_size: int,
         req_context: ReqContext,
     ) -> int | None:
-        """Find the maximal ending position of consecutive offloaded blocks
-        within a sliding window."""
+        """Return the end index (in `keys`) of the last run of
+        `sliding_window_size` consecutive hits, scanning from the end.
+        Returns 0 on miss, None if the backend deferred a lookup."""
         defer_lookup = False
         consecutive_hits = 0
         for idx in range(len(keys) - 1, -1, -1):
@@ -175,6 +265,137 @@ def _sliding_window_lookup(
                     return idx + sliding_window_size if not defer_lookup else None
         return consecutive_hits if not defer_lookup else None
 
+    def _touch(self, req_status: RequestOffloadState):
+        for group_config, group_state in zip(self.config.kv_group_configs, req_status.group_states):
+            if group_config.sliding_window_size_in_blocks is None:
+                self.manager.touch(group_state.offload_keys)
+            else:
+                # we aim to keep just blocks that are necessary to hit
+                # the original request (+ decoded blocks)
+                blocks_to_skip = max(
+                    0,
+                    group_state.num_hit_blocks - group_config.sliding_window_size_in_blocks,
+                )
+                self.manager.touch(group_state.offload_keys[blocks_to_skip:])
+
+    def _lookup(self, req_status: RequestOffloadState) -> int | None:
+        """
+        Find how many tokens beyond num_locally_computed_tokens can be loaded.
+
+        Iterates full-attention groups first (prefix lookup), then sliding-window
+        groups (suffix lookup). Each group may tighten max_hit_size_tokens, which
+        can invalidate an earlier group's result, so the loop re-runs when that
+        happens until num_hit_tokens converges.
+        """
+        num_computed_tokens = req_status.num_locally_computed_tokens
+        max_hit_size_tokens: int = req_status.req.num_tokens
+        if self._sliding_window_groups:
+            # The last prompt token has to be recomputed to get the logprobs.
+            # For sliding window attention, we must reduce by 1 to make sure
+            # we still have a hit after that reduction.
+            max_hit_size_tokens -= 1
+        num_hit_tokens: int = 0
+        defer_lookup = False
+        lookup_groups = self._lookup_groups
+        while lookup_groups:
+            looked_up_sliding_window: bool = False
+            groups_iter = iter(lookup_groups)
+            lookup_groups = ()
+            for group_idx in groups_iter:
+                group_config: GroupOffloadConfig = self.config.kv_group_configs[group_idx]
+                group_state: RequestGroupState = req_status.group_states[group_idx]
+                offloaded_block_size = group_config.offloaded_block_size
+                offload_keys = group_state.offload_keys
+
+                assert len(offload_keys) >= req_status.req.num_tokens // offloaded_block_size
+
+                # Constrain to block-aligned boundary for this group
+                max_hit_size_tokens = min(max_hit_size_tokens, len(offload_keys) * offloaded_block_size)
+                if max_hit_size_tokens - num_computed_tokens < offloaded_block_size:
+                    # we can only load less than a block, better skip
+                    return 0
+
+                num_blocks = min(cdiv(max_hit_size_tokens, offloaded_block_size), len(offload_keys))
+                start_block_idx = num_computed_tokens // offloaded_block_size
+                offload_keys = offload_keys[start_block_idx:num_blocks]
+                sliding_window_size_in_blocks = group_config.sliding_window_size_in_blocks
+
+                # end index (in the sliced offload_keys) up to which we
+                # have backend-confirmed hits
+                num_hit_blocks: int | None
+                if sliding_window_size_in_blocks is None:
+                    num_hit_blocks = self._maximal_prefix_lookup(offload_keys, req_status.req_context)
+                else:
+                    num_hit_blocks = self._sliding_window_lookup(
+                        offload_keys,
+                        sliding_window_size_in_blocks,
+                        req_status.req_context,
+                    )
+                if num_hit_blocks == 0:
+                    return 0
+
+                if num_hit_blocks is None:
+                    defer_lookup = True
+                else:
+                    max_hit_size_tokens = min(
+                        max_hit_size_tokens,
+                        offloaded_block_size * (start_block_idx + num_hit_blocks),
+                    )
+
+                new_num_hit_tokens = max_hit_size_tokens - num_computed_tokens
+                if new_num_hit_tokens < offloaded_block_size:
+                    # we can only load less than a block, better skip
+                    return 0
+
+                if new_num_hit_tokens < num_hit_tokens:
+                    if defer_lookup:
+                        # make another iteration on all groups to check
+                        # if we still need to defer lookup
+                        defer_lookup = False
+                        lookup_groups = self._lookup_groups
+                    elif looked_up_sliding_window and not lookup_groups:
+                        # we need another iteration to confirm that the previously
+                        # looked-up sliding window groups still hit with new_num_hit_tokens
+                        lookup_groups = self._sliding_window_groups
+
+                looked_up_sliding_window |= sliding_window_size_in_blocks is not None
+                num_hit_tokens = new_num_hit_tokens
+
+        if defer_lookup:
+            logger.debug(
+                "Offloading manager delayed request %s as backend requested",
+                req_status.req.request_id,
+            )
+            return None
+
+        # possibly delay request if any of the hit blocks is already being loaded
+        if self._blocks_being_loaded:
+            for group_config, group_state in zip(self.config.kv_group_configs, req_status.group_states):
+                offloaded_block_size = group_config.offloaded_block_size
+                sliding_window_size_in_blocks = group_config.sliding_window_size_in_blocks
+                offload_keys = group_state.offload_keys
+                num_blocks = cdiv(num_computed_tokens + num_hit_tokens, offloaded_block_size)
+                start_block_idx = num_computed_tokens // offloaded_block_size
+                offload_keys = offload_keys[start_block_idx:num_blocks]
+                if sliding_window_size_in_blocks is not None:
+                    offload_keys = offload_keys[-sliding_window_size_in_blocks:]
+                if any(key in self._blocks_being_loaded for key in offload_keys):
+                    # hit blocks are being loaded, delay request
+                    logger.debug(
+                        "Delaying request %s since some of its blocks are already being loaded",
+                        req_status.req.request_id,
+                    )
+                    return None
+
+        logger.debug(
+            "Request %s hit %s offloaded tokens after %s GPU hit tokens",
+            req_status.req.request_id,
+            num_hit_tokens,
+            num_computed_tokens,
+        )
+
+        return num_hit_tokens
+
     def get_num_new_matched_tokens(self, request: Request, num_computed_tokens: int) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded beyond the
@@ -195,89 +416,26 @@ def get_num_new_matched_tokens(self, request: Request, num_computed_tokens: int)
                 - `True` if tokens will be loaded asynchronously
                   (between scheduler steps).
         """
+        is_new_request = False
         if req_status := self._req_status.get(request.request_id):
             # make sure block IDs are cleared
             for group_state in req_status.group_states:
                 group_state.block_ids.clear()
         else:
+            is_new_request = True
             req_status = RequestOffloadState(config=self.config, req=request)
             self._req_status[request.request_id] = req_status
 
         req_status.update_offload_keys()
         req_status.num_locally_computed_tokens = num_computed_tokens
 
-        for gs in req_status.group_states:
-            self.manager.touch(gs.offload_keys)
-
-        # Start with the full request size as the maximum loadable
-        max_hit_size_tokens: int = req_status.req.num_tokens
-        num_hit_tokens: int = 0
-        defer_lookup = False
-        delay_request = False
-        for group_idx in self.lookup_groups:
-            group_config: GroupOffloadConfig = self.config.kv_group_configs[group_idx]
-            offloaded_block_size = group_config.offloaded_block_size
-            offload_keys = req_status.group_states[group_idx].offload_keys
-
-            num_blocks = max_hit_size_tokens // offloaded_block_size
-            assert len(offload_keys) >= num_blocks
-
-            # Constrain to block-aligned boundary for this group
-            max_hit_size_tokens = num_blocks * offloaded_block_size
-            num_hit_tokens = max_hit_size_tokens - num_computed_tokens
-            if num_hit_tokens < offloaded_block_size:
-                # we can only load less than a block, better skip
-                return 0, False
-
-            start_block_idx = num_computed_tokens // offloaded_block_size
-            offload_keys = offload_keys[start_block_idx:num_blocks]
-            # Full attention relies on all previous KV cache blocks.
-            # Thus, we search for a maximal prefix of KV cache which are all cached.
-            block_hits = self._maximal_prefix_lookup(offload_keys, req_status.req_context)
-            if block_hits == 0:
-                return 0, False
-
-            if block_hits is None:
-                defer_lookup = True
-            else:
-                # Further constrain based on what's actually available by backend
-                max_hit_size_tokens = offloaded_block_size * (start_block_idx + block_hits)
-
-            num_hit_tokens = max_hit_size_tokens - num_computed_tokens
-            if num_hit_tokens < offloaded_block_size:
-                # we can only load less than a block, better skip
-                return 0, False
-
-            if (
-                block_hits
-                and self._blocks_being_loaded
-                and any(key in self._blocks_being_loaded for key in offload_keys[:block_hits])
-            ):
-                # hit blocks are being loaded, delay request
-                delay_request = True
-
-        if defer_lookup:
-            logger.debug(
-                "Offloading manager delayed request %s as backend requested",
-                req_status.req.request_id,
-            )
-            return None, False
-
-        if delay_request:
-            logger.debug(
-                "Delaying request %s since some of its blocks are already being loaded",
-                req_status.req.request_id,
-            )
-            return None, False
+        num_hit_tokens = self._lookup(req_status)
+        if is_new_request:
+            req_status.update_num_hit_blocks(num_computed_tokens + (num_hit_tokens or 0))
 
-        logger.debug(
-            "Request %s hit %s offloaded tokens after %s GPU hit tokens",
-            request.request_id,
-            num_hit_tokens,
-            num_computed_tokens,
-        )
+        self._touch(req_status)
 
-        return num_hit_tokens, True
+        return num_hit_tokens, bool(num_hit_tokens)
 
     def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int):
         if num_external_tokens == 0:
@@ -317,6 +475,11 @@ def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, num_
             assert num_locally_computed_tokens <= num_locally_computed_gpu_blocks * gpu_block_size
             num_pending_gpu_blocks = num_gpu_blocks - num_locally_computed_gpu_blocks
 
+            if group_config.sliding_window_size_in_blocks is not None:
+                assert (
+                    num_pending_gpu_blocks <= group_config.sliding_window_size_in_blocks * self.config.block_size_factor
+                )
+
             num_blocks = cdiv(num_cached_tokens, offloaded_block_size)
             assert len(offload_keys) >= num_blocks
             if num_pending_gpu_blocks:
@@ -335,19 +498,39 @@ def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, num_
                 # entire KV cache so a remote decode node can consume it.
                 group_state.next_stored_block_idx = num_blocks
 
+        # Fence dst blocks against finished-request pending stores.
+        if self._block_id_to_pending_jobs and not self._block_id_to_pending_jobs.keys().isdisjoint(dst_block_ids):
+            self._current_batch_jobs_to_flush.update(
+                jid for bid in dst_block_ids for jid in self._block_id_to_pending_jobs.get(bid, ())
+            )
+
         src_spec = self.manager.prepare_load(keys_to_load, req_status.req_context)
         dst_spec = GPULoadStoreSpec(dst_block_ids, group_sizes=group_sizes, block_indices=block_indices)
 
-        self._reqs_to_load[request.request_id] = (src_spec, dst_spec)
-        req_blocks_being_loaded = self._reqs_being_loaded[request.request_id]
-        req_blocks_being_loaded.update(keys_to_load)
+        load_job_id = self._generate_job_id()
+        self._current_batch_load_jobs[load_job_id] = TransferJob(
+            req_id=request.request_id,
+            transfer_spec=(src_spec, dst_spec),
+        )
+        # a load can only be issued when no other jobs are pending.
+        assert not req_status.transfer_jobs
+        req_status.transfer_jobs.add(load_job_id)
+        self._jobs[load_job_id] = TransferJobStatus(
+            req_id=request.request_id,
+            pending_count=self.config.num_workers,
+            keys=set(keys_to_load),
+            is_store=False,
+        )
 
         if self._blocks_being_loaded is not None:
-            self._blocks_being_loaded.update(req_blocks_being_loaded)
+            self._blocks_being_loaded.update(keys_to_load)
 
-    def _get_reqs_to_store(self, scheduler_output: SchedulerOutput) -> dict[ReqId, TransferSpec]:
+    def _build_store_jobs(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> dict[int, TransferJob]:
         block_size_factor = self.config.block_size_factor
-        reqs_to_store: dict[ReqId, TransferSpec] = {}
+        store_jobs: dict[int, TransferJob] = {}
         # iterate over both new and cached requests
         for req_id, new_block_id_groups, preempted in yield_req_data(scheduler_output):
             req_status = self._req_status[req_id]
@@ -360,6 +543,13 @@ def _get_reqs_to_store(self, scheduler_output: SchedulerOutput) -> dict[ReqId, T
 
             if new_block_id_groups:
                 req_status.update_block_id_groups(new_block_id_groups)
+                # Fence new blocks against in-flight stores.
+                if self._block_id_to_pending_jobs:
+                    new_blocks_flat = [bid for new_blocks in new_block_id_groups for bid in new_blocks]
+                    if not self._block_id_to_pending_jobs.keys().isdisjoint(new_blocks_flat):
+                        self._current_batch_jobs_to_flush.update(
+                            jid for bid in new_blocks_flat for jid in self._block_id_to_pending_jobs.get(bid, ())
+                        )
 
             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             num_tokens_after_batch = req.num_computed_tokens + num_scheduled_tokens
@@ -405,15 +595,17 @@ def _get_reqs_to_store(self, scheduler_output: SchedulerOutput) -> dict[ReqId, T
                 req_status.advance_stored_idx(num_offloadable_tokens)
                 continue
 
-            for group_state in req_status.group_states:
-                self.manager.touch(group_state.offload_keys)
+            self._touch(req_status)
 
             keys_to_store = set(store_output.keys_to_store)
 
             group_sizes: list[int] = []
             block_indices: list[int] = []
             src_block_ids: list[int] = []
+            sliding_window_block_ids: list[int] = []
+            non_sliding_window_block_ids: list[int] = []
             for group_config, group_state in zip(self.config.kv_group_configs, req_status.group_states):
+                is_sliding_window = group_config.sliding_window_size_in_blocks is not None
                 num_blocks = num_offloadable_tokens // group_config.offloaded_block_size
                 start_block_idx = group_state.next_stored_block_idx
                 block_ids = group_state.block_ids
@@ -435,6 +627,11 @@ def _get_reqs_to_store(self, scheduler_output: SchedulerOutput) -> dict[ReqId, T
                         elif start_gpu_block_idx is None:
                             start_gpu_block_idx = gpu_block_idx + i
                         src_block_ids.append(block_id)
+                        if is_sliding_window:
+                            sliding_window_block_ids.append(block_id)
+                        else:
+                            non_sliding_window_block_ids.append(block_id)
+
                 group_sizes.append(num_group_blocks)
                 block_indices.append(start_gpu_block_idx or 0)
                 group_state.next_stored_block_idx = num_blocks
@@ -442,34 +639,57 @@ def _get_reqs_to_store(self, scheduler_output: SchedulerOutput) -> dict[ReqId, T
             src_spec = GPULoadStoreSpec(src_block_ids, group_sizes=group_sizes, block_indices=block_indices)
             dst_spec = store_output.store_spec
 
-            reqs_to_store[req_id] = (src_spec, dst_spec)
-            self._reqs_being_stored[req_id] |= keys_to_store
+            job_id = self._generate_job_id()
+            # a store can only be issued when no load is pending.
+            if req_status.transfer_jobs:
+                any_jid = next(iter(req_status.transfer_jobs))
+                assert self._jobs[any_jid].is_store
+            req_status.transfer_jobs.add(job_id)
+
+            # Watch sliding window blocks as they may get evicted
+            # before the request finishes
+            for bid in sliding_window_block_ids or ():
+                self._block_id_to_pending_jobs.setdefault(bid, set()).add(job_id)
+
+            # the non-sliding window blocks will be watched only
+            # when the request finishes
+            self._jobs[job_id] = TransferJobStatus(
+                req_id=req_id,
+                pending_count=self.config.num_workers,
+                keys=set(keys_to_store),
+                is_store=True,
+                non_sliding_window_block_ids=non_sliding_window_block_ids,
+                sliding_window_block_ids=sliding_window_block_ids or None,
+            )
+
+            store_jobs[job_id] = TransferJob(req_id=req_id, transfer_spec=(src_spec, dst_spec))
 
             logger.debug(
-                "Request %s offloading %s blocks upto %d tokens",
+                "Request %s offloading %s blocks upto %d tokens (job %d)",
                 req_id,
                 len(keys_to_store),
                 num_offloadable_tokens,
+                job_id,
             )
 
-        return reqs_to_store
+        return store_jobs
 
     def build_connector_meta(self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
-        meta = OffloadingConnectorMetadata(
-            reqs_to_load=self._reqs_to_load,
-            reqs_to_store=self._get_reqs_to_store(scheduler_output),
-            reqs_to_flush=scheduler_output.preempted_req_ids,
-        )
-        self._reqs_to_load = {}
-
-        # NOTE (orozery): we should move this logic to update_connector_output
-        # once KVConnectorOutput allows us to report completed transfers
         for req_id in scheduler_output.preempted_req_ids or ():
-            keys = self._reqs_being_stored.get(req_id)
-            if keys:
-                self.manager.complete_store(keys)
-                keys.clear()
+            req_status = self._req_status.get(req_id)
+            if req_status is None or not req_status.transfer_jobs:
+                continue
+            any_jid = next(iter(req_status.transfer_jobs))
+            assert self._jobs[any_jid].is_store
+            self._current_batch_jobs_to_flush.update(req_status.transfer_jobs)
 
+        meta = OffloadingConnectorMetadata(
+            load_jobs=self._current_batch_load_jobs,
+            store_jobs=self._build_store_jobs(scheduler_output),
+            jobs_to_flush=self._current_batch_jobs_to_flush,
+        )
+        self._current_batch_load_jobs = {}
+        self._current_batch_jobs_to_flush = set()
         return meta
 
     def update_connector_output(self, connector_output: KVConnectorOutput):
@@ -480,22 +700,43 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
             connector_output (KVConnectorOutput): the worker-side
                 connectors output.
         """
-        for req_id in connector_output.finished_sending or []:
-            keys = self._reqs_being_stored.pop(req_id, None)
-            if keys:
-                self.manager.complete_store(keys)
-
-        for req_id in connector_output.finished_recving or []:
-            keys = self._reqs_being_loaded.pop(req_id, None)
-            if keys:
+        meta = connector_output.kv_connector_worker_meta
+        if not isinstance(meta, OffloadingWorkerMetadata):
+            assert meta is None
+            meta = OffloadingWorkerMetadata()
+        for job_id, count in meta.completed_jobs.items():
+            assert count > 0
+            job_status = self._jobs[job_id]
+            job_status.pending_count -= count
+            if job_status.pending_count > 0:
+                continue
+            assert job_status.pending_count == 0
+
+            if job_status.is_store:
+                self.manager.complete_store(job_status.keys)
+            else:
+                self.manager.complete_load(job_status.keys)
                 if self._blocks_being_loaded:
-                    self._blocks_being_loaded.difference_update(keys)
-                self.manager.complete_load(keys)
+                    self._blocks_being_loaded.difference_update(job_status.keys)
+
+            req_status = self._req_status[job_status.req_id]
+            if self._block_id_to_pending_jobs:
+                # Sliding window blocks are tracked from store creation
+                # and must be cleaned up unconditionally.
+                self._remove_pending_job(job_id, job_status.sliding_window_block_ids)
+                # Non-sliding-window blocks are only tracked after
+                # request_finished, so only clean up for finished requests.
+                if req_status.req.is_finished():
+                    self._remove_pending_job(job_id, job_status.non_sliding_window_block_ids)
+
+            del self._jobs[job_id]
+            req_status.transfer_jobs.remove(job_id)
+            if not req_status.transfer_jobs and req_status.req.is_finished():
+                del self._req_status[job_status.req_id]
 
     def request_finished(
         self,
         request: Request,
-        block_ids: list[int],
     ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
@@ -507,14 +748,21 @@ def request_finished(
             Optional KVTransferParams to be included in the request outputs
             returned by the engine.
         """
-        req_id = request.request_id
-
         # TODO(orozery): possibly kickoff offload for last block
         # which may have been deferred due to async scheduling
-        self._req_status.pop(req_id, None)
-
-        request_being_stored = req_id in self._reqs_being_stored
-        return request_being_stored, None
+        req_status = self._req_status.get(request.request_id)
+        if req_status is None:
+            return False, None
+        if not req_status.transfer_jobs:
+            del self._req_status[request.request_id]
+            return False, None
+        # Pending stores will outlive the request's block ownership.
+        # Register them so future block reuse triggers a flush.
+        for job_id in req_status.transfer_jobs:
+            job_status = self._jobs[job_id]
+            for bid in job_status.non_sliding_window_block_ids or ():
+                self._block_id_to_pending_jobs.setdefault(bid, set()).add(job_id)
+        return False, None
 
     def take_events(self) -> Iterable[KVCacheEvent]:
         """Take the KV cache events from the connector.
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/worker.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
index 7a2ab4669a..401af293bb 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
@@ -11,6 +11,7 @@
 )
 from aphrodite.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
     ReqId,
 )
 from aphrodite.distributed.kv_transfer.kv_connector.v1.offloading.metrics import (
@@ -24,7 +25,7 @@
     MambaSpec,
     UniformTypeKVCacheSpecs,
 )
-from aphrodite.v1.kv_offload.spec import (
+from aphrodite.v1.kv_offload.base import (
     CanonicalKVCacheRef,
     CanonicalKVCaches,
     CanonicalKVCacheTensor,
@@ -45,24 +46,11 @@ def __init__(self, spec: OffloadingSpec):
         self.spec = spec
         self.worker = OffloadingWorker()
 
-        self._job_counter = 0
-
         self.kv_connector_stats = OffloadingConnectorStats()
-        # req_id -> (job_id, store)
-        self._jobs: dict[int, tuple[ReqId, bool]] = {}
-        # req_id -> active job IDs
-        self._load_job: dict[ReqId, int] = {}
-        # req_id -> set(active job IDs)
-        self._store_jobs = defaultdict[ReqId, set[int]](set)
-        # list of store jobs pending submission (job_id, transfer_spec)
+        # job_id -> req_id for in-flight loads.
+        self._load_jobs: dict[int, ReqId] = {}
         self._unsubmitted_store_jobs: list[tuple[int, TransferSpec]] = []
-
-        self._finished_reqs_waiting_for_store: set[ReqId] = set()
-
-    def _generate_job_id(self) -> int:
-        job_id = self._job_counter
-        self._job_counter = job_id + 1
-        return job_id
+        self._connector_worker_meta = OffloadingWorkerMetadata()
 
     def _register_handlers(self, kv_caches: CanonicalKVCaches):
         for src_cls, dst_cls, handler in self.spec.get_handlers(kv_caches):
@@ -274,10 +262,8 @@ def handle_preemptions(self, kv_connector_metadata: OffloadingConnectorMetadata)
             assert success
         self._unsubmitted_store_jobs.clear()
 
-        for req_id in kv_connector_metadata.reqs_to_flush or ():
-            job_ids = self._store_jobs.get(req_id)
-            if job_ids:
-                self.worker.wait(job_ids)
+        if kv_connector_metadata.jobs_to_flush:
+            self.worker.wait(kv_connector_metadata.jobs_to_flush)
 
     def start_kv_transfers(self, metadata: OffloadingConnectorMetadata):
         for job_id, transfer_spec in self._unsubmitted_store_jobs:
@@ -285,41 +271,33 @@ def start_kv_transfers(self, metadata: OffloadingConnectorMetadata):
             assert success
         self._unsubmitted_store_jobs.clear()
 
-        for req_id, transfer_spec in metadata.reqs_to_load.items():
-            job_id = self._generate_job_id()
-            self._jobs[job_id] = (req_id, False)
-            assert req_id not in self._load_job
-            self._load_job[req_id] = job_id
-            success = self.worker.transfer_async(job_id, transfer_spec)
+        for job_id, entry in metadata.load_jobs.items():
+            self._load_jobs[job_id] = entry.req_id
+            success = self.worker.transfer_async(job_id, entry.transfer_spec)
             assert success
 
     def prepare_store_kv(self, metadata: OffloadingConnectorMetadata):
-        for req_id, transfer_spec in metadata.reqs_to_store.items():
-            job_id = self._generate_job_id()
-            self._jobs[job_id] = (req_id, True)
-            self._store_jobs[req_id].add(job_id)
-            # NOTE(orozery): defer the store to the beginning of the next engine step,
-            # so that offloading starts AFTER transfers related to token sampling,
-            # thereby avoiding delays to token generation due to offloading.
-            self._unsubmitted_store_jobs.append((job_id, transfer_spec))
+        for job_id, entry in metadata.store_jobs.items():
+            # NOTE(orozery): defer the store to the beginning of the next
+            # engine step, so that offloading starts AFTER transfers related
+            # to token sampling, thereby avoiding delays to token generation.
+            self._unsubmitted_store_jobs.append((job_id, entry.transfer_spec))
 
     def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         """
-        Notifies worker-side connector ids of requests that have
-        finished generating tokens.
-        Returns a list of request IDs that finished loading or storing.
-
         Returns:
-            ids of requests that have finished asynchronous transfer
-            tuple of (sending/saving ids, recving/loading ids).
+            tuple of (finished_sending, finished_recving). Stores never
+            emit finished_sending — the scheduler tracks store completion
+            via kv_connector_worker_meta.completed_jobs and fences any
+            block reuse via jobs_to_flush. Loads still emit
+            finished_recving so the base scheduler can resume requests
+            blocked on remote KV (and free aborted-during-load reqs).
         """
-        finished_sending = set()
-        finished_recving = set()
+        finished_recving: set[str] = set()
         for transfer_result in self.worker.get_finished():
             # we currently do not support job failures
             job_id = transfer_result.job_id
             assert transfer_result.success
-            req_id, store = self._jobs.pop(job_id)
             if (
                 transfer_result.transfer_time
                 and transfer_result.transfer_size is not None
@@ -330,31 +308,21 @@ def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
                     time=transfer_result.transfer_time,
                     transfer_type=transfer_result.transfer_type,
                 )
-            if store:
-                req_jobs = self._store_jobs[req_id]
-                req_jobs.remove(job_id)
-                if req_jobs:
-                    continue
-
-                if req_id in self._finished_reqs_waiting_for_store:
-                    self._finished_reqs_waiting_for_store.remove(req_id)
-                    finished_sending.add(req_id)
-                    del self._store_jobs[req_id]
-            else:
-                req_job = self._load_job[req_id]
-                assert job_id == req_job
-                del self._load_job[req_id]
+
+            self._connector_worker_meta.mark_completed(job_id)
+            req_id = self._load_jobs.pop(job_id, None)
+            if req_id is not None:
                 finished_recving.add(req_id)
 
-        for req_id in finished_req_ids:
-            pending_req_jobs = self._store_jobs.get(req_id)
-            if pending_req_jobs:
-                self._finished_reqs_waiting_for_store.add(req_id)
-            elif pending_req_jobs is not None:
-                finished_sending.add(req_id)
-                del self._store_jobs[req_id]
+        return set(), finished_recving
 
-        return finished_sending, finished_recving
+    def build_connector_worker_meta(self) -> OffloadingWorkerMetadata | None:
+        """Return metadata on jobs completed since the last call, or None if none."""
+        if not self._connector_worker_meta.completed_jobs:
+            return None
+        meta = self._connector_worker_meta
+        self._connector_worker_meta = OffloadingWorkerMetadata()
+        return meta
 
     def get_kv_connector_stats(self) -> KVConnectorStats | None:
         """
@@ -369,11 +337,7 @@ def get_kv_connector_stats(self) -> KVConnectorStats | None:
         return kv_connector_stats
 
     def shutdown(self) -> None:
-        # Drop deferred store jobs: there is no point in submitting
-        # them during shutdown.
         self._unsubmitted_store_jobs.clear()
-        self._jobs.clear()
-        self._load_job.clear()
-        self._store_jobs.clear()
-        self._finished_reqs_waiting_for_store.clear()
+        self._load_jobs.clear()
+        self._connector_worker_meta = OffloadingWorkerMetadata()
         self.worker.shutdown()
diff --git a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index a272184f57..f856c6c5d2 100644
--- a/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/aphrodite/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -10,6 +10,7 @@
 from aphrodite.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
+    SupportsHMA,
 )
 from aphrodite.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from aphrodite.distributed.kv_transfer.kv_connector.v1.metrics import (
@@ -20,6 +21,7 @@
 )
 from aphrodite.distributed.kv_transfer.kv_connector.v1.offloading.common import (
     OffloadingConnectorMetadata,
+    OffloadingWorkerMetadata,
 )
 from aphrodite.distributed.kv_transfer.kv_connector.v1.offloading.metrics import (
     OffloadingConnectorStats,
@@ -41,7 +43,7 @@
 from aphrodite.v1.request import Request
 
 
-class OffloadingConnector(KVConnectorBase_V1):
+class OffloadingConnector(KVConnectorBase_V1, SupportsHMA):
     @property
     def prefer_cross_layer_blocks(self) -> bool:
         return True
@@ -109,6 +111,11 @@ def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
         assert self.connector_worker is not None
         return self.connector_worker.get_finished(finished_req_ids)
 
+    def build_connector_worker_meta(self) -> OffloadingWorkerMetadata | None:
+        if self.connector_worker is not None:
+            return self.connector_worker.build_connector_worker_meta()
+        return None
+
     def get_num_new_matched_tokens(self, request: "Request", num_computed_tokens: int) -> tuple[int | None, bool]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.get_num_new_matched_tokens(request, num_computed_tokens)
@@ -131,7 +138,15 @@ def request_finished(
         block_ids: list[int],
     ) -> tuple[bool, dict[str, Any] | None]:
         assert self.connector_scheduler is not None
-        return self.connector_scheduler.request_finished(request, block_ids)
+        return self.connector_scheduler.request_finished(request)
+
+    def request_finished_all_groups(
+        self,
+        request: "Request",
+        block_ids: tuple[list[int], ...],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.request_finished(request)
 
     def take_events(self) -> Iterable[KVCacheEvent]:
         assert self.connector_scheduler is not None
diff --git a/aphrodite/distributed/parallel_state.py b/aphrodite/distributed/parallel_state.py
index cf93af7f1c..4bdf0f43b3 100644
--- a/aphrodite/distributed/parallel_state.py
+++ b/aphrodite/distributed/parallel_state.py
@@ -447,6 +447,7 @@ def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None
         # only cuda uses this function,
         # so we don't abstract it into the base class
         maybe_ca_context = nullcontext()
+        maybe_aiter_context = nullcontext()
         from aphrodite.distributed.device_communicators.cuda_communicator import (
             CudaCommunicator,
         )
@@ -457,13 +458,20 @@ def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None
             if ca_comm is not None:
                 maybe_ca_context = ca_comm.capture()  # type: ignore
 
+            from aphrodite._aiter_ops import rocm_aiter_ops
+
+            if rocm_aiter_ops.is_enabled():
+                aiter_ar = rocm_aiter_ops.get_aiter_allreduce()
+                if aiter_ar is not None:
+                    maybe_aiter_context = aiter_ar.capture()  # type: ignore
+
         # ensure all initialization operations complete before attempting to
         # capture the graph on another stream
         curr_stream = torch.cuda.current_stream()
         if curr_stream != stream:
             stream.wait_stream(curr_stream)
 
-        with torch.cuda.stream(stream), maybe_ca_context:
+        with torch.cuda.stream(stream), maybe_ca_context, maybe_aiter_context:
             yield graph_capture_context
 
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
diff --git a/aphrodite/engine/protocol.py b/aphrodite/engine/protocol.py
index 3477293406..71440eee93 100644
--- a/aphrodite/engine/protocol.py
+++ b/aphrodite/engine/protocol.py
@@ -75,6 +75,7 @@ def generate(
         priority: int = 0,
         data_parallel_rank: int | None = None,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request."""
         ...
diff --git a/aphrodite/entrypoints/anthropic/protocol.py b/aphrodite/entrypoints/anthropic/protocol.py
index add0dfbffb..ad95ef800c 100644
--- a/aphrodite/entrypoints/anthropic/protocol.py
+++ b/aphrodite/entrypoints/anthropic/protocol.py
@@ -39,6 +39,7 @@ class AnthropicContentBlock(BaseModel):
         "image",
         "tool_use",
         "tool_result",
+        "tool_reference",
         "thinking",
         "redacted_thinking",
     ]
@@ -52,6 +53,8 @@ class AnthropicContentBlock(BaseModel):
     input: dict[str, Any] | None = None
     content: str | list[dict[str, Any]] | None = None
     is_error: bool | None = None
+    # For tool_reference content
+    tool_name: str | None = None
     # For thinking content
     thinking: str | None = None
     signature: str | None = None
@@ -72,6 +75,7 @@ class AnthropicTool(BaseModel):
     name: str
     description: str | None = None
     input_schema: dict[str, Any]
+    defer_loading: bool | None = None
 
     @field_validator("input_schema")
     @classmethod
diff --git a/aphrodite/entrypoints/anthropic/serving.py b/aphrodite/entrypoints/anthropic/serving.py
index c4e506be20..9e1eace211 100644
--- a/aphrodite/entrypoints/anthropic/serving.py
+++ b/aphrodite/entrypoints/anthropic/serving.py
@@ -233,6 +233,10 @@ def _convert_block(
             cls._convert_tool_use_block(block, tool_calls)
         elif block.type == "tool_result":
             cls._convert_tool_result_block(block, role, openai_messages, content_parts)
+        elif block.type == "tool_reference":
+            # Tool references are only handled during tool_result processing,
+            # when they appear inside tool_result content; ignored here.
+            pass
 
     @classmethod
     def _convert_tool_use_block(cls, block, tool_calls: list[dict[str, Any]]) -> None:
@@ -267,6 +271,7 @@ def _convert_user_tool_result(cls, block, openai_messages: list[dict[str, Any]])
         """Convert user tool_result with text and image support"""
         tool_text = ""
         tool_image_urls: list[str] = []
+        tool_reference: list[dict[str, Any]] = []
 
         if isinstance(block.content, str):
             tool_text = block.content
@@ -283,6 +288,10 @@ def _convert_user_tool_result(cls, block, openai_messages: list[dict[str, Any]])
                     url = cls._convert_image_source_to_url(source)
                     if url:
                         tool_image_urls.append(url)
+                elif item_type == "tool_reference":
+                    ref_name = item.get("tool_name") or item.get("name")
+                    if ref_name:
+                        tool_reference.append({"type": "tool_reference", "name": ref_name})
             tool_text = "\n".join(text_parts)
 
         openai_messages.append(
@@ -303,6 +312,15 @@ def _convert_user_tool_result(cls, block, openai_messages: list[dict[str, Any]])
                 }
             )
 
+        if tool_reference:
+            openai_messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": block.tool_use_id or "",
+                    "content": tool_reference,  # type: ignore[dict-item]
+                }
+            )
+
     @classmethod
     def _build_base_request(
         cls,
@@ -389,6 +407,7 @@ def _convert_tools(
                             "name": tool.name,
                             "description": tool.description,
                             "parameters": tool.input_schema,
+                            "defer_loading": tool.defer_loading,
                         },
                     }
                 )
diff --git a/aphrodite/entrypoints/chat_utils.py b/aphrodite/entrypoints/chat_utils.py
index c8406dcfb5..26f7ec98a9 100644
--- a/aphrodite/entrypoints/chat_utils.py
+++ b/aphrodite/entrypoints/chat_utils.py
@@ -11,7 +11,7 @@
 from functools import cached_property, lru_cache, partial
 from itertools import accumulate
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Final, Generic, Literal, TypeAlias, TypeVar, cast
 
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam,
@@ -36,10 +36,11 @@
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 
 # pydantic needs the TypedDict from typing_extensions
-from typing_extensions import Required, TypedDict
+from typing_extensions import Required, TypedDict, override
 
 from aphrodite import envs
 from aphrodite.config import ModelConfig
+from aphrodite.exceptions import APHRODITEValidationError
 from aphrodite.inputs import MultiModalDataDict, MultiModalUUIDDict
 from aphrodite.logger import init_logger
 from aphrodite.model_executor.models import SupportsMultiModal
@@ -54,6 +55,10 @@
 )
 from aphrodite.multimodal.media import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from aphrodite.multimodal.processing import BaseMultiModalProcessor
+from aphrodite.renderers.embed_utils import (
+    safe_load_prompt_embeds,
+    safe_load_prompt_embeds_async,
+)
 from aphrodite.utils import random_uuid
 from aphrodite.utils.collection_utils import is_list_of
 from aphrodite.utils.import_utils import LazyLoader
@@ -97,9 +102,36 @@ class ChatTemplateResolutionError(ValueError):
     "image": "<##IMAGE##>",
     "audio": "<##AUDIO##>",
     "video": "<##VIDEO##>",
+    "prompt_embeds": "<##PROMPT_EMBEDS##>",
 }
 
 
+PROMPT_EMBEDS_PLACEHOLDER_TOKEN: Final[str] = "<prompt_embeds>"
+"""The special token used as a placeholder for each embedding
+position during chat template rendering.
+
+Registered as an additional special token when `--enable-prompt-embeds` is set.
+See `_ensure_prompt_embeds_placeholder_token` in `aphrodite/renderers/hf.py`.
+"""
+
+
+_REQUIRE_MM_PROCESSOR_ERROR: Final[str] = (
+    "Resolving modality {modality!r} requires a multimodal processor but none is available."
+)
+
+_ENABLE_PROMPT_EMBEDS_ERROR: Final[str] = "You must set `--enable-prompt-embeds` to input `prompt_embeds`"
+
+_PROMPT_EMBEDS_MISSING_DATA_ERROR: Final[str] = (
+    "prompt_embeds content part requires a non-empty `data` field with base64-encoded tensor bytes."
+)
+
+_RESERVED_PLACEHOLDER_IN_TEXT_ERROR: Final[str] = (
+    "Text content may not contain the reserved placeholder {token!r}. "
+    "This placeholder is used internally to mark `prompt_embeds` splice "
+    "positions in the tokenized prompt."
+)
+
+
 class AudioURL(TypedDict, total=False):
     url: Required[str]
     """
@@ -146,6 +178,17 @@ class ChatCompletionContentPartAudioEmbedsParam(TypedDict, total=False):
     """
 
 
+class ChatCompletionContentPartPromptEmbedsParam(TypedDict, total=False):
+    data: Required[str]
+    """
+    Base64-encoded bytes of a serialized `torch.Tensor` of shape
+    `(num_tokens, hidden_size)`. The tensor's `dtype` and `hidden_size` must
+    match the model's input embedding layer.
+    """
+    type: Required[Literal["prompt_embeds"]]
+    """The type of the content part."""
+
+
 class VideoURL(TypedDict, total=False):
     url: Required[str]
     """
@@ -254,6 +297,23 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     """The thinking type."""
 
 
+class CustomChatCompletionContentToolReferenceParam(TypedDict, total=False):
+    """A tool reference content param that only accepts a plain tool name.
+
+    Example:
+    {
+        "name": "get_weather",
+        "type": "tool_reference"
+    }
+    """
+
+    name: str
+    """The name of the tool being referenced."""
+
+    type: Literal["tool_reference"]
+    """The content type."""
+
+
 ChatCompletionContentPartParam: TypeAlias = (
     OpenAIChatCompletionContentPartParam
     | ChatCompletionContentPartAudioParam
@@ -264,8 +324,10 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     | CustomChatCompletionContentSimpleImageParam
     | ChatCompletionContentPartImageEmbedsParam
     | ChatCompletionContentPartAudioEmbedsParam
+    | ChatCompletionContentPartPromptEmbedsParam
     | CustomChatCompletionContentSimpleAudioParam
     | CustomChatCompletionContentSimpleVideoParam
+    | CustomChatCompletionContentToolReferenceParam
     | str
     | CustomThinkCompletionContentParam
 )
@@ -345,7 +407,15 @@ class ConversationMessage(TypedDict, total=False):
 ChatTemplateContentFormat = Literal["string", "openai"]
 
 
-ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds", "vision_chunk"]
+ModalityStr = Literal[
+    "image",
+    "audio",
+    "video",
+    "image_embeds",
+    "audio_embeds",
+    "vision_chunk",
+    "prompt_embeds",
+]
 _T = TypeVar("_T")
 
 
@@ -503,7 +573,17 @@ def add(self, modality: ModalityStr, item: _T) -> str | None:
 
         An optional uuid can be added which serves as a unique identifier of the
         media.
+
+        Note:
+            `prompt_embeds` bypass MM-processor validation because they are
+            pre-computed embeddings that do not go through any HF processor, encoder,
+            or model-specific placeholder logic. The corresponding placeholder string is
+            managed by the parser via `_add_placeholder`, so we return None here.
         """
+        if modality == "prompt_embeds":
+            self._items_by_modality["prompt_embeds"].append(item)
+            return None
+
         input_modality = modality.replace("_embeds", "")
         original_modality = modality
         use_vision_chunk = self.use_unified_vision_chunk_modality and original_modality in ["video", "image"]
@@ -605,17 +685,30 @@ def _resolve_vision_chunk_items(
 
 def _resolve_items(
     items_by_modality: dict[str, list[tuple[object, str | None]]],
-    mm_processor: BaseMultiModalProcessor,
+    mm_processor: BaseMultiModalProcessor | None,
     modality_order: dict[str, list[str]],
 ) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
+    """
+    Materialize the tracker's per-modality items into `mm_data` / `mm_uuids`.
+
+    Note:
+        `mm_processor` is `None` for text-only models (no registered HF
+        processor) whose only modality is `prompt_embeds`. Every other
+        modality requires a processor, enforced by the guard below.
+    """
     if "image" in items_by_modality and "image_embeds" in items_by_modality:
         raise ValueError("Mixing raw image and embedding inputs is not allowed")
     if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
         raise ValueError("Mixing raw audio and embedding inputs is not allowed")
+    # `prompt_embeds` bypasses HF MM processors. Every other modality requires one.
+    processor_modalities = items_by_modality.keys() - {"prompt_embeds"}
+    if processor_modalities and mm_processor is None:
+        raise RuntimeError(_REQUIRE_MM_PROCESSOR_ERROR.format(modality=processor_modalities))
 
     mm_data = {}
     mm_uuids = {}
     if "image_embeds" in items_by_modality:
+        assert mm_processor is not None
         mm_data["image"] = _get_embeds_data(
             "image",
             [data for data, uuid in items_by_modality["image_embeds"]],
@@ -626,6 +719,7 @@ def _resolve_items(
         mm_data["image"] = [data for data, uuid in items_by_modality["image"]]
         mm_uuids["image"] = [uuid for data, uuid in items_by_modality["image"]]
     if "audio_embeds" in items_by_modality:
+        assert mm_processor is not None
         mm_data["audio"] = _get_embeds_data(
             "audio",
             [data for data, uuid in items_by_modality["audio_embeds"]],
@@ -639,6 +733,7 @@ def _resolve_items(
         mm_data["video"] = [data for data, uuid in items_by_modality["video"]]
         mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]]
     if "vision_chunk" in items_by_modality:
+        assert mm_processor is not None
         # Process vision_chunk items - extract from (data, modality) tuples
         # and convert to VisionChunk types with proper UUID handling
         processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items(
@@ -648,6 +743,8 @@ def _resolve_items(
         )
         mm_data["vision_chunk"] = processed_chunks
         mm_uuids["vision_chunk"] = vision_chunk_uuids
+    if "prompt_embeds" in items_by_modality:
+        mm_data["prompt_embeds"] = [data for data, _uuid in items_by_modality["prompt_embeds"]]
 
     return mm_data, mm_uuids
 
@@ -659,7 +756,15 @@ def resolve_items(
         if not self._items_by_modality:
             return None, None
 
-        return _resolve_items(dict(self._items_by_modality), self.mm_processor, self._modality_order)
+        # Text-only models (`is_multimodal_model=False`) with inputs of
+        # modality `prompt_embeds` have no MM processor since `prompt_embeds` are
+        # pre-computed and require no processing, so we pass `None`.
+        mm_processor = self.mm_processor if self._model_config.is_multimodal_model else None
+        return _resolve_items(
+            dict(self._items_by_modality),
+            mm_processor,
+            self._modality_order,
+        )
 
     def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser":
         return MultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs)
@@ -676,7 +781,12 @@ async def resolve_items(
             modality: await asyncio.gather(*coros) for modality, coros in self._items_by_modality.items()
         }
 
-        return _resolve_items(resolved_items_by_modality, self.mm_processor, self._modality_order)
+        mm_processor = self.mm_processor if self._model_config.is_multimodal_model else None
+        return _resolve_items(
+            resolved_items_by_modality,
+            mm_processor,
+            self._modality_order,
+        )
 
     def create_parser(self, mm_processor_kwargs: dict[str, Any] | None = None) -> "BaseMultiModalContentParser":
         return AsyncMultiModalContentParser(self, mm_processor_kwargs=mm_processor_kwargs)
@@ -690,10 +800,16 @@ def __init__(self) -> None:
         # general MM placeholder:
         # {
         #   "<##IMAGE##>": ["<image>", "<image>", "<image>"],
-        #   "<##AUDIO##>": ["<audio>", "<audio>"]
+        #   "<##AUDIO##>": ["<audio>", "<audio>"],
+        #   "<##PROMPT_EMBEDS##>": ["<prompt_embeds>", "<prompt_embeds>"]
         # }
         self._placeholder_storage: dict[str, list] = defaultdict(list)
 
+    @property
+    @abstractmethod
+    def model_config(self) -> ModelConfig:
+        raise NotImplementedError
+
     def _add_placeholder(self, modality: ModalityStr, placeholder: str | None):
         mod_placeholder = MODALITY_PLACEHOLDERS_MAP[modality]
         if placeholder:
@@ -734,6 +850,10 @@ def parse_audio_embeds(
     ) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_prompt_embeds(self, data: str) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
@@ -762,6 +882,21 @@ def __init__(
     def model_config(self) -> ModelConfig:
         return self._tracker.model_config
 
+    @override
+    def parse_prompt_embeds(self, data: str) -> None:
+        """Decode a base64 prompt embeds tensor and store it in the tracker.
+
+        Emits a single `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` sentinel per
+        content part. The renderer later expands each sentinel to a span of
+        `tensor.shape[0]` placeholder tokens after tokenization.
+        """
+        if not self.model_config.enable_prompt_embeds:
+            raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)
+
+        tensor = safe_load_prompt_embeds(self.model_config, data.encode())
+        self._tracker.add("prompt_embeds", (tensor, None))
+        self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)
+
     def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image = self._connector.fetch_image(image_url) if image_url else None
 
@@ -868,6 +1003,27 @@ def __init__(
     def model_config(self) -> ModelConfig:
         return self._tracker.model_config
 
+    @override
+    def parse_prompt_embeds(self, data: str) -> None:
+        """Schedule async prompt embeds decode and store the coroutine in the tracker.
+
+        Like the sync variant, emits a single sentinel `PROMPT_EMBEDS_PLACEHOLDER_TOKEN`
+        per content part. Unlike the sync variant, the tensor decode is deferred to a
+        thread-pool executor via `safe_load_prompt_embeds_async`.
+        """
+        if not self.model_config.enable_prompt_embeds:
+            raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)
+
+        coro = self._load_prompt_embeds_async(data.encode())
+        self._tracker.add("prompt_embeds", coro)
+        self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)
+
+    async def _load_prompt_embeds_async(self, data_bytes: bytes) -> tuple[torch.Tensor, None]:
+        # Second tuple slot fills the tracker's generic `(item, uuid | None)`
+        # contract. prompt_embeds has no UUID concept, so it's always `None`.
+        tensor = await safe_load_prompt_embeds_async(self.model_config, data_bytes)
+        return tensor, None
+
     async def _image_with_uuid_async(self, image_url: str | None, uuid: str | None):
         image = await self._connector.fetch_image_async(image_url) if image_url else None
         return image, uuid
@@ -1141,6 +1297,7 @@ def _get_full_multimodal_text_prompt(
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _AudioEmbedsParser = partial(cast, ChatCompletionContentPartAudioEmbedsParam)
+_PromptEmbedsParser = partial(cast, ChatCompletionContentPartPromptEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
@@ -1166,11 +1323,13 @@ def _get_full_multimodal_text_prompt(
     "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
     "audio_embeds": lambda part: _AudioEmbedsParser(part).get("audio_embeds", None),
+    "prompt_embeds": lambda part: _PromptEmbedsParser(part).get("data", None),
     "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio": lambda part: _InputAudioParser(part).get("input_audio", None),
     "refusal": lambda part: _RefusalParser(part).get("refusal", None),
     "video_url": lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
+    "tool_reference": lambda part: cast(CustomChatCompletionContentToolReferenceParam, part).get("name", None),
 }
 
 
@@ -1237,6 +1396,11 @@ def _parse_chat_message_content_mm_part(
             )
             audio_embeds = audio_params.get("audio_embeds", None)
             return "audio_embeds", audio_embeds
+        if "prompt_embeds" in part:
+            prompt_embeds_params = cast(  # type: ignore[assignment]
+                ChatCompletionContentPartPromptEmbedsParam, part
+            )
+            return "prompt_embeds", prompt_embeds_params.get("data", None)
         if "audio_url" in part:
             audio_params = cast(  # type: ignore[assignment]
                 CustomChatCompletionContentSimpleAudioParam, part
@@ -1258,6 +1422,10 @@ def _parse_chat_message_content_mm_part(
                 # with url as a dict of {"url": url}
                 video_url = video_url.get("url", None)
             return "video_url", video_url
+        if "tool_reference" in part:
+            tool_reference_params = cast(CustomChatCompletionContentToolReferenceParam, part)
+            tool_reference = tool_reference_params.get("name", None)
+            return "tool_reference", tool_reference
         # Raise an error if no 'type' or direct URL is found.
         raise ValueError("Missing 'type' field in multimodal part.")
 
@@ -1314,6 +1482,20 @@ def _parse_chat_message_content_parts(
     return [ConversationMessage(role=role, content=text_prompt)]
 
 
+def _reject_reserved_placeholder_in_text(text: str, model_config: ModelConfig) -> None:
+    """Reject user-supplied text parts that contain the reserved `prompt_embeds`
+    placeholder sentinel.
+
+    When the server accepts `prompt_embeds`, the placeholder token is
+    registered as a single unsplittable special token on the tokenizer. Any
+    user text that happens to contain the literal sequence would tokenize to
+    the same ID and be mistaken for a splice point by the renderer, letting a
+    caller move or inject splice positions via plain text content.
+    """
+    if model_config.enable_prompt_embeds and PROMPT_EMBEDS_PLACEHOLDER_TOKEN in text:
+        raise ValueError(_RESERVED_PLACEHOLDER_IN_TEXT_ERROR.format(token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN))
+
+
 def _parse_chat_message_content_part(
     part: ChatCompletionContentPartParam,
     mm_parser: BaseMultiModalContentParser,
@@ -1329,6 +1511,7 @@ def _parse_chat_message_content_part(
     with multimodal placeholders.
     """
     if isinstance(part, str):  # Handle plain text parts
+        _reject_reserved_placeholder_in_text(part, mm_parser.model_config)
         if wrap_dicts:
             return {"type": "text", "text": part}
         return part
@@ -1346,6 +1529,7 @@ def _parse_chat_message_content_part(
 
     if part_type in ("text", "input_text", "output_text", "refusal", "thinking"):
         str_content = cast(str, content)
+        _reject_reserved_placeholder_in_text(str_content, mm_parser.model_config)
         if wrap_dicts:
             return {"type": "text", "text": str_content}
         else:
@@ -1374,6 +1558,11 @@ def _parse_chat_message_content_part(
         content = cast(str | dict[str, str], content) if content is not None else None
         mm_parser.parse_audio_embeds(content, uuid)
         modality = "audio"
+    elif part_type == "prompt_embeds":
+        if not content:
+            raise ValueError(_PROMPT_EMBEDS_MISSING_DATA_ERROR)
+        mm_parser.parse_prompt_embeds(cast(str, content))
+        modality = "prompt_embeds"
     elif part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content, uuid)
@@ -1386,11 +1575,33 @@ def _parse_chat_message_content_part(
         str_content = cast(str, content)
         mm_parser.parse_video(str_content, uuid)
         modality = "video"
+    elif part_type == "tool_reference":
+        # Tool references are not multimodal data — they reference deferred
+        # tools and are passed through as-is for the chat template to expand.
+        if wrap_dicts:
+            return {"type": "tool_reference", "name": cast(str, content)}
+        return cast(str, content)
     else:
-        raise NotImplementedError(f"Unknown part type: {part_type}")
+        supported = sorted(MM_PARSER_MAP.keys() | set(PART_TYPES_TO_SKIP_NONE_CONTENT))
+        raise APHRODITEValidationError(
+            f"Unsupported chat content part type: {part_type!r}. Supported types: {', '.join(supported)}.",
+            parameter="type",
+            value=part_type,
+        )
 
     if wrap_dicts:
+        if modality == "prompt_embeds":
+            # Chat templates don't know about the "prompt_embeds" modality, so
+            # emit the single sentinel token as text and let the template render
+            # it inline. The renderer later expands it to N tokens post-tokenize.
+            return {"type": "text", "text": PROMPT_EMBEDS_PLACEHOLDER_TOKEN}
         return {"type": modality}
+    if modality == "prompt_embeds":
+        # Emit the placeholder token inline regardless of `interleave_strings`:
+        # prompt_embeds are spliced at the token offset, so position matters.
+        # Falling back to front-padding via `missing_placeholders` would
+        # reorder them relative to surrounding text.
+        return PROMPT_EMBEDS_PLACEHOLDER_TOKEN
     return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
 
 
@@ -1444,14 +1655,21 @@ def _parse_chat_message_content(
             # string. Clients like Claude Code / Cursor send tool results as
             # [{"type": "text", "text": "..."}], but most chat templates only
             # handle string content for tool messages.
+            # However, tool_reference items must be preserved as structured
+            # dicts for the chat template to expand them.
             msg_content = result_msg.get("content")
             if isinstance(msg_content, list):
-                texts = [
-                    item.get("text", "")
-                    for item in msg_content
-                    if isinstance(item, dict) and item.get("type") == "text"
-                ]
-                result_msg["content"] = "\n".join(texts) if texts else ""
+                has_non_text = any(isinstance(item, dict) and item.get("type") != "text" for item in msg_content)
+                if has_non_text:
+                    # Keep structured content (e.g., tool_reference)
+                    result_msg["content"] = msg_content
+                else:
+                    texts = [
+                        item.get("text", "")
+                        for item in msg_content
+                        if isinstance(item, dict) and item.get("type") == "text"
+                    ]
+                    result_msg["content"] = "\n".join(texts) if texts else ""
 
         if "name" in message and isinstance(message["name"], str):
             result_msg["name"] = message["name"]
@@ -1502,7 +1720,10 @@ def parse_chat_messages(
     MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
-    mm_tracker = MultiModalItemTracker(model_config, media_io_kwargs=media_io_kwargs)
+    mm_tracker = MultiModalItemTracker(
+        model_config,
+        media_io_kwargs=media_io_kwargs,
+    )
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
@@ -1538,7 +1759,10 @@ async def parse_chat_messages_async(
     MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
-    mm_tracker = AsyncMultiModalItemTracker(model_config, media_io_kwargs=media_io_kwargs)
+    mm_tracker = AsyncMultiModalItemTracker(
+        model_config,
+        media_io_kwargs=media_io_kwargs,
+    )
 
     for msg in messages:
         sub_messages = _parse_chat_message_content(
diff --git a/aphrodite/entrypoints/llm.py b/aphrodite/entrypoints/llm.py
index 082f1d4740..d1c7586d5a 100644
--- a/aphrodite/entrypoints/llm.py
+++ b/aphrodite/entrypoints/llm.py
@@ -885,7 +885,7 @@ def _preprocess_chat(
                     add_generation_prompt=add_generation_prompt,
                     continue_final_message=continue_final_message,
                     tools=tools,
-                    tokenize=is_mistral_tokenizer(renderer.tokenizer),
+                    tokenize=(is_mistral_tokenizer(renderer.tokenizer) or self.model_config.enable_prompt_embeds),
                 ),
             ),
             mm_processor_kwargs=mm_processor_kwargs,
diff --git a/aphrodite/entrypoints/openai/chat_completion/batch_serving.py b/aphrodite/entrypoints/openai/chat_completion/batch_serving.py
index 01f83223be..b522ffda7b 100644
--- a/aphrodite/entrypoints/openai/chat_completion/batch_serving.py
+++ b/aphrodite/entrypoints/openai/chat_completion/batch_serving.py
@@ -294,4 +294,5 @@ async def chat_completion_full_generator_batch(
             model=model_name,
             choices=choices,
             usage=usage,
+            system_fingerprint=self.system_fingerprint,
         )
diff --git a/aphrodite/entrypoints/openai/chat_completion/protocol.py b/aphrodite/entrypoints/openai/chat_completion/protocol.py
index 052d29768b..d773db3705 100644
--- a/aphrodite/entrypoints/openai/chat_completion/protocol.py
+++ b/aphrodite/entrypoints/openai/chat_completion/protocol.py
@@ -11,7 +11,7 @@
     ChatCompletionAudio as OpenAIChatCompletionAudio,
 )
 from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
-from pydantic import AliasChoices, Field, PrivateAttr, model_validator
+from pydantic import AliasChoices, Field, PrivateAttr, model_serializer, model_validator
 
 from aphrodite.config import ModelConfig
 from aphrodite.config.utils import replace
@@ -149,6 +149,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
     model: str
     choices: list[ChatCompletionResponseStreamChoice]
     usage: UsageInfo | None = Field(default=None)
+    # Set only on the final chunk of a stream to mirror non-streaming responses
+    # without the per-chunk serialization overhead.
+    system_fingerprint: str | None = None
     # not part of the OpenAI spec but for tracing the tokens
     prompt_token_ids: list[int] | None = None
 
@@ -156,6 +159,20 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
 class ChatCompletionToolsParam(OpenAIBaseModel):
     type: Literal["function"] = "function"
     function: FunctionDefinition
+    defer_loading: bool | None = None
+
+    @model_validator(mode="after")
+    def _propagate_defer_loading(self) -> "ChatCompletionToolsParam":
+        if self.defer_loading is not None and self.function.defer_loading is None:
+            self.function.defer_loading = self.defer_loading
+        return self
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.defer_loading is None:
+            data.pop("defer_loading", None)
+        return data
 
 
 class ChatCompletionNamedFunction(OpenAIBaseModel):
@@ -194,7 +211,17 @@ class ChatCompletionRequest(OpenAIBaseModel):
     tool_choice: Literal["none"] | Literal["auto"] | Literal["required"] | ChatCompletionNamedToolChoiceParam | None = (
         "none"
     )
-    reasoning_effort: Literal["none", "low", "medium", "high"] | None = None
+    reasoning_effort: Literal["none", "minimal", "low", "medium", "high", "xhigh", "max"] | None = Field(
+        default=None,
+        description=(
+            "Constrains effort on reasoning for reasoning models. "
+            "Currently supported values are none, minimal, low, medium, "
+            "high, xhigh, and max. Reducing reasoning effort can result in "
+            "faster responses and fewer tokens used on reasoning in a response. "
+            "Note that 'max' is specific to the DeepSeek V4 series and is not "
+            "part of the standard OpenAI API specification."
+        ),
+    )
     thinking_token_budget: int | None = None
     include_reasoning: bool = True
     parallel_tool_calls: bool | None = True
diff --git a/aphrodite/entrypoints/openai/chat_completion/serving.py b/aphrodite/entrypoints/openai/chat_completion/serving.py
index 68aff37769..3bb70fad5e 100644
--- a/aphrodite/entrypoints/openai/chat_completion/serving.py
+++ b/aphrodite/entrypoints/openai/chat_completion/serving.py
@@ -9,10 +9,7 @@
 from http import HTTPStatus
 from typing import TYPE_CHECKING, Any, Final
 
-import partial_json_parser
-import regex as re
 from fastapi import Request
-from partial_json_parser.core.options import Allow
 
 from aphrodite.engine.protocol import EngineClient
 from aphrodite.entrypoints.chat_utils import (
@@ -73,7 +70,10 @@
 from aphrodite.renderers import ChatParams
 from aphrodite.sampling_params import BeamSearchParams, SamplingParams
 from aphrodite.tokenizers import TokenizerLike
-from aphrodite.tool_parsers.utils import partial_json_loads
+from aphrodite.tool_parsers.streaming import (
+    extract_named_tool_call_streaming,
+    extract_required_tool_call_streaming,
+)
 from aphrodite.utils.collection_utils import as_list
 from aphrodite.utils.mistral import is_mistral_tokenizer, is_mistral_tool_parser
 
@@ -324,6 +324,11 @@ async def create_chat_completion(
                     priority=request.priority,
                     data_parallel_rank=data_parallel_rank,
                     reasoning_ended=reasoning_ended,
+                    reasoning_parser_kwargs={
+                        "chat_template_kwargs": chat_template_kwargs,
+                    }
+                    if reasoning_parser
+                    else None,
                 )
 
             generators.append(generator)
@@ -360,45 +365,6 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
             return self.response_role
         return request.messages[-1]["role"]
 
-    @staticmethod
-    def _bracket_level(s: str, opening="{", closing="}") -> int:
-        """
-        Calculate the current level of nested brackets in a given string.
-        """
-        level = 0
-        for char in s:
-            if char == opening:
-                level += 1
-            elif char == closing:
-                level -= 1
-        return level
-
-    @staticmethod
-    def _filter_delta_text(delta_text: str, previous_text: str) -> tuple[str, bool]:
-        # remove last '},' of the tool definition stemming from the
-        # "name"/"parameters" outer object or closing ']' of the tool list
-        # count occurrences of opening and closing curly braces and
-        # once level 0 is reached stop outputting text
-        # if 0 is reached while parsing the delta_text we know the current
-        # tool will finish in this current iteration
-        bracket_level = OpenAIServingChat._bracket_level(previous_text)
-        updated_delta, passed_zero = "", False
-        for c in delta_text:
-            if c == "{":
-                bracket_level += 1
-                passed_zero = bracket_level == 0
-            elif c == "}":
-                bracket_level -= 1
-                passed_zero = bracket_level == 0
-
-            if bracket_level != 0:
-                updated_delta += c
-            else:
-                # if a comma is reached at level 0 we can stop
-                if c == ",":
-                    break
-        return updated_delta, passed_zero
-
     def extract_tool_call_required_streaming(
         self,
         previous_text: str,
@@ -407,87 +373,14 @@ def extract_tool_call_required_streaming(
         function_name_returned: bool,
         tool_call_idx: int | None = None,
     ) -> tuple[DeltaMessage | None, bool]:
-        if current_text is None or current_text == "":
-            # if the current text is empty, we cannot parse it
-            return None, function_name_returned
-        try:
-            flags = Allow.ALL
-            obj, _ = partial_json_loads(current_text, flags)
-        except (
-            partial_json_parser.core.exceptions.MalformedJSON,
-            json.JSONDecodeError,
-        ):
-            logger.debug("not enough tokens to parse into JSON yet")
-            obj = None
-
-        # check if the current text is a valid array
-        # containing a partial tool calling object
-        # if not repeat
-        if obj is None or not isinstance(obj, list) or not len(obj) > 0:
-            function_name_returned = False
-            delta_message = None
-        else:
-            _, finishes_previous_tool = OpenAIServingChat._filter_delta_text(delta_text, previous_text)
-            # take the last tool call from the generated list
-            current_tool_call = obj[-1]
-
-            # once parameters have been generated the name is complete as well
-            if not finishes_previous_tool and (
-                "name" not in current_tool_call or "parameters" not in current_tool_call
-            ):
-                function_name_returned = False
-                delta_message = None
-            else:
-                if not function_name_returned:
-                    # get partly generated arguments from the latest tool call
-                    param_match = re.search(r'.*"parameters":\s*(.*)', current_text, re.DOTALL)
-                    arguments = param_match.group(1) if param_match else ""
-                    arguments, _ = OpenAIServingChat._filter_delta_text(arguments, previous_text)
-
-                    # if this iteration finishes a previous tool call but a
-                    # new incomplete tool is already generated, take the
-                    # previous from the list
-                    if finishes_previous_tool and "parameters" not in current_tool_call:
-                        current_tool_call = obj[-2]
-
-                    function_name_returned = True
-                    tool_call_id = make_tool_call_id(
-                        id_type=self.tool_call_id_type,
-                        func_name=current_tool_call["name"],
-                        idx=tool_call_idx,
-                    )
-                    delta_message = DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                id=tool_call_id,
-                                function=DeltaFunctionCall(name=current_tool_call["name"], arguments=arguments),
-                                index=len(obj) - 1,
-                                type="function",
-                            )
-                        ]
-                    )
-
-                else:
-                    delta_text, _ = OpenAIServingChat._filter_delta_text(delta_text, previous_text)
-
-                    if delta_text != "":
-                        delta_message = DeltaMessage(
-                            tool_calls=[
-                                DeltaToolCall(
-                                    function=DeltaFunctionCall(
-                                        # OpenAI API returns None
-                                        # instead of name every time
-                                        name=None,
-                                        arguments=delta_text,
-                                    ),
-                                    index=len(obj) - 1,
-                                )
-                            ]
-                        )
-                    else:
-                        delta_message = None
-
-        return delta_message, function_name_returned
+        return extract_required_tool_call_streaming(
+            previous_text=previous_text,
+            current_text=current_text,
+            delta_text=delta_text,
+            function_name_returned=function_name_returned,
+            tool_call_idx=tool_call_idx,
+            tool_call_id_type=self.tool_call_id_type,
+        )
 
     async def chat_completion_stream_generator(
         self,
@@ -813,43 +706,22 @@ async def chat_completion_stream_generator(
                                 delta_text = previous_text + delta_text
                                 current_text = ""
 
-                            if function_name_returned[i]:
-                                delta_tool_call = DeltaToolCall(
-                                    function=DeltaFunctionCall(arguments=delta_text),
-                                    index=i,
-                                )
-                            else:
-                                # Generate ID based on tokenizer type
-                                if is_mistral_tokenizer(tokenizer):
-                                    from aphrodite.tool_parsers.mistral_tool_parser import (
-                                        MistralToolCall,
-                                    )
-
-                                    tool_call_id = MistralToolCall.generate_random_id()
-                                else:
-                                    tool_call_id = make_tool_call_id(
-                                        id_type=self.tool_call_id_type,
-                                        func_name=tool_choice_function_name,
-                                        idx=history_tool_call_cnt,
-                                    )
-                                delta_tool_call = DeltaToolCall(
-                                    id=tool_call_id,
-                                    type="function",
-                                    function=DeltaFunctionCall(
-                                        name=tool_choice_function_name,
-                                        arguments=delta_text,
-                                    ),
-                                    index=i,
-                                )
-                                function_name_returned[i] = True
-                                history_tool_call_cnt += 1
-
-                            delta_message = DeltaMessage(
-                                tool_calls=[
-                                    delta_tool_call,
-                                ]
+                            delta_message, function_name_returned[i] = extract_named_tool_call_streaming(
+                                delta_text=delta_text,
+                                function_name=tool_choice_function_name,
+                                function_name_returned=function_name_returned[i],
+                                tool_call_idx=history_tool_call_cnt,
+                                tool_call_id_type=self.tool_call_id_type,
+                                tokenizer=tokenizer,
+                                tool_call_array_index=i,
                             )
-                            tools_streamed[i] = True
+                            if (
+                                delta_message
+                                and delta_message.tool_calls
+                                and delta_message.tool_calls[0].id is not None
+                            ):
+                                history_tool_call_cnt += 1
+                                tools_streamed[i] = True
 
                     # Skip when tool_choice_uses_parser so it falls through
                     # to the auto tool_parser branches below.
@@ -1067,6 +939,16 @@ async def chat_completion_stream_generator(
                         choices=[choice_data],
                         model=model_name,
                     )
+                    # Stamp the fingerprint on terminal chunks only (those with
+                    # finish_reason set). When ``include_usage`` is on, skip it
+                    # here: the trailing usage chunk below is the true final
+                    # message and carries the fingerprint instead.
+                    if (
+                        not include_usage
+                        and self.system_fingerprint is not None
+                        and choice_data.finish_reason is not None
+                    ):
+                        chunk.system_fingerprint = self.system_fingerprint
 
                     # handle usage stats if requested & if continuous
                     if include_continuous_usage:
@@ -1099,6 +981,7 @@ async def chat_completion_stream_generator(
                     choices=[],
                     model=model_name,
                     usage=final_usage,
+                    system_fingerprint=self.system_fingerprint,
                 )
                 final_usage_data = final_usage_chunk.model_dump_json(exclude_unset=True, exclude_none=True)
                 yield f"data: {final_usage_data}\n\n"
@@ -1466,6 +1349,7 @@ async def chat_completion_full_generator(
             model=model_name,
             choices=choices,
             usage=usage,
+            system_fingerprint=self.system_fingerprint,
             prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),
             prompt_token_ids=(final_res.prompt_token_ids if request.return_token_ids else None),
             kv_transfer_params=final_res.kv_transfer_params,
diff --git a/aphrodite/entrypoints/openai/cli_args.py b/aphrodite/entrypoints/openai/cli_args.py
index 25c3bf2d3e..dee890b844 100644
--- a/aphrodite/entrypoints/openai/cli_args.py
+++ b/aphrodite/entrypoints/openai/cli_args.py
@@ -151,9 +151,20 @@ class BaseFrontendArgs:
     """If set to True, log the stack trace of error responses"""
     tokens_only: bool = False
     """
-    If set to True, only enable the Tokens In<>Out endpoint. 
+    If set to True, only enable the Tokens In<>Out endpoint.
     This is intended for use in a Disaggregated Everything setup.
     """
+    fingerprint_mode: Literal["full", "hash", "custom", "none"] = "full"
+    """Controls the ``system_fingerprint`` field on responses.
+    - ``full`` (default): ``aphrodite-<version>[-<parallelism>]-<hash8>``. Encodes
+      server version, non-trivial parallelism degrees (tp/pp/dp/ep), and an
+      8-char config hash.
+    - ``hash``: ``aphrodite-<version>-<hash8>``. Parallelism stripped.
+    - ``custom``: emits the literal string from ``--fingerprint-value``.
+    - ``none``: the field is omitted (serialized as ``null``).
+    """
+    fingerprint_value: str | None = None
+    """Literal fingerprint string used when ``--fingerprint-mode=custom``."""
 
     @classmethod
     def _customize_cli_kwargs(
diff --git a/aphrodite/entrypoints/openai/completion/protocol.py b/aphrodite/entrypoints/openai/completion/protocol.py
index 32ada8dd4a..eb09891920 100644
--- a/aphrodite/entrypoints/openai/completion/protocol.py
+++ b/aphrodite/entrypoints/openai/completion/protocol.py
@@ -556,3 +556,6 @@ class CompletionStreamResponse(OpenAIBaseModel):
     model: str
     choices: list[CompletionResponseStreamChoice]
     usage: UsageInfo | None = Field(default=None)
+    # Set only on the final chunk of a stream to mirror non-streaming responses
+    # without the per-chunk serialization overhead.
+    system_fingerprint: str | None = None
diff --git a/aphrodite/entrypoints/openai/completion/serving.py b/aphrodite/entrypoints/openai/completion/serving.py
index 5580b58191..c3a8b081e2 100644
--- a/aphrodite/entrypoints/openai/completion/serving.py
+++ b/aphrodite/entrypoints/openai/completion/serving.py
@@ -369,6 +369,7 @@ async def completion_stream_generator(
 
                     chunk = CompletionStreamResponse(
                         id=request_id,
+                        object="text_completion",
                         created=created_time,
                         model=model_name,
                         choices=[
@@ -383,6 +384,10 @@ async def completion_stream_generator(
                             )
                         ],
                     )
+                    # Stamp on terminal chunk only when no trailing usage chunk
+                    # will follow (that one is the true final message).
+                    if not include_usage and self.system_fingerprint is not None and finish_reason is not None:
+                        chunk.system_fingerprint = self.system_fingerprint
                     if include_continuous_usage:
                         prompt_tokens = num_prompt_tokens[prompt_idx]
                         completion_tokens = previous_num_tokens[i]
@@ -392,7 +397,7 @@ async def completion_stream_generator(
                             total_tokens=prompt_tokens + completion_tokens,
                         )
 
-                    response_json = chunk.model_dump_json(exclude_unset=False)
+                    response_json = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
 
             total_prompt_tokens = sum(num_prompt_tokens)
@@ -413,6 +418,7 @@ async def completion_stream_generator(
                     model=model_name,
                     choices=[],
                     usage=final_usage_info,
+                    system_fingerprint=self.system_fingerprint,
                 )
                 final_usage_data = final_usage_chunk.model_dump_json(exclude_unset=False, exclude_none=True)
                 yield f"data: {final_usage_data}\n\n"
@@ -530,6 +536,7 @@ def request_output_to_completion_response(
             model=model_name,
             choices=choices,
             usage=usage,
+            system_fingerprint=self.system_fingerprint,
             kv_transfer_params=kv_transfer_params,
         )
 
diff --git a/aphrodite/entrypoints/openai/engine/protocol.py b/aphrodite/entrypoints/openai/engine/protocol.py
index 42e39df698..42bd3fe029 100644
--- a/aphrodite/entrypoints/openai/engine/protocol.py
+++ b/aphrodite/entrypoints/openai/engine/protocol.py
@@ -12,6 +12,7 @@
     BaseModel,
     ConfigDict,
     Field,
+    model_serializer,
     model_validator,
 )
 
@@ -162,6 +163,14 @@ class FunctionDefinition(OpenAIBaseModel):
     name: str
     description: str | None = None
     parameters: dict[str, Any] | None = None
+    defer_loading: bool | None = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.defer_loading is None:
+            data.pop("defer_loading", None)
+        return data
 
 
 # extra="forbid" is a workaround to have kwargs as a field,
diff --git a/aphrodite/entrypoints/openai/engine/serving.py b/aphrodite/entrypoints/openai/engine/serving.py
index 477aa47f37..8d995f03c2 100644
--- a/aphrodite/entrypoints/openai/engine/serving.py
+++ b/aphrodite/entrypoints/openai/engine/serving.py
@@ -148,6 +148,17 @@ def __init__(
         self.renderer = engine_client.renderer
         self.input_processor = engine_client.input_processor
 
+        # Computed once at startup (cached by ``aphrodite_config`` identity) and
+        # stamped on non-streaming responses and on the final chunk of a stream.
+        # Intermediate streaming chunks omit it to avoid per-chunk overhead.
+        from aphrodite.entrypoints.openai.fingerprint import get_system_fingerprint
+
+        try:
+            self.system_fingerprint: str | None = get_system_fingerprint(engine_client.aphrodite_config)
+        except Exception:
+            # Never fail server startup over the fingerprint.
+            self.system_fingerprint = None
+
     async def beam_search(
         self,
         prompt: EngineInput,
@@ -692,6 +703,8 @@ def _get_decoded_token(
     def _is_model_supported(self, model_name: str | None) -> bool:
         if not model_name:
             return True
+        if envs.APHRODITE_SKIP_MODEL_NAME_VALIDATION:
+            return True
         return self.models.is_base_model(model_name)
 
 
diff --git a/aphrodite/entrypoints/openai/fingerprint.py b/aphrodite/entrypoints/openai/fingerprint.py
new file mode 100644
index 0000000000..b8aea5c149
--- /dev/null
+++ b/aphrodite/entrypoints/openai/fingerprint.py
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Build the ``system_fingerprint`` string returned by the OpenAI-compatible
+server.
+Four modes, configured via ``--fingerprint-mode``:
+* ``full`` (default): ``aphrodite-<version>[-<parallelism>]-<hash8>`` — encodes
+  server version, any non-trivial parallelism degree (tp/pp/dp/ep), and an
+  8-char prefix of ``aphrodite_config.compute_hash()`` (covers model identity,
+  quant config, speculative, attention backend, etc.).
+* ``hash``: ``aphrodite-<version>-<hash8>`` — parallelism stripped.
+* ``custom``: user-provided literal via ``--fingerprint-value``.
+* ``none``: no fingerprint is produced (the field is serialized as ``null``).
+``get_system_fingerprint`` is only called at serving-class init (a handful
+of times per server); each subclass caches the returned string on
+``self.system_fingerprint``, so per-request cost is one attribute read.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+FingerprintMode = Literal["full", "hash", "custom", "none"]
+
+_DEFAULT_MODE: FingerprintMode = "full"
+_CUSTOM_VALUE: str | None = None
+
+
+def set_default_fingerprint_mode(
+    mode: FingerprintMode,
+    custom_value: str | None = None,
+) -> None:
+    """Configure the fingerprint mode for subsequent ``get_system_fingerprint``
+    calls. Called once at server startup."""
+    global _DEFAULT_MODE, _CUSTOM_VALUE
+    _DEFAULT_MODE = mode
+    _CUSTOM_VALUE = custom_value
+
+
+def get_system_fingerprint(aphrodite_config: Any) -> str | None:
+    """Return the fingerprint for ``aphrodite_config`` using the mode configured by
+    ``set_default_fingerprint_mode``."""
+    return build_system_fingerprint(aphrodite_config, _DEFAULT_MODE, _CUSTOM_VALUE)
+
+
+def build_system_fingerprint(
+    aphrodite_config: Any,
+    mode: FingerprintMode = "full",
+    custom_value: str | None = None,
+) -> str | None:
+    if mode == "none":
+        return None
+    if mode == "custom":
+        return custom_value
+
+    from aphrodite import __version__ as aphrodite_version
+
+    try:
+        hash8 = aphrodite_config.compute_hash()[:8]
+    except Exception:
+        hash8 = "nohash"
+
+    if mode == "hash":
+        return f"aphrodite-{aphrodite_version}-{hash8}"
+
+    # mode == "full"
+    parts: list[str] = [f"aphrodite-{aphrodite_version}"]
+    pc = getattr(aphrodite_config, "parallel_config", None)
+    if pc is not None:
+        tp = getattr(pc, "tensor_parallel_size", 1)
+        if tp > 1:
+            parts.append(f"tp{tp}")
+        pp = getattr(pc, "pipeline_parallel_size", 1)
+        if pp > 1:
+            parts.append(f"pp{pp}")
+        dp = getattr(pc, "data_parallel_size", 1)
+        if dp > 1:
+            parts.append(f"dp{dp}")
+        if getattr(pc, "enable_expert_parallel", False):
+            parts.append("ep")
+    parts.append(hash8)
+    return "-".join(parts)
diff --git a/aphrodite/entrypoints/openai/generate/api_router.py b/aphrodite/entrypoints/openai/generate/api_router.py
index ab16fbdc49..104d446cab 100644
--- a/aphrodite/entrypoints/openai/generate/api_router.py
+++ b/aphrodite/entrypoints/openai/generate/api_router.py
@@ -67,10 +67,18 @@ async def init_generate_state(
     )
     from aphrodite.entrypoints.openai.chat_completion.serving import OpenAIServingChat
     from aphrodite.entrypoints.openai.completion.serving import OpenAIServingCompletion
+    from aphrodite.entrypoints.openai.fingerprint import set_default_fingerprint_mode
     from aphrodite.entrypoints.openai.kobold.serving import OpenAIServingKobold
     from aphrodite.entrypoints.openai.responses.serving import OpenAIServingResponses
     from aphrodite.entrypoints.serve.disagg.serving import ServingTokens
 
+    # Applied before any serving class is constructed so that each one picks
+    # up the chosen mode when it computes its fingerprint at init.
+    set_default_fingerprint_mode(
+        getattr(args, "fingerprint_mode", "full"),
+        getattr(args, "fingerprint_value", None),
+    )
+
     if args.tool_server == "demo":
         tool_server: ToolServer | None = DemoToolServer()
         assert isinstance(tool_server, DemoToolServer)
diff --git a/aphrodite/entrypoints/openai/parser/harmony_utils.py b/aphrodite/entrypoints/openai/parser/harmony_utils.py
index fa751803b9..c9464c2c36 100644
--- a/aphrodite/entrypoints/openai/parser/harmony_utils.py
+++ b/aphrodite/entrypoints/openai/parser/harmony_utils.py
@@ -3,7 +3,6 @@
 
 import datetime
 from collections.abc import Iterable, Sequence
-from typing import Literal
 
 from openai.types.responses.tool import Tool
 from openai_harmony import (
@@ -66,7 +65,7 @@ def get_encoding():
 
 def get_system_message(
     model_identity: str | None = None,
-    reasoning_effort: Literal["high", "medium", "low"] | None = None,
+    reasoning_effort: str | None = None,
     start_date: str | None = None,
     browser_description: str | None = None,
     python_description: str | None = None,
@@ -82,6 +81,12 @@ def get_system_message(
         new_identity = f"{current_identity}\n{instructions}" if current_identity else instructions
         sys_msg_content = sys_msg_content.with_model_identity(new_identity)
     if reasoning_effort is not None:
+        if reasoning_effort not in REASONING_EFFORT:
+            supported_values = ", ".join(REASONING_EFFORT)
+            raise ValueError(
+                f"reasoning_effort={reasoning_effort!r} is not supported by "
+                f"Harmony. Supported values are: {supported_values}."
+            )
         sys_msg_content = sys_msg_content.with_reasoning_effort(REASONING_EFFORT[reasoning_effort])
     if start_date is None:
         # NOTE(woosuk): This brings non-determinism in Aphrodite.
diff --git a/aphrodite/entrypoints/openai/responses/serving.py b/aphrodite/entrypoints/openai/responses/serving.py
index 95ecabeac7..2c71e97ff1 100644
--- a/aphrodite/entrypoints/openai/responses/serving.py
+++ b/aphrodite/entrypoints/openai/responses/serving.py
@@ -3,7 +3,6 @@
 
 import asyncio
 import time
-import uuid
 from collections import deque
 from collections.abc import AsyncGenerator, AsyncIterator, Callable, Mapping, Sequence
 from contextlib import AsyncExitStack
@@ -13,29 +12,14 @@
 
 from fastapi import Request
 from openai.types.responses import (
-    ResponseContentPartAddedEvent,
-    ResponseContentPartDoneEvent,
-    ResponseFunctionCallArgumentsDeltaEvent,
-    ResponseFunctionCallArgumentsDoneEvent,
     ResponseFunctionToolCall,
-    ResponseFunctionToolCallItem,
     ResponseOutputItem,
-    ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent,
     ResponseOutputMessage,
     ResponseOutputText,
-    ResponseReasoningItem,
-    ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent,
     ResponseStatus,
-    ResponseTextDeltaEvent,
-    ResponseTextDoneEvent,
     response_text_delta_event,
 )
 from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob
-from openai.types.responses.response_reasoning_item import (
-    Content as ResponseReasoningTextContent,
-)
 from openai.types.responses.tool import Mcp, Tool
 from openai_harmony import Message as OpenAIHarmonyMessage
 from pydantic import TypeAdapter
@@ -89,15 +73,15 @@
     ResponseInProgressEvent,
     ResponseInputOutputItem,
     ResponseInputOutputMessage,
-    ResponseReasoningPartAddedEvent,
-    ResponseReasoningPartDoneEvent,
     ResponsesRequest,
     ResponsesResponse,
     ResponseUsage,
     StreamingResponsesResponse,
 )
 from aphrodite.entrypoints.openai.responses.streaming_events import (
+    SimpleStreamingEventProcessor,
     StreamingState,
+    _StateType,
     emit_content_delta_events,
     emit_previous_item_done_events,
     emit_tool_action_events,
@@ -435,9 +419,13 @@ async def create_responses(
                     context = SimpleContext()
 
             if self.parser and self.parser.reasoning_parser_cls is not None:
+                chat_template_kwargs = self._effective_chat_template_kwargs(request)
+                reasoning_parser_kwargs = {
+                    "chat_template_kwargs": chat_template_kwargs,
+                }
                 reasoning_parser = self.parser.reasoning_parser_cls(
                     tokenizer,
-                    chat_template_kwargs=self._effective_chat_template_kwargs(request),
+                    chat_template_kwargs=chat_template_kwargs,
                 )
                 if (
                     isinstance(
@@ -446,7 +434,7 @@ async def create_responses(
                     )
                     and struct_out.all_non_structural_tag_constraints_none()
                 ):
-                    sampling_params.structured_outputs = replace(  # type: ignore[type-var]
+                    sampling_params.structured_outputs = replace(
                         struct_out,
                         structural_tag=reasoning_parser.prepare_structured_tag(
                             struct_out.structural_tag, self.tool_server
@@ -460,6 +448,9 @@ async def create_responses(
                 lora_request=lora_request,
                 priority=request.priority,
                 trace_headers=trace_headers,
+                reasoning_parser_kwargs=reasoning_parser_kwargs
+                if self.parser and self.parser.reasoning_parser_cls is not None
+                else None,
             )
             generators.append(generator)
 
@@ -604,6 +595,7 @@ async def _generate_with_builtin_tools(
         lora_request: LoRARequest | None = None,
         priority: int = 0,
         trace_headers: Mapping[str, str] | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ):
         max_model_len = self.model_config.max_model_len
 
@@ -627,6 +619,7 @@ async def _generate_with_builtin_tools(
                 lora_request=lora_request,
                 trace_headers=trace_headers,
                 priority=priority,
+                reasoning_parser_kwargs=reasoning_parser_kwargs,
             )
 
             async for res in generator:
@@ -1265,546 +1258,59 @@ async def _process_simple_streaming_events(
         created_time: int,
         _increment_sequence_number_and_return: Callable[[StreamingResponsesResponse], StreamingResponsesResponse],
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
-        current_content_index = 0
-        current_output_index = 0
-        current_item_id = ""
-        current_tool_call_index: int | None = None
+        processor = SimpleStreamingEventProcessor()
         parser = self.parser(tokenizer, request.tools) if self.parser else None
-        first_delta_sent = False
-        previous_delta_messages: list[DeltaMessage] = []
+
+        def _get_logprobs(
+            output: CompletionOutput,
+        ) -> list[response_text_delta_event.Logprob]:
+            if not request.is_include_output_logprobs():
+                return []
+            return self._create_stream_response_logprobs(
+                token_ids=output.token_ids,
+                logprobs=output.logprobs,
+                tokenizer=tokenizer,
+                top_logprobs=request.top_logprobs,
+            )
+
         async for ctx in result_generator:
             assert isinstance(ctx, SimpleContext)
-            if ctx.last_output is None:
+            if ctx.last_output is None or not ctx.last_output.outputs:
                 continue
-            if ctx.last_output.outputs:
-                output = ctx.last_output.outputs[0]
-                # finish_reason='error' indicates a retryable error
-                self._raise_if_error(output.finish_reason, request.request_id)
-                delta_text = output.text
-                delta_token_ids = as_list(output.token_ids)
-
-                if parser:
-                    delta_message = parser.parse_delta(
-                        delta_text=delta_text,
-                        delta_token_ids=delta_token_ids,
-                        request=request,
-                        prompt_token_ids=ctx.last_output.prompt_token_ids,
-                    )
-                else:
-                    delta_message = DeltaMessage(
-                        content=output.text,
-                    )
-                if not delta_message:
-                    continue
-                tool_call_item_started = False
-                if not first_delta_sent:
-                    current_item_id = random_uuid()
-                    if delta_message.tool_calls:
-                        current_tool_call_id = f"call_{random_uuid()}"
-                        assert len(delta_message.tool_calls) == 1, "Multiple tool calls in one delta is not supported"
-                        assert delta_message.tool_calls[0].function is not None, (
-                            "Tool call without function is not supported"
-                        )
-                        assert delta_message.tool_calls[0].function.name is not None, (
-                            "Tool call without function name is not supported"
-                        )
-                        current_tool_call_name = delta_message.tool_calls[0].function.name
-                        current_tool_call_index = delta_message.tool_calls[0].index
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseFunctionToolCallItem(
-                                    type="function_call",
-                                    id=current_item_id,
-                                    call_id=current_tool_call_id,
-                                    name=current_tool_call_name,
-                                    arguments="",
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        tool_call_item_started = True
-                    elif delta_message.reasoning:
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseReasoningItem(
-                                    type="reasoning",
-                                    id=current_item_id,
-                                    summary=[],
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseReasoningPartAddedEvent(
-                                type="response.reasoning_part.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                content_index=current_content_index,
-                                part=ResponseReasoningTextContent(
-                                    text="",
-                                    type="reasoning_text",
-                                ),
-                            )
-                        )
-                    elif not delta_message.tool_calls:
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseOutputMessage(
-                                    id=current_item_id,
-                                    type="message",
-                                    role="assistant",
-                                    content=[],
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseContentPartAddedEvent(
-                                type="response.content_part.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                content_index=current_content_index,
-                                part=ResponseOutputText(
-                                    type="output_text",
-                                    text="",
-                                    annotations=[],
-                                    logprobs=[],
-                                ),
-                            )
-                        )
-                    first_delta_sent = True
-
-                # check delta message and previous delta message are
-                # same as content or reasoning content
-                if (
-                    previous_delta_messages
-                    and previous_delta_messages[-1].reasoning is not None
-                    and delta_message.content is not None
-                ):
-                    # from reasoning to normal content, send done
-                    # event for reasoning
-                    reason_content = "".join(pm.reasoning for pm in previous_delta_messages if pm.reasoning is not None)
-
-                    # delta message could have both reasoning and
-                    # content. Include current delta's reasoning in the
-                    # finalization since it may carry the tail end of
-                    # reasoning text (e.g. when reasoning end and
-                    # content start arrive in the same delta).
-                    if delta_message.reasoning is not None:
-                        yield _increment_sequence_number_and_return(
-                            ResponseReasoningTextDeltaEvent(
-                                type="response.reasoning_text.delta",
-                                sequence_number=-1,
-                                content_index=current_content_index,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                delta=delta_message.reasoning,
-                            )
-                        )
-                        reason_content += delta_message.reasoning
-                        delta_message = DeltaMessage(content=delta_message.content)
-
-                    yield _increment_sequence_number_and_return(
-                        ResponseReasoningTextDoneEvent(
-                            type="response.reasoning_text.done",
-                            item_id=current_item_id,
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            content_index=current_content_index,
-                            text=reason_content,
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseReasoningPartDoneEvent(
-                            type="response.reasoning_part.done",
-                            sequence_number=-1,
-                            item_id=current_item_id,
-                            output_index=current_output_index,
-                            content_index=current_content_index,
-                            part=ResponseReasoningTextContent(
-                                text=reason_content,
-                                type="reasoning_text",
-                            ),
-                        )
-                    )
-                    current_content_index = 0
-                    reasoning_item = ResponseReasoningItem(
-                        type="reasoning",
-                        content=[
-                            ResponseReasoningTextContent(
-                                text=reason_content,
-                                type="reasoning_text",
-                            ),
-                        ],
-                        status="completed",
-                        id=current_item_id,
-                        summary=[],
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseOutputItemDoneEvent(
-                            type="response.output_item.done",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=reasoning_item,
-                        )
-                    )
-                    current_output_index += 1
-                    current_item_id = str(uuid.uuid4())
-                    yield _increment_sequence_number_and_return(
-                        ResponseOutputItemAddedEvent(
-                            type="response.output_item.added",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item=ResponseOutputMessage(
-                                id=current_item_id,
-                                type="message",
-                                role="assistant",
-                                content=[],
-                                status="in_progress",
-                            ),
-                        )
-                    )
-                    yield _increment_sequence_number_and_return(
-                        ResponseContentPartAddedEvent(
-                            type="response.content_part.added",
-                            sequence_number=-1,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            content_index=current_content_index,
-                            part=ResponseOutputText(
-                                type="output_text",
-                                text="",
-                                annotations=[],
-                                logprobs=[],
-                            ),
-                        )
-                    )
-                    # reset previous delta messages
-                    previous_delta_messages = []
-                if delta_message.tool_calls and delta_message.tool_calls[0].function:
-                    tool_call = delta_message.tool_calls[0]
-                    tool_call_function = tool_call.function
-                    if (
-                        current_tool_call_index is not None
-                        and tool_call.index is not None
-                        and tool_call.index != current_tool_call_index
-                        and tool_call_function is not None
-                        and tool_call_function.name is not None
-                    ):
-                        # From one tool call to another, finalize the previous
-                        # function-call item before opening the next one.
-                        parts = []
-                        for pm in previous_delta_messages:
-                            if pm.tool_calls:
-                                previous_tool_call = pm.tool_calls[0]
-                                if previous_tool_call.function is not None:
-                                    parts.append(previous_tool_call.function.arguments or "")
-
-                        tool_call_arguments = "".join(parts)
-                        yield _increment_sequence_number_and_return(
-                            ResponseFunctionCallArgumentsDoneEvent(
-                                type="response.function_call_arguments.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                arguments=tool_call_arguments,
-                                name=current_tool_call_name,
-                            )
-                        )
-                        function_call_item = ResponseFunctionToolCall(
-                            type="function_call",
-                            name=current_tool_call_name,
-                            arguments=tool_call_arguments,
-                            status="completed",
-                            id=current_item_id,
-                            call_id=current_tool_call_id,
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemDoneEvent(
-                                type="response.output_item.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=function_call_item,
-                            )
-                        )
-                        # Reset previous delta messages so the next tool call
-                        # does not reuse arguments from the completed item.
-                        previous_delta_messages = []
-                        current_output_index += 1
-                        current_item_id = random_uuid()
-                        current_tool_call_name = tool_call_function.name
-                        current_tool_call_id = f"call_{random_uuid()}"
-                        current_tool_call_index = tool_call.index
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseFunctionToolCallItem(
-                                    type="function_call",
-                                    id=current_item_id,
-                                    call_id=current_tool_call_id,
-                                    name=current_tool_call_name,
-                                    arguments="",
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        current_content_index = 0
-                        tool_call_item_started = True
-
-                    if delta_message.tool_calls[0].function.arguments:
-                        yield _increment_sequence_number_and_return(
-                            ResponseFunctionCallArgumentsDeltaEvent(
-                                type="response.function_call_arguments.delta",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item_id=current_item_id,
-                                delta=delta_message.tool_calls[0].function.arguments,
-                            )
-                        )
-                    # tool call initiated with no arguments
-                    elif delta_message.tool_calls[0].function.name and not tool_call_item_started:
-                        # send done with current content part
-                        # and add new function call item
-                        yield _increment_sequence_number_and_return(
-                            ResponseTextDoneEvent(
-                                type="response.output_text.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                text="",
-                                logprobs=[],
-                                item_id=current_item_id,
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseContentPartDoneEvent(
-                                type="response.content_part.done",
-                                sequence_number=-1,
-                                item_id=current_item_id,
-                                output_index=current_output_index,
-                                content_index=current_content_index,
-                                part=ResponseOutputText(
-                                    type="output_text",
-                                    text="",
-                                    annotations=[],
-                                    logprobs=[],
-                                ),
-                            )
-                        )
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemDoneEvent(
-                                type="response.output_item.done",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseOutputMessage(
-                                    id=current_item_id,
-                                    type="message",
-                                    role="assistant",
-                                    content=[],
-                                    status="completed",
-                                ),
-                            )
-                        )
-                        current_output_index += 1
-                        current_item_id = random_uuid()
-                        current_tool_call_name = delta_message.tool_calls[0].function.name
-                        current_tool_call_id = f"call_{random_uuid()}"
-                        current_tool_call_index = delta_message.tool_calls[0].index
-                        yield _increment_sequence_number_and_return(
-                            ResponseOutputItemAddedEvent(
-                                type="response.output_item.added",
-                                sequence_number=-1,
-                                output_index=current_output_index,
-                                item=ResponseFunctionToolCallItem(
-                                    type="function_call",
-                                    id=current_item_id,
-                                    call_id=current_tool_call_id,
-                                    name=current_tool_call_name,
-                                    arguments="",
-                                    status="in_progress",
-                                ),
-                            )
-                        )
-                        # skip content part for tool call
-                        current_content_index = 1
-                        continue
-                elif delta_message.reasoning is not None:
-                    yield _increment_sequence_number_and_return(
-                        ResponseReasoningTextDeltaEvent(
-                            type="response.reasoning_text.delta",
-                            sequence_number=-1,
-                            content_index=current_content_index,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            delta=delta_message.reasoning,
-                        )
-                    )
-                elif delta_message.content:
-                    yield _increment_sequence_number_and_return(
-                        ResponseTextDeltaEvent(
-                            type="response.output_text.delta",
-                            sequence_number=-1,
-                            content_index=current_content_index,
-                            output_index=current_output_index,
-                            item_id=current_item_id,
-                            delta=delta_message.content,
-                            logprobs=(
-                                self._create_stream_response_logprobs(
-                                    token_ids=output.token_ids,
-                                    logprobs=output.logprobs,
-                                    tokenizer=tokenizer,
-                                    top_logprobs=request.top_logprobs,
-                                )
-                                if request.is_include_output_logprobs()
-                                else []
-                            ),
-                        )
-                    )
 
-                previous_delta_messages.append(delta_message)
+            output = ctx.last_output.outputs[0]
+            self._raise_if_error(output.finish_reason, request.request_id)
+            delta_text = output.text
+            delta_token_ids = as_list(output.token_ids)
+
+            if parser:
+                delta_message = parser.parse_delta(
+                    delta_text=delta_text,
+                    delta_token_ids=delta_token_ids,
+                    request=request,
+                    prompt_token_ids=ctx.last_output.prompt_token_ids,
+                )
+            else:
+                delta_message = DeltaMessage(content=output.text)
 
-        if previous_delta_messages:
-            parts = []
-            for pm in previous_delta_messages:
-                if pm.tool_calls:
-                    assert len(pm.tool_calls) == 1, "Multiple tool calls in one delta is not supported"
-                    assert pm.tool_calls[0].function is not None, "Tool call without function is not supported"
-                    parts.append(pm.tool_calls[0].function.arguments or "")
+            if not delta_message:
+                continue
 
-            tool_call_arguments = "".join(parts)
-            if tool_call_arguments:
-                yield _increment_sequence_number_and_return(
-                    ResponseFunctionCallArgumentsDoneEvent(
-                        type="response.function_call_arguments.done",
-                        sequence_number=-1,
-                        output_index=current_output_index,
-                        item_id=current_item_id,
-                        arguments=tool_call_arguments,
-                        name=current_tool_call_name,
-                    )
-                )
-                current_content_index = 0
-                function_call_item = ResponseFunctionToolCall(
-                    type="function_call",
-                    name=current_tool_call_name,
-                    arguments=tool_call_arguments,
-                    status="completed",
-                    id=current_item_id,
-                    call_id=current_tool_call_id,
-                )
-                yield _increment_sequence_number_and_return(
-                    ResponseOutputItemDoneEvent(
-                        type="response.output_item.done",
-                        sequence_number=-1,
-                        output_index=current_output_index,
-                        item=function_call_item,
-                    )
-                )
+            target_state, tool_call = processor.resolve_target_state(delta_message)
+            if target_state == _StateType.NONE:
+                continue
 
-            elif previous_delta_messages[-1].reasoning is not None:
-                reason_content = "".join(pm.reasoning for pm in previous_delta_messages if pm.reasoning is not None)
-                yield _increment_sequence_number_and_return(
-                    ResponseReasoningTextDoneEvent(
-                        type="response.reasoning_text.done",
-                        item_id=current_item_id,
-                        sequence_number=-1,
-                        output_index=current_output_index,
-                        content_index=current_content_index,
-                        text=reason_content,
-                    )
-                )
-                yield _increment_sequence_number_and_return(
-                    ResponseReasoningPartDoneEvent(
-                        type="response.reasoning_part.done",
-                        sequence_number=-1,
-                        item_id=current_item_id,
-                        output_index=current_output_index,
-                        content_index=current_content_index,
-                        part=ResponseReasoningTextContent(
-                            text=reason_content,
-                            type="reasoning_text",
-                        ),
-                    )
-                )
-                reasoning_item = ResponseReasoningItem(
-                    type="reasoning",
-                    content=[
-                        ResponseReasoningTextContent(
-                            text=reason_content,
-                            type="reasoning_text",
-                        ),
-                    ],
-                    status="completed",
-                    id=current_item_id,
-                    summary=[],
-                )
-                yield _increment_sequence_number_and_return(
-                    ResponseOutputItemDoneEvent(
-                        type="response.output_item.done",
-                        sequence_number=-1,
-                        output_index=current_output_index,
-                        item=reasoning_item,
-                    )
-                )
-            elif previous_delta_messages[-1].content:
-                final_content = "".join(pm.content for pm in previous_delta_messages if pm.content)
-                yield _increment_sequence_number_and_return(
-                    ResponseTextDoneEvent(
-                        type="response.output_text.done",
-                        sequence_number=-1,
-                        output_index=current_output_index,
-                        content_index=current_content_index,
-                        text=final_content,
-                        logprobs=[],
-                        item_id=current_item_id,
-                    )
-                )
-                part = ResponseOutputText(
-                    text=final_content,
-                    type="output_text",
-                    annotations=[],
-                )
-                yield _increment_sequence_number_and_return(
-                    ResponseContentPartDoneEvent(
-                        type="response.content_part.done",
-                        sequence_number=-1,
-                        item_id=current_item_id,
-                        output_index=current_output_index,
-                        content_index=current_content_index,
-                        part=part,
-                    )
-                )
-                item = ResponseOutputMessage(
-                    type="message",
-                    role="assistant",
-                    content=[
-                        part,
-                    ],
-                    status="completed",
-                    id=current_item_id,
-                    summary=[],
-                )
-                yield _increment_sequence_number_and_return(
-                    ResponseOutputItemDoneEvent(
-                        type="response.output_item.done",
-                        sequence_number=-1,
-                        output_index=current_output_index,
-                        item=item,
-                    )
-                )
+            if processor.needs_transition(target_state, tool_call):
+                for event in processor.close_current():
+                    yield _increment_sequence_number_and_return(event)
+                for event in processor.open(target_state, tool_call):
+                    yield _increment_sequence_number_and_return(event)
+
+            for event in processor.emit_delta(delta_message, output, _get_logprobs):
+                yield _increment_sequence_number_and_return(event)
+
+        for event in processor.close_current():
+            yield _increment_sequence_number_and_return(event)
 
     async def _process_harmony_streaming_events(
         self,
@@ -1872,7 +1378,7 @@ def _increment_sequence_number_and_return(
         async with AsyncExitStack() as exit_stack:
             if self.use_harmony:
                 # TODO: in streaming, we noticed this bug:
-                # https://github.com/vllm-project/vllm/issues/25697
+                # https://github.com/vllm-project/vllm/issues/25697
                 await self._initialize_tool_sessions(request, context, exit_stack)
                 processor = self._process_harmony_streaming_events
             else:
diff --git a/aphrodite/entrypoints/openai/responses/streaming_events.py b/aphrodite/entrypoints/openai/responses/streaming_events.py
index a38b5f734f..ea498ac8b4 100644
--- a/aphrodite/entrypoints/openai/responses/streaming_events.py
+++ b/aphrodite/entrypoints/openai/responses/streaming_events.py
@@ -16,8 +16,10 @@
 """
 
 import json
-from dataclasses import dataclass
-from typing import Final
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import Any, ClassVar, Final, NamedTuple
 
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
@@ -31,6 +33,7 @@
     ResponseFunctionCallArgumentsDeltaEvent,
     ResponseFunctionCallArgumentsDoneEvent,
     ResponseFunctionToolCall,
+    ResponseFunctionToolCallItem,
     ResponseFunctionWebSearch,
     ResponseMcpCallArgumentsDeltaEvent,
     ResponseMcpCallArgumentsDoneEvent,
@@ -49,6 +52,7 @@
     ResponseWebSearchCallInProgressEvent,
     ResponseWebSearchCallSearchingEvent,
     response_function_web_search,
+    response_text_delta_event,
 )
 from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_reasoning_item import (
@@ -57,12 +61,14 @@
 from openai_harmony import Message as HarmonyMessage
 
 from aphrodite.entrypoints.mcp.tool_server import ToolServer
+from aphrodite.entrypoints.openai.engine.protocol import DeltaMessage
 from aphrodite.entrypoints.openai.responses.context import StreamingHarmonyContext
 from aphrodite.entrypoints.openai.responses.protocol import (
     ResponseReasoningPartAddedEvent,
     ResponseReasoningPartDoneEvent,
     StreamingResponsesResponse,
 )
+from aphrodite.outputs import CompletionOutput
 from aphrodite.utils import random_uuid
 
 TOOL_NAME_TO_MCP_SERVER_LABEL: Final[dict[str, str]] = {
@@ -792,3 +798,443 @@ def emit_tool_action_events(
             events.extend(emit_mcp_completion_events(recipient, previous_item.content[0].text, state))
 
     return events
+
+
+# =====================================================================
+# Simple streaming helpers
+# =====================================================================
+
+
+class _StateType(Enum):  # kind of output item currently open on the simple streaming path
+    NONE = auto()  # nothing open: initial state, restored by every emit_simple_*_done helper
+    CONTENT = auto()  # assistant message text item
+    REASONING = auto()  # reasoning text item
+    TOOL_CALL = auto()  # function tool call item
+
+
+@dataclass
+class SimpleStreamingState:  # mutable cursor read and updated by the emit_simple_* helpers
+    output_index: int = 0  # output_index of the open item; incremented by each *_done helper
+    current_item_id: str = ""  # random_uuid() assigned by each *_open helper
+    content_index: int = 0  # reset to 0 when a content or reasoning item opens
+    accumulated_text: str = ""  # concatenated deltas of the open item; cleared on open
+    tool_call_id: str = ""  # "call_<uuid>" assigned by emit_simple_tool_call_open
+    tool_call_name: str = ""  # function name of the open tool call
+    tool_call_index: int | None = None  # index passed to emit_simple_tool_call_open
+    has_emitted_tool_call_delta: bool = False  # True once an arguments delta was emitted
+    current_state: _StateType = field(default_factory=lambda: _StateType.NONE)
+
+
+def emit_simple_content_open(  # open a new assistant message item and its text part
+    state: SimpleStreamingState,
+) -> list[StreamingResponsesResponse]:
+    state.current_state = _StateType.CONTENT
+    state.current_item_id = random_uuid()  # fresh id reused by every event of this item
+    state.content_index = 0
+    state.accumulated_text = ""  # start collecting this item's text from scratch
+    return [
+        ResponseOutputItemAddedEvent(
+            type="response.output_item.added",
+            sequence_number=-1,  # placeholder; NOTE(review): presumably renumbered by the caller
+            output_index=state.output_index,
+            item=ResponseOutputMessage(
+                id=state.current_item_id,
+                type="message",
+                role="assistant",
+                content=[],  # parts are only attached in the matching done event
+                status="in_progress",
+            ),
+        ),
+        ResponseContentPartAddedEvent(
+            type="response.content_part.added",
+            sequence_number=-1,
+            output_index=state.output_index,
+            item_id=state.current_item_id,
+            content_index=state.content_index,
+            part=ResponseOutputText(
+                type="output_text",
+                text="",
+                annotations=[],
+                logprobs=[],
+            ),
+        ),
+    ]
+
+
+def emit_simple_content_delta(  # emit one output_text delta for the open message item
+    state: SimpleStreamingState,
+    delta: str,
+    logprobs: list[response_text_delta_event.Logprob] | None = None,
+) -> list[StreamingResponsesResponse]:
+    state.accumulated_text += delta  # kept so emit_simple_content_done can report the full text
+    return [
+        ResponseTextDeltaEvent(
+            type="response.output_text.delta",
+            sequence_number=-1,
+            content_index=state.content_index,
+            output_index=state.output_index,
+            item_id=state.current_item_id,
+            delta=delta,
+            logprobs=logprobs or [],  # None (or empty) is sent as an empty list
+        )
+    ]
+
+
+def emit_simple_content_done(  # close the open message item: text.done, part.done, item.done
+    state: SimpleStreamingState,
+) -> list[StreamingResponsesResponse]:
+    part = ResponseOutputText(
+        type="output_text",
+        text=state.accumulated_text,  # full text gathered from the deltas of this item
+        annotations=[],
+    )
+    events: list[StreamingResponsesResponse] = [
+        ResponseTextDoneEvent(
+            type="response.output_text.done",
+            sequence_number=-1,
+            output_index=state.output_index,
+            content_index=state.content_index,
+            text=state.accumulated_text,
+            logprobs=[],
+            item_id=state.current_item_id,
+        ),
+        ResponseContentPartDoneEvent(
+            type="response.content_part.done",
+            sequence_number=-1,
+            item_id=state.current_item_id,
+            output_index=state.output_index,
+            content_index=state.content_index,
+            part=part,
+        ),
+        ResponseOutputItemDoneEvent(
+            type="response.output_item.done",
+            sequence_number=-1,
+            output_index=state.output_index,
+            item=ResponseOutputMessage(
+                id=state.current_item_id,
+                type="message",
+                role="assistant",
+                content=[part] if state.accumulated_text else [],  # no part when nothing was streamed
+                status="completed",
+                summary=[],  # NOTE(review): summary looks like a reasoning-item field; confirm intent
+            ),
+        ),
+    ]
+    state.output_index += 1  # the next opened item uses the following output_index
+    state.current_state = _StateType.NONE  # nothing is open any more
+    return events
+
+
+def emit_simple_reasoning_open(  # open a new reasoning item and its reasoning_text part
+    state: SimpleStreamingState,
+) -> list[StreamingResponsesResponse]:
+    state.current_state = _StateType.REASONING
+    state.current_item_id = random_uuid()  # fresh id reused by every event of this item
+    state.content_index = 0
+    state.accumulated_text = ""  # start collecting this item's reasoning from scratch
+    return [
+        ResponseOutputItemAddedEvent(
+            type="response.output_item.added",
+            sequence_number=-1,  # placeholder; NOTE(review): presumably renumbered by the caller
+            output_index=state.output_index,
+            item=ResponseReasoningItem(
+                type="reasoning",
+                id=state.current_item_id,
+                summary=[],
+                status="in_progress",
+            ),
+        ),
+        ResponseReasoningPartAddedEvent(
+            type="response.reasoning_part.added",
+            sequence_number=-1,
+            output_index=state.output_index,
+            item_id=state.current_item_id,
+            content_index=state.content_index,
+            part=ResponseReasoningTextContent(
+                text="",
+                type="reasoning_text",
+            ),
+        ),
+    ]
+
+
+def emit_simple_reasoning_delta(  # emit one reasoning_text delta for the open reasoning item
+    state: SimpleStreamingState,
+    delta: str,
+) -> list[StreamingResponsesResponse]:
+    state.accumulated_text += delta  # kept so emit_simple_reasoning_done can report the full text
+    return [
+        ResponseReasoningTextDeltaEvent(
+            type="response.reasoning_text.delta",
+            item_id=state.current_item_id,
+            sequence_number=-1,
+            output_index=state.output_index,
+            content_index=state.content_index,
+            delta=delta,
+        )
+    ]
+
+
+def emit_simple_reasoning_done(  # close the open reasoning item: text.done, part.done, item.done
+    state: SimpleStreamingState,
+) -> list[StreamingResponsesResponse]:
+    part = ResponseReasoningTextContent(
+        text=state.accumulated_text,  # full reasoning text gathered from the deltas
+        type="reasoning_text",
+    )
+    events: list[StreamingResponsesResponse] = [
+        ResponseReasoningTextDoneEvent(
+            type="response.reasoning_text.done",
+            item_id=state.current_item_id,
+            sequence_number=-1,
+            output_index=state.output_index,
+            content_index=state.content_index,
+            text=state.accumulated_text,
+        ),
+        ResponseReasoningPartDoneEvent(
+            type="response.reasoning_part.done",
+            sequence_number=-1,
+            item_id=state.current_item_id,
+            output_index=state.output_index,
+            content_index=state.content_index,
+            part=part,
+        ),
+        ResponseOutputItemDoneEvent(
+            type="response.output_item.done",
+            sequence_number=-1,
+            output_index=state.output_index,
+            item=ResponseReasoningItem(
+                type="reasoning",
+                content=[part],  # always attached, even when the accumulated text is empty
+                status="completed",
+                id=state.current_item_id,
+                summary=[],
+            ),
+        ),
+    ]
+    state.output_index += 1  # the next opened item uses the following output_index
+    state.current_state = _StateType.NONE  # nothing is open any more
+    return events
+
+
+def emit_simple_tool_call_open(  # open a new function_call item for the named tool
+    state: SimpleStreamingState,
+    name: str,
+    index: int | None,
+) -> list[StreamingResponsesResponse]:
+    state.current_state = _StateType.TOOL_CALL
+    state.current_item_id = random_uuid()  # item id; distinct from the call id below
+    state.tool_call_id = f"call_{random_uuid()}"
+    state.tool_call_name = name
+    state.tool_call_index = index  # stored so a later delta with another index can be detected
+    state.accumulated_text = ""  # arguments are collected from scratch for this call
+    state.has_emitted_tool_call_delta = False
+    return [
+        ResponseOutputItemAddedEvent(
+            type="response.output_item.added",
+            sequence_number=-1,  # placeholder; NOTE(review): presumably renumbered by the caller
+            output_index=state.output_index,
+            item=ResponseFunctionToolCallItem(
+                type="function_call",
+                id=state.current_item_id,
+                call_id=state.tool_call_id,
+                name=name,
+                arguments="",  # arguments arrive through later delta events
+                status="in_progress",
+            ),
+        ),
+    ]
+
+
+def emit_simple_tool_call_delta(  # emit one function_call_arguments delta for the open call
+    state: SimpleStreamingState,
+    delta: str,
+) -> list[StreamingResponsesResponse]:
+    state.accumulated_text += delta  # kept so the done events can carry the full arguments
+    state.has_emitted_tool_call_delta = True  # makes the done helper emit arguments.done
+    return [
+        ResponseFunctionCallArgumentsDeltaEvent(
+            type="response.function_call_arguments.delta",
+            sequence_number=-1,
+            output_index=state.output_index,
+            item_id=state.current_item_id,
+            delta=delta,
+        )
+    ]
+
+
+def emit_simple_tool_call_done(  # close the open function_call item
+    state: SimpleStreamingState,
+) -> list[StreamingResponsesResponse]:
+    events: list[StreamingResponsesResponse] = []
+    if state.has_emitted_tool_call_delta:  # arguments.done is skipped if no delta was ever sent
+        events.append(
+            ResponseFunctionCallArgumentsDoneEvent(
+                type="response.function_call_arguments.done",
+                sequence_number=-1,
+                output_index=state.output_index,
+                item_id=state.current_item_id,
+                arguments=state.accumulated_text,
+                name=state.tool_call_name,
+            )
+        )
+    events.append(  # the item itself is always completed, with or without arguments
+        ResponseOutputItemDoneEvent(
+            type="response.output_item.done",
+            sequence_number=-1,
+            output_index=state.output_index,
+            item=ResponseFunctionToolCall(
+                type="function_call",
+                name=state.tool_call_name,
+                arguments=state.accumulated_text,
+                status="completed",
+                id=state.current_item_id,
+                call_id=state.tool_call_id,
+            ),
+        ),
+    )
+    state.output_index += 1  # the next opened item uses the following output_index
+    state.current_state = _StateType.NONE  # nothing is open any more
+    return events
+
+
+class _StateHandlers(NamedTuple):
+    """Tuple for each state: open(start), delta(chunk), done(finish)."""
+
+    open_fn: Callable[..., list[StreamingResponsesResponse]]  # (state) or (state, name, index) for tool calls
+    delta_fn: Callable[..., list[StreamingResponsesResponse]]  # (state, delta) or (state, delta, logprobs)
+    done_fn: Callable[..., list[StreamingResponsesResponse]]  # (state)
+
+
+class SimpleStreamingEventProcessor:
+    """
+    State-machine processor for the simple (non-Harmony) streaming path.
+
+    Core flow:
+      1. Resolve the target state from the delta_message
+         (CONTENT / REASONING / TOOL_CALL).
+      2. If the target state differs from the current one,
+         close_current() then open() the new state.
+      3. emit_delta() produces the incremental events for the state.
+
+    State lifecycle:
+      open()  ->  repeated emit_delta()  ->  close_current()
+    """
+
+    _STATE_HANDLERS: ClassVar[dict[_StateType, _StateHandlers]] = {
+        _StateType.CONTENT: _StateHandlers(
+            emit_simple_content_open,
+            emit_simple_content_delta,
+            emit_simple_content_done,
+        ),
+        _StateType.REASONING: _StateHandlers(
+            emit_simple_reasoning_open,
+            emit_simple_reasoning_delta,
+            emit_simple_reasoning_done,
+        ),
+        _StateType.TOOL_CALL: _StateHandlers(
+            emit_simple_tool_call_open,
+            emit_simple_tool_call_delta,
+            emit_simple_tool_call_done,
+        ),
+    }
+
+    def __init__(self, state: SimpleStreamingState | None = None) -> None:
+        self.state = state or SimpleStreamingState()
+
+    def resolve_target_state(self, delta_message: DeltaMessage) -> tuple[_StateType, Any]:
+        """
+        Decide which state the next delta belongs to.
+
+        Priority: TOOL_CALL > REASONING > CONTENT, fallback to NONE.
+        For TOOL_CALL the first tool_call object is also returned so
+        callers can detect a switch between consecutive tools.
+        """
+        if delta_message.tool_calls and delta_message.tool_calls[0].function is not None:
+            return _StateType.TOOL_CALL, delta_message.tool_calls[0]
+        if delta_message.reasoning is not None:
+            return _StateType.REASONING, None
+        if delta_message.content:
+            return _StateType.CONTENT, None
+        return _StateType.NONE, None
+
+    def needs_transition(self, target_state: _StateType, tool_call: Any) -> bool:
+        """
+        Return True when we must close the current state and open a new one.
+
+        Two cases trigger a transition:
+          1. The target state differs from the current state
+             (e.g. CONTENT -> TOOL_CALL).
+          2. We are already in TOOL_CALL but the next tool_call has a
+             different index (multiple consecutive tool calls).
+        """
+        if self.state.current_state != target_state:
+            return True
+        return (
+            target_state == _StateType.TOOL_CALL
+            and tool_call is not None
+            and self.state.tool_call_index is not None
+            and tool_call.index is not None
+            and self.state.tool_call_index != tool_call.index
+        )
+
+    def close_current(self) -> list[StreamingResponsesResponse]:
+        """Close the current state and emit its 'done' event sequence."""
+        handlers = self._STATE_HANDLERS.get(self.state.current_state)
+        if handlers is None:
+            return []
+        return handlers.done_fn(self.state)
+
+    def open(self, target_state: _StateType, tool_call: Any = None) -> list[StreamingResponsesResponse]:
+        """Open a new state and emit its 'added' / 'open' event sequence."""
+        handlers = self._STATE_HANDLERS[target_state]
+        if target_state == _StateType.TOOL_CALL:
+            assert tool_call is not None
+            return handlers.open_fn(self.state, tool_call.function.name, tool_call.index)
+        return handlers.open_fn(self.state)
+
+    def emit_delta(
+        self,
+        delta_message: DeltaMessage,
+        output: CompletionOutput,
+        get_logprobs: Callable[[CompletionOutput], list[response_text_delta_event.Logprob]] | None = None,
+    ) -> list[StreamingResponsesResponse]:
+        """
+        Emit incremental events for the current state from the delta.
+
+        Returns [] when no state is open. If already in REASONING and the
+        delta also carries non-empty content, emit the reasoning delta,
+        close reasoning, open content, then emit the content delta.
+        """
+        handlers = self._STATE_HANDLERS.get(self.state.current_state)
+        if handlers is None:  # state NONE: tolerate it, as close_current() does
+            return []
+
+        if (  # Special case: reasoning -> content inside a single delta.
+            self.state.current_state == _StateType.REASONING
+            and delta_message.reasoning is not None
+            and delta_message.content  # "" is ignored, as in resolve_target_state()
+        ):
+            events = list(handlers.delta_fn(self.state, delta_message.reasoning))
+            events.extend(self.close_current())
+            events.extend(self.open(_StateType.CONTENT))
+            content_handlers = self._STATE_HANDLERS[_StateType.CONTENT]
+            logprobs = get_logprobs(output) if get_logprobs else []
+            events.extend(content_handlers.delta_fn(self.state, delta_message.content, logprobs))
+            return events
+
+        if self.state.current_state == _StateType.TOOL_CALL:
+            assert delta_message.tool_calls is not None
+            tool_call_function = delta_message.tool_calls[0].function
+            assert tool_call_function is not None
+            if tool_call_function.arguments:
+                return handlers.delta_fn(self.state, tool_call_function.arguments)
+            return []
+        elif self.state.current_state == _StateType.REASONING:
+            assert delta_message.reasoning is not None
+            return handlers.delta_fn(self.state, delta_message.reasoning)
+        elif self.state.current_state == _StateType.CONTENT:
+            assert delta_message.content is not None
+            logprobs = get_logprobs(output) if get_logprobs else []
+            return handlers.delta_fn(self.state, delta_message.content, logprobs)
+        return []
diff --git a/aphrodite/entrypoints/serve/render/serving.py b/aphrodite/entrypoints/serve/render/serving.py
index b762aff569..12f34d4ccb 100644
--- a/aphrodite/entrypoints/serve/render/serving.py
+++ b/aphrodite/entrypoints/serve/render/serving.py
@@ -498,7 +498,7 @@ async def preprocess_chat(
             default_template_kwargs,
             dict(
                 tools=tool_dicts,
-                tokenize=is_mistral_tokenizer(renderer.tokenizer),
+                tokenize=(is_mistral_tokenizer(renderer.tokenizer) or self.model_config.enable_prompt_embeds),
             ),
         )
 
diff --git a/aphrodite/env_override.py b/aphrodite/env_override.py
index 2fc8993ce7..f2495eba6e 100644
--- a/aphrodite/env_override.py
+++ b/aphrodite/env_override.py
@@ -94,12 +94,12 @@ def _maybe_set_cuda_compatibility_path():
 # that interact with aphrodite workers.
 # they are executed whenever `import aphrodite` is called.
 
-# see https://github.com/vllm-project/vllm/pull/15951
+# see https://github.com/vllm-project/vllm/pull/15951
 # it avoids unintentional cuda initialization from torch.cuda.is_available()
 os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
 
-# see https://github.com/vllm-project/vllm/issues/10480 and
-# https://github.com/vllm-project/vllm/issues/10619.
+# see https://github.com/vllm-project/vllm/issues/10480 and
+# https://github.com/vllm-project/vllm/issues/10619.
 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
 # Enable Triton autotuning result caching to disk by default.
@@ -289,7 +289,7 @@ def is_none_layout(buf_name: str) -> bool:
 # torch 2.9 Inductor Scheduler monkeypatch
 # ========================================
 # This change monkeypatches a function in Inductor to work around the following
-# bug: https://github.com/vllm-project/vllm/issues/26678
+# bug: https://github.com/vllm-project/vllm/issues/26678
 #
 # The bug occurs when `use_inductor_graph_partition` is turned on and there
 # exists operators inside of `splitting_ops` that have an in-place mutation. In
@@ -407,7 +407,7 @@ def _update_scheduler_patched(self) -> None:
 # ===================================================
 # Workaround for TorchInductor autotune using get_raw_stream() without defining it.
 # This occurs when compile_sizes > 1 in compilation_config.
-# For more context, see https://github.com/vllm-project/vllm/issues/30905.
+# For more context, see https://github.com/vllm-project/vllm/issues/30905.
 def _patch_get_raw_stream_if_needed():
     """Workaround for TorchInductor autotune get_raw_stream() bug."""
     from aphrodite.utils.torch_utils import is_torch_equal
@@ -574,3 +574,118 @@ def _patch_fxgraphcache_pickle_if_needed():
 
 
 _patch_fxgraphcache_pickle_if_needed()
+
+# ===================================================
+# torch 2.11 Inductor cpp codegen indirect_assert scalar-mask fix
+# ===================================================
+# CppVecKernel.indirect_assert wraps a scalar mask with
+# `VecMask<...>(scalar)`, which is not a valid constructor and triggers a
+# C++ compile error during torch.compile of any model that does indirect
+# indexing inside a tail-vectorized loop (e.g. Qwen3-VL).
+# Failure looks like:
+#   no matching function for call to 'VecMask<int64_t,2>::VecMask(int&)'
+# Upstream fix in PyTorch mainline replaces the call with
+# `VecMask<...>::from(scalar)`, see pytorch/pytorch#178148 (lands in 2.12).
+# This is a thin backport for torch >= 2.11 and < 2.12; remove once the
+# minimum supported torch is 2.12.
+
+
+def _apply_cpp_indirect_assert_patch():
+    """Replace `CppVecKernel.indirect_assert` with a fixed copy.
+
+    The copy emits `VecMask<...>::from(scalar)` for scalar masks. Idempotent: the
+    class is marked `_aphrodite_indirect_assert_patched` after the first apply.
+    """
+    from torch._inductor.codegen.cpp import CppVecKernel
+
+    if getattr(CppVecKernel, "_aphrodite_indirect_assert_patched", False):
+        return
+
+    from torch._inductor.codegen.cpp import CppCSEVariable, cexpr_index
+
+    def patched_indirect_assert(self, var, lower, upper, mask=None):
+        assert isinstance(var, CppCSEVariable)
+        assert var.dtype is not None
+        if not var.is_vec:  # scalar index: delegate to the superclass implementation
+            if isinstance(mask, CppCSEVariable) and mask.is_vec:
+                mask = f"({mask}).all_masked()"
+            return super(CppVecKernel, self).indirect_assert(var, lower, upper, mask)
+        lower_scalar = lower  # unwrapped bounds are kept for the error message
+        upper_scalar = upper
+        if lower:
+            lower = f"{self._get_vec_type(var.dtype)}({lower})"
+        if upper:
+            upper = f"{self._get_vec_type(var.dtype)}({upper})"
+        if lower and upper:
+            cond = f"({lower} <= {var}) & ({var} < {upper})"
+            cond_print = f"{lower_scalar} <= {var} < {upper_scalar}"
+        elif lower:
+            cond = f"{lower} <= {var}"
+            cond_print = f"{lower_scalar} <= {var}"
+        else:
+            assert upper
+            cond = f"{var} < {upper}"
+            cond_print = f"{var} < {upper_scalar}"
+        cond = f"{self._get_mask_type(var.dtype)}({cond})"
+        if mask:
+            if not mask.is_vec:
+                # Backport of pytorch/pytorch#178148 -- use ::from for
+                # scalar masks so g++ picks the correct overload.
+                mask = f"{self._get_mask_type(var.dtype)}::from({mask})"
+            cond = f"({cond}) | ~({mask})"
+        if self.tail_size:
+            cond = (
+                f"{self._get_mask_type(var.dtype)}::set("
+                f"{self._get_mask_type(var.dtype)}::from(1)"
+                f", ({cond}), {cexpr_index(self.tail_size)})"
+            )
+        cond = f"({cond}).all_masked()"
+        return f'{self.assert_function}({cond}, "index out of bounds: {cond_print}")'
+
+    CppVecKernel.indirect_assert = patched_indirect_assert
+    CppVecKernel._aphrodite_indirect_assert_patched = True  # type: ignore[attr-defined]
+
+
+def _patch_cpp_indirect_assert_if_needed():
+    """Apply the cpp codegen indirect_assert backport on torch 2.11.x only.
+
+    The patch is deferred until torch._inductor.codegen.cpp is imported by
+    Inductor itself: importing it eagerly during aphrodite.__init__ pulls in
+    torch._inductor.scheduler, whose top-level
+    `import torch._inductor.async_compile` can fail with `ModuleNotFoundError:
+    import of torch._inductor.async_compile halted; None in sys.modules`
+    depending on the import order on the runner (observed in Aphrodite CPU CI).
+    """
+    if not is_torch_equal_or_newer("2.11.0") or is_torch_equal_or_newer("2.12.0.dev"):
+        return
+
+    import importlib.abc
+    import importlib.util  # explicit: `import importlib.abc` does not guarantee `.util` is bound
+    import sys
+
+    target_name = "torch._inductor.codegen.cpp"
+    if target_name in sys.modules:
+        _apply_cpp_indirect_assert_patch()
+        return
+
+    class _CppCodegenPatchFinder(importlib.abc.MetaPathFinder):
+        def find_spec(self, fullname, path, target=None):
+            if fullname != target_name:
+                return None
+            sys.meta_path.remove(self)  # one-shot: also keeps the lookup below from re-entering us
+            spec = importlib.util.find_spec(fullname)
+            if spec is None or spec.loader is None:
+                return None
+            original_exec = spec.loader.exec_module
+
+            def _exec_then_patch(module):
+                original_exec(module)
+                _apply_cpp_indirect_assert_patch()
+
+            spec.loader.exec_module = _exec_then_patch  # type: ignore[method-assign]
+            return spec
+
+    sys.meta_path.insert(0, _CppCodegenPatchFinder())
+
+
+_patch_cpp_indirect_assert_if_needed()
diff --git a/aphrodite/envs.py b/aphrodite/envs.py
index 52f756efb9..a7f1882abe 100755
--- a/aphrodite/envs.py
+++ b/aphrodite/envs.py
@@ -47,7 +47,7 @@
     APHRODITE_CONFIGURE_LOGGING: bool = True
     NO_COLOR: bool = False
     APHRODITE_TRACE_FUNCTION: int = 0
-    APHRODITE_USE_FLASHINFER_SAMPLER: bool | None = None
+    APHRODITE_USE_FLASHINFER_SAMPLER: bool = True
     APHRODITE_PP_LAYER_PARTITION: str | None = None
     APHRODITE_CPU_KVCACHE_SPACE: int | None = 0
     APHRODITE_CPU_OMP_THREADS_BIND: str = "auto"
@@ -243,6 +243,7 @@
     APHRODITE_DEBUG_WORKSPACE: bool = False
     APHRODITE_DISABLE_SHARED_EXPERTS_STREAM: bool = False
     APHRODITE_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
+    APHRODITE_MULTI_STREAM_GEMM_TOKEN_THRESHOLD: int = 4096
     APHRODITE_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
     APHRODITE_USE_V2_MODEL_RUNNER: bool = False
     APHRODITE_LOG_MODEL_INSPECTION: bool = False
@@ -254,6 +255,10 @@
     APHRODITE_LORA_DISABLE_PDL: bool = False
     APHRODITE_ENABLE_CUDA_COMPATIBILITY: bool = False
     APHRODITE_CUDA_COMPATIBILITY_PATH: str | None = None
+    APHRODITE_SKIP_MODEL_NAME_VALIDATION: bool = False
+    """If set, Aphrodite will skip model name validation in API requests.
+    This allows any model name to be accepted in the 'model' field of requests,
+    making the server model-name agnostic. Useful for proxy/gateway scenarios."""
     APHRODITE_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
     APHRODITE_ELASTIC_EP_DRAIN_REQUESTS: bool = False
     APHRODITE_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = True
@@ -693,11 +698,13 @@ def _get_or_set_default() -> str:
     # If set to 1, aphrodite will trace function calls
     # Useful for debugging
     "APHRODITE_TRACE_FUNCTION": lambda: int(os.getenv("APHRODITE_TRACE_FUNCTION", "0")),
-    # If set, aphrodite will use flashinfer sampler
+    # Whether to use the FlashInfer top-k / top-p sampler on CUDA. Enabled
+    # by default when the hardware supports it — set to 0 to opt out
+    # explicitly, which forces the PyTorch-native (Triton for bs>=8) path.
     "APHRODITE_USE_FLASHINFER_SAMPLER": lambda: (
         bool(int(os.environ["APHRODITE_USE_FLASHINFER_SAMPLER"]))
         if "APHRODITE_USE_FLASHINFER_SAMPLER" in os.environ
-        else None
+        else True
     ),
     # Pipeline stage partition strategy
     "APHRODITE_PP_LAYER_PARTITION": lambda: os.getenv("APHRODITE_PP_LAYER_PARTITION", None),
@@ -919,6 +926,7 @@ def _get_or_set_default() -> str:
     # use aiter linear op if aiter ops are enabled
     # The following list of related ops
     # - scaled_mm (per-tensor / rowwise)
+    # - use aiter tuned gemms for unquantized gemms
     "APHRODITE_ROCM_USE_AITER_LINEAR": lambda: (
         os.getenv("APHRODITE_ROCM_USE_AITER_LINEAR", "True").lower() in ("true", "1")
     ),
@@ -1443,9 +1451,17 @@ def _get_or_set_default() -> str:
     "APHRODITE_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
         int(os.getenv("APHRODITE_DEEPEP_LOW_LATENCY_USE_MNNVL", "0"))
     ),
-    # The number of SMs to allocate for communication kernels when running DBO
-    # the rest of the SMs on the device will be allocated to compute
-    "APHRODITE_DBO_COMM_SMS": lambda: int(os.getenv("APHRODITE_DBO_COMM_SMS", "20")),
+    # The number of SMs/CUs to allocate for communication kernels when
+    # running DBO; the rest will be allocated to compute.
+    # Default: 20 on CUDA (SMs), 64 on ROCm (CUs).
+    "APHRODITE_DBO_COMM_SMS": lambda: int(
+        os.getenv(
+            "APHRODITE_DBO_COMM_SMS",
+            "64"
+            if hasattr(__import__("torch").version, "hip") and __import__("torch").version.hip is not None
+            else "20",
+        )
+    ),
     # Enable max_autotune & coordinate_descent_tuning in inductor_config
     # to compile static shapes passed from compile_sizes in compilation_config
     # If set to 1, enable max_autotune; By default, this is enabled (1)
@@ -1483,6 +1499,17 @@ def _get_or_set_default() -> str:
     "APHRODITE_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD": lambda: int(
         int(os.getenv("APHRODITE_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD", 256))
     ),
+    # Token-count cutoff for multi-stream overlap of the attention input
+    # GEMM with auxiliary GEMMs (e.g. fused_wqa_wkv overlapped with indexer
+    # weights / kv-score projections in DeepSeek-V4). At or below this many
+    # tokens the FP8 main GEMM has idle SMs to share with the bf16 aux GEMMs
+    # and overlap is a 5-45% win; above it the FP8 GEMM saturates the device
+    # and the cross-stream sync becomes pure overhead. Set to 0 to disable
+    # the multi-stream path entirely. Empirical crossover on B300 (148 SMs)
+    # is ~4096; B200 (132 SMs) is expected ~3072.
+    "APHRODITE_MULTI_STREAM_GEMM_TOKEN_THRESHOLD": lambda: int(
+        os.getenv("APHRODITE_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "4096")
+    ),
     # Format for saving torch.compile cache artifacts
     # - "binary": saves as binary file
     #     Safe for multiple aphrodite serve processes accessing the same torch compile cache.
@@ -1525,6 +1552,13 @@ def _get_or_set_default() -> str:
     ),
     # Path to the CUDA compatibility libraries when CUDA compatibility is enabled.
     "APHRODITE_CUDA_COMPATIBILITY_PATH": lambda: os.environ.get("APHRODITE_CUDA_COMPATIBILITY_PATH", None),
+    # Skip model name validation in OpenAI API requests.
+    # When set to 1, any model name will be accepted in the 'model' field
+    # of API requests. This is useful for proxy/gateway scenarios where
+    # the actual model is served but different names may be used in requests.
+    "APHRODITE_SKIP_MODEL_NAME_VALIDATION": lambda: (
+        os.getenv("APHRODITE_SKIP_MODEL_NAME_VALIDATION", "0").strip().lower() in ("1", "true")
+    ),
     # Whether it is a scale up launch engine for elastic EP,
     # Should only be set by EngineCoreClient.
     "APHRODITE_ELASTIC_EP_SCALE_UP_LAUNCH": lambda: bool(int(os.getenv("APHRODITE_ELASTIC_EP_SCALE_UP_LAUNCH", "0"))),
@@ -1692,6 +1726,7 @@ def compile_factors() -> dict[str, object]:
         "APHRODITE_TEST_FORCE_LOAD_FORMAT",
         "APHRODITE_ENABLE_CUDA_COMPATIBILITY",
         "APHRODITE_CUDA_COMPATIBILITY_PATH",
+        "APHRODITE_SKIP_MODEL_NAME_VALIDATION",
         "LOCAL_RANK",
         "CUDA_VISIBLE_DEVICES",
         "NO_COLOR",
diff --git a/aphrodite/inputs/engine.py b/aphrodite/inputs/engine.py
index bae426e57f..2fa6f779e9 100644
--- a/aphrodite/inputs/engine.py
+++ b/aphrodite/inputs/engine.py
@@ -71,12 +71,27 @@ class EmbedsInput(_InputOptions):
     prompt: NotRequired[str]
     """The prompt text corresponding to the token IDs, if available."""
 
+    prompt_token_ids: NotRequired[list[int]]
+    """Token IDs of the rendered prompt. Only set for mixed-mode inputs
+    (chat completion with `prompt_embeds` content parts). When present,
+    `is_token_ids` MUST also be present and have the same length.
+    For pure-embeds inputs this field is absent."""
+
+    is_token_ids: NotRequired[list[bool]]
+    """Per-position mask for mixed-mode inputs. `True` means the position
+    is a real token ID (use the model's embedding layer); `False` means
+    the position uses a pre-computed embedding row from `prompt_embeds`.
+    Length MUST equal `len(prompt_token_ids)`.
+    For pure-embeds inputs this field is absent."""
+
 
 def embeds_input(
     prompt_embeds: "torch.Tensor",
     *,
     prompt: str | None = None,
     cache_salt: str | None = None,
+    prompt_token_ids: list[int] | None = None,
+    is_token_ids: list[bool] | None = None,
 ) -> EmbedsInput:
     """
     Construct [`EmbedsInput`][aphrodite.inputs.engine.EmbedsInput]
@@ -88,6 +103,10 @@ def embeds_input(
         inputs["prompt"] = prompt
     if cache_salt is not None:
         inputs["cache_salt"] = cache_salt
+    if prompt_token_ids is not None:
+        inputs["prompt_token_ids"] = prompt_token_ids
+    if is_token_ids is not None:
+        inputs["is_token_ids"] = is_token_ids
 
     return inputs
 
diff --git a/aphrodite/inputs/llm.py b/aphrodite/inputs/llm.py
index 523ac1e82f..520bd743a7 100644
--- a/aphrodite/inputs/llm.py
+++ b/aphrodite/inputs/llm.py
@@ -125,6 +125,17 @@ class EmbedsPrompt(_PromptOptions):
     prompt: NotRequired[str]
     """The prompt text corresponding to the token embeddings, if available."""
 
+    prompt_token_ids: NotRequired[list[int]]
+    """Token IDs for mixed-mode inputs (chat completion with
+    `prompt_embeds` content parts). The tokens at positions where
+    `prompt_is_token_ids` is `False` are placeholder tokens that
+    get replaced by entries from `prompt_embeds` in the forward pass."""
+
+    prompt_is_token_ids: NotRequired[list[bool]]
+    """Per-position mask, `True` uses the real token ID, `False` uses
+    the corresponding entry from `prompt_embeds`.
+    Must be the same length as `prompt_token_ids` when both are set."""
+
 
 DecoderOnlyPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt | EmbedsPrompt
 """
diff --git a/aphrodite/lora/worker_manager.py b/aphrodite/lora/worker_manager.py
index 3b04c2a772..9bfb3cde6c 100644
--- a/aphrodite/lora/worker_manager.py
+++ b/aphrodite/lora/worker_manager.py
@@ -17,11 +17,7 @@
 )
 from aphrodite.lora.peft_helper import PEFTHelper
 from aphrodite.lora.request import LoRARequest
-from aphrodite.lora.utils import (
-    get_adapter_absolute_path,
-    is_in_target_modules,
-    is_supported_lora_module,
-)
+from aphrodite.lora.utils import get_adapter_absolute_path
 
 logger = init_logger(__name__)
 
@@ -140,34 +136,6 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 skip_prefixes=lora_skip_prefixes,
             )
 
-            # Warn about adapter modules that will be ignored.
-            target_modules = self.lora_config.target_modules
-            expected_lora_modules_lst = list(expected_lora_modules)
-            for module_name in lora.loras:
-                if not is_supported_lora_module(module_name, expected_lora_modules_lst):
-                    logger.warning_once(
-                        "LoRA module '%s' in adapter '%s' is not in the "
-                        "model's supported LoRA target modules [%s]. "
-                        "These parameters will be ignored, which may "
-                        "cause abnormal model behavior.",
-                        module_name,
-                        lora_request.lora_path,
-                        ", ".join(sorted(expected_lora_modules_lst)),
-                    )
-                elif not is_in_target_modules(
-                    module_name,
-                    target_modules,
-                    packed_modules_mapping,
-                ):
-                    logger.warning_once(
-                        "LoRA module '%s' in adapter '%s' is not in the "
-                        "deployment-time target_modules restriction [%s]."
-                        " These parameters will be ignored.",
-                        module_name,
-                        lora_request.lora_path,
-                        ", ".join(sorted(target_modules)),
-                    )
-
         except FileNotFoundError as e:
             # FileNotFoundError should be raised if both
             # - No adapter found to download from huggingface (or in
diff --git a/aphrodite/model_executor/kernels/linear/scaled_mm/pytorch.py b/aphrodite/model_executor/kernels/linear/scaled_mm/pytorch.py
index 8b13992f17..6ef3b8f253 100644
--- a/aphrodite/model_executor/kernels/linear/scaled_mm/pytorch.py
+++ b/aphrodite/model_executor/kernels/linear/scaled_mm/pytorch.py
@@ -134,13 +134,21 @@ def apply_scaled_mm(
         #  For CUDA platform please validate if the torch._scaled_mm supports
         #  rowwise scaled GEMM before using it
 
+        # torch._scaled_mm rowwise requires scale_a = (m, 1), scale_b = (1, n).
+        # CompressedTensors stores weight_scale as (n, 1), so `.t()` yields (1, n).
+        # ModelOpt FP8_PER_CHANNEL_PER_TOKEN stores it as 1-D (n,); reshape to
+        # (1, n) so both paths satisfy the rowwise contract.
+        scale_b = Bs.view(1, -1) if Bs.dim() == 1 else Bs.t()
+        if As.dim() == 1:
+            As = As.view(-1, 1)
+
         # Fused GEMM_DQ Rowwise GEMM
         output = torch._scaled_mm(
             A,
             B,
             out_dtype=out_dtype,
             scale_a=As,
-            scale_b=Bs.t(),
+            scale_b=scale_b,
             bias=bias,
         )
 
diff --git a/aphrodite/model_executor/layers/attention/mla_attention.py b/aphrodite/model_executor/layers/attention/mla_attention.py
index 0b2b271cf3..768a7a065e 100644
--- a/aphrodite/model_executor/layers/attention/mla_attention.py
+++ b/aphrodite/model_executor/layers/attention/mla_attention.py
@@ -189,12 +189,9 @@
 
 import functools
 from abc import abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from enum import Enum
-from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast
-
-if TYPE_CHECKING:
-    from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
+from typing import ClassVar, Generic, TypeVar, cast
 
 import torch
 import torch.nn as nn
@@ -242,7 +239,7 @@
     kNvfp4Dynamic,
 )
 from aphrodite.platforms import current_platform
-from aphrodite.utils.flashinfer import has_flashinfer, has_nvidia_artifactory
+from aphrodite.utils.flashinfer import has_flashinfer
 from aphrodite.utils.math_utils import cdiv, round_down
 from aphrodite.utils.torch_utils import (
     LayerNameType,
@@ -262,11 +259,9 @@
     MLAAttentionImpl,
     SparseMLAAttentionImpl,
 )
-from aphrodite.v1.attention.backends.fa_utils import get_flash_attn_version
+from aphrodite.v1.attention.backends.mla.prefill import MLAPrefillBackend
 from aphrodite.v1.attention.backends.utils import (
     get_dcp_local_seq_lens,
-    get_per_layer_parameters,
-    infer_global_hyperparameters,
     split_decodes_and_prefills,
 )
 from aphrodite.v1.attention.ops.common import cp_lse_ag_out_rs
@@ -1080,33 +1075,6 @@ class QueryLenSupport(Enum):
     VARLEN = "varlen"
 
 
-try:
-    from aphrodite.vllm_flash_attn import (  # type: ignore[attr-defined]
-        flash_attn_varlen_func,
-    )
-
-    is_aphrodite_fa = True
-except ImportError:
-    is_aphrodite_fa = False
-    flash_attn_varlen_func = None  # type: ignore[assignment]
-    # On ROCm, vllm_flash_attn is not available, try upstream flash_attn instead.
-    # On CUDA, vllm_flash_attn should always be available (built with Aphrodite),
-    # so we don't attempt the fallback there.
-    if current_platform.is_rocm():
-        try:
-            from flash_attn import flash_attn_varlen_func  # type: ignore[no-redef]
-        except ImportError:
-            logger.debug(
-                "flash_attn not available on ROCm; "
-                "MLA models using TRITON_MLA will require flash_attn. "
-                "AITER_MLA backends use aiter kernels instead."
-            )
-    elif current_platform.is_xpu():
-        from aphrodite._xpu_ops import xpu_ops
-
-        flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func  # type: ignore[no-redef,attr-defined,assignment]
-
-
 def dynamic_per_batched_tensor_quant(x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn):
     DTYPE_MAX = torch.finfo(dtype).max
     min_val, max_val = x.aminmax()
@@ -1116,9 +1084,6 @@ def dynamic_per_batched_tensor_quant(x: torch.Tensor, dtype: torch.dtype = torch
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
-logger = init_logger(__name__)
-
-
 @CustomOp.register(
     "mla_decode_concat_quant_fp8",
     dynamic_arg_dims={"decode_ql_nope": 0, "decode_q_pe": 0},
@@ -1152,9 +1117,6 @@ def forward(
     forward_hip = _make_forward(QuantFP8.forward_hip)  # type: ignore[arg-type]
 
 
-CUDNN_WORKSPACE_SIZE = 12800
-
-
 class MLACommonBackend(AttentionBackend):
     @staticmethod
     def get_name() -> str:
@@ -1223,24 +1185,9 @@ class ChunkedContextMetadata:
     query_start_loc: torch.Tensor
     max_query_len: int
     chunked_context: ChunkedContextMetadata | None = None
-    query_seq_lens: torch.Tensor | None = None
-    workspace_buffer: torch.Tensor | None = None
     q_data_type: torch.dtype | None = None
     output_dtype: torch.dtype | None = None
-
-
-@dataclass
-class FlashInferPrefillMetadata(MLACommonPrefillMetadata):
-    prefill_main: "BatchPrefillWithRaggedKVCacheWrapper | None" = None
-    prefill_chunks: "list[BatchPrefillWithRaggedKVCacheWrapper]" = field(default_factory=list)
-
-
-@dataclass
-class CudnnPrefillMetadata(MLACommonPrefillMetadata):
-    class ChunkedContextMetadata(MLACommonPrefillMetadata.ChunkedContextMetadata):
-        seq_lens: torch.Tensor
-
-    cudnn_workspace: torch.Tensor | None = None
+    prefill_backend: MLAPrefillBackend | None = None
 
 
 @dataclass
@@ -1286,8 +1233,8 @@ class MLACommonMetadata(AttentionMetadata, Generic[D]):
     # The dimension of the attention heads
     head_dim: int | None = None
 
+    prefill: MLACommonPrefillMetadata | None = None
     decode: D | None = None
-    prefill: MLACommonPrefillMetadata | FlashInferPrefillMetadata | CudnnPrefillMetadata | None = None
 
     def __post_init__(self):
         if self.head_dim is not None and not MLACommonBackend.supports_head_size(self.head_dim):
@@ -1298,64 +1245,6 @@ def __post_init__(self):
 A = TypeVar("A", bound=AttentionMetadata)
 
 
-def is_deepseek_r1_mla_compatible(aphrodite_config: AphroditeConfig) -> bool:
-    # Check if model has DeepSeek R1 compatible MLA dimensions:
-    # qk_nope_head_dim = 128, qk_rope_head_dim = 64, v_head_dim = 128
-    # which results in query/key head dim = 192.
-    if aphrodite_config.model_config is None:
-        return False
-    hf_text_config = aphrodite_config.model_config.hf_text_config
-    qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1)
-    qk_rope_head_dim = getattr(hf_text_config, "qk_rope_head_dim", 1)
-    v_head_dim = getattr(hf_text_config, "v_head_dim", 1)
-    return qk_nope_head_dim == 128 and qk_rope_head_dim == 64 and v_head_dim == 128
-
-
-@functools.cache
-def use_flashinfer_prefill() -> bool:
-    from aphrodite.config import get_current_aphrodite_config
-
-    aphrodite_config = get_current_aphrodite_config()
-    if not (
-        not aphrodite_config.attention_config.disable_flashinfer_prefill
-        and has_flashinfer()
-        and not aphrodite_config.attention_config.use_cudnn_prefill
-        and current_platform.is_device_capability_family(100)
-    ):
-        return False
-
-    return is_deepseek_r1_mla_compatible(aphrodite_config)
-
-
-@functools.cache
-def use_cudnn_prefill() -> bool:
-    from aphrodite.config import get_current_aphrodite_config
-
-    aphrodite_config = get_current_aphrodite_config()
-    return (
-        has_flashinfer()
-        and aphrodite_config.attention_config.use_cudnn_prefill
-        and current_platform.is_device_capability_family(100)
-        and has_nvidia_artifactory()
-    )
-
-
-@functools.cache
-def use_trtllm_ragged_deepseek_prefill() -> bool:
-    """Check if TRT-LLM ragged DeepSeek prefill should be used."""
-    from aphrodite.config import get_current_aphrodite_config
-
-    aphrodite_config = get_current_aphrodite_config()
-    if not (
-        has_flashinfer()
-        and aphrodite_config.attention_config.use_trtllm_ragged_deepseek_prefill
-        and current_platform.is_device_capability_family(100)
-    ):
-        return False
-
-    return is_deepseek_r1_mla_compatible(aphrodite_config)
-
-
 @dataclass
 class MLADims:
     q_lora_rank: int | None
@@ -1393,15 +1282,14 @@ def get_mla_dims(model_config: ModelConfig) -> MLADims:
 
 @functools.cache
 def backend_supports_prefill_query_quantization() -> bool:
-    """Check if the selected MLA backend supports prefill query quantization.
+    """Check if the selected MLA prefill backend supports query quantization.
 
     Currently supported backends:
-    - FlashInfer prefill
-    - TRT-LLM ragged DeepSeek prefill
+    - FlashInfer
+    - TRT-LLM Ragged
 
     Not supported:
-    - cuDNN Prefill
-    - FlashAttention
+    - FlashAttention (FA3/FA4)
     - Non-GB200 devices (FP8 prefill requires device capability 100)
     """
     # FP8 prefill query quantization requires GB200 (device capability 100)
@@ -1409,7 +1297,15 @@ def backend_supports_prefill_query_quantization() -> bool:
     if not current_platform.is_device_capability_family(100):
         return False
 
-    return use_flashinfer_prefill() or use_trtllm_ragged_deepseek_prefill()
+    from aphrodite.config import get_current_aphrodite_config
+    from aphrodite.v1.attention.backends.mla.prefill import get_mla_prefill_backend
+
+    aphrodite_config = get_current_aphrodite_config()
+    backend_cls = get_mla_prefill_backend(aphrodite_config)
+    return backend_cls.get_name() in (
+        "FLASHINFER",
+        "TRTLLM_RAGGED",
+    )
 
 
 class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
@@ -1434,9 +1330,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
     reorder_batch_threshold: int = 1
 
     @staticmethod
-    def determine_chunked_prefill_workspace_size(
-        aphrodite_config: AphroditeConfig,
-    ) -> int:
+    def determine_chunked_prefill_workspace_size(aphrodite_config: AphroditeConfig) -> int:
         scheduler_config = aphrodite_config.scheduler_config
         cache_config = aphrodite_config.cache_config
         model_config = aphrodite_config.model_config
@@ -1520,7 +1414,6 @@ def __init__(
     ):
         self.metadata_cls = metadata_cls if metadata_cls is not None else MLACommonMetadata
         self.kv_cache_spec = kv_cache_spec
-        scheduler_config = aphrodite_config.scheduler_config
         self.model_config = aphrodite_config.model_config
         parallel_config = aphrodite_config.parallel_config
         self.compilation_config = aphrodite_config.compilation_config
@@ -1575,135 +1468,30 @@ def __init__(
                 device=device,
             )
 
-        self._use_cudnn_prefill = use_cudnn_prefill()
-        self._use_fi_prefill = use_flashinfer_prefill()
-        self._use_trtllm_ragged_prefill = use_trtllm_ragged_deepseek_prefill()
-        self.prefill_metadata_cls = (
-            FlashInferPrefillMetadata
-            if self._use_fi_prefill
-            else CudnnPrefillMetadata
-            if self._use_cudnn_prefill
-            else MLACommonPrefillMetadata
-        )
-
-        if self._use_fi_prefill:
-            self._workspace_buffer = torch.empty(
-                envs.APHRODITE_FLASHINFER_WORKSPACE_BUFFER_SIZE,
-                dtype=torch.uint8,
-                device=device,
-            )
-
-            self._fi_prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
-            self._fi_prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
-
-            self._global_hyperparameters = infer_global_hyperparameters(
-                get_per_layer_parameters(aphrodite_config, layer_names, MLACommonImpl)  # type: ignore[type-abstract]
-            )
-
-        if self._use_trtllm_ragged_prefill:
-            self._workspace_buffer = torch.empty(
-                envs.APHRODITE_FLASHINFER_WORKSPACE_BUFFER_SIZE,
-                dtype=torch.uint8,
-                device=device,
-            )
+        from aphrodite.v1.attention.backends.mla.prefill import get_mla_prefill_backend
 
-        if self._use_cudnn_prefill:
-            self.cudnn_workspace = torch.empty(
-                CUDNN_WORKSPACE_SIZE * scheduler_config.max_num_seqs,
-                dtype=torch.int8,
-                device=device,
-            )
+        prefill_backend_cls = get_mla_prefill_backend(aphrodite_config)
+        self._prefill_backend = prefill_backend_cls(
+            num_heads=self.num_heads,
+            scale=self.model_config.get_head_size() ** -0.5,
+            kv_lora_rank=self.mla_dims.kv_lora_rank,
+            qk_nope_head_dim=self.mla_dims.qk_nope_head_dim,
+            qk_rope_head_dim=self.mla_dims.qk_rope_head_dim,
+            v_head_dim=self.mla_dims.v_head_dim,
+            aphrodite_config=aphrodite_config,
+            device=device,
+            layer_names=layer_names,
+        )
 
         supports_spec_decode = self.query_len_support != QueryLenSupport.SINGLE_ONLY
         self._init_reorder_batch_threshold(self.reorder_batch_threshold, supports_spec_decode, supports_dcp_with_varlen)
 
-        # Validate consistency between query_len_support and reorder_batch_threshold
         if self.query_len_support == QueryLenSupport.SINGLE_ONLY:
             assert self.reorder_batch_threshold == 1, (
                 f"reorder_batch_threshold must be 1 when query_len_support is "
                 f"SINGLE_ONLY, got {self.reorder_batch_threshold}"
             )
 
-    def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata):
-        qo_indptr = prefill.query_start_loc
-
-        has_context = False
-        if prefill.chunked_context is not None:
-            chunked_context = prefill.chunked_context
-            has_context = True
-
-        if self._fi_prefill_main is None:
-            from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
-
-            self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper(
-                self._workspace_buffer, "NHD", backend="cutlass"
-            )
-
-        if has_context:
-            num_chunks = chunked_context.cu_seq_lens.shape[0]
-            # Allocate more prefill chunk wrappers if needed
-            if len(self._fi_prefill_chunks) < num_chunks:
-                from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
-
-                for _ in range(len(self._fi_prefill_chunks), num_chunks):
-                    self._fi_prefill_chunks.append(
-                        BatchPrefillWithRaggedKVCacheWrapper(self._workspace_buffer, "NHD", backend="cutlass")
-                    )
-            assert num_chunks <= len(self._fi_prefill_chunks)
-
-        # In MLA, the non-latent num_qo_heads == num_kv_heads
-        num_qo_heads = self.num_heads
-        num_kv_heads = num_qo_heads
-
-        # Sanity: Verify that num_kv_heads == 1 since it is latent space
-        assert self.kv_cache_spec.num_kv_heads == 1
-
-        # Get non-latent head_dim_qk and head_dim_vo
-        head_dim_qk = self.mla_dims.qk_nope_head_dim + self.mla_dims.qk_rope_head_dim
-        head_dim_vo = self.mla_dims.v_head_dim
-
-        # For main run, qo_indptr == kv_indptr
-        kv_indptr = qo_indptr.clone()
-
-        # Prepare main prefill
-        self._fi_prefill_main.plan(
-            qo_indptr=qo_indptr,
-            kv_indptr=kv_indptr,
-            num_qo_heads=num_qo_heads,
-            num_kv_heads=num_kv_heads,
-            head_dim_qk=head_dim_qk,
-            head_dim_vo=head_dim_vo,
-            causal=True,  # This is main run
-            sm_scale=self._global_hyperparameters.sm_scale,
-            window_left=self._global_hyperparameters.window_left,
-            logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
-            q_data_type=self.q_data_type,
-            o_data_type=prefill.output_dtype,
-        )
-
-        # Prepare context prefills
-        if has_context:
-            for i in range(num_chunks):
-                kv_indptr_chunk = chunked_context.cu_seq_lens[i]
-
-                self._fi_prefill_chunks[i].plan(
-                    qo_indptr=qo_indptr,
-                    kv_indptr=kv_indptr_chunk,
-                    num_qo_heads=num_qo_heads,
-                    num_kv_heads=num_kv_heads,
-                    head_dim_qk=head_dim_qk,
-                    head_dim_vo=head_dim_vo,
-                    causal=False,  # This is context run
-                    sm_scale=self._global_hyperparameters.sm_scale,
-                    window_left=self._global_hyperparameters.window_left,
-                    logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
-                    q_data_type=self.q_data_type,
-                    o_data_type=prefill.output_dtype,
-                )
-
-        prefill.prefill_main = self._fi_prefill_main
-        prefill.prefill_chunks = self._fi_prefill_chunks
-
     def _build_decode(
         self,
         block_table_tensor: torch.Tensor,
@@ -1879,16 +1667,12 @@ def build(
                         dtype=torch.int32,
                     )
 
-                chunked_context_metadata_cls = (
-                    CudnnPrefillMetadata.ChunkedContextMetadata
-                    if self._use_cudnn_prefill
-                    else MLACommonPrefillMetadata.ChunkedContextMetadata
-                )
                 prefill_tokens_with_context = None
                 if num_prefills_with_context_cpu > 0:
                     prefill_tokens_with_context = prefill_query_start_loc_cpu[num_prefills_with_context_cpu].item()
+                _ChunkedMetadata = MLACommonPrefillMetadata.ChunkedContextMetadata
                 if self.dcp_world_size > 1:
-                    chunked_context_metadata = chunked_context_metadata_cls(
+                    chunked_context_metadata = _ChunkedMetadata(
                         cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
                         starts=local_chunk_starts.to(device, non_blocking=True),
                         seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
@@ -1905,7 +1689,7 @@ def build(
                         prefill_tokens_with_context=prefill_tokens_with_context,
                     )
                 else:
-                    chunked_context_metadata = chunked_context_metadata_cls(
+                    chunked_context_metadata = _ChunkedMetadata(
                         cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
                         starts=chunk_starts.to(device, non_blocking=True),
                         seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
@@ -1917,28 +1701,19 @@ def build(
                         prefill_tokens_with_context=prefill_tokens_with_context,
                     )
 
-                if self._use_cudnn_prefill:
-                    chunked_context_metadata.seq_lens = chunk_seq_lens
-
                 assert max(chunked_context_metadata.max_seq_lens) <= self.chunked_prefill_workspace_size
 
-            prefill_metadata = self.prefill_metadata_cls(
+            prefill_metadata = MLACommonPrefillMetadata(
                 block_table=block_table_tensor[reqs_start:, ...],
                 query_start_loc=prefill_query_start_loc,
                 max_query_len=max_query_len,
                 chunked_context=chunked_context_metadata,
                 output_dtype=self.model_config.dtype,
                 q_data_type=self.q_data_type,
+                prefill_backend=self._prefill_backend,
             )
 
-            if self._use_cudnn_prefill:
-                assert isinstance(prefill_metadata, CudnnPrefillMetadata)
-                prefill_metadata.query_seq_lens = prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
-                prefill_metadata.cudnn_workspace = self.cudnn_workspace
-
-            if self._use_trtllm_ragged_prefill:
-                prefill_metadata.query_seq_lens = prefill_query_start_loc[1:] - prefill_query_start_loc[:-1]
-                prefill_metadata.workspace_buffer = self._workspace_buffer
+            self._prefill_backend.prepare_metadata(prefill_metadata)
 
         decode_metadata = None
         if num_decodes > 0:
@@ -1981,10 +1756,6 @@ def build(
             decode=decode_metadata,
         )
 
-        if self._use_fi_prefill and num_prefills > 0:
-            assert isinstance(attn_metadata.prefill, FlashInferPrefillMetadata)
-            self._build_fi_prefill_wrappers(attn_metadata.prefill)
-
         return attn_metadata  # type: ignore[return-value]
 
 
@@ -2126,278 +1897,12 @@ def __init__(
             and (self.qk_rope_head_dim == 64)
         )
 
-        if use_trtllm_ragged_deepseek_prefill():
-            logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_trtllm_ragged
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
-            self._pad_v = False
-        elif use_flashinfer_prefill():
-            logger.info_once("Using FlashInfer prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
-            self._pad_v = False
-        elif use_cudnn_prefill():
-            logger.info_once("Using CUDNN prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
-            self._pad_v = False
-        else:  # Use FlashAttention
-            if flash_attn_varlen_func is None:
-                raise RuntimeError(
-                    "MLA attention requires FlashAttention but it is not "
-                    "available. Please install flash_attn or use "
-                    "--attention-backend ROCM_AITER_MLA."
-                )
-            logger.info_once("Using FlashAttention prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa
-
-            # Handle the differences between the flash_attn_varlen from
-            # flash_attn and the one from vllm_flash_attn. The former is used on
-            # RoCM and the latter has an additional parameter to control
-            # FA2 vs FA3
-            self.flash_attn_varlen_func = flash_attn_varlen_func
-            self.vllm_flash_attn_version = get_flash_attn_version(head_size=self.qk_head_dim)
-            if self.vllm_flash_attn_version is not None:
-                self.flash_attn_varlen_func = functools.partial(
-                    flash_attn_varlen_func, fa_version=self.vllm_flash_attn_version
-                )
-
-            # For MLA the v head dim is smaller than qk head dim so we pad out
-            # v with 0s to match the qk head dim for attention backends that do
-            # not support different headdims.
-            # FA3 on Hopper (SM90) and FA4 natively handle diff headdims.
-            device_capability = current_platform.get_device_capability()
-            self._pad_v = self.vllm_flash_attn_version is None or not (
-                (self.vllm_flash_attn_version == 3 and device_capability is not None and device_capability[0] == 9)
-                or self.vllm_flash_attn_version == 4
-            )
-
         self.dcp_world_size: int = -1
 
         self.cp_kv_cache_interleave_size: int = (
             get_current_aphrodite_config().parallel_config.cp_kv_cache_interleave_size
         )
 
-    def _flash_attn_varlen_diff_headdims(self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs):
-        maybe_padded_v = v
-        if self._pad_v:
-            maybe_padded_v = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]], value=0)
-
-        if is_aphrodite_fa:
-            kwargs["return_softmax_lse"] = return_softmax_lse
-        else:
-            # ROCm leverages the upstream flash_attn, which takes a parameter
-            # called "return_attn_probs" instead of return_softmax_lse
-            kwargs["return_attn_probs"] = return_softmax_lse
-        if envs.APHRODITE_BATCH_INVARIANT:
-            kwargs["num_splits"] = 1
-
-        attn_out = self.flash_attn_varlen_func(
-            q=q,
-            k=k,
-            v=maybe_padded_v,
-            softmax_scale=softmax_scale,
-            **kwargs,
-        )
-
-        # Unpack the output if there is multiple results
-        lse = None
-        if isinstance(attn_out, tuple):
-            attn_out, lse = attn_out[0], attn_out[1]
-
-        # Remain consistent with old `flash_attn_varlen_func` where there
-        # is only one output tensor if `return_softmax_lse` is False.
-        if return_softmax_lse:
-            return attn_out, lse
-        return attn_out
-
-    def _run_prefill_new_tokens_fa(self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse):
-        return self._flash_attn_varlen_diff_headdims(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=prefill.query_start_loc,
-            cu_seqlens_k=prefill.query_start_loc,
-            max_seqlen_q=prefill.max_query_len,
-            max_seqlen_k=prefill.max_query_len,
-            softmax_scale=self.scale,
-            causal=True,
-            return_softmax_lse=return_softmax_lse,
-        )
-
-    def _run_prefill_new_tokens_fi(self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse):
-        assert isinstance(prefill, FlashInferPrefillMetadata)
-        assert prefill.prefill_main is not None
-
-        ret = prefill.prefill_main.run(
-            q=q,
-            k=k,
-            v=v,
-            return_lse=return_softmax_lse,
-        )
-
-        if isinstance(ret, tuple):
-            return ret[0], ret[1].transpose(0, 1).contiguous()
-        return ret
-
-    def _run_prefill_new_tokens_cudnn(self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse):
-        assert isinstance(prefill, CudnnPrefillMetadata)
-        assert prefill.query_seq_lens is not None
-        from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache
-
-        output, lse = cudnn_batch_prefill_with_kv_cache(
-            q=q,
-            k_cache=k,
-            v_cache=v,
-            scale=self.scale,
-            workspace_buffer=prefill.cudnn_workspace,
-            max_token_per_sequence=prefill.max_query_len,
-            max_sequence_kv=prefill.max_query_len,
-            actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            causal=True,
-            # Do not support False for now
-            return_lse=True,
-            # Indicates actual_seq_lens are on GPU or CPU.
-            is_cuda_graph_compatible=True,
-        )
-        if return_softmax_lse:
-            return output, lse
-        return output
-
-    def _run_prefill_context_chunk_fa(self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v):
-        assert prefill.chunked_context is not None
-        return self._flash_attn_varlen_diff_headdims(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=prefill.query_start_loc,
-            cu_seqlens_k=prefill.chunked_context.cu_seq_lens[chunk_idx],
-            max_seqlen_q=prefill.max_query_len,
-            max_seqlen_k=prefill.chunked_context.max_seq_lens[chunk_idx],
-            softmax_scale=self.scale,
-            causal=False,  # Context is unmasked
-            return_softmax_lse=True,
-        )
-
-    def _run_prefill_context_chunk_fi(self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v):
-        assert isinstance(prefill, FlashInferPrefillMetadata)
-
-        attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
-            q=q,
-            k=k,
-            v=v,
-            return_lse=True,
-        )
-
-        # Convert from (q_len, num_heads) to (num_heads, q_len)
-        return attn_out, lse.transpose(0, 1).contiguous()
-
-    def _run_prefill_context_chunk_cudnn(self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v):
-        assert isinstance(prefill, CudnnPrefillMetadata)
-        assert prefill.chunked_context is not None
-        assert prefill.chunked_context.seq_lens[chunk_idx] is not None
-        assert prefill.query_seq_lens is not None
-        from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache
-
-        return cudnn_batch_prefill_with_kv_cache(
-            q=q,
-            k_cache=k,
-            v_cache=v,
-            scale=self.scale,
-            workspace_buffer=prefill.cudnn_workspace,
-            max_token_per_sequence=prefill.max_query_len,
-            max_sequence_kv=prefill.chunked_context.max_seq_lens[chunk_idx],
-            actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
-            actual_seq_lens_kv=prefill.chunked_context.seq_lens[chunk_idx].view(-1, 1, 1, 1),
-            causal=False,
-            return_lse=True,
-            # Indicates actual_seq_lens are on GPU or CPU.
-            is_cuda_graph_compatible=True,
-        )
-
-    def _run_prefill_new_tokens_trtllm_ragged(self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse):
-        """TRT-LLM ragged attention for new tokens (causal)."""
-        from flashinfer.prefill import trtllm_ragged_attention_deepseek
-
-        assert prefill.query_seq_lens is not None
-        assert prefill.workspace_buffer is not None
-        # allocate BF16 / FP16 output tensor for TRT-LLM ragged attention
-        out = torch.empty(
-            q.shape[0],
-            q.shape[1],
-            v.shape[2],
-            device=q.device,
-            dtype=prefill.output_dtype,
-        )
-
-        ret = trtllm_ragged_attention_deepseek(
-            query=q,
-            key=k,
-            value=v,
-            workspace_buffer=prefill.workspace_buffer,
-            seq_lens=prefill.query_seq_lens,
-            max_q_len=prefill.max_query_len,
-            max_kv_len=prefill.max_query_len,
-            bmm1_scale=self.scale,
-            bmm2_scale=1.0,
-            o_sf_scale=1.0,
-            batch_size=prefill.query_seq_lens.shape[0],
-            window_left=-1,
-            cum_seq_lens_q=prefill.query_start_loc,
-            cum_seq_lens_kv=prefill.query_start_loc,
-            enable_pdl=False,
-            is_causal=True,
-            return_lse=return_softmax_lse,
-            out=out,
-        )
-
-        if isinstance(ret, tuple):
-            # Convert from (q_len, num_heads) to (num_heads, q_len)
-            return ret[0], ret[1].transpose(0, 1).contiguous()
-        return ret
-
-    def _run_prefill_context_chunk_trtllm_ragged(self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v):
-        """TRT-LLM ragged attention for context chunks (non-causal)."""
-        from flashinfer.prefill import trtllm_ragged_attention_deepseek
-
-        assert prefill.chunked_context is not None
-        assert prefill.chunked_context.seq_lens[chunk_idx] is not None
-        assert prefill.workspace_buffer is not None
-
-        out = torch.empty(
-            q.shape[0],
-            q.shape[1],
-            v.shape[2],
-            device=q.device,
-            dtype=prefill.output_dtype,
-        )
-
-        attn_out, lse = trtllm_ragged_attention_deepseek(
-            query=q,
-            key=k,
-            value=v,
-            workspace_buffer=prefill.workspace_buffer,
-            seq_lens=prefill.chunked_context.seq_lens[chunk_idx],
-            max_q_len=prefill.max_query_len,
-            max_kv_len=prefill.chunked_context.max_seq_lens[chunk_idx],
-            bmm1_scale=self.scale,
-            bmm2_scale=1.0,
-            o_sf_scale=1.0,
-            batch_size=prefill.chunked_context.seq_lens[chunk_idx].shape[0],
-            window_left=-1,
-            cum_seq_lens_q=prefill.query_start_loc,
-            cum_seq_lens_kv=prefill.chunked_context.cu_seq_lens[chunk_idx],
-            enable_pdl=False,
-            is_causal=False,
-            return_lse=True,
-            out=out,
-        )
-
-        # Convert from (q_len, num_heads) to (num_heads, q_len)
-        return attn_out, lse.transpose(0, 1).contiguous()
-
     def _concat_k_nope_k_pe(self, k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
         """
         Efficiently concatenate k_nope and k_pe tensors along the last dimension.
@@ -2436,6 +1941,7 @@ def _compute_prefill_context(
     ):
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.prefill_backend is not None
         assert prefill_metadata.chunked_context is not None
 
         use_fp8_prefill = prefill_metadata.q_data_type == current_platform.fp8_dtype()
@@ -2499,8 +2005,7 @@ def _compute_prefill_context(
 
             k = self._concat_k_nope_k_pe(k_nope, k_pe)
 
-            attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
-                prefill=prefill_metadata,
+            attn_output, attn_softmax_lse = prefill_metadata.prefill_backend.run_prefill_context_chunk(
                 chunk_idx=i,
                 q=q,
                 k=k,
@@ -2537,6 +2042,7 @@ def _context_parallel_compute_prefill_context(
         assert k_scale is None, "DCP not support scaled kvcache now."
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.prefill_backend is not None
         assert prefill_metadata.chunked_context is not None
         assert prefill_metadata.chunked_context.padded_local_chunk_seq_lens is not None
         assert prefill_metadata.chunked_context.local_context_lens_allranks is not None
@@ -2590,8 +2096,7 @@ def _context_parallel_compute_prefill_context(
             k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
             k = self._concat_k_nope_k_pe(k_nope, k_pe)
 
-            attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
-                prefill=prefill_metadata,
+            attn_output, attn_softmax_lse = prefill_metadata.prefill_backend.run_prefill_context_chunk(
                 chunk_idx=i,
                 q=q,
                 k=k,
@@ -2627,11 +2132,11 @@ def forward_mha(
         k_scale: torch.Tensor,
         output: torch.Tensor,
     ) -> None:
-        # TODO (zyongye): Prefill function here
         assert attn_metadata.prefill is not None
         assert self.dcp_world_size != -1
 
         prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.prefill_backend is not None
         use_fp8_prefill = prefill_metadata.q_data_type == current_platform.fp8_dtype()
 
         # Convert q to FP8 if FP8 prefill attention is enabled
@@ -2648,8 +2153,7 @@ def forward_mha(
             k = k.to(prefill_metadata.q_data_type)
             v = v.to(prefill_metadata.q_data_type)
 
-        output_prefill = self._run_prefill_new_tokens(
-            prefill=prefill_metadata,
+        output_prefill = prefill_metadata.prefill_backend.run_prefill_new_tokens(
             q=q,
             k=k,
             v=v,
@@ -2672,11 +2176,6 @@ def forward_mha(
                     q, kv_c_and_k_pe_cache, attn_metadata, k_scale
                 )
 
-            # unpad if necessary
-            if self._pad_v:
-                context_output = context_output[..., : v.shape[-1]]
-                suffix_output = suffix_output[..., : v.shape[-1]]
-
             output = output.view(-1, self.num_heads, self.v_head_dim)
             merge_attn_states(
                 output=output,
@@ -2687,7 +2186,8 @@ def forward_mha(
                 prefill_tokens_with_context=prefill_metadata.chunked_context.prefill_tokens_with_context,
             )
         else:
-            output_prefill = output_prefill[..., : v.shape[-1]].flatten(start_dim=-2)
+            assert isinstance(output_prefill, torch.Tensor)
+            output_prefill = output_prefill.flatten(start_dim=-2)
             output.copy_(output_prefill)
 
     @abstractmethod
diff --git a/aphrodite/model_executor/layers/batch_invariant.py b/aphrodite/model_executor/layers/batch_invariant.py
index 6f564995ed..e47c4c8fe6 100644
--- a/aphrodite/model_executor/layers/batch_invariant.py
+++ b/aphrodite/model_executor/layers/batch_invariant.py
@@ -886,25 +886,27 @@ def enable_batch_invariant_mode():
     _batch_invariant_MODE = True
     _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
 
-    if current_platform.is_device_capability_family(100) or current_platform.is_device_capability_family(80):
-        # For PyTorch 2.9, B200 uses GEMV for bs=1
-        # Requires https://github.com/pytorch/pytorch/pull/166735
+    if current_platform.is_device_capability_family(80):
+        # SM80 (Ampere) cannot rely on cuBLASLt-only determinism; install the
+        # triton persistent matmul overrides for mm/addmm/matmul/linear.
         _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
         _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA")
         _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA")
         _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA")
-
-        # Query the shared memory size and set block size
-        # accordingly to avoid triton OutOfResources
-        _fp16_block_size_n = 256 if get_max_shared_memory_bytes() > 106496 else 128
     else:
-        # Only source of batch invariance for Hopper is split-k, can disable through
-        # cuBLAS workspace config
+        # Hopper (SM90) and Blackwell (SM100): the only source of batch
+        # variance is split-k, which we disable via the cuBLAS workspace
+        # config.
         _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None)
         _original_cublaslt_workspace_size = os.environ.get("CUBLASLT_WORKSPACE_SIZE", None)
         os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
         os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"
 
+    # Triton bmm/persistent-matmul kernels read this for the FP16 N-tile size. Set it on any
+    # CUDA device, whichever branch ran above, because bmm is overridden on all CUDA platforms.
+    if current_platform.is_cuda():
+        _fp16_block_size_n = 256 if get_max_shared_memory_bytes() > 106496 else 128
+
     _batch_invariant_LIB.impl("aten::_log_softmax", _log_softmax_batch_invariant, "CUDA")
     _batch_invariant_LIB.impl("aten::softmax", softmax_batch_invariant, "CUDA")
     _batch_invariant_LIB.impl("aten::_softmax", softmax_batch_invariant, "CUDA")
diff --git a/aphrodite/model_executor/layers/deepseek_compressor.py b/aphrodite/model_executor/layers/deepseek_compressor.py
index a4999ef59f..855a66d235 100644
--- a/aphrodite/model_executor/layers/deepseek_compressor.py
+++ b/aphrodite/model_executor/layers/deepseek_compressor.py
@@ -14,7 +14,6 @@
 from aphrodite.model_executor.layers.linear import (
     MergedColumnParallelLinear,
 )
-from aphrodite.model_executor.layers.utils import cublas_gemm_bf16_bf16_fp32
 from aphrodite.platforms import current_platform
 from aphrodite.triton_utils import tl, triton
 from aphrodite.v1.attention.backend import (
@@ -263,16 +262,12 @@ def __init__(
 
     def forward(
         self,
-        # [num_tokens, hidden_size]
-        x: torch.Tensor,
+        # [num_tokens, 2 * self.coff * self.head_dim]
+        kv_score: torch.Tensor,
         # [num_tokens]
         positions: torch.Tensor,
         rotary_emb,
     ) -> None:
-        num_tokens, _ = x.shape
-        # bf16 weights/activations but fp32 output for numerical stability of
-        # the downstream compressor math.
-        kv_score = cublas_gemm_bf16_bf16_fp32(x, self.fused_wkv_wgate.weight)
         # Each of shape [num_tokens, coff * self.head_dim]
         # input bf16, output are fp32
         kv, score = kv_score.split([self.coff * self.head_dim, self.coff * self.head_dim], dim=-1)
diff --git a/aphrodite/model_executor/layers/deepseek_v4_attention.py b/aphrodite/model_executor/layers/deepseek_v4_attention.py
index 929f96092d..738cadbf50 100644
--- a/aphrodite/model_executor/layers/deepseek_v4_attention.py
+++ b/aphrodite/model_executor/layers/deepseek_v4_attention.py
@@ -4,14 +4,16 @@
 DeepseekV4 MLA Attention Layer
 """
 
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers import DeepseekV2Config, DeepseekV3Config
 
+import aphrodite.envs as envs
 from aphrodite.model_executor.layers.linear import (
     ReplicatedLinear,
 )
@@ -51,7 +53,10 @@
 from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
 )
-from aphrodite.utils.multi_stream_utils import maybe_execute_in_parallel
+from aphrodite.utils.multi_stream_utils import (
+    execute_in_parallel,
+    maybe_execute_in_parallel,
+)
 from aphrodite.v1.attention.backend import AttentionBackend, AttentionMetadata
 from aphrodite.v1.attention.backends.mla.flashmla_sparse import (
     DeepseekV4FlashMLASparseBackend,
@@ -94,7 +99,7 @@ class DeepseekV4MLAModules:
     indexer: torch.nn.Module | None
     indexer_rotary_emb: torch.nn.Module
     topk_indices_buffer: torch.Tensor | None
-    aux_stream: torch.cuda.Stream | None = None
+    aux_stream_list: list[torch.cuda.Stream] | None = None
 
 
 # --8<-- [start:multi_head_latent_attention]
@@ -214,8 +219,11 @@ def __init__(
             + 1  # 1B pad
         )
 
-        self.aux_stream = mla_modules.aux_stream
-        self.ln_events = [torch.cuda.Event(), torch.cuda.Event()]
+        self.aux_stream_list = mla_modules.aux_stream_list
+        # [0]: GEMM start / post-GEMM event0. [1..3]: GEMM done events;
+        # [1] doubles as post-GEMM event1. Reuse is safe: GEMM fully joins
+        # before post-GEMM starts.
+        self.ln_events = [torch.cuda.Event() for _ in range(4)]
 
         assert cache_config is not None, "DeepseekV4 attention requires cache_config"
         self.swa_cache_layer = DeepseekV4SWACache(
@@ -274,9 +282,6 @@ def forward(
         hidden_states: torch.Tensor,
         llama_4_scaling: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        qr_kv, _ = self.fused_wqa_wkv(hidden_states)
-        qr, kv = qr_kv.split([self.q_lora_rank, self.head_dim], dim=-1)
-
         # Pre-allocate attention output with FlashMLA-padded head count.
         # The op writes into `o_padded`; we slice to n_local_heads after.
         num_tokens = hidden_states.shape[0]
@@ -289,8 +294,6 @@ def forward(
         # Attention (inside custom op for torch.compile boundary)
         torch.ops.aphrodite.deepseek_v4_attention(
             hidden_states,
-            qr,
-            kv,
             positions,
             o_padded,
             self.layer_name,
@@ -329,17 +332,74 @@ def forward(
 
         return self.wo_b(z.flatten(1))
 
+    def attn_gemm_parallel_execute(self, hidden_states) -> tuple[Any, ...]:
+        assert self.aux_stream_list is not None
+        assert len(self.aux_stream_list) >= 3
+
+        # fused_wqa_wkv (heaviest) on default; the three lighter input GEMMs
+        # on aux streams 0..2 when their owning module exists. ln_events[0]
+        # is the fan-out start event; ln_events[1..3] are per-aux done events.
+        aux_fns: list[Callable[[], Any] | None] = [None, None, None]
+
+        if self.compressor is not None:
+            # Local ref so the closure keeps a non-None type for mypy.
+            compressor = self.compressor
+
+            def compressor_kv_score() -> torch.Tensor:
+                return torch.mm(
+                    hidden_states,
+                    compressor.fused_wkv_wgate.weight.T,
+                    out_dtype=torch.float32,
+                )
+
+            aux_fns[0] = compressor_kv_score
+
+        if self.indexer is not None:
+            indexer = self.indexer
+
+            def indexer_weights_proj() -> torch.Tensor:
+                # ReplicatedLinear returns (output, bias); bias is None.
+                weights, _ = indexer.weights_proj(hidden_states)
+                return weights
+
+            def indexer_compressor_kv_score() -> torch.Tensor:
+                return torch.mm(
+                    hidden_states,
+                    indexer.compressor.fused_wkv_wgate.weight.T,
+                    out_dtype=torch.float32,
+                )
+
+            aux_fns[1] = indexer_weights_proj
+            aux_fns[2] = indexer_compressor_kv_score
+
+        def fused_wqa_wkv() -> torch.Tensor:
+            # MergedColumnParallelLinear returns (output, bias); bias is None.
+            qr_kv, _ = self.fused_wqa_wkv(hidden_states)
+            return qr_kv
+
+        qr_kv, (kv_score, indexer_weights, indexer_kv_score) = execute_in_parallel(
+            fused_wqa_wkv,
+            aux_fns,
+            self.ln_events[0],
+            self.ln_events[1:4],
+            self.aux_stream_list[:3],
+            enable=hidden_states.shape[0] <= envs.APHRODITE_MULTI_STREAM_GEMM_TOKEN_THRESHOLD,
+        )
+
+        return qr_kv, kv_score, indexer_kv_score, indexer_weights
+
     def attention_impl(
         self,
         hidden_states: torch.Tensor,
-        qr: torch.Tensor,
-        kv: torch.Tensor,
         positions: torch.Tensor,
         out: torch.Tensor,  # [num_tokens, padded_heads, head_dim], written in place
     ) -> None:
         forward_context = get_forward_context()
         attn_metadata = forward_context.attn_metadata
 
+        qr_kv, kv_score, indexer_kv_score, indexer_weights = self.attn_gemm_parallel_execute(hidden_states)
+
+        qr, kv = qr_kv.split([self.q_lora_rank, self.head_dim], dim=-1)
         qr, kv = fused_q_kv_rmsnorm(
             qr,
             kv,
@@ -347,40 +407,60 @@ def attention_impl(
             self.kv_norm.weight.data,
             self.eps,
         )
-        q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
 
-        # Overlap kv_insert with whichever of indexer/compressor is present.
-        # Indexer implies compressor; when both exist, compressor rides on the
-        # aux stream alongside kv_insert so the heavy indexer owns default.
+        # wq_b + kv_insert (+ MLA compressor when an indexer is present) ride
+        # on the default stream so q stays on its consumer stream (mla_attn
+        # downstream reads q on default). The indexer (or, on compressor-only
+        # layers, the compressor) runs on aux, overlapping default's GEMM + cache write.
         if self.indexer is not None:
+            assert self.aux_stream_list is not None
+            aux_stream = self.aux_stream_list[0]
             indexer = self.indexer
             # Local ref so the closure keeps a non-None type for mypy.
             assert self.compressor is not None
             compressor = self.compressor
 
-            def kv_insert_and_compress() -> None:
+            def wq_b_kv_insert_and_compress() -> torch.Tensor:
+                q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
                 self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata)
-                compressor(hidden_states, positions, self.rotary_emb)
-
-            maybe_execute_in_parallel(
-                lambda: indexer(hidden_states, qr, positions, self.indexer_rotary_emb),
-                kv_insert_and_compress,
+                compressor(kv_score, positions, self.rotary_emb)
+                return q
+
+            q, _ = maybe_execute_in_parallel(
+                wq_b_kv_insert_and_compress,
+                lambda: indexer(
+                    hidden_states,
+                    qr,
+                    indexer_kv_score,
+                    indexer_weights,
+                    positions,
+                    self.indexer_rotary_emb,
+                ),
                 self.ln_events[0],
                 self.ln_events[1],
-                self.aux_stream,
+                aux_stream,
             )
         elif self.compressor is not None:
-            # Compressor on default, kv_insert on aux.
+            # wq_b + kv_insert on default, compressor on aux.
+            assert self.aux_stream_list is not None
+            aux_stream = self.aux_stream_list[0]
             compressor = self.compressor
-            maybe_execute_in_parallel(
-                lambda: compressor(hidden_states, positions, self.rotary_emb),
-                lambda: self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata),
+
+            def wq_b_kv_insert() -> torch.Tensor:
+                q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
+                self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata)
+                return q
+
+            q, _ = maybe_execute_in_parallel(
+                wq_b_kv_insert,
+                lambda: compressor(kv_score, positions, self.rotary_emb),
                 self.ln_events[0],
                 self.ln_events[1],
-                self.aux_stream,
+                aux_stream,
             )
         else:
             # SWA-only layer: no compressor, no overlap.
+            q = self.wq_b(qr).view(-1, self.n_local_heads, self.head_dim)
             self._fused_qnorm_rope_kv_insert(q, kv, positions, attn_metadata)
 
         # Handle dummy run (no metadata).
@@ -444,21 +524,17 @@ def _fused_qnorm_rope_kv_insert(
 
 def deepseek_v4_attention(
     hidden_states: torch.Tensor,
-    qr: torch.Tensor,
-    kv: torch.Tensor,
     positions: torch.Tensor,
     out: torch.Tensor,
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    self.attention_impl(hidden_states, qr, kv, positions, out)
+    self.attention_impl(hidden_states, positions, out)
 
 
 def deepseek_v4_attention_fake(
     hidden_states: torch.Tensor,
-    qr: torch.Tensor,
-    kv: torch.Tensor,
     positions: torch.Tensor,
     out: torch.Tensor,
     layer_name: str,
@@ -950,7 +1026,7 @@ def __init__(
         self.compress_ratio = compress_ratio
         self.use_fp4_kv = self.aphrodite_config.attention_config.use_fp4_indexer_cache
         logger.info_once(
-            "Using %s indexer cache for Lighening Indexer.",
+            "Using %s indexer cache for Lightning Indexer.",
             "MXFP4" if self.use_fp4_kv else "FP8",
         )
 
@@ -1022,18 +1098,20 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         qr: torch.Tensor,
+        compressed_kv_score: torch.Tensor,
+        indexer_weights: torch.Tensor,
         positions: torch.Tensor,
         rotary_emb: nn.Module,
     ) -> torch.Tensor:
+        # ReplicatedLinear returns (output, bias); bias is None.
         q, _ = self.wq_b(qr)
         q = q.view(-1, self.n_head, self.head_dim)
-        k = self.compressor(hidden_states, positions, rotary_emb)
-        weights, _ = self.weights_proj(hidden_states)
+        k = self.compressor(compressed_kv_score, positions, rotary_emb)
         q_quant, weights = fused_indexer_q_rope_quant(
             positions,
             q,
             rotary_emb.cos_sin_cache,
-            weights,
+            indexer_weights,
             self.softmax_scale,
             self.n_head**-0.5,
             use_fp4=self.use_fp4_kv,
diff --git a/aphrodite/model_executor/layers/fla/ops/kda.py b/aphrodite/model_executor/layers/fla/ops/kda.py
index 01145c18d6..e65e4a9342 100644
--- a/aphrodite/model_executor/layers/fla/ops/kda.py
+++ b/aphrodite/model_executor/layers/fla/ops/kda.py
@@ -1034,10 +1034,10 @@ def chunk_gla_fwd_kernel_o(
         )
         p_h = tl.make_block_ptr(
             h + (i_tg * H + i_h) * K * V,
-            (K, V),
-            (V, 1),
-            (i_k * BK, i_v * BV),
-            (BK, BV),
+            (V, K),
+            (K, 1),
+            (i_v * BV, i_k * BK),
+            (BV, BK),
             (1, 0),
         )
 
@@ -1048,12 +1048,12 @@ def chunk_gla_fwd_kernel_o(
         b_g = tl.load(p_g, boundary_check=(0, 1))
         # [BT, BK]
         b_qg = (b_q * exp(b_g)).to(b_q.dtype)
-        # [BK, BV]
+        # [BV, BK]
         b_h = tl.load(p_h, boundary_check=(0, 1))
         # works but dkw, owing to divine benevolence
         # [BT, BV]
         if i_k >= 0:
-            b_o += tl.dot(b_qg, b_h.to(b_qg.dtype))
+            b_o += tl.dot(b_qg, tl.trans(b_h).to(b_qg.dtype))
     p_v = tl.make_block_ptr(
         v + (bos * H + i_h) * V,
         (T, V),
diff --git a/aphrodite/model_executor/layers/fused_moe/__init__.py b/aphrodite/model_executor/layers/fused_moe/__init__.py
index e5656cc5cd..ec7286d7ef 100644
--- a/aphrodite/model_executor/layers/fused_moe/__init__.py
+++ b/aphrodite/model_executor/layers/fused_moe/__init__.py
@@ -73,18 +73,23 @@ def get_config() -> dict[str, Any] | None:
 
 if HAS_TRITON:
     # import to register the custom ops
-    from aphrodite.model_executor.layers.fused_moe.cutlass_moe import (
+    from aphrodite.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
+        BatchedDeepGemmExperts,
+    )
+    from aphrodite.model_executor.layers.fused_moe.experts.cutlass_moe import (
         CutlassBatchedExpertsFp8,
         CutlassExpertsFp8,
         CutlassExpertsW4A8Fp8,
         cutlass_moe_w4a8_fp8,
     )
-    from aphrodite.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
-        BatchedDeepGemmExperts,
-    )
     from aphrodite.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
         DeepGemmExperts,
     )
+    from aphrodite.model_executor.layers.fused_moe.experts.xpu_moe import (
+        XPUExperts,
+        XPUExpertsFp8,
+        XPUExpertsMXFp4,
+    )
     from aphrodite.model_executor.layers.fused_moe.fused_batched_moe import (
         BatchedTritonExperts,
     )
@@ -106,11 +111,6 @@ def get_config() -> dict[str, Any] | None:
     from aphrodite.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
         TritonOrDeepGemmExperts,
     )
-    from aphrodite.model_executor.layers.fused_moe.xpu_fused_moe import (
-        XPUExperts,
-        XPUExpertsFp8,
-        XPUExpertsMXFp4,
-    )
 
     __all__ += [
         "AiterExperts",
diff --git a/aphrodite/model_executor/layers/fused_moe/all2all_utils.py b/aphrodite/model_executor/layers/fused_moe/all2all_utils.py
index d72c517243..634c0ebd9a 100644
--- a/aphrodite/model_executor/layers/fused_moe/all2all_utils.py
+++ b/aphrodite/model_executor/layers/fused_moe/all2all_utils.py
@@ -220,21 +220,35 @@ def maybe_make_prepare_finalize(
 
     elif moe.use_fi_nvl_one_sided_kernels:
         assert quant_config is not None
-        if quant_config.quant_dtype != "nvfp4":
-            raise ValueError(
-                "The 'flashinfer_nvlink_one_sided' all2all backend only "
-                "supports nvfp4 activation quantization, but got "
-                f"quant_dtype={quant_config.quant_dtype!r}. Use a different "
-                "all2all backend (e.g. 'flashinfer_nvlink_two_sided' or "
-                "'allgather_reducescatter') for non-nvfp4 models."
-            )
         max_num_tokens = get_current_aphrodite_config().scheduler_config.max_num_batched_tokens
+        if quant_config.quant_dtype is None:
+            dispatch_dtype_bytes_per_elem = 2
+            dispatch_scale_bytes_per_token = 0
+        elif quant_config.quant_dtype == "nvfp4":
+            dispatch_dtype_bytes_per_elem = 0
+            dispatch_scale_bytes_per_token = moe.hidden_dim // 16
+        elif quant_config.quant_dtype == "mxfp8":
+            dispatch_dtype_bytes_per_elem = 1
+            align = quant_config.mx_alignment
+            if align > 0:
+                padded_k = ((moe.hidden_dim + align - 1) // align) * align
+            else:
+                padded_k = moe.hidden_dim
+            dispatch_scale_bytes_per_token = padded_k // 32
+        else:
+            raise NotImplementedError(
+                "flashinfer_nvlink_one_sided dispatch supports nvfp4, mxfp8, "
+                "and bf16 (quant_dtype=None) today; got "
+                f"quant_dtype={quant_config.quant_dtype!r}"
+            )
         prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(
             max_num_tokens=max_num_tokens,
             top_k=moe.experts_per_token,
             num_experts=moe.num_experts,
             hidden_size=moe.hidden_dim,
             num_dispatchers=all2all_manager.world_size,
+            dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,
+            dispatch_scale_bytes_per_token=dispatch_scale_bytes_per_token,
         )
 
     elif moe.use_ag_rs_all2all_kernels and allow_new_interface:
diff --git a/aphrodite/model_executor/layers/fused_moe/config.py b/aphrodite/model_executor/layers/fused_moe/config.py
index 84eba73a2f..a6b75c6ae4 100644
--- a/aphrodite/model_executor/layers/fused_moe/config.py
+++ b/aphrodite/model_executor/layers/fused_moe/config.py
@@ -8,11 +8,7 @@
 
 from aphrodite.config import ParallelConfig, SchedulerConfig
 from aphrodite.config.kernel import MoEBackend
-from aphrodite.distributed import (
-    get_dp_group,
-    get_pcp_group,
-    get_tensor_model_parallel_rank,
-)
+from aphrodite.distributed import get_dp_group, get_pcp_group, get_tensor_model_parallel_rank
 from aphrodite.logger import init_logger
 from aphrodite.model_executor.layers.fused_moe.activation import MoEActivation
 from aphrodite.model_executor.layers.quantization.utils.ocp_mx_utils import (
@@ -116,14 +112,17 @@ class RoutingMethodType(IntEnum):
     RenormalizeNaive = (4,)
     # TopK: TopK (no softmax)
     TopK = (5,)
-    # Custom
-    Custom = (6,)
-    # Simulated
-    Simulated = (7,)
-    # Deepseek V4 -> sqrtsoftplus + Bias + Normalize
-    DeepseekV4 = (8,)
+    # SigmoidRenorm: Sigmoid -> TopK -> Renormalize (divide by sum of top-K)
+    SigmoidRenorm = (6,)
+    # MiniMax2: Sigmoid + Bias -> TopK -> ScaledSumNormalize
+    MiniMax2 = (7,)
     # Unspecified
-    Unspecified = 9.0
+    Unspecified = (8,)
+    # other routing types (not passed to FlashInfer kernels)
+    # Deepseek V4 -> sqrtsoftplus + Bias + Normalize
+    DeepseekV4 = (100,)
+    Custom = (101,)
+    Simulated = (102,)
 
 
 def get_routing_method_type(
@@ -140,15 +139,20 @@ def get_routing_method_type(
             return RoutingMethodType.DeepseekV4
         else:
             return RoutingMethodType.Unspecified
+
     if has_e_score_bias:
         if (num_expert_group or 0) > 0 and scoring_func == "sigmoid":
             return RoutingMethodType.DeepSeekV3
+        elif scoring_func == "sigmoid":
+            return RoutingMethodType.MiniMax2
         else:
             return RoutingMethodType.Unspecified
 
     if scoring_func == "sigmoid":
         if top_k == 1:
             return RoutingMethodType.Llama4
+        elif renormalize:
+            return RoutingMethodType.SigmoidRenorm
         else:
             return RoutingMethodType.Unspecified
 
@@ -249,6 +253,8 @@ class FusedMoEQuantConfig:
     gemm1_beta: float | None = None
     gemm1_clamp_limit: float | None = None
 
+    mx_alignment: int = 0
+
     def __post_init__(self):
         assert not self.per_act_token_quant or self.block_shape is None, "illegal quantization"
 
@@ -640,9 +646,7 @@ def gptq_marlin_moe_quant_config(
     """
     Construct a quant config for gptq marlin quantization.
     """
-    from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
-        GroupShape,
-    )
+    from aphrodite.model_executor.layers.quantization.utils.quant_utils import GroupShape
 
     w_shape = None if group_size == -1 else GroupShape(row=1, col=group_size)
 
@@ -699,6 +703,7 @@ def mxfp4_mxfp8_moe_quant_config(
     gemm1_alpha: float | None = None,
     gemm1_beta: float | None = None,
     gemm1_clamp_limit: float | None = None,
+    mx_alignment: int = 0,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for mxfp4 activations and mxfp4 weights.
@@ -711,6 +716,7 @@ def mxfp4_mxfp8_moe_quant_config(
         gemm1_alpha=gemm1_alpha,
         gemm1_beta=gemm1_beta,
         gemm1_clamp_limit=gemm1_clamp_limit,
+        mx_alignment=mx_alignment,
     )
 
 
@@ -932,9 +938,7 @@ def awq_marlin_moe_quant_config(
     """
     Construct a quant config for awq marlin quantization.
     """
-    from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
-        GroupShape,
-    )
+    from aphrodite.model_executor.layers.quantization.utils.quant_utils import GroupShape
 
     w_shape = None if group_size == -1 else GroupShape(row=1, col=group_size)
 
diff --git a/aphrodite/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json b/aphrodite/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..2d53aedbed
--- /dev/null
+++ b/aphrodite/model_executor/layers/fused_moe/configs/E=128,N=2880,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,147 @@
+{
+    "triton_version": "3.6.0",
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/aphrodite/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py b/aphrodite/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
index 09a585d0e7..b6e5cfed15 100644
--- a/aphrodite/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-
 import torch
 
 import aphrodite.model_executor.layers.fused_moe.modular_kernel as mk
@@ -16,9 +15,7 @@
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
-from aphrodite.model_executor.layers.fused_moe.lora_experts_mixin import (
-    LoRAExpertsMixin,
-)
+from aphrodite.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
 from aphrodite.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP,
 )
diff --git a/aphrodite/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/aphrodite/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index 7f741adb23..948dea14e8 100644
--- a/aphrodite/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -168,13 +168,6 @@ def apply(
         # Pack topk ids and weights into format expected by the kernel.
         packed_topk_ids = trtllm_moe_pack_topk_ids_weights(topk_ids, topk_weights)
 
-        # trtllm_fp8_block_scale_routed_moe does not support autotuning
-        # so skip this kernel during dummy run for autotuning.
-        import aphrodite.utils.flashinfer as fi_utils
-
-        if fi_utils._is_fi_autotuning:
-            return
-
         assert a1q_scale is not None
 
         is_mxfp8 = self.quant_config.block_shape == [1, 32]
@@ -189,11 +182,7 @@ def apply(
             weight_layout = WeightLayout.BlockMajorK
             hidden_states_scale = a1q_scale.t().contiguous()
 
-        # `trtllm_fp8_block_scale_routed_moe` has a bug and does not write to the
-        # output tensor in-place so we need to manually copy the result to the
-        # output tensor
-        # https://github.com/flashinfer-ai/flashinfer/issues/2703
-        result = flashinfer.fused_moe.trtllm_fp8_block_scale_routed_moe(
+        flashinfer.fused_moe.trtllm_fp8_block_scale_routed_moe(
             topk_ids=packed_topk_ids,
             routing_bias=None,
             hidden_states=hidden_states,
@@ -210,13 +199,12 @@ def apply(
             local_expert_offset=self.ep_rank * self.local_num_experts,
             local_num_experts=self.local_num_experts,
             routed_scaling_factor=None,
-            routing_method_type=1,
+            routing_method_type=1,  # not used
             use_shuffled_weight=use_shuffled_weight,
             weight_layout=weight_layout,
             fp8_quantization_type=fp8_quant_type,
-            # output=output,
+            output=output,
         )
-        output.copy_(result)
 
 
 class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolithic):
@@ -268,20 +256,6 @@ def _supports_router_logits_dtype(
         router_logits_dtype: torch.dtype | None,
         routing_method: RoutingMethodType,
     ) -> bool:
-        """
-        The FlashInfer TRTLLM FP8 kernel expects bfloat16 router_logits by default.
-        DeepSeekV3 routing supports float32 router_logits (converted internally).
-        Simulated routing generates synthetic decisions and is agnostic to dtype.
-        """
-        if router_logits_dtype == torch.float32:
-            # DeepSeekV3 routing handles float32 logits internally.
-            # Simulated routing generates synthetic decisions, so the
-            # kernel doesn't care about the actual logits dtype.
-            # https://github.com/flashinfer-ai/flashinfer/issues/2469
-            return routing_method in (
-                RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Simulated,
-            )
         return True
 
     @staticmethod
@@ -301,18 +275,22 @@ def _supports_routing_method(
             # NOTE(rob): potentially allow others here. This is a conservative list.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Simulated,
                 RoutingMethodType.Renormalize,
                 RoutingMethodType.RenormalizeNaive,
+                RoutingMethodType.SigmoidRenorm,
+                RoutingMethodType.MiniMax2,
+                RoutingMethodType.Simulated,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
-                RoutingMethodType.Simulated,
                 RoutingMethodType.Renormalize,
                 RoutingMethodType.RenormalizeNaive,
+                RoutingMethodType.SigmoidRenorm,
+                RoutingMethodType.MiniMax2,
+                RoutingMethodType.Simulated,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")
@@ -348,14 +326,6 @@ def _apply_block_scale(
         # TODO: fuse into the quant kernel.
         assert a1q_scale is not None
 
-        if self.routing_method_type == RoutingMethodType.DeepSeekV3:
-            router_logits = router_logits.to(torch.float32)
-
-        # Currently FI requires bfloat16 routing bias.
-        # https://github.com/flashinfer-ai/flashinfer/issues/2909
-        if e_score_correction_bias is not None:
-            e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16)
-
         is_mxfp8 = self.quant_config.block_shape == [1, 32]
         if is_mxfp8:
             fp8_quant_type = Fp8QuantizationType.MxFp8
@@ -422,10 +392,6 @@ def _apply_per_tensor(
         else:
             assert not apply_router_weight_on_input
 
-        # The DeepSeekV3 routing method requires float32 router logits.
-        if self.routing_method_type == RoutingMethodType.DeepSeekV3:
-            router_logits = router_logits.to(torch.float32)
-
         # Currently FI requires bfloat16 routing bias.
         # https://github.com/flashinfer-ai/flashinfer/issues/2909
         if e_score_correction_bias is not None:
diff --git a/aphrodite/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py b/aphrodite/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
index af7f8f0504..123dd0fa05 100644
--- a/aphrodite/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
@@ -42,6 +42,7 @@ def __init__(
         self.topk = moe_config.experts_per_token
         self.intermediate_size_per_partition = moe_config.intermediate_size_per_partition
         self.hidden_dim = moe_config.hidden_dim
+        self.hidden_dim_unpadded = moe_config.hidden_dim_unpadded or moe_config.hidden_dim
         self.local_num_experts = moe_config.num_local_experts
         self.ep_rank = moe_config.moe_parallel_config.ep_rank
 
@@ -78,9 +79,6 @@ def __init__(
 
         self.max_capture_size = get_current_aphrodite_config().compilation_config.max_cudagraph_capture_size
 
-        # P1-5 fix: use public quant_dtype property instead of private _a1
-        self.use_mxfp8_input = quant_config.quant_dtype == "mxfp8"
-
     @staticmethod
     def _supports_current_device() -> bool:
         p = current_platform
@@ -117,8 +115,7 @@ def supports_expert_map(self) -> bool:
 
     @property
     def expects_unquantized_inputs(self) -> bool:
-        # Expert handles MXFP8 quantization internally if needed
-        return True
+        return False
 
 
 class TrtLlmMxfp4ExpertsMonolithic(TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsMonolithic):
@@ -175,22 +172,19 @@ def apply(
     ) -> torch.Tensor:
         from flashinfer import trtllm_fp4_block_scale_moe
 
-        # Handle input quantization
-        if self.use_mxfp8_input:
-            from flashinfer import mxfp8_quantize
-
-            x_quant, x_scale = mxfp8_quantize(
-                hidden_states,
-                is_sf_swizzled_layout=False,
-                alignment=256,
-            )
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*hidden_states.shape[:-1], -1)
+        if a1q_scale is not None:
+            x_quant = hidden_states
+            x_scale = a1q_scale.view(torch.float8_e4m3fn)
         else:
             assert hidden_states.dtype == torch.bfloat16
             x_quant = hidden_states
             x_scale = None
-
-        output = torch.empty_like(hidden_states)
+        output = torch.empty(
+            *hidden_states.shape[:-1],
+            self.hidden_dim_unpadded,
+            dtype=torch.bfloat16,
+            device=hidden_states.device,
+        )
 
         from aphrodite.utils.flashinfer import _is_fi_autotuning, autotune
 
@@ -236,10 +230,6 @@ class TrtLlmMxfp4ExpertsModular(TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsModula
     Moved from trtllm_moe.py.
     """
 
-    @property
-    def expects_unquantized_inputs(self) -> bool:
-        return True
-
     @staticmethod
     def _supports_parallel_config(
         moe_parallel_config: FusedMoEParallelConfig,
@@ -276,7 +266,7 @@ def workspace_shapes(
         # The workspaces for this implementation are managed by flashinfer.
         workspace1 = (0,)
         workspace2 = (0,)
-        output = (M, K)
+        output = (M, self.hidden_dim_unpadded)
         return (workspace1, workspace2, output)
 
     def apply(
@@ -302,16 +292,9 @@ def apply(
         intermediate_size = self.intermediate_size_per_partition
         local_expert_offset = self.moe_config.ep_rank * local_num_experts
 
-        # Handle input quantization
-        if self.use_mxfp8_input:
-            from flashinfer import mxfp8_quantize
-
-            x_quant, x_scale = mxfp8_quantize(
-                hidden_states,
-                is_sf_swizzled_layout=False,
-                alignment=256,
-            )
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*hidden_states.shape[:-1], -1)
+        if a1q_scale is not None:
+            x_quant = hidden_states
+            x_scale = a1q_scale.view(torch.float8_e4m3fn)
         else:
             assert hidden_states.dtype == torch.bfloat16
             x_quant = hidden_states
diff --git a/aphrodite/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/aphrodite/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
index d46baf14dc..566b4b45dd 100644
--- a/aphrodite/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
@@ -16,9 +16,7 @@
 from aphrodite.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP,
 )
-from aphrodite.model_executor.layers.fused_moe.utils import (
-    trtllm_moe_pack_topk_ids_weights,
-)
+from aphrodite.model_executor.layers.fused_moe.utils import trtllm_moe_pack_topk_ids_weights
 from aphrodite.model_executor.layers.quantization.utils.flashinfer_utils import (
     activation_to_flashinfer_int,
 )
@@ -106,8 +104,12 @@ def _supports_quant_scheme(
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
-        """Supports only SiLU and RELU^2 non-gated activation."""
-        return activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]
+        """Supports only SiLU, non-gated RELU^2, and GELU activations."""
+        return activation in [
+            MoEActivation.SILU,
+            MoEActivation.RELU2_NO_MUL,
+            MoEActivation.GELU,
+        ]
 
     @staticmethod
     def _supports_shape(hidden_dim: int) -> bool:
@@ -184,7 +186,7 @@ def apply(
     ):
         import flashinfer
 
-        assert activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]
+        assert self._supports_activation(activation)
         assert a1q_scale is not None
         assert self.quant_config.w1_scale is not None
         assert self.quant_config.w2_scale is not None
@@ -192,13 +194,6 @@ def apply(
         # Pack topk ids and weights into format expected by the kernel.
         packed_tensor = trtllm_moe_pack_topk_ids_weights(topk_ids, topk_weights)
 
-        # trtllm_fp4_block_scale_routed_moe does not support autotuning
-        # so skip this kernel during dummy run for autotuning.
-        import aphrodite.utils.flashinfer as fi_utils
-
-        if fi_utils._is_fi_autotuning:
-            return
-
         # Invoke kernel.
         flashinfer.fused_moe.trtllm_fp4_block_scale_routed_moe(
             topk_ids=packed_tensor,
@@ -225,7 +220,7 @@ def apply(
             local_expert_offset=self.ep_rank * self.local_num_experts,
             local_num_experts=self.local_num_experts,
             routed_scaling_factor=None,
-            routing_method_type=1,
+            routing_method_type=1,  # not used
             do_finalize=True,
             activation_type=activation_to_flashinfer_int(activation),
             output=output,
@@ -254,7 +249,9 @@ def _supports_routing_method(
             RoutingMethodType.Renormalize,
             RoutingMethodType.RenormalizeNaive,
             RoutingMethodType.Llama4,
+            RoutingMethodType.SigmoidRenorm,
+            RoutingMethodType.MiniMax2,
             RoutingMethodType.Simulated,
         ]
 
     @staticmethod
@@ -262,20 +260,6 @@ def _supports_router_logits_dtype(
         router_logits_dtype: torch.dtype | None,
         routing_method: RoutingMethodType,
     ) -> bool:
-        """
-        The FlashInfer TRTLLM NvFp4 kernel expects bfloat16 router_logits by default.
-        DeepSeekV3 routing supports float32 router_logits (converted internally).
-        Simulated routing generates synthetic decisions and is agnostic to dtype.
-        """
-        if router_logits_dtype == torch.float32:
-            # DeepSeekV3 routing handles float32 logits internally.
-            # Simulated routing generates synthetic decisions, so the
-            # kernel doesn't care about the actual logits dtype.
-            # https://github.com/flashinfer-ai/flashinfer/issues/2469
-            return routing_method in (
-                RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Simulated,
-            )
         return True
 
     def apply(
@@ -297,7 +281,7 @@ def apply(
     ) -> torch.Tensor:
         import flashinfer
 
-        assert activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]
+        assert self._supports_activation(activation)
         assert a1q_scale is not None
         assert self.quant_config.w1_scale is not None
         assert self.quant_config.w2_scale is not None
@@ -305,13 +289,6 @@ def apply(
             not apply_router_weight_on_input and self.routing_method_type != RoutingMethodType.Llama4
         )
 
-        # Prepare router logits for kernel format.
-        router_logits = (
-            router_logits.to(torch.float32)
-            if self.routing_method_type == RoutingMethodType.DeepSeekV3
-            else router_logits
-        )
-
         # Currently FI requires bfloat16 routing bias.
         # https://github.com/flashinfer-ai/flashinfer/issues/2909
         if e_score_correction_bias is not None:
diff --git a/aphrodite/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/aphrodite/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index 1ab187ba53..51e5775b37 100644
--- a/aphrodite/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -114,7 +114,7 @@ def _supports_current_device() -> bool:
             and (
                 p.is_device_capability(90)
                 or p.is_device_capability_family(100)
-                or p.is_device_capability_family(110)
+                # SM110 excluded: flashinfer-ai/flashinfer#3134
                 or p.is_device_capability_family(120)
             )
             and has_flashinfer_cutlass_fused_moe()
diff --git a/aphrodite/model_executor/layers/fused_moe/fused_batched_moe.py b/aphrodite/model_executor/layers/fused_moe/fused_batched_moe.py
index 9dd4662c96..cda2f35111 100644
--- a/aphrodite/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -11,9 +11,7 @@
     FusedMoEParallelConfig,
     FusedMoEQuantConfig,
 )
-from aphrodite.model_executor.layers.fused_moe.fused_moe import (
-    try_get_optimal_moe_config,
-)
+from aphrodite.model_executor.layers.fused_moe.fused_moe import try_get_optimal_moe_config
 from aphrodite.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate,
 )
diff --git a/aphrodite/model_executor/layers/fused_moe/fused_humming_moe.py b/aphrodite/model_executor/layers/fused_moe/fused_humming_moe.py
index d5d34afe2a..539af083a3 100644
--- a/aphrodite/model_executor/layers/fused_moe/fused_humming_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/fused_humming_moe.py
@@ -20,9 +20,7 @@
 from aphrodite.model_executor.layers.fused_moe.moe_align_block_size import (
     moe_align_block_size,
 )
-from aphrodite.model_executor.layers.fused_moe.moe_fused_mul_sum import (
-    moe_fused_mul_sum,
-)
+from aphrodite.model_executor.layers.fused_moe.moe_fused_mul_sum import moe_fused_mul_sum
 from aphrodite.model_executor.layers.fused_moe.moe_permute_unpermute import (
     moe_permute,
     moe_unpermute,
diff --git a/aphrodite/model_executor/layers/fused_moe/fused_marlin_moe.py b/aphrodite/model_executor/layers/fused_moe/fused_marlin_moe.py
index 8e0b517623..aa3aaef2db 100644
--- a/aphrodite/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -17,9 +17,7 @@
     FusedMoEParallelConfig,
     FusedMoEQuantConfig,
 )
-from aphrodite.model_executor.layers.fused_moe.lora_experts_mixin import (
-    LoRAExpertsMixin,
-)
+from aphrodite.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
 from aphrodite.model_executor.layers.fused_moe.moe_align_block_size import (
     batched_moe_align_block_size,
     moe_align_block_size,
diff --git a/aphrodite/model_executor/layers/fused_moe/fused_moe.py b/aphrodite/model_executor/layers/fused_moe/fused_moe.py
index b91cd6e7df..28d00c2edb 100644
--- a/aphrodite/model_executor/layers/fused_moe/fused_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/fused_moe.py
@@ -25,9 +25,7 @@
     FusedMoEQuantConfig,
     _get_config_dtype_str,
 )
-from aphrodite.model_executor.layers.fused_moe.lora_experts_mixin import (
-    LoRAExpertsMixin,
-)
+from aphrodite.model_executor.layers.fused_moe.lora_experts_mixin import LoRAExpertsMixin
 from aphrodite.model_executor.layers.fused_moe.moe_align_block_size import (
     moe_align_block_size,
 )
diff --git a/aphrodite/model_executor/layers/fused_moe/fused_moe_method_base.py b/aphrodite/model_executor/layers/fused_moe/fused_moe_method_base.py
index 08288b5cae..391981f96f 100644
--- a/aphrodite/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/aphrodite/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -137,10 +137,6 @@ def skip_forward_padding(self) -> bool:
     def supports_eplb(self) -> bool:
         return False
 
-    @property
-    def supports_shared_expert_overlap(self) -> bool:
-        return True
-
     @property
     def method_name(self) -> str:
         return self.__class__.__name__
diff --git a/aphrodite/model_executor/layers/fused_moe/layer.py b/aphrodite/model_executor/layers/fused_moe/layer.py
index 4300ef9753..2e17398756 100644
--- a/aphrodite/model_executor/layers/fused_moe/layer.py
+++ b/aphrodite/model_executor/layers/fused_moe/layer.py
@@ -1409,16 +1409,19 @@ def _maybe_make_contiguous(name: str, p: torch.nn.Parameter) -> torch.nn.Paramet
             "w2_input_scale",
         }
 
+        # Parameters of non-expert submodules that live inside runner (MoERunner).
+        # These must be excluded from EPLB weight rearrangement.
+        NON_EXPERT_PREFIXES = (
+            "runner._shared_experts.",
+            "runner.gate.",
+            "runner.routed_input_transform.",
+            "runner.routed_output_transform.",
+        )
+
         assert all(
             weight.is_contiguous()
             for name, weight in weights
-            if not (
-                name.startswith("_shared_experts.")
-                or name.startswith("_gate.")
-                or name.startswith("_routed_input_transform.")
-                or name.startswith("_routed_output_transform.")
-            )
-            and name not in NON_EXPERT_WEIGHTS
+            if not name.startswith(NON_EXPERT_PREFIXES) and name not in NON_EXPERT_WEIGHTS
         )
 
         return [
@@ -1426,12 +1429,7 @@ def _maybe_make_contiguous(name: str, p: torch.nn.Parameter) -> torch.nn.Paramet
             for name, weight in weights
             if name not in NON_EXPERT_WEIGHTS
             and weight.shape != torch.Size([])
-            and not name.startswith("_shared_experts.")
-            # exclude parameters from non-expert submodules,
-            # e.g. gate/shared/transforms.
-            and not name.startswith("_gate.")
-            and not name.startswith("_routed_input_transform.")
-            and not name.startswith("_routed_output_transform.")
+            and not name.startswith(NON_EXPERT_PREFIXES)
         ]
 
     def set_eplb_state(
diff --git a/aphrodite/model_executor/layers/fused_moe/oracle/int_wna16.py b/aphrodite/model_executor/layers/fused_moe/oracle/int_wna16.py
index 15669202b0..d89cdc0491 100644
--- a/aphrodite/model_executor/layers/fused_moe/oracle/int_wna16.py
+++ b/aphrodite/model_executor/layers/fused_moe/oracle/int_wna16.py
@@ -27,9 +27,7 @@
 )
 
 if TYPE_CHECKING:
-    from aphrodite.model_executor.layers.quantization.gptq_marlin import (
-        GPTQMarlinConfig,
-    )
+    from aphrodite.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
 
 logger = init_logger(__name__)
 
diff --git a/aphrodite/model_executor/layers/fused_moe/oracle/mxfp4.py b/aphrodite/model_executor/layers/fused_moe/oracle/mxfp4.py
index d7e1000a05..0a69145e7a 100644
--- a/aphrodite/model_executor/layers/fused_moe/oracle/mxfp4.py
+++ b/aphrodite/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -22,9 +22,7 @@
     mxfp4_w4a16_moe_quant_config,
     ocp_mx_moe_quant_config,
 )
-from aphrodite.model_executor.layers.quantization.utils.mxfp4_utils import (
-    _swizzle_mxfp4,
-)
+from aphrodite.model_executor.layers.quantization.utils.mxfp4_utils import _swizzle_mxfp4
 from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8Dynamic128Sym,
@@ -153,9 +151,7 @@ def backend_to_kernel_cls(
         return [AiterExperts]
 
     elif backend == Mxfp4MoeBackend.XPU:
-        from aphrodite.model_executor.layers.fused_moe.experts.xpu_moe import (
-            XPUExpertsMXFp4,
-        )
+        from aphrodite.model_executor.layers.fused_moe.experts.xpu_moe import XPUExpertsMXFp4
 
         return [XPUExpertsMXFp4]
 
@@ -1118,10 +1114,18 @@ def make_mxfp4_moe_quant_config(
             gemm1_beta=gemm1_beta,
             gemm1_clamp_limit=swiglu_limit,
         )
-    elif mxfp4_backend in (
-        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
-        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
-    ):
+    elif mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8:
+        return mxfp4_mxfp8_moe_quant_config(
+            w1_bias=w1_bias,
+            w2_bias=w2_bias,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            gemm1_alpha=gemm1_alpha,
+            gemm1_beta=gemm1_beta,
+            gemm1_clamp_limit=swiglu_limit,
+            mx_alignment=256,
+        )
+    elif mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8:
         return mxfp4_mxfp8_moe_quant_config(
             w1_bias=w1_bias,
             w2_bias=w2_bias,
@@ -1173,7 +1177,6 @@ def make_mxfp4_moe_kernel(
     """Create a FusedMoEKernel for the given MXFP4 backend."""
     is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)
 
-    # Create Prepare/Finalize.
     prepare_finalize = maybe_make_prepare_finalize(
         moe=moe_config,
         quant_config=moe_quant_config,
diff --git a/aphrodite/model_executor/layers/fused_moe/oracle/nvfp4.py b/aphrodite/model_executor/layers/fused_moe/oracle/nvfp4.py
index 7f05c4e907..7e5b40ee8e 100644
--- a/aphrodite/model_executor/layers/fused_moe/oracle/nvfp4.py
+++ b/aphrodite/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -31,6 +31,9 @@
 from aphrodite.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
     prepare_nvfp4_moe_layer_for_marlin,
 )
+from aphrodite.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (
+    kE2M1ToFloat_handle,
+)
 from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
 )
@@ -255,7 +258,6 @@ def _return_or_raise(
                 activation_key,
                 activation_format,
             )
-
             if supported:
                 logger.info_once(_make_log_backend(backend))
                 return backend, k_cls
@@ -352,6 +354,10 @@ def convert_to_nvfp4_moe_kernel_format(
             is_act_and_mul=is_act_and_mul,
         )
     elif nvfp4_backend == NvFp4MoeBackend.EMULATION:
+        # Move the E2M1 lookup table to the device now, because
+        # `.to(device)` is not allowed during CUDA graph capture.
+        kE2M1ToFloat_handle.val = kE2M1ToFloat_handle.val.to(layer.weight.device)
+
         if a13_scale is None or a2_scale is None:
             raise ValueError(
                 f"Activation global scales should not be None, got a13_scale={a13_scale}, a2_scale={a2_scale}"
diff --git a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
index 72f317697c..19586d1071 100644
--- a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
+++ b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
@@ -16,7 +16,6 @@
     moe_kernel_quantize_input,
     normalize_batched_scales_shape,
 )
-from aphrodite.platforms import current_platform
 from aphrodite.v1.worker.ubatching import (
     dbo_current_ubatch_id,
     dbo_enabled,
@@ -258,46 +257,29 @@ def prepare_async(
 
         # Dispatch
         dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids)
-        if current_platform.is_rocm():
-            (
-                expert_x,
-                expert_num_tokens,
-                handle,
-                _,
-                hook,
-            ) = self.buffer.low_latency_dispatch(
-                a1,
-                dispatch_topk_ids,
-                self.max_tokens_per_rank,
-                num_experts,
-                use_fp8=self.use_fp8_dispatch,
-                async_finish=False,
-                return_recv_hook=True,
-            )
-        else:
-            (
-                expert_x,
-                expert_num_tokens,
-                handle,
-                _,
-                hook,
-            ) = self.buffer.low_latency_dispatch(
-                a1,
-                dispatch_topk_ids,
-                self.max_tokens_per_rank,
-                num_experts,
-                use_fp8=self.use_fp8_dispatch,
-                round_scale=self.use_ue8m0_dispatch,
-                use_ue8m0=self.use_ue8m0_dispatch,
-                **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
-                **(
-                    dict(x_global_scale=qc_a1_gscale_or_scale)
-                    if qc_a1_gscale_or_scale is not None and nvfp4_dispatch
-                    else dict()
-                ),
-                async_finish=False,
-                return_recv_hook=True,
-            )
+        (
+            expert_x,
+            expert_num_tokens,
+            handle,
+            _,
+            hook,
+        ) = self.buffer.low_latency_dispatch(
+            a1,
+            dispatch_topk_ids,
+            self.max_tokens_per_rank,
+            num_experts,
+            use_fp8=self.use_fp8_dispatch,
+            round_scale=self.use_ue8m0_dispatch,
+            use_ue8m0=self.use_ue8m0_dispatch,
+            **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
+            **(
+                dict(x_global_scale=qc_a1_gscale_or_scale)
+                if qc_a1_gscale_or_scale is not None and nvfp4_dispatch
+                else dict()
+            ),
+            async_finish=False,
+            return_recv_hook=True,
+        )
         self.handles[a2a_idx] = handle
 
         return (
diff --git a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
index bc3c7e36a0..14f4bcad03 100644
--- a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
+++ b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
@@ -31,6 +31,8 @@ def __init__(
         num_experts: int,
         hidden_size: int,
         num_dispatchers: int = 1,
+        dispatch_dtype_bytes_per_elem: int = 0,
+        dispatch_scale_bytes_per_token: int = 0,
     ):
         super().__init__()
         self.max_num_tokens = max_num_tokens
@@ -38,6 +40,7 @@ def __init__(
         self.num_experts = num_experts
         self.hidden_size = hidden_size
         self.num_dispatchers_ = num_dispatchers
+        self.scale_elems_per_token = dispatch_scale_bytes_per_token
 
         device_communicator = get_ep_group().device_communicator
         assert device_communicator is not None
@@ -49,6 +52,8 @@ def __init__(
             top_k=self.top_k,
             num_experts=self.num_experts,
             hidden_size=self.hidden_size,
+            dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,
+            dispatch_scale_bytes_per_token=dispatch_scale_bytes_per_token,
         )
 
     @property
@@ -88,19 +93,24 @@ def prepare(
             max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else a1.shape[0]
         )
 
-        a1q, a1q_scale = moe_kernel_quantize_input(
-            a1,
-            quant_config.a1_gscale,
-            quant_config.quant_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
-        )
+        if defer_input_quant:
+            a1q, a1q_scale = a1, None
+        else:
+            a1q, a1q_scale = moe_kernel_quantize_input(
+                a1,
+                quant_config.a1_gscale,
+                quant_config.quant_dtype,
+                quant_config.per_act_token_quant,
+                quant_config.block_shape,
+                is_fp4_scale_swizzled=False,  # delay swizzle to after comm
+                mx_alignment=quant_config.mx_alignment,
+            )
 
         payloads = []
         payloads.append(a1q)
         if a1q_scale is not None:
             payloads.append(a1q_scale)
+        topk_ids_payload_index = len(payloads)
         payloads.append(topk_ids)
         payloads.append(topk_weights)
 
@@ -109,6 +119,8 @@ def prepare(
             token_selected_experts=topk_ids,
             input_payloads=payloads,
             runtime_max_tokens_per_rank=self.runtime_max_tokens_per_rank,
+            invalid_token_expert_id=-1,  # Follow TRTLLM Pattern
+            expert_id_payload_index=topk_ids_payload_index,
         )
         if a1q_scale is not None:
             a1q_recv, a1q_scale_recv, topk_ids_recv, topk_weights_recv = recv_payloads
@@ -117,7 +129,8 @@ def prepare(
                 a1q_scale_recv = a1q_scale_recv.view(-1, a1q_scale_recv.shape[-1])
                 a1q_scale_recv = a1q_scale_recv.view(torch.uint8)
                 a1q_scale_recv = nvfp4_block_scale_interleave(a1q_scale_recv)
-            a1q_scale_recv = a1q_scale_recv.view(-1, self.hidden_size // 16)
+            assert self.scale_elems_per_token > 0
+            a1q_scale_recv = a1q_scale_recv.view(-1, self.scale_elems_per_token)
         else:
             a1q_recv, topk_ids_recv, topk_weights_recv = recv_payloads
             a1q_scale_recv = None
diff --git a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
index 75207df1b6..37b7f05694 100644
--- a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
+++ b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
@@ -164,6 +164,7 @@ def flashinfer_alltoall_dispatch(
             # the hidden states, breaking the A2A kernel. So, we
             # delay the swizzling until after the A2A.
             is_fp4_scale_swizzled=False,
+            mx_alignment=quant_config.mx_alignment,
         )
 
         x = MnnvlMoe.mnnvl_moe_alltoallv(
diff --git a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
index 77ea537f23..39c00ce778 100644
--- a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
+++ b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
@@ -36,6 +36,7 @@ def _quantize_and_setup_dispatch(
             per_act_token_quant=quant_config.per_act_token_quant,
             block_shape=quant_config.block_shape,
             is_fp4_scale_swizzled=False,
+            mx_alignment=quant_config.mx_alignment,
         )
 
     # Skip gathering scales if we have static quantization
diff --git a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
index b63cdf89af..b77863db2f 100644
--- a/aphrodite/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
+++ b/aphrodite/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
@@ -29,6 +29,7 @@ def _quantize_input(
         per_act_token_quant=quant_config.per_act_token_quant,
         block_shape=quant_config.block_shape,
         is_fp4_scale_swizzled=quant_config.is_nvfp4_scale_swizzled,
+        mx_alignment=quant_config.mx_alignment,
     )
 
     return a1q, a1q_scale
diff --git a/aphrodite/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/aphrodite/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 5aced67203..79fc887c74 100644
--- a/aphrodite/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -224,7 +224,8 @@ def rocm_aiter_fused_experts(
 
     else:
         quant_method = QuantMethod.NO.value
-        # mxfp4: both w4a4 (quark) and w4a16 (oracle CK) use BLOCK_1X32
+        # mxfp4 (both w4a4 and w4a16) uses BLOCK_1X32
+        # mxfp6 and mxfp8 are unsupported in AITER currently and use emulation instead
         if quant_config.use_mxfp4_w4a4 or quant_config.use_mxfp4_w4a16:
             quant_method = QuantMethod.BLOCK_1X32.value
         # w8a8 block-scaled
diff --git a/aphrodite/model_executor/layers/fused_moe/routed_experts_capturer.py b/aphrodite/model_executor/layers/fused_moe/routed_experts_capturer.py
index a4615e7db4..a2cc58a50f 100644
--- a/aphrodite/model_executor/layers/fused_moe/routed_experts_capturer.py
+++ b/aphrodite/model_executor/layers/fused_moe/routed_experts_capturer.py
@@ -24,6 +24,23 @@
 
 logger = logging.getLogger(__name__)
 
+
+def _get_num_experts_per_tok(hf_config) -> int:
+    """Resolve the per-token expert count from the HF config.
+
+    Reads ``num_experts_per_tok`` (e.g. DeepSeek) and falls back to
+    ``top_k_experts`` (e.g. Gemma 4); raises ``ValueError`` if both are missing or None.
+    """
+    val = getattr(hf_config, "num_experts_per_tok", None)
+    if val is None:
+        val = getattr(hf_config, "top_k_experts", None)
+    if val is None:
+        raise ValueError(
+            "Cannot determine num_experts_per_tok: HF config has neither 'num_experts_per_tok' nor 'top_k_experts'"
+        )
+    return val
+
+
 # Constants
 _TMP_DIR = tempfile.gettempdir()
 _LOCK_FILE_PREFIX = os.path.join(_TMP_DIR, "aphrodite_routed_experts")
@@ -125,7 +142,7 @@ def init_buffer(
 
         hf_config = aphrodite_config.model_config.hf_text_config
         num_layers = hf_config.num_hidden_layers
-        num_experts_per_tok = hf_config.num_experts_per_tok
+        num_experts_per_tok = _get_num_experts_per_tok(hf_config)
 
         # Initialize device buffer
         self._device_buffer = torch.zeros(
@@ -294,7 +311,7 @@ def attach_buffer(
         shape = (
             max_num_kv_tokens,
             hf_config.num_hidden_layers,
-            hf_config.num_experts_per_tok,
+            _get_num_experts_per_tok(hf_config),
         )
 
         self.dp_rank = aphrodite_config.parallel_config.data_parallel_rank
diff --git a/aphrodite/model_executor/layers/fused_moe/router/custom_routing_router.py b/aphrodite/model_executor/layers/fused_moe/router/custom_routing_router.py
index c468261a65..203c5107a3 100644
--- a/aphrodite/model_executor/layers/fused_moe/router/custom_routing_router.py
+++ b/aphrodite/model_executor/layers/fused_moe/router/custom_routing_router.py
@@ -34,11 +34,15 @@ def __init__(
 
     @property
     def routing_method_type(self) -> RoutingMethodType:
+        from aphrodite.model_executor.models.cohere_moe import token_choice_with_bias
         from aphrodite.model_executor.models.llama4 import Llama4MoE
 
         # NOTE: FLASHINFER_TRTLLM support the Llama4 router.
         if self.custom_routing_function == Llama4MoE.custom_routing_function:
             return RoutingMethodType.Llama4
+        # Cohere MoE uses a sigmoid -> top-k -> renormalize routing function.
+        if self.custom_routing_function == token_choice_with_bias:
+            return RoutingMethodType.SigmoidRenorm
         return RoutingMethodType.Custom
 
     def _compute_routing(
diff --git a/aphrodite/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/aphrodite/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
index 39a3b5871e..552ab48536 100644
--- a/aphrodite/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
+++ b/aphrodite/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
@@ -195,12 +195,10 @@ def fused_topk_bias(
         scores = F.softplus(gating_output).sqrt()
     else:
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
-
     if e_score_correction_bias is not None:
         scores_for_choice = scores.view(-1, n_routed_experts) + e_score_correction_bias.unsqueeze(0)
     else:
         scores_for_choice = scores.view(-1, n_routed_experts)
-
     # For batch invariance, use sorted=True to ensure deterministic expert selection
     if hash_indices_table is not None:
         topk_indices = hash_indices_table[input_tokens]
diff --git a/aphrodite/model_executor/layers/fused_moe/router/gate_linear.py b/aphrodite/model_executor/layers/fused_moe/router/gate_linear.py
index 9da7ae594f..5baaf3d3b0 100644
--- a/aphrodite/model_executor/layers/fused_moe/router/gate_linear.py
+++ b/aphrodite/model_executor/layers/fused_moe/router/gate_linear.py
@@ -97,7 +97,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, Paramet
 
         # Tier 2: cuBLAS bf16→fp32
         if self.allow_cublas_router_gemm and x.dtype == torch.bfloat16:
-            output = ops.router_gemm_bf16_fp32(x, self.weight)
+            output = torch.mm(x, self.weight.T, out_dtype=torch.float32)
             return output, None
 
         # Tier 3: F.linear (ReplicatedLinear)
diff --git a/aphrodite/model_executor/layers/fused_moe/runner/moe_runner.py b/aphrodite/model_executor/layers/fused_moe/runner/moe_runner.py
index f153e35a3b..440b2f6448 100644
--- a/aphrodite/model_executor/layers/fused_moe/runner/moe_runner.py
+++ b/aphrodite/model_executor/layers/fused_moe/runner/moe_runner.py
@@ -220,7 +220,7 @@ def __init__(
         self.routed_output_transform = routed_output_transform
         self.routed_scaling_factor = routed_scaling_factor
         self.gate = gate
-        self.quant_method = quant_method
+        self._quant_method = quant_method
         self.enable_dbo = enable_dbo
 
         self._shared_experts: SharedExperts | None = None
@@ -261,7 +261,7 @@ def shared_experts(self) -> SharedExperts | None:
     def _replace_quant_method(self, quant_method: FusedMoEMethodBase):
         if self._shared_experts is not None:
             self._shared_experts._quant_method = quant_method
-        self.quant_method = quant_method
+        self._quant_method = quant_method
 
     def is_internal_router(self) -> bool:
         return self.gate is not None
@@ -325,7 +325,7 @@ def _maybe_apply_routed_scale_to_output(
 
     @property
     def _fused_output_is_reduced(self) -> bool:
-        return self.quant_method.moe_kernel is not None and self.quant_method.moe_kernel.output_is_reduced()
+        return self._quant_method.moe_kernel is not None and self._quant_method.moe_kernel.output_is_reduced()
 
     def _maybe_reduce_shared_expert_output(
         self,
@@ -390,7 +390,7 @@ def _maybe_pad_hidden_states(
         """
         shared_experts_hidden_dim = shared_experts_input.shape[-1] if shared_experts_input is not None else 0
         transformed_hidden_dim = hidden_states.shape[-1]
-        if not self.quant_method.skip_forward_padding and self.moe_config.hidden_dim != transformed_hidden_dim:
+        if not self._quant_method.skip_forward_padding and self.moe_config.hidden_dim != transformed_hidden_dim:
             hidden_states = F.pad(
                 hidden_states,
                 (0, self.moe_config.hidden_dim - transformed_hidden_dim),
@@ -430,8 +430,8 @@ def _apply_quant_method(
         """
         self._maybe_apply_shared_experts(shared_experts_input, SharedExpertsOrder.NO_OVERLAP)
 
-        if self.quant_method.is_monolithic:
-            fused_out = self.quant_method.apply_monolithic(
+        if self._quant_method.is_monolithic:
+            fused_out = self._quant_method.apply_monolithic(
                 layer=layer,
                 x=hidden_states,
                 router_logits=router_logits,
@@ -446,7 +446,7 @@ def _apply_quant_method(
 
             # Passing shared_experts_input in case SharedExpertsOrder is
             # MK_INTERNAL_OVERLAPPED.
-            fused_out = self.quant_method.apply(
+            fused_out = self._quant_method.apply(
                 layer=layer,
                 x=hidden_states,
                 topk_weights=topk_weights,
@@ -586,7 +586,7 @@ def forward(
 
     @property
     def do_naive_dispatch_combine(self) -> bool:
-        return self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk
+        return self.moe_config.dp_size > 1 and not self._quant_method.supports_internal_mk
 
     def _maybe_dispatch(
         self,
diff --git a/aphrodite/model_executor/layers/fused_moe/runner/moe_runner_interface.py b/aphrodite/model_executor/layers/fused_moe/runner/moe_runner_interface.py
index 9c53ab0dc7..0012b2c0a1 100644
--- a/aphrodite/model_executor/layers/fused_moe/runner/moe_runner_interface.py
+++ b/aphrodite/model_executor/layers/fused_moe/runner/moe_runner_interface.py
@@ -4,6 +4,7 @@
 
 import torch
 
+from aphrodite.model_executor.custom_op import PluggableLayer
 from aphrodite.model_executor.layers.fused_moe.fused_moe_method_base import (
     FusedMoEMethodBase,
 )
@@ -12,7 +13,7 @@
 )
 
 
-class MoERunnerInterface(ABC):
+class MoERunnerInterface(PluggableLayer, ABC):
     """
     Abstract base class for Mixture of Experts (MoE) runners.
 
diff --git a/aphrodite/model_executor/layers/fused_moe/runner/shared_experts.py b/aphrodite/model_executor/layers/fused_moe/runner/shared_experts.py
index c53c2855b1..5bad3baf24 100644
--- a/aphrodite/model_executor/layers/fused_moe/runner/shared_experts.py
+++ b/aphrodite/model_executor/layers/fused_moe/runner/shared_experts.py
@@ -6,7 +6,6 @@
 
 import aphrodite.envs as envs
 from aphrodite.logger import init_logger
-from aphrodite.model_executor.layers.activation import SiluAndMul
 from aphrodite.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
 )
@@ -65,13 +64,6 @@ def __init__(
         self._moe_config = moe_config
         self._quant_method = quant_method
 
-        # Shared expert MLPs often use a plain SiluAndMul custom op. When
-        # custom ops are globally disabled for torch.compile, that activation
-        # may try to compile its native fallback during CUDA graph capture.
-        act_fn = getattr(self._layer, "act_fn", None)
-        if isinstance(act_fn, SiluAndMul) and not getattr(act_fn, "_enforce_enable", False):
-            self._layer.act_fn = SiluAndMul(enforce_enable=True)
-
         # Allow disabling of the separate shared experts stream for
         # debug purposes.
         # TODO: Remove this after more extensive testings with TP/DP
@@ -100,13 +92,7 @@ def _determine_shared_experts_order(
         self,
         hidden_states: torch.Tensor,
     ) -> SharedExpertsOrder:
-        is_capturing = current_platform.is_cuda() and torch.cuda.is_current_stream_capturing()
-        if (
-            self._disable_shared_experts_overlap
-            or not self._quant_method.supports_shared_expert_overlap
-            or torch.compiler.is_compiling()
-            or is_capturing
-        ):
+        if self._disable_shared_experts_overlap:
             return SharedExpertsOrder.NO_OVERLAP
 
         if self._quant_method.mk_owns_shared_expert:
diff --git a/aphrodite/model_executor/layers/fused_moe/triton_cutlass_moe.py b/aphrodite/model_executor/layers/fused_moe/triton_cutlass_moe.py
index 90f649e091..99350a5fce 100644
--- a/aphrodite/model_executor/layers/fused_moe/triton_cutlass_moe.py
+++ b/aphrodite/model_executor/layers/fused_moe/triton_cutlass_moe.py
@@ -10,9 +10,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from aphrodite.model_executor.layers.fused_moe.experts.cutlass_moe import (
-    CutlassExpertsFp8,
-)
+from aphrodite.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
 from aphrodite.model_executor.layers.fused_moe.fallback import FallbackExperts
 from aphrodite.model_executor.layers.fused_moe.fused_moe import TritonExperts
 from aphrodite.platforms import current_platform
diff --git a/aphrodite/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/aphrodite/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index d69ca4c018..99bbe214f4 100644
--- a/aphrodite/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/aphrodite/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -292,7 +292,13 @@ def forward_cuda(
         topk_ids: torch.Tensor,
         shared_experts_input: torch.Tensor | None,
     ) -> torch.Tensor:
-        return self.forward_native(layer, x, topk_weights, topk_ids, shared_experts_input)
+        return self.forward_native(
+            layer,
+            x,
+            topk_weights,
+            topk_ids,
+            shared_experts_input,
+        )
 
     def apply_monolithic(
         self,
diff --git a/aphrodite/model_executor/layers/fused_moe/utils.py b/aphrodite/model_executor/layers/fused_moe/utils.py
index 99c56e8b5c..31f18eebde 100644
--- a/aphrodite/model_executor/layers/fused_moe/utils.py
+++ b/aphrodite/model_executor/layers/fused_moe/utils.py
@@ -202,11 +202,12 @@ def _mxfp8_e4m3_quantize(
     per_act_token_quant: bool,
     block_shape: list[int] | None = None,
     is_sf_swizzled_layout: bool = False,
+    mx_alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert A_scale is None
     assert not per_act_token_quant
     assert block_shape is None or block_shape == [1, 32]
-    return mxfp8_e4m3_quantize(A, is_sf_swizzled_layout)
+    return mxfp8_e4m3_quantize(A, is_sf_swizzled_layout, mx_alignment)
 
 
 def _mxfp6_e3m2_quantize(
@@ -252,6 +253,7 @@ def moe_kernel_quantize_input(
     is_fp4_scale_swizzled: bool = True,
     ocp_mx_scheme: str | None = None,
     quantization_emulation: bool = False,
+    mx_alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor | None]:
     # Handle OCP MX scheme that requires QDQ (quantize-dequantize) for emulation
     if ocp_mx_scheme is not None:
@@ -288,7 +290,8 @@ def moe_kernel_quantize_input(
         if not quantization_emulation:
             return _nvfp4_quantize(A, A_scale, is_sf_swizzled_layout=is_fp4_scale_swizzled)
         else:
-            return ref_nvfp4_quant_dequant(A, A_scale, block_size=16)
+            A = ref_nvfp4_quant_dequant(A, A_scale, block_size=16)
+            return A, None
     elif quant_dtype == "mxfp4":
         if not quantization_emulation:
             raise NotImplementedError(
@@ -308,7 +311,8 @@ def moe_kernel_quantize_input(
             A_scale,
             per_act_token_quant,
             block_shape,
-            is_sf_swizzled_layout=is_fp4_scale_swizzled,
+            is_sf_swizzled_layout=False,
+            mx_alignment=mx_alignment,
         )
     elif quant_dtype == "mxfp6_e3m2":
         if not quantization_emulation:
diff --git a/aphrodite/model_executor/layers/linear.py b/aphrodite/model_executor/layers/linear.py
index 4bda266b27..d67def3bcc 100644
--- a/aphrodite/model_executor/layers/linear.py
+++ b/aphrodite/model_executor/layers/linear.py
@@ -1053,7 +1053,10 @@ def _load_fused_module_from_checkpoint(self, param: BaseAphroditeParameter, load
             # Special case for Quantization.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
-            if (
+            if isinstance(param, BlockQuantScaleParameter):
+                weight_block_size = getattr(self, "weight_block_size", None)
+                shard_size, shard_offset = adjust_block_scale_shard(weight_block_size, shard_size, shard_offset)
+            elif (
                 isinstance(param, (PackedColumnParameter, PackedAphroditeParameter))
                 and param.packed_dim == param.output_dim
             ):
diff --git a/aphrodite/model_executor/layers/mamba/linear_attn.py b/aphrodite/model_executor/layers/mamba/linear_attn.py
index 7690a419e5..1565be43d4 100644
--- a/aphrodite/model_executor/layers/mamba/linear_attn.py
+++ b/aphrodite/model_executor/layers/mamba/linear_attn.py
@@ -3,6 +3,7 @@
 
 import math
 from collections.abc import Callable
+from functools import partial
 from typing import cast
 
 import torch
@@ -36,28 +37,44 @@
 from aphrodite.v1.attention.backends.linear_attn import LinearAttentionMetadata
 
 
+@CustomOp.register("minimax_text01_rmsnorm_tp")
 class MiniMaxText01RMSNormTP(CustomOp):
-    name = "MiniMaxText01RMSNormTP"
-
-    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        *,
+        weight_shard_world_size: int | None = None,
+        weight_shard_rank: int | None = None,
+    ) -> None:
         super().__init__()
         self.tp_world = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
-        self.weight = nn.Parameter(torch.ones(int(hidden_size / self.tp_world)))
-
-        self.weight.weight_loader = self.weight_loader
+        self.weight_shard_world = weight_shard_world_size or self.tp_world
+        self.weight_shard_rank = self.tp_rank if weight_shard_rank is None else weight_shard_rank
+
+        self.weight = nn.Parameter(torch.ones(hidden_size // self.weight_shard_world))
+        self.weight.weight_loader = partial(
+            self.weight_loader,
+            shard_world_size=self.weight_shard_world,
+            shard_rank=self.weight_shard_rank,
+        )
         self.variance_epsilon = eps
 
     @staticmethod
     def weight_loader(
         param: nn.Parameter,
         loaded_weight: torch.Tensor,
+        shard_world_size: int | None = None,
+        shard_rank: int | None = None,
     ) -> None:
-        tp_world = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
+        if shard_world_size is None:
+            shard_world_size = get_tensor_model_parallel_world_size()
+        if shard_rank is None:
+            shard_rank = get_tensor_model_parallel_rank()
 
-        shard_size = loaded_weight.shape[0] // tp_world
-        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        shard_size = loaded_weight.shape[0] // shard_world_size
+        shard = slice(shard_rank * shard_size, (shard_rank + 1) * shard_size)
         param.data.copy_(loaded_weight[shard])
 
     def _forward(
diff --git a/aphrodite/model_executor/layers/mamba/mamba_utils.py b/aphrodite/model_executor/layers/mamba/mamba_utils.py
index 13bf4eaee5..8c773bca5d 100644
--- a/aphrodite/model_executor/layers/mamba/mamba_utils.py
+++ b/aphrodite/model_executor/layers/mamba/mamba_utils.py
@@ -54,9 +54,6 @@ def linear_attention_state_dtype(
         model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        # TODO (tdoublep) requires testing
-        if mamba_cache_dtype == "float32":
-            raise ValueError("fp32 state for minimax is not yet supported")
         state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
         return (state_dtype,)
 
diff --git a/aphrodite/model_executor/layers/mhc.py b/aphrodite/model_executor/layers/mhc.py
index d01c39b064..937e2f734e 100644
--- a/aphrodite/model_executor/layers/mhc.py
+++ b/aphrodite/model_executor/layers/mhc.py
@@ -478,3 +478,133 @@ def _mhc_post_fake(
     mutates_args=[],
     fake_impl=_mhc_post_fake,
 )
+
+
+@tilelang.jit(
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10,
+    },
+)
+def hc_head_fuse_tilelang(
+    residual,
+    fn,
+    hc_scale,
+    hc_base,
+    out,
+    hidden_size: int,
+    rms_eps: float,
+    hc_eps: float,
+    hc_mult: int = 4,
+    n_thr: int = 128,
+    h_blk: int = 1024,
+):
+    """Two-pass fused kernel for hc_head; the kernel index ``i`` ranges over tokens.
+    Pass 1: accumulate per-token squared sum and hc_mult dot-products
+            (projections onto fn rows) using cross-thread reducers.
+    Pass 2: out[i] = sum_m pre_mix[m] * residual[i, m] (sigmoid-gated weighted sum).
+    Avoids materialising mixes / rsqrt / pre tensors to global memory.
+    """
+    num_tokens = T.dynamic("num_tokens")
+    hc_dim = hc_mult * hidden_size
+    h_block = math.gcd(h_blk, hidden_size)  # tile width dividing hidden_size exactly, so no tail tile
+    n_h = hidden_size // h_block
+
+    residual: T.Tensor[[num_tokens, hc_mult, hidden_size], T.bfloat16]  # type: ignore[no-redef,valid-type]
+    fn: T.Tensor[[hc_mult, hc_dim], T.float32]  # type: ignore[no-redef,valid-type]
+    hc_scale: T.Tensor[[1], T.float32]  # type: ignore[no-redef,valid-type]
+    hc_base: T.Tensor[[hc_mult], T.float32]  # type: ignore[no-redef,valid-type]
+    out: T.Tensor[[num_tokens, hidden_size], T.bfloat16]  # type: ignore[no-redef,valid-type]
+
+    with T.Kernel(num_tokens, threads=n_thr) as i:  # i indexes the token
+        T.pdl_sync()
+
+        # ------------------------------------------------------------------
+        # Pass 1 – for each residual channel m_c and h_block:
+        #   • accumulate squared sum (for RMS norm denominator)
+        #   • accumulate hc_mult dot-products with fn rows
+        # ------------------------------------------------------------------
+        sqrsum_r = T.alloc_reducer((1,), T.float32, replication="all")
+        mixes_r = T.alloc_reducer((hc_mult,), T.float32, replication="all")
+        T.fill(sqrsum_r, 0.0)
+        T.fill(mixes_r, 0.0)
+
+        for m_c in T.serial(hc_mult):
+            for i_h in T.serial(n_h):
+                x_local = T.alloc_fragment(h_block, T.float32)
+                T.copy(residual[i, m_c, i_h * h_block], x_local)  # bf16 residual tile into an fp32 fragment
+
+                for k in T.Parallel(h_block):
+                    sqrsum_r[0] += x_local[k] * x_local[k]
+
+                for m_m in T.unroll(hc_mult):
+                    fn_local = T.alloc_fragment(h_block, T.float32)
+                    T.copy(fn[m_m, m_c * hidden_size + i_h * h_block], fn_local)  # slice of fn row m_m for this tile
+                    for k in T.Parallel(h_block):
+                        mixes_r[m_m] += x_local[k] * fn_local[k]
+
+        T.finalize_reducer(sqrsum_r)
+        T.finalize_reducer(mixes_r)
+
+        # ------------------------------------------------------------------
+        # Compute pre_mix = sigmoid(mix * rsqrt * scale + base) + eps
+        # ------------------------------------------------------------------
+        pre_mix_shared = T.alloc_shared(hc_mult, T.float32)
+        rsqrt_val = T.alloc_fragment(1, T.float32)
+        rsqrt_val[0] = T.rsqrt(sqrsum_r[0] / hc_dim + rms_eps)  # 1 / sqrt(mean square over hc_dim + rms_eps)
+        for m in T.Parallel(hc_mult):
+            pre_mix_shared[m] = T.sigmoid(mixes_r[m] * rsqrt_val[0] * hc_scale[0] + hc_base[m]) + hc_eps
+
+        # ------------------------------------------------------------------
+        # Pass 2 – apply_mix: pipelined weighted sum over residual channels
+        # ------------------------------------------------------------------
+        for i0_h in T.Pipelined(n_h, num_stages=2):
+            xs = T.alloc_shared((hc_mult, h_block), T.bfloat16)
+            xl = T.alloc_fragment((hc_mult, h_block), T.float32)
+            T.copy(residual[i, 0, i0_h * h_block], xs, disable_tma=True)  # presumably the (hc_mult, h_block) tile
+            T.copy(xs, xl)
+
+            ol = T.alloc_fragment(h_block, T.float32)
+            T.clear(ol)
+            for i_hc in T.serial(hc_mult):
+                pre = pre_mix_shared[i_hc]
+                for i1_h in T.Parallel(h_block):
+                    ol[i1_h] += pre * xl[i_hc, i1_h]
+
+            T.copy(ol, out[i, i0_h * h_block], disable_tma=True)
+
+        T.pdl_trigger()
+
+
+def _hc_head_fused_kernel(
+    hs_flat: torch.Tensor,
+    fn: torch.Tensor,
+    hc_scale: torch.Tensor,
+    hc_base: torch.Tensor,
+    out: torch.Tensor,
+    hidden_size: int,
+    rms_eps: float,
+    hc_eps: float,
+    hc_mult: int,
+) -> None:
+    """Fill pre-allocated `out` (T, H) in-place with the hc_head result; `out` is untouched when T == 0."""
+    if hs_flat.shape[0] > 0:  # skip the kernel launch entirely for an empty batch
+        hc_head_fuse_tilelang(
+            hs_flat,
+            fn,
+            hc_scale,
+            hc_base,
+            out,
+            hidden_size,
+            rms_eps,
+            hc_eps,
+            hc_mult,
+        )
+
+
+direct_register_custom_op(
+    op_name="hc_head_fused_kernel",
+    op_func=_hc_head_fused_kernel,  # NOTE(review): no fake_impl is passed, unlike the op registered above; confirm
+    mutates_args=["out"],  # the op returns None and writes its result into `out`
+)
diff --git a/aphrodite/model_executor/layers/mla.py b/aphrodite/model_executor/layers/mla.py
index a5b2647e65..e6ec9d8d8c 100644
--- a/aphrodite/model_executor/layers/mla.py
+++ b/aphrodite/model_executor/layers/mla.py
@@ -64,6 +64,7 @@ def __init__(
         cache_config: CacheConfig | None = None,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
+        skip_topk: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -87,6 +88,12 @@ def __init__(
         self.indexer_rope_emb = mla_modules.indexer_rotary_emb
         self.is_sparse = mla_modules.is_sparse
 
+        # Whether to skip top-k token selection computation in this layer.
+        # When True, the indexer will not be called, and the layer will reuse
+        # the topk_tokens buffer written by a previous layer in the same pass.
+        # See https://arxiv.org/abs/2603.12201 for more details.
+        self.skip_topk = skip_topk
+
         if self.indexer is not None:
             assert hasattr(self.indexer, "topk_tokens")
             self.topk_tokens = self.indexer.topk_tokens
@@ -147,8 +154,8 @@ def forward(
         if self.rotary_emb is not None:
             q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb(positions, q[..., self.qk_nope_head_dim :], k_pe)
 
-        if self.indexer and self.is_sparse:
-            _topk_indices = self.indexer(hidden_states, q_c, positions, self.indexer_rope_emb)
+        if self.indexer and self.is_sparse and not self.skip_topk:
+            self.indexer(hidden_states, q_c, positions, self.indexer_rope_emb)
 
         if llama_4_scaling is not None:
             q *= llama_4_scaling
diff --git a/aphrodite/model_executor/layers/pooler/tokwise/methods.py b/aphrodite/model_executor/layers/pooler/tokwise/methods.py
index 0fcfe7cde4..b1294e42b6 100644
--- a/aphrodite/model_executor/layers/pooler/tokwise/methods.py
+++ b/aphrodite/model_executor/layers/pooler/tokwise/methods.py
@@ -47,13 +47,17 @@ def forward(
         pooling_metadata: PoolingMetadata,
     ) -> list[TokenPoolingMethodOutputItem]:
         pooling_cursor = pooling_metadata.get_pooling_cursor()
-        hidden_states_lst = [
-            hidden_states[first : last + 1]
-            for first, last in zip(
-                pooling_cursor.first_token_indices_gpu.tolist(),
-                pooling_cursor.last_token_indices_gpu.tolist(),
-            )
-        ]
+        split_sizes = pooling_cursor.num_scheduled_tokens_cpu.tolist()
+        if split_sizes:
+            # DispatchPooler passes the full hidden_states tensor.
+            # Slice out this group's token range once, then split it
+            # by per-request token counts.
+            group_start = int(pooling_cursor.first_token_indices_gpu[0].item())
+            group_end = int(pooling_cursor.last_token_indices_gpu[-1].item()) + 1
+            hidden_states_group = hidden_states[group_start:group_end]
+            hidden_states_lst = list(hidden_states_group.split(split_sizes))
+        else:
+            hidden_states_lst = []
 
         if not self.enable_chunked_prefill:
             return hidden_states_lst
@@ -109,7 +113,7 @@ def forward(
                 if step_tag_id is not None:
                     data = data[token_id == step_tag_id]
 
-            pooled_data.append(data)
+                pooled_data.append(data)
 
         return pooled_data
 
diff --git a/aphrodite/model_executor/layers/quantization/modelopt.py b/aphrodite/model_executor/layers/quantization/modelopt.py
index 9ece83007c..2955adfb36 100644
--- a/aphrodite/model_executor/layers/quantization/modelopt.py
+++ b/aphrodite/model_executor/layers/quantization/modelopt.py
@@ -114,12 +114,12 @@
     # MIXED_PRECISION,
     "MIXED_PRECISION",
 ]
-KV_CACHE_QUANT_ALGOS = ["FP8"]
+KV_CACHE_QUANT_ALGOS = ["FP8", "NVFP4"]
 
 
-class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
+class ModelOptKVCacheMethod(BaseKVCacheMethod):
     """
-    Supports loading kv-cache scaling factors from FP8 checkpoints.
+    Supports loading kv-cache scaling factors from FP8 or NVFP4 checkpoints.
     """
 
     def __init__(self, quant_config: "ModelOptQuantConfigBase"):
@@ -944,7 +944,7 @@ def apply(
 
 ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod
 ModelOptFp8Config.FusedMoEMethodCls = ModelOptFp8MoEMethod
-ModelOptFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+ModelOptFp8Config.KVCacheMethodCls = ModelOptKVCacheMethod
 
 
 class ModelOptNvFp4Config(ModelOptQuantConfigBase):
@@ -1407,7 +1407,7 @@ def apply(
 
 ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod
 ModelOptNvFp4Config.FusedMoEMethodCls = ModelOptNvFp4FusedMoE
-ModelOptNvFp4Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+ModelOptNvFp4Config.KVCacheMethodCls = ModelOptKVCacheMethod
 
 
 class ModelOptMxFp8Config(ModelOptQuantConfigBase):
@@ -1892,7 +1892,7 @@ def apply(
 # Register the method classes for ModelOptMxFp8Config
 ModelOptMxFp8Config.LinearMethodCls = ModelOptMxFp8LinearMethod
 ModelOptMxFp8Config.FusedMoEMethodCls = ModelOptMxFp8FusedMoE
-ModelOptMxFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+ModelOptMxFp8Config.KVCacheMethodCls = ModelOptKVCacheMethod
 
 
 class ModelOptMixedPrecisionConfig(ModelOptQuantConfigBase):
@@ -2032,7 +2032,7 @@ def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> "QuantizeMeth
         # KV-cache quantization
         if isinstance(layer, Attention):
             if self.kv_cache_quant_method:
-                return ModelOptFp8KVCacheMethod(self)
+                return ModelOptKVCacheMethod(self)
             return None
 
         # Excluded layers
diff --git a/aphrodite/model_executor/layers/quantization/quark/quark.py b/aphrodite/model_executor/layers/quantization/quark/quark.py
index 8323d95362..cd272c5007 100644
--- a/aphrodite/model_executor/layers/quantization/quark/quark.py
+++ b/aphrodite/model_executor/layers/quantization/quark/quark.py
@@ -5,7 +5,6 @@
 from typing import TYPE_CHECKING, Any, cast
 
 import torch
-from transformers import PretrainedConfig
 
 from aphrodite.logger import init_logger
 from aphrodite.model_executor.layers.attention import Attention
@@ -45,10 +44,6 @@
 
 logger = init_logger(__name__)
 
-# model_type values that use dynamic MXFP4 re-quantization for
-# OCP MX fp4 Quark checkpoints
-_DEEPSEEK_V3_FAMILY_MODEL_TYPES = frozenset({"deepseek_v3"})
-
 
 class QuarkConfig(QuantizationConfig):
     def __init__(
@@ -65,25 +60,12 @@ def __init__(
         self.kv_cache_group = kv_cache_group
         self.kv_cache_config = kv_cache_config
         self.pack_method = pack_method
+        # NOTE: this flag is kept disabled because the overhead of
+        # dynamic MXFP4 quantization negates the performance gains
+        # that come from switching to MXFP4. It is left here in case
+        # we want to re-enable it in the future.
         self.dynamic_mxfp4_quant = False
 
-    def maybe_update_config(
-        self,
-        model_name: str,
-        hf_config: PretrainedConfig | None = None,
-        revision: str | None = None,
-    ):
-        """Enable dynamic MXFP4 only for DeepSeek-V3-family + fp4 Quark checkpoints."""
-
-        if getattr(hf_config, "model_type", None) not in _DEEPSEEK_V3_FAMILY_MODEL_TYPES:
-            return
-
-        quant_config = getattr(hf_config, "quantization_config", None)
-        if quant_config is not None:
-            quant_dtype = quant_config.get("global_quant_config", {}).get("weight", {}).get("dtype")
-            if quant_dtype == "fp4":
-                self.dynamic_mxfp4_quant = True
-
     def get_linear_method(self) -> "QuarkLinearMethod":
         return QuarkLinearMethod(self)
 
diff --git a/aphrodite/model_executor/layers/quantization/quark/quark_moe.py b/aphrodite/model_executor/layers/quantization/quark/quark_moe.py
index 5bf7df8fa0..d06f11b1e7 100644
--- a/aphrodite/model_executor/layers/quantization/quark/quark_moe.py
+++ b/aphrodite/model_executor/layers/quantization/quark/quark_moe.py
@@ -910,9 +910,11 @@ def __init__(
 
         self.model_type = getattr(get_current_aphrodite_config().model_config.hf_config, "model_type", None)
 
-        self.emulate = (not current_platform.supports_mx() or not self.ocp_mx_scheme.startswith("w_mxfp4")) and (
-            self.mxfp4_backend is Mxfp4MoeBackend.NONE or not self.use_rocm_aiter_moe
-        )
+        # TODO: Remove once all OCP MX schemes use the kernel abstraction
+        _AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4", "w_mxfp4_a_mxfp4")
+        self.emulate = (
+            not current_platform.supports_mx() or self.ocp_mx_scheme not in _AITER_NATIVE_OCP_MX_SCHEMES
+        ) and (self.mxfp4_backend is Mxfp4MoeBackend.NONE or not self.use_rocm_aiter_moe)
 
         if self.ocp_mx_scheme == "w_mxfp4":
             self.mxfp4_backend, self.experts_cls = select_gpt_oss_mxfp4_moe_backend(moe)
@@ -1401,13 +1403,13 @@ def is_monolithic(self) -> bool:
 
     def apply_monolithic(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        expert_map: torch.Tensor | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        input_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         if layer.enable_eplb:
-            raise NotImplementedError("EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet.")
+            raise NotImplementedError(f"EPLB not supported for {self.__class__.__name__} yet.")
 
         from aphrodite.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (  # noqa: E501
             triton_kernel_moe_forward,
@@ -1423,7 +1425,7 @@ def apply_monolithic(
             topk=layer.top_k,
             renormalize=layer.renormalize,
             global_num_experts=layer.global_num_experts,
-            expert_map=expert_map,
+            expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
             unpadded_N_w1=self.moe.intermediate_size_per_partition_unpadded * 2,
diff --git a/aphrodite/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/aphrodite/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
index c2e56d9997..68eef86dc5 100644
--- a/aphrodite/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
+++ b/aphrodite/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -94,21 +94,12 @@ def gemm_with_dynamic_quant(
                     x_q = x
                     x_s = x_scales
 
-                # 32 alignment is enough for dim0 padding of output for
-                # gemm_a4w4 kernel
-                y = torch.empty(
-                    (M + 31) // 32 * 32,
-                    weight.shape[0],
-                    device=x_q.device,
-                    dtype=out_dtype,
-                )
-
-                gemm_a4w4(
+                y = gemm_a4w4(
                     x_q,
                     weight.view(x_q.dtype),
                     x_s,
                     weight_scale.view(x_s.dtype),
-                    y,
+                    dtype=out_dtype,
                     bpreshuffle=True,
                 )
             return y[:M]
@@ -347,11 +338,15 @@ def apply_weights(
             dq_w = self.dequant_func(layer.weight, layer.weight_scale, x.dtype)
             qdq_x = self.quant_dequant_func(x)
             return F.linear(qdq_x, dq_w, bias)
-        else:
-            return torch.ops.aphrodite.gemm_with_dynamic_quant(
-                x,
-                layer.weight,
-                layer.weight_scale,
-                self.rocm_use_aiter_fp4_asm_gemm,
-                self.out_dtype,
-            )
+        y = torch.ops.aphrodite.gemm_with_dynamic_quant(
+            x,
+            layer.weight,
+            layer.weight_scale,
+            self.rocm_use_aiter_fp4_asm_gemm,
+            self.out_dtype,
+        )
+        # gemm_with_dynamic_quant has no bias argument; add it here so the
+        # native path matches F.linear (e.g. qkv_proj with qkv_bias=True).
+        if bias is not None:
+            y = y + bias
+        return y
diff --git a/aphrodite/model_executor/layers/quantization/utils/mxfp8_utils.py b/aphrodite/model_executor/layers/quantization/utils/mxfp8_utils.py
index c9a1a854d2..538df7210c 100644
--- a/aphrodite/model_executor/layers/quantization/utils/mxfp8_utils.py
+++ b/aphrodite/model_executor/layers/quantization/utils/mxfp8_utils.py
@@ -83,14 +83,20 @@ def _mxfp8_e4m3_quantize_torch(
 
 
 def _mxfp8_e4m3_quantize_impl(
-    x: torch.Tensor, is_sf_swizzled_layout: bool = False
+    x: torch.Tensor,
+    is_sf_swizzled_layout: bool = False,
+    alignment: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     from aphrodite.platforms import current_platform
 
     if current_platform.has_device_capability(100):
         from flashinfer import mxfp8_quantize as flashinfer_mxfp8_quantize
 
-        x_q, x_scales = flashinfer_mxfp8_quantize(x, is_sf_swizzled_layout=is_sf_swizzled_layout)
+        x_q, x_scales = flashinfer_mxfp8_quantize(
+            x,
+            is_sf_swizzled_layout=is_sf_swizzled_layout,
+            alignment=alignment if alignment > 0 else 32,
+        )
         if x_scales.ndim == 1 and x.ndim == 2 and not is_sf_swizzled_layout:
             x_scales = x_scales.view(x.size(0), -1)
         return x_q, x_scales
@@ -98,8 +104,12 @@ def _mxfp8_e4m3_quantize_impl(
     return _mxfp8_e4m3_quantize_torch(x, is_sf_swizzled_layout)
 
 
-def mxfp8_e4m3_quantize(x: torch.Tensor, is_sf_swizzled_layout: bool = False) -> tuple[torch.Tensor, torch.Tensor]:
-    return torch.ops.aphrodite.mxfp8_quantize(x, is_sf_swizzled_layout)
+def mxfp8_e4m3_quantize(
+    x: torch.Tensor,
+    is_sf_swizzled_layout: bool = False,
+    alignment: int = 0,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.ops.aphrodite.mxfp8_quantize(x, is_sf_swizzled_layout, alignment)
 
 
 def dequant_mxfp8_to_bf16(x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
@@ -118,7 +128,11 @@ def dequant_mxfp8_to_bf16(x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor
     return dequantized.to(torch.bfloat16)
 
 
-def mxfp8_e4m3_quantize_fake(x: torch.Tensor, is_sf_swizzled_layout: bool = False) -> tuple[torch.Tensor, torch.Tensor]:
+def mxfp8_e4m3_quantize_fake(
+    x: torch.Tensor,
+    is_sf_swizzled_layout: bool = False,
+    alignment: int = 0,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Fake implementation for torch.compile tracing."""
     fp_data = torch.empty_like(x, dtype=MXFP8_VALUE_DTYPE)
 
diff --git a/aphrodite/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/aphrodite/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index 563508b5ce..6ce5cdb4c7 100644
--- a/aphrodite/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/aphrodite/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -4,7 +4,9 @@
 
 import torch
 
+from aphrodite.platforms import current_platform
 from aphrodite.scalar_type import scalar_types
+from aphrodite.triton_utils import tl, triton
 
 __all__ = [
     "break_fp4_bytes",
@@ -18,6 +20,286 @@
 kE2M1ToFloat_handle = SimpleNamespace(val=torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32))
 
 
+@triton.jit
+def _e2m1_inline(magnitude):
+    """Decode a 3-bit E2M1 magnitude with a 3-level select tree instead of 7 sequential selects.
+    Maps 3-bit magnitude to float: [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
+    Only bits 0-2 of ``magnitude`` are read; the sign is not handled here.
+    """
+    # Bit 2 (MSB): separates 0-3 from 4-7
+    # Bit 1: separates within groups
+    # Bit 0 (LSB): separates within pairs
+    b2 = (magnitude >> 2) & 1  # 0 for mag 0-3, 1 for mag 4-7
+    b1 = (magnitude >> 1) & 1  # middle bit
+    b0 = magnitude & 1  # LSB
+
+    # For mag 0-3: [0.0, 0.5, 1.0, 1.5]
+    low_group = tl.where(b1 == 1, tl.where(b0 == 1, 1.5, 1.0), tl.where(b0 == 1, 0.5, 0.0))
+    # For mag 4-7: [2.0, 3.0, 4.0, 6.0]
+    high_group = tl.where(b1 == 1, tl.where(b0 == 1, 6.0, 4.0), tl.where(b0 == 1, 3.0, 2.0))
+    return tl.where(b2 == 1, high_group, low_group)  # b2 picks between the two candidate groups
+
+
+@triton.jit
+def _dequantize_nvfp4_kernel(
+    fp4_ptr,
+    scale_ptr,
+    global_scale_ptr,
+    output_ptr,
+    rows_per_batch: tl.constexpr,
+    num_blocks: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    has_batch_global_scale: tl.constexpr,
+    TILE_BLOCKS: tl.constexpr,
+):
+    """Triton kernel for NVFP4 dequantization (swizzle=False); grid axis 0 = row, axis 1 = tile of blocks.
+    Optimized with 2D tile processing + interleave for coalesced stores.
+    """
+    BLOCK_PACKED: tl.constexpr = BLOCK_SIZE // 2  # two fp4 values are packed per byte
+
+    row_idx = tl.program_id(0)
+    tile_idx = tl.program_id(1)
+
+    if has_batch_global_scale:
+        batch_idx = row_idx // rows_per_batch  # one global scale per group of rows_per_batch rows
+        global_scale = tl.load(global_scale_ptr + batch_idx).to(tl.float32)
+    else:
+        global_scale = tl.load(global_scale_ptr).to(tl.float32)
+
+    fp4_row_offset = row_idx * num_blocks * BLOCK_PACKED  # flat offsets: buffers must be dense row-major
+    scale_row_offset = row_idx * num_blocks
+    output_row_offset = row_idx * num_blocks * BLOCK_SIZE
+
+    start_block = tile_idx * TILE_BLOCKS
+
+    # Load scales for this tile: [TILE_BLOCKS]
+    block_offsets = tl.arange(0, TILE_BLOCKS)
+    block_mask = (start_block + block_offsets) < num_blocks  # masks blocks past the end of the row
+
+    raw_scales = tl.load(
+        scale_ptr + scale_row_offset + start_block + block_offsets,
+        mask=block_mask,
+        other=0,
+    )
+    scale_f32 = tl.cast(raw_scales, tl.float8e4nv, bitcast=True).to(tl.float32)  # raw bytes reinterpreted as fp8
+    scale_values = (scale_f32 * global_scale)[:, None]  # NOTE(review): multiplies by global_scale; confirm vs torch path
+
+    # Load [TILE_BLOCKS, BLOCK_PACKED] packed bytes
+    packed_offsets = tl.arange(0, BLOCK_PACKED)[None, :]
+    byte_indices = fp4_row_offset + (start_block + block_offsets[:, None]) * BLOCK_PACKED + packed_offsets
+    elem_mask = block_mask[:, None]
+    raw_bytes = tl.load(fp4_ptr + byte_indices, mask=elem_mask, other=0)
+
+    low_nibble = raw_bytes & 0x0F
+    high_nibble = (raw_bytes >> 4) & 0x0F
+
+    # Binary tree E2M1 decode
+    low_mag = low_nibble & 0x07
+    low_val = _e2m1_inline(low_mag)
+    low_sign = (low_nibble >> 3) & 1  # bit 3 of each nibble is the sign
+    low_result = tl.where(low_sign == 1, -low_val, low_val) * scale_values
+
+    high_mag = high_nibble & 0x07
+    high_val = _e2m1_inline(high_mag)
+    high_sign = (high_nibble >> 3) & 1
+    high_result = tl.where(high_sign == 1, -high_val, high_val) * scale_values
+
+    # Interleave for coalesced contiguous store
+    result = tl.interleave(low_result, high_result)  # low nibble first, then high nibble, for each byte
+
+    elem_offsets = tl.arange(0, BLOCK_SIZE)[None, :]
+    out_indices = output_row_offset + (start_block + block_offsets[:, None]) * BLOCK_SIZE + elem_offsets
+    tl.store(output_ptr + out_indices, result, mask=block_mask[:, None])
+
+
+@triton.jit
+def _e2m1_lookup(magnitude):
+    """Lookup E2M1 float value from 3-bit magnitude (sequential selects; not called by the kernels in this hunk)."""
+    result = tl.where(magnitude == 1, 0.5, 0.0)  # magnitude 0 and any unmatched value decode to 0.0
+    result = tl.where(magnitude == 2, 1.0, result)
+    result = tl.where(magnitude == 3, 1.5, result)
+    result = tl.where(magnitude == 4, 2.0, result)
+    result = tl.where(magnitude == 5, 3.0, result)
+    result = tl.where(magnitude == 6, 4.0, result)
+    result = tl.where(magnitude == 7, 6.0, result)
+    return result
+
+
+@triton.jit
+def _round_to_fp4(x):
+    """Round float values to the nearest E2M1 representable value.
+    Matches the thresholds in the Python ``cast_to_fp4`` exactly; exact midpoints go to 0, 1, 2 or 4.
+    """
+    sign = tl.where(x < 0.0, -1.0, 1.0)  # zero (including -0.0) takes sign +1
+    abs_x = tl.abs(x)
+    result = tl.where(abs_x > 5.0, 6.0, 0.0)  # anything above 5.0 saturates to 6.0; default is 0.0
+    result = tl.where((abs_x >= 3.5) & (abs_x <= 5.0), 4.0, result)
+    result = tl.where((abs_x > 2.5) & (abs_x < 3.5), 3.0, result)
+    result = tl.where((abs_x >= 1.75) & (abs_x <= 2.5), 2.0, result)
+    result = tl.where((abs_x > 1.25) & (abs_x < 1.75), 1.5, result)
+    result = tl.where((abs_x >= 0.75) & (abs_x <= 1.25), 1.0, result)
+    result = tl.where((abs_x > 0.25) & (abs_x < 0.75), 0.5, result)  # |x| <= 0.25 keeps the 0.0 default
+    return result * sign
+
+
+@triton.jit
+def _nvfp4_quant_dequant_kernel(
+    input_ptr,
+    output_ptr,
+    global_scale_ptr,
+    k: tl.constexpr,
+    num_blocks: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    FP4_MAX_RECIPROCAL: tl.constexpr,
+    TILE_BLOCKS: tl.constexpr,
+):
+    """Fused NVFP4 quantize-dequantize kernel.
+    Uses a 2D grid (rows x tiles) to parallelize across both rows
+    and quantization groups within a row. Each program handles
+    TILE_BLOCKS groups of BLOCK_SIZE elements using vectorized 2D operations.
+    """
+    row_idx = tl.program_id(0)
+    tile_idx = tl.program_id(1)
+    global_scale = tl.load(global_scale_ptr).to(tl.float32)  # a single scalar shared by every row
+    row_offset = row_idx * k  # rows are addressed k elements apart, i.e. dense row-major input/output
+
+    start_block = tile_idx * TILE_BLOCKS
+    block_offsets = tl.arange(0, TILE_BLOCKS)
+    block_mask = (start_block + block_offsets) < num_blocks  # masks groups past the end of the row
+
+    # Load [TILE_BLOCKS, BLOCK_SIZE] elements
+    indices = row_offset + (start_block + block_offsets[:, None]) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)[None, :]
+    mask_2d = block_mask[:, None]
+    x = tl.load(input_ptr + indices, mask=mask_2d, other=0.0).to(tl.float32)
+
+    # Per-group scale: [TILE_BLOCKS]
+    vec_max = tl.max(tl.abs(x), axis=1)
+    scale = global_scale * (vec_max * FP4_MAX_RECIPROCAL)
+    scale = tl.clamp(scale, -448.0, 448.0)  # presumably the finite range of float8 e4m3 (cast on the next line)
+    scale = scale.to(tl.float8e4nv).to(tl.float32)  # round-trip so the scale carries fp8 precision
+
+    # Safe reciprocal, broadcast to [TILE_BLOCKS, 1]
+    output_scale = tl.where(scale == 0.0, 0.0, global_scale / scale)[:, None]  # zero scale -> zero output
+
+    # Quantize: scale, clamp, round to FP4
+    scaled_x = tl.clamp(x * output_scale, -6.0, 6.0)
+    fp4_val = _round_to_fp4(scaled_x)
+
+    # Dequantize: fp4_val * (scale / global_scale)
+    dequant_scale = (scale / global_scale)[:, None]
+    result = fp4_val * dequant_scale
+
+    tl.store(output_ptr + indices, result, mask=mask_2d)
+
+
+def _triton_nvfp4_quant_dequant(
+    x: torch.Tensor,
+    global_scale: torch.Tensor,
+    block_size: int,
+) -> torch.Tensor:
+    """Triton-accelerated NVFP4 quantize-dequantize of a 2D ``x``; returns a new tensor of x's shape/dtype."""
+    x_m, x_k = x.shape
+
+    if not torch.compiler.is_compiling():
+        assert x_k % block_size == 0, f"Weight shape K={x_k} is not divisible by block_size={block_size}"
+
+    output_dtype = x.dtype
+    num_blocks = x_k // block_size
+
+    output = torch.empty(x_m, x_k, dtype=output_dtype, device=x.device)
+
+    tile_blocks = min(64, triton.next_power_of_2(num_blocks))  # at most 64 groups per program
+    num_tiles = (num_blocks + tile_blocks - 1) // tile_blocks  # ceil division
+    grid = (x_m, num_tiles)
+    _nvfp4_quant_dequant_kernel[grid](
+        x.contiguous(),  # kernel addresses rows as row_idx * k, so a strided view would be mis-read
+        output,
+        global_scale,
+        x_k,
+        num_blocks,
+        block_size,
+        FLOAT4_E2M1_MAX_RECIPROCAL,
+        tile_blocks,
+    )
+
+    return output
+
+
+def _triton_dequantize_nvfp4(
+    tensor_fp4: torch.Tensor,
+    tensor_sf: torch.Tensor,
+    global_scale: torch.Tensor,
+    dtype: torch.dtype,
+    block_size: int = 16,
+) -> torch.Tensor:
+    """Dequantize NVFP4 using Triton (swizzle=False only).
+    Supports both 2D and 3D inputs:
+    - 2D: [m, packed_k] -> [m, k], with global_scale[0] applied to every row
+    - 3D: [dim0, m, packed_k] -> [dim0, m, k], with global_scale[b] applied to batch b
+    """
+    assert tensor_fp4.dtype == torch.uint8
+
+    is_3d = tensor_fp4.ndim == 3
+    if is_3d:
+        dim0, m_per_batch, packed_k = tensor_fp4.shape
+        tensor_fp4_2d = tensor_fp4.reshape(-1, packed_k).contiguous()  # kernel uses flat row offsets
+        tensor_sf_2d = tensor_sf.reshape(-1, tensor_sf.shape[-1])
+        total_rows_flat = dim0 * m_per_batch
+    else:
+        m_per_batch, packed_k = tensor_fp4.shape
+        tensor_fp4_2d = tensor_fp4.contiguous()  # no-op when already dense; guards strided views
+        tensor_sf_2d = tensor_sf
+        total_rows_flat = m_per_batch
+
+    k = packed_k * 2  # two fp4 values per packed byte
+    num_blocks = k // block_size
+
+    output = torch.empty(total_rows_flat, k, dtype=dtype, device=tensor_fp4.device)
+
+    # View as uint8 so Triton can load raw bytes and bitcast to float8_e4m3fn
+    scale_raw = tensor_sf_2d.contiguous().view(torch.uint8)
+
+    # Shape-adaptive tile sizing: for large row counts (3D), process
+    # entire row in one tile. For small row counts (2D), use smaller
+    # tiles to increase parallelism across CUs.
+    np2 = triton.next_power_of_2(num_blocks)
+    if total_rows_flat >= 4096:
+        # Many rows: maximize work per CTA, one tile per row
+        tile_blocks = np2
+        nw = 1
+        ns = 2
+    elif total_rows_flat >= 2048:
+        # Medium-many rows: full row, 2 warps
+        tile_blocks = np2
+        nw = 2
+        ns = 2
+    else:
+        # Few rows: use moderate tiles for CU utilization
+        tile_blocks = min(64, np2)
+        nw = 4
+        ns = 2
+    num_tiles = (num_blocks + tile_blocks - 1) // tile_blocks  # ceil division
+    grid = (total_rows_flat, num_tiles)
+    _dequantize_nvfp4_kernel[grid](
+        tensor_fp4_2d,
+        scale_raw,
+        global_scale,
+        output,
+        m_per_batch,
+        num_blocks,
+        block_size,
+        is_3d,  # has_batch_global_scale: 3D inputs read one global scale per dim0 entry
+        tile_blocks,
+        num_warps=nw,
+        num_stages=ns,
+    )
+
+    if is_3d:
+        output = output.reshape(dim0, m_per_batch, k)
+
+    return output
+
+
 def break_fp4_bytes(a, dtype):
     assert a.dtype == torch.uint8
     m, n = a.shape
@@ -65,6 +347,9 @@ def dequantize_to_dtype(
     # Two fp4 values are packed into one uint8.
     assert tensor_fp4.dtype == torch.uint8
 
+    if not swizzle and current_platform.is_cuda_alike():
+        return _triton_dequantize_nvfp4(tensor_fp4, tensor_sf, global_scale, dtype, block_size)
+
     # We handle 3D tensors reshaping them to 2D.
     is_3d = tensor_fp4.ndim == 3
 
@@ -141,12 +426,15 @@ def ref_nvfp4_quant(x, global_scale, block_size):
     return cast_to_fp4(clipped_x), scale.squeeze(-1)
 
 
-def ref_nvfp4_quant_dequant(x: torch.Tensor, global_scale: torch.Tensor, block_size: int) -> tuple[torch.Tensor, None]:
+def ref_nvfp4_quant_dequant(x: torch.Tensor, global_scale: torch.Tensor, block_size: int) -> torch.Tensor:
     """
     NVFP4 quantize-dequantize operation.
 
     `global_scale` is expected to have a single element.
     """
+    if current_platform.is_cuda_alike():
+        return _triton_nvfp4_quant_dequant(x, global_scale, block_size)
+
     x_m, x_k = x.shape
     output_dtype = x.dtype
 
@@ -158,7 +446,7 @@ def ref_nvfp4_quant_dequant(x: torch.Tensor, global_scale: torch.Tensor, block_s
     x_blockscale = x_blockscale.unsqueeze(-1) / global_scale
     x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype)
 
-    return x_dq, None
+    return x_dq
 
 
 def run_nvfp4_emulations(
@@ -172,7 +460,7 @@ def run_nvfp4_emulations(
     output_dtype = x.dtype
     group_size = 16
 
-    x_dq, _ = ref_nvfp4_quant_dequant(x, input_global_scale, block_size=group_size)
+    x_dq = ref_nvfp4_quant_dequant(x, input_global_scale, block_size=group_size)
 
     # dequantize weight
     w_fp4 = weight.data.view(torch.uint8)
diff --git a/aphrodite/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/aphrodite/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index 1be412e1de..4ee233414c 100644
--- a/aphrodite/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/aphrodite/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -45,6 +45,7 @@ def __init__(
         beta_slow: int = 1,
         mscale: float = 1,
         mscale_all_dim: float = 0,
+        init_cache: bool = True,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -64,7 +65,15 @@ def __init__(
             and has_flashinfer()
             and head_size in [64, 128, 256, 512]
         )
-        super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype)
+        super().__init__(
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            dtype,
+            init_cache=init_cache,
+        )
 
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
         pos_freqs = self.base ** (
@@ -203,7 +212,9 @@ class DeepseekV4ScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
     """
 
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        # Skip the parent's cache init; the cache is computed once below.
+        kwargs.pop("init_cache", None)
+        super().__init__(*args, **kwargs, init_cache=False)
         cache_fp32 = self._compute_cos_sin_cache()
         self.register_buffer("cos_sin_cache", cache_fp32, persistent=False)
 
diff --git a/aphrodite/model_executor/layers/utils.py b/aphrodite/model_executor/layers/utils.py
index b4838bd1f5..9d0ed2a9bd 100644
--- a/aphrodite/model_executor/layers/utils.py
+++ b/aphrodite/model_executor/layers/utils.py
@@ -157,17 +157,20 @@ def rocm_unquantized_gemm_impl(x: torch.Tensor, weight: torch.Tensor, bias: torc
         and k % 8 == 0
     )
 
-    if not use_skinny:
-        return torch.nn.functional.linear(x, weight, bias)
-
-    x_view = x.reshape(-1, x.size(-1))
-    if m > 8 and 0 < n <= 4:
-        cu_count = num_compute_units()
-        out = ops.wvSplitK(weight, x_view, cu_count, bias)
-        return out.reshape(*x.shape[:-1], weight.shape[0])
-    elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None:
-        out = ops.LLMM1(weight, x_view, 4)
-        return out.reshape(*x.shape[:-1], weight.shape[0])
+    if use_skinny:
+        x_view = x.reshape(-1, x.size(-1))
+        if m > 8 and 0 < n <= 4:
+            cu_count = num_compute_units()
+            out = ops.wvSplitK(weight, x_view, cu_count, bias)
+            return out.reshape(*x.shape[:-1], weight.shape[0])
+        elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None:
+            out = ops.LLMM1(weight, x_view, 4)
+            return out.reshape(*x.shape[:-1], weight.shape[0])
+
+    if rocm_aiter_ops.is_tgemm_enabled():
+        from aiter.tuned_gemm import tgemm
+
+        return tgemm.mm(x, weight, bias)
     return torch.nn.functional.linear(x, weight, bias)
 
 
@@ -261,13 +264,6 @@ def cpu_unquantized_gemm(
     return layer.cpu_linear(x, weight, bias)
 
 
-def cublas_gemm_bf16_bf16_fp32(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-):
-    return ops.router_gemm_bf16_fp32(x, weight)
-
-
 def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
     if current_platform.is_rocm():
         return rocm_unquantized_gemm
diff --git a/aphrodite/model_executor/model_loader/base_loader.py b/aphrodite/model_executor/model_loader/base_loader.py
index 7e568be8db..5301b44874 100644
--- a/aphrodite/model_executor/model_loader/base_loader.py
+++ b/aphrodite/model_executor/model_loader/base_loader.py
@@ -66,7 +66,7 @@ def load_model(
 
             # Log peak GPU memory after loading weights. This is needed
             # to have test coverage on peak memory for online quantization.
-            if current_platform.is_cuda():
+            if current_platform.is_cuda_alike():
                 peak_memory = torch.accelerator.max_memory_allocated()
                 logger.debug_once(
                     "Peak GPU memory after loading weights: %s GiB",
diff --git a/aphrodite/model_executor/model_loader/default_loader.py b/aphrodite/model_executor/model_loader/default_loader.py
index 966667dc62..8619aba2cf 100644
--- a/aphrodite/model_executor/model_loader/default_loader.py
+++ b/aphrodite/model_executor/model_loader/default_loader.py
@@ -79,7 +79,11 @@ def __init__(self, load_config: LoadConfig):
         self.local_expert_ids: set[int] | None = None
 
         extra_config = load_config.model_loader_extra_config
-        allowed_keys = {"enable_multithread_load", "num_threads"}
+        allowed_keys = {
+            "enable_multithread_load",
+            "num_threads",
+            "enable_weights_track",
+        }
         unexpected_keys = set(extra_config.keys()) - allowed_keys
 
         if unexpected_keys:
@@ -87,6 +91,8 @@ def __init__(self, load_config: LoadConfig):
                 f"Unexpected extra config keys for load format {load_config.load_format}: {unexpected_keys}"
             )
 
+        self.enable_weights_track: bool | None = extra_config.get("enable_weights_track", None)
+
     def _prepare_weights(
         self,
         model_name_or_path: str,
@@ -385,7 +391,6 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
 
         self._init_ep_weight_filter(model_config)
 
-        weights_to_load = {name for name, _ in model.named_parameters()}
         weights_iter, total_bytes = self.get_all_weights(model_config, model)
         if self.load_config.use_tqdm_on_load:
             weights_iter = tensor_progress_bar(weights_iter, total_bytes, "Loading model weights")
@@ -397,8 +402,28 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
             self.counter_after_loading_weights - self.counter_before_loading_weights,
         )
         # We only enable strict check for non-quantized models
-        # that have loaded weights tracking currently.
-        if model_config.quantization is None and loaded_weights is not None:
+        # that have loaded weights tracking by default.
+        default_enable_weights_track = model_config.quantization is None and loaded_weights is not None
+        enable_weights_track = (
+            self.enable_weights_track if self.enable_weights_track is not None else default_enable_weights_track
+        )
+        if enable_weights_track:
+            self.track_weights_loading(model, loaded_weights)
+
+    def track_weights_loading(self, model: nn.Module, loaded_weights: set[str] | None) -> None:
+        weights_to_load = {name for name, _ in model.named_parameters()}
+        if loaded_weights is not None:
+            # ignore online quantization scales
+            for name, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                has_online_quant = getattr(quant_method, "uses_meta_device", False)
+                has_postprocess_quant = getattr(quant_method, "process_weights_after_loading", None)
+                # ignore kv_cache scale and online quant scale,
+                # which can be missing in checkpoints
+                if has_online_quant or has_postprocess_quant:
+                    for param_name, _ in module.named_parameters():
+                        full_name = f"{name}.{param_name}" if name else param_name
+                        loaded_weights.add(full_name)
             weights_not_loaded = weights_to_load - loaded_weights
             if weights_not_loaded:
                 raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}")
diff --git a/aphrodite/model_executor/model_loader/reload/layerwise.py b/aphrodite/model_executor/model_loader/reload/layerwise.py
index b73376a0cf..770ce9cd8c 100644
--- a/aphrodite/model_executor/model_loader/reload/layerwise.py
+++ b/aphrodite/model_executor/model_loader/reload/layerwise.py
@@ -3,7 +3,7 @@
 import inspect
 from collections.abc import Callable
 from functools import wraps
-from weakref import WeakKeyDictionary
+from weakref import WeakKeyDictionary, WeakSet
 
 import torch
 
@@ -21,7 +21,13 @@
     restore_layer_on_meta,
 )
 from .types import LayerReloadingInfo
-from .utils import get_layer_params_buffers, get_layer_size, get_layer_tensors
+from .utils import (
+    get_info_size,
+    get_layer_params_buffers,
+    get_layer_size,
+    get_layer_tensors,
+    has_device_tensors,
+)
 
 logger = init_logger(__name__)
 
@@ -42,6 +48,10 @@
 LAYERWISE_INFO: WeakKeyDictionary[torch.nn.Module, LayerReloadingInfo] = WeakKeyDictionary()
 
 
+# Global set used to track loading for logging purposes only
+LOADING_LAYERS: WeakSet[torch.nn.Module] = WeakSet()
+
+
 def get_layerwise_info(layer: torch.nn.Module) -> LayerReloadingInfo:
     """
     Get information related to restoring and layerwise processing. If no previous
@@ -172,11 +182,28 @@ def online_process_loader(*args, **kwargs):
             info.load_numel_total,
         )
 
+        # Do not online process attention layers, must wait until finalize
+        if isinstance(layer, (Attention, MLAAttention)):
+            return ret
+
+        # Warn when buffers for multiple layers are held on device at once
+        if has_device_tensors(bound_args):
+            LOADING_LAYERS.add(layer)
+            if len(LOADING_LAYERS) >= 2:
+                names = sorted([layer.__class__.__name__ for layer in LOADING_LAYERS])
+                mem_used = sum(get_info_size(LAYERWISE_INFO[layer]) for layer in LOADING_LAYERS)
+                logger.warning_once(
+                    "Allocating %.1f MB of device memory to buffers to load %s layers. "
+                    "This extra memory usage can be avoided by ordering weights "
+                    "by their parent layer when reloading.",
+                    mem_used / 1e6,
+                    str(list(names)),
+                )
+
         # Process and copy when all weights are loaded
-        if info.load_numel >= info.load_numel_total and not isinstance(  # type: ignore[operator]
-            layer, (Attention, MLAAttention)
-        ):
+        if info.load_numel >= info.load_numel_total:  # type: ignore[operator]
             _layerwise_process(layer, info)
+            LOADING_LAYERS.discard(layer)
 
         return ret
 
@@ -238,6 +265,8 @@ def finalize_layerwise_processing(model: torch.nn.Module, model_config: ModelCon
         _finalize_attention_layer(layer, info, model_config)
         info.reset()
 
+    LOADING_LAYERS.clear()
+
 
 def finalize_layerwise_reload(*args, **kwargs):
     finalize_layerwise_processing(*args, **kwargs)
diff --git a/aphrodite/model_executor/model_loader/reload/utils.py b/aphrodite/model_executor/model_loader/reload/utils.py
index 20f6fa2fe2..7ba818a265 100644
--- a/aphrodite/model_executor/model_loader/reload/utils.py
+++ b/aphrodite/model_executor/model_loader/reload/utils.py
@@ -1,14 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from inspect import BoundArguments
 
 import torch
 
-from .types import LayerTensors
+from .types import LayerReloadingInfo, LayerTensors
 
 __all__ = [
     "get_layer_tensors",
     "get_layer_params_buffers",
     "get_layer_size",
+    "has_device_tensors",
+    "get_info_size",
 ]
 
 
@@ -35,3 +38,29 @@ def get_layer_size(layer: torch.nn.Module) -> int:
     from .meta import SKIP_TENSORS
 
     return sum(tensor.numel() for name, tensor in get_layer_tensors(layer).items() if name not in SKIP_TENSORS)
+
+
+def has_device_tensors(bound_args: BoundArguments) -> bool:
+    """
+    Return True if the loaded weights exist on an accelerator device
+    :param bound_args: args to load weights
+    :return: True if weights are on accelerator device
+    """
+    return any(
+        isinstance(value, torch.Tensor) and value.device.type not in ("meta", "cpu")
+        for value in bound_args.arguments.values()
+    )
+
+
+def get_info_size(info: LayerReloadingInfo) -> int:
+    """
+    Calculate the number of bytes used by loaded weights for a given layer
+    :param info: layerwise info to get size of
+    :return: number of bytes used by loaded weights
+    """
+    return sum(
+        value.nbytes
+        for _, args in info.loaded_weights
+        for value in args.arguments.values()
+        if isinstance(value, torch.Tensor) and value.device.type not in ("meta", "cpu")
+    )
diff --git a/aphrodite/model_executor/models/bailing_moe_linear.py b/aphrodite/model_executor/models/bailing_moe_linear.py
index 4052a64dbd..9b021c5472 100644
--- a/aphrodite/model_executor/models/bailing_moe_linear.py
+++ b/aphrodite/model_executor/models/bailing_moe_linear.py
@@ -22,6 +22,7 @@
 )
 from aphrodite.forward_context import get_forward_context
 from aphrodite.logger import init_logger
+from aphrodite.model_executor.custom_op import PluggableLayer
 from aphrodite.model_executor.layers.fla.ops.layernorm_guard import (
     RMSNormGated,
     layernorm_fn,
@@ -210,14 +211,17 @@ def __init__(
             self.q_a_layernorm = None
             self.q_b_proj = None
 
-        rope_parameters = _build_rope_parameters(config)
+        rope_parameters = _build_rope_parameters(config) or {}
+        # MLA rotates the full qk_rope_head_dim;
+        # partial_rotary_factor applies to the linear-attn head only.
+        rope_parameters = {k: v for k, v in rope_parameters.items() if k != "partial_rotary_factor"}
+        rope_parameters["rope_dim"] = self.qk_rope_head_dim
         max_position = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             head_size=self.qk_rope_head_dim,
             max_position=max_position,
             is_neox_style=False,
-            rope_parameters=rope_parameters or None,
-            dtype=torch.float32,
+            rope_parameters=rope_parameters,
         )
 
         # Build MLAModules for MultiHeadLatentAttentionWrapper
@@ -422,9 +426,11 @@ def _weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> No
         param.data.copy_(loaded_weight[shard].contiguous())
 
 
-class BailingMoELinearAttention(nn.Module, MambaBase):
-    """
-    Bailing MoE Linear Attention implementation using minimax backend.
+# --8<-- [start:bailing_moe_linear_attention]
+@PluggableLayer.register("bailing_moe_linear_attention")
+class BailingMoELinearAttention(PluggableLayer, MambaBase):
+    """Pluggable Bailing MoE Linear Attention layer which allows OOT backends
+    to add custom implementations.
 
     This implements the linear attention mechanism from sglang, adapted for Aphrodite's
     v1 engine with MambaBase interface support.
@@ -558,7 +564,6 @@ def __init__(
             self.head_dim,
             max_position=self.max_position_embeddings,
             is_neox_style=True,
-            dtype=torch.float32,
             rope_parameters=rope_parameters or None,
         )
 
@@ -723,8 +728,6 @@ def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_m
 
     def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
         """Handle decode (single token per sequence)."""
-        num_prefill_tokens = attn_metadata.num_prefill_tokens
-        num_prefills = attn_metadata.num_prefills
         hidden = linear_attention_decode(
             q,
             k,
@@ -732,10 +735,10 @@ def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
             kv_cache,
             self.tp_slope,
             state_indices_tensor,
-            q_start=num_prefill_tokens,
-            q_end=None,
-            slot_start=num_prefills,
-            slot_end=None,
+            q_start=0,
+            q_end=attn_metadata.num_decode_tokens,
+            slot_start=0,
+            slot_end=attn_metadata.num_decodes,
             block_size=32,
         )
         return hidden
@@ -1098,6 +1101,7 @@ def __init__(
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
             )
             self.logits_processor = LogitsProcessor(config.vocab_size)
         else:
diff --git a/aphrodite/model_executor/models/cohere2_vision.py b/aphrodite/model_executor/models/cohere2_vision.py
index aff7c4a7c1..cb12fe2d3f 100644
--- a/aphrodite/model_executor/models/cohere2_vision.py
+++ b/aphrodite/model_executor/models/cohere2_vision.py
@@ -44,7 +44,12 @@
 from aphrodite.sequence import IntermediateTensors
 from aphrodite.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
 from .siglip import SiglipVisionModel
 from .utils import (
     AutoWeightsLoader,
@@ -302,16 +307,20 @@ def get_replacement(item_idx: int):
     info=Cohere2VisionProcessingInfo,
     dummy_inputs=Cohere2VisionDummyInputsBuilder,
 )
-class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
     hf_to_aphrodite_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_tower.": "vision_tower.",
             "model.multi_modal_projector.": "multi_modal_projector.",
             "model.language_model.": "language_model.model.",
-            "lm_head.": "language_model.lm_head.",
         }
     )
 
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
     def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
         super().__init__()
         config: Cohere2VisionConfig = aphrodite_config.model_config.hf_config
diff --git a/aphrodite/model_executor/models/cohere_asr.py b/aphrodite/model_executor/models/cohere_asr.py
index f401d686c2..0d178eae6c 100644
--- a/aphrodite/model_executor/models/cohere_asr.py
+++ b/aphrodite/model_executor/models/cohere_asr.py
@@ -3,6 +3,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
+from typing import Any, ClassVar
 
 import torch
 import torch.nn.functional as F
@@ -19,7 +20,7 @@
 from aphrodite.config.multimodal import BaseDummyOptions
 from aphrodite.config.speech_to_text import SpeechToTextParams
 from aphrodite.distributed import get_tensor_model_parallel_world_size
-from aphrodite.inputs import MultiModalDataDict, PromptType, TextPrompt
+from aphrodite.inputs import MultiModalDataDict, PromptType, TokensPrompt
 from aphrodite.logger import init_logger
 from aphrodite.model_executor.layers.activation import get_act_fn
 from aphrodite.model_executor.layers.attention import (
@@ -53,6 +54,7 @@
     PromptUpdate,
 )
 from aphrodite.renderers import TokenizeParams
+from aphrodite.tokenizers import cached_tokenizer_from_config
 from aphrodite.transformers_utils.processors.cohere_asr import (
     INF_VAL,
     CohereASRFeatureExtractor,
@@ -1913,6 +1915,7 @@ class CohereAsrForConditionalGeneration(nn.Module, SupportsTranscription, Suppor
     supported_languages = ISO639_1_SUPPORTED_LANGS
     skip_warmup_audio_preprocessing = True
     no_space_languages = {"ja", "zh"}
+    _default_prompt_token_ids_cache: ClassVar[dict[tuple[str | None, str | None, str], tuple[int, ...]]] = {}
 
     @classmethod
     def validate_language(cls, language: str | None) -> str | None:
@@ -1930,28 +1933,76 @@ def get_generation_prompt(cls, stt_params: SpeechToTextParams) -> PromptType:
         audio = stt_params.audio
         stt_config = stt_params.stt_config
         language = stt_params.language
-        request_prompt = stt_params.request_prompt
+        model_config = stt_params.model_config
 
         if language is None:
             raise ValueError("Language must be specified when creating the CohereASR prompt")
 
-        # NOTE: this function is used only by online inference and not offline inference
-        # CohereASR doesnt have encoder prompt
-        language_tag = f"<|{language}|><|{language}|>"
-        pnc = True  # TODO(ekagra): make this configurable later
-        pnc_tag = "<|pnc|>" if pnc else "<|nopnc|>"
-        default_prompt = (
-            f"<|startofcontext|><|startoftranscript|>"
-            f"<|emo:undefined|>{language_tag}{pnc_tag}"
-            f"<|noitn|><|notimestamp|><|nodiarize|>"
+        tokenizer = cached_tokenizer_from_config(model_config)
+
+        # prompt_text is None because CohereASR uses the fast implementation of
+        # the sentencepiece tokenizer, which needs "▁" as the first token
+        # (which is different from "_"), and encode("▁ABC") ignores the first token,
+        # so the prompt text is unreliable. prompt_token_ids can be decoded back
+        # into text, but that text won't contain the leading "▁" token.
+        prompt_text = None
+        prompt_token_ids = cls._get_default_prompt_token_ids(
+            tokenizer,
+            model_config,
+            language,
         )
-        prompt_text = request_prompt if request_prompt else default_prompt
 
-        return TextPrompt(
+        return TokensPrompt(
             prompt=prompt_text,
+            prompt_token_ids=prompt_token_ids,
             multi_modal_data={"audio": (audio, stt_config.sample_rate)},
         )
 
+    @classmethod
+    def _get_default_prompt_tokens(cls, language: str) -> tuple[str, ...]:
+        # Use token-level control tags so fast tokenizers do not have to parse
+        # the raw string form of the decoder prefix.
+        return (
+            "▁",
+            "<|startofcontext|>",
+            "<|startoftranscript|>",
+            "<|emo:undefined|>",
+            f"<|{language}|>",
+            f"<|{language}|>",
+            "<|pnc|>",
+            "<|noitn|>",
+            "<|notimestamp|>",
+            "<|nodiarize|>",
+        )
+
+    @classmethod
+    def _get_default_prompt_token_ids(
+        cls,
+        tokenizer: Any,
+        model_config: ModelConfig,
+        language: str,
+    ) -> list[int]:
+        cache_key = (
+            getattr(model_config, "tokenizer", None),
+            getattr(model_config, "tokenizer_revision", None),
+            language,
+        )
+        prompt_token_ids = cls._default_prompt_token_ids_cache.get(cache_key)
+        if prompt_token_ids is None:
+            prompt_tokens = list(cls._get_default_prompt_tokens(language))
+            token_ids = tokenizer.convert_tokens_to_ids(prompt_tokens)
+            if not isinstance(token_ids, list):
+                token_ids = [token_ids]
+            unk_token_id = getattr(tokenizer, "unk_token_id", None)
+            if unk_token_id is not None and any(token_id == unk_token_id for token_id in token_ids):
+                raise ValueError(
+                    "Failed to resolve the CohereASR decoder control tokens with the configured tokenizer."
+                )
+            prompt_token_ids = tuple(int(token_id) for token_id in token_ids)
+            cls._default_prompt_token_ids_cache[cache_key] = prompt_token_ids
+
+        return list(prompt_token_ids)
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         # Required as part of SupportsMultiModal interface.
diff --git a/aphrodite/model_executor/models/cohere_moe.py b/aphrodite/model_executor/models/cohere_moe.py
new file mode 100644
index 0000000000..8df7449ef3
--- /dev/null
+++ b/aphrodite/model_executor/models/cohere_moe.py
@@ -0,0 +1,485 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+from itertools import islice
+
+import torch
+from torch import nn
+from transformers import CohereConfig
+
+from aphrodite.compilation.decorators import support_torch_compile
+from aphrodite.config import AphroditeConfig, CacheConfig
+from aphrodite.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
+from aphrodite.model_executor.layers.activation import SiluAndMul
+from aphrodite.model_executor.layers.attention import Attention
+from aphrodite.model_executor.layers.fused_moe import FusedMoE
+from aphrodite.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from aphrodite.model_executor.layers.logits_processor import LogitsProcessor
+from aphrodite.model_executor.layers.quantization import QuantizationConfig
+from aphrodite.model_executor.layers.rotary_embedding import get_rope
+from aphrodite.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from aphrodite.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from aphrodite.platforms import current_platform
+from aphrodite.sequence import IntermediateTensors
+
+from .commandr import LayerNorm
+from .interfaces import SupportsPP, SupportsQuant
+from .utils import (
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+
+@torch.compile(backend=current_platform.simple_compile_backend)
+def token_choice_with_bias(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    """Sigmoid -> top-k (-> renormalize) custom routing for CohereMoe."""
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = gating_output.float().sigmoid()
+    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
+    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+
+
+class CohereMoeMLP(nn.Module):
+    """Cohere MLP used as shared experts in the MoE block."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        intermediate_size: int | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=False,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class CohereMoeAttention(nn.Module):
+    """Cohere MoE attention with sliding-window interleave."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        tp_size = get_tensor_model_parallel_world_size()
+        self.config = config
+        self.layer_idx = extract_layer_index(prefix)
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.total_num_heads)
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = getattr(config, "model_max_length", None) or getattr(
+            config, "max_position_embeddings", 8192
+        )
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_parameters=config.rope_parameters,
+            is_neox_style=False,
+        )
+
+        self.sliding_window = None
+        layer_types = getattr(config, "layer_types", None)
+        if layer_types is not None and layer_types[self.layer_idx] == "sliding_attention":
+            self.sliding_window = config.sliding_window
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=self.sliding_window,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """Project to Q/K/V, rotate Q/K on sliding-window layers only, attend, then apply o_proj."""
+        fused_qkv, _ = self.qkv_proj(hidden_states)
+        query, key, value = fused_qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.sliding_window:
+            query, key = self.rotary_emb(positions, query, key)
+        projected, _ = self.o_proj(self.attn(query, key, value))
+        return projected
+
+
+class CohereMoe(nn.Module):
+    """Tensor-parallel CohereMoe block: replicated router gate, fused routed experts, optional shared experts."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than the number of experts {config.num_experts}."
+            )
+        # Sigmoid expert selection routes through token_choice_with_bias; otherwise no custom routing function is used.
+        if getattr(config, "expert_selection_fn", None) == "sigmoid":
+            self.custom_routing_function = token_choice_with_bias
+        else:
+            self.custom_routing_function = None
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        if getattr(config, "num_shared_experts", 0) > 0:
+            self.shared_experts = CohereMoeMLP(
+                config=config,
+                intermediate_size=config.intermediate_size * config.num_shared_experts,
+                quant_config=quant_config,
+                prefix=f"{prefix}.shared_experts",
+            )
+            self.shared_expert_combination_strategy = getattr(config, "shared_expert_combination_strategy", "sum")
+            # Raise instead of assert: asserts vanish under -O and forward() would then silently treat it as "sum".
+            if self.shared_expert_combination_strategy not in ("average", "sum"):
+                raise ValueError("shared_expert_combination_strategy must be one of ['average', 'sum']")
+        else:
+            self.shared_experts = None
+            self.shared_expert_combination_strategy = None
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            params_dtype=params_dtype,
+            renormalize=getattr(config, "norm_topk_prob", True),
+            quant_config=quant_config,
+            tp_size=tp_size,
+            prefix=f"{prefix}.experts",
+            custom_routing_function=self.custom_routing_function,
+            shared_experts=self.shared_experts,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        router_logits, _ = self.gate(hidden_states)
+        # FusedMoE handles shared expert overlap internally and returns
+        # shared_output + routed_output when shared_experts is set.
+        final_hidden_states = self.experts(hidden_states, router_logits)
+        if self.shared_expert_combination_strategy == "average":
+            final_hidden_states = final_hidden_states / 2  # halve the combined shared + routed output
+        return final_hidden_states.view(orig_shape)
+
+
+class CohereMoeDecoderLayer(nn.Module):
+    """Decoder layer whose attention and MoE branches share one normed input and are summed with the residual."""
+
+    def __init__(
+        self,
+        config: CohereConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = CohereMoeAttention(
+            config,
+            cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = CohereMoe(config=config, quant_config=quant_config, prefix=f"{prefix}.mlp")
+        self.input_layernorm = LayerNorm(param_shape=(config.hidden_size,), eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # The incoming ``residual`` is ignored: the layer input itself is handed to the norm as the residual.
+        normed, residual = self.input_layernorm(hidden_states, hidden_states)
+        # Both branches consume the same normed activations.
+        attn_out = self.self_attn(positions=positions, hidden_states=normed)
+        moe_out = self.mlp(normed)
+
+        hidden_states = residual + attn_out + moe_out
+        return hidden_states, residual
+
+
+@support_torch_compile
+class CohereMoeModel(nn.Module):
+    """Decoder stack for CohereMoe: token embedding, decoder layers, and a final LayerNorm."""
+
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
+        super().__init__()
+
+        hf_config = aphrodite_config.model_config.hf_config
+        cache_config = aphrodite_config.cache_config
+        quant_config = aphrodite_config.quant_config
+
+        self.config = hf_config
+        self.vocab_size = self.org_vocab_size = hf_config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(hf_config.vocab_size, hf_config.hidden_size)
+        # start_layer/end_layer bound the slice of self.layers that forward() iterates over.
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            hf_config.num_hidden_layers,
+            lambda prefix: CohereMoeDecoderLayer(hf_config, cache_config, quant_config, prefix=prefix),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = LayerNorm(param_shape=(hf_config.hidden_size,), eps=hf_config.layer_norm_eps)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], hf_config.hidden_size
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if not get_pp_group().is_first_rank:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        else:
+            # Caller-supplied embeddings take precedence over embedding the token ids.
+            hidden_states = inputs_embeds if inputs_embeds is not None else self.get_input_embeddings(input_ids)
+            residual = None
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if get_pp_group().is_last_rank:
+            hidden_states, _ = self.norm(hidden_states, residual)
+            return hidden_states
+        return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})
+
+
+class CohereMoeForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    """CohereMoe causal LM: wraps CohereMoeModel and computes logits from the tied input embedding matrix."""
+    is_text_generation_model = True
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
+        super().__init__()
+        config = aphrodite_config.model_config.hf_config
+        quant_config = aphrodite_config.quant_config
+        self.config = config
+        assert getattr(config, "tie_word_embeddings", True)
+        self.unpadded_vocab_size = config.vocab_size
+        self.quant_config = quant_config
+        self.logits_scale = config.logit_scale
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=self.logits_scale)
+        self.model = CohereMoeModel(aphrodite_config=aphrodite_config, prefix=maybe_prefix(prefix, "model"))
+        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        return self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.logits_processor(self.model.embed_tokens, hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint tensors and return the names of the parameters that were loaded.
+
+        Candidate renames are tried on a scratch name and committed to ``name`` only when a mapping
+        is accepted, so a rejected mapping cannot leak a half-rewritten name into later checks.
+        """
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name)):
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name or "mlp.experts" in name:
+                    continue
+                # Rewrite a scratch copy: "up_proj" also occurs inside "gate_up_proj", so mutating name could re-match.
+                mapped_name = name.replace(shard_name, param_name)
+                if mapped_name.endswith(".bias") and mapped_name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(mapped_name, self):
+                    continue
+                name = mapped_name
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    mapped_name = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(mapped_name, self):
+                        continue
+                    if mapped_name.endswith((".bias", "_bias")) and mapped_name not in params_dict:
+                        continue
+                    name = mapped_name
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, name, shard_id=shard_id, expert_id=expert_id)
+                    break
+                else:
+                    if "lm_head.weight" in name:
+                        continue
+                    if name.endswith((".bias", "_bias")) and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        return loaded_params
diff --git a/aphrodite/model_executor/models/deepseek_v4.py b/aphrodite/model_executor/models/deepseek_v4.py
index ab94b37f74..035f32356e 100644
--- a/aphrodite/model_executor/models/deepseek_v4.py
+++ b/aphrodite/model_executor/models/deepseek_v4.py
@@ -7,10 +7,9 @@
 import regex as re
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from aphrodite.compilation.decorators import support_torch_compile
-from aphrodite.config import AphroditeConfig
+from aphrodite.config import AphroditeConfig, get_current_aphrodite_config
 from aphrodite.distributed import (
     get_ep_group,
     get_tensor_model_parallel_rank,
@@ -35,7 +34,10 @@
     RowParallelLinear,
 )
 from aphrodite.model_executor.layers.logits_processor import LogitsProcessor
-from aphrodite.model_executor.layers.quantization import QuantizationConfig, QuantizationMethods
+from aphrodite.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
 from aphrodite.model_executor.layers.quantization.fp8 import Fp8Config
 from aphrodite.model_executor.layers.quantization.mxfp4 import Mxfp4MoEMethod
 from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
@@ -51,7 +53,6 @@
 from aphrodite.platforms import current_platform
 from aphrodite.sequence import IntermediateTensors
 from aphrodite.triton_utils import tl, triton
-from aphrodite.utils.multi_stream_utils import AuxStreamType
 from aphrodite.utils.torch_utils import direct_register_custom_op
 
 from .utils import (
@@ -62,6 +63,8 @@
     maybe_prefix,
 )
 
+_DEEPSEEK_V4_EXPERT_DTYPES = ("fp4", "fp8")
+
 
 class DeepseekV4MLP(nn.Module):
     def __init__(
@@ -113,16 +116,57 @@ def forward(self, x):
 
 
 class DeepseekV4FP8Config(Fp8Config):
-    """FP8 config that routes MoE layers to MXFP4 quantization.
-
-    DeepSeek V4 checkpoints use FP8 for linear/attention layers but
-    MXFP4 for MoE expert weights. This config inherits standard FP8
-    behavior and overrides only the MoE dispatch.
+    """FP8 config for DeepSeek V4 with expert-dtype-aware MoE dispatch.
+
+    DeepSeek V4 checkpoints always use FP8 block quantization for
+    linear/attention layers. The MoE expert weights vary by checkpoint:
+    - ``expert_dtype="fp4"`` (e.g. DeepSeek-V4-Flash): MXFP4 experts
+      with ue8m0 (e8m0fnu) FP8 linear scales.
+    - ``expert_dtype="fp8"`` (e.g. DeepSeek-V4-Flash-Base): FP8 block
+      experts with float32 FP8 linear scales.
+
+    The dispatch and the linear scale dtype are both keyed off
+    ``expert_dtype`` from the model's hf_config; missing values default
+    to ``"fp4"`` so existing FP4 checkpoints stay unchanged.
+
+    NOTE: ``expert_dtype`` is resolved lazily because this config is
+    constructed during AphroditeConfig setup, before ``set_current_aphrodite_config``
+    is active. Reading hf_config eagerly in ``__init__`` would always see
+    the default ``"fp4"`` and silently misroute Flash-Base checkpoints.
     """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.is_scale_e8m0: bool = True
+        self._resolved_expert_dtype: str | None = None
+        # ``is_scale_e8m0`` is a property that resolves on first read,
+        # by which time the current aphrodite_config has been set.
+
+    @property
+    def expert_dtype(self) -> str:  # resolved from hf_config on demand; cached only after a successful lookup
+        if self._resolved_expert_dtype is None:
+            try:
+                hf_config = get_current_aphrodite_config().model_config.hf_config
+            except Exception:
+                # Lookup failed (presumably the config is not set yet): return "fp4" without caching so
+                # a later call made inside set_current_aphrodite_config can still resolve the real value.
+                return "fp4"
+            expert_dtype = getattr(hf_config, "expert_dtype", "fp4")  # attribute absent -> "fp4"
+            if expert_dtype not in _DEEPSEEK_V4_EXPERT_DTYPES:
+                raise ValueError(
+                    f"Unsupported DeepSeek V4 expert_dtype={expert_dtype!r}; "
+                    f"expected one of {_DEEPSEEK_V4_EXPERT_DTYPES}."
+                )
+            self._resolved_expert_dtype = expert_dtype
+            from aphrodite.logger import init_logger  # function-local import, reached only when a value is cached
+
+            init_logger(__name__).info_once("DeepSeek V4 expert_dtype resolved to %r", expert_dtype)
+        return self._resolved_expert_dtype
+
+    @property
+    def is_scale_e8m0(self) -> bool:
+        # FP4 checkpoints store FP8 linear scales as e8m0fnu; FP8 expert
+        # checkpoints (Flash-Base) store them as float32.
+        return self.expert_dtype == "fp4"  # evaluated on every read, so it follows expert_dtype's lazy resolution
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -145,11 +189,14 @@ def get_quant_method(self, layer, prefix):
                 fused_mapping=self.packed_modules_mapping,
             ):
                 return UnquantizedFusedMoEMethod(layer.moe_config)
-            return Mxfp4MoEMethod(layer.moe_config)
+            if self.expert_dtype == "fp4":
+                return Mxfp4MoEMethod(layer.moe_config)
+            # expert_dtype == "fp8": fall through to Fp8Config which
+            # returns Fp8MoEMethod with block-wise float32 scales.
         return super().get_quant_method(layer, prefix)
 
     def is_mxfp4_quant(self, prefix, layer):
-        return isinstance(layer, FusedMoE)
+        return isinstance(layer, FusedMoE) and self.expert_dtype == "fp4"
 
 
 @triton.jit
@@ -654,6 +701,12 @@ def __init__(
         self.scoring_func = getattr(config, "scoring_func", "sqrtsoftplus")
         if self.use_mega_moe and self.scoring_func != "sqrtsoftplus":
             raise NotImplementedError("DeepSeek V4 MegaMoE currently supports sqrtsoftplus routing only.")
+        if self.use_mega_moe and getattr(config, "expert_dtype", "fp4") != "fp4":
+            raise NotImplementedError(
+                "DeepSeek V4 MegaMoE only supports fp4 experts; got expert_dtype="
+                f"{config.expert_dtype!r}. Drop --kernel-config moe_backend="
+                "deep_gemm_mega_moe for this checkpoint."
+            )
 
         self.gate = GateLinear(
             config.hidden_size,
@@ -764,10 +817,9 @@ def _init_fused_moe_experts(
         )
 
     def forward(self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None) -> torch.Tensor:
-        if self.gate.tid2eid is not None:
-            if input_ids is None:
-                raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
-            input_ids = input_ids.to(dtype=self.hash_indices_dtype)
+        if self.gate.tid2eid is not None and input_ids is None:
+            raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.")
+
         if not self.use_mega_moe:
             return self._forward_fused_moe(hidden_states, input_ids)
 
@@ -831,7 +883,7 @@ def __init__(
         aphrodite_config: AphroditeConfig,
         prefix: str,
         topk_indices_buffer: torch.Tensor | None = None,
-        aux_stream: torch.cuda.Stream | None = None,
+        aux_stream_list: list[torch.cuda.Stream] | None = None,
     ):
         super().__init__()
         config = aphrodite_config.model_config.hf_config
@@ -929,7 +981,6 @@ def __init__(
             max_position=self.max_position_embeddings,
             rope_parameters=rope_parameters,
             is_neox_style=False,
-            dtype=config.torch_dtype,
         )
 
         self.indexer = None
@@ -960,7 +1011,7 @@ def __init__(
             indexer=self.indexer,
             indexer_rotary_emb=self.rotary_emb,
             topk_indices_buffer=topk_indices_buffer,
-            aux_stream=aux_stream,
+            aux_stream_list=aux_stream_list,
         )
         self.mla_attn = DeepseekV4MultiHeadLatentAttentionWrapper(
             hidden_size=self.hidden_size,
@@ -996,9 +1047,14 @@ def __init__(
         aphrodite_config,
         prefix,
         topk_indices_buffer: torch.Tensor | None = None,
-        aux_stream_dict: dict[AuxStreamType, torch.cuda.Stream] | None = None,
+        aux_stream_list: list[torch.cuda.Stream] | None = None,
     ):
         super().__init__()
+
+        # Lazy import to avoid top-level tilelang dependency.
+        # Registers both torch.ops.aphrodite.mhc_pre and mhc_post
+        import aphrodite.model_executor.layers.mhc  # noqa: F401
+
         config = aphrodite_config.model_config.hf_config
         self.hidden_size = config.hidden_size
 
@@ -1007,7 +1063,7 @@ def __init__(
             aphrodite_config,
             prefix=f"{prefix}.attn",
             topk_indices_buffer=topk_indices_buffer,
-            aux_stream=aux_stream_dict.get(AuxStreamType.Attention) if aux_stream_dict is not None else None,
+            aux_stream_list=aux_stream_list,
         )
         self.ffn = DeepseekV4MoE(aphrodite_config, prefix=f"{prefix}.ffn")
 
@@ -1069,11 +1125,6 @@ def hc_pre(
         hc_scale: torch.Tensor,
         hc_base: torch.Tensor,
     ):
-        # Lazy import to avoid top-level tilelang dependency.
-        # Registers both torch.ops.aphrodite.mhc_pre and mhc_post,
-        # so hc_post() doesn't need its own import.
-        import aphrodite.model_executor.layers.mhc  # noqa: F401
-
         post_mix, res_mix, layer_input = torch.ops.aphrodite.mhc_pre(
             residual=x,
             fn=hc_fn,
@@ -1124,17 +1175,21 @@ def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
         config = aphrodite_config.model_config.hf_config
         quant_config = aphrodite_config.quant_config
         self.config = config
-
+        if aphrodite_config.parallel_config.enable_expert_parallel:
+            self.use_mega_moe = aphrodite_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        else:
+            self.use_mega_moe = False
         self.vocab_size = config.vocab_size
         self.hc_eps = config.hc_eps
         self.hc_mult = config.hc_mult
         self.hc_dim = self.hc_mult * config.hidden_size
         self.rms_norm_eps = config.rms_norm_eps
 
-        aux_stream_list = [torch.cuda.Stream() for _ in range(1)]
-        self.aux_stream_dict = {
-            AuxStreamType.Attention: aux_stream_list[0],
-        }
+        # Three aux streams: one per non-default input GEMM in
+        # DeepseekV4MultiHeadLatentAttentionWrapper.attn_gemm_parallel_execute
+        # (compressor kv_score, indexer.weights_proj, indexer.compressor
+        # kv_score). fused_wqa_wkv stays on the default stream.
+        aux_stream_list = [torch.cuda.Stream() for _ in range(3)]
 
         self.device = current_platform.device_type
         # Reserved topk indices buffer for all Indexer layers to reuse.
@@ -1158,7 +1213,7 @@ def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
                 aphrodite_config,
                 prefix=prefix,
                 topk_indices_buffer=self.topk_indices_buffer,
-                aux_stream_dict=self.aux_stream_dict,
+                aux_stream_list=aux_stream_list,
             ),
             prefix=f"{prefix}.layers",
         )
@@ -1207,7 +1262,8 @@ def forward(
     ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.embed_input_ids(input_ids)
         hidden_states = hidden_states.unsqueeze(-2).repeat(1, self.hc_mult, 1)
-
+        if self.use_mega_moe:
+            input_ids = input_ids.to(torch.int64)
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(
                 hidden_states,
@@ -1342,20 +1398,43 @@ def hc_head(
     rms_norm_eps: float,
     hc_eps: float,
 ) -> torch.Tensor:
-    x = hidden_states
-    shape, dtype = x.size(), x.dtype
-    x = x.flatten(1).float()
-    rsqrt = torch.rsqrt(x.square().mean(-1, keepdim=True) + rms_norm_eps)
-    mixes = F.linear(x, hc_fn) * rsqrt
-    pre = torch.sigmoid(mixes * hc_scale + hc_base) + hc_eps
-    y = torch.sum(pre.unsqueeze(-1) * x.view(shape), dim=1)
-    return y.to(dtype)
-
+    hc_mult, hidden_size = hidden_states.shape[-2:]
+    outer_shape = hidden_states.shape[:-2]
+    hs_flat = hidden_states.view(-1, hc_mult, hidden_size)
+    num_tokens = hs_flat.shape[0]
+    out = torch.empty(num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device)
+    torch.ops.aphrodite.hc_head_fused_kernel(
+        hs_flat,
+        hc_fn,
+        hc_scale,
+        hc_base,
+        out,
+        hidden_size,
+        rms_norm_eps,
+        hc_eps,
+        hc_mult,
+    )
+    return out.view(*outer_shape, hidden_size)
 
-class DeepseekV4ForCausalLM(nn.Module):
-    model_cls = DeepseekV4Model
 
-    hf_to_aphrodite_mapper = WeightsMapper(
+def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper:
+    if expert_dtype == "fp4":
+        # MXFP4 experts use Mxfp4MoEMethod, which registers scales as
+        # ``w{1,2,3}_weight_scale`` (no _inv suffix). FP8 linear and
+        # shared experts use Fp8LinearMethod's block scales, which
+        # register as ``weight_scale_inv``.
+        scale_regex = {
+            re.compile(r"(\.experts\.\d+\.w[123])\.scale$"): r"\1.weight_scale",
+            re.compile(r"\.scale$"): ".weight_scale_inv",
+        }
+    else:
+        # FP8 experts use Fp8MoEMethod (block_quant=True), which registers
+        # scales as ``w{13,2}_weight_scale_inv``. Map all ``.scale`` keys
+        # there.
+        scale_regex = {
+            re.compile(r"\.scale$"): ".weight_scale_inv",
+        }
+    return WeightsMapper(
         orig_to_new_prefix={
             "layers.": "model.layers.",
             "embed.": "model.embed.",
@@ -1363,12 +1442,7 @@ class DeepseekV4ForCausalLM(nn.Module):
             "hc_head": "model.hc_head",
             "mtp.": "model.mtp.",
         },
-        orig_to_new_regex={
-            # Routed MoE expert scales: experts.N.wX.scale -> .weight_scale
-            re.compile(r"(\.experts\.\d+\.w[123])\.scale$"): r"\1.weight_scale",
-            # Everything else (FP8 linear + shared experts): .scale -> .weight_scale_inv
-            re.compile(r"\.scale$"): ".weight_scale_inv",
-        },
+        orig_to_new_regex=scale_regex,
         orig_to_new_suffix={
             "head.weight": "lm_head.weight",
             "embed.weight": "embed_tokens.weight",
@@ -1380,11 +1454,22 @@ class DeepseekV4ForCausalLM(nn.Module):
         },
     )
 
+
+class DeepseekV4ForCausalLM(nn.Module):
+    model_cls = DeepseekV4Model
+
+    # Default mapper assumes the original FP4-expert checkpoint layout.
+    # Overridden per-instance in __init__ when expert_dtype != "fp4".
+    hf_to_aphrodite_mapper = _make_deepseek_v4_weights_mapper("fp4")
+
     def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
         super().__init__()
 
         config = aphrodite_config.model_config.hf_config
         self.config = config
+        expert_dtype = getattr(config, "expert_dtype", "fp4")
+        if expert_dtype != "fp4":
+            self.hf_to_aphrodite_mapper = _make_deepseek_v4_weights_mapper(expert_dtype)
 
         self.model = self.model_cls(aphrodite_config=aphrodite_config, prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(
diff --git a/aphrodite/model_executor/models/deepseek_v4_mtp.py b/aphrodite/model_executor/models/deepseek_v4_mtp.py
index 69a379b817..167af434c9 100644
--- a/aphrodite/model_executor/models/deepseek_v4_mtp.py
+++ b/aphrodite/model_executor/models/deepseek_v4_mtp.py
@@ -35,7 +35,6 @@
 from aphrodite.model_executor.model_loader.weight_utils import default_weight_loader
 from aphrodite.platforms import current_platform
 from aphrodite.sequence import IntermediateTensors
-from aphrodite.utils.multi_stream_utils import AuxStreamType
 
 from .deepseek_mtp import SharedHead
 from .deepseek_v2 import get_spec_layer_idx_from_weight_name
@@ -48,9 +47,14 @@
 
 logger = init_logger(__name__)
 
-# MoE expert scales are fused into per-layer w13/w2 tensors; other FP8 linear
-# scales use `.weight_scale_inv`. Mirrors the regex in
-# DeepseekV4ForCausalLM.hf_to_aphrodite_mapper.
+# MoE expert scales are fused into per-layer w13/w2 tensors. The exact
+# parameter suffix depends on which FusedMoE method handles the experts:
+# - fp4 experts (Mxfp4MoEMethod) register ``w{1,2,3}_weight_scale``;
+# - fp8 experts (Fp8MoEMethod with block_quant=True) register
+#   ``w{1,2,3}_weight_scale_inv``.
+# Other FP8 linear scales (including shared experts) always use
+# ``.weight_scale_inv``. Mirrors the per-instance mapper built by
+# ``_make_deepseek_v4_weights_mapper`` in deepseek_v4.py.
 _EXPERT_SCALE_RE = re.compile(r"\.experts\.\d+\.w[123]\.scale$")
 
 
@@ -60,6 +64,7 @@ def __init__(
         aphrodite_config: AphroditeConfig,
         topk_indices_buffer: torch.Tensor,
         prefix: str,
+        aux_stream_list: list[torch.cuda.Stream] | None = None,
     ) -> None:
         super().__init__()
 
@@ -105,14 +110,11 @@ def __init__(
         )
 
         self.shared_head = SharedHead(config=config, prefix=prefix, quant_config=quant_config)
-        self.aux_stream_dict = {
-            AuxStreamType.Attention: torch.cuda.Stream(),
-        }
         self.mtp_block = DeepseekV4DecoderLayer(
             aphrodite_config,
             prefix,
             topk_indices_buffer=topk_indices_buffer,
-            aux_stream_dict=self.aux_stream_dict,
+            aux_stream_list=aux_stream_list,
         )
 
     def forward(
@@ -156,6 +158,10 @@ def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
             device=self.device,
         )
 
+        # Three aux streams shared across all MTP layers, mirroring
+        # DeepseekV4Model.
+        aux_stream_list = [torch.cuda.Stream() for _ in range(3)]
+
         # to map the exact layer index from weights
         self.layers = torch.nn.ModuleDict(
             {
@@ -163,6 +169,7 @@ def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
                     aphrodite_config,
                     self.topk_indices_buffer,
                     f"{prefix}.layers.{idx}",
+                    aux_stream_list=aux_stream_list,
                 )
                 for idx in range(
                     self.mtp_start_layer_idx,
@@ -310,6 +317,13 @@ def _find_mtp_layer_idx(name: str) -> int:
                 num_experts=self.config.n_routed_experts,
             )
 
+        # FP8 experts register ``..._weight_scale_inv`` (block_quant) while
+        # FP4/MXFP4 experts register ``..._weight_scale``. Choose the suffix
+        # for the rename below based on the model's expert dtype.
+        expert_scale_suffix = (
+            ".weight_scale" if getattr(self.config, "expert_dtype", "fp4") == "fp4" else ".weight_scale_inv"
+        )
+
         for name, loaded_weight in weights:
             mtp_layer_idx = _find_mtp_layer_idx(name)
             # V4 checkpoints store MTP weights as `mtp.{i}.*`; remap to
@@ -330,7 +344,7 @@ def _find_mtp_layer_idx(name: str) -> int:
             if spec_layer != self.model.mtp_start_layer_idx and ".layers" not in name:
                 continue
             if name.endswith(".scale"):
-                suffix = ".weight_scale" if _EXPERT_SCALE_RE.search(name) else ".weight_scale_inv"
+                suffix = expert_scale_suffix if _EXPERT_SCALE_RE.search(name) else ".weight_scale_inv"
                 name = name.removesuffix(".scale") + suffix
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
diff --git a/aphrodite/model_executor/models/gemma4.py b/aphrodite/model_executor/models/gemma4.py
index 95e0dec39e..bffdcc73b1 100644
--- a/aphrodite/model_executor/models/gemma4.py
+++ b/aphrodite/model_executor/models/gemma4.py
@@ -85,6 +85,10 @@
 logger = init_logger(__name__)
 
 
+def _remap_gemma4_expert_weight_name(name: str) -> str:  # skips names already under ".moe"
+    return re.sub(r"(?<!\.moe)\.experts\.(\d+)\.", r".moe.experts.\1.", name)
+
+
 @triton.jit
 def _gemma4_routing_kernel(
     gating_ptr,
@@ -1066,11 +1070,6 @@ def _make_empty_intermediate_tensors(
                     dtype=dtype,
                     device=device,
                 ),
-                "residual": torch.zeros(
-                    (batch_size, hidden_size),
-                    dtype=dtype,
-                    device=device,
-                ),
             }
             if ple_dim and ple_dim > 0:
                 tensors["per_layer_inputs"] = torch.zeros(
@@ -1214,13 +1213,12 @@ def forward(
                 # Compute per-layer inputs for PLE
                 per_layer_embeds = self.get_per_layer_inputs(input_ids)
                 per_layer_inputs = self.project_per_layer_inputs(hidden_states, per_layer_embeds)
-            residual = None
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
-            residual = intermediate_tensors["residual"]
-            per_layer_inputs = intermediate_tensors.get("per_layer_inputs")
-
+            if per_layer_inputs is not None:
+                per_layer_inputs = intermediate_tensors["per_layer_inputs"]
+        residual = None
         aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)
         for layer_idx, layer in enumerate(islice(self.layers, self.start_layer, self.end_layer)):
             # Extract the per-layer embedding for this specific layer
@@ -1238,13 +1236,12 @@ def forward(
             )
             self._maybe_add_hidden_state(aux_hidden_states, layer_idx + 1, hidden_states, residual)
         if not get_pp_group().is_last_rank:
-            return IntermediateTensors(
-                {
-                    "hidden_states": hidden_states,
-                    "residual": residual,
-                    "per_layer_inputs": per_layer_inputs,
-                }
-            )
+            tensors: dict[str, torch.Tensor] = {
+                "hidden_states": hidden_states,
+            }
+            if per_layer_inputs is not None:
+                tensors["per_layer_inputs"] = per_layer_inputs
+            return IntermediateTensors(tensors)
         # Gemma4 incorporates residual into hidden_states directly
         # Apply norm without residual fusion when possible.
         if residual is None:
@@ -1538,7 +1535,7 @@ def _weight_iterator():
                 # Remap individual 2D expert weights:
                 # .experts.{id}.{proj} → .moe.experts.{id}.{proj}
                 # (This handles per-expert 2D quantized weights)
-                name = re.sub(r"\.experts\.(\d+)\.", r".moe.experts.\1.", name)
+                name = _remap_gemma4_expert_weight_name(name)
 
                 # MoE expert weights: checkpoint stores as 3D packed
                 # tensors.  Explode into per-expert 2D weights for
diff --git a/aphrodite/model_executor/models/granite4_vision.py b/aphrodite/model_executor/models/granite4_vision.py
index 8f5eb4de9c..b402f8f09f 100644
--- a/aphrodite/model_executor/models/granite4_vision.py
+++ b/aphrodite/model_executor/models/granite4_vision.py
@@ -814,8 +814,9 @@ def forward(
         # including during CUDA graph capture (buffers are zero → no-op injection).
         # This ensures the graph captures the injection code path.
         if inputs_embeds is not None and get_pp_group().is_first_rank and self._ds_layer_indices:
+            n = inputs_embeds.size(0)
             ds: IntermediateTensors | None = IntermediateTensors(
-                {f"ds_{llm_layer}": self._ds_buffers[lvl] for lvl, llm_layer in enumerate(self._ds_layer_indices)}
+                {f"ds_{llm_layer}": self._ds_buffers[lvl][:n] for lvl, llm_layer in enumerate(self._ds_layer_indices)}
             )
         else:
             ds = None
diff --git a/aphrodite/model_executor/models/laguna.py b/aphrodite/model_executor/models/laguna.py
new file mode 100644
index 0000000000..7f17790eab
--- /dev/null
+++ b/aphrodite/model_executor/models/laguna.py
@@ -0,0 +1,827 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Laguna model compatible with HuggingFace weights."""
+
+import typing
+from collections.abc import Callable, Iterable
+from itertools import islice
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from aphrodite.compilation.decorators import support_torch_compile
+from aphrodite.config import AphroditeConfig, CacheConfig, get_current_aphrodite_config
+from aphrodite.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from aphrodite.logger import init_logger
+from aphrodite.model_executor.layers.attention import Attention
+from aphrodite.model_executor.layers.fused_moe import FusedMoE
+from aphrodite.model_executor.layers.layernorm import RMSNorm
+from aphrodite.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from aphrodite.model_executor.layers.logits_processor import LogitsProcessor
+from aphrodite.model_executor.layers.quantization import QuantizationConfig
+from aphrodite.model_executor.layers.rotary_embedding import get_rope
+from aphrodite.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from aphrodite.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from aphrodite.model_executor.models.interfaces import SupportsLoRA, SupportsPP
+from aphrodite.model_executor.models.utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+from aphrodite.sequence import IntermediateTensors
+
+logger = init_logger(__name__)
+
+
+class LagunaMLP(nn.Module):
+    """Dense SiLU-gated MLP for Laguna (dense layers and the MoE shared expert)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # Fail fast on an unsupported activation before any layer is built.
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. Only silu is supported.")
+        # gate_proj and up_proj are kept as separate ColumnParallelLinear rather than
+        # merged via MergedColumnParallelLinear. The merged form requires per-partition
+        # NVFP4 global scales (weight_global_scale, input_global_scale) to be packed into
+        # a length-2 PerTensorScaleParameter and then collapsed via .max() in
+        # process_weights_after_loading; this doesn't round-trip cleanly through Marlin's
+        # NVFP4 stacked-layer code path. Splitting yields one global scale per Linear,
+        # exactly matching the standard compressed-tensors per-Linear schema on disk.
+        self.gate_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_proj",
+        )
+        self.up_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate, _ = self.gate_proj(x)
+        up, _ = self.up_proj(x)
+        x, _ = self.down_proj(F.silu(gate) * up)
+        return x
+
+
+class LagunaMoE(nn.Module):
+    """Sparse MoE block for Laguna with optional shared expert and sigmoid routing.
+
+    Key differences from other MoE implementations:
+    - Uses SIGMOID routing activation (not softmax)
+    - Shared expert runs in parallel with routed experts (when enabled)
+    - Matches HF reference: modular_laguna.py LagunaSparseMoeBlock
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+
+        self.n_routed_experts = config.num_experts
+        self.n_shared_experts = 1 if config.shared_expert_intermediate_size > 0 else 0
+        self.routed_scaling_factor = float(getattr(config, "moe_routed_scaling_factor", 1.0))
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than the number of experts {config.num_experts}."
+            )
+
+        # EPLB settings; a None num_redundant_experts is normalized to 0 on eplb_config itself.
+        aphrodite_config = get_current_aphrodite_config()
+        eplb_config = aphrodite_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+        eplb_config.num_redundant_experts = (
+            eplb_config.num_redundant_experts if eplb_config.num_redundant_experts is not None else 0
+        )
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = self.physical_expert_start + self.n_local_physical_experts
+
+        # Router gate (replicated; built with quant_config=None, i.e. never quantized)
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        # Shared expert (optional) - passed to FusedMoE for overlap optimization
+        self.shared_expert: LagunaMLP | None
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = LagunaMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,  # Reduce after shared+routed combine
+                prefix=f"{prefix}.shared_expert",
+            )
+        else:
+            self.shared_expert = None
+
+        # Auxiliary-loss-free load-balancing bias (arXiv:2408.15664). The
+        # checkpoint stores one [num_experts] tensor per MoE layer at
+        # `mlp.experts.e_score_correction_bias`; registering it as a Parameter
+        # on the FusedMoE lets the weight loader pick it up and the router
+        # add it during top-k selection. The fused top-k bias router requires
+        # float32 regardless of model dtype.
+        e_score_correction_bias = torch.nn.Parameter(
+            torch.zeros(config.num_experts, dtype=torch.float32),
+            requires_grad=False,
+        )
+
+        # FusedMoE with SIGMOID routing. Passing `shared_experts=` lets the
+        # layer overlap the shared-expert compute with the all2all dispatch.
+        # `apply_routed_scale_to_output=True` makes FusedMoE handle the
+        # routed_scaling_factor, shared+routed combine, and TP all-reduce
+        # internally, so forward() just returns the final hidden states.
+        self.experts = FusedMoE(
+            shared_experts=self.shared_expert,
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            scoring_func="sigmoid",
+            use_grouped_topk=False,
+            apply_router_weight_on_input=bool(config.moe_apply_router_weight_on_input),
+            e_score_correction_bias=e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scale_to_output=True,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits, _ = self.gate(hidden_states)
+        router_logits = router_logits.float()
+        softcap = getattr(self.config, "moe_router_logit_softcapping", 0.0) or 0.0  # falsy or absent => no cap
+        if softcap > 0.0:
+            router_logits = torch.tanh(router_logits / softcap) * softcap
+
+        final_hidden_states = self.experts(hidden_states, router_logits)
+        return final_hidden_states.view(orig_shape)
+
+
+class LagunaAttention(nn.Module):
+    """Laguna attention with optional softplus output gating.
+
+    Supports per-layer sliding window attention when ``config.layer_types``
+    is present.  Layers whose type is ``"sliding_attention"`` use
+    ``config.sliding_window``; all other layers (typically labelled
+    ``"full_attention"``) use full attention.  When ``layer_types`` is
+    absent every layer defaults to full attention for backwards
+    compatibility.
+    """
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position_embeddings: int = 131072,
+        head_dim: int | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        attention_sink: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        # Gating flag
+        self.gating = config.gating
+
+        # Per-layer sliding window (follows Gemma2/Cohere2 convention)
+        layer_types = getattr(config, "layer_types", None)
+        if layer_types is not None:
+            layer_idx = extract_layer_index(prefix)
+            is_sliding = layer_types[layer_idx] == "sliding_attention"
+            self.sliding_window = config.sliding_window if is_sliding else None
+        else:
+            self.sliding_window = None
+
+        # QKV projection (bias follows config.qkv_bias)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        # Output projection
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Gating projection (Laguna-specific, optional)
+        # config.gating may be:
+        #   - True / "per-head": one gate per head, broadcast across head_dim
+        #   - any other truthy value (e.g. "per-element"): one gate per channel
+        if self.gating:
+            # v5 LagunaConfig uses ``gating=True`` for per-head; older configs
+            # used ``"per-head"``. Accept both. Every other truthy value, e.g.
+            # ``"per-element"``, selects per-element gating with output size
+            # num_heads × head_dim.
+            gate_per_head = self.gating is True or self.gating == "per-head"
+            g_out = self.total_num_heads if gate_per_head else self.total_num_heads * self.head_dim
+            self.g_proj = ColumnParallelLinear(
+                hidden_size,
+                g_out,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.g_proj",
+            )
+            self.gate_per_head = gate_per_head
+        else:
+            self.g_proj = None
+            self.gate_per_head = False
+
+        # Attention sinks (learnable per-head bias for SWA layers)
+        sinks = None
+        if attention_sink:
+            self.sink = torch.nn.Parameter(torch.empty(self.total_num_heads // tp_size), requires_grad=False)
+            sinks = self.sink
+
+        # Resolve rope params per-layer-type. ``config.rope_parameters`` is
+        # either a flat dict (legacy) or a nested ``{layer_type: rope_dict}``
+        # (v5 Laguna-XS schema). The v5 form is unhashable as-is and would
+        # crash `get_rope`'s cache lookup, so always pull out the layer's
+        # sub-dict before forwarding.
+        layer_type = layer_types[extract_layer_index(prefix)] if layer_types is not None else "full_attention"
+        is_sliding = layer_type == "sliding_attention"
+
+        top_rope = getattr(config, "rope_parameters", None) or {}
+        if any(isinstance(v, dict) for v in top_rope.values()):
+            # Nested per-layer-type form.
+            base_rope = top_rope.get(layer_type) or top_rope.get("full_attention") or {}
+        else:
+            base_rope = top_rope
+
+        # Older flat-rope ckpts can carry a separate `swa_rope_parameters`
+        # for SWA layers. Prefer it when present; otherwise the nested
+        # rope dict above already supplies the correct sub-config.
+        swa_rope = getattr(config, "swa_rope_parameters", None)
+        if is_sliding and swa_rope is None and not any(isinstance(v, dict) for v in top_rope.values()):
+            logger.warning_once(
+                "Laguna config has sliding_attention layers but neither "
+                "`swa_rope_parameters` nor a nested per-layer-type "
+                "`rope_parameters` — SWA layers will reuse the global rope. "
+                "If the checkpoint was trained with distinct SWA rope "
+                "(theta / partial_rotary_factor), regenerate its HF config "
+                "to include either form."
+            )
+        rope_params = swa_rope if (is_sliding and swa_rope is not None) else base_rope
+        # `partial_rotary_factor` may live on the top-level config (main attention)
+        # or on the per-layer rope dict itself (e.g. SWA can differ). Inject the
+        # top-level value into `rope_params` if the dict doesn't already set it.
+        top_partial = getattr(config, "partial_rotary_factor", None)
+        if top_partial is not None and "partial_rotary_factor" not in rope_params:
+            rope_params = {**rope_params, "partial_rotary_factor": top_partial}
+
+        # Rotary embeddings (YaRN)
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            max_position=max_position_embeddings,
+            is_neox_style=True,
+            rope_parameters=rope_params,
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            per_layer_sliding_window=self.sliding_window,
+            prefix=f"{prefix}.attn",
+            sinks=sinks,
+        )
+
+        # QK normalization (like Qwen3)
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
+        q_by_head = self.q_norm(q_by_head)
+        q = q_by_head.view(q.shape)
+
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
+        k_by_head = self.k_norm(k_by_head)
+        k = k_by_head.view(k.shape)
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+
+        # Apply gating if enabled (compute softplus in float32 for precision)
+        if self.gating and self.g_proj is not None:
+            gate, _ = self.g_proj(hidden_states)
+            gate = F.softplus(gate.float()).type_as(attn_output)
+            if self.gate_per_head:
+                # gate: [..., num_heads]; broadcast across head_dim
+                attn_shape = attn_output.shape
+                attn_output = (
+                    attn_output.view(*attn_shape[:-1], self.num_heads, self.head_dim) * gate.unsqueeze(-1)
+                ).view(attn_shape)
+            else:
+                attn_output = attn_output * gate
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class LagunaDecoderLayer(nn.Module):
+    """Laguna decoder layer: self-attention followed by a dense MLP or an MoE block."""
+
+    def __init__(
+        self,
+        config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        layer_idx = extract_layer_index(prefix)
+
+        # Determine if this layer uses sliding window attention
+        layer_types = getattr(config, "layer_types", None)
+        is_sliding = layer_types is not None and layer_types[layer_idx] == "sliding_attention"
+
+        # Enable attention sinks on SWA layers when configured
+        attention_sink = is_sliding and getattr(config, "swa_attention_sink_enabled", False)
+
+        # Optional per-layer override of head count (Laguna-XS).
+        per_layer_heads = getattr(config, "num_attention_heads_per_layer", None)
+        layer_num_heads = per_layer_heads[layer_idx] if per_layer_heads is not None else config.num_attention_heads
+
+        self.self_attn = LagunaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=layer_num_heads,
+            num_kv_heads=config.num_key_value_heads,
+            max_position_embeddings=config.max_position_embeddings,
+            head_dim=getattr(config, "head_dim", None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attention_sink=attention_sink,
+        )
+
+        # Check if this layer uses MoE or dense MLP (matches Qwen2/Qwen3 convention)
+        mlp_only_layers = getattr(config, "mlp_only_layers", [])
+        self.is_moe_layer = (
+            (layer_idx not in mlp_only_layers)
+            and (config.num_experts > 0)
+            and ((layer_idx + 1) % config.decoder_sparse_step == 0)
+        )
+
+        if self.is_moe_layer:
+            self.mlp = LagunaMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
+            )
+        else:
+            self.mlp = LagunaMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            # No residual yet: the un-normalized input seeds the residual stream.
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class LagunaModel(nn.Module):
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
+        """Build the embedding, this rank's decoder layers and the final norm."""
+        super().__init__()
+
+        config = aphrodite_config.model_config.hf_config
+        cache_config = aphrodite_config.cache_config
+        quant_config = aphrodite_config.quant_config
+        enable_eplb = aphrodite_config.parallel_config.enable_eplb
+        eplb_config = aphrodite_config.parallel_config.eplb_config
+        self.num_redundant_experts = eplb_config.num_redundant_experts or 0  # None -> 0, as in LagunaMoE
+        self.config = config
+        self.quant_config = quant_config
+
+        # Disable the model-level sliding-window fallback in Attention.__init__.
+        # Laguna drives SWA per-layer via `layer_types`, passing
+        # `per_layer_sliding_window=self.sliding_window` (None for global layers).
+        # Without this, global layers whose `per_layer_sliding_window` is None would
+        # pick up `cache_config.sliding_window` (populated from `config.sliding_window`)
+        # as a fallback, silently applying a 512-token window to full-attention layers.
+        if cache_config is not None:
+            cache_config.sliding_window = None
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank or (config.tie_word_embeddings and get_pp_group().is_last_rank):
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: LagunaDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+                enable_eplb=enable_eplb,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)  # embed_tokens is a PPMissingLayer on ranks built without it
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        """Run this rank's decoder layers; non-last PP ranks return IntermediateTensors."""
+        pp_group = get_pp_group()
+        if pp_group.is_first_rank:
+            # Caller-supplied embeddings take precedence over the embedding lookup.
+            hidden_states = inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not pp_group.is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Build the expert parameter mapping used by weight loading.
+
+        Delegates to ``FusedMoE.make_expert_params_mapping``; each returned tuple is
+        (param_name, weight_name, expert_id, shard_id).
+        """
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+            num_redundant_experts=self.num_redundant_experts,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            # gate_proj and up_proj are loaded as separate Linears (see
+            # LagunaMLP) so no merge entry is needed here.
+        ]
+
+        # Suffixes to skip for GPTQ/modelopt models if param doesn't exist
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+
+        tp_rank = get_tensor_model_parallel_rank()
+
+        for name, loaded_weight in weights:
+            # Handle attention sinks (distributed across ranks). Derive the
+            # per-rank slice from the parameter's own shape so per-layer
+            # variations in head count are handled correctly.
+            if "sink" in name:
+                param = params_dict.get(name)
+                if param is not None:
+                    layer_heads_per_rank = param.shape[0]
+                    layer_head_start = tp_rank * layer_heads_per_rank
+                    narrow_weight = loaded_weight.narrow(0, layer_head_start, layer_heads_per_rank)
+                    param.data.copy_(narrow_weight)
+                    loaded_params.add(name)
+                continue
+
+            # Handle KV cache quantization scales
+            if self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name)):
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                assert loaded_weight.numel() == 1, f"KV scale numel {loaded_weight.numel()} != 1"
+                loaded_weight = loaded_weight.squeeze()
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            # Handle stacked params (QKV only; gate_proj/up_proj stay
+            # separate Linears, so there is no gate_up merge here)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # Skip expert weights - handled below via expert_params_mapping
+                if "mlp.experts" in name and "shared_expert" not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                if name.endswith(ignore_suffixes) and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                # Remap FP8 kv_scale names for backwards compatibility
+                if name.endswith("scale"):
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if weight_loader == default_weight_loader:
+                    weight_loader(param, loaded_weight)
+                else:
+                    weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                # Try expert params mapping (handles weights + quantization scales)
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight so we skip regular loading below
+                    is_expert_weight = True
+
+                    # Create mapped name without modifying original
+                    name_mapped = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name_mapped, self):
+                        continue
+                    if name_mapped.endswith(ignore_suffixes) and name_mapped not in params_dict:
+                        continue
+                    if name_mapped not in params_dict:
+                        continue
+
+                    param = params_dict[name_mapped]
+                    # Use return_success to handle expert parallelism correctly
+                    weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+                    if success:
+                        loaded_params.add(name_mapped)
+                        break
+                else:
+                    # Expert weight not mapped to this rank - skip
+                    if is_expert_weight:
+                        continue
+
+                    # Remap kv_scale names before the ignore_suffixes filter:
+                    # the suffix list includes .k_scale/.v_scale, so filtering
+                    # first drops the checkpoint key before remap can rewrite
+                    # it to the .attn.* name that exists in params_dict.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+
+        return loaded_params
+
+
+class LagunaForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+    fall_back_to_pt_during_load = False
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
+        super().__init__()
+        config = aphrodite_config.model_config.hf_config
+        quant_config = aphrodite_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = LagunaModel(aphrodite_config=aphrodite_config, prefix=maybe_prefix(prefix, "model"))
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if self.config.tie_word_embeddings:
+                self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/aphrodite/model_executor/models/llama.py b/aphrodite/model_executor/models/llama.py
index c33b10d5a7..042ba56705 100644
--- a/aphrodite/model_executor/models/llama.py
+++ b/aphrodite/model_executor/models/llama.py
@@ -325,18 +325,15 @@ def get_quant_config(self, aphrodite_config: AphroditeConfig) -> QuantizationCon
         return aphrodite_config.quant_config
 
 
-def llama_model_invariants(input_ids, positions, intermediate_tensors=None, inputs_embeds=None):
-    """Shape invariants for Llama model compilation, those are translated to
-    runtime assertions for unbacked dynamic shapes and are compiled away for
-    backed"""
-    if input_ids is not None:
-        torch._check(positions.size()[0] == input_ids.size()[0])
-
-
 @support_torch_compile(
     # TODO[#32068]: Investigate recompilation
     # mark_unbacked_dims={"input_ids": 0},
-    shape_invariants=llama_model_invariants
+    dynamic_arg_dims={
+        "input_ids": {0: "b"},
+        "positions": {0: "b"},
+        "intermediate_tensors": {0: "b"},
+        "inputs_embeds": {0: "b"},
+    },
 )
 class LlamaModel(nn.Module, EagleModelMixin):
     def __init__(
diff --git a/aphrodite/model_executor/models/longcat_flash.py b/aphrodite/model_executor/models/longcat_flash.py
index 841e3f0fb7..0ce8b25f75 100644
--- a/aphrodite/model_executor/models/longcat_flash.py
+++ b/aphrodite/model_executor/models/longcat_flash.py
@@ -69,6 +69,7 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
+    AutoWeightsLoader,
     PPMissingLayer,
     is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
@@ -457,6 +458,7 @@ def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
         cache_config = aphrodite_config.cache_config
         quant_config = aphrodite_config.quant_config
         self.config = config
+        self.quant_config = quant_config
 
         self.vocab_size = config.vocab_size
 
@@ -521,69 +523,6 @@ def forward(
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-
-class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    """Flash model for causal language modeling."""
-
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
-    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
-        super().__init__()
-        config = FlashConfig(**aphrodite_config.model_config.hf_config.__dict__)
-        quant_config = aphrodite_config.quant_config
-
-        self.config = config
-        config.intermediate_size = (
-            config.ffn_hidden_size if hasattr(config, "ffn_hidden_size") else config.intermediate_size
-        )
-
-        self.quant_config = quant_config
-
-        self.model = FlashModel(aphrodite_config=aphrodite_config, prefix=maybe_prefix(prefix, "model"))
-
-        if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(
-                config.vocab_size,
-                config.hidden_size,
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "lm_head"),
-            )
-        else:
-            self.lm_head = PPMissingLayer()
-
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
-
-    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.embed_input_ids(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor | None,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.lm_head, hidden_states)
-        return logits
-
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
@@ -684,9 +623,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             loaded_params.add(name)
         for layer_id in range(self.config.num_hidden_layers):
             for i in range(2):
-                if isinstance(self.model.layers[layer_id], PPMissingLayer):
+                if isinstance(self.layers[layer_id], PPMissingLayer):
                     continue
-                self_attn = self.model.layers[layer_id].self_attn[i]
+                self_attn = self.layers[layer_id].self_attn[i]
                 if hasattr(self.quant_config, "weight_block_size") and self_attn.kv_b_proj.weight.dtype in (
                     torch.float8_e4m3fn,
                     torch.float8_e4m3fnuz,
@@ -713,3 +652,73 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 if self.config.mla_scale_kv_lora:
                     self_attn.kv_a_layernorm.weight.data *= (self.config.hidden_size / self.config.kv_lora_rank) ** 0.5
         return loaded_params
+
+
+class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    """Flash model for causal language modeling."""
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
+        super().__init__()
+        config = FlashConfig(**aphrodite_config.model_config.hf_config.__dict__)
+        quant_config = aphrodite_config.quant_config
+
+        self.config = config
+        config.intermediate_size = (
+            config.ffn_hidden_size if hasattr(config, "ffn_hidden_size") else config.intermediate_size
+        )
+
+        self.quant_config = quant_config
+
+        self.model = FlashModel(aphrodite_config=aphrodite_config, prefix=maybe_prefix(prefix, "model"))
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/aphrodite/model_executor/models/mimo_audio.py b/aphrodite/model_executor/models/mimo_audio.py
new file mode 100644
index 0000000000..8430394d88
--- /dev/null
+++ b/aphrodite/model_executor/models/mimo_audio.py
@@ -0,0 +1,1269 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""MiMo audio: tokenizer, encoding utilities, and audio encoder.
+
+Ported from SGLang's mimo_audio.py.
+Audio tokenizer adapted from https://github.com/XiaomiMiMo/MiMo-Audio-Tokenizer.git
+"""
+
+import dataclasses
+import json
+import logging
+import math
+import os
+import typing as tp
+from dataclasses import dataclass
+from functools import wraps
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.qwen2.modeling_qwen2 import Qwen2Model
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Vector quantization (from MiMo-Audio-Tokenizer)
+# ---------------------------------------------------------------------------
+
+
+def _vq_default(val: tp.Any, d: tp.Any) -> tp.Any:
+    return val if val is not None else d
+
+
+def _ema_inplace(moving_avg, new, decay: float):
+    if dist.is_initialized():
+        dist.all_reduce(new, op=dist.ReduceOp.SUM)
+    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
+
+
+def _laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
+    return (x + epsilon) / (x.sum() + n_categories * epsilon)
+
+
+def _uniform_init(*shape: int):
+    t = torch.empty(shape)
+    nn.init.kaiming_uniform_(t)
+    return t
+
+
+def _sample_vectors(samples, num: int):
+    num_samples, device = samples.shape[0], samples.device
+
+    if num_samples >= num:
+        indices = torch.randperm(num_samples, device=device)[:num]
+    else:
+        indices = torch.randint(0, num_samples, (num,), device=device)
+
+    selected_samples = samples[indices]
+
+    if dist.is_initialized():
+        dist.broadcast(selected_samples, src=0)
+
+    return selected_samples
+
+
+def _kmeans(samples, num_clusters: int, num_iters: int = 10):
+    dim, dtype = samples.shape[-1], samples.dtype
+
+    means = _sample_vectors(samples, num_clusters)
+
+    for _ in range(num_iters):
+        dists = -(samples.pow(2).sum(1, keepdim=True) - 2 * samples @ means.t() + means.t().pow(2).sum(0, keepdim=True))
+
+        buckets = dists.max(dim=-1).indices
+        bins = torch.bincount(buckets, minlength=num_clusters)
+
+        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
+        new_means = new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
+
+        if dist.is_initialized():
+            dist.all_reduce(bins, op=dist.ReduceOp.SUM)
+            dist.all_reduce(new_means, op=dist.ReduceOp.SUM)
+
+        zero_mask = bins == 0
+        bins_min_clamped = bins.masked_fill(zero_mask, 1)
+
+        new_means = new_means / bins_min_clamped[..., None]
+
+        means = torch.where(zero_mask[..., None], means, new_means)
+
+    return means, bins
+
+
+def _rotate_half(x):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (_rotate_half(q) * sin)
+    k_embed = (k * cos) + (_rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def _compute_default_rope_parameters(config=None, device=None, seq_len=None, **rope_kwargs):
+    if config is not None and len(rope_kwargs) > 0:
+        raise ValueError("Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive")
+    if len(rope_kwargs) > 0:
+        base = rope_kwargs["base"]
+        dim = rope_kwargs["dim"]
+    elif config is not None:
+        base = config.rope_theta
+        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        dim = int(head_dim * partial_rotary_factor)
+    attention_factor = 1.0
+    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
+    return inv_freq, attention_factor
+
+
+_ROPE_INIT_FUNCTIONS = {
+    "default": _compute_default_rope_parameters,
+}
+
+
+def _dynamic_rope_update(rope_forward):
+    def dynamic_frequency_update(self, position_ids, device):
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+            self.max_seq_len_cached = seq_len
+
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:
+            self.original_inv_freq = self.original_inv_freq.to(device)
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+    @wraps(rope_forward)
+    def wrapper(self, x, position_ids):
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device)
+        return rope_forward(self, x, position_ids)
+
+    return wrapper
+
+
+class AudioRotaryEmbedding(nn.Module):
+    def __init__(self, base, dim, max_seq_len, rope_type="default", device=None):
+        super().__init__()
+        self.max_seq_len = max_seq_len
+        self.rope_type = rope_type
+        self.rope_init_fn = _ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(device=device, base=base, dim=dim)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @_dynamic_rope_update
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[:, None].float().expand(-1, 1).to(x.device)
+        position_ids_expanded = position_ids[None, :].float()
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(0, 1)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class EuclideanCodebook(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        kmeans_init: bool = False,  # flag: True selects a zero codebook and leaves `inited` False
+        kmeans_iters: int = 10,
+        decay: float = 0.99,
+        epsilon: float = 1e-5,
+        threshold_ema_dead_code: int = 2,
+    ):
+        super().__init__()
+        self.decay = decay
+        init_fn: tp.Callable[..., torch.Tensor] | tp.Any = _uniform_init if not kmeans_init else torch.zeros
+        embed = init_fn(codebook_size, dim)  # (codebook_size, dim) initial codebook
+
+        self.codebook_size = codebook_size
+        self.kmeans_iters = kmeans_iters
+        self.epsilon = epsilon
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+
+        self.register_buffer("inited", torch.Tensor([not kmeans_init]))  # init_embed_ is a no-op once truthy
+        self.register_buffer("cluster_size", torch.zeros(codebook_size))
+        self.register_buffer("embed", embed)
+        self.register_buffer("embed_avg", embed.clone())
+
+    @torch.jit.ignore
+    def init_embed_(self, data):
+        if self.inited:
+            return
+
+        embed, cluster_size = _kmeans(data, self.codebook_size, self.kmeans_iters)
+        self.embed.data.copy_(embed)
+        self.embed_avg.data.copy_(embed.clone())
+        self.cluster_size.data.copy_(cluster_size)
+        self.inited.data.copy_(torch.Tensor([True]))
+
+    def replace_(self, samples, mask):
+        replace_num = mask.sum()
+        modified_codebook = self.embed.clone()
+        modified_codebook[mask] = _sample_vectors(samples, replace_num)
+        self.embed.data.copy_(modified_codebook)
+
+    def expire_codes_(self, batch_samples):
+        if self.threshold_ema_dead_code == 0:
+            return
+
+        expired_codes = self.cluster_size < self.threshold_ema_dead_code
+        if not torch.any(expired_codes):
+            return
+
+        batch_samples = rearrange(batch_samples, "... d -> (...) d")
+        self.replace_(batch_samples, mask=expired_codes)
+
+    def preprocess(self, x):
+        x = rearrange(x, "... d -> (...) d")
+        return x
+
+    def quantize(self, x):
+        embed = self.embed.t()
+        dist_val = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True))
+        embed_ind = dist_val.max(dim=-1).indices
+        return embed_ind
+
+    def postprocess_emb(self, embed_ind, shape):
+        return embed_ind.view(*shape[:-1])
+
+    def dequantize(self, embed_ind):
+        quantize = F.embedding(embed_ind, self.embed)
+        return quantize
+
+    def encode(self, x):
+        shape = x.shape
+        x = self.preprocess(x)
+        embed_ind = self.quantize(x)
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        return embed_ind
+
+    def decode(self, embed_ind):
+        quantize = self.dequantize(embed_ind)
+        return quantize
+
+    def forward(self, x):
+        shape, dtype = x.shape, x.dtype
+        x = self.preprocess(x)
+
+        self.init_embed_(x)
+
+        embed_ind = self.quantize(x)
+        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        quantize = self.dequantize(embed_ind)
+
+        if self.training:
+            self.expire_codes_(x)
+            _ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
+            embed_sum = x.t() @ embed_onehot
+            _ema_inplace(self.embed_avg, embed_sum.t().contiguous(), self.decay)
+            cluster_size = (
+                _laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) * self.cluster_size.sum()
+            )
+            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
+            self.embed.data.copy_(embed_normalized)
+
+        return quantize, embed_ind
+
+
+class VectorQuantization(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        codebook_dim: int | None = None,
+        decay: float = 0.99,
+        epsilon: float = 1e-5,
+        kmeans_init: bool = True,
+        kmeans_iters: int = 50,
+        threshold_ema_dead_code: int = 2,
+        commitment_weight: float = 1.0,
+    ):
+        super().__init__()
+        _codebook_dim: int = _vq_default(codebook_dim, dim)
+
+        requires_projection = _codebook_dim != dim
+        self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
+        self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
+
+        self.epsilon = epsilon
+        self.commitment_weight = commitment_weight
+
+        self._codebook = EuclideanCodebook(
+            dim=_codebook_dim,
+            codebook_size=codebook_size,
+            kmeans_init=kmeans_init,
+            kmeans_iters=kmeans_iters,
+            decay=decay,
+            epsilon=epsilon,
+            threshold_ema_dead_code=threshold_ema_dead_code,
+        )
+        self.codebook_size = codebook_size
+
+    @property
+    def codebook(self):
+        return self._codebook.embed
+
+    def encode(self, x):
+        x = self.project_in(x)
+        embed_in = self._codebook.encode(x)
+        return embed_in
+
+    def decode(self, embed_ind):
+        quantize = self._codebook.decode(embed_ind)
+        quantize = self.project_out(quantize)
+        return quantize
+
+    def forward(self, x):
+        device = x.device
+        x = self.project_in(x)
+
+        quantize, embed_ind = self._codebook(x)
+
+        if self.training:
+            quantize = x + (quantize - x).detach()
+
+        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
+
+        quantize = self.project_out(quantize)
+        return quantize, embed_ind, loss
+
+
+class ResidualVectorQuantization(nn.Module):
+    def __init__(self, *, num_quantizers, codebook_size, **kwargs):
+        super().__init__()
+        if isinstance(codebook_size, int):
+            codebook_size = [codebook_size] * num_quantizers  # same size for every quantizer
+        elif len(codebook_size) < num_quantizers:  # pad with the last size, on a copy (caller's list untouched)
+            codebook_size = list(codebook_size) + [codebook_size[-1]] * (num_quantizers - len(codebook_size))
+        self.layers = nn.ModuleList(
+            [VectorQuantization(codebook_size=codebook_size[i], **kwargs) for i in range(num_quantizers)]
+        )
+
+    def forward(self, x, n_q: int | None = None, layers: list | None = None):
+        quantized_out = 0.0
+        residual = x
+
+        all_losses = []
+        all_indices = []
+        out_quantized = []
+
+        n_q = n_q or len(self.layers)
+
+        for i, layer in enumerate(self.layers[:n_q]):
+            quantized, indices, loss = layer(residual)
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+
+            all_indices.append(indices)
+            all_losses.append(loss)
+            if layers and i in layers:
+                out_quantized.append(quantized_out)
+
+        out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
+        return quantized_out, out_indices, out_losses, out_quantized
+
+    def encode(self, x: torch.Tensor, n_q: int | None = None, st: int | None = None) -> torch.Tensor:
+        residual = x
+        all_indices = []
+        n_q = len(self.layers) if n_q is None else n_q
+        st = 0 if st is None else st
+        for layer in self.layers[st:n_q]:
+            indices = layer.encode(residual)
+            quantized = layer.decode(indices)
+            residual = residual - quantized
+            all_indices.append(indices)
+        out_indices = torch.stack(all_indices)
+        return out_indices
+
+    def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor:
+        quantized_out = self.layers[st].decode(q_indices[0])
+        for i in range(1, len(q_indices)):
+            layer = self.layers[st + i]
+            quantized = layer.decode(q_indices[i])
+            quantized_out = quantized_out + quantized
+        return quantized_out
+
+
+class ResidualVectorQuantizer(nn.Module):
+    """Front end that owns a ``ResidualVectorQuantization`` stack of ``n_q`` layers."""
+
+    def __init__(
+        self,
+        dimension: int = 256,
+        n_q: int = 8,
+        bins: int | list = 1024,
+        decay: float = 0.99,
+        kmeans_init: bool = True,
+        kmeans_iters: int = 50,
+        threshold_ema_dead_code: int = 2,
+    ):
+        super().__init__()
+        # Keep every hyper-parameter on the module, then build the stack from them.
+        self.n_q = n_q
+        self.dimension = dimension
+        self.bins = bins
+        self.decay = decay
+        self.kmeans_init = kmeans_init
+        self.kmeans_iters = kmeans_iters
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.vq = ResidualVectorQuantization(
+            dim=dimension,
+            codebook_size=bins,
+            num_quantizers=n_q,
+            decay=decay,
+            kmeans_init=kmeans_init,
+            kmeans_iters=kmeans_iters,
+            threshold_ema_dead_code=threshold_ema_dead_code,
+        )
+
+    def forward(self, x: torch.Tensor, n_q: int | None = None, layers: list | None = None):
+        """Run the stack on ``x``, reducing its commitment losses to their mean."""
+        # A falsy ``n_q`` (``None`` or ``0``) selects the configured ``self.n_q``.
+        depth = n_q or self.n_q
+        quantized, codes, commit_loss, quantized_list = self.vq(x, n_q=depth, layers=layers)
+        return quantized, codes, torch.mean(commit_loss), quantized_list
+
+    def encode(self, x: torch.Tensor, n_q: int | None = None, st: int | None = None) -> torch.Tensor:
+        """Return the code indices the stack produces for ``x``."""
+        # Falsy arguments fall back to the defaults: ``self.n_q`` layers, from layer 0.
+        depth, start = n_q or self.n_q, st or 0
+        return self.vq.encode(x, n_q=depth, st=start)
+
+    def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
+        """Reconstruct the quantized tensor from ``codes``, starting at layer ``st``."""
+        return self.vq.decode(codes, st=st)
+
+
+# ---------------------------------------------------------------------------
+# Audio tokenizer
+# ---------------------------------------------------------------------------
+
+
+class MiMoAudioTokenizerConfig(PretrainedConfig):  # config container: every argument below is stored as an attribute
+    model_type = "mimo_audio_tokenizer"
+
+    def __init__(
+        self,
+        max_audio_seconds: int = 1800,
+        stride_size: int = 2,
+        avg_pooler: int = 1,
+        d_model: int = 768,
+        scale_embedding: bool = True,
+        kernel_size: int = 3,
+        activation_function: str = "gelu",
+        encoder_layers: int = 8,
+        encoder_skip_layer_id: int | None = None,
+        encoder_attention_heads: int = 12,
+        encoder_ffn_dim: int = 3072,
+        encoder_causal: bool = False,
+        encoder_attn_window_size: list | None = None,  # None -> [-1, -1]
+        decoder_layers: int = 8,
+        decoder_attention_heads: int = 12,
+        decoder_ffn_dim: int = 3072,
+        decoder_kernel_size: int = 3,
+        decoder_stride_size: int = 2,
+        decoder_causal: bool = True,
+        decoder_attn_window_size: list | None = None,  # None -> [-1, -1]
+        nfft: int = 1024,
+        vocoder_dim: int = 512,
+        vocoder_intermediate_dim: int = 4096,
+        vocoder_num_layers: int = 30,
+        n_mels: int = 80,
+        sampling_rate: int = 24000,
+        hop_length: int = 240,
+        window_size: int = 1024,
+        vocoder_padding: str = "same",
+        fmin: int = 0,
+        fmax: int | None = None,
+        num_quantizers: int = 12,
+        codebook_size: list | None = None,  # None -> [1024]
+        threshold_ema_dead_code: int = 10,
+        position_embedding_type: str = "rope",
+        rope_theta: int = 10000,
+        rope_type: str = "default",
+        ln_type: str = "LayerNorm",
+        vocoder_attention_heads: int = 4,
+        vocoder_attn_window_size: list | None = None,  # None -> [40, 10]
+        use_istft_only: bool = False,
+        hybrid_attention: bool = False,
+        hybrid_block_size: int = 8,
+        swa_per_block: int = 2,
+        **kwargs,  # forwarded unchanged to PretrainedConfig.__init__
+    ):
+        super().__init__(**kwargs)
+        self.max_audio_seconds = max_audio_seconds
+        self.stride_size = stride_size
+        self.avg_pooler = avg_pooler
+        self.d_model = d_model
+        self.scale_embedding = scale_embedding
+        self.kernel_size = kernel_size
+        self.activation_function = activation_function
+        self.encoder_layers = encoder_layers
+        self.encoder_skip_layer_id = encoder_skip_layer_id
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_causal = encoder_causal
+        self.encoder_attn_window_size = encoder_attn_window_size if encoder_attn_window_size is not None else [-1, -1]  # fresh list per instance
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_kernel_size = decoder_kernel_size
+        self.decoder_stride_size = decoder_stride_size
+        self.decoder_causal = decoder_causal
+        self.decoder_attn_window_size = decoder_attn_window_size if decoder_attn_window_size is not None else [-1, -1]  # fresh list per instance
+        self.nfft = nfft
+        self.vocoder_dim = vocoder_dim
+        self.vocoder_intermediate_dim = vocoder_intermediate_dim
+        self.vocoder_num_layers = vocoder_num_layers
+        self.n_mels = n_mels
+        self.sampling_rate = sampling_rate
+        self.hop_length = hop_length
+        self.window_size = window_size
+        self.vocoder_padding = vocoder_padding
+        self.fmin = fmin
+        self.fmax = fmax
+        self.num_quantizers = num_quantizers
+        self.codebook_size = codebook_size if codebook_size is not None else [1024]  # fresh list per instance
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.position_embedding_type = position_embedding_type
+        self.rope_theta = rope_theta
+        self.rope_type = rope_type
+        self.ln_type = ln_type
+        self.vocoder_attention_heads = vocoder_attention_heads
+        self.vocoder_attn_window_size = vocoder_attn_window_size if vocoder_attn_window_size is not None else [40, 10]  # fresh list per instance
+        self.use_istft_only = use_istft_only
+        self.hybrid_attention = hybrid_attention
+        self.hybrid_block_size = hybrid_block_size
+        self.swa_per_block = swa_per_block
+
+
+def get_sequence_mask(inputs, inputs_length):
+    """Return a ``[bsz, tgt_len, 1]`` validity mask and, per padded slot, a packed-row index."""
+    batched = inputs.dim() == 3
+    bsz = inputs.size(0) if batched else inputs_length.shape[0]
+    tgt_len = inputs.size(1) if batched else torch.max(inputs_length)
+    positions = torch.arange(0, tgt_len).to(inputs.device)
+    valid = torch.lt(positions, inputs_length.reshape(bsz, 1)).view(bsz, tgt_len, 1)
+    # Running count of valid slots minus one = row of that slot in the packed layout.
+    return valid, torch.cumsum(valid.to(torch.int64).view(-1), dim=0) - 1
+
+
+def unpack_hidden_states(hidden_states, lengths, sequence_mask=None, unpacking_index=None):
+    """Scatter packed rows into a zero-padded ``[bsz, max(lengths), dim]`` tensor."""
+    if sequence_mask is None or unpacking_index is None:
+        # Either piece missing: derive both from the lengths.
+        sequence_mask, unpacking_index = get_sequence_mask(hidden_states, lengths)
+    padded_shape = (lengths.shape[0], torch.max(lengths), hidden_states.shape[-1])
+    gathered = torch.index_select(hidden_states, 0, unpacking_index).view(*padded_shape)
+    return torch.where(sequence_mask, gathered, 0)
+
+
+def get_position_ids(lengths):
+    """Return 0-based positions that restart at every sequence of a packed batch."""
+    starts = torch.cat([torch.zeros(1).to(lengths), lengths.cumsum(dim=0)[:-1]])
+    per_token_start = torch.repeat_interleave(starts, lengths)
+    return torch.arange(0, lengths.sum()).to(per_token_start) - per_token_start
+
+
+LAYER_NORM = {"LayerNorm": nn.LayerNorm}
+
+
+class AudioEncoderAttention(nn.Module):
+    """Multi-head self-attention over packed, variable-length sequences."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        window_size: tuple[int, int] = (-1, -1),
+        causal: bool = False,
+    ):
+        super().__init__()
+        self.embed_dim, self.num_heads = embed_dim, num_heads
+        self.head_dim = embed_dim // num_heads
+        self.window_size, self.causal = window_size, causal
+
+        # Of the four projections only the key projection has no bias.
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int,
+        rope_position_embeddings=None,
+    ):
+        """Attend over the packed ``[num_tokens, embed_dim]`` input; the output has the same shape.
+
+        ``rope_position_embeddings`` is an optional ``(cos, sin)`` pair applied to queries and keys.
+        """
+        from aphrodite.aphrodite_flash_attn import flash_attn_varlen_func
+
+        num_tokens, _ = hidden_states.size()
+        per_head = (num_tokens, self.num_heads, self.head_dim)
+        q = self.q_proj(hidden_states).view(per_head)
+        k = self.k_proj(hidden_states).view(per_head)
+        v = self.v_proj(hidden_states).view(per_head)
+        if rope_position_embeddings is not None:
+            cos, sin = rope_position_embeddings
+            q, k = apply_rotary_pos_emb(q, k, cos, sin)
+        context = flash_attn_varlen_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+            causal=self.causal,
+            window_size=list(self.window_size),
+        )
+        return self.out_proj(context.reshape(num_tokens, self.embed_dim))
+
+
+class AudioEncoderTransformerLayer(nn.Module):
+    """Pre-norm transformer block: self-attention then feed-forward, each with a residual add."""
+
+    def __init__(
+        self,
+        config: MiMoAudioTokenizerConfig,
+        causal: bool,
+        attn_window_size: tuple[int, int] = (-1, -1),
+    ):
+        super().__init__()
+        width = config.d_model
+        norm_cls = LAYER_NORM[config.ln_type]
+        self.embed_dim = width
+
+        self.self_attn = AudioEncoderAttention(
+            embed_dim=width,
+            num_heads=config.encoder_attention_heads,
+            window_size=attn_window_size,
+            causal=causal,
+        )
+        self.self_attn_layer_norm = norm_cls(width)
+
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.fc1 = nn.Linear(width, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, width)
+        self.final_layer_norm = norm_cls(width)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int,
+        rope_position_embeddings: tuple[torch.Tensor, torch.Tensor],
+    ) -> torch.Tensor:
+        """Apply the block to ``hidden_states``; the other arguments go straight to attention."""
+        # Attention sub-layer: normalize first, then add the input back.
+        attn_out = self.self_attn(
+            self.self_attn_layer_norm(hidden_states),
+            cu_seqlens,
+            max_seqlen,
+            rope_position_embeddings=rope_position_embeddings,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-layer follows the same normalize-then-add pattern.
+        ffn_out = self.fc2(self.activation_fn(self.fc1(self.final_layer_norm(hidden_states))))
+        return hidden_states + ffn_out
+
+
+class AudioEncoder(nn.Module):
+    def __init__(
+        self,
+        config: MiMoAudioTokenizerConfig,
+    ):
+        """Build the conv front end, rotary embedding, transformer stack and optional heads."""
+        super().__init__()
+        self.config = config
+        frames = config.max_audio_seconds * config.sampling_rate // config.hop_length
+        self.max_source_positions = frames // config.stride_size
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        self.skip_layer_idx = config.encoder_skip_layer_id
+
+        # conv1 uses the default stride of 1; conv2 strides by ``stride_size``.
+        self.conv1 = nn.Conv1d(
+            config.n_mels,
+            config.d_model,
+            kernel_size=config.kernel_size,
+            padding=1,
+        )
+        self.conv2 = nn.Conv1d(
+            config.d_model,
+            config.d_model,
+            kernel_size=config.kernel_size,
+            stride=config.stride_size,
+            padding=1,
+        )
+
+        self.position_embedding = AudioRotaryEmbedding(
+            config.rope_theta,
+            config.d_model // config.encoder_attention_heads,
+            self.max_source_positions,
+            config.rope_type,
+        )
+
+        windowed = tuple(config.encoder_attn_window_size)
+        if config.hybrid_attention:
+            # In every run of ``swa_per_block`` layers all but the last get the configured
+            # window; the last gets (-1, -1).
+            # NOTE(review): ``config.hybrid_block_size`` is never read here -- confirm intent.
+            attn_window_sizes = [
+                windowed if i % config.swa_per_block < config.swa_per_block - 1 else (-1, -1)
+                for i in range(config.encoder_layers)
+            ]
+        else:
+            attn_window_sizes = [windowed] * config.encoder_layers
+
+        self.layers = nn.ModuleList(
+            [
+                AudioEncoderTransformerLayer(config=config, causal=config.encoder_causal, attn_window_size=window)
+                for window in attn_window_sizes
+            ]
+        )
+
+        self.layer_norm = LAYER_NORM[config.ln_type](config.d_model)
+
+        if config.avg_pooler == 1:
+            self.down_sample_layer = None
+        else:
+            # Non-overlapping pooling conv: kernel size and stride are both ``avg_pooler``.
+            self.down_sample_layer = nn.Sequential(
+                nn.Conv1d(
+                    config.d_model,
+                    config.d_model,
+                    config.avg_pooler,
+                    config.avg_pooler,
+                    bias=False,
+                ),
+                nn.GELU(),
+            )
+            self.down_sample_norm = LAYER_NORM[config.ln_type](config.d_model)
+
+        if config.num_quantizers == 0:
+            self.quantizer = None
+        else:
+            self.quantizer = ResidualVectorQuantizer(
+                dimension=config.d_model,
+                n_q=config.num_quantizers,
+                bins=config.codebook_size,
+                threshold_ema_dead_code=config.threshold_ema_dead_code,
+            )
+
+    def get_features(self, input_features, output_length):  # conv front end -> packed transformer stack -> optional pooling
+        input_features = input_features.to(self.conv1.weight)  # match conv1's device and dtype
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)  # [bsz, channels, frames] -> [bsz, frames, channels]
+        bsz, tgt_len, _ = inputs_embeds.size()
+        hidden_states = inputs_embeds  # NOTE(review): self.embed_scale is not applied anywhere in this method -- confirm
+
+        position_ids = get_position_ids(output_length).long().to(input_features.device)  # positions restart at 0 per sequence
+        rope_position_embeddings = self.position_embedding(input_features, position_ids)
+
+        attention_mask, unpacking_index = get_sequence_mask(hidden_states, output_length)
+        hidden_states = torch.masked_select(hidden_states, attention_mask).view(  # keep valid frames only: [sum(output_length), d_model]
+            torch.sum(output_length), self.config.d_model
+        )
+
+        cu_seqlens = F.pad(torch.cumsum(output_length, dim=0), (1, 0), "constant", 0).to(  # cumulative lengths with a leading 0
+            device=hidden_states.device, dtype=torch.int32
+        )
+        max_seqlen = torch.max(output_length).to(torch.int32).item()
+
+        skip_connect_hidden_states = 0.0  # stays 0.0 (so the add below changes nothing) unless a skip layer matches
+        for idx, encoder_layer in enumerate(self.layers):
+            hidden_states = encoder_layer(
+                hidden_states,
+                cu_seqlens,
+                max_seqlen,
+                rope_position_embeddings=rope_position_embeddings,
+            )
+            if (self.skip_layer_idx is not None) and idx == self.skip_layer_idx - 1:  # skip_layer_idx counts layers from 1
+                skip_connect_hidden_states = hidden_states.clone()
+
+        hidden_states += skip_connect_hidden_states  # in-place add of the saved intermediate output
+        hidden_states = self.layer_norm(hidden_states)
+
+        if self.down_sample_layer is not None:
+            hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(  # back to padded [bsz, tgt_len, d_model]; padded slots repeat an earlier packed row (not zeroed here)
+                bsz, tgt_len, self.config.d_model
+            )
+            if hidden_states.size(1) % self.config.avg_pooler:  # zero-pad the time axis up to a multiple of avg_pooler
+                pad_len = self.config.avg_pooler - hidden_states.size(1) % self.config.avg_pooler
+                hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, pad_len), mode="constant", value=0.0)
+                tgt_len += pad_len
+            tgt_len = tgt_len // self.config.avg_pooler
+            hidden_states = self.down_sample_layer(hidden_states.transpose(1, 2))  # the conv expects [bsz, d_model, time]
+            output_length = (  # ceil(output_length / avg_pooler)
+                output_length // self.config.avg_pooler + (output_length % self.config.avg_pooler != 0).int()
+            )
+            hidden_states = hidden_states.transpose(1, 2)
+            attention_mask, unpacking_index = get_sequence_mask(hidden_states, output_length)  # mask/index for the pooled lengths
+            hidden_states = torch.masked_select(hidden_states, attention_mask).view(  # re-pack the pooled frames
+                torch.sum(output_length), self.config.d_model
+            )
+            hidden_states = self.down_sample_norm(hidden_states)
+
+        return (  # packed states plus everything a caller needs to unpack them again
+            hidden_states,
+            output_length,
+            attention_mask,
+            unpacking_index,
+            tgt_len,
+            bsz,
+        )
+
+    def get_output_length(self, mel_len):
+        k, s = self.config.kernel_size, self.config.stride_size  # conv kernel size / conv2 stride
+        return ((mel_len + 3 - k) + 2 - k) // s + 1  # length after conv1 (padding 1), then after conv2
+
+    @torch.no_grad()
+    def encode(
+        self,
+        input_features,
+        input_lens=None,
+        output_length=None,
+        return_codes_only=False,
+        n_q=None,
+        use_quantizer=True,
+    ):
+        """Encode packed input features, optionally passing them through the quantizer.
+
+        Returns ``(padded states, packed states, output_length, codes)``, or only
+        ``(codes, output_length)`` when the quantizer runs and ``return_codes_only`` is set.
+        """
+        if output_length is None:
+            output_length = self.get_output_length(input_lens)
+        padded_input = unpack_hidden_states(input_features, input_lens)
+        hidden_states, output_length, attention_mask, unpacking_index, tgt_len, bsz = self.get_features(
+            input_features=padded_input.transpose(1, 2),
+            output_length=output_length,
+        )
+
+        codes = None
+        if use_quantizer and self.quantizer is not None:
+            self.quantizer.float()  # the quantizer is cast to float32 and fed float32 input
+            codes = self.quantizer.encode(hidden_states.float(), n_q=n_q)
+            if return_codes_only:
+                return codes, output_length
+            hidden_states = self.quantizer.decode(codes).to(hidden_states.dtype)
+        hidden_states_packed = hidden_states.clone()
+        padded = torch.index_select(hidden_states, 0, unpacking_index).view(bsz, tgt_len, self.config.d_model)
+        return torch.where(attention_mask, padded, 0), hidden_states_packed, output_length, codes
+
+    @torch.no_grad()
+    def decode_vq(self, codes):
+        quantizer = self.quantizer.float()  # ``Module.float()`` casts in place and returns the module
+        return quantizer.decode(codes)
+
+
+class MiMoAudioTokenizer(PreTrainedModel):
+    """``PreTrainedModel`` wrapper that builds only the ``AudioEncoder`` of the tokenizer."""
+
+    config_class = MiMoAudioTokenizerConfig
+
+    def __init__(self, config: MiMoAudioTokenizerConfig):
+        super().__init__(config)
+        self.config = config
+        self.sampling_rate = config.sampling_rate
+        self.encoder = AudioEncoder(config=config)
+        # NOTE(review): the literal 2 presumably mirrors the default ``stride_size`` -- confirm.
+        self.downsample_rate = int(config.hop_length * 2 * config.avg_pooler)
+
+    def get_output_length(self, mel_len):
+        """Return the encoder's post-convolution (pre-pooling) length for ``mel_len`` frames."""
+        return self.encoder.get_output_length(mel_len)
+
+    @torch.no_grad()
+    def encode(self, mels, input_lens, use_quantizer=True):
+        """Encode packed ``mels`` and return whatever ``AudioEncoder.encode`` returns."""
+        # No length pre-computation here: the encoder derives it from ``input_lens`` itself.
+        return self.encoder.encode(mels, input_lens=input_lens, use_quantizer=use_quantizer)
+
+
+# ---------------------------------------------------------------------------
+# Audio encoding utilities
+# ---------------------------------------------------------------------------
+
+
+def group_by_length(features: torch.Tensor, lengths: torch.Tensor, max_length: int):
+    """Split a packed batch into consecutive groups of at most ``max_length`` rows.
+
+    A single sequence longer than ``max_length`` still forms its own group.
+
+    Returns ``(feature_groups, len_groups)`` as parallel tuples of tensors.
+
+    Raises:
+        ValueError: if ``features`` does not have ``lengths.sum()`` rows.
+    """
+    if features.size(0) != lengths.sum().item():
+        raise ValueError(f"Feature size mismatch: {features.size(0)} vs {lengths.sum().item()}")
+
+    # Greedy packing: start a new group whenever the next sequence would overflow.
+    group_sizes = []
+    group_rows = group_start = 0
+    for position, seq_len in enumerate(lengths.tolist()):
+        if group_rows > 0 and group_rows + seq_len > max_length:
+            group_sizes.append(position - group_start)
+            group_start, group_rows = position, 0
+        group_rows += seq_len
+    if group_start < len(lengths):
+        group_sizes.append(len(lengths) - group_start)
+
+    len_groups = torch.split(lengths, group_sizes)
+    feature_groups = torch.split(features, [group.sum().item() for group in len_groups])
+    return feature_groups, len_groups
+
+
+@torch.no_grad()
+def encode_batch(
+    audio_tokenizer_encoder,
+    input_features: torch.Tensor,
+    input_lens: torch.Tensor,
+    max_length: int = 256000,
+):
+    """Encode a packed batch in groups of at most ``max_length`` rows; join codes on the last dim."""
+    feature_groups, len_groups = group_by_length(input_features, input_lens, max_length)
+    per_group_codes = []
+    for chunk, chunk_lens in zip(feature_groups, len_groups):
+        # Exactly ``(codes, output_length)`` is expected back; the lengths are not needed.
+        codes, _ = audio_tokenizer_encoder.encode(input_features=chunk, input_lens=chunk_lens, return_codes_only=True)
+        per_group_codes.append(codes)
+    return torch.cat(per_group_codes, dim=-1)
+
+
+def _segment_lengths_for_mel(mel: torch.Tensor, segment_size: int):
+    """Return the lengths of consecutive ``segment_size`` chunks covering ``mel``.
+
+    Every entry equals ``segment_size`` except a shorter final remainder, if any.
+    """
+    full, remainder = divmod(mel.size(0), segment_size)
+    return [segment_size] * full + ([remainder] if remainder > 0 else [])
+
+
+@torch.no_grad()
+def tokenize_audio_batch(mels, audio_tokenizer_encoder, segment_size=6000, device=None):
+    """Tokenize several mels with a single ``encode_batch`` call.
+
+    Each mel is cut into segments of at most ``segment_size`` frames, all segments are encoded
+    together, and the result is a list with one ``[T_i, C]`` code tensor per input mel.
+    """
+    if not mels:
+        return []
+    if device is None:
+        device = next(audio_tokenizer_encoder.parameters()).device
+    segments_per_mel = [_segment_lengths_for_mel(mel, segment_size) for mel in mels]
+    flat_lens = [n for segs in segments_per_mel for n in segs]
+    codes_packed = encode_batch(
+        audio_tokenizer_encoder,
+        input_features=torch.cat([mel.to(device) for mel in mels], dim=0),
+        input_lens=torch.tensor(flat_lens, dtype=torch.long, device=device),
+    )
+    codes = codes_packed.transpose(0, 1).detach()  # [total_code_T, C]
+    pooled = getattr(audio_tokenizer_encoder, "down_sample_layer", None) is not None
+    code_lengths = []
+    for segs in segments_per_mel:
+        out_len = audio_tokenizer_encoder.get_output_length(torch.tensor(segs, dtype=torch.long, device=device))
+        if pooled:
+            # Ceil-divide by the pooling factor to get the per-segment code count.
+            avg = audio_tokenizer_encoder.config.avg_pooler
+            out_len = out_len // avg + (out_len % avg != 0).long()
+        code_lengths.append(out_len.sum().item())
+    return list(torch.split(codes, code_lengths))
+
+
+# ---------------------------------------------------------------------------
+# MimoAudioEncoderConfig
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MimoAudioEncoderConfig:
+    """Settings for ``MimoAudioEncoder``; field names match the keys of the
+    ``audio_config`` dict in the model checkpoint."""
+
+    speech_vocab_size: str = "1025-1025-129-129-129-129-129-129"  # "-"-joined, one entry per audio channel
+    speech_zeroemb_idx: str = "1024-1024-128-128-128-128-128-128"  # per-channel padding ("empty") index
+    group_size: int = 4  # code frames grouped into one output token
+    audio_channels: int = 8
+    input_local_layers: int = 6
+    input_local_dim: int = 1024
+    input_full_attention: bool = True  # True -> the local transformer is called with is_causal=False
+    input_local_attn_heads: int = 64
+    input_local_head_dim: int = 16
+    input_local_intermediate_size: int = 4096
+    input_local_hidden_dropout: float = 0.0
+    out_hidden_size: int = 4096
+    rope_theta: float = 640000.0
+    partial_rotary_factor: float = 0.334
+    projection_layers: int = 1  # 1 = single linear layer, 2 = ``AudioProjection`` MLP
+    add_post_norm: bool = False  # False swaps the local transformer's final norm for Identity
+    audio_segment_size: int = 6000  # max mel frames per tokenizer segment
+
+    @classmethod
+    def from_dict(cls, d: dict) -> "MimoAudioEncoderConfig":
+        """Build a config from ``d``, ignoring keys that are not dataclass fields."""
+        field_names = {spec.name for spec in dataclasses.fields(cls)}
+        recognized = {key: value for key, value in d.items() if key in field_names}
+        return cls(**recognized)
+
+
+# ---------------------------------------------------------------------------
+# AudioProjection
+# ---------------------------------------------------------------------------
+
+
+class AudioProjection(nn.Module):
+    """Two bias-free linear layers with a GELU in between."""
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        output_size: int,
+    ) -> None:
+        super().__init__()
+        expand = nn.Linear(input_size, hidden_size, bias=False)
+        contract = nn.Linear(hidden_size, output_size, bias=False)
+        self.mlp = nn.Sequential(expand, nn.GELU(), contract)  # parameter names stay mlp.0.* / mlp.2.*
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.mlp(x)  # maps the last dim from input_size to output_size
+
+
+# ---------------------------------------------------------------------------
+# MimoAudioEncoder
+# ---------------------------------------------------------------------------
+
+
+class MimoAudioEncoder(nn.Module):
+    """Audio encoder for MiMo-V2-Omni.
+
+    Encodes mel spectrograms into LLM-compatible embeddings via:
+      1. Audio tokenizer (VQ codes)
+      2. Speech embeddings lookup
+      3. Local Qwen2 transformer
+      4. Linear projection
+    """
+
+    def __init__(self, config, model_path: str = "") -> None:
+        """Build the speech embeddings, local transformer and output projection.
+
+        ``config`` is a ``MimoAudioEncoderConfig`` or a dict of its fields. A non-empty
+        ``model_path`` makes the constructor look for ``<model_path>/audio_tokenizer``.
+        """
+        super().__init__()
+        if isinstance(config, dict):
+            config = MimoAudioEncoderConfig.from_dict(config)
+        self.config = config
+        self.audio_channels = config.audio_channels
+        self.audio_group_size = config.group_size
+        self.audio_segment_size = config.audio_segment_size
+
+        speech_vocab_sizes = self._parse_maybe_list(config.speech_vocab_size, config.audio_channels)
+        speech_empty_ids = self._parse_maybe_list(config.speech_zeroemb_idx, config.audio_channels)
+
+        # NOTE(review): ``attention_dropout`` is fed from ``input_local_hidden_dropout`` -- confirm.
+        self.input_local_transformer = Qwen2Model(
+            Qwen2Config(
+                hidden_size=config.input_local_dim,
+                num_hidden_layers=config.input_local_layers,
+                num_attention_heads=config.input_local_attn_heads,
+                num_key_value_heads=config.input_local_attn_heads,
+                intermediate_size=config.input_local_intermediate_size,
+                attention_dropout=config.input_local_hidden_dropout,
+                rope_theta=config.rope_theta,
+                partial_rotary_factor=config.partial_rotary_factor,
+            )
+        )
+        if not config.add_post_norm:
+            # Replace the transformer's final norm with a pass-through.
+            self.input_local_transformer.norm = nn.Identity()
+
+        # One embedding table per audio channel; the "empty" id is the padding index.
+        self.speech_embeddings = nn.ModuleList(
+            [
+                nn.Embedding(speech_vocab_sizes[ch], config.input_local_dim, padding_idx=speech_empty_ids[ch])
+                for ch in range(config.audio_channels)
+            ]
+        )
+
+        # The projection consumes one flattened group: ``group_size`` vectors of ``input_local_dim``.
+        proj_in = config.input_local_dim * config.group_size
+        if config.projection_layers == 1:
+            self.projection = nn.Linear(proj_in, config.out_hidden_size, bias=False)
+        elif config.projection_layers == 2:
+            self.projection = AudioProjection(proj_in, proj_in * 4, config.out_hidden_size)
+        else:
+            raise ValueError(f"Invalid projection_layers: {config.projection_layers}")
+
+        # The tokenizer is optional: without it the attribute stays ``None``.
+        self.audio_tokenizer: MiMoAudioTokenizer | None = None
+        if not model_path:
+            return
+        audio_tokenizer_path = os.path.join(model_path, "audio_tokenizer")
+        if not os.path.exists(audio_tokenizer_path):
+            logger.warning(
+                "Audio tokenizer not found at %s, audio encoding disabled",
+                audio_tokenizer_path,
+            )
+            return
+
+        dev = torch.get_default_device()
+        self.audio_tokenizer = self._load_audio_tokenizer(audio_tokenizer_path, dev)
+
+    @staticmethod
+    def _load_audio_tokenizer(path: str, device: torch.device) -> MiMoAudioTokenizer:
+        """Load a frozen, eval-mode bfloat16 ``MiMoAudioTokenizer`` from ``path`` onto ``device``.
+
+        Raises ``FileNotFoundError`` when ``path`` holds neither supported weight file.
+        """
+        from safetensors.torch import load_file
+        with open(os.path.join(path, "config.json"), encoding="utf-8") as f:
+            config_dict = json.load(f)
+        model = MiMoAudioTokenizer(MiMoAudioTokenizer.config_class(**config_dict))
+        safetensors_path = os.path.join(path, "model.safetensors")
+        bin_path = os.path.join(path, "pytorch_model.bin")
+        if os.path.exists(safetensors_path):
+            state_dict = load_file(safetensors_path, device="cpu")
+        elif os.path.exists(bin_path):
+            state_dict = torch.load(bin_path, map_location="cpu", weights_only=True)
+        else:
+            raise FileNotFoundError(
+                f"No model weights found in {path} (expected model.safetensors or pytorch_model.bin)"
+            )
+        # Loading stays non-strict, but weights left at their random init are reported.
+        missing = model.load_state_dict(state_dict, strict=False).missing_keys
+        if missing:
+            logger.warning("Audio tokenizer weights at %s lack %d keys: %s", path, len(missing), missing)
+        return model.to(device=device, dtype=torch.bfloat16).eval().requires_grad_(False)
+
+    def _parse_maybe_list(self, value, length: int) -> list[int]:
+        if not (isinstance(value, str) and "-" in value):
+            return [int(value)] * length  # scalar: the same value for each of ``length`` entries
+        return list(map(int, value.split("-")))  # "a-b-c" -> [a, b, c]
+
+    def apply_input_local_transformer(self, speech_embeddings: torch.Tensor):
+        """Run the local transformer on ``speech_embeddings``; return its ``last_hidden_state``."""
+        # Causal masking is requested only when full attention is disabled.
+        causal = not self.config.input_full_attention
+        return self.input_local_transformer(
+            inputs_embeds=speech_embeddings, return_dict=True, is_causal=causal
+        ).last_hidden_state
+
+    def apply_speech_embeddings(self, audio_codes: torch.Tensor) -> torch.Tensor:
+        """Embed each channel of ``audio_codes`` with its own table and sum over channels."""
+        summed = torch.zeros(
+            (audio_codes.shape[0], self.config.group_size, self.config.input_local_dim),
+            dtype=next(self.speech_embeddings[0].parameters()).dtype,
+            device=audio_codes.device,
+        )
+        for channel in range(self.config.audio_channels):
+            summed.add_(self.speech_embeddings[channel](audio_codes[:, :, channel]))
+        return summed
+
+    def process_audio(self, audio: torch.Tensor) -> torch.Tensor:
+        """Group audio codes into chunks of ``group_size`` frames.
+
+        If ``T`` is not a multiple of ``group_size``, the final group is completed by
+        repeating the last frame. An empty input yields an empty result.
+
+        Args:
+            audio: [T, C] code tensor with C >= audio_channels; extra channels are dropped.
+
+        Returns:
+            [ceil(T / group_size), group_size, audio_channels]
+        """
+        T = audio.shape[0]
+        audio = audio[:, : self.audio_channels]
+        if T == 0:
+            # There is no last frame to repeat, so return zero groups instead of failing.
+            return audio.reshape(0, self.audio_group_size, self.audio_channels)
+        padded_T = (T + self.audio_group_size - 1) // self.audio_group_size * self.audio_group_size
+        # Adding the last frame to a zero block broadcasts it over every padding row.
+        padding = (
+            torch.zeros(
+                padded_T - T,
+                self.audio_channels,
+                dtype=torch.int32,
+                device=audio.device,
+            )
+            + audio[-1, :]
+        )
+        padded_audio = torch.cat([audio, padding], dim=0)
+        groups = padded_T // self.audio_group_size
+        return padded_audio.reshape(groups, self.audio_group_size, self.audio_channels)
+
+    def get_audio_feature(self, mel_specs: list[torch.Tensor]) -> tuple[torch.Tensor, list[int]]:
+        """Turn mel spectrograms into embeddings in the LLM hidden space.
+
+        Pipeline: tokenize to codes, group the codes per item, embed and sum the channels,
+        run the local transformer, then project each group to ``out_hidden_size``.
+
+        Args:
+            mel_specs: list of mel spectrogram tensors, each [T, n_mels]
+
+        Returns:
+            Tuple of:
+            - audio_embeds: [total_tokens, out_hidden_size] concatenated embeddings
+            - item_token_lens: list of int, number of tokens per input item
+
+        Raises:
+            RuntimeError: if no audio tokenizer was loaded.
+        """
+        if self.audio_tokenizer is None:
+            raise RuntimeError(
+                "audio_tokenizer is not loaded. Ensure model_path points to a directory containing audio_tokenizer/."
+            )
+
+        if not mel_specs:
+            # Nothing to encode: return an empty tensor matching the projection weights.
+            reference = next(self.projection.parameters())
+            return (
+                torch.empty(0, self.config.out_hidden_size, device=reference.device, dtype=reference.dtype),
+                [],
+            )
+
+        tokenizer_encoder = self.audio_tokenizer.encoder
+        code_list = tokenize_audio_batch(
+            mel_specs,
+            tokenizer_encoder,
+            segment_size=self.audio_segment_size,
+            device=next(tokenizer_encoder.parameters()).device,
+        )
+
+        grouped = [self.process_audio(codes) for codes in code_list]
+        item_token_lens = [item.shape[0] for item in grouped]
+        audio_codes = torch.cat(grouped, dim=0)  # [total_T//group_size, group_size, audio_channels]
+
+        # Each group is embedded, contextualized, then flattened into one projected vector.
+        local_out = self.apply_input_local_transformer(self.apply_speech_embeddings(audio_codes))
+        audio_embeds = self.projection(local_out.reshape(local_out.shape[0], -1))
+        return audio_embeds, item_token_lens
diff --git a/aphrodite/model_executor/models/mimo_v2_flash.py b/aphrodite/model_executor/models/mimo_v2.py
similarity index 96%
rename from aphrodite/model_executor/models/mimo_v2_flash.py
rename to aphrodite/model_executor/models/mimo_v2.py
index 7e70585144..44db0c4fb8 100644
--- a/aphrodite/model_executor/models/mimo_v2_flash.py
+++ b/aphrodite/model_executor/models/mimo_v2.py
@@ -6,6 +6,7 @@
 import torch
 from torch import nn
 
+from aphrodite.compilation.decorators import support_torch_compile
 from aphrodite.config import (
     AphroditeConfig,
     CacheConfig,
@@ -254,7 +255,7 @@ def __init__(
             self.total_num_heads * self.v_head_dim,
             hidden_size,
             bias=False,
-            quant_config=quant_config,
+            quant_config=quant_config if "mtp.layers" not in prefix else None,
             reduce_results=True,
             prefix=f"{prefix}.o_proj",
         )
@@ -422,6 +423,7 @@ def is_compressed_softmax_layer(self) -> bool:
         return self.config.hybrid_layer_pattern[self.layer_id] == 1
 
 
+@support_torch_compile
 class MiMoV2Model(nn.Module):
     def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
         super().__init__()
@@ -577,7 +579,13 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
             if expert_matched:
                 continue
-
+            # Support fused qkv_proj checkpoint (Pro format)
+            if "qkv_proj" in name:
+                if name in params_dict:
+                    param = params_dict[name]
+                    loaded_weight = loaded_weight.chunk(tp_size, dim=0)[tp_rank]
+                    default_weight_loader(param, loaded_weight)
+                continue
             stacked_matched = False
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
@@ -633,6 +641,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
 
 class MiMoV2FlashForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
     def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
         super().__init__()
         config = aphrodite_config.model_config.hf_config
@@ -685,3 +698,10 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
+
+
+class MiMoV2ForCausalLM(MiMoV2FlashForCausalLM):
+    packed_modules_mapping = {
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
diff --git a/aphrodite/model_executor/models/mimo_v2_mtp.py b/aphrodite/model_executor/models/mimo_v2_mtp.py
new file mode 100644
index 0000000000..a1bb795385
--- /dev/null
+++ b/aphrodite/model_executor/models/mimo_v2_mtp.py
@@ -0,0 +1,346 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Inference-only MiMo-V2 MTP (Multi-Token Prediction) draft model.
+
+Supports both MiMo-V2-Pro and MiMo-V2-Flash checkpoints.
+
+Checkpoint weight layout (model.mtp.layers.{idx}.*):
+  enorm            - RMSNorm for token embeddings
+  hnorm            - RMSNorm for previous hidden states
+  eh_proj          - ReplicatedLinear(hidden*2 -> hidden)
+  input_layernorm  - pre-attention RMSNorm
+  self_attn.*      - attention weights; format differs by variant:
+                       Pro:   fused qkv_proj  [Q;K;V] concatenated
+                       Flash: separate q_proj, k_proj, v_proj
+  pre_mlp_layernorm - post-attention / pre-MLP RMSNorm
+  mlp.*            - dense MLP (gate_proj / up_proj / down_proj)
+  final_layernorm  - norm applied before logit computation
+"""
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from aphrodite.config import AphroditeConfig
+from aphrodite.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from aphrodite.model_executor.layers.layernorm import RMSNorm
+from aphrodite.model_executor.layers.linear import ReplicatedLinear
+from aphrodite.model_executor.layers.logits_processor import LogitsProcessor
+from aphrodite.model_executor.layers.quantization import QuantizationConfig
+from aphrodite.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from aphrodite.model_executor.model_loader.weight_utils import default_weight_loader
+from aphrodite.sequence import IntermediateTensors
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    _require_is_multimodal,
+)
+from .mimo_v2 import MiMoV2Attention, MiMoV2MLP
+from .utils import _merge_multimodal_embeddings, maybe_prefix
+
+# MiMo-V2 checkpoints contain multiple MTP layers, but Aphrodite currently supports
+# only the first layer and only one speculative token.
+_MIMO_V2_PRO_NUM_MTP_LAYERS = 1
+_MIMO_V2_FLASH_NUM_MTP_LAYERS = 1
+
+
+class MiMoV2MTPLayer(nn.Module):
+    """Single MTP predictor layer for MiMo-V2 (Pro and Flash).
+
+    Mirrors the single-layer MiMo-V2 nextn reference implementation.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        # Predictor head components
+        self.enorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.eh_proj = ReplicatedLinear(config.hidden_size * 2, config.hidden_size, bias=False)
+
+        # The MTP attention layer is built from the config's SWA
+        # (sliding-window attention) settings, i.e. the swa_* fields below.
+        swa_rope_theta = getattr(
+            config,
+            "swa_rope_theta",
+            getattr(config, "rope_theta", 1000000),
+        )
+        sliding_window_size = getattr(config, "sliding_window_size", -1)
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.self_attn = MiMoV2Attention(
+            hidden_size=config.hidden_size,
+            num_heads=config.swa_num_attention_heads,
+            num_kv_heads=config.swa_num_key_value_heads,
+            head_dim=config.swa_head_dim,
+            v_head_dim=getattr(config, "swa_v_head_dim", None),
+            v_scale=getattr(config, "attention_value_scale", None),
+            sliding_window_size=sliding_window_size,
+            attention_bias=config.attention_bias,
+            add_swa_attention_sink_bias=getattr(config, "add_swa_attention_sink_bias", False),
+            layer_id=0,
+            rope_theta=swa_rope_theta,
+            max_position_embeddings=getattr(config, "max_position_embeddings", 32768),
+            quant_config=quant_config,
+            partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0),
+            prefix=f"{prefix}.self_attn",
+        )
+        self.pre_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.mlp = MiMoV2MLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        positions: torch.Tensor,
+        previous_hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        # Combine token embedding and previous hidden state
+        h, _ = self.eh_proj(torch.cat([self.enorm(inputs_embeds), self.hnorm(previous_hidden_states)], dim=-1))
+
+        # Transformer block; pre_mlp_layernorm fuses the residual add with the norm
+        residual = h
+        h = self.input_layernorm(h)
+        h = self.self_attn(positions=positions, hidden_states=h)
+        h, residual = self.pre_mlp_layernorm(h, residual)
+        h = self.mlp(h)
+        h = h + residual
+
+        return self.final_layernorm(h)
+
+
+class _MiMoV2MTPLayers(nn.Module):
+    """Thin wrapper so parameter paths match checkpoint: model.mtp.layers.*"""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        num_mtp_layers: int,
+        quant_config: QuantizationConfig | None,
+        prefix: str,
+    ) -> None:
+        super().__init__()
+        self.layers = nn.ModuleDict(
+            {
+                str(i): MiMoV2MTPLayer(
+                    config=config,
+                    prefix=f"{prefix}.{i}",
+                    quant_config=quant_config,
+                )
+                for i in range(num_mtp_layers)
+            }
+        )
+
+
+class MiMoV2MultiTokenPredictor(nn.Module):
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = "") -> None:
+        super().__init__()
+
+        config = aphrodite_config.model_config.hf_config
+        spec_cfg = aphrodite_config.speculative_config
+        assert spec_cfg is not None
+        if spec_cfg.num_speculative_tokens != 1:
+            raise ValueError("MiMo-V2 MTP in Aphrodite only supports num_speculative_tokens=1.")
+        num_mtp_layers = 1
+
+        self.num_mtp_layers = num_mtp_layers
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+
+        self.mtp = _MiMoV2MTPLayers(
+            config=config,
+            num_mtp_layers=num_mtp_layers,
+            quant_config=aphrodite_config.quant_config,
+            prefix=maybe_prefix(prefix, "mtp.layers"),
+        )
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        previous_hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_input_ids(input_ids)
+        return self.mtp.layers[str(spec_step_idx)](inputs_embeds, positions, previous_hidden_states)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: ParallelLMHead,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
+        return self.logits_processor(lm_head, hidden_states)
+
+
+class MiMoV2MTP(nn.Module):
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = "") -> None:
+        super().__init__()
+        self.config = aphrodite_config.model_config.hf_config
+        self.model = MiMoV2MultiTokenPredictor(aphrodite_config=aphrodite_config, prefix=maybe_prefix(prefix, "model"))
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
+        return self.model(input_ids, positions, hidden_states, inputs_embeds, spec_step_idx)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor | None:
+        assert spec_step_idx == 0, "MiMo-V2 MTP only supports one speculative token."
+        return self.model.compute_logits(hidden_states, self.lm_head, spec_step_idx)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        tp_rank = get_tensor_model_parallel_rank()
+        tp_size = get_tensor_model_parallel_world_size()
+
+        stacked_params_mapping = [
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+            # Flash format: separate projections → fused qkv_proj
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            # Only load MTP-related weights, shared embeddings, and lm_head
+            if "model.mtp" not in name and "model.embed_tokens" not in name and not name.startswith("lm_head"):
+                continue
+
+            # Support fused qkv_proj checkpoint (Pro format).
+            # The checkpoint is stored pre-sharded for TP=8 as
+            # [Q_rank0, K_rank0, V_rank0, Q_rank1, ...], so chunk(tp_size)
+            # along dim 0 gives each rank its Q+K+V slice for both the FP8
+            # weight and the block weight_scale_inv (same as the main model).
+            # NOTE(review): presumably only valid when tp_size matches it.
+            if "qkv_proj" in name:
+                if name in params_dict:
+                    param = params_dict[name]
+                    loaded_weight = loaded_weight.chunk(tp_size, dim=0)[tp_rank]
+                    default_weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+                continue
+
+            # gate_proj/up_proj → gate_up_proj stacking (both formats);
+            # Flash: q_proj/k_proj/v_proj → qkv_proj merging.
+            stacked_matched = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name_rewritten = name.replace(weight_name, param_name)
+                if name_rewritten.endswith(".bias") and name_rewritten not in params_dict:
+                    continue
+                if name_rewritten not in params_dict:
+                    continue
+                param = params_dict[name_rewritten]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name_rewritten)
+                stacked_matched = True
+                break
+
+            if stacked_matched:
+                continue
+
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if name not in params_dict:
+                continue
+
+            param = params_dict[name]
+            # attention_sink_bias is head-parallel; slice by tp
+            if "attention_sink_bias" in name:
+                total_heads = loaded_weight.shape[0]
+                heads_per_rank = total_heads // tp_size
+                loaded_weight = loaded_weight.narrow(0, tp_rank * heads_per_rank, heads_per_rank)
+
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        return loaded_params
+
+
+class MiMoV2OmniMTP(MiMoV2MTP, SupportsMultiModal):
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self._embed_text_input_ids(
+            input_ids,
+            self.model.embed_input_ids,
+            is_multimodal=is_multimodal,
+        )
+
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+
+        is_multimodal = _require_is_multimodal(is_multimodal)
+
+        inputs_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
+
+        return inputs_embeds
diff --git a/aphrodite/model_executor/models/mimo_v2_omni.py b/aphrodite/model_executor/models/mimo_v2_omni.py
new file mode 100644
index 0000000000..984166aec1
--- /dev/null
+++ b/aphrodite/model_executor/models/mimo_v2_omni.py
@@ -0,0 +1,1417 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from functools import partial
+from typing import Any
+
+import einops
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BatchFeature, PretrainedConfig
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
+
+from aphrodite.config import AphroditeConfig
+from aphrodite.config.multimodal import BaseDummyOptions
+from aphrodite.distributed import parallel_state
+from aphrodite.distributed import utils as dist_utils
+from aphrodite.inputs import MultiModalDataDict
+from aphrodite.model_executor.layers.activation import get_act_and_mul_fn
+from aphrodite.model_executor.layers.attention import MMEncoderAttention
+from aphrodite.model_executor.layers.layernorm import RMSNorm
+from aphrodite.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from aphrodite.model_executor.layers.quantization import QuantizationConfig
+from aphrodite.model_executor.layers.rotary_embedding import get_rope
+from aphrodite.model_executor.layers.rotary_embedding.common import ApplyRotaryEmb
+from aphrodite.model_executor.model_loader.weight_utils import default_weight_loader
+from aphrodite.model_executor.models.vision import is_vit_use_data_parallel
+from aphrodite.multimodal import MULTIMODAL_REGISTRY
+from aphrodite.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
+from aphrodite.multimodal.parse import ImageSize, MultiModalDataItems
+from aphrodite.multimodal.processing import (
+    BaseDummyInputsBuilder,
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from aphrodite.transformers_utils.configs.mimo_v2_omni import Mimo_VLVisionConfig
+from aphrodite.transformers_utils.processors.mimo_v2_omni import (
+    MiMoOmniProcessor,
+    VideoAudioInput,
+    _format_timestamp,
+)
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from .mimo_audio import MimoAudioEncoder
+from .mimo_v2 import MiMoV2FlashForCausalLM
+from .qwen2_5_vl import (
+    Qwen2_5_VisionMLP,
+    Qwen2_5_VisionPatchEmbed,
+    Qwen2_5_VLImageEmbeddingInputs,
+    Qwen2_5_VLImageInputs,
+    Qwen2_5_VLImagePixelInputs,
+    Qwen2_5_VLVideoEmbeddingInputs,
+    Qwen2_5_VLVideoInputs,
+    Qwen2_5_VLVideoPixelInputs,
+)
+from .qwen2_vl import _create_qwen2vl_field_factory
+from .utils import AutoWeightsLoader, IntermediateTensors, WeightsMapper, maybe_prefix
+
+
+class MiMoVisionMLP(Qwen2_5_VisionMLP):
+    pass
+
+
+class MiMoVisionPatchEmbed(Qwen2_5_VisionPatchEmbed):
+    pass
+
+
+class MiMoVisionPatchMerger(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        spatial_merge_size: int = 2,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        use_data_parallel = is_vit_use_data_parallel()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.ln_q = norm_layer(context_dim)
+
+        self.mlp = nn.Sequential(
+            ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp.0",
+                return_bias=False,
+                disable_tp=use_data_parallel,
+            ),
+            nn.GELU(),
+            RowParallelLinear(
+                self.hidden_size,
+                d_model,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp.2",
+                return_bias=False,
+                disable_tp=use_data_parallel,
+            ),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ln_q(x)
+        x = x.view(-1, self.hidden_size)
+        out = self.mlp(x)
+        return out
+
+
+class MiMoVisionAttention(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        num_kv_heads: int,
+        qk_channels: int,
+        kv_channels: int,
+        use_sink: bool = False,
+        visual_token_window_size: int = 64,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        use_data_parallel = is_vit_use_data_parallel()
+        self.tp_size = 1 if use_data_parallel else parallel_state.get_tensor_model_parallel_world_size()
+        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.qk_channels = qk_channels
+        self.kv_channels = kv_channels
+        self.embed_dim = embed_dim
+
+        self.num_heads_per_partition = dist_utils.divide(num_heads, self.tp_size)
+        self.num_kv_heads_per_partition = dist_utils.divide(num_kv_heads, self.tp_size)
+
+        # Attention scale uses the Q/K head dimension (qk_channels)
+        self.scale = qk_channels**-0.5
+
+        # QKV: Q is (num_heads * qk_channels), KV are (num_kv_heads * kv_channels)
+        self.qkv = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=qk_channels,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_kv_heads,
+            v_head_size=kv_channels,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+            disable_tp=use_data_parallel,
+        )
+
+        # Output projection: input is (num_heads * kv_channels) after attention
+        self.proj = RowParallelLinear(
+            input_size=num_heads * kv_channels,
+            output_size=embed_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
+        )
+
+        # For full attention (non-window blocks)
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_heads_per_partition,
+            head_size=kv_channels,
+            scale=self.scale,
+            num_kv_heads=self.num_kv_heads_per_partition,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Rotary embeddings applied separately to Q and K
+        self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)
+
+        # Sink attention weights (loaded but not used in Aphrodite flash_attn)
+        # The checkpoint stores these only for non-full-attention blocks
+        self.use_sink = use_sink
+        if use_sink:
+            self.sinks = nn.Parameter(
+                torch.empty(num_heads),
+                requires_grad=False,
+            )
+        else:
+            self.sinks = None
+
+        self.visual_token_window_size = visual_token_window_size
+
+    def _forward_window_attn(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: torch.Tensor,
+    ) -> torch.Tensor:
+        """Window attention via flash_attn_varlen_func with window_size."""
+        from aphrodite.aphrodite_flash_attn import flash_attn_varlen_func
+
+        w = self.visual_token_window_size
+        output = flash_attn_varlen_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+            softmax_scale=self.scale,
+            causal=False,
+            window_size=[w, w],
+        )
+        return output
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+        max_seqlen: torch.Tensor,
+        full_attn: bool = True,
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: [seq_len, batch=1, embed_dim]  (seq-first convention)
+            cu_seqlens: cumulative sequence lengths [num_seqs+1], int32
+            rotary_pos_emb_cos: [seq_len, qk_channels // 2]
+            rotary_pos_emb_sin: [seq_len, qk_channels // 2]
+            max_seqlen: maximum sequence length
+            full_attn: if True, full attention; if False, window attention
+        """
+        # [seq_len, 1, embed_dim] -> QKV projection
+        qkv, _ = self.qkv(x)  # [seq_len, 1, q_size + kv_size + kv_size]
+        seq_len, batch_size, _ = qkv.shape
+
+        q_size = self.num_heads_per_partition * self.qk_channels
+        kv_size = self.num_kv_heads_per_partition * self.kv_channels
+        q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
+
+        # Rearrange to [batch, seq, head, head_dim] for rotary application
+        q = einops.rearrange(q, "s b (h d) -> b s h d", h=self.num_heads_per_partition)
+        k = einops.rearrange(k, "s b (h d) -> b s h d", h=self.num_kv_heads_per_partition)
+        v = einops.rearrange(v, "s b (h d) -> b s h d", h=self.num_kv_heads_per_partition)
+
+        # Apply rotary embeddings to Q and K independently (handles GQA)
+        if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None:
+            q = self.apply_rotary_emb(q, rotary_pos_emb_cos, rotary_pos_emb_sin)
+            k = self.apply_rotary_emb(k, rotary_pos_emb_cos, rotary_pos_emb_sin)
+
+        if full_attn:
+            # Full attention via MMEncoderAttention
+            # Flatten to [batch, seq, heads * head_dim]
+            q_flat = q.reshape(batch_size, seq_len, -1)
+            k_flat = k.reshape(batch_size, seq_len, -1)
+            v_flat = v.reshape(batch_size, seq_len, -1)
+            context_layer = self.attn(
+                query=q_flat,
+                key=k_flat,
+                value=v_flat,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+            )
+            # context_layer: [batch, seq, num_heads, head_dim] or [batch, seq, hidden]
+            # Ensure shape is [seq, batch, num_heads * kv_channels]
+            if context_layer.dim() == 4:
+                context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+            else:
+                context_layer = einops.rearrange(context_layer, "b s d -> s b d").contiguous()
+        else:
+            # Window attention via flash_attn_varlen_func with window_size
+            # Merge batch and sequence dims: [batch * seq, head, head_dim]
+            q_varlen = einops.rearrange(q, "b s h d -> (b s) h d")
+            k_varlen = einops.rearrange(k, "b s h d -> (b s) h d")
+            v_varlen = einops.rearrange(v, "b s h d -> (b s) h d")
+            output = self._forward_window_attn(q_varlen, k_varlen, v_varlen, cu_seqlens, max_seqlen)
+            # output: [total_tokens, num_heads, kv_channels]
+            context_layer = einops.rearrange(output, "(b s) h d -> s b (h d)", b=batch_size).contiguous()
+
+        output, _ = self.proj(context_layer)
+        return output
+
+
+class MiMoVisionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        num_kv_heads: int,
+        qk_channels: int,
+        kv_channels: int,
+        mlp_hidden_dim: int,
+        act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        norm_eps: float = 1e-6,
+        use_sink: bool = False,
+        visual_token_window_size: int = 64,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.norm1 = RMSNorm(dim, eps=norm_eps)
+        self.norm2 = RMSNorm(dim, eps=norm_eps)
+        self.attn = MiMoVisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            qk_channels=qk_channels,
+            kv_channels=kv_channels,
+            use_sink=use_sink,
+            visual_token_window_size=visual_token_window_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.mlp = MiMoVisionMLP(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_fn=act_fn,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+        max_seqlen: torch.Tensor,
+        full_attn: bool = True,
+    ) -> torch.Tensor:
+        # x: [seq_len, batch=1, dim]
+        x_attn = self.attn(
+            self.norm1(x),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+            max_seqlen=max_seqlen,
+            full_attn=full_attn,
+        )
+        # Fused residual add + norm2
+        x_norm, residual = self.norm2(x, residual=x_attn)
+        x = residual + self.mlp(x_norm)
+        return x
+
+
+class MiMoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        vision_cfg: PretrainedConfig,
+        *,
+        norm_eps: float = 1e-6,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.spatial_merge_size = vision_cfg.spatial_merge_size
+        self.spatial_merge_unit = self.spatial_merge_size**2
+        self.fullatt_block_indexes = vision_cfg.fullatt_block_indexes
+        self.vit_window_attn_types = vision_cfg.vit_window_attn_types
+        self.visual_token_window_size = vision_cfg.visual_token_window_size
+        self.hidden_size = vision_cfg.hidden_size
+        self.num_heads = vision_cfg.num_heads
+        self.num_kv_heads = vision_cfg.num_key_value_heads
+        self.qk_channels = vision_cfg.qk_channels
+        self.kv_channels = vision_cfg.kv_channels
+
+        self.patch_embed = MiMoVisionPatchEmbed(
+            patch_size=vision_cfg.patch_size,
+            temporal_patch_size=vision_cfg.temporal_patch_size,
+            in_channels=vision_cfg.in_channels,
+            hidden_size=vision_cfg.hidden_size,
+        )
+
+        norm_layer = partial(RMSNorm, eps=norm_eps)
+
+        # Rotary embedding for 2D positions.
+        # With partial_rotary_factor=0.5 and head_size=qk_channels:
+        #   rotary_dim = qk_channels // 2
+        #   get_cos_sin returns cos, sin each of shape [pos, rotary_dim // 2]
+        # After indexing with 2D pos_ids and flattening:
+        #   result shape = [tokens, rotary_dim] = [tokens, qk_channels // 2]
+        # which is what ApplyRotaryEmb expects as cos/sin input.
+        self.rotary_pos_emb = get_rope(
+            head_size=vision_cfg.qk_channels,
+            max_position=8192,
+            is_neox_style=True,
+            rope_parameters={"partial_rotary_factor": 0.5},
+        )
+
+        self.blocks = nn.ModuleList(
+            [
+                MiMoVisionBlock(
+                    dim=vision_cfg.hidden_size,
+                    num_heads=vision_cfg.num_heads,
+                    num_kv_heads=vision_cfg.num_key_value_heads,
+                    qk_channels=vision_cfg.qk_channels,
+                    kv_channels=vision_cfg.kv_channels,
+                    mlp_hidden_dim=vision_cfg.intermediate_size,
+                    act_fn=get_act_and_mul_fn(vision_cfg.hidden_act),
+                    norm_eps=norm_eps,
+                    use_sink=(vision_cfg.use_sink and i not in vision_cfg.fullatt_block_indexes),
+                    visual_token_window_size=vision_cfg.visual_token_window_size,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{i}",
+                )
+                for i in range(vision_cfg.depth)
+            ]
+        )
+
+        self.merger = MiMoVisionPatchMerger(
+            d_model=vision_cfg.out_hidden_size,
+            context_dim=vision_cfg.hidden_size,
+            norm_layer=norm_layer,
+            spatial_merge_size=vision_cfg.spatial_merge_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.merger",
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def apply_index(self, tensor: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
+        """Reindex tensor at the spatial_merge_unit granularity."""
+        tensor = tensor.unflatten(0, (-1, self.spatial_merge_unit))
+        tensor = tensor[index]
+        tensor = tensor.flatten(0, 1)
+        return tensor
+
+    def get_window_index_1d(self, grid_thw: torch.Tensor, col: bool = True) -> torch.Tensor:
+        """Compute 1D window indices for col-based or row-based SWA reordering."""
+        window_index: list[torch.Tensor] = []
+        window_index_id = 0
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h = grid_h // self.spatial_merge_size
+            llm_grid_w = grid_w // self.spatial_merge_size
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
+            index_new = index.transpose(1, 2).reshape(-1) if col else index.reshape(-1)
+            window_index.append(index_new + window_index_id)
+            window_index_id += int((grid_t * llm_grid_h * llm_grid_w).item())
+        return torch.cat(window_index, dim=0)
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """Compute 2D rotary position embedding cos/sin for given grid sizes.
+
+        Returns:
+            cos: [total_tokens, qk_channels // 2]
+            sin: [total_tokens, qk_channels // 2]
+        """
+        cos_list, sin_list = [], []
+        for i in range(grid_thw.size(0)):
+            t, h, w = int(grid_thw[i, 0]), int(grid_thw[i, 1]), int(grid_thw[i, 2])
+
+            # Build 2D position IDs with spatial_merge_size interleaving
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
+            # pos_ids: [t*h*w, 2]
+
+            max_grid_size = max(h, w)
+            # get_cos_sin returns cos, sin each of shape [max_grid_size, rotary_dim//2]
+            # where rotary_dim = qk_channels // 2 (from partial_rotary_factor=0.5)
+            cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
+
+            # [t*h*w, 2, rotary_dim//2] -> [t*h*w, rotary_dim] (= qk_channels // 2)
+            cos_img = cos[pos_ids].flatten(1)
+            sin_img = sin[pos_ids].flatten(1)
+            cos_list.append(cos_img)
+            sin_list.append(sin_img)
+
+        return torch.cat(cos_list, dim=0), torch.cat(sin_list, dim=0)
+
+    def forward(self, x: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: [total_tokens, C] pre-flattened patches
+            grid_thw: [num_images, 3] tensor of (t, h, w) for each image/video
+        Returns:
+            [merged_tokens, out_hidden_size]
+        """
+        # Ensure grid_thw is a tensor
+        if not isinstance(grid_thw, torch.Tensor):
+            grid_thw = torch.tensor(grid_thw, dtype=torch.long)
+
+        # Move to visual model device/dtype
+        x = x.to(device=self.device, dtype=self.dtype)
+
+        # Patch embedding: [total_tokens, hidden_size]
+        x = self.patch_embed(x)
+
+        # Compute 2D rotary positional embeddings
+        # cos, sin: [total_tokens, qk_channels // 2]
+        rotary_cos, rotary_sin = self.rot_pos_emb(grid_thw)
+        rotary_cos = rotary_cos.to(device=x.device)
+        rotary_sin = rotary_sin.to(device=x.device)
+
+        # Compute cu_seqlens for flash_attn (per-image/video sequence lengths)
+        seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
+        cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], device=x.device, dtype=torch.int32),
+                seqlens.cumsum(dim=0).to(device=x.device, dtype=torch.int32),
+            ]
+        )
+        max_seqlen = seqlens.max()
+
+        # Precompute col-based window index for type=1 (col SWA) layers
+        window_index_1d_col = self.get_window_index_1d(grid_thw, col=True).to(device=x.device)
+        reverse_window_index_1d_col = torch.argsort(window_index_1d_col)
+
+        # Col-based rotary embeddings (reordered at spatial_merge_unit granularity).
+        # apply_index reorders groups of spatial_merge_unit tokens, just like x.
+        col_cos = self.apply_index(rotary_cos, window_index_1d_col)
+        col_sin = self.apply_index(rotary_sin, window_index_1d_col)
+
+        # Add batch dimension: [total_tokens, 1, hidden_size]
+        x = x.unsqueeze(1)
+
+        for i, blk in enumerate(self.blocks):
+            window_attn_type = self.vit_window_attn_types[i]
+
+            # Reorder tokens to col-based layout when entering col-SWA region
+            if window_attn_type == 1 and (i == 0 or self.vit_window_attn_types[i - 1] != 1):
+                x = self.apply_index(x, window_index_1d_col)
+
+            # Restore row-based order when leaving col-SWA region
+            if i > 0 and window_attn_type != 1 and self.vit_window_attn_types[i - 1] == 1:
+                x = self.apply_index(x, reverse_window_index_1d_col)
+
+            # Use col-based embeddings for col-SWA layers
+            cos_now = col_cos if window_attn_type == 1 else rotary_cos
+            sin_now = col_sin if window_attn_type == 1 else rotary_sin
+
+            full_attn = i in self.fullatt_block_indexes
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=cos_now,
+                rotary_pos_emb_sin=sin_now,
+                max_seqlen=max_seqlen,
+                full_attn=full_attn,
+            )
+
+        # Restore row-based order if last block was col-SWA
+        if self.vit_window_attn_types[-1] == 1:
+            x = self.apply_index(x, reverse_window_index_1d_col)
+
+        # Remove batch dim and merge spatial tokens
+        # x: [total_tokens, 1, hidden_size] -> [total_tokens, hidden_size]
+        x = x.squeeze(1)
+        x = self.merger(x)
+        return x
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("mlp.gate_up_proj", "mlp.gate_proj", 0),
+            ("mlp.gate_up_proj", "mlp.up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class MiMoV2OmniProcessingInfo(BaseProcessingInfo):
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return {"audio": None, "image": None, "video": None}
+
+    def get_hf_config(self):
+        config = self.ctx.get_hf_config()
+        if isinstance(config.vision_config, dict):
+            config.vision_config = Mimo_VLVisionConfig.from_dict(config.vision_config)
+        return config
+
+    def get_hf_processor(self, **kwargs: object) -> MiMoOmniProcessor:
+        hf_config = self.get_hf_config()
+        tokenizer = self.get_tokenizer()
+        return MiMoOmniProcessor.from_hf_config(tokenizer, hf_config)
+
+    def get_image_processor(self, **kwargs: object):
+        return self.get_hf_processor(**kwargs).image_processor
+
+    def get_data_parser(self):
+        from aphrodite.multimodal.parse import MultiModalDataParser
+
+        return MultiModalDataParser(target_sr=24000.0)
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        return {
+            "image": self.get_max_image_tokens(),
+            "video": self.get_max_video_tokens(seq_len, mm_counts),
+        }
+
+    def _get_vision_info(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int = 1,
+        do_resize: bool = True,
+        image_processor,
+        mm_kwargs: Mapping[str, object],
+    ) -> tuple[ImageSize, int]:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+        merge_size = vision_config.spatial_merge_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        tokens_per_second = vision_config.tokens_per_second
+
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}
+
+        if do_resize:
+            resized_height, resized_width = smart_resize(
+                height=image_height,
+                width=image_width,
+                factor=patch_size * merge_size,
+                min_pixels=size["shortest_edge"],
+                max_pixels=size["longest_edge"],
+            )
+            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
+        else:
+            preprocessed_size = ImageSize(width=image_width, height=image_height)
+
+        # For video, MiMo resamples to tokens_per_second fps before temporal patching,
+        # effective tokens = num_frames * tokens_per_second / temporal_patch_size.
+        # For images (num_frames == 1) no resampling is applied.
+        if num_frames > 1:
+            effective_frames = num_frames * tokens_per_second
+        else:
+            effective_frames = num_frames
+        padded_num_frames = effective_frames + effective_frames % temporal_patch_size
+        grid_t = max(padded_num_frames // temporal_patch_size, 1)
+        grid_h = preprocessed_size.height // patch_size
+        grid_w = preprocessed_size.width // patch_size
+        num_patches = grid_t * grid_h * grid_w
+        num_vision_tokens = num_patches // (merge_size**2)
+        return preprocessed_size, num_vision_tokens
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        image_processor,
+        mm_kwargs: Mapping[str, object],
+    ) -> int:
+        _, num_image_tokens = self._get_vision_info(
+            image_width=image_width,
+            image_height=image_height,
+            num_frames=1,
+            image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
+        )
+        return num_image_tokens
+
+    def get_num_video_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int,
+        image_processor,
+        mm_kwargs: Mapping[str, object],
+    ) -> int:
+        _, num_video_tokens = self._get_vision_info(
+            image_width=image_width,
+            image_height=image_height,
+            num_frames=num_frames,
+            image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
+        )
+        return num_video_tokens
+
+    def get_image_size_with_most_features(self, max_pixels: int | None = None) -> ImageSize:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+        merge_size = vision_config.spatial_merge_size
+
+        if max_pixels is None:
+            image_processor = self.get_image_processor()
+            mm_kwargs = self.ctx.get_merged_mm_kwargs({})
+            size = image_processor.size
+            if override_size := mm_kwargs.get("size"):
+                size = size | override_size
+            if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+                size = size | {"shortest_edge": override_min_pixels}
+            if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+                size = size | {"longest_edge": override_max_pixels}
+            max_pixels = size["longest_edge"]
+
+        unit = patch_size * merge_size
+        max_seq_len = max_pixels // (unit * unit)
+
+        def closest_factor_pair(n: int) -> tuple[int, int]:
+            for d in range(math.isqrt(n), 0, -1):
+                if n % d == 0:
+                    return d, n // d
+            return 1, n
+
+        height_factor, width_factor = 1, max_seq_len
+        for seq_len in range(max_seq_len, 0, -1):
+            height_factor, width_factor = closest_factor_pair(seq_len)
+            if width_factor / height_factor <= 200:
+                break
+
+        return ImageSize(width=unit * width_factor, height=unit * height_factor)
+
+    def get_max_image_tokens(self) -> int:
+        image_processor = self.get_image_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            image_processor=image_processor,
+            mm_kwargs={},
+        )
+
+    def _get_max_video_frames(self, max_tokens: int, start_num_frames: int = 1) -> int:
+        image_processor = self.get_image_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+        num_frames = start_num_frames
+        while True:
+            next_num_frames = num_frames + 1
+            next_max_tokens = self.get_num_video_tokens(
+                image_width=target_width,
+                image_height=target_height,
+                num_frames=next_num_frames,
+                image_processor=image_processor,
+                mm_kwargs={},
+            )
+            if next_max_tokens > max_tokens:
+                break
+            num_frames = next_num_frames
+        return num_frames
+
+    def get_num_frames_with_most_features(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        max_frames_per_video: int = 14,
+    ) -> int:
+        max_videos = mm_counts.get("video", 0)
+        max_total_frames = self._get_max_video_frames(seq_len)
+        max_frames_per_video = min(max_total_frames // max(max_videos, 1), max_frames_per_video)
+        return max(max_frames_per_video, 1)
+
+    def get_max_video_tokens(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        image_processor = self.get_image_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+        return self.get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
+            image_processor=image_processor,
+            mm_kwargs={},
+        )
+
+
+class MiMoV2OmniMultiModalProcessor(BaseMultiModalProcessor[MiMoV2OmniProcessingInfo]):
+    """Aphrodite multimodal processor for MiMo-Omni (image + video).
+
+    Key differences from Qwen2.5-VL:
+    - Videos use timestamp tokens between temporal grid positions.
+    - The HF processor expects ``(TCHW_tensor, timestamps_T_tensor)`` video
+      tuples rather than plain numpy arrays.
+    - ``video_start_times`` is tracked so prompt-update reconstruction can
+      regenerate the exact same timestamp token IDs.
+    """
+
+    # fps assumed for aphrodite-decoded video (numpy T,H,W,C arrays).
+    # The video loader samples ~32 frames; treat each frame as 1 s apart so
+    # MiMoVLProcessor sees 1 fps input and resamples internally.
+    _INPUT_FPS: float = 1.0
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        merge_size = self.info.get_hf_config().vision_config.spatial_merge_size
+        fields: dict[str, MultiModalFieldConfig] = dict(
+            **_create_qwen2vl_field_factory(merge_size)(hf_inputs),
+            second_per_grid_ts=MultiModalFieldConfig.batched("video"),
+            video_start_times=MultiModalFieldConfig.batched("video"),
+            audio_features=MultiModalFieldConfig.batched("audio"),
+            audio_token_lens=MultiModalFieldConfig.batched("audio"),
+        )
+        # video_audio fields: only present when video_audio content was processed
+        if "video_audio_n_segs" in hf_inputs:
+            fields["video_audio_n_segs"] = MultiModalFieldConfig.batched("video")
+        # video_audio_seg_lens: list of per-video 1D tensors, batched("video")
+        if "video_audio_seg_lens" in hf_inputs:
+            fields["video_audio_seg_lens"] = MultiModalFieldConfig.batched("video")
+        if "va_audio_features" in hf_inputs:
+            fields["va_audio_features"] = MultiModalFieldConfig.batched("va_audio")
+        return fields
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """Convert numpy video arrays to (TCHW, timestamps) tuples for MiMo.
+        Also remap 'audios' → 'audio' since MiMoOmniProcessor.__call__ uses
+        the singular form.
+        """
+        # Remap audios → audio (MiMoOmniProcessor uses singular param name)
+        if "audios" in mm_data:
+            mm_data = {**mm_data, "audio": mm_data["audios"]}
+            mm_data = {k: v for k, v in mm_data.items() if k != "audios"}
+
+        # Handle video_audio items: convert video part to (TCHW, timestamps) tuple
+        if "video_audio" in mm_data:
+            va_converted: list[VideoAudioInput] = []
+            for va_item in mm_data["video_audio"]:
+                if isinstance(va_item, VideoAudioInput):
+                    vid = va_item.video
+                else:
+                    # Expect a bare (video_frames, audio_source) tuple; wrap
+                    # it so the audio part is reachable as va_item.audio below.
+                    vid, audio_src = va_item
+                    va_item = VideoAudioInput(video=vid, audio=audio_src)
+                # Convert video frames to (TCHW, timestamps) if needed
+                if (
+                    isinstance(vid, tuple)
+                    and len(vid) == 2
+                    and isinstance(vid[0], torch.Tensor)
+                    and isinstance(vid[1], torch.Tensor)
+                ):
+                    va_converted.append(va_item)
+                else:
+                    if isinstance(vid, np.ndarray):
+                        frames = torch.from_numpy(vid)
+                    elif isinstance(vid, torch.Tensor):
+                        frames = vid
+                    else:
+                        frames = torch.tensor(np.array(vid))
+                    if frames.ndim == 4 and frames.shape[-1] in (1, 3, 4):
+                        frames = frames.permute(0, 3, 1, 2).float()
+                    else:
+                        frames = frames.float()
+                    T = frames.shape[0]
+                    timestamps = torch.arange(T, dtype=torch.float32) / self._INPUT_FPS
+                    va_converted.append(
+                        VideoAudioInput(
+                            video=(frames, timestamps),
+                            audio=va_item.audio,
+                        )
+                    )
+            mm_data = {**mm_data, "video_audio": va_converted}
+
+        if "videos" in mm_data:
+            converted: list[tuple[torch.Tensor, torch.Tensor]] = []
+            for video in mm_data["videos"]:
+                if (
+                    isinstance(video, tuple)
+                    and len(video) == 2
+                    and isinstance(video[0], torch.Tensor)
+                    and isinstance(video[1], torch.Tensor)
+                ):
+                    # already in MiMo format
+                    converted.append(video)
+                else:
+                    # numpy (T, H, W, C) or torch (T, H, W, C) / (T, C, H, W)
+                    if isinstance(video, np.ndarray):
+                        frames = torch.from_numpy(video)
+                    elif isinstance(video, torch.Tensor):
+                        frames = video
+                    else:
+                        frames = torch.tensor(np.array(video))
+
+                    if frames.ndim == 4 and frames.shape[-1] in (1, 3, 4):
+                        # THWC → TCHW
+                        frames = frames.permute(0, 3, 1, 2).float()
+                    else:
+                        frames = frames.float()
+
+                    T = frames.shape[0]
+                    timestamps = torch.arange(T, dtype=torch.float32) / self._INPUT_FPS
+                    converted.append((frames, timestamps))
+
+            mm_data = {**mm_data, "videos": converted}
+
+        return super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, Any],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        hf_config = self.info.get_hf_config()
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        merge_size = hf_config.vision_config.spatial_merge_size
+        p = hf_processor.mimo_processor
+
+        image_pad_id = vocab[hf_processor.image_token]
+        video_pad_id = vocab[hf_processor.video_token]
+        audio_pad_id = vocab.get("<|audio_pad|>")
+        vision_start_id = p.vision_start_token_id
+        vision_end_id = p.vision_end_token_id
+        video_start_id = p.video_start_token_id
+        video_end_id = p.video_end_token_id
+        audio_start_id = p.audio_start_token_id
+        audio_end_id = p.audio_end_token_id
+
+        def get_image_replacement(item_idx: int) -> PromptUpdateDetails:
+            out_item = out_mm_kwargs["image"][item_idx]
+            grid_thw = out_item["image_grid_thw"].data
+            n_tokens = int(grid_thw.prod()) // merge_size**2
+            return [image_pad_id] * n_tokens
+
+        def get_video_replacement(item_idx: int) -> PromptUpdateDetails:
+            out_item = out_mm_kwargs["video"][item_idx]
+            grid_thw = out_item["video_grid_thw"].data
+            spt = float(out_item["second_per_grid_ts"].data)
+            start = float(out_item["video_start_times"].data)
+
+            T, H, W = map(int, grid_thw)
+            n_per_grid = H * W // (merge_size * merge_size)
+
+            # Check if this is a video_audio item
+            n_segs_field = out_item.get("video_audio_n_segs")
+            n_segs_val = int(n_segs_field.data) if n_segs_field is not None else 0
+            va_seg_lens: list[int] | None = None
+            if n_segs_val > 0:
+                seg_lens_field = out_item.get("video_audio_seg_lens")
+                if seg_lens_field is not None:
+                    va_seg_lens = seg_lens_field.data[:n_segs_val].tolist()
+
+            full: list[int] = [video_start_id]
+            is_embed_mask: list[bool] = [False]
+
+            if va_seg_lens is None:
+                # Regular video: timestamp + vision tokens per grid
+                for j in range(T):
+                    ts_text = _format_timestamp(start + j * spt)
+                    ts_ids = tokenizer.encode(ts_text, add_special_tokens=False)
+                    full.extend(ts_ids)
+                    is_embed_mask.extend([False] * len(ts_ids))
+                    full.append(vision_start_id)
+                    is_embed_mask.append(False)
+                    full.extend([video_pad_id] * n_per_grid)
+                    is_embed_mask.extend([True] * n_per_grid)
+                    full.append(vision_end_id)
+                    is_embed_mask.append(False)
+            else:
+                # video_audio: interleaved vision+audio per group
+                if None in (audio_pad_id, audio_start_id, audio_end_id):
+                    raise ValueError("video_audio prompt needs audio pad/start/end token ids, but some are undefined")
+                n_groups = len(va_seg_lens)
+                frames_per_group = T // n_groups  # 1 for il=0, T for il=-1
+                for g, seg_len in enumerate(va_seg_lens):
+                    # Timestamp for first frame of this group
+                    ts_text = _format_timestamp(start + g * frames_per_group * spt)
+                    ts_ids = tokenizer.encode(ts_text, add_special_tokens=False)
+                    full.extend(ts_ids)
+                    is_embed_mask.extend([False] * len(ts_ids))
+                    # Vision tokens for all frames in this group
+                    for _ in range(frames_per_group):
+                        full.append(vision_start_id)
+                        is_embed_mask.append(False)
+                        full.extend([video_pad_id] * n_per_grid)
+                        is_embed_mask.extend([True] * n_per_grid)
+                        full.append(vision_end_id)
+                        is_embed_mask.append(False)
+                    # Audio tokens for this group
+                    full.append(audio_start_id)
+                    is_embed_mask.append(False)
+                    full.extend([audio_pad_id] * seg_len)
+                    is_embed_mask.extend([True] * seg_len)
+                    full.append(audio_end_id)
+                    is_embed_mask.append(False)
+
+            full.append(video_end_id)
+            is_embed_mask.append(False)
+
+            embed_t = torch.tensor(is_embed_mask)
+            return PromptUpdateDetails(
+                full=full,
+                is_embed=lambda _tok, _seq: embed_t,
+            )
+
+        def get_audio_replacement(item_idx: int) -> PromptUpdateDetails:
+            out_item = out_mm_kwargs["audio"][item_idx]
+            tok_len = int(out_item["audio_token_lens"].data)
+            return [audio_pad_id] * tok_len
+
+        updates: list[PromptUpdate] = [
+            PromptReplacement(
+                modality="image",
+                target=[image_pad_id],
+                replacement=get_image_replacement,
+            ),
+            PromptReplacement(
+                modality="video",
+                target=[video_pad_id],
+                replacement=get_video_replacement,
+            ),
+        ]
+        if audio_pad_id is not None and audio_start_id is not None:
+            updates.append(
+                PromptReplacement(
+                    modality="audio",
+                    target=[audio_pad_id],
+                    replacement=get_audio_replacement,
+                )
+            )
+        return updates
+
+
+class MiMoV2OmniDummyInputsBuilder(BaseDummyInputsBuilder[MiMoV2OmniProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+        num_audios = mm_counts.get("audio", 0)
+        image_ph = "<|vision_start|><|image_pad|><|vision_end|>"
+        video_ph = "<|vision_start|><|video_pad|><|vision_end|>"
+        audio_ph = "<|mimo_audio_start|><|audio_pad|><|mimo_audio_end|>"
+        return image_ph * num_images + video_ph * num_videos + audio_ph * num_audios
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions],
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+        target_num_frames = self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=mm_options.get("image"),
+            ),
+            "video": self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=target_num_frames,
+                num_videos=num_videos,
+                overrides=mm_options.get("video"),
+            ),
+        }
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MiMoV2OmniMultiModalProcessor,
+    info=MiMoV2OmniProcessingInfo,
+    dummy_inputs=MiMoV2OmniDummyInputsBuilder,
+)
+class MiMoV2OmniForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
+    # Prefix remapping applied to checkpoint weight names in `load_weights`.
+    hf_to_aphrodite_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # audio encoder
+            "speech_embeddings.": "audio_encoder.speech_embeddings.",
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        }
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return "<|vision_start|><|image_pad|><|vision_end|>"
+        if modality.startswith("video"):
+            return "<|vision_start|><|video_pad|><|vision_end|>"
+        if modality.startswith("audio"):
+            return "<|mimo_audio_start|><|audio_pad|><|mimo_audio_end|>"
+        # Unknown modalities are rejected rather than returning None.
+        raise ValueError(f"Unsupported modality: {modality}")
+
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = ""):
+        super().__init__()
+        config = aphrodite_config.model_config.hf_config
+        self.config = config
+        # Vision tower is built without quantization (quant_config=None below).
+        vision_config = (
+            Mimo_VLVisionConfig.from_dict(config.vision_config)
+            if isinstance(config.vision_config, dict)
+            else config.vision_config
+        )
+        with self._mark_tower_model(aphrodite_config, {"image", "video"}):
+            self.visual = MiMoVisionTransformer(
+                vision_config,
+                norm_eps=getattr(aphrodite_config, "rms_norm_eps", 1e-6),  # NOTE(review): not hf config? confirm
+                quant_config=None,
+                prefix=maybe_prefix(prefix, "visual"),
+            )
+        audio_config = getattr(config, "audio_config", None)
+        model_path = aphrodite_config.model_config.model
+        if audio_config is not None:
+            with self._mark_tower_model(aphrodite_config, "audio"):
+                self.audio_encoder = MimoAudioEncoder(audio_config, model_path=model_path)
+        else:
+            self.audio_encoder = None
+        with self._mark_language_model(aphrodite_config):
+            self.language_model = MiMoV2FlashForCausalLM(
+                aphrodite_config=aphrodite_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+            )
+        # Expose the language model's intermediate-tensor factory on this wrapper.
+        self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors
+
+    def _parse_and_validate_image_input(self, **kwargs: object) -> Qwen2_5_VLImageInputs | None:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+        # Raw pixel values take precedence over precomputed embeddings.
+        if pixel_values is not None:
+            return Qwen2_5_VLImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
+        # pixel_values is None here, so image_embeds is the only remaining source.
+        if image_embeds is not None:
+            return Qwen2_5_VLImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw,
+            )
+
+    def _parse_and_validate_video_input(self, **kwargs: object) -> Qwen2_5_VLVideoInputs | None:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+        second_per_grid_ts = kwargs.pop("second_per_grid_ts", None)
+
+        if pixel_values_videos is None and video_embeds is None:
+            return None
+        # Raw pixel values take precedence over precomputed embeddings.
+        if pixel_values_videos is not None:
+            return Qwen2_5_VLVideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_videos=pixel_values_videos,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+            )
+        # pixel_values_videos is None here, so video_embeds is the only remaining source.
+        if video_embeds is not None:
+            return Qwen2_5_VLVideoEmbeddingInputs(
+                type="video_embeds",
+                video_embeds=video_embeds,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+            )
+
+    def _process_image_input(self, image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
+        grid_thw = image_input["image_grid_thw"]
+        assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
+        # Precomputed embeddings are only cast to the tower dtype; pixels are run through the tower.
+        if image_input["type"] == "image_embeds":
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values = image_input["pixel_values"]
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+
+        # Split concatenated embeddings for each image item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
+        return image_embeds.split(sizes)
+
+    def _process_video_input(self, video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
+        # Precomputed embeddings are only cast to the tower dtype; pixels are run through the tower.
+        if video_input["type"] == "video_embeds":
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"]
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list)
+
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
+        return video_embeds.split(sizes)
+
+    def _parse_and_validate_audio_input(self, **kwargs: object) -> dict | None:
+        audio_features = kwargs.pop("audio_features", None)
+        audio_token_lens = kwargs.pop("audio_token_lens", None)
+        if audio_features is None:  # audio_token_lens alone does not produce an input
+            return None
+        return {
+            "type": "audio",
+            "audio_features": audio_features,
+            "audio_token_lens": audio_token_lens,
+        }
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        mm_input_by_modality = {}
+        # Each parser below pops from its own copy of kwargs, so the dict iterated here is unchanged.
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values", "image_embeds") and "image" not in mm_input_by_modality:
+                mm_input_by_modality["image"] = self._parse_and_validate_image_input(**kwargs)
+            if input_key in ("pixel_values_videos", "video_embeds") and "video" not in mm_input_by_modality:
+                mm_input_by_modality["video"] = self._parse_and_validate_video_input(**kwargs)
+            if input_key == "audio_features" and "audio" not in mm_input_by_modality:
+                mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(**kwargs)
+        return mm_input_by_modality
+
+    def _process_audio_input(self, audio_input: dict) -> tuple[torch.Tensor, ...]:
+        mel_specs = audio_input["audio_features"]
+        if self.audio_encoder is None:
+            return ()  # no audio encoder was built, so audio contributes no embeddings
+        # Normalize to List[2D-Tensor].
+        # MultiModalBatchedField._reduce_data either wraps a single [T, 128]
+        # into [1, T, 128] via unsqueeze(0) or stacks N same-T items into
+        # [N, T, 128]. Indexing along dim-0 extracts the per-item [T, 128].
+        if isinstance(mel_specs, torch.Tensor):
+            mel_specs = list(mel_specs)  # [1,T,128] or [N,T,128] → [[T,128],...]
+        if not mel_specs:
+            return ()
+        audio_embeds, item_token_lens = self.audio_encoder.get_audio_feature(mel_specs)
+        return tuple(audio_embeds.split(item_token_lens))
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        # Pop video_audio-specific fields before main mm parsing
+        video_audio_n_segs = kwargs.pop("video_audio_n_segs", None)
+        video_audio_seg_lens = kwargs.pop("video_audio_seg_lens", None)
+        va_audio_features = kwargs.pop("va_audio_features", None)
+
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not mm_input_by_modality and va_audio_features is None:
+            return []
+
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image, video, or audio).
+        multimodal_embeddings: list[torch.Tensor] = []
+
+        # Pre-process va audio: one mel spec per va video → per-video audio embeddings
+        # keyed by va video index (0-based among va videos only)
+        va_audio_embs_list: list[torch.Tensor] = []
+        if va_audio_features is not None and self.audio_encoder is not None:
+            # A batched tensor and a list are both iterated along their first dimension,
+            # yielding one mel spectrogram per va video.
+            mel_list = list(va_audio_features)
+            for mel_spec in mel_list:
+                embs, _ = self.audio_encoder.get_audio_feature([mel_spec])
+                # Only the embeddings are kept; the returned token lengths are not needed here.
+                va_audio_embs_list.append(embs)  # shape (total_tok, hidden)
+
+        va_cursor = 0  # index into va_audio_embs_list
+
+        # NOTE: Iterate in dict insertion order to preserve token sequence order.
+        for modality in mm_input_by_modality:
+            multimodal_input = mm_input_by_modality[modality]
+            if modality == "image":
+                multimodal_embeddings.extend(self._process_image_input(multimodal_input))
+            elif modality == "video":
+                video_embs_tuple = self._process_video_input(multimodal_input)
+                if video_audio_n_segs is None:
+                    multimodal_embeddings.extend(video_embs_tuple)
+                else:
+                    grid_thw = multimodal_input["video_grid_thw"]
+                    for i, vid_embs in enumerate(video_embs_tuple):
+                        n_segs = int(video_audio_n_segs[i])
+                        if n_segs == 0 or not va_audio_embs_list:
+                            multimodal_embeddings.append(vid_embs)
+                        else:
+                            T = int(grid_thw[i][0])
+                            n_per_grid = vid_embs.shape[0] // T
+                            frames = list(vid_embs.split(n_per_grid, dim=0))
+                            frames_per_group = T // n_segs  # NOTE(review): T % n_segs leftover frames are dropped
+                            # Per-group audio token lengths for this va video
+                            # video_audio_seg_lens is (num_videos, max_T); row i
+                            # has valid values in [:n_segs], rest are zeros.
+                            seg_lens = video_audio_seg_lens[i][:n_segs].tolist()
+                            # Split full audio embs for this va video by group lengths
+                            full_va_embs = va_audio_embs_list[va_cursor]
+                            va_cursor += 1
+                            group_audio_embs = full_va_embs.split(seg_lens)
+                            # Interleave: all vid frames in group, then audio for group
+                            for g in range(n_segs):
+                                for f in range(frames_per_group):
+                                    multimodal_embeddings.append(frames[g * frames_per_group + f])
+                                multimodal_embeddings.append(group_audio_embs[g])
+            elif modality == "audio":
+                multimodal_embeddings.extend(self._process_audio_input(multimodal_input))
+        return tuple(multimodal_embeddings)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        """Run the language model forward pass for MiMoV2Omni.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch. **NOTE**: with Qwen2.5-VL-style mrope the shape is
+                `(3, seq_len)`, otherwise `(seq_len,)` - TODO confirm which
+                applies to this model.
+        """
+        # When intermediate tensors are given, precomputed input embeddings are discarded.
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        hidden_states = self.language_model.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        # Checkpoint entries under "audio_tokenizer." are skipped; the rest are renamed via the mapper.
+
+        loader = AutoWeightsLoader(self, skip_prefixes=["audio_tokenizer."])
+        auto_loaded = loader.load_weights(weights, mapper=self.hf_to_aphrodite_mapper)
+        return auto_loaded
diff --git a/aphrodite/model_executor/models/minimax_m2.py b/aphrodite/model_executor/models/minimax_m2.py
index 84b8ed04af..f99e66e387 100644
--- a/aphrodite/model_executor/models/minimax_m2.py
+++ b/aphrodite/model_executor/models/minimax_m2.py
@@ -35,6 +35,7 @@
 from aphrodite.config import AphroditeConfig, CacheConfig, ModelConfig
 from aphrodite.distributed import (
     get_pp_group,
+    get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
 from aphrodite.model_executor.layers.attention import Attention
@@ -208,7 +209,18 @@ def __init__(
         )
 
         self.q_norm = MiniMaxText01RMSNormTP(self.head_dim * self.total_num_heads, eps=rms_norm_eps)
-        self.k_norm = MiniMaxText01RMSNormTP(self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps)
+        if self.total_num_kv_heads >= tp_size:
+            self.k_norm = MiniMaxText01RMSNormTP(self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps)
+        else:
+            # KV heads are replicated across TP ranks; shard k_norm weight by
+            # total_num_kv_heads rather than tp_size to avoid incorrect sharding.
+            num_kv_head_replicas = tp_size // self.total_num_kv_heads
+            self.k_norm = MiniMaxText01RMSNormTP(
+                self.head_dim * self.total_num_kv_heads,
+                eps=rms_norm_eps,
+                weight_shard_world_size=self.total_num_kv_heads,
+                weight_shard_rank=get_tensor_model_parallel_rank() // num_kv_head_replicas,
+            )
 
     def forward(
         self,
diff --git a/aphrodite/model_executor/models/mistral_eagle.py b/aphrodite/model_executor/models/mistral_eagle.py
new file mode 100644
index 0000000000..c482623970
--- /dev/null
+++ b/aphrodite/model_executor/models/mistral_eagle.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+
+from aphrodite.compilation.decorators import support_torch_compile
+from aphrodite.config import AphroditeConfig
+from aphrodite.logger import init_logger
+from aphrodite.model_executor.layers.layernorm import RMSNorm
+from aphrodite.model_executor.layers.linear import RowParallelLinear
+from aphrodite.model_executor.layers.logits_processor import LogitsProcessor
+from aphrodite.model_executor.layers.quantization.base_config import QuantizationConfig
+from aphrodite.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from aphrodite.model_executor.models.interfaces import MultiModalEmbeddings
+from aphrodite.model_executor.models.llama import LlamaConfig
+from aphrodite.model_executor.models.mistral import (
+    MistralDecoderLayer,
+    MistralForCausalLM,
+    MistralModel,
+)
+from aphrodite.model_executor.models.utils import (
+    _merge_multimodal_embeddings,
+    get_draft_quant_config,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class EagleMistralDecoderLayer(MistralDecoderLayer):
+    def __init__(
+        self,
+        aphrodite_config: AphroditeConfig,
+        prefix: str = "",
+        config: LlamaConfig | None = None,
+    ) -> None:
+        super().__init__(aphrodite_config, prefix=prefix, config=config)  # plain pass-through to the parent
+
+    def get_quant_config(self, aphrodite_config: AphroditeConfig) -> QuantizationConfig | None:
+        return get_draft_quant_config(aphrodite_config)  # override: taken from get_draft_quant_config
+
+
+@support_torch_compile
+class EagleMistralModel(MistralModel):
+    def __init__(
+        self,
+        *,
+        aphrodite_config: AphroditeConfig,
+        prefix: str = "",
+        start_layer_id: int = 0,
+    ) -> None:
+        # Bypass MistralModel.__init__ to avoid creating duplicate attention
+        # layer entries in the global context.
+        nn.Module.__init__(self)
+        self.config = aphrodite_config.speculative_config.draft_model_config.hf_config
+        self.vocab_size = self.config.vocab_size
+        # Get drafter's quantization config
+        self.quant_config = get_draft_quant_config(aphrodite_config)
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+            quant_config=self.quant_config,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                EagleMistralDecoderLayer(
+                    aphrodite_config,
+                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),  # indices offset by start_layer_id
+                    config=self.config,
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+        self.fc = RowParallelLinear(
+            self.config.hidden_size * 2,  # forward() concatenates inputs_embeds with hidden_states
+            self.config.hidden_size,
+            bias=False,
+            input_is_parallel=False,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "fc"),
+            return_bias=False,
+        )
+        self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_input_ids(input_ids)
+        hidden_states = self.fc(torch.cat((inputs_embeds, hidden_states), dim=-1))
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states, hidden_states  # the same tensor fills both slots of the returned pair
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        # Pretend embed_tokens is loaded; the actual weight is shared
+        # from the target model at runtime by `load_eagle_model`.
+        return super().load_weights(weights) | {"embed_tokens.weight"}
+
+
+class EagleMistralForCausalLM(MistralForCausalLM):
+    mistral_mapping = MistralForCausalLM.mistral_mapping | {
+        "eagle_linear": "model.fc",
+    }
+
+    def __init__(self, *, aphrodite_config: AphroditeConfig, prefix: str = "") -> None:
+        # Bypass MistralForCausalLM.__init__ to use the draft model config
+        # and to avoid creating an lm_head.
+        nn.Module.__init__(self)
+        self.config = aphrodite_config.speculative_config.draft_model_config.hf_config
+        target_layer_num = aphrodite_config.model_config.get_num_layers(aphrodite_config.parallel_config)
+        self.model = EagleMistralModel(
+            aphrodite_config=aphrodite_config, prefix="model", start_layer_id=target_layer_num
+        )
+        # NOTE(review): the `prefix` argument is not forwarded; "model" is hard-coded above - confirm.
+        logit_scale = getattr(self.config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.model(input_ids, positions, hidden_states, inputs_embeds)
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        inputs_embeds = super().embed_input_ids(input_ids)
+        # Text-only path: with no multimodal embeddings there is nothing to merge.
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+        # A mask is mandatory as soon as multimodal embeddings are supplied.
+        assert is_multimodal is not None
+
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
diff --git a/aphrodite/model_executor/models/moondream3.py b/aphrodite/model_executor/models/moondream3.py
new file mode 100644
index 0000000000..7a3903e83d
--- /dev/null
+++ b/aphrodite/model_executor/models/moondream3.py
@@ -0,0 +1,1370 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Moondream3 model implementation."""
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+from functools import cached_property
+from itertools import islice
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BatchFeature
+
+from aphrodite.config import AphroditeConfig
+from aphrodite.config.multimodal import BaseDummyOptions
+from aphrodite.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from aphrodite.inputs import MultiModalDataDict
+from aphrodite.logger import init_logger
+from aphrodite.model_executor.layers.activation import get_act_fn
+from aphrodite.model_executor.layers.attention import Attention
+from aphrodite.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
+from aphrodite.model_executor.layers.fused_moe import MoEActivation, fused_experts
+from aphrodite.model_executor.layers.fused_moe.config import biased_moe_quant_config
+from aphrodite.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from aphrodite.model_executor.layers.logits_processor import LogitsProcessor
+from aphrodite.model_executor.layers.quantization import QuantizationConfig
+from aphrodite.model_executor.layers.rotary_embedding import get_rope
+from aphrodite.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from aphrodite.model_executor.model_loader.weight_utils import default_weight_loader
+from aphrodite.multimodal import MULTIMODAL_REGISTRY
+from aphrodite.multimodal.inputs import (
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from aphrodite.multimodal.parse import ImageSize, MultiModalDataItems
+from aphrodite.multimodal.processing import (
+    BaseDummyInputsBuilder,
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from aphrodite.sequence import IntermediateTensors
+from aphrodite.transformers_utils.configs.moondream3 import (
+    Moondream3Config,
+    Moondream3TextConfig,
+    Moondream3VisionConfig,
+)
+from aphrodite.transformers_utils.processors.moondream3 import Moondream3Processor
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    extract_layer_index,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+# ============================================================================
+# Image Processing Utilities
+# ============================================================================
+
+
+def reconstruct_from_crops(
+    crops: torch.Tensor,
+    tiling: tuple[int, int],
+    overlap_margin: int,
+    patch_size: int = 14,
+) -> torch.Tensor:
+    """Reconstruct features from overlapping crops."""
+    tiling_h, tiling_w = tiling
+    crop_height, crop_width = crops[0].shape[:2]
+    margin_pixels = overlap_margin * patch_size
+    # Tiles advance by (crop size - 2 * margin); the outer border keeps one margin on each side.
+    output_h = (crop_height - 2 * margin_pixels) * tiling_h + 2 * margin_pixels
+    output_w = (crop_width - 2 * margin_pixels) * tiling_w + 2 * margin_pixels
+
+    reconstructed = torch.zeros(
+        (output_h, output_w, crops[0].shape[2]),
+        device=crops[0].device,
+        dtype=crops[0].dtype,
+    )
+    # Crops are consumed in row-major tile order (index = tile_y * tiling_w + tile_x).
+    for i, crop in enumerate(crops):
+        tile_y = i // tiling_w
+        tile_x = i % tiling_w
+        # The margin is trimmed on every edge except those on the first/last tile row or column.
+        x_start = 0 if tile_x == 0 else margin_pixels
+        x_end = crop_width if tile_x == tiling_w - 1 else crop_width - margin_pixels
+        y_start = 0 if tile_y == 0 else margin_pixels
+        y_end = crop_height if tile_y == tiling_h - 1 else crop_height - margin_pixels
+
+        out_x = tile_x * (crop_width - 2 * margin_pixels)
+        out_y = tile_y * (crop_height - 2 * margin_pixels)
+
+        reconstructed[out_y + y_start : out_y + y_end, out_x + x_start : out_x + x_end] = crop[
+            y_start:y_end, x_start:x_end
+        ]
+
+    return reconstructed
+
+
+# ============================================================================
+# Vision Encoder Components
+# ============================================================================
+
+
+class Moondream3VisionMLP(nn.Module):
+    """MLP for vision encoder blocks: fc1 -> "gelu_pytorch_tanh" activation -> fc2, both with bias."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.act = get_act_fn("gelu_pytorch_tanh")
+        self.fc2 = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)  # the second value returned by the linear layer is ignored
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class Moondream3VisionAttention(nn.Module):
+    """Self-attention for vision encoder (bidirectional)."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+        # NOTE(review): integer division below - presumably num_heads is divisible by tp_size; confirm.
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_partition = num_heads // tp_size
+
+        self.attn = MMEncoderAttention(
+            num_heads=self.num_heads_per_partition,
+            head_size=self.head_dim,
+            scale=self.head_dim**-0.5,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(3, dim=-1)  # split the fused projection into three chunks along the last dim
+        out = self.attn(q, k, v)
+        out, _ = self.out_proj(out)
+        return out
+
+
+class Moondream3VisionBlock(nn.Module):
+    """Transformer block for vision encoder: pre-norm attention and MLP, each with a residual add."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        num_heads: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(hidden_size, eps=1e-5)
+        self.attn = Moondream3VisionAttention(
+            hidden_size=hidden_size,
+            num_heads=num_heads,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.ln2 = nn.LayerNorm(hidden_size, eps=1e-5)
+        self.mlp = Moondream3VisionMLP(
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.ln1(x))  # LayerNorm is applied to the branch input, not the residual path
+        x = x + self.mlp(self.ln2(x))
+        return x
+
+
+class Moondream3VisionEncoder(nn.Module):
+    """Vision encoder (SigLIP-style ViT)."""
+
+    def __init__(
+        self,
+        config: Moondream3VisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        # Patch embedding
+        self.patch_emb = nn.Linear(
+            config.enc_patch_size * config.enc_patch_size * 3,  # flattened patch length for 3 channels
+            config.enc_dim,
+            bias=True,
+        )
+
+        # Position embeddings: one per patch of a square crop (e.g. 27x27 = 729 for 378 / 14)
+        num_patches = (config.crop_size // config.enc_patch_size) ** 2
+        self.pos_emb = nn.Parameter(torch.zeros(1, num_patches, config.enc_dim))
+
+        # Transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                Moondream3VisionBlock(
+                    hidden_size=config.enc_dim,
+                    intermediate_size=config.enc_ff_dim,
+                    num_heads=config.enc_n_heads,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.blocks.{i}",
+                )
+                for i in range(config.enc_n_layers)
+            ]
+        )
+
+        self.post_ln = nn.LayerNorm(config.enc_dim, eps=1e-5)
+
+    def create_patches(self, images: torch.Tensor) -> torch.Tensor:
+        """Convert images to patch embeddings.
+
+        Args:
+            images: (batch, channels, height, width)
+
+        Returns:
+            patches: (batch, num_patches, patch_dim)
+        """
+        patch_size = self.config.enc_patch_size
+        batch, channels, height, width = images.shape
+        patches_h = height // patch_size
+        patches_w = width // patch_size
+
+        # Unfold into non-overlapping patches (window size and step are both patch_size)
+        patches = images.unfold(2, patch_size, patch_size).unfold(3, patch_size, patch_size)
+        # (batch, channels, patches_h, patches_w, patch_size, patch_size)
+        patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
+        # (batch, patches_h, patches_w, channels, patch_size, patch_size)
+        patches = patches.view(batch, patches_h * patches_w, -1)
+        # (batch, num_patches, channels * patch_size * patch_size)
+
+        return patches
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """Encode images.
+
+        Args:
+            pixel_values: (batch, channels, height, width)
+
+        Returns:
+            features: (batch, num_patches, hidden_size)
+        """
+        # Create patches and embed
+        patches = self.create_patches(pixel_values)
+        x = self.patch_emb(patches)
+
+        # Add position embeddings (broadcast over batch; the patch count must match pos_emb)
+        x = x + self.pos_emb
+
+        # Apply transformer blocks
+        for block in self.blocks:
+            x = block(x)
+
+        # Final layer norm
+        x = self.post_ln(x)
+
+        return x
+
+
+class Moondream3VisionProjection(nn.Module):
+    """Two-layer MLP that maps vision features into the text embedding space."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        inner_dim: int,
+        output_dim: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # fc1 consumes global and local features concatenated: 2 * input_dim.
+        self.fc1 = ColumnParallelLinear(
+            2 * input_dim,
+            inner_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.act = get_act_fn("gelu_pytorch_tanh")
+        self.fc2 = RowParallelLinear(
+            inner_dim,
+            output_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Each parallel linear returns a pair; only its first element is used.
+        hidden, _ = self.fc1(x)
+        projected, _ = self.fc2(self.act(hidden))
+        return projected
+
+
+# ============================================================================
+# Text Decoder Components
+# ============================================================================
+
+
+class Moondream3TextMLP(nn.Module):
+    """Dense feed-forward block for decoder layers that are not MoE layers."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.act = get_act_fn("gelu_pytorch_tanh")
+        self.fc2 = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # up-project -> activation -> down-project; second tuple items are unused.
+        up, _ = self.fc1(x)
+        down, _ = self.fc2(self.act(up))
+        return down
+
+
+class Moondream3TextMoE(nn.Module):
+    """Mixture-of-experts feed-forward with experts sharded across TP ranks.
+
+    Each expert applies a GeGLU variant: the fc1 output is split into halves
+    [h, g] and the activation is gelu(h) * (g + 1).
+
+    Every rank routes all tokens, runs only its num_experts // tp_size local
+    experts, and the partial outputs are summed with a tensor-parallel all-reduce.
+
+    Checkpoint format:
+    - fc1.weight: [num_experts, expert_inner_dim * 2, hidden_size] (gate+up)
+    - fc2.weight: [num_experts, hidden_size, expert_inner_dim] (down)
+    - router.weight: [num_experts, hidden_size]
+    - router.bias: [num_experts]
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        expert_inner_dim: int,
+        num_experts: int,
+        experts_per_token: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.expert_inner_dim = expert_inner_dim
+        self.num_experts = num_experts
+        self.experts_per_token = experts_per_token
+        # NOTE(review): floor division below; assumes tp_size divides num_experts.
+        # Expert parallelism: each TP rank owns a contiguous slice of the experts.
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.experts_per_rank = num_experts // self.tp_size
+        self.num_local_experts = self.experts_per_rank
+
+        # Router (gate): replicated on every rank and built with quant_config=None.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=True,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        # Local expert weights (only store experts_per_rank experts)
+        # fc1: [experts_per_rank, expert_inner_dim * 2, hidden_size]
+        # fc2: [experts_per_rank, hidden_size, expert_inner_dim]
+        self.fc1_weight = nn.Parameter(torch.empty(self.num_local_experts, expert_inner_dim * 2, hidden_size))
+        self.fc2_weight = nn.Parameter(torch.empty(self.num_local_experts, hidden_size, expert_inner_dim))
+        self._use_fused_moe = True  # cleared for good after the first fused-path failure
+        # Map global expert id -> local index; -1 marks experts held by other ranks.
+        local_expert_start = get_tensor_model_parallel_rank() * self.experts_per_rank
+        expert_map = torch.full((num_experts,), -1, dtype=torch.int32)
+        expert_map[local_expert_start : local_expert_start + self.num_local_experts] = torch.arange(
+            self.num_local_experts, dtype=torch.int32
+        )
+        self.register_buffer("_expert_map", expert_map, persistent=False)
+
+        # Preserve Moondream3's exact GeGLU variant (gelu(h) * (g + 1)) by
+        # adding +1 bias to the second half of the fused fc1 activations.
+        fused_w1_bias = torch.zeros(
+            self.num_local_experts,
+            expert_inner_dim * 2,
+            dtype=torch.float32,
+        )
+        fused_w1_bias[:, expert_inner_dim:] = 1.0
+        self.register_buffer("_fused_w1_bias", fused_w1_bias, persistent=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Route tokens to their top-k experts and sum the per-rank partial outputs."""
+
+        # Get router logits and compute top-k
+        router_logits, _ = self.gate(x)  # [num_tokens, num_experts]
+        topk_logits, topk_ids = torch.topk(router_logits, self.experts_per_token, dim=-1)
+        # Softmax over the selected logits only, in float32, then back to x's dtype
+        topk_weights = F.softmax(topk_logits, dim=-1, dtype=torch.float32).to(x.dtype)
+
+        if self._use_fused_moe and x.is_cuda:
+            try:
+                out = fused_experts(
+                    hidden_states=x.contiguous(),
+                    w1=self.fc1_weight,
+                    w2=self.fc2_weight,
+                    topk_weights=topk_weights.contiguous(),
+                    topk_ids=topk_ids.contiguous(),
+                    activation=MoEActivation.GELU,
+                    global_num_experts=self.num_experts,
+                    expert_map=self._expert_map,
+                    quant_config=biased_moe_quant_config(self._fused_w1_bias, None),
+                )
+                out = tensor_model_parallel_all_reduce(out)
+                return out
+            except (NotImplementedError, RuntimeError) as exc:
+                self._use_fused_moe = False
+                logger.warning_once(
+                    "Disabling fused Moondream3 MoE path and falling back to the Python expert loop: %s",
+                    str(exc),
+                )
+        # NOTE(review): RuntimeError is broad; confirm every such failure should disable the fused path.
+        tp_rank = get_tensor_model_parallel_rank()
+        # Compute local expert range
+        local_expert_start = tp_rank * self.experts_per_rank
+
+        # Fallback path: also used for non-CUDA inputs and after a fused-path failure.
+        out = x.new_zeros(x.shape)
+
+        for local_expert_idx in range(self.num_local_experts):
+            global_expert_id = local_expert_start + local_expert_idx
+
+            # Find tokens assigned to this expert (row index, top-k slot index)
+            token_pos, which_k = (topk_ids == global_expert_id).nonzero(as_tuple=True)
+            if token_pos.numel() == 0:
+                continue
+
+            # Get tokens and their routing weights
+            x_tok = x.index_select(0, token_pos)  # [n_tokens, hidden_size]
+            gate_tok = topk_weights[token_pos, which_k]  # [n_tokens]
+
+            # fc1: [expert_inner_dim * 2, hidden_size]
+            # h_full: [n_tokens, expert_inner_dim * 2]
+            h_full = F.linear(x_tok, self.fc1_weight[local_expert_idx])
+
+            # GeGLU with (g + 1): h, g = split; output = gelu(h) * (g + 1)
+            # HF MoE uses exact GELU (not tanh approximation).
+            h, g = h_full.chunk(2, dim=-1)  # Each [n_tokens, expert_inner_dim]
+            h = F.gelu(h) * (g + 1.0)
+
+            # fc2: [hidden_size, expert_inner_dim]
+            # y: [n_tokens, hidden_size]
+            y = F.linear(h, self.fc2_weight[local_expert_idx])
+
+            # Apply routing weight
+            y = y * gate_tok.unsqueeze(-1)
+
+            # Accumulate into the rows of the tokens this expert served
+            out.index_add_(0, token_pos, y)
+
+        # All-reduce to combine results from all experts across GPUs
+        out = tensor_model_parallel_all_reduce(out)
+
+        return out
+
+
+class Moondream3Attention(nn.Module):
+    """Decoder self-attention with partial RoPE and per-head tau scaling.
+
+    Q and V are multiplied per head by tau = tanh(gelu(qkv) @ W.T) + tau_pos,
+    a factor that depends on token content and position. `layer_idx` is unused.
+    """
+
+    def __init__(
+        self,
+        config: Moondream3TextConfig,
+        layer_idx: int,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.dim
+        self.num_heads = config.n_heads
+        self.num_kv_heads = config.n_kv_heads
+        self.head_dim = config.dim // config.n_heads
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_partition = self.num_heads // tp_size
+        self.num_kv_heads_per_partition = max(1, self.num_kv_heads // tp_size)
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.num_heads,
+            total_num_kv_heads=self.num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.out_proj = RowParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        # Moondream uses 32-dim rotation out of 64-dim head (partial_rotary_factor=0.5)
+        # HF Moondream uses non-interleaved RoPE (split by half)
+        # In Aphrodite, is_neox_style=True means split by half (GPT-NeoX style)
+        rope_parameters = {
+            "rope_theta": config.rope_theta,
+            "partial_rotary_factor": 32 / self.head_dim,  # 32/64 = 0.5
+        }
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            max_position=config.max_context,
+            rope_parameters=rope_parameters,
+            is_neox_style=True,  # Moondream uses split-by-half (GPT-NeoX) style
+        )
+
+        self.scaling = self.head_dim**-0.5
+        self.attn = Attention(
+            num_heads=self.num_heads_per_partition,
+            head_size=self.head_dim,
+            scale=self.scaling,
+            num_kv_heads=self.num_kv_heads_per_partition,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Tau scaling parameters (zero-initialized; presumably filled by the weight loader).
+        # tau_alpha drives the position term, tau_wq / tau_wv the token term.
+        # Rows are sharded by head; the input (qkv) dimension stays full-width.
+        # NOTE(review): qkv_dim = 3 * hidden_size assumes n_kv_heads == n_heads - confirm.
+        qkv_dim = self.hidden_size * 3  # Q + K + V dimension (full)
+        self.tau_alpha = nn.Parameter(torch.zeros(self.num_heads_per_partition))
+        self.tau_wq = nn.Parameter(torch.zeros(self.num_heads_per_partition, qkv_dim))
+        self.tau_wv = nn.Parameter(torch.zeros(self.num_heads_per_partition, qkv_dim))
+        self.tp_size = tp_size
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        q, k, v = qkv.split(
+            [
+                self.num_heads_per_partition * self.head_dim,
+                self.num_kv_heads_per_partition * self.head_dim,
+                self.num_kv_heads_per_partition * self.head_dim,
+            ],
+            dim=-1,
+        )
+
+        # Apply tau scaling to Q and V
+        # Tau scaling has two components:
+        # 1. Token-based: tok_q = tanh(gelu(qkv) @ tau_wq.T)
+        # 2. Position-based: tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
+        # Final: tau = tok + tau_pos
+        #
+        # For TP, tau weights are sharded by head, but qkv_dim is kept full
+
+        # Get full qkv for tau computation
+        # With TP, reconstruct qkv in correct layout [q_full, k_full, v_full]
+        # (all-gather would produce [q_0, k_0, v_0, q_1, k_1, v_1] - wrong)
+        if self.tp_size > 1:
+            # All-gather once, then reconstruct [q_full, k_full, v_full].
+            qkv_full_sharded = tensor_model_parallel_all_gather(qkv.contiguous())
+            q_local_dim = q.shape[-1]
+            kv_local_dim = k.shape[-1]
+            qkv_full_sharded = qkv_full_sharded.view(
+                qkv.shape[0],
+                self.tp_size,
+                q_local_dim + 2 * kv_local_dim,
+            )
+            q_full = qkv_full_sharded[:, :, :q_local_dim].reshape(qkv.shape[0], -1)
+            k_full = qkv_full_sharded[:, :, q_local_dim : q_local_dim + kv_local_dim].reshape(qkv.shape[0], -1)
+            v_full = qkv_full_sharded[:, :, q_local_dim + kv_local_dim :].reshape(qkv.shape[0], -1)
+            qkv_full = torch.cat([q_full, k_full, v_full], dim=-1).contiguous()
+        else:
+            qkv_full = qkv
+
+        # Compute tau scaling factors matching HF implementation exactly:
+        # tok_feat = gelu(qkv)
+        # tok_q = tanh(tok_feat @ tau_wq.T)  # [num_tokens, num_heads]
+        # tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)  # [num_heads, num_tokens]
+        # tau = (tok_q.T + tau_pos).T  # [num_tokens, num_heads]
+        num_tokens = qkv_full.shape[0]
+        orig_dtype = q.dtype
+
+        # Token-based component
+        tok_feat = F.gelu(qkv_full)  # Apply GELU activation
+        tok_q = torch.tanh(tok_feat @ self.tau_wq.t())  # [N, H_per_partition]
+        tok_v = torch.tanh(tok_feat @ self.tau_wv.t())  # [N, H_per_partition]
+
+        # Position-based component: tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
+        # positions is [num_tokens]; tau_alpha is [num_heads_per_partition].
+        # NOTE(review): positions are cast to the activation dtype before the log, so
+        # large positions are rounded in fp16/bf16 - confirm this matches the reference.
+        pos_float = (positions.to(orig_dtype) + 1.0).clamp(min=1e-6)
+        pos_log = pos_float.log()  # [num_tokens]
+        # alpha[:, None] * pos_log[None, :] -> [num_heads, num_tokens]
+        tau_pos = 1.0 + (torch.sigmoid(self.tau_alpha[:, None] * pos_log[None, :]) - 0.5)  # [H_per_partition, N]
+
+        # Combine token and position components
+        tau_q = (tok_q + tau_pos.t()).to(orig_dtype)  # [N, H_per_partition]
+        tau_v = (tok_v + tau_pos.t()).to(orig_dtype)  # [N, H_per_partition]
+
+        # Reshape q and v to apply per-head tau scaling
+        q = q.view(num_tokens, self.num_heads_per_partition, self.head_dim)
+        v = v.view(num_tokens, self.num_kv_heads_per_partition, self.head_dim)
+
+        # Apply tau scaling; V uses only the first num_kv_heads_per_partition tau columns
+        q = q * tau_q.unsqueeze(-1)
+        v = v * tau_v[:, : self.num_kv_heads_per_partition].unsqueeze(-1)
+
+        # Reshape back
+        q = q.view(num_tokens, -1)
+        v = v.view(num_tokens, -1)
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v)
+
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class Moondream3DecoderLayer(nn.Module):
+    """Decoder block: one LayerNorm feeding attention and MLP/MoE in parallel."""
+
+    def __init__(
+        self,
+        config: Moondream3TextConfig,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # The layer position is recovered from the module prefix.
+        self.layer_idx = layer_idx = extract_layer_index(prefix)
+
+        self.ln = nn.LayerNorm(config.dim, eps=1e-5, bias=True)
+
+        self.attn = Moondream3Attention(
+            config=config,
+            layer_idx=layer_idx,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Dense MLP below moe_start_layer, mixture-of-experts from there on.
+        if layer_idx < config.moe_start_layer:
+            self.mlp = Moondream3TextMLP(
+                hidden_size=config.dim,
+                intermediate_size=config.ff_dim,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = Moondream3TextMoE(
+                hidden_size=config.dim,
+                expert_inner_dim=config.moe_expert_inner_dim,
+                num_experts=config.moe_num_experts,
+                experts_per_token=config.moe_experts_per_token,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        # Both branches read the same normalized input; their outputs are
+        # added to the residual stream together.
+        normed = self.ln(hidden_states)
+        attn_out = self.attn(positions, normed)
+        mlp_out = self.mlp(normed)
+        return hidden_states + attn_out + mlp_out
+
+
+class Moondream3TextModel(nn.Module):
+    """Text decoder: token embedding, decoder blocks and a final LayerNorm."""
+
+    def __init__(
+        self,
+        config: Moondream3TextConfig,
+        cache_config=None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            config.dim,
+            prefix=f"{prefix}.wte",
+        )
+        # start_layer/end_layer bound the blocks that forward() iterates over.
+        blocks_prefix = maybe_prefix(prefix, "blocks")
+        self.start_layer, self.end_layer, self.blocks = make_layers(
+            config.n_layers,
+            lambda prefix: Moondream3DecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=blocks_prefix,
+        )
+
+        self.post_ln = nn.LayerNorm(config.dim, eps=1e-5, bias=True)
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(["hidden_states"], config.dim)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.wte(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        # First PP rank starts from embeddings; later ranks resume from intermediate_tensors.
+        pp_group = get_pp_group()
+        if pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                assert input_ids is not None
+                hidden_states = self.embed_input_ids(input_ids)
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+
+        for layer in islice(self.blocks, self.start_layer, self.end_layer):
+            hidden_states = layer(positions, hidden_states)
+
+        # Only the last pipeline rank applies the final LayerNorm.
+        if not pp_group.is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
+        return self.post_ln(hidden_states)
+
+
+@dataclass(frozen=True)
+class Moondream3ImageInput:
+    """Crop tensor and tiling metadata for a single image, ready to be encoded."""
+
+    pixel_values: torch.Tensor  # crops of one image; validated as 4-D before use
+    tiling: tuple[int, int] | None  # required when there is more than one crop
+
+
+# ============================================================================
+# Multimodal Processing
+# ============================================================================
+
+
+class Moondream3ProcessingInfo(BaseProcessingInfo):
+    """Fixed multimodal limits and token counts for Moondream3."""
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(Moondream3Processor, **kwargs)
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        # Limit reported for the image modality: 1.
+        return {"image": 1}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        # Constant regardless of the requested size: HF pre-fills BOS together
+        # with the fixed 27x27 vision grid under one bidirectional prefix mask.
+        return 1 + 27 * 27
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        return ImageSize(width=378, height=378)
+
+    def get_max_image_tokens(self) -> int:
+        largest = self.get_image_size_with_most_features()
+        return self.get_num_image_tokens(image_width=largest.width, image_height=largest.height)
+
+    def get_mm_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Mapping[str, int]:
+        # The per-item budget does not depend on seq_len or mm_counts.
+        image_budget = self.get_max_image_tokens()
+        return {"image": image_budget}
+
+
+class Moondream3DummyInputsBuilder(BaseDummyInputsBuilder[Moondream3ProcessingInfo]):
+    """Supplies the placeholder prompt and images used for profiling."""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        # A fixed query prompt with a single image slot; mm_counts is not consulted.
+        return "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>What is this image?<|md_reserved_2|>"
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
+    ) -> MultiModalDataDict:
+        # Only the requested image count matters here; seq_len, mm_options and
+        # mm_processor_kwargs are accepted but not used.
+        image_count = mm_counts.get("image", 0)
+        side = 378  # dummy images are square, 378x378
+        dummy_images = self._get_dummy_images(width=side, height=side, num_images=image_count)
+
+        return {"image": dummy_images}
+
+
+class Moondream3MultiModalProcessor(BaseMultiModalProcessor[Moondream3ProcessingInfo]):
+    """Prompt and field handling for Moondream3 image inputs."""
+
+    image_placeholder: str = "<image>"
+    bos_image_placeholder: str = "<|endoftext|><image>"
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        # Plain delegation: the Moondream3 processor consumes images itself and
+        # has no separate `image_processor`, so text+MM calls keep the cache path.
+        return super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
+
+    @cached_property
+    def bos_image_placeholder_tokens(self) -> list[int]:
+        """Token ids of the BOS + image placeholder string (at least two ids)."""
+        placeholder = self.bos_image_placeholder
+        ids = self.info.get_tokenizer().encode(placeholder, add_special_tokens=False)
+        if len(ids) >= 2:
+            return ids
+        # Fewer than two ids cannot provide distinct BOS and image tokens,
+        # which _get_prompt_updates reads from the first and last positions.
+        raise ValueError(
+            f"Tokenizer could not encode Moondream3 BOS/image placeholder {self.bos_image_placeholder!r}."
+        )
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        # Both fields are batched per image; tilings are flagged keep_on_cpu.
+        pixel_values = MultiModalFieldConfig.batched("image")
+        tilings = MultiModalFieldConfig.batched("image", keep_on_cpu=True)
+        return {"pixel_values": pixel_values, "tilings": tilings}
+
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        # Always False: the Moondream3 HF processor leaves placeholder tokens
+        # unexpanded. Aphrodite expands BOS + <image> itself so the whole HF image
+        # prefix is marked bidirectional by the multimodal prefix-LM mask.
+        return False
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> list[PromptUpdate]:
+        """Replace the BOS + <image> placeholder with BOS followed by image tokens."""
+        largest = self.info.get_image_size_with_most_features()
+        total_tokens = self.info.get_num_image_tokens(
+            image_width=largest.width,
+            image_height=largest.height,
+        )
+        target_ids = self.bos_image_placeholder_tokens
+        # First id is BOS, last id is the image token; the expansion keeps one
+        # BOS and fills the remaining total_tokens - 1 slots with image tokens.
+        expansion = [target_ids[0], *([target_ids[-1]] * (total_tokens - 1))]
+        replacement = PromptReplacement(
+            modality="image",
+            target=target_ids,
+            replacement=PromptUpdateDetails(full=expansion),
+        )
+
+        return [replacement]
+
+
+# ============================================================================
+# Main Model
+# ============================================================================
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Moondream3MultiModalProcessor,
+    info=Moondream3ProcessingInfo,
+    dummy_inputs=Moondream3DummyInputsBuilder,
+)
+class Moondream3ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    """Moondream3 multimodal model for causal language modeling.
+
+    Aphrodite supports the standard autoregressive Moondream3 query and caption
+    prompt formats. The region-module point/detect skills require custom
+    coordinate decoding and are intentionally not exposed here.
+    """
+
+    supports_multimodal = True
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(
+        self,
+        *,
+        aphrodite_config: AphroditeConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        hf_config = aphrodite_config.model_config.hf_config
+        cache_config = aphrodite_config.cache_config
+        quant_config = aphrodite_config.quant_config
+
+        # Use the config as-is when it is already a Moondream3Config; otherwise
+        # rebuild one from its `config` attribute (empty dict if it has none).
+        if not isinstance(hf_config, Moondream3Config):
+            self.config = Moondream3Config(config=getattr(hf_config, "config", {}))
+        else:
+            self.config = hf_config
+        vision_cfg = self.config.vision_config
+        text_cfg = self.config.text_config
+
+        with self._mark_tower_model(aphrodite_config, "image"):
+            # Image side: encoder tower plus the projection into text space.
+            self.vision = Moondream3VisionEncoder(
+                config=vision_cfg,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "vision"),
+            )
+
+            self.vision_proj = Moondream3VisionProjection(
+                input_dim=vision_cfg.enc_dim,
+                inner_dim=vision_cfg.proj_inner_dim,
+                output_dim=text_cfg.dim,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "vision_proj"),
+            )
+
+        with self._mark_language_model(aphrodite_config):
+            # Language side: decoder stack and a biased LM head.
+            self.text = Moondream3TextModel(
+                config=text_cfg,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "text"),
+            )
+
+            self.lm_head = ParallelLMHead(
+                text_cfg.vocab_size,
+                text_cfg.dim,
+                bias=True,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+
+        self.logits_processor = LogitsProcessor(text_cfg.vocab_size)
+        self.make_empty_intermediate_tensors = self.text.make_empty_intermediate_tensors
+
+        # Answer token id: prefer the Moondream3 config, then the raw HF config,
+        # and finally fall back to 3.
+        fallback_answer_id = getattr(hf_config, "answer_token_id", 3)
+        self._answer_id = getattr(self.config, "answer_token_id", fallback_answer_id)
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        # Only the image modality has a placeholder; the index `i` is ignored.
+        is_image = modality == "image"
+        return "<image>" if is_image else None
+
+    def get_language_model(self) -> nn.Module:
+        return self.text  # the text decoder stored on this module as `text`
+
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        return num_image_tokens  # identity: the count is passed through unchanged
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        return num_vision_tokens  # identity: the count is passed through unchanged
+
+    def _split_pixel_values(
+        self,
+        pixel_values: object,
+    ) -> list[torch.Tensor]:
+        """Normalize `pixel_values` into one contiguous 4-D crop tensor per image.
+
+        Accepts a 5-D tensor [num_images, num_crops, C, H, W] or a list/tuple of
+        4-D tensors [num_crops, C, H, W]; anything else raises.
+        """
+        if isinstance(pixel_values, torch.Tensor):
+            if pixel_values.dim() == 5:
+                return [per_image.contiguous() for per_image in pixel_values]
+            raise ValueError(
+                "Expected `pixel_values` tensor with shape "
+                "[num_images, num_crops, C, H, W], got "
+                f"{tuple(pixel_values.shape)}."
+            )
+
+        if not isinstance(pixel_values, (list, tuple)):
+            raise TypeError(f"pixel_values must be a tensor or a sequence of tensors, got {type(pixel_values)!r}.")
+
+        for crops in pixel_values:
+            if not isinstance(crops, torch.Tensor):
+                raise TypeError(f"Expected each `pixel_values` element to be a tensor, got {type(crops)!r}.")
+            if crops.dim() != 4:
+                raise ValueError(f"Unsupported pixel_values element shape {tuple(crops.shape)}.")
+        return [crops.contiguous() for crops in pixel_values]
+
+    def _split_tilings(
+        self,
+        tilings: object,
+        expected: int,
+    ) -> list[tuple[int, int] | None]:
+        """Normalize `tilings` into `expected` entries, each an int pair or None."""
+        if tilings is None:
+            return [None] * expected
+
+        if isinstance(tilings, torch.Tensor):
+            if tilings.dim() != 2 or tilings.shape[1] != 2:
+                raise ValueError(f"Expected `tilings` tensor with shape [num_images, 2], got {tuple(tilings.shape)}.")
+            entries = tilings.tolist()
+        elif isinstance(tilings, (list, tuple)):
+            entries = list(tilings)
+        else:
+            raise TypeError(f"tilings must be None, a tensor or a sequence of tuples, got {type(tilings)!r}.")
+
+        # One tiling entry is required per pixel_values entry.
+        if len(entries) != expected:
+            raise ValueError(
+                f"Mismatch between the number of pixel_values entries ({expected}) and tilings ({len(entries)})."
+            )
+
+        def as_pair(entry: object) -> tuple[int, int] | None:
+            # Tensor entries are converted to plain lists before the length check.
+            if entry is None:
+                return None
+            pair = entry.tolist() if isinstance(entry, torch.Tensor) else entry
+            if not (isinstance(pair, (list, tuple)) and len(pair) == 2):
+                raise ValueError(f"Each tiling entry must be a pair of integers, got {pair!r}.")
+            return (int(pair[0]), int(pair[1]))
+
+        return [as_pair(entry) for entry in entries]
+
+    def _parse_image_inputs(self, **kwargs: object) -> list[Moondream3ImageInput]:
+        """Pair each image's crop tensor with its tiling; [] when no pixel_values."""
+        raw_pixels = kwargs.get("pixel_values")
+        if raw_pixels is None:
+            return []
+
+        per_image_crops = self._split_pixel_values(raw_pixels)
+        per_image_tiling = self._split_tilings(kwargs.get("tilings"), len(per_image_crops))
+
+        for crops in per_image_crops:
+            if crops.dim() != 4:
+                raise ValueError(f"Expected 4D tensor for crops, got {tuple(crops.shape)}.")
+        pairs = zip(per_image_crops, per_image_tiling)
+        return [Moondream3ImageInput(pixel_values=crops, tiling=tiling) for crops, tiling in pairs]
+
+    def _encode_image_input(self, image_input: Moondream3ImageInput) -> torch.Tensor:
+        pixel_values = image_input.pixel_values
+        if pixel_values.dim() != 4:
+            raise ValueError(f"Expected 4D tensor for crops, got {tuple(pixel_values.shape)}.")
+
+        device = self.vision.patch_emb.weight.device
+        dtype = self.vision.patch_emb.weight.dtype
+        pixel_values = pixel_values.to(device=device, dtype=dtype)
+
+        features = self.vision(pixel_values)
+
+        # Grid size = crop_size / patch_size (e.g., 378 / 14 = 27)
+        grid_size = self.config.vision_config.crop_size // self.config.vision_config.enc_patch_size
+        enc_dim = self.config.vision_config.enc_dim
+        global_features = features[0]
+
+        if features.shape[0] > 1:
+            if image_input.tiling is None:
+                raise ValueError("Missing tiling metadata for multi-crop Moondream image.")
+            local = features[1:].contiguous().view(-1, grid_size, grid_size, enc_dim)
+            reconstructed = reconstruct_from_crops(
+                local,
+                image_input.tiling,
+                overlap_margin=self.config.vision_config.overlap_margin,
+                patch_size=1,
+            )
+        else:
+            reconstructed = global_features.view(grid_size, grid_size, enc_dim)
+
+        recon = reconstructed.permute(2, 0, 1).contiguous()
+        # Mirror HF reference behavior: reconstructed local features are pooled
+        # to enc_n_layers x enc_n_layers. For moondream3-preview this is 27x27.
+        pooled_size = self.config.vision_config.enc_n_layers
+        if pooled_size != grid_size:
+            logger.warning_once(
+                "Moondream3 pooled_size (%d) differs from crop grid (%d). "
+                "Using enc_n_layers to match HF reference behavior.",
+                pooled_size,
+                grid_size,
+            )
+        recon = F.adaptive_avg_pool2d(recon, output_size=(pooled_size, pooled_size))
+        recon = recon.permute(1, 2, 0).contiguous().view(-1, enc_dim)
+
+        combined = torch.cat([global_features, recon], dim=-1).unsqueeze(0)
+        projected = self.vision_proj(combined).squeeze(0)
+
+        # Note: Vision embeddings are already synchronized across TP ranks
+        # because the vision projection uses RowParallelLinear which performs
+        # all-reduce internally, ensuring identical outputs on all ranks.
+
+        return projected
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        """Generate the HF image prefix: BOS embedding + 729 image embeddings."""
+        image_inputs = self._parse_image_inputs(**kwargs)
+        if not image_inputs:
+            return []
+
+        device = self.vision.patch_emb.weight.device
+        bos_ids = torch.tensor([self.config.bos_token_id], device=device)
+        bos_embedding = self.text.embed_input_ids(bos_ids)
+
+        embeddings: list[torch.Tensor] = []
+        for image_input in image_inputs:
+            image_embeddings = self._encode_image_input(image_input)
+            embeddings.append(torch.cat([bos_embedding.to(image_embeddings.dtype), image_embeddings]))
+        return embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.text(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        if logits is not None:
+            logits[:, self._answer_id] = float("-inf")
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load weights with remapping from HuggingFace format."""
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        # Remap each checkpoint name to its parameter name, shard per TP rank, then load
+
+        for name, loaded_weight in weights:
+            # Map from HF naming to Aphrodite naming
+            # model.vision.* -> vision.*
+            # model.text.* -> text.*
+            if name.startswith("model."):
+                name = name[6:]  # Remove "model." prefix
+
+            # Specific name mappings
+            # Vision projection: vision.proj_mlp.fc1 -> vision_proj.fc1
+            name = name.replace("vision.proj_mlp.", "vision_proj.")
+
+            # Text embedding: text.wte (no suffix) -> text.wte.weight
+            if name == "text.wte":
+                name = "text.wte.weight"
+
+            # LM head: text.lm_head -> lm_head
+            name = name.replace("text.lm_head.", "lm_head.")
+
+            # Attention mapping
+            name = name.replace(".attn.qkv.", ".attn.qkv_proj.")
+            name = name.replace(".attn.proj.", ".attn.out_proj.")
+
+            # Tau attention scaling weights
+            # HF format: .attn.tau.alpha -> .attn.tau_alpha
+            name = name.replace(".attn.tau.alpha", ".attn.tau_alpha")
+            name = name.replace(".attn.tau.wq", ".attn.tau_wq")
+            name = name.replace(".attn.tau.wv", ".attn.tau_wv")
+
+            # MoE router mapping: mlp.router -> mlp.gate
+            name = name.replace(".mlp.router.", ".mlp.gate.")
+
+            # Handle MoE expert weights for layers 4+ with expert parallelism
+            # fc1.weight: [n_experts, expert_inner_dim * 2, hidden_size] (gate+up)
+            # fc2.weight: [n_experts, hidden_size, expert_inner_dim] (down)
+            # Each GPU stores n_experts/tp_size experts
+            # Note: Only 3D weights are MoE, 2D weights are standard MLP
+            if ".mlp.fc1.weight" in name and loaded_weight.dim() == 3:
+                from aphrodite.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_experts = loaded_weight.shape[0]
+                experts_per_rank = num_experts // tp_size
+                expert_start = tp_rank * experts_per_rank
+                expert_end = expert_start + experts_per_rank
+                # Shard by expert dimension
+                loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
+                # Map to our custom MoE format: mlp.fc1_weight
+                name = name.replace(".mlp.fc1.weight", ".mlp.fc1_weight")
+
+            if ".mlp.fc2.weight" in name and loaded_weight.dim() == 3:
+                from aphrodite.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_experts = loaded_weight.shape[0]
+                experts_per_rank = num_experts // tp_size
+                expert_start = tp_rank * experts_per_rank
+                expert_end = expert_start + experts_per_rank
+                # Shard by expert dimension
+                loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
+                # Map to our custom MoE format: mlp.fc2_weight
+                name = name.replace(".mlp.fc2.weight", ".mlp.fc2_weight")
+
+            # Handle tau weights with tensor parallelism
+            # tau_alpha: [num_heads] -> [num_heads/tp]
+            # tau_wq: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim]
+            # tau_wv: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim]
+            if ".tau_alpha" in name:
+                from aphrodite.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_heads = loaded_weight.shape[0]
+                heads_per_partition = num_heads // tp_size
+                start = tp_rank * heads_per_partition
+                end = start + heads_per_partition
+                loaded_weight = loaded_weight[start:end].contiguous()
+
+            if ".tau_wq" in name or ".tau_wv" in name:
+                from aphrodite.distributed import get_tensor_model_parallel_rank
+
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                num_heads, qkv_dim = loaded_weight.shape
+                heads_per_partition = num_heads // tp_size
+                # Only shard by head dimension, keep full qkv_dim for all-gather
+                head_start = tp_rank * heads_per_partition
+                head_end = head_start + heads_per_partition
+                loaded_weight = loaded_weight[head_start:head_end, :].contiguous()
+
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+
+        return loaded_params
diff --git a/aphrodite/model_executor/models/qwen2.py b/aphrodite/model_executor/models/qwen2.py
index 4b5a0f7348..da373516f9 100644
--- a/aphrodite/model_executor/models/qwen2.py
+++ b/aphrodite/model_executor/models/qwen2.py
@@ -303,44 +303,15 @@ def forward(
         return hidden_states, residual
 
 
-def qwen_2_model_invariants(
-    input_ids: torch.Tensor,
-    positions: torch.Tensor,
-    intermediate_tensors: IntermediateTensors | None = None,
-    inputs_embeds: torch.Tensor | None = None,
-):
-    """Shape invariants for Qwen2Model Model, those are translated to
-    runtime assertions for unbacked dynamic shapes and are compiled away for
-    backed"""
-    # All these should be equal.
-    # input_ids.size()[0]
-    # positions.size()[-1]
-    # intermediate_tensors["hidden_states"].size()[0]
-    # inputs_embeds.size()[0]
-    torch._check(input_ids.size()[0] == positions.size()[-1])
-    if intermediate_tensors is not None:
-        torch._check(input_ids.size()[0] == intermediate_tensors["hidden_states"].size()[0])
-
-    if inputs_embeds is not None:
-        torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
-
-    # Hidden dimensions should match (hidden_size)
-    # intermediate_tensors["hidden_states"].size()[1]
-    # inputs_embeds.size()[1]
-    if inputs_embeds is not None and intermediate_tensors is not None:
-        torch._check(inputs_embeds.size()[1] == intermediate_tensors["hidden_states"].size()[1])
-
-
 @support_torch_compile(
     dynamic_arg_dims={
-        "input_ids": 0,
+        "input_ids": {0: "b"},
         # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
         # otherwise (seq_len, ).
-        "positions": -1,
-        "intermediate_tensors": 0,
-        "inputs_embeds": 0,
-    },
-    shape_invariants=qwen_2_model_invariants,
+        "positions": {-1: "b"},
+        "intermediate_tensors": {0: "b"},
+        "inputs_embeds": {0: "b"},
+    }
 )
 class Qwen2Model(nn.Module, EagleModelMixin):
     def __init__(
diff --git a/aphrodite/model_executor/models/registry.py b/aphrodite/model_executor/models/registry.py
index 132860815e..dc446a1b34 100644
--- a/aphrodite/model_executor/models/registry.py
+++ b/aphrodite/model_executor/models/registry.py
@@ -91,6 +91,7 @@
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
     "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
+    "CohereMoeForCausalLM": ("cohere_moe", "CohereMoeForCausalLM"),
     "CwmForCausalLM": ("llama", "LlamaForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
@@ -152,6 +153,7 @@
     "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"),
     "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
     "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"),
+    "LagunaForCausalLM": ("laguna", "LagunaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
     "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),
     # For decapoda-research/llama-*
@@ -173,7 +175,8 @@
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "MiMoForCausalLM": ("mimo", "MiMoForCausalLM"),
-    "MiMoV2FlashForCausalLM": ("mimo_v2_flash", "MiMoV2FlashForCausalLM"),
+    "MiMoV2FlashForCausalLM": ("mimo_v2", "MiMoV2FlashForCausalLM"),
+    "MiMoV2ForCausalLM": ("mimo_v2", "MiMoV2ForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
     "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"),
     "NemotronHPuzzleForCausalLM": ("nemotron_h", "NemotronHForCausalLM"),
@@ -470,6 +473,7 @@
     ),
     "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),
     "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
+    "MiMoV2OmniForCausalLM": ("mimo_v2_omni", "MiMoV2OmniForCausalLM"),
     "MiniMaxVL01ForConditionalGeneration": (
         "minimax_vl_01",
         "MiniMaxVL01ForConditionalGeneration",
@@ -482,6 +486,8 @@
     ),
     "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
     "Molmo2ForConditionalGeneration": ("molmo2", "Molmo2ForConditionalGeneration"),
+    "Moondream3ForCausalLM": ("moondream3", "Moondream3ForCausalLM"),
+    "HfMoondream": ("moondream3", "Moondream3ForCausalLM"),
     "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
     "NemotronH_Nano_Omni_Reasoning_V3": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
     "NemotronH_Super_Omni_Reasoning_V3": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
@@ -572,6 +578,8 @@
 _SPECULATIVE_DECODING_MODELS = {
     "ExtractHiddenStatesModel": ("extract_hidden_states", "ExtractHiddenStatesModel"),
     "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"),
+    "MiMoV2MTPModel": ("mimo_v2_mtp", "MiMoV2MTP"),
+    "MiMoV2OmniMTPModel": ("mimo_v2_mtp", "MiMoV2OmniMTP"),
     "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"),
     "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"),
     "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"),
@@ -581,6 +589,7 @@
     "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
+    "EagleMistralForCausalLM": ("mistral_eagle", "EagleMistralForCausalLM"),
     "EagleMistralLarge3ForCausalLM": (
         "mistral_large_3_eagle",
         "EagleMistralLarge3ForCausalLM",
diff --git a/aphrodite/multimodal/cache.py b/aphrodite/multimodal/cache.py
index 7bff5de34b..e88a7eb2ed 100644
--- a/aphrodite/multimodal/cache.py
+++ b/aphrodite/multimodal/cache.py
@@ -502,11 +502,25 @@ def get_and_update_item(
 
             self._p0_cache[mm_hash] = prompt_updates
             return self.address_as_item(address, monotonic_id), prompt_updates
-        except (ValueError, MemoryError) as e:
-            # put may fail if the object is too large or
-            # the cache is full.
-            # In this case we log the error and keep the original mm_input.
-            logger.debug("Failed to cache mm_input with hash %s: %s", mm_hash, e)
+        except ValueError as e:
+            # `put` raises ValueError either for an oversize item or for a
+            # duplicate key (concurrent insert); the latter is benign so we
+            # only warn on the oversize case. Subsequent UUID-only requests
+            # for an oversize item will fail with a cache miss.
+            if "already exists" not in str(e):
+                logger.warning_once(
+                    "mm_input %s too large to cache; raise --mm-shm-cache-max-object-size-mb. (%s)",
+                    mm_hash,
+                    str(e),
+                )
+            return mm_item
+        except MemoryError as e:
+            # Cache full and protected items prevent eviction.
+            logger.debug(
+                "mm_input %s not cached; shm cache full, consider raising --mm-processor-cache-gb. (%s)",
+                mm_hash,
+                str(e),
+            )
             return mm_item
 
     @override
diff --git a/aphrodite/multimodal/registry.py b/aphrodite/multimodal/registry.py
index 9bfebd2336..11fc205341 100644
--- a/aphrodite/multimodal/registry.py
+++ b/aphrodite/multimodal/registry.py
@@ -203,7 +203,8 @@ def create_processor(
         Create a multi-modal processor for a specific model and tokenizer.
         """
         if not model_config.is_multimodal_model:
-            raise ValueError(f"{model_config.model} is not a multimodal model")
+            model_name = model_config.served_model_name or model_config.model
+            raise ValueError(f"{model_name} is not a multimodal model")
 
         model_cls = self._get_model_cls(model_config)
         factories = model_cls._processor_factory
diff --git a/aphrodite/parser/abstract_parser.py b/aphrodite/parser/abstract_parser.py
index 07d0508517..f994972a4d 100644
--- a/aphrodite/parser/abstract_parser.py
+++ b/aphrodite/parser/abstract_parser.py
@@ -38,6 +38,10 @@
 from aphrodite.reasoning.abs_reasoning_parsers import ReasoningParser
 from aphrodite.tokenizers import TokenizerLike
 from aphrodite.tool_parsers.abstract_tool_parser import ToolParser
+from aphrodite.tool_parsers.streaming import (
+    extract_named_tool_call_streaming,
+    extract_required_tool_call_streaming,
+)
 from aphrodite.tool_parsers.utils import Tool
 from aphrodite.utils import random_uuid
 
@@ -53,6 +57,11 @@ class StreamState:
     prompt_reasoning_checked: bool = False
     previous_text: str = ""
     previous_token_ids: list[int] = field(default_factory=list)
+    history_tool_call_cnt: int = 0
+    tool_call_id_type: str = "random"
+    # only used for "required" and "named tool" choices,
+    # tracks whether function name has been fully returned in the stream yet
+    function_name_returned: bool = False
 
 
 class Parser:
@@ -413,6 +422,13 @@ def extract_response_outputs(
 
         return outputs
 
+    def _get_function_name(self, request: ChatCompletionRequest | ResponsesRequest) -> str:
+        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
+            return request.tool_choice.name
+        if request.tool_choice and isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
+            return request.tool_choice.function.name
+        raise ValueError("Invalid tool_choice for function name extraction.")
+
     def _parse_tool_calls(
         self,
         request: ResponsesRequest,
@@ -430,16 +446,13 @@ def _parse_tool_calls(
         """
         function_calls: list[FunctionCall] = []
 
-        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
-            # Forced Function Call (Responses API style)
-            assert content is not None
-            function_calls.append(FunctionCall(name=request.tool_choice.name, arguments=content))
-            return function_calls, None  # Clear content since tool is called.
-
-        if request.tool_choice and isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
-            # Forced Function Call (Chat Completion API style)
+        if request.tool_choice and isinstance(
+            request.tool_choice,
+            (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
+        ):
+            # Forced Function Call
             assert content is not None
-            function_calls.append(FunctionCall(name=request.tool_choice.function.name, arguments=content))
+            function_calls.append(FunctionCall(name=self._get_function_name(request), arguments=content))
             return function_calls, None  # Clear content since tool is called.
 
         if request.tool_choice == "required":
@@ -544,6 +557,55 @@ def extract_tool_calls_streaming(
             request,
         )
 
+    def _extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest | ResponsesRequest,
+        # The following parameters are used for "required" and named tool choice
+        # parsing and are tracked in StreamState across streaming deltas.
+        tool_call_idx: int | None = None,
+        tool_call_id_type: str = "random",
+        function_name_returned: bool = False,
+    ) -> tuple[DeltaMessage | None, bool]:
+        if request.tool_choice and isinstance(
+            request.tool_choice,
+            (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
+        ):
+            delta_message, function_name_returned = extract_named_tool_call_streaming(
+                delta_text=delta_text,
+                function_name=self._get_function_name(request),
+                function_name_returned=function_name_returned,
+                tool_call_idx=tool_call_idx,
+                tool_call_id_type=tool_call_id_type,
+                tokenizer=self.model_tokenizer,
+            )
+            return delta_message, function_name_returned
+
+        if request.tool_choice == "required":
+            delta_message, function_name_returned = extract_required_tool_call_streaming(
+                previous_text=previous_text,
+                current_text=current_text,
+                delta_text=delta_text,
+                function_name_returned=function_name_returned,
+                tool_call_idx=tool_call_idx,
+                tool_call_id_type=tool_call_id_type,
+            )
+            return delta_message, function_name_returned
+        return self.extract_tool_calls_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+            request,  # type: ignore[arg-type]
+        ), False
+
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         if self._reasoning_parser is None:
             return False
@@ -614,7 +676,8 @@ def parse_delta(
                 state.previous_token_ids = []
                 delta_text = current_text
                 delta_token_ids = current_token_ids
-            delta_message = self.extract_tool_calls_streaming(
+
+            delta_message, state.function_name_returned = self._extract_tool_calls_streaming(
                 previous_text=state.previous_text,
                 current_text=current_text,
                 delta_text=delta_text,
@@ -622,6 +685,9 @@ def parse_delta(
                 current_token_ids=current_token_ids,
                 delta_token_ids=delta_token_ids,
                 request=request,  # type: ignore[arg-type]
+                tool_call_idx=state.history_tool_call_cnt,
+                tool_call_id_type=state.tool_call_id_type,
+                function_name_returned=state.function_name_returned,
             )
 
         # No parsers: pass through as content
diff --git a/aphrodite/platforms/cpu.py b/aphrodite/platforms/cpu.py
index 738bb31937..0ae45499a0 100644
--- a/aphrodite/platforms/cpu.py
+++ b/aphrodite/platforms/cpu.py
@@ -16,7 +16,6 @@
     get_memory_node_info,
 )
 from aphrodite.utils.mem_constants import GiB_bytes
-from aphrodite.utils.torch_utils import is_quantized_kv_cache
 from aphrodite.v1.attention.backends.registry import AttentionBackendEnum
 
 from .interface import CpuArchEnum, Platform, PlatformEnum
@@ -126,16 +125,6 @@ def check_and_update_config(cls, aphrodite_config: AphroditeConfig) -> None:
         scheduler_config = aphrodite_config.scheduler_config
         # async scheduling is not required on CPU
         scheduler_config.async_scheduling = False
-        if (scheduler_config.enable_chunked_prefill or cache_config.enable_prefix_caching) and is_quantized_kv_cache(
-            cache_config.cache_dtype
-        ):
-            raise RuntimeError(
-                "Chunked-prefill and prefix-cache on the CPU backend is not compatible with FP8 KV cache."
-            )
-
-        if is_quantized_kv_cache(cache_config.cache_dtype):
-            logger.warning("CPU backend doesn't support KV cache quantization fallback to auto.")
-            cache_config.cache_dtype = "auto"
 
         parallel_config = aphrodite_config.parallel_config
         # OMP requires the MP executor to function correctly, UniProc is not
@@ -429,6 +418,8 @@ def pack_kv_cache(
         block_offsets = torch.arange(block_size, device="cpu", dtype=torch.long)
         num_blocks = len(block_ids)
         slot_mapping = (block_offsets.reshape(1, block_size) + indices.reshape(num_blocks, 1) * block_size).flatten()
+        if key_cache.dtype == torch.uint8:
+            raise NotImplementedError("FP8 KV cache is not yet supported with KV transfer on CPU")
         cpu_attn_reshape_and_cache(
             key,
             value,
diff --git a/aphrodite/platforms/interface.py b/aphrodite/platforms/interface.py
index eff61b1798..e1346bc555 100644
--- a/aphrodite/platforms/interface.py
+++ b/aphrodite/platforms/interface.py
@@ -86,6 +86,9 @@ def __gt__(self, other: Any) -> bool:
             return NotImplemented
         return (self.major, self.minor) > (other.major, other.minor)
 
+    def __hash__(self) -> int:
+        return hash((self.major, self.minor))
+
     def as_version_str(self) -> str:
         return f"{self.major}.{self.minor}"
 
diff --git a/aphrodite/platforms/rocm.py b/aphrodite/platforms/rocm.py
index 559f6cda0d..1b6eb43dcb 100644
--- a/aphrodite/platforms/rocm.py
+++ b/aphrodite/platforms/rocm.py
@@ -409,10 +409,17 @@ class RocmPlatform(Platform):
         "gguf",
         "quark",
         "mxfp4",
-        "gpt_oss_mxfp4",
+        "mxfp8",
         "torchao",
         "bitsandbytes",
+        "modelopt",
         "modelopt_fp4",
+        "modelopt_mxfp8",
+        "modelopt_mixed",
+        "fp8_per_tensor",
+        "fp8_per_block",
+        "online",
+        "gpt_oss_mxfp4",
     ]
 
     @classmethod
@@ -754,9 +761,9 @@ def get_punica_wrapper(cls) -> str:
 
     @classmethod
     def get_current_memory_usage(cls, device: torch.types.Device | None = None) -> float:
+        torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
-        free_mem, total_mem = torch.cuda.mem_get_info(device)
-        return total_mem - free_mem
+        return torch.cuda.max_memory_allocated(device)
 
     @classmethod
     def get_device_communicator_cls(cls) -> str:
diff --git a/aphrodite/reasoning/__init__.py b/aphrodite/reasoning/__init__.py
index 8d69611f55..a6a2545b31 100644
--- a/aphrodite/reasoning/__init__.py
+++ b/aphrodite/reasoning/__init__.py
@@ -35,6 +35,18 @@
         "deepseek_v3_reasoning_parser",
         "DeepSeekV3ReasoningParser",
     ),
+    "poolside_v1": (
+        "poolside_v1_reasoning_parser",
+        "PoolsideV1ReasoningParser",
+    ),
+    "cohere_command3": (
+        "cohere_command_reasoning_parser",
+        "CohereCommand3ReasoningParser",
+    ),
+    "cohere_command4": (
+        "cohere_command_reasoning_parser",
+        "CohereCommand4ReasoningParser",
+    ),
     "ernie45": (
         "ernie45_reasoning_parser",
         "Ernie45ReasoningParser",
diff --git a/aphrodite/reasoning/cohere_command_reasoning_parser.py b/aphrodite/reasoning/cohere_command_reasoning_parser.py
new file mode 100644
index 0000000000..564374cc18
--- /dev/null
+++ b/aphrodite/reasoning/cohere_command_reasoning_parser.py
@@ -0,0 +1,519 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import json
+from collections.abc import Mapping, Sequence
+from typing import Any, NamedTuple, TypedDict, TypeGuard
+
+import regex as re
+import xgrammar as xgr
+
+try:
+    from cohere_melody import PyFilter, PyFilterOptions
+except ImportError as e:
+    raise ImportError(
+        "The Cohere reasoning parser requires the `cohere_melody` "
+        "package, which is not installed. Install it with:\n"
+        "    pip install cohere_melody"
+    ) from e
+
+
+from aphrodite.entrypoints.mcp.tool_server import ToolServer
+from aphrodite.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from aphrodite.entrypoints.openai.engine.protocol import (
+    AnyResponseFormat,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
+from aphrodite.entrypoints.openai.responses.protocol import ResponsesRequest
+from aphrodite.reasoning import ReasoningParser
+from aphrodite.sampling_params import StructuredOutputsParams
+from aphrodite.tokenizers import TokenizerLike
+
+REPLACEMENT_CHAR = "\ufffd"
+
+
+class CohereTagRegistry(NamedTuple):
+    """A single ``structural_tag`` begin("trigger")/end pair."""
+
+    trigger: str
+    end: str
+
+
+class CohereTagStyle(NamedTuple):
+    """The structural tags style for a given model architecture."""
+
+    json: CohereTagRegistry
+    tools: CohereTagRegistry
+
+
+class CohereNormalizedTool(TypedDict):
+    """A tool definition normalized to the shape ``collect_tool_schema`` expects.
+
+    ``parameters`` is a JSON Schema object (possibly empty) describing the tool's
+    call signature.
+    """
+
+    name: str
+    parameters: dict[str, Any]
+
+
+COMMAND_A_TOOLS_TAG = CohereTagRegistry(trigger="<|START_ACTION|>", end="<|END_ACTION|>")
+COMMAND_A_JSON_TAG = CohereTagRegistry(trigger="<|START_RESPONSE|>", end="<|END_RESPONSE|>")
+
+MODEL_TO_TAG_STYLE: dict[str, CohereTagStyle] = {
+    "Cohere2ForCausalLM": CohereTagStyle(json=COMMAND_A_JSON_TAG, tools=COMMAND_A_TOOLS_TAG),
+    "Cohere2VisionForConditionalGeneration": CohereTagStyle(json=COMMAND_A_JSON_TAG, tools=COMMAND_A_TOOLS_TAG),
+}
+
+
+def collect_tool_schema(tool_schema: list[CohereNormalizedTool]) -> str:
+    """Build an xgrammar EBNF grammar that matches a JSON array of tool calls.
+
+    The grammar shape is architecture-independent; callers are responsible for
+    wrapping it in the correct structural tag (see ``CohereTagStyle.tools``).
+    Returns the grammar text; each tool's rules are prefixed with its name.
+    """
+    tool_dictionary: dict[str, str] = {}
+    for tool in tool_schema:
+        tool_name = tool["name"]
+        # Serialize via ``json.dumps`` (not string interpolation) so a tool name
+        # containing quotes or backslashes still produces a valid JSON schema.
+        properties = {
+            "tool_call_id": {"type": "string", "pattern": "^[0-9]+$"},
+            "tool_name": {"type": "string", "const": tool_name},
+            "parameters": tool["parameters"],
+        }
+        json_schema = json.dumps({"type": "object", "properties": properties})
+        tool_grammar = str(xgr.Grammar.from_json_schema(json_schema))
+        # Prefix every rule name in one pass: text inserted for one rule is never
+        # rescanned for another, and the callable avoids template escapes.
+        rule_names = set(re.findall(r"\b(\w+)\s*::=", tool_grammar))
+        tool_grammar = re.sub(
+            r"\b\w+\b",
+            lambda m, names=rule_names, prefix=tool_name: prefix + m.group(0) if m.group(0) in names else m.group(0),
+            tool_grammar,
+        )
+        tool_dictionary[tool_name] = f"{tool_name} ::= {tool_name}root\n{tool_grammar}"
+    # Emitted grammar shape:
+    #   root  ::= tools
+    #   tools ::= ws "[" ws tool ws ("," ws tool)* ws "]" ws
+    #   ws    ::= (" " | "\t" | "\n")*
+    #   tool  ::= <tool_a> | <tool_b> | ...         (one alternative per input)
+    #   <tool_x>     ::= <tool_x>root               (per-tool xgrammar rules)
+    #   <tool_x>root ::= ...                        (from xgr.Grammar.from_json_schema)
+    tool_alternatives = "tool ::= " + " | ".join(tool_dictionary.keys())
+    tool_rules = "\n    ".join(tool_dictionary.values())
+    grammar = f"""root ::= tools
+    tools ::= ws "[" ws tool ws ("," ws tool)*  ws "]" ws
+    ws    ::= (" " | "\\t" | "\\n")*
+    {tool_alternatives}
+    {tool_rules}
+    """
+    return grammar
+
+
+def _tool_definitions_to_schema_list(
+    tools: str | list[Any],
+) -> list[CohereNormalizedTool]:
+    """
+    Build the list of ``CohereNormalizedTool`` dicts expected by
+    ``collect_tool_schema``.
+
+    Accepts:
+    - JSON string
+    - list of dicts with top-level ``name`` / ``parameters``
+    - list of Chat Completions-style ``{"type": "function", "function": {...}}``
+    - list of Pydantic models with ``model_dump()``
+    """
+    if isinstance(tools, str):
+        try:
+            parsed = json.loads(tools)
+        except json.JSONDecodeError:
+            return []
+        if not isinstance(parsed, list):
+            return []
+    else:
+        parsed = list(tools)
+
+    out: list[CohereNormalizedTool] = []
+    for raw in parsed:
+        t = raw.model_dump() if hasattr(raw, "model_dump") else raw
+        if not isinstance(t, dict):
+            continue
+        # Unwrap Chat Completions' ``{"type": "function", "function": {...}}``
+        # shape; otherwise take the dict as-is.
+        if t.get("type") == "function" and isinstance(t.get("function"), dict):
+            t = t["function"]
+        name = t.get("name")
+        if not isinstance(name, str):
+            continue
+        params = t.get("parameters")
+        out.append(
+            CohereNormalizedTool(
+                name=name,
+                parameters=params if isinstance(params, dict) else {},
+            )
+        )
+    return out
+
+
+def _has_effective_tools(
+    tools: str | list[Any] | None,
+) -> TypeGuard[str | list[Any]]:
+    """
+    True when ``tools`` contains at least one tool definition to convert.
+
+    ``ResponsesRequest`` defaults ``tools`` to ``[]``; ``ChatCompletionRequest``
+    uses ``None``. Both mean "no tools" here. Strings (e.g. a JSON blob) are
+    treated as effective only when non-blank.
+    """
+    if tools is None:
+        return False
+    if isinstance(tools, str):
+        return bool(tools.strip())
+    return len(tools) > 0
+
+
+# Builder: produces Aphrodite response_format in xgrammar's canonical format.
+# See xgrammar docs: type "structural_tag" with "format" = triggered_tags
+# and tag content type = json_schema | grammar.
+def convert_schema_to_structural_tags(
+    schema: dict | None = None,
+    tools: str | list[Any] | None = None,
+    model_architecture: str | None = None,
+) -> str | None:
+    """
+    Returns a response_format string accepted by xgrammar's structural tag format.
+    Uses the canonical shape: {"type": "structural_tag", "format": {...}} with
+    format.type "triggered_tags" and tag content type "json_schema" or "grammar".
+
+    Callers that are not on an engine path (e.g. the reasoning parser) must pass
+    ``model_architecture`` explicitly.
+    """
+    if model_architecture is None or model_architecture not in MODEL_TO_TAG_STYLE:
+        return None
+    style = MODEL_TO_TAG_STYLE[model_architecture]
+
+    tags: list[dict] = []
+
+    def _add_tag(tag: CohereTagRegistry, content: dict) -> None:
+        tags.append({"begin": tag.trigger, "content": content, "end": tag.end})
+
+    if schema is not None:
+        # Add the JSON-schema tag both for schema-only requests and for the
+        # "tools plus JSON mode" case (North use case: follow the schema when
+        # the model decides not to call any tool).
+        _add_tag(style.json, {"type": "json_schema", "json_schema": schema})
+
+    if _has_effective_tools(tools):
+        # ``tools`` may be a JSON string (poseidon / RESPONSE_FORMAT_TOOL_DEFINITIONS)
+        # or a list (Chat Completions ``request.tools`` as Pydantic models or dicts).
+        tool_schema_list = _tool_definitions_to_schema_list(tools)
+        if not tool_schema_list:
+            raise ValueError(
+                "No valid tool definitions could be parsed from the request for structural tag conversion."
+            )
+        tool_grammar = collect_tool_schema(tool_schema_list)
+        _add_tag(style.tools, {"type": "grammar", "grammar": tool_grammar})
+
+    if not tags:
+        return None
+    return json.dumps(
+        {
+            "type": "structural_tag",
+            "format": {
+                "type": "triggered_tags",
+                "triggers": [t["begin"] for t in tags],
+                "tags": tags,
+            },
+        }
+    )
+
+
+def _response_format_type(
+    response_format: AnyResponseFormat | dict | None,
+) -> str | None:
+    if response_format is None:
+        return None
+    if isinstance(response_format, dict):
+        t = response_format.get("type")
+        return t if isinstance(t, str) else None
+    return response_format.type
+
+
+def _maybe_parse_json_dict(value: Any) -> dict | None:
+    """If value is a JSON string, parse to dict; otherwise require dict."""
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        try:
+            parsed = json.loads(value)
+        except (TypeError, json.JSONDecodeError):
+            return None
+        return parsed if isinstance(parsed, dict) else None
+    return None
+
+
+def _unwrap_nested_schema(candidate: Any) -> dict | None:
+    """Return ``candidate`` as a dict, unwrapping a nested ``schema`` if present.
+
+    Returns ``None`` if ``candidate`` is not (and cannot be parsed into) a dict.
+    """
+    cand = _maybe_parse_json_dict(candidate)
+    if not isinstance(cand, dict):
+        return None
+    nested = cand.get("schema")
+    return nested if isinstance(nested, dict) else cand
+
+
+def _schema_from_json_schema_field(js_wr: Any) -> dict | None:
+    """
+    Extract the JSON Schema object from Chat Completions ``json_schema`` payload.
+
+    Accepts:
+    - ``JsonSchemaResponseFormat`` (Pydantic) with ``schema`` / ``json_schema`` field
+    - dict in OpenAI shape ``{"name": ..., "schema": {...}}``
+    - dict with ``json_schema`` key holding either the schema or a nested wrapper
+    - dict that is already a JSON Schema document (some clients omit the wrapper)
+    - JSON strings for any of the above
+    """
+    if js_wr is None:
+        return None
+
+    parsed_wr = _maybe_parse_json_dict(js_wr)
+    if parsed_wr is not None:
+        js_wr = parsed_wr
+
+    if hasattr(js_wr, "model_dump"):
+        for by_alias in (True, False):
+            try:
+                data = js_wr.model_dump(by_alias=by_alias, exclude_none=False)
+            except TypeError:
+                data = js_wr.model_dump(by_alias=by_alias)
+            out = _unwrap_nested_schema(data.get("schema") or data.get("json_schema"))
+            if out is not None:
+                return out
+        inner_attr = getattr(js_wr, "json_schema", None)
+        return inner_attr if isinstance(inner_attr, dict) else None
+
+    if isinstance(js_wr, dict):
+        for key in ("schema", "json_schema"):
+            out = _unwrap_nested_schema(js_wr.get(key))
+            if out is not None:
+                return out
+        return js_wr
+
+    return None
+
+
+def _schema_dict_from_chat_response_format(
+    rf: AnyResponseFormat | dict | None,
+) -> dict | None:
+    """JSON schema dict from Chat Completions ``request.response_format`` only."""
+    if rf is None:
+        return None
+    rf_type = _response_format_type(rf)
+    if rf_type == "json_object":
+        return {"type": "object"}
+    if rf_type != "json_schema":
+        return None
+    js_wr = rf.get("json_schema") if isinstance(rf, dict) else getattr(rf, "json_schema", None)
+    return _schema_from_json_schema_field(js_wr)
+
+
+def _schema_dict_from_structured_outputs(
+    so: StructuredOutputsParams | None,
+) -> dict | None:
+    """Schema dict from ``structured_outputs`` (``json`` / ``json_object``).
+
+    Same unwrapping as ``json_schema``. ``json`` is expected to be ``str`` or
+    ``dict`` (enforced by ``StructuredOutputsParams`` / request models); other
+    types raise ``ValueError`` only if a caller bypasses that validation.
+    """
+    if so is None:
+        return None
+    if so.json_object:
+        return {"type": "object"}
+    raw: Any = so.json
+    if raw is None:
+        return None
+
+    if hasattr(raw, "model_dump"):
+        out = _schema_from_json_schema_field(raw)
+        if out is None:
+            raise ValueError("structured_outputs.json model has no extractable JSON Schema.")
+        return out
+
+    if isinstance(raw, str):
+        if not raw.strip():
+            raise ValueError("structured_outputs.json cannot be empty.")
+        try:
+            raw = json.loads(raw)
+        except json.JSONDecodeError as e:
+            raise ValueError("structured_outputs.json must be valid JSON.") from e
+        if not isinstance(raw, dict):
+            raise ValueError("structured_outputs.json must decode to a JSON object.")
+
+    if isinstance(raw, Mapping):
+        body = raw if isinstance(raw, dict) else dict(raw)
+        return _schema_from_json_schema_field(body) or body
+
+    raise ValueError(f"structured_outputs.json has unsupported type {type(raw).__name__}.")
+
+
+class BaseCohereCommandReasoningParser(ReasoningParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        *args,
+        streaming_opts: PyFilterOptions,
+        unary_opts: PyFilterOptions,
+        **kwargs,
+    ):
+        super().__init__(tokenizer, *args, **kwargs)
+        self.end_token_id = tokenizer.convert_tokens_to_ids("<|END_THINKING|>")
+        self.unary_opts = unary_opts
+        self.melody_unary = PyFilter(unary_opts)
+        self.melody_streaming = PyFilter(streaming_opts)
+
+    @property
+    def reasoning_start_str(self) -> str | None:
+        return "<|START_THINKING|>"
+
+    @property
+    def reasoning_end_str(self) -> str | None:
+        return "<|END_THINKING|>"
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        r = self.melody_streaming.write_decoded(delta_text)
+        if r.content is None and r.reasoning is None and not r.tool_calls:
+            return None
+        msg = DeltaMessage()
+        if r.content is not None:
+            msg.content = r.content
+        if r.reasoning is not None:
+            msg.reasoning = r.reasoning
+        if r.tool_calls:
+            msg.tool_calls = [
+                DeltaToolCall(
+                    id=tc.id,
+                    index=tc.index,
+                    type="function",
+                    function=DeltaFunctionCall(name=tc.name, arguments=tc.arguments),
+                )
+                for tc in r.tool_calls
+            ]
+        return msg
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
+        result = self.melody_unary.process_full_text(model_output)
+        return result.reasoning, result.content
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        token_buf: list[int] = []
+        content_ids: list[int] = []
+        content_filter = PyFilter(self.unary_opts)
+        for t in input_ids:
+            token_buf.append(t)
+            s = self.model_tokenizer.decode(token_buf, skip_special_tokens=False)
+            if s.endswith(REPLACEMENT_CHAR):
+                continue
+            r = content_filter.write_decoded(s)
+            if r.content is not None:
+                content_ids.extend(token_buf)
+            token_buf = []
+        return content_ids
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        return any(tid == self.end_token_id for tid in reversed(input_ids))
+
+    def prepare_structured_tag(self, original_tag: str | None, tool_server: ToolServer | None) -> str | None:
+        # Responses API replaces ``structural_tag`` via the reasoning parser.
+        # Default ``ReasoningParser.prepare_structured_tag`` returns None, which
+        # would clear a Cohere tag produced in ``adjust_request`` and break
+        # ``StructuredOutputsParams`` validation. Preserve the existing tag.
+        return original_tag
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        so = request.structured_outputs
+        if so is not None and so.structural_tag:
+            return request
+        # Schema: prefer ``response_format`` (OpenAI Chat Completions), then
+        # ``structured_outputs.json`` / ``json_object`` (Aphrodite direct). Tools stay
+        # on ``request.tools``.
+        rf = request.response_format if isinstance(request, ChatCompletionRequest) else None
+        if rf is not None and _response_format_type(rf) == "structural_tag":
+            return request
+        model_architecture = self._model_config.architecture if self._model_config is not None else None
+        tools = request.tools
+        # ``response_format`` wins if both it and ``structured_outputs`` supply JSON.
+        schema = _schema_dict_from_chat_response_format(rf)
+        if schema is None:
+            schema = _schema_dict_from_structured_outputs(so)
+        if schema is None and not _has_effective_tools(tools):
+            return request
+        if model_architecture is None:
+            return request
+        result = convert_schema_to_structural_tags(
+            schema=schema,
+            tools=tools,
+            model_architecture=model_architecture,
+        )
+        if result is None:
+            # Unsupported architectures are not in ``MODEL_TO_TAG_STYLE``; conversion returns None.
+            raise ValueError(
+                "Failed to build structural_tag guided decoding constraints from "
+                "this request's JSON schema and/or tools. The configured model "
+                f"architecture ({model_architecture!r}) does not support Cohere "
+                "command structural tags, or the schema cannot be expressed in "
+                "that format."
+            )
+        request.structured_outputs = StructuredOutputsParams(structural_tag=result)
+        # Folded JSON constraints into ``structural_tag``; drop ``response_format``
+        # when it was the source so ``to_sampling_params`` does not also set ``json`` /
+        # ``json_object`` (mutually exclusive in ``StructuredOutputsParams``).
+        if isinstance(request, ChatCompletionRequest) and rf is not None:
+            rf_type = _response_format_type(rf)
+            if rf_type in ("json_schema", "json_object"):
+                request.response_format = None
+        return request
+
+
+class CohereCommand3ReasoningParser(BaseCohereCommandReasoningParser):
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(
+            tokenizer,
+            *args,
+            streaming_opts=PyFilterOptions().cmd3(),
+            unary_opts=PyFilterOptions().cmd3().no_tools(),
+            **kwargs,
+        )
+
+
+class CohereCommand4ReasoningParser(BaseCohereCommandReasoningParser):
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(
+            tokenizer,
+            *args,
+            streaming_opts=PyFilterOptions().cmd4(),
+            unary_opts=PyFilterOptions().cmd4().no_tools(),
+            **kwargs,
+        )
diff --git a/aphrodite/reasoning/olmo3_reasoning_parser.py b/aphrodite/reasoning/olmo3_reasoning_parser.py
index 5cc2a922ca..3eae98f1e2 100644
--- a/aphrodite/reasoning/olmo3_reasoning_parser.py
+++ b/aphrodite/reasoning/olmo3_reasoning_parser.py
@@ -212,22 +212,26 @@ class Olmo3ReasoningParser(ReasoningParser):
           token is missing from generation.
     """
 
+    think_start: str = r"<think>"
+    think_end: str = r"</think>"
+    # </think> is split in 3 by the pre-tokenizer, first split can be tokenized
+    # with an optional leading space, so there are 2 possible tokenizations
+    think_end_first_split: list[str] = [r"Ġ</", r"</"]
+    think_end_rest_split: list[str] = [r"think", r">"]
+    # notice that the first think is optional; this allows template to
+    # work in cases when we hardcode a <think> at the beginning of the
+    # reasoning template.
+    reasoning_regex: re.Pattern = re.compile(
+        rf"^(?:{think_start})?(?P<reasoning>.*?)"
+        rf"{think_end}(?P<content>.*)$",
+        re.DOTALL,
+    )
+
     def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
-
-        self.think_start = r"<think>"
-        self.think_end = r"</think>"
-
-        # notice that the first think is optional; this allows template to
-        # work in cases when we hardcode a <think> at the beginning of the
-        # reasoning template.
-        reasoning_expr = (
-            rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
-            rf"{self.think_end}(?P<content>.*)$"
-        )
-        self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)
-
         self.buffer = Olmo3ReasoningBuffer(think_start=self.think_start, think_end=self.think_end)
+        self.think_end_first_token_ids: list[int] = [self.vocab[token] for token in self.think_end_first_split]
+        self.think_end_rest_token_ids: list[int] = [self.vocab[token] for token in self.think_end_rest_split]
 
     @property
     def reasoning_start_str(self) -> str:
@@ -238,8 +242,12 @@ def reasoning_end_str(self) -> str:
         return self.think_end
 
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
-        text = self.model_tokenizer.decode(input_ids)
-        return self.think_end in text
+        rest_ids = self.think_end_rest_token_ids  # token ids that must follow the first split of the end tag
+        rest_len = len(rest_ids)
+        for i in range(len(input_ids) - rest_len - 1, -1, -1):  # last i whose window still holds rest_len ids
+            if list(input_ids[i + 1 : i + 1 + rest_len]) == rest_ids and input_ids[i] in self.think_end_first_token_ids:
+                return True
+        return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         # for Olmo 3 streaming reason parsing, the stream parse
diff --git a/aphrodite/reasoning/poolside_v1_reasoning_parser.py b/aphrodite/reasoning/poolside_v1_reasoning_parser.py
new file mode 100644
index 0000000000..32135a9784
--- /dev/null
+++ b/aphrodite/reasoning/poolside_v1_reasoning_parser.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Laguna reasoning parser.
+
+``DeepSeekV3ReasoningParser.is_reasoning_end`` walks the entire
+token sequence backwards and returns ``True`` on the first ``</think>`` it
+sees. When called on ``prompt_token_ids``, it mistakes any stray
+``</think>`` in conversation history, few-shot examples or tool descriptions
+for a template-injected "thinking already ended" marker. In the streaming
+path (see ``aphrodite/entrypoints/openai/chat_completion/serving.py``,
+``prompt_is_reasoning_end_arr``) that false positive short-circuits the
+reasoning parser for the whole response, so any ``<think>...</think>`` the
+model emits itself ends up in the content field instead of the reasoning
+field.
+
+As we have more flexible templates, we instead scope
+the backward search to the current assistant turn: the
+walk terminates as soon as we hit the ``<assistant>`` start-of-message
+token. A ``</think>`` in a prior user turn or few-shot example is no longer
+visible.
+"""
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from aphrodite.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from aphrodite.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
+from aphrodite.reasoning.identity_reasoning_parser import IdentityReasoningParser
+
+
+class PoolsideV1ReasoningParser(DeepSeekV3ReasoningParser):
+    """Drop-in replacement for ``deepseek_v3`` that tolerates ``</think>``
+    tokens appearing anywhere in the prompt other than the generation prefix.
+    """
+
+    _start_of_assistant_message = "<assistant>"
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        if self._start_of_assistant_message not in self.vocab:
+            raise ValueError(f"Tokenizer must contain {self._start_of_assistant_message!r} token")
+        self._start_of_assistant_message_token_id = self.vocab[self._start_of_assistant_message]
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        # IdentityReasoningParser always returns True: no reasoning to parse.
+        if isinstance(self._parser, IdentityReasoningParser):
+            return True
+
+        assert isinstance(self._parser, DeepSeekR1ReasoningParser)
+        for tok_id in reversed(input_ids):
+            # <think>: reasoning is not yet ended.
+            if tok_id == self._parser.start_token_id:
+                return False
+            # </think>: reasoning has ended.
+            if tok_id == self._parser.end_token_id:
+                return True
+            # <assistant>: reached the start of the current assistant turn
+            # without seeing either marker. Anything further back belongs to
+            # the prior conversation and should be ignored.
+            if tok_id == self._start_of_assistant_message_token_id:
+                return False
+        return False
+
+
+__all__ = ["PoolsideV1ReasoningParser"]
diff --git a/aphrodite/renderers/base.py b/aphrodite/renderers/base.py
index c43b7e811e..f7904d3c54 100644
--- a/aphrodite/renderers/base.py
+++ b/aphrodite/renderers/base.py
@@ -101,7 +101,7 @@ def __init__(self, config: "AphroditeConfig", tokenizer: _T | None) -> None:
         self._mm_cache_stats: MultiModalCacheStats | None = None
         self._clear_mm_cache_async = make_async(self.clear_mm_cache, executor=self._executor)
         self._process_multimodal_async = make_async(self._process_multimodal, executor=self._mm_executor)
-        if config.model_config.is_multimodal_model:
+        if mm_registry.supports_multimodal_inputs(config.model_config):
             mm_processor_cache = mm_registry.processor_cache_from_config(config)
 
             # Deep-copy the tokenizer so the multimodal processor gets its
@@ -743,6 +743,8 @@ def _process_embeds(self, prompt: EmbedsPrompt) -> EmbedsInput:
         return embeds_input(
             prompt_embeds=prompt_embeds,
             cache_salt=prompt.get("cache_salt"),
+            prompt_token_ids=prompt.get("prompt_token_ids"),
+            is_token_ids=prompt.get("prompt_is_token_ids"),
         )
 
     async def _process_tokens_async(
diff --git a/aphrodite/renderers/embed_utils.py b/aphrodite/renderers/embed_utils.py
index 6daa0c29f0..4a93c1f720 100644
--- a/aphrodite/renderers/embed_utils.py
+++ b/aphrodite/renderers/embed_utils.py
@@ -7,6 +7,7 @@
 import torch
 
 from aphrodite.exceptions import APHRODITEValidationError
+from aphrodite.utils.async_utils import make_async
 
 if TYPE_CHECKING:
     from aphrodite.config import ModelConfig
@@ -30,15 +31,52 @@ def safe_load_prompt_embeds(
             weights_only=True,
             map_location=torch.device("cpu"),
         )
-        assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
-            torch.float32,
-            torch.bfloat16,
-            torch.float16,
-        )
+        if not isinstance(tensor, torch.Tensor):
+            raise APHRODITEValidationError(
+                "`prompt_embeds` payload did not deserialize to a torch.Tensor.",
+                parameter="prompt_embeds",
+            )
         tensor = tensor.to_dense()
 
     if tensor.dim() > 2:
         tensor = tensor.squeeze(0)
-        assert tensor.dim() == 2
+    if tensor.dim() != 2:
+        raise APHRODITEValidationError(
+            f"`prompt_embeds` must be a 2D tensor of shape (num_tokens, hidden_size); got shape {tuple(tensor.shape)}.",
+            parameter="prompt_embeds",
+        )
+
+    # Pin each tensor to the model's hidden_size. Validating here
+    # also transitively guarantees cross-tensor consistency for requests that
+    # include multiple `prompt_embeds` parts, which is required by downstream
+    # concatenation in `_build_mixed_prompt_embeds`.
+    expected_hidden_size = model_config.get_hidden_size()
+    if tensor.shape[1] != expected_hidden_size:
+        raise APHRODITEValidationError(
+            f"`prompt_embeds` hidden_size {tensor.shape[1]} does not match "
+            f"the model's hidden_size {expected_hidden_size}.",
+            parameter="prompt_embeds",
+        )
+
+    # Cast to the model's dtype so API clients don't need to know the server's
+    # `--dtype` setting ahead of time. Only floating-point source dtypes are
+    # allowed. integer / bool / complex inputs almost certainly indicate caller
+    # error (e.g. quantized payloads, wrong tensor), and a silent `.to()`
+    # could hide a real mistake.
+    expected_dtype = model_config.dtype
+    if tensor.dtype != expected_dtype:
+        if not tensor.is_floating_point():
+            raise APHRODITEValidationError(
+                f"`prompt_embeds` dtype {tensor.dtype} is not a floating-point "
+                f"type, cannot safely cast to the model's dtype {expected_dtype}.",
+                parameter="prompt_embeds",
+            )
+        tensor = tensor.to(expected_dtype)
 
     return tensor
+
+
+safe_load_prompt_embeds_async = make_async(safe_load_prompt_embeds)
+"""Async variant of `safe_load_prompt_embeds` that defers the decode to a
+thread-pool executor, so the asyncio event loop is not blocked by the base64
+decode + `torch.load` work."""
diff --git a/aphrodite/renderers/hf.py b/aphrodite/renderers/hf.py
index f75e21f9d5..003a1b63f9 100644
--- a/aphrodite/renderers/hf.py
+++ b/aphrodite/renderers/hf.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
 import inspect
 import itertools
+import weakref
 from collections import defaultdict, deque
-from collections.abc import Set
+from collections.abc import Sequence
 from functools import lru_cache
-from typing import Any, Literal, cast, overload
+from typing import TYPE_CHECKING, Any, Final, Literal, cast, overload
 
 import jinja2
 import jinja2.ext
@@ -13,20 +16,32 @@
 import jinja2.nodes
 import jinja2.parser
 import jinja2.sandbox
+import torch
+from typing_extensions import override
 
-from aphrodite.config import AphroditeConfig, ModelConfig
 from aphrodite.entrypoints.chat_utils import (
-    ChatCompletionMessageParam,
-    ChatTemplateContentFormat,
-    ChatTemplateContentFormatOption,
+    PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
     ChatTemplateResolutionError,
-    ConversationMessage,
     load_chat_template,
     parse_chat_messages,
     parse_chat_messages_async,
 )
-from aphrodite.inputs import MultiModalDataDict, MultiModalUUIDDict
+from aphrodite.inputs import EmbedsPrompt
+from aphrodite.inputs.engine import MultiModalInput
 from aphrodite.logger import init_logger
+from aphrodite.multimodal.hasher import MultiModalHasher
+from aphrodite.multimodal.inputs import (
+    MultiModalFieldElem,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalSharedField,
+    PlaceholderRange,
+)
+from aphrodite.multimodal.processing.processor import (
+    PromptReplacement,
+    apply_token_matches,
+    find_mm_placeholders,
+)
 from aphrodite.tokenizers.hf import HfTokenizer
 from aphrodite.transformers_utils.chat_templates import get_chat_template_fallback_path
 from aphrodite.transformers_utils.processor import cached_get_processor
@@ -34,13 +49,162 @@
 from aphrodite.utils.func_utils import supports_kw
 
 from .base import BaseRenderer
-from .inputs import DictPrompt
 from .inputs.preprocess import parse_dec_only_prompt
-from .params import ChatParams
+
+if TYPE_CHECKING:
+    from collections.abc import Set
+
+    from aphrodite.config import AphroditeConfig, ModelConfig
+    from aphrodite.entrypoints.chat_utils import (
+        ChatCompletionMessageParam,
+        ChatTemplateContentFormat,
+        ChatTemplateContentFormatOption,
+        ConversationMessage,
+    )
+    from aphrodite.inputs import MultiModalDataDict, MultiModalUUIDDict, TokensPrompt
+    from aphrodite.inputs.engine import TokensInput
+    from aphrodite.multimodal.processing.processor import (
+        MultiModalPromptUpdates,
+        ResolvedPromptUpdate,
+    )
+
+    from .inputs import DictPrompt
+    from .params import ChatParams
 
 logger = init_logger(__name__)
 
 
+# Cache of `tokenizer -> prompt_embeds placeholder token ID`. Keyed by the
+# tokenizer object (not `id(tokenizer)`) so a fresh tokenizer landing at a
+# recycled memory address can't pick up a stale token ID. Entries are dropped
+# automatically when the tokenizer is garbage-collected.
+_PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE: Final[weakref.WeakKeyDictionary[HfTokenizer, int]] = (
+    weakref.WeakKeyDictionary()
+)
+_PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_ERROR: Final[str] = (
+    "Expected {token!r} to tokenize to exactly 1 token, got {num_ids} ({ids!r})."
+)
+_PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR: Final[str] = (
+    "Expected {expected} prompt_embeds placeholder spans in the tokenized prompt, found {actual}."
+)
+_MISSING_PROMPT_TOKEN_IDS_ERROR: Final[str] = (
+    "Expected prompt_token_ids in rendered prompt when prompt_embeds "
+    "are present. This indicates the chat template was invoked with "
+    "tokenize=False."
+)
+_TOKENIZE_OVERRIDE_WARNING: Final[str] = (
+    "Overriding `tokenize=False` to `True` because `prompt_embeds` post-processing requires tokenized IDs."
+)
+
+
+def _ensure_prompt_embeds_placeholder_token(tokenizer: HfTokenizer) -> int:
+    """Register `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` as a special token and return
+    its token ID."""
+    cached = _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE.get(tokenizer)
+    if cached is not None:
+        return cached
+
+    tokenizer.add_special_tokens({"additional_special_tokens": [PROMPT_EMBEDS_PLACEHOLDER_TOKEN]})
+
+    ids = tokenizer.encode(PROMPT_EMBEDS_PLACEHOLDER_TOKEN, add_special_tokens=False)
+    if len(ids) != 1:
+        raise RuntimeError(
+            _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_ERROR.format(
+                token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
+                num_ids=len(ids),
+                ids=ids,
+            )
+        )
+
+    token_id = ids[0]
+    _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE[tokenizer] = token_id
+    return token_id
+
+
+def _build_prompt_embeds_updates(
+    prompt_embeds_tensors: Sequence[torch.Tensor],
+    placeholder_token_id: int,
+) -> MultiModalPromptUpdates:
+    """Build `MultiModalPromptUpdates` for `prompt_embeds` expansion.
+
+    Each tensor produces a `PromptReplacement` that maps
+    `[placeholder_token_id]` -> `[placeholder_token_id] x N`
+    (where `N = tensor.shape[0]`).
+    """
+    updates: list[Sequence[ResolvedPromptUpdate]] = []
+    for i, tensor in enumerate(prompt_embeds_tensors):
+        update = PromptReplacement(
+            modality="prompt_embeds",
+            target=[placeholder_token_id],
+            replacement=[placeholder_token_id] * tensor.shape[0],
+        )
+        updates.append([update.resolve(item_idx=i)])
+    return {"prompt_embeds": updates}
+
+
+def _expand_prompt_embeds_placeholders(
+    token_ids: list[int],
+    mm_prompt_updates: MultiModalPromptUpdates,
+) -> list[int]:
+    """Expand each 1-token `prompt_embeds` sentinel into an N-token span.
+
+    Uses `apply_token_matches`.  Each single placeholder token in
+    `token_ids` is replaced with a consecutive span of
+    `tensor.shape[0]` copies, following tensors in order.
+    """
+    expanded, _ = apply_token_matches(token_ids, mm_prompt_updates, tokenizer=None)
+    return expanded
+
+
+def _build_prompt_embeds_positions(
+    token_ids: list[int],
+    num_tensors: int,
+    mm_prompt_updates: MultiModalPromptUpdates,
+) -> list[tuple[int, int]]:
+    """Locate each prompt_embeds placeholder span in `token_ids`.
+
+    Expects `token_ids` to already contain expanded N-token spans.
+    Returns `[(start_idx, length), ...]` aligned with the tensors.
+    """
+    placeholders = find_mm_placeholders(
+        prompt=token_ids,
+        mm_prompt_updates=mm_prompt_updates,
+        tokenizer=None,
+    )
+    features = placeholders.get("prompt_embeds", [])
+
+    if len(features) != num_tensors:
+        raise ValueError(
+            _PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR.format(
+                expected=num_tensors,
+                actual=len(features),
+            )
+        )
+
+    return [(f.start_idx, f.length) for f in features]
+
+
+def _build_mixed_prompt_embeds(
+    token_ids: list[int],
+    prompt_embeds_tensors: Sequence[torch.Tensor],
+    positions: list[tuple[int, int]],
+) -> tuple[torch.Tensor, list[bool]]:
+    """Build the full-length `prompt_embeds` tensor and the `is_token_ids`
+    mask aligned to `token_ids`."""
+    total_len = len(token_ids)
+    hidden_size = prompt_embeds_tensors[0].shape[1]
+    dtype = prompt_embeds_tensors[0].dtype
+
+    full_embeds = torch.zeros(total_len, hidden_size, dtype=dtype)
+    is_token_ids = torch.ones(total_len, dtype=torch.bool)
+
+    for (start, length), tensor in zip(positions, prompt_embeds_tensors, strict=True):
+        full_embeds[start : start + length] = tensor
+        is_token_ids[start : start + length] = False
+
+    return full_embeds, is_token_ids.tolist()
+
+
 _PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], str | None]()
 """
 Used in `_try_get_processor_chat_template` to avoid calling
@@ -98,7 +262,7 @@ def resolve_chat_template(
     chat_template: str | None,
     tools: list[dict[str, Any]] | None,
     *,
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
 ) -> str | None:
     # 1st priority: The given chat template
     if chat_template is not None:
@@ -266,7 +430,7 @@ def _resolve_chat_template_content_format(
     tools: list[dict[str, Any]] | None,
     tokenizer: HfTokenizer,
     *,
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
 ) -> ChatTemplateContentFormat:
     resolved_chat_template = resolve_chat_template(
         tokenizer,
@@ -304,7 +468,7 @@ def _log_chat_template_content_format(
             "which is different from the detected format '%s'. "
             "If our automatic detection is incorrect, please consider "
             "opening a GitHub issue so that we can improve it: "
-            "https://github.com/vllm-project/vllm/issues/new/choose",
+            "https://github.com/aphrodite-project/aphrodite/issues/new/choose",
             given_format,
             detected_format,
         )
@@ -316,7 +480,7 @@ def resolve_chat_template_content_format(
     given_format: ChatTemplateContentFormatOption,
     tokenizer: HfTokenizer,
     *,
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
 ) -> ChatTemplateContentFormat:
     if given_format != "auto":
         return given_format
@@ -408,7 +572,7 @@ def resolve_chat_template_kwargs(
 
 @overload
 def safe_apply_chat_template(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     tokenizer: HfTokenizer,
     conversation: list[ConversationMessage],
     *,
@@ -419,7 +583,7 @@ def safe_apply_chat_template(
 ) -> list[int]: ...
 @overload
 def safe_apply_chat_template(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     tokenizer: HfTokenizer,
     conversation: list[ConversationMessage],
     *,
@@ -429,7 +593,7 @@ def safe_apply_chat_template(
     **kwargs,
 ) -> str: ...
 def safe_apply_chat_template(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     tokenizer: HfTokenizer,
     conversation: list[ConversationMessage],
     *,
@@ -457,6 +621,14 @@ def safe_apply_chat_template(
         chat_template_kwargs=kwargs,
     )
 
+    # transformers v5 changed the default of `return_dict` to True, which
+    # makes `apply_chat_template(tokenize=True)` return a `BatchEncoding`
+    # instead of `list[int]`. Force `return_dict=False` so downstream code
+    # that expects a flat token list (e.g. `parse_dec_only_prompt`) works
+    # consistently across v4 and v5.
+    if tokenize and "return_dict" not in resolved_kwargs:
+        resolved_kwargs["return_dict"] = False
+
     try:
         return tokenizer.apply_chat_template(
             conversation=conversation,  # type: ignore[arg-type]
@@ -582,6 +754,10 @@ def render_messages(
         model_config = self.model_config
         tokenizer = self.get_tokenizer()
 
+        prompt_embeds_placeholder_token_id: int | None = None
+        if model_config.enable_prompt_embeds:
+            prompt_embeds_placeholder_token_id = _ensure_prompt_embeds_placeholder_token(tokenizer)
+
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
             model_config,
@@ -596,11 +772,28 @@ def render_messages(
             mm_processor_kwargs=params.mm_processor_kwargs,
         )
 
+        # prompt_embeds tensors are carried by the tracker through mm_data,
+        # but they must NOT be fed to the MM processor (which would reject
+        # the unknown key). Extract them here.
+        prompt_embeds_tensors: list[torch.Tensor] | None = None
+        if mm_data is not None and "prompt_embeds" in mm_data:
+            prompt_embeds_tensors = list(cast(Sequence[torch.Tensor], mm_data["prompt_embeds"]))
+            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
+            if not mm_data:
+                mm_data = None
+
+        chat_template_kwargs = params.get_apply_chat_template_kwargs()
+        if prompt_embeds_tensors:
+            # prompt_embeds post-processing requires prompt_token_ids.
+            if chat_template_kwargs.get("tokenize") is False:
+                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
+            chat_template_kwargs["tokenize"] = True
+
         prompt_raw = safe_apply_chat_template(
             model_config,
             tokenizer,
             conversation,
-            **params.get_apply_chat_template_kwargs(),
+            **chat_template_kwargs,
         )
 
         # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
@@ -620,6 +813,29 @@ def render_messages(
             )
 
         prompt = parse_dec_only_prompt(prompt_raw)
+
+        # When `prompt_embeds` is mixed with other modality data,
+        # `_process_tokens` runs `_process_multimodal` first (expanding
+        # `<|AUDIO|>` / `<|IMAGE|>` placeholders) and then
+        # `_apply_prompt_embeds_to_engine_input` augments the result.
+        # Stash the tensors and placeholder ID for that override to consume.
+        if prompt_embeds_tensors and mm_data:
+            assert prompt_embeds_placeholder_token_id is not None
+            cast(dict, prompt)["_prompt_embeds"] = (
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+            if params.mm_processor_kwargs:
+                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
+        elif prompt_embeds_tensors:
+            # Pure mode: no other MM data, mutate prompt to EmbedsPrompt shape.
+            assert prompt_embeds_placeholder_token_id is not None
+            self._apply_prompt_embeds_to_prompt(
+                prompt,
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+
         if mm_data is not None:
             prompt["multi_modal_data"] = mm_data
         if mm_uuids is not None:
@@ -635,6 +851,10 @@ async def render_messages_async(
         model_config = self.model_config
         tokenizer = self.get_tokenizer()
 
+        prompt_embeds_placeholder_token_id: int | None = None
+        if model_config.enable_prompt_embeds:
+            prompt_embeds_placeholder_token_id = _ensure_prompt_embeds_placeholder_token(tokenizer)
+
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
             model_config,
@@ -649,11 +869,25 @@ async def render_messages_async(
             mm_processor_kwargs=params.mm_processor_kwargs,
         )
 
+        prompt_embeds_tensors: list[torch.Tensor] | None = None
+        if mm_data is not None and "prompt_embeds" in mm_data:
+            prompt_embeds_tensors = list(cast(Sequence[torch.Tensor], mm_data["prompt_embeds"]))
+            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
+            if not mm_data:
+                mm_data = None
+
+        chat_template_kwargs = params.get_apply_chat_template_kwargs()
+        if prompt_embeds_tensors:
+            # prompt_embeds post-processing requires prompt_token_ids.
+            if chat_template_kwargs.get("tokenize") is False:
+                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
+            chat_template_kwargs["tokenize"] = True
+
         prompt_raw = await self._apply_chat_template_async(
             model_config,
             tokenizer,
             conversation,
-            **params.get_apply_chat_template_kwargs(),
+            **chat_template_kwargs,
         )
 
         # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
@@ -671,9 +905,169 @@ async def render_messages_async(
             )
 
         prompt = parse_dec_only_prompt(prompt_raw)
+
+        # See `render_messages` for the rationale.
+        if prompt_embeds_tensors and mm_data:
+            assert prompt_embeds_placeholder_token_id is not None
+            cast(dict, prompt)["_prompt_embeds"] = (
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+            if params.mm_processor_kwargs:
+                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
+        elif prompt_embeds_tensors:
+            assert prompt_embeds_placeholder_token_id is not None
+            self._apply_prompt_embeds_to_prompt(
+                prompt,
+                prompt_embeds_tensors,
+                prompt_embeds_placeholder_token_id,
+            )
+
         if mm_data is not None:
             prompt["multi_modal_data"] = mm_data
         if mm_uuids is not None:
             prompt["multi_modal_uuids"] = mm_uuids
 
         return conversation, prompt
+
+    @override
+    def _process_tokens(
+        self,
+        prompt: TokensPrompt,
+        *,
+        skip_mm_cache: bool = False,
+    ) -> TokensInput | MultiModalInput:
+        """Pre-expand `prompt_embeds` sentinels before delegating to the MM
+        processor, then attach `prompt_embeds` modality data to the result.
+
+        Mixed mode only: the `_prompt_embeds` stash is set by
+        `render_messages` when `prompt_embeds` co-exist with other MM data
+        (images, audio, …).  We expand each 1-token sentinel to an N-token
+        span *before* calling `super()._process_tokens()` so the MM
+        processor records all placeholder offsets in the final (post-expansion)
+        coordinate space; no offset shifting is needed afterwards.
+        """
+        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
+        if prompt_embeds_info is not None:
+            tensors, placeholder_token_id = prompt_embeds_info
+            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
+            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
+                list(prompt["prompt_token_ids"]), mm_updates
+            )
+        engine_input = super()._process_tokens(prompt, skip_mm_cache=skip_mm_cache)
+        if prompt_embeds_info is not None:
+            tensors, _ = prompt_embeds_info
+            self._apply_prompt_embeds_to_engine_input(
+                cast(MultiModalInput, engine_input),
+                tensors,
+                mm_updates,
+            )
+        return engine_input
+
+    @override
+    async def _process_tokens_async(
+        self,
+        prompt: TokensPrompt,
+        *,
+        skip_mm_cache: bool = False,
+    ) -> TokensInput | MultiModalInput:
+        """Async equivalent of `_process_tokens`."""
+        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
+        if prompt_embeds_info is not None:
+            tensors, placeholder_token_id = prompt_embeds_info
+            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
+            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
+                list(prompt["prompt_token_ids"]), mm_updates
+            )
+        engine_input = await super()._process_tokens_async(prompt, skip_mm_cache=skip_mm_cache)
+        if prompt_embeds_info is not None:
+            tensors, _ = prompt_embeds_info
+            self._apply_prompt_embeds_to_engine_input(
+                cast(MultiModalInput, engine_input),
+                tensors,
+                mm_updates,
+            )
+        return engine_input
+
+    @staticmethod
+    def _apply_prompt_embeds_to_prompt(
+        prompt: DictPrompt,
+        prompt_embeds_tensors: list[torch.Tensor],
+        placeholder_token_id: int,
+    ) -> None:
+        """Mutate `prompt` from `TokensPrompt` to `EmbedsPrompt` shape.
+
+        Pure `prompt_embeds` path only (no other MM modalities).  Expands
+        each `<prompt_embeds>` sentinel token into an N-token span and builds
+        the full-length `prompt_embeds` tensor + `prompt_is_token_ids` mask
+        that the engine's `enable_prompt_embeds` worker branch consumes.
+        """
+        token_ids = cast(list[int] | None, prompt.get("prompt_token_ids"))
+        if token_ids is None:
+            raise RuntimeError(_MISSING_PROMPT_TOKEN_IDS_ERROR)
+
+        embeds_orig_positions: list[int] = [i for i, tok in enumerate(token_ids) if tok == placeholder_token_id]
+        if len(embeds_orig_positions) != len(prompt_embeds_tensors):
+            raise ValueError(
+                f"Expected {len(prompt_embeds_tensors)} prompt_embeds "
+                f"placeholder tokens in the rendered prompt, found "
+                f"{len(embeds_orig_positions)}."
+            )
+
+        mm_updates = _build_prompt_embeds_updates(prompt_embeds_tensors, placeholder_token_id)
+        expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
+        positions = _build_prompt_embeds_positions(expanded, len(prompt_embeds_tensors), mm_updates)
+
+        embeds_prompt = cast(EmbedsPrompt, prompt)
+        embeds_prompt["prompt_token_ids"] = expanded
+        full_embeds, is_token_ids_mask = _build_mixed_prompt_embeds(expanded, prompt_embeds_tensors, positions)
+        embeds_prompt["prompt_embeds"] = full_embeds
+        embeds_prompt["prompt_is_token_ids"] = is_token_ids_mask
+
+    @staticmethod
+    def _apply_prompt_embeds_to_engine_input(
+        engine_input: MultiModalInput,
+        prompt_embeds_tensors: list[torch.Tensor],
+        mm_updates: MultiModalPromptUpdates,
+    ) -> None:
+        """Augment `engine_input` in-place with a `prompt_embeds` modality.
+
+        Mixed mode: called after `_process_multimodal` has already run on the
+        pre-expanded token IDs (expansion was done in `_process_tokens` before
+        calling `super()`).  Locates the already-expanded `prompt_embeds` spans
+        and adds `prompt_embeds` entries to `mm_kwargs`, `mm_hashes`, and
+        `mm_placeholders`.
+        """
+        # token_ids already contain the pre-expanded N-token spans.
+        token_ids = list(engine_input["prompt_token_ids"])
+
+        positions = _build_prompt_embeds_positions(token_ids, len(prompt_embeds_tensors), mm_updates)
+
+        pe_kwargs_items: list[MultiModalKwargsItem] = []
+        pe_hashes: list[str] = []
+        pe_placeholders: list[PlaceholderRange] = []
+        for tensor, (start, length) in zip(prompt_embeds_tensors, positions, strict=True):
+            pe_kwargs_items.append(
+                MultiModalKwargsItem(
+                    {
+                        "embedding": MultiModalFieldElem(
+                            data=tensor,
+                            field=MultiModalSharedField(batch_size=1),
+                        )
+                    }
+                )
+            )
+            pe_hashes.append(MultiModalHasher.hash_kwargs(prompt_embeds=tensor))
+            # `is_embed=None` matches the existing image_embeds-style
+            # "no encoder, just splice the tensor directly" semantics.
+            pe_placeholders.append(PlaceholderRange(offset=start, length=length, is_embed=None))
+
+        cast(
+            MultiModalKwargsItems[MultiModalKwargsItem | None],
+            engine_input["mm_kwargs"],
+        )["prompt_embeds"] = pe_kwargs_items
+        engine_input["mm_hashes"] = {
+            **engine_input["mm_hashes"],
+            "prompt_embeds": pe_hashes,
+        }
+        cast(dict, engine_input["mm_placeholders"])["prompt_embeds"] = pe_placeholders
diff --git a/aphrodite/sampling_params.py b/aphrodite/sampling_params.py
index 3f0f182b94..f7ec9a5ad1 100644
--- a/aphrodite/sampling_params.py
+++ b/aphrodite/sampling_params.py
@@ -25,6 +25,10 @@
 _SAMPLING_EPS = 1e-5
 _MAX_TEMP = 1e-2
 
+MAX_LOGPROB_TOKEN_IDS = 128
+"""Upper bound on `SamplingParams.logprob_token_ids` list length. Must match
+the per-request row width allocated by the sampler's `LogprobTokenIdsState`."""
+
 
 class SamplingType(IntEnum):
     GREEDY = 0
@@ -777,6 +781,16 @@ def bad_words_token_ids(self) -> list[list[int]] | None:
         # For internal use only. Backward compatibility not guaranteed
         return self._bad_words_token_ids
 
+    @property
+    def num_logprobs(self) -> int | None:
+        """Number of sample logprobs to return per output token, or `None` if
+        no sample logprobs were requested. Takes `logprob_token_ids` into
+        account: when `logprobs` is unset but `logprob_token_ids` is set,
+        returns `len(logprob_token_ids)`."""
+        if self.logprobs is not None:
+            return self.logprobs
+        return len(self.logprob_token_ids) if self.logprob_token_ids else None
+
     def clone(self) -> "SamplingParams":
         """If skip_clone is True, uses shallow copy instead of deep copy."""
         if self.skip_clone:
@@ -814,6 +828,17 @@ def _validate_logprobs(self, model_config: ModelConfig) -> None:
                     value=num_logprobs,
                 )
 
+        # Validate logprob_token_ids.
+        if self.logprob_token_ids is not None:
+            n = len(self.logprob_token_ids)
+            if n > MAX_LOGPROB_TOKEN_IDS:
+                raise APHRODITEValidationError(
+                    f"Requested logprob_token_ids of length {n}, "
+                    f"which is greater than max allowed: {MAX_LOGPROB_TOKEN_IDS}",
+                    parameter="logprob_token_ids",
+                    value=n,
+                )
+
         # Validate prompt logprobs.
         if num_prompt_logprobs := self.prompt_logprobs:
             if num_prompt_logprobs == -1:
diff --git a/aphrodite/tokenizers/deepseek_v4.py b/aphrodite/tokenizers/deepseek_v4.py
index c55387f556..d3d694a649 100644
--- a/aphrodite/tokenizers/deepseek_v4.py
+++ b/aphrodite/tokenizers/deepseek_v4.py
@@ -40,10 +40,16 @@ def apply_chat_template(
                 messages.insert(0, {"role": "system"})
                 messages[0]["tools"] = tools  # type: ignore[typeddict-unknown-key]
 
-            # The V4 reference currently accepts only "max", "high", or None.
             reasoning_effort = kwargs.get("reasoning_effort")
-            if reasoning_effort not in ("max", "high"):
+            if not isinstance(reasoning_effort, str):
                 reasoning_effort = None
+            elif reasoning_effort == "none":
+                thinking_mode = "chat"
+                reasoning_effort = None
+            elif reasoning_effort in ("max", "xhigh"):
+                reasoning_effort = "max"
+            else:
+                reasoning_effort = "high"
 
             encode_config = dict(
                 thinking_mode=thinking_mode,
diff --git a/aphrodite/tool_parsers/__init__.py b/aphrodite/tool_parsers/__init__.py
index 2366ac2527..b9a1ea9386 100644
--- a/aphrodite/tool_parsers/__init__.py
+++ b/aphrodite/tool_parsers/__init__.py
@@ -38,6 +38,14 @@
         "deepseekv4_tool_parser",
         "DeepSeekV4ToolParser",
     ),
+    "cohere_command3": (
+        "cohere_command_tool_parser",
+        "CohereCommand3ToolParser",
+    ),
+    "cohere_command4": (
+        "cohere_command_tool_parser",
+        "CohereCommand4ToolParser",
+    ),
     "ernie45": (
         "ernie45_tool_parser",
         "Ernie45ToolParser",
@@ -66,6 +74,10 @@
         "hermes_tool_parser",
         "Hermes2ProToolParser",
     ),
+    "poolside_v1": (
+        "poolside_v1_tool_parser",
+        "PoolsideV1ToolParser",
+    ),
     "hunyuan_a13b": (
         "hunyuan_a13b_tool_parser",
         "HunyuanA13BToolParser",
diff --git a/aphrodite/tool_parsers/cohere_command_tool_parser.py b/aphrodite/tool_parsers/cohere_command_tool_parser.py
new file mode 100644
index 0000000000..1a808b8c26
--- /dev/null
+++ b/aphrodite/tool_parsers/cohere_command_tool_parser.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+try:
+    from cohere_melody import PyFilter, PyFilterOptions
+except ImportError as e:
+    raise ImportError(
+        "The Cohere tool parser requires the `cohere_melody` "
+        "package, which is not installed. Install it with:\n"
+        "    pip install cohere_melody"
+    ) from e
+
+from aphrodite.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from aphrodite.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from aphrodite.entrypoints.openai.responses.protocol import (
+    ResponsesRequest,
+)
+from aphrodite.tokenizers import TokenizerLike
+from aphrodite.tool_parsers import ToolParser
+from aphrodite.tool_parsers.utils import Tool
+
+
+class BaseCohereCommandToolParser(ToolParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        streaming_opts: PyFilterOptions,
+        unary_opts: PyFilterOptions,
+    ):
+        super().__init__(tokenizer)
+        self.melody_streaming = PyFilter(streaming_opts)
+        self.melody_unary = PyFilter(unary_opts)
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        request = super().adjust_request(request)
+        request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        r = self.melody_streaming.write_decoded(delta_text)
+        if r.content is not None:
+            return DeltaMessage(content=r.content)
+        if r.reasoning is not None:
+            return DeltaMessage(reasoning=r.reasoning)
+        if r.tool_calls:
+            return DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        id=tc.id,
+                        index=tc.index,
+                        type="function",
+                        function=DeltaFunctionCall(name=tc.name, arguments=tc.arguments),
+                    )
+                    for tc in r.tool_calls
+                ]
+            )
+        return None
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        result = self.melody_unary.process_full_text(model_output)
+        tool_calls = [
+            ToolCall(
+                id=tc.id,
+                type="function",
+                function=FunctionCall(name=tc.name, arguments=tc.arguments),
+            )
+            for tc in result.tool_calls
+        ]
+        return ExtractedToolCallInformation(
+            tools_called=len(tool_calls) > 0,
+            tool_calls=tool_calls,
+            content=result.content,
+        )
+
+
+class CohereCommand3ToolParser(BaseCohereCommandToolParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        tools: list[Tool] | None = None,
+    ):
+        super().__init__(
+            tokenizer,
+            streaming_opts=PyFilterOptions().cmd3(),
+            unary_opts=PyFilterOptions().cmd3(),
+        )
+
+
+class CohereCommand4ToolParser(BaseCohereCommandToolParser):
+    def __init__(
+        self,
+        tokenizer: TokenizerLike,
+        tools: list[Tool] | None = None,
+    ):
+        super().__init__(
+            tokenizer,
+            streaming_opts=PyFilterOptions().cmd4(),
+            unary_opts=PyFilterOptions().cmd4(),
+        )
diff --git a/aphrodite/tool_parsers/deepseekv32_tool_parser.py b/aphrodite/tool_parsers/deepseekv32_tool_parser.py
index 1ff3eef941..eca7ab6b46 100644
--- a/aphrodite/tool_parsers/deepseekv32_tool_parser.py
+++ b/aphrodite/tool_parsers/deepseekv32_tool_parser.py
@@ -58,17 +58,13 @@ def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
         self.current_tool_index: int = 0
         self._sent_content_idx: int = 0
 
-        # Regex patterns for complete parsing.
-        #
-        # The wrapper tokens are class attributes so subclasses such as
-        # DeepSeekV4ToolParser can override them.
+        # Regex patterns for complete parsing
         self.tool_call_complete_regex = re.compile(
             re.escape(self.tool_call_start_token) + r"(.*?)" + re.escape(self.tool_call_end_token),
             re.DOTALL,
         )
         self.invoke_complete_regex = re.compile(
-            r'<｜DSML｜invoke\s+name="([^"]+)"\s*>(.*?)</｜DSML｜invoke>',
-            re.DOTALL,
+            r'<｜DSML｜invoke\s+name="([^"]+)"\s*>(.*?)</｜DSML｜invoke>', re.DOTALL
         )
         self.parameter_complete_regex = re.compile(
             r'<｜DSML｜parameter\s+name="([^"]+)"\s+string="(?:true|false)"\s*>(.*?)</｜DSML｜parameter>',
@@ -87,7 +83,7 @@ def adjust_request(
         if request.tools and request.tool_choice != "none":
             # Ensure tool call tokens
             # (e.g. <｜DSML｜function_calls>, </｜DSML｜function_calls>)
-            # are not skipped during decoding.
+            # are not skipped during decoding.
             # Even though they are not marked as special tokens,
             # setting skip_special_tokens=False ensures proper handling in
             # transformers 5.x where decoding behavior may have changed.
@@ -98,8 +94,8 @@ def _generate_tool_call_id(self) -> str:
         """Generate a unique tool call ID."""
         return f"call_{uuid.uuid4().hex[:24]}"
 
-    def _parse_invoke_params(self, invoke_str: str) -> dict[str, str]:
-        param_dict: dict[str, str] = {}
+    def _parse_invoke_params(self, invoke_str: str) -> dict:
+        param_dict = dict()
         for param_name, param_val in self.parameter_complete_regex.findall(invoke_str):
             param_dict[param_name] = param_val
         return param_dict
@@ -127,11 +123,8 @@ def _convert_param_value_checked(self, value: str, param_type: str) -> Any:
         else:
             return json.loads(value)
 
-    def _convert_param_value(self, value: Any, param_type: str | list[str]) -> Any:
+    def _convert_param_value(self, value: str, param_type: str | list[str]) -> Any:
         """Convert parameter value to the correct type."""
-        if not isinstance(value, str):
-            return value
-
         if not isinstance(param_type, list):
             param_type = [param_type]
         for current_type in param_type:
@@ -142,45 +135,6 @@ def _convert_param_value(self, value: Any, param_type: str | list[str]) -> Any:
         # return value as fallback
         return value
 
-    def _normalize_arguments_wrapper(
-        self,
-        converted: dict[str, Any],
-    ) -> dict[str, Any]:
-        """Normalize model-generated nested arguments wrapper.
-
-        DeepSeek V4 Flash may generate DSML parameters like:
-
-            <｜DSML｜parameter name="arguments" string="false">
-            {"path": "/tmp/a", "content": "hello"}
-            </｜DSML｜parameter>
-
-        The parser would otherwise produce:
-
-            {"arguments": {"path": "/tmp/a", "content": "hello"}}
-
-        OpenAI-compatible function.arguments should be:
-
-            {"path": "/tmp/a", "content": "hello"}
-        """
-        if set(converted.keys()) != {"arguments"}:
-            return converted
-
-        wrapped = converted.get("arguments")
-
-        if isinstance(wrapped, dict):
-            return wrapped
-
-        if isinstance(wrapped, str):
-            try:
-                parsed = json.loads(wrapped)
-            except Exception:
-                return converted
-
-            if isinstance(parsed, dict):
-                return parsed
-
-        return converted
-
     def _convert_params_with_schema(
         self,
         function_name: str,
@@ -206,8 +160,7 @@ def _convert_params_with_schema(
             if name in param_config and isinstance(param_config[name], dict):
                 param_type = param_config[name].get("type", "string")
             converted[name] = self._convert_param_value(value, param_type)
-
-        return self._normalize_arguments_wrapper(converted)
+        return converted
 
     def extract_tool_calls(
         self,
@@ -227,16 +180,13 @@ def extract_tool_calls(
                 # Find all invokes within this tool_call
                 for invoke_name, invoke_content in self.invoke_complete_regex.findall(tool_call_match):
                     param_dict = self._parse_invoke_params(invoke_content)
-                    converted = self._convert_params_with_schema(
-                        invoke_name,
-                        param_dict,
-                    )
+                    params = self._convert_params_with_schema(invoke_name, param_dict)
                     tool_calls.append(
                         ToolCall(
                             type="function",
                             function=FunctionCall(
                                 name=invoke_name,
-                                arguments=json.dumps(converted, ensure_ascii=False),
+                                arguments=json.dumps(params, ensure_ascii=False),
                             ),
                         )
                     )
@@ -300,29 +250,15 @@ def _extract_delta_tool_calls(
 
         return delta_tool_calls
 
-    def _extract_content(
-        self,
-        current_text: str,
-        *,
-        is_final: bool = False,
-    ) -> str | None:
+    def _extract_content(self, current_text: str) -> str | None:
         """Return unsent non-tool-call text, or None.
 
         Holds back any suffix that could be a partial start marker
         so that split markers are never leaked as content.
-
-        On final streaming step, flush the held-back suffix because it
-        cannot form a complete tool-call start marker anymore.
         """
         if self.tool_call_start_token not in current_text:
-            if is_final:
-                sendable_idx = len(current_text)
-            else:
-                overlap = partial_tag_overlap(
-                    current_text,
-                    self.tool_call_start_token,
-                )
-                sendable_idx = len(current_text) - overlap
+            overlap = partial_tag_overlap(current_text, self.tool_call_start_token)
+            sendable_idx = len(current_text) - overlap
         else:
             sendable_idx = current_text.index(self.tool_call_start_token)
 
@@ -353,11 +289,7 @@ def extract_tool_calls_streaming(
         if not previous_text:
             self._reset_streaming_state()
 
-        # Empty delta with token ids means EOS or a skipped/closing token.
-        # Treat it as final for content flushing purposes.
-        is_final = not delta_text and bool(delta_token_ids)
-
-        content = self._extract_content(current_text, is_final=is_final)
+        content = self._extract_content(current_text)
         delta_tool_calls = self._extract_delta_tool_calls(current_text, request)
 
         if delta_tool_calls or content:
@@ -365,7 +297,7 @@ def extract_tool_calls_streaming(
 
         # Empty delta with token ids means EOS or closing tag; return
         # non-None so the serving framework can finalize finish_reason.
-        if is_final and self.prev_tool_call_arr:
+        if not delta_text and delta_token_ids and self.prev_tool_call_arr:
             return DeltaMessage(content="")
 
         return None
diff --git a/aphrodite/tool_parsers/poolside_v1_tool_parser.py b/aphrodite/tool_parsers/poolside_v1_tool_parser.py
new file mode 100644
index 0000000000..62cf8cc420
--- /dev/null
+++ b/aphrodite/tool_parsers/poolside_v1_tool_parser.py
@@ -0,0 +1,554 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+GLM-4 Tool Call Parser with incremental string streaming support.
+
+This parser fixes the streaming issue reported in Issue #32829 where long string
+parameters (e.g., file content with 4000+ characters of code) are buffered until
+complete, causing multi-second delays before the user sees any content.
+
+The fix streams string values incrementally as they arrive, providing a true
+streaming experience for long content.
+"""
+
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any
+
+import partial_json_parser.core.complete
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from aphrodite.entrypoints.chat_utils import make_tool_call_id
+from aphrodite.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from aphrodite.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from aphrodite.entrypoints.openai.responses.protocol import (
+    ResponsesRequest,
+)
+from aphrodite.logger import init_logger
+from aphrodite.tokenizers import TokenizerLike
+from aphrodite.tool_parsers.abstract_tool_parser import (
+    Tool,
+    ToolParser,
+)
+
+logger = init_logger(__name__)
+
+
+class PoolsideV1ToolParser(ToolParser):
+    """Tool parser for GLM-4 models with incremental string streaming.
+
+    This parser emits tool-call deltas incrementally as arguments arrive.
+    For string-type parameters, content is streamed character-by-character
+    rather than waiting for the complete </arg_value> tag.
+    """
+
+    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
+        super().__init__(tokenizer, tools)
+        # Stateful streaming fields
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict[str, Any]] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = []
+
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.arg_key_start: str = "<arg_key>"
+        self.arg_key_end: str = "</arg_key>"
+        self.arg_val_start: str = "<arg_value>"
+        self.arg_val_end: str = "</arg_value>"
+
+        self.tool_calls_start_token = self.tool_call_start_token
+
+        self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
+        self.func_detail_regex = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
+        self.func_arg_regex = re.compile(r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError("The model tokenizer must be passed to the ToolParser constructor during construction.")
+
+        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+        self._buffer: str = ""
+
+        # Streaming state for incremental tool-call streaming
+        self._in_tool_call: bool = False
+        self._current_tool_name: str | None = None
+        self._pending_key: str | None = None
+        self._streaming_string_value: bool = False
+        self._tool_call_ids: list[str] = []
+        self._args_started: list[bool] = []
+        self._args_closed: list[bool] = []
+        self._seen_keys: list[set[str]] = []
+
+    @staticmethod
+    def _deserialize(value: str) -> Any:
+        try:
+            return json.loads(value)
+        except json.JSONDecodeError:
+            pass
+
+        try:
+            return ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            pass
+
+        return value
+
+    @staticmethod
+    def _json_escape_string_content(s: str) -> str:
+        """JSON-escape string content for incremental streaming.
+
+        This escapes the content that goes INSIDE a JSON string (between quotes),
+        not including the surrounding quotes themselves.
+        """
+        if not s:
+            return ""
+        return json.dumps(s, ensure_ascii=False)[1:-1]
+
+    @staticmethod
+    def _is_string_type(
+        tool_name: str,
+        arg_name: str,
+        tools: list[Tool] | None,
+    ) -> bool:
+        if tools is None:
+            return False
+        for tool in tools:
+            if tool.function.name != tool_name:
+                continue
+            if tool.function.parameters is None:
+                return False
+            arg_type = tool.function.parameters.get("properties", {}).get(arg_name, {}).get("type", None)
+            return arg_type == "string"
+        logger.debug("No tool named '%s'.", tool_name)
+        return False
+
+    @staticmethod
+    def _tools_enabled(request: ChatCompletionRequest) -> bool:
+        """Return whether tool parsing should be applied for this request."""
+        try:
+            tools = getattr(request, "tools", None)
+            tool_choice = getattr(request, "tool_choice", None)
+            return bool(tools) and tool_choice != "none"
+        except Exception:
+            logger.exception("Failed to determine if tools are enabled.")
+            return False
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        """Adjust request parameters for tool call token handling."""
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
+            # Ensure tool call tokens (<tool_call>, </tool_call>) are not skipped
+            # during decoding. Even though they are not marked as special tokens,
+            # setting skip_special_tokens=False ensures proper handling in
+            # transformers 5.x where decoding behavior may have changed.
+            request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        matched_tool_calls = self.func_call_regex.findall(model_output)
+        logger.debug("model_output: %s", model_output)
+        try:
+            tool_calls: list[ToolCall] = []
+            for match in matched_tool_calls:
+                tc_detail = self.func_detail_regex.search(match)
+                if not tc_detail:
+                    logger.warning(
+                        "Failed to parse tool call details from: %s",
+                        match,
+                    )
+                    continue
+                tc_name = tc_detail.group(1).strip()
+                tc_args = tc_detail.group(2)
+                pairs = self.func_arg_regex.findall(tc_args) if tc_args else []
+                arg_dct: dict[str, Any] = {}
+                for key, value in pairs:
+                    arg_key = key.strip()
+                    arg_val = value.strip()
+                    if not self._is_string_type(tc_name, arg_key, request.tools):
+                        arg_val = self._deserialize(arg_val)
+                    logger.debug("arg_key = %s, arg_val = %s", arg_key, arg_val)
+                    arg_dct[arg_key] = arg_val
+                tool_calls.append(
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=tc_name,
+                            arguments=json.dumps(arg_dct, ensure_ascii=False),
+                        ),
+                    )
+                )
+        except Exception:
+            logger.exception("Failed to extract tool call spec")
+            return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output)
+        else:
+            if len(tool_calls) > 0:
+                content: str | None = model_output[: model_output.find(self.tool_calls_start_token)]
+                # Normalize empty/whitespace-only content to None
+                if not content or not content.strip():
+                    content = None
+                return ExtractedToolCallInformation(tools_called=True, tool_calls=tool_calls, content=content)
+            return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        if not self._tools_enabled(request):
+            return DeltaMessage(content=delta_text) if delta_text else None
+
+        self._buffer += delta_text
+
+        pending_deltas: dict[int, DeltaToolCall] = {}
+        content: str | None = None
+
+        while True:
+            if not self._in_tool_call:
+                start_idx = self._buffer.find(self.tool_call_start_token)
+                if start_idx == -1:
+                    # Check for partial start token at end of buffer
+                    for i in range(1, len(self.tool_call_start_token)):
+                        if self._buffer.endswith(self.tool_call_start_token[:i]):
+                            out = self._buffer[:-i]
+                            self._buffer = self._buffer[-i:]
+                            if out:
+                                content = (content or "") + out
+                            break
+                    else:
+                        out = self._buffer
+                        self._buffer = ""
+                        if out:
+                            content = (content or "") + out
+                    break
+
+                if start_idx > 0:
+                    content = (content or "") + self._buffer[:start_idx]
+                    self._buffer = self._buffer[start_idx:]
+
+                self._buffer = self._buffer[len(self.tool_call_start_token) :]
+                self._begin_tool_call()
+                continue
+
+            # Parse tool name first
+            if not self.current_tool_name_sent:
+                nl = self._buffer.find("\n")
+                ak = self._buffer.find(self.arg_key_start)
+                end = self._buffer.find(self.tool_call_end_token)
+                candidates = [i for i in [nl, ak, end] if i != -1]
+                if not candidates:
+                    break
+                cut = min(candidates)
+                tool_name = self._buffer[:cut].strip()
+                if tool_name == "" and cut == end:
+                    # Handle empty tool call like `<tool_call></tool_call>`.
+                    # Consume the tokens and reset state to avoid infinite loop.
+                    self._buffer = self._buffer[end + len(self.tool_call_end_token) :]
+                    self._finish_tool_call()
+                    self._revert_last_tool_call_state()
+                    continue
+
+                if cut == nl:
+                    self._buffer = self._buffer[nl + 1 :]
+                else:
+                    self._buffer = self._buffer[cut:]
+
+                self._current_tool_name = tool_name
+                self.current_tool_name_sent = True
+                self._update_tool_name(pending_deltas, tool_name)
+                continue
+
+            assert self._current_tool_name is not None
+
+            # Handle incremental string value streaming
+            if self._streaming_string_value:
+                val_end = self._buffer.find(self.arg_val_end)
+                if val_end != -1:
+                    raw_content = self._buffer[:val_end]
+                    self._buffer = self._buffer[val_end + len(self.arg_val_end) :]
+                    self._streaming_string_value = False
+                    self._pending_key = None
+
+                    escaped = self._json_escape_string_content(raw_content)
+                    frag = escaped + '"'
+                    self.streamed_args_for_tool[self.current_tool_id] += frag
+                    self._update_tool_args(pending_deltas, frag)
+                    continue
+
+                # Check for partial </arg_value> at end
+                safe_len = len(self._buffer)
+                for i in range(1, len(self.arg_val_end)):
+                    if self._buffer.endswith(self.arg_val_end[:i]):
+                        safe_len = len(self._buffer) - i
+                        break
+
+                if safe_len > 0:
+                    to_emit = self._buffer[:safe_len]
+                    self._buffer = self._buffer[safe_len:]
+                    escaped = self._json_escape_string_content(to_emit)
+                    if escaped:
+                        self.streamed_args_for_tool[self.current_tool_id] += escaped
+                        self._update_tool_args(pending_deltas, escaped)
+                break
+
+            # If we have a pending key, parse its value
+            if self._pending_key is not None:
+                val_pos = self._buffer.find(self.arg_val_start)
+                if val_pos == -1:
+                    break
+                if val_pos > 0:
+                    self._buffer = self._buffer[val_pos:]
+
+                key = (self._pending_key or "").strip()
+
+                is_string = self._is_string_type(self._current_tool_name, key, request.tools)
+
+                if is_string:
+                    # String type: stream incrementally
+                    self._buffer = self._buffer[len(self.arg_val_start) :]
+
+                    if key in self._seen_keys[self.current_tool_id]:
+                        self._pending_key = None
+                        continue
+
+                    self._seen_keys[self.current_tool_id].add(key)
+                    key_json = json.dumps(key, ensure_ascii=False)
+
+                    if not self._args_started[self.current_tool_id]:
+                        frag = "{" + key_json + ': "'
+                        self._args_started[self.current_tool_id] = True
+                    else:
+                        frag = ", " + key_json + ': "'
+
+                    self.streamed_args_for_tool[self.current_tool_id] += frag
+                    self._streaming_string_value = True
+                    self._update_tool_args(pending_deltas, frag)
+                    continue
+
+                # Non-string type: wait for complete value
+                val_end = self._buffer.find(self.arg_val_end)
+                if val_end == -1:
+                    break
+
+                raw_val = self._buffer[len(self.arg_val_start) : val_end].strip()
+                self._buffer = self._buffer[val_end + len(self.arg_val_end) :]
+                self._pending_key = None
+
+                frag_or_none = self._append_arg_fragment(key=key, raw_val=raw_val)
+                if frag_or_none:
+                    self._update_tool_args(pending_deltas, frag_or_none)
+                continue
+
+            # Parse next arg or close
+            end_pos = self._buffer.find(self.tool_call_end_token)
+            key_pos = self._buffer.find(self.arg_key_start)
+            if end_pos != -1 and (key_pos == -1 or end_pos < key_pos):
+                self._buffer = self._buffer[end_pos + len(self.tool_call_end_token) :]
+                frag_or_none = self._close_args_if_needed()
+                # Finalize prev_tool_call_arr with complete parsed arguments
+                if self._current_tool_name:
+                    try:
+                        full_args_str = self.streamed_args_for_tool[self.current_tool_id]
+                        args_dict = json.loads(full_args_str)
+                        self.prev_tool_call_arr[self.current_tool_id] = {
+                            "name": self._current_tool_name,
+                            "arguments": args_dict,
+                        }
+                    except (json.JSONDecodeError, IndexError) as e:
+                        logger.warning(
+                            "Failed to finalize tool call state for tool %d: %s",
+                            self.current_tool_id,
+                            e,
+                        )
+                self._finish_tool_call()
+                if frag_or_none:
+                    self._update_tool_args(pending_deltas, frag_or_none)
+                continue
+
+            if key_pos == -1:
+                break
+            if key_pos > 0:
+                self._buffer = self._buffer[key_pos:]
+            key_end = self._buffer.find(self.arg_key_end)
+            if key_end == -1:
+                break
+            key = self._buffer[len(self.arg_key_start) : key_end]
+            self._buffer = self._buffer[key_end + len(self.arg_key_end) :]
+            self._pending_key = key
+            continue
+
+        tool_calls = list(pending_deltas.values())
+        if content is None and len(tool_calls) == 0:
+            if request.logprobs:
+                return DeltaMessage(content="")
+            return None
+        return DeltaMessage(content=content, tool_calls=tool_calls)
+
+    def _ensure_tool_state(self) -> None:
+        while len(self._tool_call_ids) <= self.current_tool_id:
+            self._tool_call_ids.append(make_tool_call_id(id_type="random", func_name=None, idx=None))
+        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+            self.streamed_args_for_tool.append("")
+        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+            self.prev_tool_call_arr.append({})
+        while len(self._args_started) <= self.current_tool_id:
+            self._args_started.append(False)
+        while len(self._args_closed) <= self.current_tool_id:
+            self._args_closed.append(False)
+        while len(self._seen_keys) <= self.current_tool_id:
+            self._seen_keys.append(set())
+
+    def _begin_tool_call(self) -> None:
+        if self.current_tool_id == -1:
+            self.current_tool_id = 0
+        else:
+            self.current_tool_id += 1
+        self._ensure_tool_state()
+        self.current_tool_name_sent = False
+        self._current_tool_name = None
+        self._pending_key = None
+        self._streaming_string_value = False
+        self._in_tool_call = True
+
+    def _finish_tool_call(self) -> None:
+        self._in_tool_call = False
+        self._current_tool_name = None
+        self._pending_key = None
+        self._streaming_string_value = False
+
+    def _revert_last_tool_call_state(self) -> None:
+        """Revert the state allocation for the last tool call."""
+        if self.current_tool_id < 0:
+            return
+        self._tool_call_ids.pop()
+        self.streamed_args_for_tool.pop()
+        self.prev_tool_call_arr.pop()
+        self._args_started.pop()
+        self._args_closed.pop()
+        self._seen_keys.pop()
+        self.current_tool_id -= 1
+
+    def _get_or_create_delta(self, pending: dict[int, DeltaToolCall]) -> DeltaToolCall:
+        idx = self.current_tool_id
+        if idx not in pending:
+            pending[idx] = DeltaToolCall(
+                index=idx,
+                function=DeltaFunctionCall(),
+            )
+        delta = pending[idx]
+        assert delta.function is not None
+        return delta
+
+    def _update_tool_name(self, pending: dict[int, DeltaToolCall], tool_name: str) -> None:
+        self.prev_tool_call_arr[self.current_tool_id] = {
+            "name": self._current_tool_name,
+            "arguments": {},
+        }
+        delta = self._get_or_create_delta(pending)
+        delta.id = self._tool_call_ids[self.current_tool_id]
+        delta.type = "function"
+        assert delta.function is not None
+        delta.function.name = tool_name
+        if delta.function.arguments is None:
+            delta.function.arguments = ""
+
+    @staticmethod
+    def _complete_json_prefix(
+        json_prefix: str,
+        allowed_partial_types: Allow,
+    ) -> dict | None:
+        """Complete a partial JSON prefix into a valid JSON object.
+
+        Returns the parsed dict for the completed JSON, or None on failure.
+
+        Note: ``partial_json_parser`` strips trailing whitespace before
+        parsing (``complete.py:20``), which means the returned slice is
+        shorter than ``json_prefix`` when it has trailing whitespace.
+        Since the parser controls the construction of the json_prefix value,
+        this code relies on it being a valid prefix and we only use the fix for
+        the completion of the JSON object.
+        """
+        try:
+            _, partial_str_completion = partial_json_parser.core.complete.fix(
+                json_prefix,
+                allowed_partial_types,
+            )
+            return json.loads(json_prefix + partial_str_completion)
+        except Exception:
+            return None
+
+    def _update_tool_args(self, pending: dict[int, DeltaToolCall], fragment: str) -> None:
+        result = self._complete_json_prefix(
+            self.streamed_args_for_tool[self.current_tool_id],
+            Allow.ALL,
+        )
+        if result is not None:
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = result
+        delta = self._get_or_create_delta(pending)
+        assert delta.function is not None
+        if delta.function.arguments is None:
+            delta.function.arguments = ""
+        delta.function.arguments += fragment
+
+    def _append_arg_fragment(
+        self,
+        *,
+        key: str,
+        raw_val: str,
+    ) -> str | None:
+        key = key.strip()
+        if not key:
+            return None
+        if key in self._seen_keys[self.current_tool_id]:
+            return None
+
+        # This function is only called for non-string types (already checked
+        # by _is_string_type in the caller), so we always deserialize.
+        val_obj: Any = self._deserialize(raw_val)
+
+        key_json = json.dumps(key, ensure_ascii=False)
+        val_json = json.dumps(val_obj, ensure_ascii=False)
+
+        if not self._args_started[self.current_tool_id]:
+            fragment = "{" + key_json + ": " + val_json
+            self._args_started[self.current_tool_id] = True
+        else:
+            fragment = ", " + key_json + ": " + val_json
+
+        self._seen_keys[self.current_tool_id].add(key)
+        self.streamed_args_for_tool[self.current_tool_id] += fragment
+        return fragment
+
+    def _close_args_if_needed(self) -> str | None:
+        if self._args_closed[self.current_tool_id]:
+            return None
+        self._args_closed[self.current_tool_id] = True
+        if not self._args_started[self.current_tool_id]:
+            fragment = "{}"
+            self.streamed_args_for_tool[self.current_tool_id] = fragment
+        else:
+            fragment = "}"
+            self.streamed_args_for_tool[self.current_tool_id] += fragment
+        return fragment
diff --git a/aphrodite/tool_parsers/streaming.py b/aphrodite/tool_parsers/streaming.py
new file mode 100644
index 0000000000..be8b03c2ac
--- /dev/null
+++ b/aphrodite/tool_parsers/streaming.py
@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from typing import TYPE_CHECKING
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from aphrodite.entrypoints.chat_utils import make_tool_call_id
+from aphrodite.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
+from aphrodite.tool_parsers.mistral_tool_parser import MistralToolCall
+from aphrodite.tool_parsers.utils import partial_json_loads
+from aphrodite.utils.mistral import is_mistral_tokenizer
+
+if TYPE_CHECKING:
+    from aphrodite.tokenizers import TokenizerLike
+else:
+    TokenizerLike = object
+
+
+def _bracket_level(s: str, opening: str = "{", closing: str = "}") -> int:
+    """Return the net nesting depth of ``s``: openers seen minus closers seen.
+
+    Brackets inside JSON string literals are counted too (no quote tracking).
+    """
+    depth = 0
+    for ch in s:
+        depth += 1 if ch == opening else -1 if ch == closing else 0
+    return depth
+
+
+def filter_delta_text(
+    delta_text: str,
+    previous_text: str,
+) -> tuple[str, bool]:
+    """Keep only the delta characters that sit inside a ``{...}`` object.
+
+    The brace depth starts from ``previous_text``. Characters seen at depth 0
+    (after applying their own brace) are dropped, and a "," at depth 0 stops
+    the scan. The flag reports whether the last brace seen returned to depth 0.
+    """
+    depth = _bracket_level(previous_text)
+    kept: list[str] = []
+    passed_zero = False
+    for ch in delta_text:
+        if ch in "{}":
+            depth += 1 if ch == "{" else -1
+            passed_zero = depth == 0
+        if depth != 0:
+            kept.append(ch)
+        elif ch == ",":
+            break
+    return "".join(kept), passed_zero
+
+
+def extract_named_tool_call_streaming(
+    *,
+    delta_text: str,
+    function_name: str,
+    function_name_returned: bool,
+    tool_call_idx: int | None,
+    tool_call_id_type: str,
+    tokenizer: "TokenizerLike",
+    tool_call_array_index: int = 0,
+) -> tuple[DeltaMessage | None, bool]:
+    """Build a streaming tool-call delta for forced named tool choice.
+
+    The first delta carries the call id, type and function name along with
+    the argument text; later deltas carry argument text only. Returns the
+    delta message and the updated ``function_name_returned`` flag.
+    """
+    if function_name_returned:
+        # Header already sent: forward the argument text as-is.
+        arguments_only = DeltaFunctionCall(arguments=delta_text)
+        delta_tool_call = DeltaToolCall(function=arguments_only, index=tool_call_array_index)
+        return DeltaMessage(tool_calls=[delta_tool_call]), function_name_returned
+
+    # First delta of this call: pick an id and announce the function name.
+    if is_mistral_tokenizer(tokenizer):
+        tool_call_id = MistralToolCall.generate_random_id()
+    else:
+        tool_call_id = make_tool_call_id(
+            id_type=tool_call_id_type,
+            func_name=function_name,
+            idx=tool_call_idx,
+        )
+    delta_tool_call = DeltaToolCall(
+        id=tool_call_id,
+        type="function",
+        function=DeltaFunctionCall(name=function_name, arguments=delta_text),
+        index=tool_call_array_index,
+    )
+
+    return DeltaMessage(tool_calls=[delta_tool_call]), True
+
+
+def extract_required_tool_call_streaming(
+    *,
+    previous_text: str,
+    current_text: str | None,
+    delta_text: str,
+    function_name_returned: bool,
+    tool_call_idx: int | None,
+    tool_call_id_type: str,
+) -> tuple[DeltaMessage | None, bool]:
+    # Parse the partial JSON tool list in current_text and build the next tool-call delta (or None).
+    if current_text is None or current_text == "":
+        # Nothing has been generated yet, so there is nothing to parse.
+        return None, function_name_returned
+    try:
+        flags = Allow.ALL
+        obj, _ = partial_json_loads(current_text, flags)
+    except (
+        partial_json_parser.core.exceptions.MalformedJSON,
+        json.JSONDecodeError,
+    ):
+        obj = None
+
+    # Continue only if the text parsed into a non-empty list of (possibly partial)
+    # tool-call objects; otherwise emit no delta and clear the name-returned flag.
+    if obj is None or not isinstance(obj, list) or not len(obj) > 0:
+        function_name_returned = False
+        delta_message = None
+    else:
+        _, finishes_previous_tool = filter_delta_text(delta_text, previous_text)
+        # Work on the last (most recent) tool call in the parsed list.
+        current_tool_call = obj[-1]
+
+        # Once "parameters" has appeared, the "name" value is complete as well.
+        if not finishes_previous_tool and ("name" not in current_tool_call or "parameters" not in current_tool_call):
+            function_name_returned = False
+            delta_message = None
+        else:
+            if not function_name_returned:
+                # Take the arguments generated so far: the text after the last "parameters": key.
+                param_match = re.search(r'.*"parameters":\s*(.*)', current_text, re.DOTALL)
+                arguments = param_match.group(1) if param_match else ""
+                arguments, _ = filter_delta_text(arguments, previous_text)
+
+                # This delta closed one tool call and already opened the next
+                # (still without "parameters"): report the closed one instead.
+                # NOTE(review): obj[-2] assumes len(obj) >= 2; confirm.
+                if finishes_previous_tool and "parameters" not in current_tool_call:
+                    current_tool_call = obj[-2]
+
+                function_name_returned = True
+                tool_call_id = make_tool_call_id(
+                    id_type=tool_call_id_type,
+                    func_name=current_tool_call["name"],
+                    idx=tool_call_idx,
+                )
+                delta_message = DeltaMessage(
+                    tool_calls=[
+                        DeltaToolCall(
+                            id=tool_call_id,
+                            function=DeltaFunctionCall(name=current_tool_call["name"], arguments=arguments),
+                            index=len(obj) - 1,
+                            type="function",
+                        )
+                    ]
+                )
+
+            else:
+                delta_text, _ = filter_delta_text(delta_text, previous_text)
+
+                if delta_text != "":
+                    delta_message = DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                function=DeltaFunctionCall(
+                                    # The OpenAI API sends the name only once;
+                                    # follow-up deltas carry name=None.
+                                    name=None,
+                                    arguments=delta_text,
+                                ),
+                                index=len(obj) - 1,
+                            )
+                        ]
+                    )
+                else:
+                    delta_message = None
+
+    return delta_message, function_name_returned
diff --git a/aphrodite/transformers_utils/config.py b/aphrodite/transformers_utils/config.py
index 7206a81aef..ac786a7da3 100644
--- a/aphrodite/transformers_utils/config.py
+++ b/aphrodite/transformers_utils/config.py
@@ -67,6 +67,14 @@
 logger = init_logger(__name__)
 
 
+if Version(version("transformers")) < Version("5.0.0"):  # evaluated at module import time
+    logger.warning(
+        "Support for Transformers v4 is deprecated. The Transformers v4 codepath will "
+        "become unmaintained in Aphrodite v0.22.0 and will be removed in Aphrodite v0.24.0. "
+        "Please upgrade to Transformers v5: pip install --upgrade transformers"
+    )
+
+
 class LazyConfigDict(dict):
     def __getitem__(self, key):
         if isinstance(value := super().__getitem__(key), type):
@@ -107,6 +115,7 @@ def __getitem__(self, key):
     mlp_speculator="MLPSpeculatorConfig",
     medusa="MedusaConfig",
     midashenglm="MiDashengLMConfig",
+    moondream3="Moondream3Config",
     eagle="EAGLEConfig",
     speculators="SpeculatorsConfig",
     nemotron="NemotronConfig",
@@ -120,6 +129,7 @@ def __getitem__(self, key):
     qwen3_next="Qwen3NextConfig",
     qwen3_5="Qwen3_5Config",
     qwen3_5_moe="Qwen3_5MoeConfig",
+    laguna="LagunaConfig",
     lfm2_moe="Lfm2MoeConfig",
     tarsier2="Tarsier2Config",
 )
@@ -384,20 +394,31 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
     ompe = getattr(config, "original_max_position_embeddings", None)
 
     if Version(version("transformers")) < Version("5.0.0"):
-        # Transformers v4 installed, legacy config fields may be present
-        if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
-            config.rope_parameters = rope_scaling
-        if (rope_theta is not None or partial_rotary_factor is not None or ompe is not None) and not getattr(
-            config, "rope_parameters", None
-        ):
-            config.rope_parameters = {"rope_type": "default"}
-        # Patch legacy fields into rope_parameters
-        if rope_theta is not None:
-            config.rope_parameters["rope_theta"] = rope_theta
-        if partial_rotary_factor is not None:
-            config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
-        if ompe is not None:
-            config.rope_parameters["original_max_position_embeddings"] = ompe
+        # Transformers v4 installed, legacy config fields may be present.
+        existing_rp = getattr(config, "rope_parameters", None)
+        if isinstance(existing_rp, dict) and is_rope_parameters_nested(existing_rp):
+            # Interleaved-attention models (e.g. Laguna-XS.2) ship a nested
+            # {layer_type: {...}} rope_parameters that the model code indexes
+            # by layer_type. The per-layer-type sub-dicts already carry the
+            # correct rope_theta / partial_rotary_factor / ompe (the converter
+            # places top-level legacy fields inside full_attention), so don't
+            # merge top-level fields here — that would shadow the per-type
+            # values and break sliding-attention layers.
+            pass
+        else:
+            if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
+                config.rope_parameters = rope_scaling
+            if (rope_theta is not None or partial_rotary_factor is not None or ompe is not None) and not getattr(
+                config, "rope_parameters", None
+            ):
+                config.rope_parameters = {"rope_type": "default"}
+            # Patch legacy fields into rope_parameters
+            if rope_theta is not None:
+                config.rope_parameters["rope_theta"] = rope_theta
+            if partial_rotary_factor is not None:
+                config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+            if ompe is not None:
+                config.rope_parameters["original_max_position_embeddings"] = ompe
     elif rope_theta is not None or getattr(config, "rope_parameters", None):
         # Transformers v5 installed
         # Patch these fields in case they used non-standard names
diff --git a/aphrodite/transformers_utils/configs/__init__.py b/aphrodite/transformers_utils/configs/__init__.py
index 1a1a66030d..2a81074e0d 100644
--- a/aphrodite/transformers_utils/configs/__init__.py
+++ b/aphrodite/transformers_utils/configs/__init__.py
@@ -45,10 +45,14 @@
     # `FalconConfig` class from the official HuggingFace transformers library.
     "RWConfig": "aphrodite.transformers_utils.configs.falcon",
     "JAISConfig": "aphrodite.transformers_utils.configs.jais",
+    "LagunaConfig": "aphrodite.transformers_utils.configs.laguna",
     "Lfm2MoeConfig": "aphrodite.transformers_utils.configs.lfm2_moe",
     "MedusaConfig": "aphrodite.transformers_utils.configs.medusa",
     "MiDashengLMConfig": "aphrodite.transformers_utils.configs.midashenglm",
     "MLPSpeculatorConfig": "aphrodite.transformers_utils.configs.mlp_speculator",
+    "Moondream3Config": "aphrodite.transformers_utils.configs.moondream3",
+    "Moondream3TextConfig": "aphrodite.transformers_utils.configs.moondream3",
+    "Moondream3VisionConfig": "aphrodite.transformers_utils.configs.moondream3",
     "MoonViTConfig": "aphrodite.transformers_utils.configs.moonvit",
     "KimiLinearConfig": "aphrodite.transformers_utils.configs.kimi_linear",
     "KimiVLConfig": "aphrodite.transformers_utils.configs.kimi_vl",
@@ -105,10 +109,14 @@
     "IsaacConfig",
     "RWConfig",
     "JAISConfig",
+    "LagunaConfig",
     "Lfm2MoeConfig",
     "MedusaConfig",
     "MiDashengLMConfig",
     "MLPSpeculatorConfig",
+    "Moondream3Config",
+    "Moondream3TextConfig",
+    "Moondream3VisionConfig",
     "MoonViTConfig",
     "KimiLinearConfig",
     "KimiVLConfig",
diff --git a/aphrodite/transformers_utils/configs/laguna.py b/aphrodite/transformers_utils/configs/laguna.py
new file mode 100644
index 0000000000..2702d3af5a
--- /dev/null
+++ b/aphrodite/transformers_utils/configs/laguna.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class LagunaConfig(PretrainedConfig):
+    """Configuration for Laguna models; accepts v4-style or v5-style RoPE arguments."""
+
+    model_type = "laguna"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.g_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: int = 100352,
+        hidden_size: int = 2048,
+        intermediate_size: int = 8192,
+        num_hidden_layers: int = 40,
+        num_attention_heads: int = 48,
+        num_key_value_heads: int = 8,
+        head_dim: int = 128,
+        qkv_bias: bool = False,
+        attention_bias: bool = False,
+        gating: bool | str = True,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 131072,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-6,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        rope_theta: float = 500000.0,
+        rope_scaling: dict | None = None,
+        rope_parameters: dict | None = None,
+        partial_rotary_factor: float = 1.0,
+        attention_dropout: float = 0.0,
+        sliding_window: int | None = None,
+        layer_types: list[str] | None = None,
+        swa_attention_sink_enabled: bool = False,
+        swa_rope_parameters: dict | None = None,
+        num_attention_heads_per_layer: list[int] | None = None,
+        num_experts: int = 256,
+        num_experts_per_tok: int = 8,
+        moe_intermediate_size: int = 512,
+        shared_expert_intermediate_size: int = 512,
+        norm_topk_prob: bool = True,
+        decoder_sparse_step: int = 1,
+        mlp_only_layers: list[int] | None = None,
+        router_aux_loss_coef: float = 0.001,
+        output_router_logits: bool = False,
+        moe_routed_scaling_factor: float = 1.0,
+        moe_apply_router_weight_on_input: bool = False,
+        **kwargs,
+    ):
+        mlp_only_layers = [0] if mlp_only_layers is None else mlp_only_layers
+
+        # RoPE may arrive v4-style (rope_theta + rope_scaling) or v5-style
+        # (rope_parameters). Fold v5 into the v4 fields so downstream code has one path.
+        if rope_parameters is not None:
+            extras = dict(rope_parameters)
+            rope_theta = float(extras.pop("rope_theta", rope_theta))
+            if (rope_type := extras.pop("rope_type", None)) not in (None, "default"):
+                rope_scaling = {"rope_type": rope_type, **extras}
+            elif extras and rope_scaling is None:
+                rope_scaling = {"rope_type": "default", **extras}
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.qkv_bias = qkv_bias
+        self.attention_bias = attention_bias
+        self.gating = gating
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        self.swa_attention_sink_enabled = swa_attention_sink_enabled
+        self.swa_rope_parameters = swa_rope_parameters
+        self.num_attention_heads_per_layer = num_attention_heads_per_layer
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.norm_topk_prob = norm_topk_prob
+        self.decoder_sparse_step = decoder_sparse_step
+        self.mlp_only_layers = mlp_only_layers
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.output_router_logits = output_router_logits
+        self.moe_routed_scaling_factor = moe_routed_scaling_factor
+        self.moe_apply_router_weight_on_input = moe_apply_router_weight_on_input
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["LagunaConfig"]
diff --git a/aphrodite/transformers_utils/configs/mimo_v2_omni.py b/aphrodite/transformers_utils/configs/mimo_v2_omni.py
new file mode 100644
index 0000000000..7d24cb98e7
--- /dev/null
+++ b/aphrodite/transformers_utils/configs/mimo_v2_omni.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from transformers import PretrainedConfig
+
+
+class Mimo_VLVisionConfig(PretrainedConfig):
+    """Vision-encoder settings stored under the ``vision_config`` key (model type ``mimovl``)."""
+
+    model_type = "mimovl"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=28,
+        hidden_size=1280,
+        hidden_act="silu",
+        intermediate_size=4608,
+        num_heads=32,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        tokens_per_second=2,
+        window_size=128,
+        out_hidden_size=2048,
+        fullatt_block_indexes=None,
+        initializer_range=0.02,
+        kv_channels=64,  # HACK
+        qk_channels=64,
+        num_query_groups=4,
+        num_key_value_heads=8,
+        vit_window_attn_types=None,
+        visual_token_window_size=64,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        # GQA support: an explicit ``num_key_value_heads=None`` falls back to
+        # num_heads (MHA); when the argument is omitted the default of 8 applies.
+        self.num_key_value_heads = num_heads if num_key_value_heads is None else num_key_value_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.tokens_per_second = tokens_per_second
+        self.window_size = window_size
+        self.fullatt_block_indexes = [7, 15, 23, 31] if fullatt_block_indexes is None else fullatt_block_indexes
+        self.out_hidden_size = out_hidden_size
+        self.initializer_range = initializer_range
+        self.kv_channels = kv_channels
+        self.qk_channels = qk_channels
+        self.num_query_groups = num_query_groups
+        self.vit_window_attn_types = vit_window_attn_types or [-1] * depth
+        self.visual_token_window_size = visual_token_window_size
diff --git a/aphrodite/transformers_utils/configs/moondream3.py b/aphrodite/transformers_utils/configs/moondream3.py
new file mode 100644
index 0000000000..12bdcca261
--- /dev/null
+++ b/aphrodite/transformers_utils/configs/moondream3.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Configuration for Moondream3 model."""
+
+from transformers import PretrainedConfig
+
+
+class Moondream3VisionConfig(PretrainedConfig):
+    """Vision encoder configuration for Moondream3."""
+
+    model_type = "moondream3_vision"
+
+    def __init__(
+        self,
+        enc_dim: int = 1152,
+        enc_patch_size: int = 14,
+        enc_n_layers: int = 27,
+        enc_ff_dim: int = 4304,
+        enc_n_heads: int = 16,
+        proj_inner_dim: int = 8192,
+        crop_size: int = 378,
+        max_crops: int = 12,
+        overlap_margin: int = 4,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.enc_dim = enc_dim
+        self.enc_patch_size = enc_patch_size
+        self.enc_n_layers = enc_n_layers
+        self.enc_ff_dim = enc_ff_dim
+        self.enc_n_heads = enc_n_heads
+        self.proj_inner_dim = proj_inner_dim
+        self.crop_size = crop_size
+        self.max_crops = max_crops
+        self.overlap_margin = overlap_margin
+
+        # Aliases of the fields above under the standard HuggingFace attribute names
+        self.hidden_size = enc_dim
+        self.num_attention_heads = enc_n_heads
+        self.num_hidden_layers = enc_n_layers
+        self.intermediate_size = enc_ff_dim
+        self.patch_size = enc_patch_size
+        self.image_size = crop_size
+
+
+class Moondream3TextConfig(PretrainedConfig):
+    """Text decoder configuration for Moondream3."""
+
+    model_type = "moondream3_text"
+
+    def __init__(
+        self,
+        dim: int = 2048,
+        ff_dim: int = 8192,
+        n_layers: int = 24,
+        vocab_size: int = 51200,
+        max_context: int = 4096,
+        n_heads: int = 32,
+        n_kv_heads: int = 32,
+        prefix_attn: int = 730,
+        rope_theta: float = 1500000.0,
+        moe: dict | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Keep the values under their original Moondream3 names
+        self.dim = dim
+        self.ff_dim = ff_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.prefix_attn = prefix_attn
+        self.max_context = max_context
+        self.rope_theta = rope_theta
+
+        # MoE settings; keys missing from ``moe`` fall back to the defaults given here
+        moe = moe or {}
+        self.moe_start_layer = moe.get("start_layer", 4)
+        self.moe_num_experts = moe.get("n_experts", 64)
+        self.moe_experts_per_token = moe.get("n_experts_per_tok", 8)
+        self.moe_expert_inner_dim = moe.get("expert_inner_dim", 1024)
+
+        # Standard HuggingFace attributes (required by Aphrodite)
+        self.hidden_size = dim
+        self.num_attention_heads = n_heads
+        self.num_key_value_heads = n_kv_heads
+        self.num_hidden_layers = n_layers
+        self.intermediate_size = ff_dim
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_context
+
+        # Moondream3 uses token 0 (<|endoftext|>) as both BOS and EOS.
+        # Token 3 (<|md_reserved_2|>) is an answer delimiter that the model
+        # implementation suppresses during generation.
+        self.bos_token_id = 0  # assigned after super().__init__(), so this value wins
+        self.eos_token_id = 0
+
+        # The MoE sizes again, under the standard HuggingFace attribute names
+        self.num_local_experts = self.moe_num_experts
+        self.num_experts_per_tok = self.moe_experts_per_token
+
+
+class Moondream3Config(PretrainedConfig):
+    """Combined configuration for Moondream3 multimodal model."""
+
+    model_type = "moondream3"
+    is_composition = True
+
+    def __init__(
+        self,
+        config: dict | None = None,
+        **kwargs,
+    ):
+        config = config or {}
+
+        # Build the text sub-config from the "text" section (all defaults if absent)
+        text_config = config.get("text", {})
+        self.text_config: Moondream3TextConfig = Moondream3TextConfig(**text_config)
+
+        # Build the vision sub-config from the "vision" section (all defaults if absent)
+        vision_config = config.get("vision", {})
+        self.vision_config = Moondream3VisionConfig(**vision_config)
+
+        # Store the original config dict for model access
+        self.config = config
+        tokenizer_config = config.get("tokenizer", {})
+        self.answer_token_id = tokenizer_config.get("answer_id", 3)  # 3 when "answer_id" is absent
+
+        super().__init__(**kwargs)
+
+        # Expose key attributes at top level for Aphrodite compatibility
+        self.hidden_size = self.text_config.hidden_size
+        self.num_attention_heads = self.text_config.num_attention_heads
+        self.num_key_value_heads = self.text_config.num_key_value_heads
+        self.num_hidden_layers = self.text_config.num_hidden_layers
+        self.vocab_size = self.text_config.vocab_size
+        self.intermediate_size = self.text_config.intermediate_size
+
+        # Moondream3 uses token 0 (<|endoftext|>) as both BOS and EOS.
+        # Token 3 (<|md_reserved_2|>) is an answer delimiter that the model
+        # implementation suppresses during generation.
+        self.bos_token_id = 0  # assigned after super().__init__(), so this value wins
+        self.eos_token_id = 0
+
+    def get_text_config(self, decoder: bool = False) -> "Moondream3TextConfig":
+        """Return the text config for Aphrodite's text_config detection.
+
+        Args:
+            decoder: Ignored. Only used for encoder-decoder models.
+        """
+        return self.text_config
diff --git a/aphrodite/transformers_utils/model_arch_config_convertor.py b/aphrodite/transformers_utils/model_arch_config_convertor.py
index 02645171d5..b47d293628 100644
--- a/aphrodite/transformers_utils/model_arch_config_convertor.py
+++ b/aphrodite/transformers_utils/model_arch_config_convertor.py
@@ -258,6 +258,7 @@ def is_mm_prefix_lm(self) -> bool:
             "bagel",
             "gemma3",
             "molmo2",
+            "moondream3",
             "paligemma",
             "umm",
         )
@@ -336,6 +337,9 @@ def get_total_num_kv_heads(self) -> int:
         assert enc_num_kv_heads == dec_num_kv_heads, "Encoder and decoder must have the same number of kv heads"
         return enc_num_kv_heads
 
+    def is_mm_prefix_lm(self) -> bool:
+        return False  # always False for this convertor, regardless of the config
+
 
 class MambaModelArchConfigConvertor(ModelArchConfigConvertorBase):
     def get_head_size(self) -> int:
@@ -421,6 +425,37 @@ def get_num_hidden_layers(self) -> int:
         return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
 
 
+def _strip_mimo_v2_attention_chunk_size(hf_config: PretrainedConfig, hf_text_config: PretrainedConfig) -> None:
+    # MiMo-V2-Flash's config.json sets `attention_chunk_size=128` but the
+    # architecture does not actually use chunked local attention. Leaving it set
+    # makes Aphrodite disable the hybrid KV cache manager, so delete it in place.
+    for cfg in (hf_text_config, hf_config):
+        if cfg is not None and hasattr(cfg, "attention_chunk_size"):
+            delattr(cfg, "attention_chunk_size")
+
+
+class MimoV2ModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def __init__(self, hf_config: PretrainedConfig, hf_text_config: PretrainedConfig):
+        if getattr(hf_config, "vision_config", None):  # truthy vision_config: rewrite architectures in place
+            hf_config.architectures = ["MiMoV2OmniForCausalLM"]
+        super().__init__(hf_config, hf_text_config)
+        _strip_mimo_v2_attention_chunk_size(hf_config, hf_text_config)  # mutates both config objects
+
+
+class MimoV2MTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def __init__(self, hf_config: PretrainedConfig, hf_text_config: PretrainedConfig):
+        super().__init__(hf_config, hf_text_config)
+        _strip_mimo_v2_attention_chunk_size(hf_config, hf_text_config)
+
+    def get_num_hidden_layers(self) -> int:
+        """Return num_nextn_predict_layers, else n_predict (set by hf_config_override), else 0."""
+        for attr_name in ("num_nextn_predict_layers", "n_predict"):
+            layer_count = getattr(self.hf_text_config, attr_name, None)
+            if layer_count is not None:
+                return layer_count
+        return 0
+
+
 class GLM4MoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
     def get_num_hidden_layers(self) -> int:
         return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
@@ -484,6 +519,10 @@ def get_head_size(self) -> int:
     "qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor,
     "qwen3_5_mtp": Qwen3_5MTPModelArchConfigConvertor,
     "mimo_mtp": MimoMTPModelArchConfigConvertor,
+    "mimo_v2": MimoV2ModelArchConfigConvertor,
+    "mimo_v2_flash": MimoV2ModelArchConfigConvertor,
+    "mimo_v2_mtp": MimoV2MTPModelArchConfigConvertor,
+    "mimo_v2_omni_mtp": MimoV2MTPModelArchConfigConvertor,
     "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
     "glm_ocr_mtp": GLM4MoeMTPModelArchConfigConvertor,
     "ernie_mtp": ErnieMTPModelArchConfigConvertor,
diff --git a/aphrodite/transformers_utils/processors/__init__.py b/aphrodite/transformers_utils/processors/__init__.py
index 14ce84c03f..47dd216338 100644
--- a/aphrodite/transformers_utils/processors/__init__.py
+++ b/aphrodite/transformers_utils/processors/__init__.py
@@ -23,10 +23,12 @@
     "H2OVLProcessor",
     "HunYuanVLProcessor",
     "HunYuanVLImageProcessor",
+    "Moondream3Processor",
     "InternVLProcessor",
     "IsaacProcessor",
     "KimiAudioProcessor",
     "KimiK25Processor",
+    "MiMoOmniProcessor",
     "MistralCommonPixtralProcessor",
     "MistralCommonVoxtralProcessor",
     "NanoNemotronVLProcessor",
@@ -57,8 +59,10 @@
     "IsaacProcessor": "aphrodite.transformers_utils.processors.isaac",
     "KimiAudioProcessor": "aphrodite.transformers_utils.processors.kimi_audio",
     "KimiK25Processor": "aphrodite.transformers_utils.processors.kimi_k25",
+    "MiMoOmniProcessor": "aphrodite.transformers_utils.processors.mimo_v2_omni",
     "MistralCommonPixtralProcessor": "aphrodite.transformers_utils.processors.pixtral",
     "MistralCommonVoxtralProcessor": "aphrodite.transformers_utils.processors.voxtral",
+    "Moondream3Processor": "aphrodite.transformers_utils.processors.moondream3",
     "NanoNemotronVLProcessor": "aphrodite.transformers_utils.processors.nano_nemotron_vl",
     "NemotronVLProcessor": "aphrodite.transformers_utils.processors.nemotron_vl",
     "LlamaNemotronVLEmbedProcessor": "aphrodite.transformers_utils.processors.nemotron_vl",
diff --git a/aphrodite/transformers_utils/processors/mimo_v2_omni.py b/aphrodite/transformers_utils/processors/mimo_v2_omni.py
new file mode 100644
index 0000000000..e028efb6b1
--- /dev/null
+++ b/aphrodite/transformers_utils/processors/mimo_v2_omni.py
@@ -0,0 +1,1181 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# mypy: ignore-errors
+"""MiMo-Omni multimodal processor for Aphrodite.
+
+Ported from SGLang's MiMoV2OmniProcessor / MiMoVLProcessor implementations.
+"""
+
+import contextlib
+import copy
+import io
+import logging
+import math
+from collections import OrderedDict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from io import BytesIO
+from typing import Any, Literal
+
+import numpy as np
+import regex as re
+import requests
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from transformers import BatchFeature, TensorType
+from transformers.processing_utils import ProcessorMixin
+
+try:
+    from torchcodec.decoders import AudioDecoder
+
+    _HAS_TORCHCODEC = True
+except ImportError:
+    AudioDecoder = None
+    _HAS_TORCHCODEC = False
+
+try:
+    import torchaudio
+    from torchaudio.transforms import MelSpectrogram as _MelSpectrogram
+
+    _HAS_TORCHAUDIO = True
+except ImportError:
+    torchaudio = None  # type: ignore[assignment]
+    _MelSpectrogram = None  # type: ignore[assignment,misc]
+    _HAS_TORCHAUDIO = False
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+_PIXEL_MEAN = [123.675, 116.28, 103.53]  # per-channel mean subtracted by _standardize (0-255 scale presumed)
+_PIXEL_STD = [58.395, 57.12, 57.375]  # per-channel divisor applied by _standardize
+_mean_std_cache: dict[str, tuple[torch.Tensor, torch.Tensor]] = {}  # str(device) -> (mean, std) tensors
+
+
+# ---------------------------------------------------------------------------
+# Data classes
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ImageInput:
+    # PIL.Image | str (path/url/base64) | bytes | torch.Tensor (C,H,W)
+    image: Any
+    max_pixels: int | None = None  # per-image override; None falls back to the processor default
+    min_pixels: int | None = None  # per-image override; None falls back to the processor default
+
+
+@dataclass
+class VideoInput:
+    # tuple[frames_TCHW: torch.Tensor, timestamps_T: torch.Tensor]
+    video: Any
+    min_pixels: int | None = None  # unset -> processor-level default
+    max_pixels: int | None = None  # unset -> processor-level default
+    total_max_pixels: int | None = None  # unset -> processor-level default
+    fps: float | None = None  # consulted only when num_frames is None
+    num_frames: int | None = None  # takes precedence over fps when both are given
+    max_frames: int | None = None  # forwarded only together with fps
+    min_frames: int | None = None  # forwarded only together with fps
+    do_include_last_frame: bool | None = False  # NOTE(review): no reader in the visible part of this module
+    start_time: float | None = None  # None -> first timestamp of the clip
+    end_time: float | None = None  # None -> last timestamp plus one frame interval
+    segment_type: Literal["individual", "partial"] = "individual"  # "partial" trims frames to [start, end)
+
+
+@dataclass
+class AudioInput:
+    # str (path / http(s) URL / data: URI) | bytes | tuple[waveform, sr]
+    # | np.ndarray (waveform, taken to be at the processor rate) | torch.Tensor (T, n_vq) pre-tokenized
+    audio: Any
+
+
+@dataclass
+class VideoAudioInput:
+    video: Any  # same as VideoInput.video
+    audio: Any  # same as AudioInput.audio
+    min_pixels: int | None = None  # unset -> processor-level default
+    max_pixels: int | None = None  # unset -> processor-level default
+    total_max_pixels: int | None = None  # unset -> processor-level default
+    fps: float | None = None  # consulted only when num_frames is None
+    num_frames: int | None = None  # takes precedence over fps when both are given
+    max_frames: int | None = None  # forwarded only together with fps
+    min_frames: int | None = None  # forwarded only together with fps
+    do_include_last_frame: bool | None = False  # NOTE(review): no reader in the visible part of this module
+    start_time: float | None = None  # None -> first timestamp of the clip
+    end_time: float | None = None  # None -> last timestamp plus one frame interval
+    segment_type: Literal["individual", "partial"] = "individual"  # "partial" trims frames to [start, end)
+
+
+@dataclass
+class Content:
+    type: Literal["text", "image", "video", "audio", "video_audio"]  # selects the branch taken in process()
+    content: Any  # str or token ids for "text"; the matching *Input dataclass otherwise
+    is_target: bool | None = None  # truthy on a "text" item -> its token ids also become labels
+
+
+@dataclass
+class MiMoVLInputSample:
+    input_ids: torch.Tensor  # token ids of the whole sample, placeholders included
+    labels: torch.Tensor | None  # target ids shifted left by one; pad_token_id elsewhere
+    pixel_values: list[torch.Tensor]  # one flattened patch matrix per image
+    pixel_values_videos: list[torch.Tensor]  # one flattened patch matrix per video / video_audio item
+    image_thw_grids: list[torch.Tensor]  # [grid_t, grid_h, grid_w] per image
+    video_thw_grids: list[torch.Tensor]  # [grid_t, grid_h, grid_w] per video / video_audio item
+    audio_inputs: list[torch.Tensor]  # mel spectrograms or pre-tokenized code tensors
+    second_per_grid_ts: list[float] = field(default_factory=list)  # temporal_patch_size / sampled fps, per video
+    video_start_times: list[float] = field(default_factory=list)  # first timestamp of each video
+    audio_token_lens: list[int] = field(default_factory=list)  # placeholder count per "audio" item
+    va_audio_inputs: list[torch.Tensor] = field(default_factory=list)  # mel spectrograms of video_audio items
+    video_audio_n_segs: list[int] = field(default_factory=list)  # groups per video_audio item; 0 for plain video
+    video_audio_seg_lens: list[int] = field(default_factory=list)  # audio placeholders per group, flattened
+    position_ids: torch.Tensor | None = None  # process() fills this with 0..N-1 expanded to 3 rows
+    rope_deltas: torch.Tensor | None = None  # process() fills this with a (1, 1) int32 zero tensor
+    extra: dict = field(default_factory=dict)  # may carry "is_audio_tokenized"
+
+
+# ---------------------------------------------------------------------------
+# Vision utilities
+# ---------------------------------------------------------------------------
+
+
+def _format_timestamp(ts: float) -> str:
+    return "{:02d}:{:02d}".format(int(ts // 60), int(ts % 60))  # "MM:SS"; minutes are not wrapped at 60
+
+
+def _smart_resize(h: int, w: int, factor: int, min_px: int, max_px: int) -> tuple[int, int]:  # -> (height, width)
+    if min(h, w) < factor:  # upscale so the shorter side equals factor, keeping the aspect ratio
+        if h < w:
+            h, w = factor, int(w * factor / h)
+        else:
+            w, h = factor, int(h * factor / w)
+    elif max(h, w) / min(h, w) > 200:  # only checked when neither side is below factor
+        raise ValueError(f"Aspect ratio > 200 not allowed: {h}x{w}")
+    h_bar = round(h / factor) * factor  # nearest multiple of factor
+    w_bar = round(w / factor) * factor
+    if h_bar * w_bar > max_px:  # too large: shrink, rounding down to multiples of factor
+        beta = math.sqrt((h * w) / max_px)
+        h_bar = math.floor(h / beta / factor) * factor
+        w_bar = math.floor(w / beta / factor) * factor
+    elif h_bar * w_bar < min_px:  # too small: grow, rounding up to multiples of factor
+        beta = math.sqrt(min_px / (h * w))
+        h_bar = math.ceil(h * beta / factor) * factor
+        w_bar = math.ceil(w * beta / factor) * factor
+    return int(h_bar), int(w_bar)
+
+
+def _to_rgb(img: Image.Image) -> Image.Image:
+    if img.mode != "RGBA":
+        return img.convert("RGB")
+    canvas = Image.new("RGB", img.size, (255, 255, 255))  # white backdrop for transparent pixels
+    canvas.paste(img, mask=img.split()[3])  # band 3 of an RGBA image is alpha
+    return canvas
+
+
+def _standardize(images: torch.Tensor) -> torch.Tensor:
+    key = str(images.device)  # one cached (mean, std) pair per device string
+    stats = _mean_std_cache.get(key)
+    if stats is None:
+        mean = torch.tensor(_PIXEL_MEAN, device=images.device).view(1, -1, 1, 1)
+        std = torch.tensor(_PIXEL_STD, device=images.device).view(1, -1, 1, 1)
+        stats = _mean_std_cache[key] = (mean, std)
+    return (images - stats[0]) / stats[1]
+
+
+def _transform_batch(
+    frames: torch.Tensor,
+    factor: int,
+    min_px: int,
+    max_px: int,
+    device: torch.device | None = None,
+) -> tuple[torch.Tensor, int, int]:
+    # Resize the whole (T, C, H, W) batch in one call; returns (normalized, width, height).
+    batch = frames if device is None else frames.to(device)
+    h, w = batch.shape[2:]
+    new_h, new_w = _smart_resize(h, w, factor, min_px, max_px)
+    scaled = F.interpolate(batch.float(), (new_h, new_w), mode="bilinear", align_corners=False)
+    return _standardize(scaled), new_w, new_h
+
+
+def _transform_single(
+    img: Any,
+    factor: int,
+    min_px: int,
+    max_px: int,
+    device: torch.device | None = None,
+) -> tuple[torch.Tensor, int, int]:
+    if isinstance(img, Image.Image):
+        img = img.convert("RGB")
+        w, h = img.size
+        chw = torch.from_numpy(np.array(img)).permute(2, 0, 1).float()  # HWC array -> CHW float tensor
+    elif isinstance(img, torch.Tensor):
+        chw = img.float()
+        _, h, w = chw.shape
+    else:
+        raise TypeError(f"Expected Tensor or PIL.Image, got {type(img)}")
+    chw = chw if device is None else chw.to(device)
+    new_h, new_w = _smart_resize(h, w, factor, min_px, max_px)
+    # interpolate expects a batch axis: add it for the resize and drop it again afterwards.
+    resized = F.interpolate(chw.unsqueeze(0), (new_h, new_w), mode="bilinear", align_corners=False)
+    return _standardize(resized).squeeze(0), new_w, new_h
+
+
+def _fetch_image(src: Any) -> Image.Image:  # PIL image, raw bytes, or str (URL / file:// / data URI / path) -> RGB
+    if isinstance(src, Image.Image):
+        return _to_rgb(src)
+    if isinstance(src, bytes):
+        return _to_rgb(copy.deepcopy(Image.open(BytesIO(src))))
+    if isinstance(src, str):
+        if src.startswith(("http://", "https://")):
+            r = requests.get(src, timeout=30)  # NOTE(review): fetches a caller-supplied URL; confirm upstream filtering
+            r.raise_for_status()
+            return _to_rgb(copy.deepcopy(Image.open(BytesIO(r.content))))
+        if src.startswith("file://"):
+            return _to_rgb(Image.open(src[7:]))  # strip the 7-character "file://" prefix
+        if src.startswith("data:image"):
+            import pybase64 as _b64
+
+            _, b64 = src.split("base64,", 1)  # payload is everything after the first "base64,"
+            return _to_rgb(copy.deepcopy(Image.open(BytesIO(_b64.b64decode(b64)))))
+        return _to_rgb(Image.open(src))  # anything else is treated as a local path
+    raise ValueError(f"Unrecognized image source: {type(src)}")
+
+
+# ---------------------------------------------------------------------------
+# Core processor
+# ---------------------------------------------------------------------------
+
+
+class MiMoVLProcessor:
+    """Core MiMo-VL multimodal processor.
+
+    Handles image/video/audio preprocessing and token sequence construction.
+    Ported from SGLang's MiMoVLProcessor.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Any,
+        patch_size: int = 14,
+        merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        temporal_compression_ratio: int = 1,
+        use_video_timestamps: bool = True,
+        video_audio_interleave_length: int = 0,
+        audio_kernel_size: int = 3,
+        audio_stride_size: int = 2,
+        audio_avg_pooler: int = 2,
+        audio_sampling_rate: int = 24000,
+        audio_nfft: int = 960,
+        audio_hop_length: int = 240,
+        audio_window_size: int = 960,
+        audio_fmin: float = 0.0,
+        audio_fmax: float | None = None,
+        audio_n_mels: int = 128,
+        audio_segment_size: int = 6000,
+        audio_channels: int = 8,
+        audio_group_size: int = 4,
+        audio_input_id_per_second: float = 25.0,
+        audio_zeroemb_idx: int = 4096,
+        image_min_pixels: int | None = None,
+        image_max_pixels: int | None = None,
+        video_min_pixels: int | None = None,
+        video_max_pixels: int | None = None,
+        video_total_max_pixels: int | None = None,
+        fps: float | None = None,
+        num_frames: int | None = None,
+        max_frames: int | None = None,
+        min_frames: int | None = None,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        audio_token_id: int | None = None,
+        vision_start_token_id: int | None = None,
+        vision_end_token_id: int | None = None,
+        audio_start_token_id: int | None = None,
+        audio_end_token_id: int | None = None,
+        video_start_token_id: int | None = None,
+        video_end_token_id: int | None = None,
+        pad_token_id: int | None = None,
+        rope_type: str = "rope",
+        video_process_num_threads: int = 16,
+        device: Any | None = None,
+        **kwargs: Any,  # accepted but not read in this constructor
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.video_process_num_threads = video_process_num_threads
+        self.device = torch.device(device) if isinstance(device, str) else device  # str -> torch.device
+
+        self.rope_type = "rope" if rope_type == "1d" else rope_type  # "1d" is accepted as an alias of "rope"
+        assert self.rope_type in ("rope", "mrope"), f"Unknown rope_type: {self.rope_type}"
+
+        # video timestamps require 1-D rope
+        assert use_video_timestamps, "use_video_timestamps must be True"
+        assert self.rope_type == "rope", "use_video_timestamps requires rope_type='rope'"  # so "mrope" is rejected here
+        self.use_video_timestamps = use_video_timestamps
+        self.video_audio_interleave_length = video_audio_interleave_length  # -1, 0 or a window length; see process()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.audio_token_id = audio_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        self.audio_start_token_id = audio_start_token_id
+        self.audio_end_token_id = audio_end_token_id
+        self.video_start_token_id = video_start_token_id
+        self.video_end_token_id = video_end_token_id
+        self.pad_token_id = pad_token_id  # also used as the label filler for non-target spans
+
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.temporal_compression_ratio = temporal_compression_ratio
+
+        self.audio_sampling_rate = audio_sampling_rate  # audio at any other rate is resampled to this one
+        self.audio_nfft = audio_nfft
+        self.audio_hop_length = audio_hop_length
+        self.audio_window_size = audio_window_size
+        self.audio_fmin = audio_fmin
+        self.audio_fmax = audio_fmax
+        self.audio_n_mels = audio_n_mels
+        self.audio_segment_size = audio_segment_size
+        self.audio_kernel_size = audio_kernel_size
+        self.audio_stride_size = audio_stride_size
+        self.audio_avg_pooler = audio_avg_pooler
+        self.audio_channels = audio_channels
+        self.audio_group_size = audio_group_size
+        self.audio_input_id_per_second = audio_input_id_per_second
+
+        self._mel_spec_kwargs = dict(  # constructor arguments kept for the lazily built MelSpectrogram
+            sample_rate=audio_sampling_rate,
+            n_fft=audio_nfft,
+            hop_length=audio_hop_length,
+            win_length=audio_window_size,
+            f_min=audio_fmin,
+            f_max=audio_fmax,
+            n_mels=audio_n_mels,
+            power=1.0,
+            center=True,
+        )
+        self._mel_spectrogram: Any | None = None  # created on first access of the mel_spectrogram property
+        self._resamplers: OrderedDict = OrderedDict()  # source sample rate -> Resample module, kept in LRU order
+        self._resamplers_max = 16  # capacity of the resampler cache
+
+        if isinstance(audio_zeroemb_idx, int):  # a single int is repeated once per audio channel
+            self.audio_zeroemb_idxs = torch.tensor([audio_zeroemb_idx] * audio_channels, dtype=torch.int32)
+        else:  # any other value is passed to torch.tensor as given
+            self.audio_zeroemb_idxs = torch.tensor(audio_zeroemb_idx, dtype=torch.int32)
+
+        assert image_min_pixels is not None, "image_min_pixels must be set"
+        assert image_max_pixels is not None, "image_max_pixels must be set"
+        assert video_min_pixels is not None, "video_min_pixels must be set"
+        assert video_max_pixels is not None, "video_max_pixels must be set"
+        assert video_total_max_pixels is not None, "video_total_max_pixels must be set"
+        assert fps is not None or num_frames is not None, "fps or num_frames must be set"
+
+        self._img_kw = {"min_pixels": image_min_pixels, "max_pixels": image_max_pixels}  # defaults for ImageInput
+        self._vid_kw = {  # defaults used when a video item leaves a field unset
+            "min_pixels": video_min_pixels,
+            "max_pixels": video_max_pixels,
+            "total_max_pixels": video_total_max_pixels,
+            "fps": fps,
+            "num_frames": num_frames,
+            "max_frames": max_frames,
+            "min_frames": min_frames,
+        }
+
+    @property
+    def mel_spectrogram(self) -> Any:
+        if self._mel_spectrogram is None and _MelSpectrogram is None:  # nothing cached and torchaudio missing
+            raise RuntimeError("torchaudio is required for audio. Install with: pip install torchaudio")
+        if self._mel_spectrogram is None:
+            self._mel_spectrogram = _MelSpectrogram(**self._mel_spec_kwargs)  # built once, then reused
+        return self._mel_spectrogram
+
+    def _resolve_img_kw(self, img: ImageInput) -> dict:
+        lo = self._img_kw["min_pixels"] if img.min_pixels is None else img.min_pixels
+        hi = self._img_kw["max_pixels"] if img.max_pixels is None else img.max_pixels
+        # Keys match the min_px / max_px keyword names of _transform_single.
+        return {"min_px": lo, "max_px": hi}
+
+    def _resolve_vid_kw(self, vid: VideoInput) -> dict:
+        kw: dict = {}  # pixel budgets plus exactly one sampling strategy (num_frames, or fps with frame bounds)
+        for k in ("min_pixels", "max_pixels", "total_max_pixels"):
+            kw[k] = self._vid_kw[k] if getattr(vid, k) is None else getattr(vid, k)  # only None means "unset"
+        if vid.num_frames is not None:  # per-item settings come first; num_frames wins over fps
+            kw["num_frames"] = vid.num_frames
+        elif vid.fps is not None:
+            kw["fps"] = vid.fps
+            if vid.max_frames is not None:
+                kw["max_frames"] = vid.max_frames
+            if vid.min_frames is not None:
+                kw["min_frames"] = vid.min_frames
+        elif self._vid_kw["num_frames"] is not None:  # then the processor-level defaults, in the same order
+            kw["num_frames"] = self._vid_kw["num_frames"]
+        elif self._vid_kw["fps"] is not None:
+            kw["fps"] = self._vid_kw["fps"]
+            if self._vid_kw["max_frames"] is not None:
+                kw["max_frames"] = self._vid_kw["max_frames"]
+            if self._vid_kw["min_frames"] is not None:
+                kw["min_frames"] = self._vid_kw["min_frames"]
+        else:
+            raise ValueError("No video sampling strategy specified (fps or num_frames).")
+        return kw
+
+    def preprocess_audio(self, audio: Any) -> tuple[torch.Tensor, int]:
+        """Decode audio bytes/path/tuple → (mel_spec (T, n_mels), token_len)."""
+        if isinstance(audio, tuple):
+            waveform, original_sr = audio  # already decoded: (waveform, sample_rate)
+        else:
+            if AudioDecoder is None:
+                raise RuntimeError("torchcodec is required for audio. Install with: pip install torchcodec")
+            if isinstance(audio, bytes):
+                file_obj: Any = io.BytesIO(audio)
+            elif isinstance(audio, str):
+                if audio.startswith("data:"):
+                    import pybase64 as _b64
+
+                    file_obj = io.BytesIO(_b64.b64decode(audio.split(",")[1]))  # payload follows the first comma
+                elif audio.startswith(("http://", "https://")):
+                    r = requests.get(audio, timeout=30)  # NOTE(review): fetches a caller-supplied URL
+                    r.raise_for_status()
+                    file_obj = io.BytesIO(r.content)
+                else:
+                    file_obj = audio  # any other string goes to the decoder unchanged (presumably a path)
+            else:
+                raise ValueError(f"Unsupported audio source type: {type(audio)}")
+            samples = AudioDecoder(file_obj).get_all_samples()
+            waveform = samples.data
+            original_sr = samples.sample_rate
+
+        if original_sr != self.audio_sampling_rate:
+            if original_sr not in self._resamplers:
+                if len(self._resamplers) >= self._resamplers_max:
+                    self._resamplers.popitem(last=False)  # evict the least recently used resampler
+                self._resamplers[original_sr] = torchaudio.transforms.Resample(
+                    orig_freq=original_sr, new_freq=self.audio_sampling_rate
+                )
+            self._resamplers.move_to_end(original_sr)  # mark as most recently used
+            waveform = self._resamplers[original_sr](waveform)
+
+        if waveform.ndim == 2:
+            waveform = waveform.mean(dim=0)  # 2-D input is averaged over dim 0 (presumably channels)
+        spec = self.mel_spectrogram(waveform[None, :])
+        spec = torch.log(torch.clip(spec, min=1e-7)).squeeze(0).transpose(0, 1)  # drop only the batch axis
+
+        n = spec.shape[0]  # NOTE(review): the steps below presumably mirror the encoder's conv/pool lengths
+        n = n + 3 - self.audio_kernel_size
+        n = (n + 2 - self.audio_kernel_size) // self.audio_stride_size + 1
+        n = n // self.audio_avg_pooler + int(n % self.audio_avg_pooler != 0)  # ceil division
+        token_len = math.ceil(n / self.audio_group_size)
+        return spec, token_len
+
+    def process_image(self, image: ImageInput) -> torch.Tensor:
+        # Strings and raw bytes are decoded first; any other input is handed on untouched.
+        src = image.image
+        if isinstance(src, (str, bytes)):
+            src = _fetch_image(src)
+        pixel_limits = self._resolve_img_kw(image)
+        # Height and width are snapped to multiples of one merged patch (patch_size * merge_size).
+        factor = self.patch_size * self.merge_size
+        # Only the normalized tensor is kept; the resized width and height are discarded.
+        outputs = _transform_single(src, factor=factor, device=self.device, **pixel_limits)
+        normalized, _, _ = outputs
+        return normalized
+
+    def process_video(self, video_input: VideoInput) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]:
+        kw = self._resolve_vid_kw(video_input)  # only the three pixel budgets are read below
+        video = video_input.video
+        if not isinstance(video, tuple):
+            raise ValueError(
+                f"video must be a (frames_TCHW, timestamps_T) tuple, "
+                f"got {type(video)}. "
+                "Decode the video before calling the processor."
+            )
+        frames, timestamps = video
+
+        fps = 1.0 if len(timestamps) < 2 else float(1.0 / (float(timestamps[1]) - float(timestamps[0])))
+        start = video_input.start_time if video_input.start_time is not None else float(timestamps[0])
+        end = video_input.end_time if video_input.end_time is not None else float(timestamps[-1]) + 1.0 / fps
+
+        if video_input.segment_type != "individual":  # "partial": keep only frames with start <= t < end
+            mask = (timestamps >= start) & (timestamps < end)
+            idxs = torch.where(mask)[0]
+            if len(idxs) == 0:  # nothing in range: fall back to the last frame at or before start
+                idxs = torch.where(timestamps <= start)[0][-1:]
+            frames, timestamps = frames[idxs], timestamps[idxs]
+
+        tp = self.temporal_patch_size * self.temporal_compression_ratio  # frames per temporal grid step
+        n = frames.shape[0]
+        total_px = kw["total_max_pixels"]  # shared budget; the per-frame cap below is clamped to [min, max]
+        max_px = max(kw["min_pixels"], min(total_px * tp // max(n, 1), kw["max_pixels"]))
+
+        if n % tp != 0:  # pad by repeating the last frame and its timestamp up to a multiple of tp
+            pad = tp - n % tp
+            frames = torch.cat(
+                [frames, frames[-1:].repeat(pad, *([1] * (frames.ndim - 1)))],
+                dim=0,
+            )
+            timestamps = torch.cat([timestamps, timestamps[-1:].repeat(pad)], dim=0)
+
+        transformed, _, _ = _transform_batch(
+            frames,
+            factor=self.patch_size * self.merge_size,
+            min_px=kw["min_pixels"],
+            max_px=max_px,
+            device=self.device,
+        )
+        patches, thw = self._flatten_visual(transformed, "video")
+        meta = {
+            "fps_sampled": fps,  # estimated from the first two timestamps (1.0 when fewer than two)
+            "segment_start_time": start,
+            "segment_end_time": end,
+        }
+        return patches, thw, timestamps, meta  # timestamps include the padded entries
+
+    def process_audio(self, audio: AudioInput) -> Any:  # (mel_spec, token_len) tuple, or a pre-tokenized code tensor
+        src = audio.audio
+        if isinstance(src, np.ndarray):  # arrays are taken to be waveforms already at the processor rate
+            src = (torch.from_numpy(src).float(), self.audio_sampling_rate)
+        if isinstance(src, (str, bytes, tuple)):
+            return self.preprocess_audio(src)
+        # Pre-tokenized tensor (T, n_vq)
+        assert isinstance(src, torch.Tensor) and src.ndim == 2
+        T = src.shape[0]  # rounded up to a multiple of audio_group_size below
+        src = src[:, : self.audio_channels].to(torch.long)  # keep the first audio_channels columns
+        pad_T = (T + self.audio_group_size - 1) // self.audio_group_size * self.audio_group_size
+        padding = torch.zeros(pad_T - T, self.audio_channels, dtype=torch.long) + src[-1]  # repeat the last row
+        src = torch.cat([src, padding], dim=0)
+        return src.reshape(pad_T // self.audio_group_size, self.audio_group_size, self.audio_channels)
+
+    def _flatten_visual(self, visual: torch.Tensor, kind: str) -> tuple[torch.Tensor, torch.Tensor]:
+        if kind == "image":  # a still image is duplicated temporal_patch_size times along a new leading axis
+            h, w = visual.shape[-2:]
+            patches = visual.unsqueeze(0).repeat(self.temporal_patch_size, 1, 1, 1)
+        else:  # video / video_audio
+            temporal_stride = self.temporal_compression_ratio * self.temporal_patch_size
+            assert visual.shape[0] % temporal_stride == 0  # process_video pads frames to guarantee this
+            patches = visual
+            h, w = patches.shape[-2:]
+
+        C = patches.shape[1]
+        grid_t = patches.shape[0] // self.temporal_patch_size
+        grid_h, grid_w = h // self.patch_size, w // self.patch_size  # grid sizes counted in patches
+
+        patches = (
+            patches.contiguous()
+            .view(  # split into (t, tp, C, h/m, m, p, w/m, m, p)
+                grid_t,
+                self.temporal_patch_size,
+                C,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            )
+            .permute(0, 3, 6, 4, 7, 2, 1, 5, 8)  # -> (t, h/m, w/m, m, m, C, tp, p, p)
+            .contiguous()
+            .view(  # one row per patch; the m x m patches of a merge window are adjacent rows
+                grid_t * grid_h * grid_w,
+                C * self.temporal_patch_size * self.patch_size * self.patch_size,
+            )
+        )
+        thw = torch.tensor([grid_t, grid_h, grid_w], dtype=torch.int32)
+        return patches, thw
+
+    def process(self, contents: list[Content], verbose: bool = False) -> MiMoVLInputSample:  # verbose is not read
+        input_ids: list[int] = []
+        labels: list[int] = []
+        img_pv: list[torch.Tensor] = []
+        img_grids: list[torch.Tensor] = []
+        vid_pv: list[torch.Tensor] = []
+        vid_grids: list[torch.Tensor] = []
+        audio_inputs: list[torch.Tensor] = []
+        is_audio_tokenized: list[bool] = []  # one flag per "audio" item
+        audio_token_lens: list[int] = []
+        second_per_grid_ts: list[float] = []
+        video_start_times: list[float] = []
+        va_audio_inputs: list[torch.Tensor] = []
+        video_audio_n_segs: list[int] = []
+        video_audio_seg_lens: list[int] = []
+
+        # Pre-decode videos in parallel
+        vid_info = [
+            (i, c.content, c.type == "video_audio")
+            for i, c in enumerate(contents)
+            if c.type in ("video", "video_audio")
+        ]
+        vid_results: dict[int, tuple] = {}  # index into contents -> process_video() result
+        if vid_info:
+            n_t = min(self.video_process_num_threads, len(vid_info))
+            if n_t > 1 and len(vid_info) > 1:  # a single video is processed inline instead
+                with ThreadPoolExecutor(max_workers=n_t) as ex:
+                    fut_map = {ex.submit(self.process_video, vi): idx for idx, vi, _ in vid_info}
+                    for fut in as_completed(fut_map):
+                        vid_results[fut_map[fut]] = fut.result()  # re-raises any exception from the worker
+            else:
+                for idx, vi, _ in vid_info:
+                    vid_results[idx] = self.process_video(vi)
+
+        for ci, content in enumerate(contents):
+            _ids: list[int] = []
+            _lbls: list[int] | None = None  # stays None unless this item is a target text span
+
+            if content.type == "text":
+                _ids = (
+                    self.tokenizer.encode(content.content)
+                    if isinstance(content.content, str)
+                    else list(content.content)  # non-str text content is taken as ready-made token ids
+                )
+                if content.is_target:
+                    _lbls = _ids
+
+            elif content.type == "image":
+                tensor = self.process_image(content.content)
+                patches, thw = self._flatten_visual(tensor, "image")
+                t, h, w = thw.tolist()
+                n_tok = (t * h * w) // (self.merge_size**2)  # one placeholder per merge_size x merge_size window
+                img_pv.append(patches)
+                img_grids.append(thw)
+                _ids = [self.vision_start_token_id] + [self.image_token_id] * n_tok + [self.vision_end_token_id]
+
+            elif content.type == "video":
+                patches, thw, ts, meta = vid_results[ci]
+                t, h, w = thw.tolist()
+                n_per_grid = h * w // (self.merge_size**2)  # placeholders per temporal grid step
+                vid_pv.append(patches)
+                vid_grids.append(thw)
+                second_per_grid_ts.append(self.temporal_patch_size / meta["fps_sampled"])
+                video_start_times.append(float(ts[0]))
+                video_audio_n_segs.append(0)  # plain video: no interleaved audio groups
+
+                stride = self.temporal_patch_size * self.temporal_compression_ratio
+                ts_texts = [_format_timestamp(float(x)) for x in ts[::stride]]  # one label per temporal grid step
+                ts_ids_list = [self.tokenizer.encode(s) for s in ts_texts]
+
+                _ids = [self.video_start_token_id]
+                for ts_ids in ts_ids_list:  # timestamp tokens, then one delimited run of video placeholders
+                    _ids += (
+                        ts_ids
+                        + [self.vision_start_token_id]
+                        + [self.video_token_id] * n_per_grid
+                        + [self.vision_end_token_id]
+                    )
+                _ids += [self.video_end_token_id]
+
+            elif content.type == "audio":
+                processed = self.process_audio(content.content)
+                if isinstance(processed, tuple):  # (mel_spec, token_len) from raw audio
+                    is_audio_tokenized.append(False)
+                    spec, tok_len = processed
+                    audio_inputs.append(spec)
+                else:  # pre-tokenized codes: one placeholder per group row
+                    is_audio_tokenized.append(True)
+                    tok_len = processed.shape[0]
+                    audio_inputs.append(processed)
+                audio_token_lens.append(tok_len)
+                _ids = [self.audio_start_token_id] + [self.audio_token_id] * tok_len + [self.audio_end_token_id]
+
+            elif content.type == "video_audio":
+                patches, thw, ts, meta = vid_results[ci]
+                second_per_grid_ts.append(self.temporal_patch_size / meta["fps_sampled"])
+                video_start_times.append(float(ts[0]))
+                processed_audio = self.process_audio(content.content)  # reads the .audio field of the item
+                tok_per_sec = self.audio_input_id_per_second / self.audio_group_size  # audio placeholders per second
+
+                t, h, w = thw.tolist()
+                vid_pv.append(patches)
+                vid_grids.append(thw)
+
+                if isinstance(processed_audio, tuple):
+                    # Mel spec (not pre-tokenized): store in va_audio_inputs separately
+                    spec, total_atok = processed_audio
+                    va_audio_inputs.append(spec)
+                    _va_is_tokenized = False
+                else:
+                    # Pre-tokenized: not expected in Aphrodite, but handle defensively
+                    total_atok = processed_audio.shape[0]
+                    _va_is_tokenized = True
+
+                n_per_grid = h * w // (self.merge_size**2)
+                stride = self.temporal_patch_size * self.temporal_compression_ratio
+                grid_ts = ts[::stride]  # first timestamp of every temporal grid step
+                ts_texts = [_format_timestamp(float(x)) for x in grid_ts]
+                ts_ids_list = [self.tokenizer.encode(s) for s in ts_texts]
+
+                units: list[tuple] = []  # (time, label, label_ids, n_video_tokens, n_audio_tokens, audio_slice)
+                for i in range(len(grid_ts)):
+                    a_start = int(float(grid_ts[i]) * tok_per_sec)  # audio window: this grid timestamp to the next
+                    a_end = (
+                        int(float(grid_ts[i + 1]) * tok_per_sec)
+                        if i < len(grid_ts) - 1
+                        else int(meta["segment_end_time"] * tok_per_sec)
+                    )
+                    seg_len = min(a_end, total_atok) - a_start  # clipped to the audio actually available
+                    assert seg_len > 0, f"Zero-length audio segment at grid index {i}"
+                    seg = processed_audio[a_start : a_start + seg_len] if _va_is_tokenized else None
+                    units.append(
+                        (
+                            float(grid_ts[i]),
+                            ts_texts[i],
+                            ts_ids_list[i],
+                            n_per_grid,
+                            seg_len,
+                            seg,
+                        )
+                    )
+
+                il = self.video_audio_interleave_length
+                if il == -1:  # one group spanning the whole clip
+                    groups: list[list] = [list(enumerate(units))]
+                elif il == 0:  # one group per temporal grid step
+                    groups = [[(i, u)] for i, u in enumerate(units)]
+                else:  # bucket units into consecutive windows of il seconds, counted from t = 0
+                    groups, cur, t_ptr = [], [], 0.0
+                    for i, u in enumerate(units):
+                        while u[0] >= t_ptr + il:
+                            if cur:
+                                groups.append(cur)
+                                cur = []
+                            t_ptr += il
+                        cur.append((i, u))
+                    if cur:
+                        groups.append(cur)
+
+                # Track n_segs (= num groups) and per-group audio token counts
+                video_audio_n_segs.append(len(groups))
+                for group in groups:
+                    group_seg_len = sum(u[4] for _, u in group)
+                    video_audio_seg_lens.append(group_seg_len)
+
+                _ids = [self.video_start_token_id]
+                for group in groups:  # per group: timestamp, all video runs, then one delimited audio run
+                    _ids += group[0][1][2]  # first-unit timestamp token ids
+                    _vid_tok: list[int] = []
+                    _aud_tok: list[int] = []
+                    for _, u in group:
+                        _, _, _, vid_n, seg_n, seg_audio = u
+                        _vid_tok += (
+                            [self.vision_start_token_id] + [self.video_token_id] * vid_n + [self.vision_end_token_id]
+                        )
+                        _aud_tok += [self.audio_token_id] * seg_n
+                        if seg_audio is not None:
+                            # Pre-tokenized per-frame segments (rare in Aphrodite)
+                            audio_inputs.append(seg_audio)
+                    _ids += _vid_tok + [self.audio_start_token_id] + _aud_tok + [self.audio_end_token_id]
+                _ids += [self.video_end_token_id]
+
+            input_ids.extend(_ids)
+            labels.extend(_lbls if _lbls is not None else [self.pad_token_id] * len(_ids))  # pad = not a target
+
+        ids_t = torch.tensor(input_ids)
+        lbl_arr = np.roll(labels, shift=-1)  # next-token targets: shift left by one
+        lbl_arr[-1] = self.pad_token_id  # the wrapped-around first label is replaced by padding
+        lbl_t = torch.tensor(lbl_arr)
+
+        extra: dict = {}
+        if is_audio_tokenized:  # mixing mel and pre-tokenized "audio" items in one sample is rejected
+            assert all(is_audio_tokenized) or not any(is_audio_tokenized)
+            extra["is_audio_tokenized"] = is_audio_tokenized[0]
+
+        position_ids = torch.arange(ids_t.shape[0]).expand(3, -1)  # plain 0..N-1 positions, broadcast to 3 rows
+        rope_deltas = torch.zeros((1, 1), dtype=torch.int32)
+
+        return MiMoVLInputSample(
+            input_ids=ids_t,
+            labels=lbl_t,
+            pixel_values=img_pv,
+            pixel_values_videos=vid_pv,
+            image_thw_grids=img_grids,
+            video_thw_grids=vid_grids,
+            audio_inputs=audio_inputs,
+            second_per_grid_ts=second_per_grid_ts,
+            video_start_times=video_start_times,
+            audio_token_lens=audio_token_lens,
+            va_audio_inputs=va_audio_inputs,
+            video_audio_n_segs=video_audio_n_segs,
+            video_audio_seg_lens=video_audio_seg_lens,
+            position_ids=position_ids,
+            rope_deltas=rope_deltas,
+            extra=extra,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Aphrodite ProcessorMixin wrapper
+# ---------------------------------------------------------------------------
+
+
+class MiMoOmniProcessor(ProcessorMixin):
+    """HuggingFace-compatible ProcessorMixin wrapper for MiMo-Omni.
+
+    Wraps an inner ``MiMoVLProcessor`` and accepts PIL images, pre-decoded video tuples
+    (frames_TCHW, timestamps_T), and audio (file path / bytes / (waveform, sr) tuple / numpy array).
+    """
+
+    attributes = ["tokenizer"]  # the tokenizer is the only sub-component declared here
+    tokenizer_class = "AutoTokenizer"
+
+    # Single or multi-pad placeholders produced by the chat template / prior expansion
+    _IMG_RE = re.compile(r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>")
+    _VID_RE = re.compile(r"<\|vision_start\|>(?:<\|video_pad\|>)+<\|vision_end\|>")
+    _AUD_RE = re.compile(r"<\|mimo_audio_start\|>(?:<\|audio_pad\|>)+<\|mimo_audio_end\|>")
+
+    _MM_RE = re.compile(  # one capturing group over all three patterns, so split() keeps the matched spans
+        r"(<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+        r"|<\|vision_start\|>(?:<\|video_pad\|>)+<\|vision_end\|>"
+        r"|<\|mimo_audio_start\|>(?:<\|audio_pad\|>)+<\|mimo_audio_end\|>)"
+    )
+
+    def __init__(  # stores the tokenizer and builds the inner MiMoVLProcessor from the options below
+        self,
+        tokenizer: Any,
+        *,  # every option after the tokenizer is keyword-only
+        patch_size: int = 14,
+        merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        temporal_compression_ratio: int = 1,
+        image_min_pixels: int | None = None,  # None/0 -> 4 * unit**2 (see below)
+        image_max_pixels: int | None = None,  # None/0 -> 4096 * unit**2
+        video_min_pixels: int | None = None,  # None/0 -> 4 * unit**2
+        video_max_pixels: int | None = None,  # None/0 -> 4096 * unit**2
+        video_total_max_pixels: int | None = None,  # None/0 -> 16384 * unit**2
+        fps: float = 2.0,
+        num_frames: int | None = None,
+        max_frames: int = 256,
+        min_frames: int = 8,
+        video_audio_interleave_length: int = 0,
+        audio_sampling_rate: int = 24000,
+        audio_nfft: int = 960,
+        audio_hop_length: int = 240,
+        audio_window_size: int = 960,
+        audio_fmin: float = 0.0,
+        audio_fmax: float | None = None,
+        audio_n_mels: int = 128,
+        audio_segment_size: int = 6000,
+        audio_kernel_size: int = 3,
+        audio_stride_size: int = 2,
+        audio_avg_pooler: int = 2,
+        audio_channels: int = 8,
+        audio_group_size: int = 4,
+        audio_input_id_per_second: float = 25.0,
+        audio_zeroemb_idx: int = 4096,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        audio_token_id: int | None = None,
+        vision_start_token_id: int | None = None,
+        vision_end_token_id: int | None = None,
+        audio_start_token_id: int | None = None,
+        audio_end_token_id: int | None = None,
+        video_start_token_id: int | None = None,
+        video_end_token_id: int | None = None,
+        rope_type: str = "rope",
+    ) -> None:
+        self.tokenizer = tokenizer  # NOTE(review): ProcessorMixin.__init__ is not called here — confirm intended
+
+        unit = patch_size * merge_size  # side of one merged patch; pixel defaults are multiples of unit**2
+        self.mimo_processor = MiMoVLProcessor(
+            tokenizer=tokenizer,
+            patch_size=patch_size,
+            merge_size=merge_size,
+            temporal_patch_size=temporal_patch_size,
+            temporal_compression_ratio=temporal_compression_ratio,
+            use_video_timestamps=True,  # fixed by this wrapper, not exposed as an option
+            video_audio_interleave_length=video_audio_interleave_length,
+            audio_sampling_rate=audio_sampling_rate,
+            audio_nfft=audio_nfft,
+            audio_hop_length=audio_hop_length,
+            audio_window_size=audio_window_size,
+            audio_fmin=audio_fmin,
+            audio_fmax=audio_fmax,
+            audio_n_mels=audio_n_mels,
+            audio_segment_size=audio_segment_size,
+            audio_kernel_size=audio_kernel_size,
+            audio_stride_size=audio_stride_size,
+            audio_avg_pooler=audio_avg_pooler,
+            audio_channels=audio_channels,
+            audio_group_size=audio_group_size,
+            audio_input_id_per_second=audio_input_id_per_second,
+            audio_zeroemb_idx=audio_zeroemb_idx,
+            image_min_pixels=image_min_pixels or (4 * unit * unit),  # `or` also replaces an explicit 0
+            image_max_pixels=image_max_pixels or (4096 * unit * unit),
+            video_min_pixels=video_min_pixels or (4 * unit * unit),
+            video_max_pixels=video_max_pixels or (4096 * unit * unit),
+            video_total_max_pixels=video_total_max_pixels or (16384 * unit * unit),
+            fps=fps,
+            num_frames=num_frames,
+            max_frames=max_frames,
+            min_frames=min_frames,
+            image_token_id=image_token_id,
+            video_token_id=video_token_id,
+            audio_token_id=audio_token_id,
+            vision_start_token_id=vision_start_token_id,
+            vision_end_token_id=vision_end_token_id,
+            audio_start_token_id=audio_start_token_id,
+            audio_end_token_id=audio_end_token_id,
+            video_start_token_id=video_start_token_id,
+            video_end_token_id=video_end_token_id,
+            pad_token_id=tokenizer.pad_token_id,  # read from the tokenizer rather than passed in
+            rope_type=rope_type,
+        )
+
+    @classmethod
+    def from_hf_config(cls, tokenizer: Any, hf_config: Any) -> "MiMoOmniProcessor":
+        """Build a processor from an HF config's vision, processor, audio and rope settings."""
+        vc = hf_config.vision_config  # may be a plain dict or an attribute-style config object
+        if isinstance(vc, dict):
+            patch_size = vc.get("patch_size", 14)
+            merge_size = vc.get("spatial_merge_size", 2)
+            temporal_patch_size = vc.get("temporal_patch_size", 2)
+        else:
+            patch_size = getattr(vc, "patch_size", 14)
+            merge_size = getattr(vc, "spatial_merge_size", 2)
+            temporal_patch_size = getattr(vc, "temporal_patch_size", 2)
+
+        pc: dict = getattr(hf_config, "processor_config", {}) or {}  # missing or falsy -> empty dict
+        ac = getattr(hf_config, "audio_config", None)
+        audio_sr: int | None = pc.get("audio_sampling_rate")
+        if audio_sr is None and ac is not None:  # fall back to audio_config only when pc has no rate
+            if isinstance(ac, dict):
+                audio_sr = ac.get("sampling_rate") or ac.get("sample_rate")
+            else:
+                audio_sr = getattr(ac, "sampling_rate", None) or getattr(ac, "sample_rate", None)
+
+        rope_type = "rope"
+        rs = getattr(hf_config, "rope_scaling", None)  # read with .get() below, so a mapping is expected
+        if rs and rs.get("type") == "default" and rs.get("mrope_section") is not None:
+            rope_type = "mrope"  # only for type == "default" that also carries an mrope_section
+
+        unit = patch_size * merge_size  # same default-scaling unit that __init__ computes
+        return cls(  # audio options other than the sampling rate keep their __init__ defaults
+            tokenizer,
+            patch_size=patch_size,
+            merge_size=merge_size,
+            temporal_patch_size=temporal_patch_size,
+            image_min_pixels=pc.get("image_min_pixels") or (4 * unit * unit),
+            image_max_pixels=pc.get("image_max_pixels") or (4096 * unit * unit),
+            video_min_pixels=pc.get("video_min_pixels") or (4 * unit * unit),
+            video_max_pixels=pc.get("video_max_pixels") or (4096 * unit * unit),
+            video_total_max_pixels=(pc.get("video_total_max_pixels") or (16384 * unit * unit)),
+            fps=pc.get("fps") or 2.0,  # `or` replaces a missing, None or 0 value with the default
+            num_frames=pc.get("num_frames"),
+            max_frames=pc.get("max_frames") or 256,
+            min_frames=pc.get("min_frames") or 8,
+            video_audio_interleave_length=pc.get("video_audio_interleave_length", 0),
+            audio_sampling_rate=audio_sr or 24000,
+            image_token_id=pc.get("image_token_id"),
+            video_token_id=pc.get("video_token_id"),
+            audio_token_id=pc.get("audio_token_id"),
+            vision_start_token_id=pc.get("vision_start_token_id"),
+            vision_end_token_id=pc.get("vision_end_token_id"),
+            audio_start_token_id=pc.get("audio_start_token_id"),
+            audio_end_token_id=pc.get("audio_end_token_id"),
+            video_start_token_id=pc.get("video_start_token_id"),
+            video_end_token_id=pc.get("video_end_token_id"),
+            rope_type=rope_type,
+        )
+
+    @property
+    def image_token(self) -> str:
+        """Token string used as image placeholder (for Aphrodite integration)."""
+        return "<|image_pad|>"  # the pad literal that _IMG_RE expects between the vision markers
+
+    @property
+    def video_token(self) -> str:
+        """Token string used as video placeholder (for Aphrodite integration)."""
+        return "<|video_pad|>"  # the pad literal that _VID_RE expects between the vision markers
+
+    @property
+    def image_processor(self) -> Any:
+        """Minimal image-processor-like object for Aphrodite processing-info compat."""
+        p = self.mimo_processor
+
+        class _ImageProcessor:  # a fresh class object is created on every property access
+            merge_size = p.merge_size
+            size = {
+                "shortest_edge": p._img_kw["min_pixels"],  # carries the min_pixels setting
+                "longest_edge": p._img_kw["max_pixels"],  # carries the max_pixels setting
+            }
+
+        return _ImageProcessor()
+
+    def _modality(self, token: str) -> str:  # classify one complete placeholder span by modality
+        if self._IMG_RE.fullmatch(token):  # fullmatch: the whole string must be the placeholder
+            return "image"
+        if self._VID_RE.fullmatch(token):
+            return "video"
+        if self._AUD_RE.fullmatch(token):
+            return "audio"
+        return "unknown"  # none of the three placeholder patterns matched
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Any = None,
+        videos: Any = None,
+        audio: Any = None,
+        video_audio: Any = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs: Any,
+    ) -> BatchFeature:
+        """Process multimodal inputs into model-ready tensors.
+
+        Args:
+            text: Prompt string(s) containing multimodal placeholders
+                  ``<|vision_start|><|image_pad|><|vision_end|>``,
+                  ``<|vision_start|><|video_pad|><|vision_end|>``, or
+                  ``<|mimo_audio_start|><|audio_pad|><|mimo_audio_end|>``.
+            images: PIL.Image or list[PIL.Image].
+            videos: list of ``(frames_TCHW: torch.Tensor, timestamps_T: torch.Tensor)``
+                    tuples (pre-decoded).
+            audio: list of ``str`` (path/url/base64), ``bytes``,
+                   ``(waveform_1D, sample_rate)`` tuples, or ``np.ndarray``.
+            return_tensors: Passed to :class:`BatchFeature`.
+
+        Returns:
+            :class:`BatchFeature` with keys:
+            - ``input_ids``
+            - ``pixel_values`` + ``image_grid_thw``
+            - ``pixel_values_videos`` + ``video_grid_thw`` + ``second_per_grid_ts``
+            - ``audio_features``
+        """
+        if isinstance(text, list):
+            text = text[0] if len(text) == 1 else "\n".join(text)
+
+        imgs: list = ([images] if isinstance(images, Image.Image) else list(images)) if images is not None else []
+        vids: list = list(videos) if videos is not None else []
+        auds: list = list(audio) if audio is not None else []
+        va_items: list = list(video_audio) if video_audio is not None else []
+
+        # If audio exists but text has no audio placeholder, prepend one
+        _aud_placeholder = "<|mimo_audio_start|><|audio_pad|><|mimo_audio_end|>"
+        if auds and text is not None and not self._AUD_RE.search(text):
+            text = _aud_placeholder + text
+
+        # Build Content list
+        contents: list[Content] = []
+
+        if text and (imgs or vids or auds or va_items):
+            parts = self._MM_RE.split(text)
+            img_it = iter(imgs)
+            vid_it = iter(vids)
+            aud_it = iter(auds)
+            va_it = iter(va_items)
+            for part in parts:
+                if self._MM_RE.fullmatch(part):
+                    mod = self._modality(part)
+                    if mod == "image":
+                        with contextlib.suppress(StopIteration):
+                            contents.append(
+                                Content(
+                                    type="image",
+                                    content=ImageInput(image=next(img_it)),
+                                )
+                            )
+                    elif mod == "video":
+                        # Try regular video first, fall back to video_audio
+                        vid_item = None
+                        vid_type = "video"
+                        with contextlib.suppress(StopIteration):
+                            vid_item = next(vid_it)
+                        if vid_item is None:
+                            with contextlib.suppress(StopIteration):
+                                vid_item = next(va_it)
+                                vid_type = "video_audio"
+                        if vid_item is not None:
+                            if vid_type == "video":
+                                contents.append(
+                                    Content(
+                                        type="video",
+                                        content=VideoInput(video=vid_item),
+                                    )
+                                )
+                            else:
+                                contents.append(
+                                    Content(
+                                        type="video_audio",
+                                        content=vid_item,
+                                    )
+                                )
+                    elif mod == "audio":
+                        with contextlib.suppress(StopIteration):
+                            contents.append(
+                                Content(
+                                    type="audio",
+                                    content=AudioInput(audio=next(aud_it)),
+                                )
+                            )
+                elif part:
+                    contents.append(Content(type="text", content=part))
+        elif text:
+            contents.append(Content(type="text", content=text))
+        else:
+            for img in imgs:
+                contents.append(Content(type="image", content=ImageInput(image=img)))
+            for vid in vids:
+                contents.append(Content(type="video", content=VideoInput(video=vid)))
+            for aud in auds:
+                contents.append(Content(type="audio", content=AudioInput(audio=aud)))
+            for va_item in va_items:
+                contents.append(Content(type="video_audio", content=va_item))
+
+        if not contents:
+            ids = self.tokenizer(text or "", return_tensors=return_tensors)["input_ids"]
+            return BatchFeature(data={"input_ids": ids}, tensor_type=return_tensors)
+
+        sample = self.mimo_processor.process(contents, verbose=False)
+
+        # Aphrodite expects input_ids to have a batch dimension [1, seq_len].
+        data: dict = {"input_ids": sample.input_ids.unsqueeze(0)}
+
+        if sample.pixel_values:
+            data["pixel_values"] = torch.cat(sample.pixel_values, dim=0)
+            data["image_grid_thw"] = torch.stack(sample.image_thw_grids)
+
+        if sample.pixel_values_videos:
+            data["pixel_values_videos"] = torch.cat(sample.pixel_values_videos, dim=0)
+            data["video_grid_thw"] = torch.stack(sample.video_thw_grids)
+            if sample.second_per_grid_ts:
+                data["second_per_grid_ts"] = torch.tensor(sample.second_per_grid_ts, dtype=torch.float32)
+            if sample.video_start_times:
+                data["video_start_times"] = torch.tensor(sample.video_start_times, dtype=torch.float32)
+            if sample.video_audio_n_segs:
+                data["video_audio_n_segs"] = torch.tensor(sample.video_audio_n_segs, dtype=torch.long)
+            # video_audio_seg_lens: 2D padded tensor (num_videos, max_T).
+            # Row i has the per-group audio token lengths for video i
+            # (zeros for regular videos; valid values for video_audio videos).
+            n_segs_list = sample.video_audio_n_segs
+            max_segs = max(n_segs_list) if n_segs_list else 0
+            if max_segs > 0:
+                seg_lens_2d = torch.zeros(len(n_segs_list), max_segs, dtype=torch.long)
+                flat_cursor = 0
+                for vi, n in enumerate(n_segs_list):
+                    if n > 0:
+                        seg_lens_2d[vi, :n] = torch.tensor(
+                            sample.video_audio_seg_lens[flat_cursor : flat_cursor + n],
+                            dtype=torch.long,
+                        )
+                        flat_cursor += n
+                data["video_audio_seg_lens"] = seg_lens_2d
+
+        # audio_features is a list of variable-length mel-spec tensors; pop it
+        # before BatchFeature conversion to avoid "batched tensors of the same
+        # length" errors, then re-attach it after.
+        audio_features = None
+        if sample.audio_inputs:
+            audio_features = sample.audio_inputs
+            if "is_audio_tokenized" in sample.extra:
+                data["is_audio_tokenized"] = sample.extra["is_audio_tokenized"]
+            if sample.audio_token_lens:
+                data["audio_token_lens"] = torch.tensor(sample.audio_token_lens, dtype=torch.long)
+
+        bf = BatchFeature(data=data, tensor_type=return_tensors)
+        if audio_features is not None:
+            bf["audio_features"] = audio_features
+        # va_audio_features: list of mel-spec tensors (one per video_audio item)
+        if sample.va_audio_inputs:
+            bf["va_audio_features"] = sample.va_audio_inputs
+        return bf
diff --git a/aphrodite/transformers_utils/processors/moondream3.py b/aphrodite/transformers_utils/processors/moondream3.py
new file mode 100644
index 0000000000..f8cf05f6ad
--- /dev/null
+++ b/aphrodite/transformers_utils/processors/moondream3.py
@@ -0,0 +1,522 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Custom processor for Moondream3 model."""
+
+import math
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoProcessor, BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import (
+    PreTokenizedInput,
+    PreTrainedTokenizerBase,
+    TextInput,
+)
+
+from aphrodite.multimodal.image import convert_image_mode
+
+__all__ = ["Moondream3Processor"]
+
+
+class Moondream3ProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
+    _defaults = {  # handed to self._merge_kwargs(...) in Moondream3Processor.__call__
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {  # numeric values equal the Moondream3Processor.__init__ defaults
+            "max_crops": 12,
+            "overlap_margin": 4,
+            "crop_size": 378,
+            "patch_size": 14,
+            "convert_to_rgb": True,
+            "return_tensors": "pt",
+        },
+    }
+
+
+def select_tiling(height: int, width: int, crop_size: int, max_crops: int) -> tuple[int, int]:
+    """Choose an (h_tiles, w_tiles) grid of crops for an image, targeting at most max_crops tiles."""
+    if height <= crop_size or width <= crop_size:  # either side fits in one crop: no tiling at all
+        return (1, 1)
+
+    min_h = math.ceil(height / crop_size)  # fewest crops that span the height
+    min_w = math.ceil(width / crop_size)  # fewest crops that span the width
+
+    if min_h * min_w > max_crops:  # NOTE(review): the max(1, ...) clamps below can overshoot max_crops
+        ratio = math.sqrt(max_crops / (min_h * min_w))  # shrink both axes by the same factor
+        return (max(1, math.floor(min_h * ratio)), max(1, math.floor(min_w * ratio)))
+
+    h_tiles = math.floor(math.sqrt(max_crops * height / width))  # aspect-ratio-shaped use of the budget
+    w_tiles = math.floor(math.sqrt(max_crops * width / height))
+
+    h_tiles = max(h_tiles, min_h)  # never go below the minimum covering grid
+    w_tiles = max(w_tiles, min_w)
+
+    if h_tiles * w_tiles > max_crops:  # raising to the minimums may exceed the budget
+        if w_tiles > h_tiles:  # trim the larger axis
+            w_tiles = math.floor(max_crops / h_tiles)
+        else:
+            h_tiles = math.floor(max_crops / w_tiles)
+
+    return (max(1, h_tiles), max(1, w_tiles))
+
+
+class Moondream3Processor(ProcessorMixin):
+    """
+    Constructs a Moondream3 processor which handles image preprocessing
+    and tokenization for the Moondream3 multimodal model.
+
+    Args:
+        tokenizer: The tokenizer to use for text processing.
+        chat_template: Optional chat template string.
+        crop_size: Size of each image crop.
+        max_crops: Maximum number of crops per image.
+        overlap_margin: Margin for overlapping crops in patches.
+        patch_size: Size of each patch.
+    """
+
+    attributes = ["tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "crop_size",
+        "max_crops",
+        "overlap_margin",
+        "patch_size",
+    ]
+
+    tokenizer_class = "AutoTokenizer"
+    # Fallback tokenizer repo, used by from_pretrained when the model path has none
+    _tokenizer_repo = "moondream/starmie-v1"
+
+    # Default chat template for Moondream3
+    # Moondream uses special tokens for prompting:
+    # - Token 0 (<|endoftext|>): BOS token (ALWAYS present at position 0)
+    # - Token 1 (<|md_reserved_0|>): Start of instruction
+    # - Token 2 (<|md_reserved_1|>): Separator before question
+    # - Token 3 (<|md_reserved_2|>): End of question / start of answer
+    #
+    # Task routing based on text prefix:
+    #   "caption" / "describe" alone   → describe<|md_reserved_1|>normal
+    #   "caption <x>" / "describe <x>" → describe<|md_reserved_1|><x>  (<x> is not validated)
+    #   otherwise                      → query<|md_reserved_1|><text>
+    #
+    # Format with image:
+    #   <|endoftext|><image><|md_reserved_0|>{task}<|md_reserved_1|>{q}<|md_reserved_2|>
+    # Format without image:
+    #   <|endoftext|><|md_reserved_0|>{task}<|md_reserved_1|>{q}<|md_reserved_2|>
+    _default_chat_template = (
+        "{% for message in messages %}"
+        "{% if message['role'] == 'user' %}"
+        "{% if message['content'] is string %}"
+        # Plain string content: one <image> is always emitted, then route by prefix
+        "<|endoftext|><image><|md_reserved_0|>"
+        "{% if message['content'] == 'caption' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif message['content'].startswith('caption ') %}"
+        "describe<|md_reserved_1|>{{ message['content'][8:] }}<|md_reserved_2|>"
+        "{% elif message['content'] == 'describe' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif message['content'].startswith('describe ') %}"
+        "describe<|md_reserved_1|>{{ message['content'][9:] }}<|md_reserved_2|>"
+        "{% else %}"
+        "query<|md_reserved_1|>{{ message['content'] }}<|md_reserved_2|>"
+        "{% endif %}"
+        "{% else %}"
+        # List content - build Moondream's image prefix independently of
+        # OpenAI-style content part order, then render the text task.
+        "<|endoftext|>"
+        "{% for content in message['content'] %}"
+        "{% if content['type'] in ['image', 'image_url', 'input_image', 'image_pil'] %}"  # noqa: E501
+        "<image>"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% for content in message['content'] %}"
+        "{% if content['type'] == 'text' %}"  # every text part renders its own task segment
+        "<|md_reserved_0|>"
+        "{% if content['text'] == 'caption' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif content['text'].startswith('caption ') %}"
+        "describe<|md_reserved_1|>{{ content['text'][8:] }}<|md_reserved_2|>"
+        "{% elif content['text'] == 'describe' %}"
+        "describe<|md_reserved_1|>normal<|md_reserved_2|>"
+        "{% elif content['text'].startswith('describe ') %}"
+        "describe<|md_reserved_1|>{{ content['text'][9:] }}<|md_reserved_2|>"
+        "{% else %}"
+        "query<|md_reserved_1|>{{ content['text'] }}<|md_reserved_2|>"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        "{% elif message['role'] == 'assistant' %}"  # assistant turns are emitted verbatim
+        "{{ message['content'] }}"
+        "{% endif %}"
+        "{% endfor %}"
+    )
+
+    def __init__(  # stores the tiling settings, then registers tokenizer and template with ProcessorMixin
+        self,
+        tokenizer: PreTrainedTokenizerBase | None = None,
+        chat_template: str | None = None,
+        crop_size: int = 378,
+        max_crops: int = 12,
+        overlap_margin: int = 4,
+        patch_size: int = 14,
+        **kwargs,  # accepted but not used by this constructor
+    ):
+        self.image_token = "<image>"  # same literal the default chat template emits
+        self.crop_size = crop_size
+        self.max_crops = max_crops
+        self.overlap_margin = overlap_margin
+        self.patch_size = patch_size
+
+        # Number of patches per crop (27x27 = 729 for 378/14)
+        self.patches_per_crop = (crop_size // patch_size) ** 2  # floor division: a remainder is discarded
+
+        # Use default chat template if none provided
+        if chat_template is None:
+            chat_template = self._default_chat_template
+
+        super().__init__(tokenizer, chat_template=chat_template)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path,
+        **kwargs,
+    ):
+        """
+        Load the processor, resolving the tokenizer before building the instance.
+
+        A ``tokenizer`` kwarg (instance or repo/path string) wins; otherwise the model
+        path is tried first and 'moondream/starmie-v1' is the fallback repository.
+        """
+        from transformers import AutoTokenizer, PreTrainedTokenizerFast
+        from transformers.utils import cached_file
+
+        tokenizer = kwargs.pop("tokenizer", None)  # an instance, a repo/path string, or absent
+
+        tokenizer_kwargs = {
+            "trust_remote_code": kwargs.get("trust_remote_code", False),
+        }
+        for key in (  # loader options are forwarded only when the caller supplied them
+            "cache_dir",
+            "force_download",
+            "local_files_only",
+            "revision",
+            "subfolder",
+            "token",
+            "use_fast",
+        ):
+            if key in kwargs:
+                tokenizer_kwargs[key] = kwargs[key]
+
+        cached_file_kwargs = {  # the same options minus trust_remote_code and use_fast
+            key: tokenizer_kwargs[key]
+            for key in (
+                "cache_dir",
+                "force_download",
+                "local_files_only",
+                "revision",
+                "subfolder",
+                "token",
+            )
+            if key in tokenizer_kwargs
+        }
+
+        def load_tokenizer(repo_or_path):  # AutoTokenizer first, raw tokenizer.json second
+            try:
+                return AutoTokenizer.from_pretrained(repo_or_path, **tokenizer_kwargs)
+            except Exception:  # any AutoTokenizer failure triggers the tokenizer.json fallback
+                tokenizer_file = cached_file(
+                    repo_or_path,
+                    "tokenizer.json",
+                    **cached_file_kwargs,
+                )
+                return PreTrainedTokenizerFast(
+                    tokenizer_file=tokenizer_file,
+                    clean_up_tokenization_spaces=False,
+                )
+
+        if isinstance(tokenizer, str):
+            tokenizer = load_tokenizer(tokenizer)
+
+        if tokenizer is None:
+            # Prefer model-local tokenizer files first. If unavailable, fall
+            # back to moondream's dedicated tokenizer repository.
+            try:
+                tokenizer = load_tokenizer(pretrained_model_name_or_path)
+            except Exception:  # both loading strategies failed for the model path
+                tokenizer = load_tokenizer(cls._tokenizer_repo)
+
+        # Configure special tokens for Moondream3
+        # BOS and EOS are both token 0 (<|endoftext|>), matching the native
+        # config (TokenizerConfig.bos_id=0, eos_id=0). This is standard for
+        # GPT-2 style models where <|endoftext|> signals both start and end.
+        # Token 1 (<|md_reserved_0|>) is a template delimiter, NOT the EOS.
+        tokenizer.bos_token = "<|endoftext|>"  # also applied, in place, to a caller-supplied tokenizer
+        tokenizer.bos_token_id = 0
+        tokenizer.eos_token = "<|endoftext|>"
+        tokenizer.eos_token_id = 0
+
+        # Extract processor-specific kwargs
+        crop_size = kwargs.pop("crop_size", 378)
+        max_crops = kwargs.pop("max_crops", 12)
+        overlap_margin = kwargs.pop("overlap_margin", 4)
+        patch_size = kwargs.pop("patch_size", 14)
+        chat_template = kwargs.pop("chat_template", None)
+
+        # Set default chat template on tokenizer if not already set
+        if chat_template is None:
+            chat_template = cls._default_chat_template
+        if tokenizer.chat_template is None:  # a template the tokenizer already has is kept
+            tokenizer.chat_template = chat_template
+
+        return cls(  # whatever is still left in kwargs is not forwarded
+            tokenizer=tokenizer,
+            chat_template=chat_template,
+            crop_size=crop_size,
+            max_crops=max_crops,
+            overlap_margin=overlap_margin,
+            patch_size=patch_size,
+        )
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        **kwargs: Unpack[Moondream3ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Process images and text for Moondream3 model.
+
+        Args:
+            images: One image (PIL Image, numpy array) or a list; each goes through ``preprocess_image``.
+            text: Input text or list of texts; always tokenized with ``return_tensors="pt"``.
+            **kwargs: Overrides merged with the ``Moondream3ProcessorKwargs`` defaults.
+
+        Returns:
+            Tokenizer outputs (when text is given) plus ``pixel_values``/``tilings`` lists (when images are).
+        """
+        output_kwargs = self._merge_kwargs(
+            Moondream3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Process images
+        image_features = {}
+        if images is not None:
+            processed_images = []
+            tilings = []
+
+            images_list = images if isinstance(images, list) else [images]  # only a list counts as a batch
+            for image in images_list:
+                pixel_values, tiling = self.preprocess_image(image, **output_kwargs["images_kwargs"])
+                processed_images.append(pixel_values)
+                tilings.append(tiling)
+
+            if processed_images:  # an empty input list leaves image_features empty
+                image_features["pixel_values"] = processed_images
+                image_features["tilings"] = tilings
+
+        # Process text
+        if text is not None:
+            if not isinstance(text, list):
+                text = [text]
+
+            # Get text kwargs, remove keys we set ourselves
+            text_kwargs = output_kwargs.get("text_kwargs", {}).copy()
+            text_kwargs.pop("return_tensors", None)
+            text_kwargs.pop("add_special_tokens", None)
+
+            # Tokenize text
+            tokenized = self.tokenizer(
+                text,
+                add_special_tokens=True,  # forced; any caller value was popped above
+                return_tensors="pt",  # forced; any caller value was popped above
+                **text_kwargs,
+            )
+
+            output = BatchFeature(data=dict(tokenized))
+
+            # Add image features
+            if image_features:
+                output["pixel_values"] = image_features["pixel_values"]  # Python list, one entry per image
+                output["tilings"] = image_features["tilings"]
+
+            return output
+
+        # If only images were provided
+        return BatchFeature(data=image_features)  # empty when neither text nor images were given
+
+    @staticmethod
+    def _image_array_to_uint8(array: np.ndarray) -> np.ndarray:  # convert any dtype to contiguous uint8
+        if array.dtype == np.uint8:
+            return np.ascontiguousarray(array)
+
+        if array.dtype == np.bool_:
+            return np.ascontiguousarray(array.astype(np.uint8) * 255)  # True -> 255, False -> 0
+
+        if np.issubdtype(array.dtype, np.floating):
+            array = np.nan_to_num(array, nan=0.0, posinf=255.0, neginf=0.0)
+            if array.size > 0 and array.max() <= 1.0:  # max <= 1.0 is taken to mean a 0..1 value range
+                array = array * 255.0
+            array = np.rint(array)
+
+        return np.ascontiguousarray(np.clip(array, 0, 255).astype(np.uint8))  # other ints are only clipped
+
+    @staticmethod
+    def _to_pil_image(image: ImageInput) -> Image.Image:
+        if isinstance(image, Image.Image):  # PIL input is returned unchanged
+            return image
+        # Tensors are detached and moved to CPU; bfloat16 is upcast to float32 before .numpy().
+        if isinstance(image, torch.Tensor):
+            tensor = image.detach().cpu()
+            if tensor.dtype == torch.bfloat16:
+                tensor = tensor.to(torch.float32)
+            image_array = tensor.numpy()
+        elif isinstance(image, np.ndarray):
+            image_array = image
+        else:
+            raise TypeError(
+                f"Moondream3 images must be PIL images, numpy arrays, or torch tensors, got {type(image)!r}."
+            )
+        # 2-D arrays are taken as single-channel (H, W) images.
+        if image_array.ndim == 2:
+            image_array = Moondream3Processor._image_array_to_uint8(image_array)
+            return Image.fromarray(image_array)
+
+        if image_array.ndim != 3:
+            raise ValueError(f"Moondream3 image arrays must have 2 or 3 dimensions, got shape {image_array.shape}.")
+        # The last axis is tested first, so a shape such as (3, H, 3) is read as HWC, not CHW.
+        channel_dims = (1, 3, 4)
+        if image_array.shape[-1] not in channel_dims:
+            if image_array.shape[0] not in channel_dims:
+                raise ValueError(
+                    "Moondream3 image arrays must be HWC or CHW with 1, 3, "
+                    f"or 4 channels, got shape {image_array.shape}."
+                )
+            image_array = np.transpose(image_array, (1, 2, 0))  # CHW -> HWC
+        # Convert to uint8, then drop a trailing singleton channel so PIL receives (H, W).
+        image_array = Moondream3Processor._image_array_to_uint8(image_array)
+        if image_array.shape[-1] == 1:
+            image_array = image_array[..., 0]
+
+        return Image.fromarray(image_array)
+
+    def preprocess_image(
+        self,
+        image: ImageInput,
+        max_crops: int = 12,
+        overlap_margin: int = 4,
+        crop_size: int = 378,
+        patch_size: int = 14,
+        convert_to_rgb: bool = True,
+        return_tensors: str = "pt",
+    ) -> tuple[torch.Tensor, tuple[int, int]]:
+        """
+        Preprocess an image with the overlap-and-resize cropping strategy; output is normalised to [-1, 1].
+
+        Args:
+            image: Input PIL image, numpy array, or torch tensor.
+            max_crops: Maximum number of crops (forwarded to select_tiling).
+            overlap_margin: Overlap margin on each side of a crop, in patches (not pixels).
+            crop_size: Side length of each square crop, in pixels.
+            patch_size: Side length of one patch, in pixels.
+            convert_to_rgb: Whether to convert the image to RGB mode first.
+            return_tensors: "pt" gives a bfloat16 torch tensor; any other value gives a float32 numpy array.
+
+        Returns:
+            (pixel_values, tiling); pixel_values has shape (tiling[0] * tiling[1] + 1, 3, crop_size, crop_size).
+        """
+        image = self._to_pil_image(image)
+        if convert_to_rgb:
+            image = convert_image_mode(image, "RGB")
+
+        # Convert to numpy array
+        image_array = np.array(image)
+        original_h, original_w = image_array.shape[:2]
+        # The margin is the per-side overlap of a crop, converted from patches to pixels.
+        margin_pixels = patch_size * overlap_margin
+        total_margin_pixels = margin_pixels * 2
+        # The window is what remains of a crop after removing the margin on both sides.
+        crop_patches = crop_size // patch_size
+        crop_window_patches = crop_patches - (2 * overlap_margin)
+        crop_window_size = crop_window_patches * patch_size
+        # select_tiling (defined elsewhere) gets the margin-free height/width, window size and crop cap.
+        tiling = select_tiling(
+            original_h - total_margin_pixels,
+            original_w - total_margin_pixels,
+            crop_window_size,
+            max_crops,
+        )
+        # Slot 0 holds the global crop; slots 1.. hold the tiling[0] x tiling[1] local crops.
+        n_crops = tiling[0] * tiling[1] + 1
+        crops = np.zeros((n_crops, crop_size, crop_size, 3), dtype=np.uint8)
+        # NOTE(review): buffer is fixed at 3 channels, so convert_to_rgb=False looks unsafe for non-RGB input.
+        target_size = (
+            tiling[0] * crop_window_size + total_margin_pixels,
+            tiling[1] * crop_window_size + total_margin_pixels,
+        )
+
+        # Resize image to tiling * window + outer margins; PIL takes (width, height), hence the swap
+        pil_img = Image.fromarray(image_array)
+        resized = pil_img.resize(
+            (int(target_size[1]), int(target_size[0])),
+            resample=Image.Resampling.LANCZOS,
+        )
+        resized_array = np.asarray(resized)
+
+        # Create global crop: the whole image resized to crop_size x crop_size (aspect ratio not kept)
+        global_pil = pil_img.resize((crop_size, crop_size), resample=Image.Resampling.LANCZOS)
+        crops[0] = np.asarray(global_pil)
+
+        # Create local crops: origins step by crop_window_size but each spans crop_size, so neighbours overlap
+        for i in range(tiling[0]):
+            for j in range(tiling[1]):
+                y0 = i * crop_window_size
+                x0 = j * crop_window_size
+                y_end = min(y0 + crop_size, resized_array.shape[0])
+                x_end = min(x0 + crop_size, resized_array.shape[1])
+                # min() clamps a crop at the image border; any uncopied part of the slot stays zero.
+                crop_region = resized_array[y0:y_end, x0:x_end]
+                crop_idx = 1 + i * tiling[1] + j
+                h_slice = slice(None, crop_region.shape[0])
+                w_slice = slice(None, crop_region.shape[1])
+                crops[crop_idx, h_slice, w_slice] = crop_region
+
+        # Convert to tensor: (n_crops, H, W, C) -> (n_crops, C, H, W)
+        pixel_values = np.transpose(crops, (0, 3, 1, 2))
+
+        if return_tensors == "pt":
+            # Match HF reference preprocessing exactly: convert uint8 crops to
+            # bfloat16 before in-place normalization.
+            pixel_values = torch.from_numpy(pixel_values).to(dtype=torch.bfloat16).div_(255.0).sub_(0.5).div_(0.5)
+        else:  # any other return_tensors value: float32 numpy array, same (x / 255 - 0.5) / 0.5 scaling
+            pixel_values = pixel_values.astype(np.float32) / 255.0
+            pixel_values = (pixel_values - 0.5) / 0.5
+
+        return pixel_values, tiling
+
+    def get_num_image_tokens(self) -> int:
+        """Return `self.patches_per_crop` as the image token count (729 = 27x27 for a 378px crop, 14px patch)."""
+        return self.patches_per_crop
+
+    def batch_decode(self, *args, **kwargs):
+        """Forward all arguments to the tokenizer's `batch_decode` and return its result."""
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """Forward all arguments to the tokenizer's `decode` and return its result."""
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        return tokenizer_input_names + ["pixel_values", "tilings"]  # plus the image keys this processor outputs
+
+
+AutoProcessor.register("Moondream3Processor", Moondream3Processor)
diff --git a/aphrodite/utils/flashinfer.py b/aphrodite/utils/flashinfer.py
index ad299a2d5d..8bd768f625 100644
--- a/aphrodite/utils/flashinfer.py
+++ b/aphrodite/utils/flashinfer.py
@@ -675,6 +675,38 @@ def flashinfer_scaled_fp8_mm(
     return output
 
 
+def flashinfer_scaled_fp8_mm_out(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out: torch.Tensor,
+    out_dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    assert a.ndim == 2 and b.ndim == 2 and out.ndim == 2  # plain 2-D operands only
+    assert a.shape[1] == b.shape[0]  # inner dimensions must agree: (M, K) @ (K, N)
+    assert out.shape == (a.shape[0], b.shape[1])  # caller-provided destination is (M, N)
+    assert scale_a.numel() == 1 and scale_b.numel() == 1  # single-element (per-tensor) scales
+    assert a.dtype == torch.float8_e4m3fn and b.dtype == torch.float8_e4m3fn
+    assert out.device.type == "cuda"
+    assert a.is_contiguous()
+    # Function-scope import: flashinfer is only imported when this function is called.
+    from flashinfer import bmm_fp8 as bmm_fp8_
+    # unsqueeze(0) gives each operand a batch dimension of 1; out.unsqueeze(0) is a view of `out`.
+    bmm_fp8_(
+        a.unsqueeze(0),
+        # FlashInfer expects the weight in the same column-major view layout
+        # consumed by flashinfer_scaled_fp8_mm, so keep the transposed view.
+        b.unsqueeze(0),
+        scale_a,
+        scale_b,
+        out_dtype or out.dtype,  # falls back to the destination's dtype when out_dtype is None
+        out.unsqueeze(0),
+        "auto",
+    )
+    return out
+
+
 def flashinfer_quant_nvfp4_8x4_sf_layout(
     a: torch.Tensor, a_global_sf: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -788,6 +820,7 @@ def is_flashinfer_cudnn_fp8_prefill_attn_supported() -> bool:
     "use_trtllm_attention",
     "flashinfer_scaled_fp4_mm",
     "flashinfer_scaled_fp8_mm",
+    "flashinfer_scaled_fp8_mm_out",
     "flashinfer_quant_nvfp4_8x4_sf_layout",
     "flashinfer_fp8_blockscale_gemm",
     "should_use_flashinfer_for_blockscale_fp8_gemm",
diff --git a/aphrodite/utils/multi_stream_utils.py b/aphrodite/utils/multi_stream_utils.py
index cc6bc64624..c5143aa460 100644
--- a/aphrodite/utils/multi_stream_utils.py
+++ b/aphrodite/utils/multi_stream_utils.py
@@ -56,3 +56,69 @@ def maybe_execute_in_parallel(
         result0 = fn0()
         result1 = fn1()
     return (result0, result1)
+
+
+def execute_in_parallel(
+    default_fn: Callable[[], Any],
+    aux_fns: list[Callable[[], Any] | None],
+    start_event: torch.cuda.Event,
+    done_events: list[torch.cuda.Event],
+    aux_streams: list[torch.cuda.Stream] | None = None,
+    enable: bool = False,
+) -> tuple[Any, list[Any]]:
+    """Run default_fn on the current stream and aux_fns concurrently on
+    aux_streams.
+
+    Generalizes maybe_execute_in_parallel to N aux callables. A None entry
+    in aux_fns is skipped (no stream switch, no event record) and yields
+    None in aux_results. start_event is recorded on the current stream and
+    waited on by every launched aux stream; done_events[i] is recorded
+    after aux_fns[i] and waited on by the current stream before returning.
+    When aux_streams is None or enable is False, everything runs
+    sequentially on the current stream (default_fn first, then aux_fns in
+    order) and neither the events nor the streams are touched.
+
+    In the multi-stream path the aux_fns are called before default_fn, so
+    the host-side call order differs from the sequential fallback.
+
+    Args:
+        default_fn: Callable for the default (current) stream.
+        aux_fns: Per-aux callables; entries may be None to skip.
+        start_event: CUDA event recorded before any aux work is launched.
+        done_events: One CUDA event per aux slot; length must match aux_fns.
+        aux_streams: Per-aux CUDA streams; length must match aux_fns
+            (lengths are asserted only on the multi-stream path).
+        enable: Opt-in switch, default False; aux_streams alone is not enough.
+
+    Returns:
+        (default_result, aux_results); aux_results[i] is None when skipped.
+    """
+    aux_results: list[Any]
+    if aux_streams is None or not enable:  # sequential fallback on the current stream
+        default_result = default_fn()
+        aux_results = [fn() if fn is not None else None for fn in aux_fns]
+        return default_result, aux_results
+    # Multi-stream path: slot i pairs aux_fns[i] with aux_streams[i] and done_events[i].
+    assert len(aux_fns) == len(aux_streams) == len(done_events), (
+        "aux_fns, aux_streams, and done_events must be the same length"
+    )
+
+    aux_results = [None] * len(aux_fns)
+    pending: list[torch.cuda.Event] = []
+    # Fan out: record once on the current stream; each aux stream waits on it before its fn.
+    start_event.record()
+    for i, fn in enumerate(aux_fns):
+        if fn is None:
+            continue
+        with torch.cuda.stream(aux_streams[i]):
+            start_event.wait()
+            aux_results[i] = fn()
+            done_events[i].record()
+        pending.append(done_events[i])
+    # default_fn is issued on the current stream after the aux work has been enqueued.
+    default_result = default_fn()
+    # Join: the current stream waits on the done event of every slot that was launched.
+    for ev in pending:
+        ev.wait()
+
+    return default_result, aux_results
diff --git a/aphrodite/v1/attention/backends/cpu_attn.py b/aphrodite/v1/attention/backends/cpu_attn.py
index d1cbe31288..8cd1d9bfe7 100644
--- a/aphrodite/v1/attention/backends/cpu_attn.py
+++ b/aphrodite/v1/attention/backends/cpu_attn.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import ClassVar
+from typing import TYPE_CHECKING, ClassVar
+
+if TYPE_CHECKING:
+    from aphrodite.config.cache import CacheDType
 
 import torch
 
@@ -35,6 +38,12 @@ class CPUAttentionBackend(AttentionBackend):
         torch.bfloat16,
         torch.float32,
     ]
+    supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = [
+        "auto",
+        "fp8",
+        "fp8_e4m3",
+        "fp8_e5m2",
+    ]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -131,7 +140,13 @@ def __init__(
         if self.window_size is None:
             self.window_size = -1
         self.block_size = aphrodite_config.cache_config.block_size
-        self.isa = _get_attn_isa(self.dtype, self.block_size, self.head_dim)
+        kv_cache_dtype_str = aphrodite_config.cache_config.cache_dtype
+        self.isa = _get_attn_isa(
+            self.dtype,
+            self.block_size,
+            self.head_dim,
+            kv_cache_dtype_str,
+        )
         self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
 
     def build(
@@ -242,8 +257,7 @@ def __init__(
         self.kv_cache_dtype = kv_cache_dtype
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        if is_quantized_kv_cache(kv_cache_dtype):
-            raise NotImplementedError("FP8 KV cache is unsupported in CPU_ATTN")
+        self.is_fp8_kv_cache = is_quantized_kv_cache(kv_cache_dtype)
         self.attn_type = attn_type
 
         self.sinks = sinks
@@ -312,6 +326,9 @@ def forward(
                 value_cache,
                 attn_metadata.slot_mapping,
                 attn_metadata.isa,
+                k_scale=layer._k_scale_float,
+                v_scale=layer._v_scale_float,
+                kv_cache_dtype=self.kv_cache_dtype,
             )
 
         if attn_metadata.use_sdpa_prefill:
@@ -343,6 +360,9 @@ def forward(
                 softcap=self.logits_soft_cap,
                 scheduler_metadata=attn_metadata.scheduler_metadata,
                 s_aux=self.sinks,
+                k_scale=layer._k_scale_float,
+                v_scale=layer._v_scale_float,
+                kv_cache_dtype=self.kv_cache_dtype,
             )
 
         return output
@@ -463,12 +483,23 @@ def _make_sliding_window_bias(
     return attn_biases
 
 
-def _get_attn_isa(dtype: torch.dtype, block_size: int, head_size: int | None = None) -> str:
+def _get_attn_isa(
+    dtype: torch.dtype,
+    block_size: int,
+    head_size: int | None = None,
+    kv_cache_dtype: str | None = None,
+) -> str:
+    fp8_kv = is_quantized_kv_cache(kv_cache_dtype) if kv_cache_dtype else False
     if head_size is not None and head_size % 32 != 0 and head_size % 16 == 0:
+        if fp8_kv:
+            raise NotImplementedError("FP8 KV cache requires head_size divisible by 32 on CPU.")
         return "vec16"
     supports_amx = torch.cpu._is_amx_tile_supported()
     supports_arm = current_platform.get_cpu_architecture() == CpuArchEnum.ARM
     supports_vxe = current_platform.get_cpu_architecture() == CpuArchEnum.S390X
+    supports_avx512 = torch.cpu._is_avx512_supported()
+    if fp8_kv and not supports_amx and not supports_avx512:
+        raise NotImplementedError("FP8 KV cache on CPU requires x86 with AVX-512 or AMX.")
     if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0:
         return "amx"
     elif block_size % 32 == 0:
diff --git a/aphrodite/v1/attention/backends/flashinfer.py b/aphrodite/v1/attention/backends/flashinfer.py
index c84e430f26..f47feadacf 100755
--- a/aphrodite/v1/attention/backends/flashinfer.py
+++ b/aphrodite/v1/attention/backends/flashinfer.py
@@ -315,6 +315,7 @@ class FlashInferBackend(AttentionBackend):
         "fp8",
         "fp8_e4m3",
         "fp8_e5m2",
+        "nvfp4",
     ]
 
     @staticmethod
@@ -371,13 +372,15 @@ def get_kv_cache_stride_order(
         return stride_order
 
     @staticmethod
-    def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
+    def get_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
             return torch.float8_e4m3fn
         elif kv_cache_dtype == "fp8_e5m2":
             return torch.float8_e5m2
+        elif kv_cache_dtype == "nvfp4":
+            return torch.uint8
         else:
-            raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
+            raise ValueError(f"Unrecognized dtype: {kv_cache_dtype}")
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -587,9 +590,8 @@ def __init__(
                 # For NVFP4, kv_cache_dtype stays as the string "nvfp4"
                 # which is passed to FlashInferImpl
                 self.kv_cache_dtype = self.cache_dtype
-                raise NotImplementedError("nvfp4 KV cache is not yet supported")
             else:
-                self.kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(self.cache_dtype)
+                self.kv_cache_dtype = FlashInferBackend.get_dtype_for_flashinfer(self.cache_dtype)
         else:
             self.cache_dtype = "auto"
             self.is_kvcache_nvfp4 = False
@@ -605,7 +607,7 @@ def __init__(
         if can_use_trtllm and not aphrodite_config.attention_config.disable_flashinfer_q_quantization:
             if self.is_kvcache_nvfp4:
                 # NVFP4 KV cache uses FP8 quantized queries
-                self.q_data_type = FlashInferBackend.get_fp8_dtype_for_flashinfer("fp8_e4m3")
+                self.q_data_type = FlashInferBackend.get_dtype_for_flashinfer("fp8_e4m3")
             else:
                 self.q_data_type = self.kv_cache_dtype
         else:
@@ -715,8 +717,13 @@ def _get_prefill_wrapper(
                     dcp_a2a=self.dcp_a2a,
                 )
             else:
+                # NVFP4 KV cache requires the trtllm-gen backend inside
+                # the wrapper; fa2/fa3 do not support nvfp4.
+                backend = "trtllm-gen" if self.is_kvcache_nvfp4 else "auto"
                 self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
-                    self._get_workspace_buffer(), get_kv_cache_layout()
+                    self._get_workspace_buffer(),
+                    get_kv_cache_layout(),
+                    backend=backend,
                 )
         assert self._prefill_wrapper is not None
         return self._prefill_wrapper
@@ -736,6 +743,9 @@ def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False):
                 paged_kv_indptr = None
                 paged_kv_indices = None
                 paged_kv_last_page_len = None
+            # NVFP4 KV cache requires the trtllm-gen backend inside
+            # the wrapper; fa2/fa3 do not support nvfp4.
+            backend = "trtllm-gen" if self.is_kvcache_nvfp4 else "auto"
             decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
                 self._get_workspace_buffer(),
                 get_kv_cache_layout(),
@@ -747,6 +757,7 @@ def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False):
                 # at least as good as cuda cores for all attention ops in latest
                 # gpus.
                 use_tensor_cores=True,
+                backend=backend,
             )
 
             # save the decode wrapper
@@ -1058,6 +1069,10 @@ def build(
                         prefill_wrapper,
                         BatchPrefillWithPagedKVCacheWrapper,
                     )
+                    # NVFP4 trtllm kernel only supports FP8 output;
+                    # use FP8 o_data_type so the wrapper matches the
+                    # FP8 output buffer allocated in forward().
+                    o_dtype = FP8_DTYPE if self.is_kvcache_nvfp4 else self.model_config.dtype
                     prefill_wrapper.plan(
                         qo_indptr=qo_indptr_prefill_cpu,
                         paged_kv_indptr=paged_kv_indptr_prefill_cpu,
@@ -1073,7 +1088,7 @@ def build(
                         logits_soft_cap=self.logits_soft_cap,
                         q_data_type=self.q_data_type,
                         kv_data_type=self.kv_cache_dtype,
-                        o_data_type=self.model_config.dtype,
+                        o_data_type=o_dtype,
                         fixed_split_size=self.prefill_fixed_split_size,
                         disable_split_kv=self.disable_split_kv,
                     )
@@ -1103,6 +1118,10 @@ def build(
                 # Use the persistent buffer with padding length,
                 # instead of the same address but chunked version
                 # in atten_metadata when using cudagraph.
+                # NVFP4 trtllm kernel only supports FP8 output;
+                # use FP8 o_data_type so the wrapper matches the
+                # FP8 output buffer allocated in forward().
+                o_dtype = FP8_DTYPE if self.is_kvcache_nvfp4 else self.model_config.dtype
                 fast_plan_decode(
                     decode_wrapper,
                     indptr_cpu=self.paged_kv_indptr.cpu[: num_input_tokens + 1],
@@ -1119,7 +1138,7 @@ def build(
                     logits_soft_cap=self.logits_soft_cap,
                     q_data_type=self.q_data_type,
                     kv_data_type=self.kv_cache_dtype,
-                    o_data_type=self.model_config.dtype,
+                    o_data_type=o_dtype,
                     fixed_split_size=self.decode_fixed_split_size,
                     disable_split_kv=self.disable_split_kv,
                 )
@@ -1199,6 +1218,17 @@ def __init__(
         self.bmm2_scale: float | None = None
         self.o_sf_scale: float | None = None
 
+        # Pre-allocated FP8 output buffer for NVFP4 without fused output quant.
+        if self.is_kvcache_nvfp4 and aphrodite_config is not None:
+            max_num_tokens = aphrodite_config.scheduler_config.max_num_batched_tokens
+            self._nvfp4_fp8_out = torch.empty(
+                (max_num_tokens, num_heads, head_size),
+                dtype=FP8_DTYPE,
+                device="cuda",
+            )
+        else:
+            self._nvfp4_fp8_out = None
+
         dcp_a2a = (
             aphrodite_config is not None
             and aphrodite_config.parallel_config.decode_context_parallel_size > 1
@@ -1308,7 +1338,7 @@ def forward(
         # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2
         # to process the cache when the kv_cache_dtype is fp8
         if self.kv_sharing_target_layer_name is None and is_quantized_kv_cache(self.kv_cache_dtype):
-            torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(self.kv_cache_dtype)
+            torch_dtype = FlashInferBackend.get_dtype_for_flashinfer(self.kv_cache_dtype)
             kv_cache = kv_cache.view(torch_dtype)
 
         # Inputs and outputs may be padded for CUDA graphs
@@ -1376,13 +1406,33 @@ def forward(
                     assert prefill_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0)
                     assert prefill_wrapper._sm_scale == self.scale
                     assert prefill_wrapper._causal
+
+                    if self.is_kvcache_nvfp4:
+                        kv_cache_permute = nvfp4_kv_data
+                    kv_cache_sf = nvfp4_kv_block_scales if self.is_kvcache_nvfp4 else None
+
+                    # NVFP4 trtllm kernel only supports FP8 output.
+                    # Use a pre-allocated FP8 buffer and dequantize
+                    # afterwards.
+                    needs_fp8_out_prefill = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                    if needs_fp8_out_prefill:
+                        out_prefill = self._nvfp4_fp8_out[:num_prefill_tokens]
+                    else:
+                        out_prefill = output[num_decode_tokens:]
+
                     prefill_wrapper.run(
                         prefill_query,
                         kv_cache_permute,
                         k_scale=layer._k_scale_float,
                         v_scale=layer._v_scale_float,
-                        out=output[num_decode_tokens:],
+                        out=out_prefill,
+                        kv_cache_sf=kv_cache_sf,
                     )
+
+                    if needs_fp8_out_prefill:
+                        output[num_decode_tokens : num_decode_tokens + num_prefill_tokens].copy_(
+                            out_prefill.to(output.dtype)
+                        )
             else:
                 assert isinstance(attn_metadata.prefill, TRTLLMPrefill)
                 # prefill_query may be non-contiguous or have degenerate strides
@@ -1413,6 +1463,12 @@ def forward(
                     assert self.o_sf_scale is None
                     out = output[num_decode_tokens:]
 
+                # NVFP4 trtllm kernel only supports FP8 output.
+                # Use a pre-allocated FP8 buffer and dequantize afterwards.
+                needs_fp8_out = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                if needs_fp8_out:
+                    out = self._nvfp4_fp8_out[:num_prefill_tokens]
+
                 prefill_kv_block_scales = None
                 if self.is_kvcache_nvfp4:
                     # NVFP4 trtllm-gen kernel requires FP8 query.
@@ -1423,7 +1479,7 @@ def forward(
                     )
                     mock_kv_cache = nvfp4_kv_data
                     mock_block_table = block_tables_prefill
-                    prefill_kv_block_scales = nvfp4_kv_block_scales  # noqa: F841
+                    prefill_kv_block_scales = nvfp4_kv_block_scales
                 elif attn_metadata.q_data_type != FP8_DTYPE and self.kv_cache_dtype.startswith("fp8"):
                     # TRTLLM prefill attention does not support BF16 Q
                     # and fp8 kv cache. So to enable prefill attention
@@ -1467,8 +1523,14 @@ def forward(
                     sinks=self.sinks,
                     o_sf_scale=self.o_sf_scale,
                     out=out,
+                    kv_cache_sf=prefill_kv_block_scales,
                 )
 
+                if needs_fp8_out:
+                    output[num_decode_tokens : num_decode_tokens + num_prefill_tokens].copy_(
+                        out[:num_prefill_tokens].to(output.dtype)
+                    )
+
         if num_decode_tokens > 0:
             decode_query = query[:num_decode_tokens]
             assert decode_query.shape[0] == num_decode_tokens
@@ -1481,6 +1543,18 @@ def forward(
                 assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0)
                 assert decode_wrapper._sm_scale == self.scale
 
+                if self.is_kvcache_nvfp4:
+                    kv_cache_permute = nvfp4_kv_data
+                kv_cache_sf = nvfp4_kv_block_scales if self.is_kvcache_nvfp4 else None
+
+                # NVFP4 kernel only supports FP8 output.
+                # Use a pre-allocated FP8 buffer and dequantize afterwards.
+                needs_fp8_out = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                if needs_fp8_out:
+                    out_decode = self._nvfp4_fp8_out[:num_decode_tokens]
+                else:
+                    out_decode = output[:num_decode_tokens]
+
                 if use_dcp:
                     decode_query = get_dcp_group().all_gather(decode_query.contiguous(), dim=-2)
                     output_tmp = torch.empty_like(decode_query)
@@ -1497,6 +1571,7 @@ def forward(
                         out=output_tmp,
                         lse=lse,
                         return_lse=True,
+                        kv_cache_sf=kv_cache_sf,
                     )
                     output[:num_decode_tokens] = self.dcp_combine(
                         output_tmp,
@@ -1509,8 +1584,12 @@ def forward(
                         kv_cache_permute,
                         k_scale=layer._k_scale_float,
                         v_scale=layer._v_scale_float,
-                        out=output[:num_decode_tokens],
+                        out=out_decode,
+                        kv_cache_sf=kv_cache_sf,
                     )
+
+                if needs_fp8_out:
+                    output[:num_decode_tokens].copy_(out_decode.to(output.dtype))
             else:
                 # decode_query may be non-contiguous or have degenerate strides
                 assert isinstance(attn_metadata.decode, TRTLLMDecode)
@@ -1550,6 +1629,12 @@ def forward(
                     assert self.o_sf_scale is None
                     out = output[:num_decode_tokens]
 
+                # NVFP4 trtllm kernel only supports FP8 output.
+                # Use a pre-allocated FP8 buffer and dequantize afterwards.
+                needs_fp8_out = self.is_kvcache_nvfp4 and output.dtype != FP8_DTYPE
+                if needs_fp8_out:
+                    out = self._nvfp4_fp8_out[:num_decode_tokens]
+
                 if num_decode_tokens % attn_metadata.num_decodes != 0:
                     # This gets triggered when the dummy_run forces
                     # attention to be initialized with q_len = 0
@@ -1559,7 +1644,7 @@ def forward(
 
                 trtllm_batch_decode_with_kv_cache(
                     query=decode_query,
-                    kv_cache=nvfp4_kv_data if self.is_kvcache_nvfp4 else kv_cache_permute,
+                    kv_cache=(nvfp4_kv_data if self.is_kvcache_nvfp4 else kv_cache_permute),
                     workspace_buffer=workspace_buffer,
                     block_tables=block_tables_decode,
                     seq_lens=seq_lens_decode,
@@ -1571,7 +1656,11 @@ def forward(
                     o_sf_scale=self.o_sf_scale,
                     out=out,
                     q_len_per_req=q_len_per_req,
+                    kv_cache_sf=(nvfp4_kv_block_scales if self.is_kvcache_nvfp4 else None),
                 )
+
+                if needs_fp8_out:
+                    output[:num_decode_tokens].copy_(out.to(output.dtype))
         return output_padded
 
     def do_kv_cache_update(
diff --git a/aphrodite/v1/attention/backends/flex_attention.py b/aphrodite/v1/attention/backends/flex_attention.py
index e4617d0eea..6b385b74ba 100644
--- a/aphrodite/v1/attention/backends/flex_attention.py
+++ b/aphrodite/v1/attention/backends/flex_attention.py
@@ -35,6 +35,7 @@
     AttentionMetadataBuilder,
     AttentionType,
     CommonAttentionMetadata,
+    MultipleOf,
 )
 from aphrodite.v1.kv_cache_interface import AttentionSpec, EncoderOnlyAttentionSpec
 
@@ -95,6 +96,10 @@ def supports_attn_type(cls, attn_type: str) -> bool:
         """FlexAttention supports both decoder and encoder-only attention."""
         return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
 
+    @classmethod
+    def supports_batch_invariance(cls) -> bool:
+        return True
+
     @classmethod
     def supports_mm_prefix(cls) -> bool:
         """FlexAttention supports full attention for image tokens."""
@@ -126,6 +131,10 @@ def use_cascade_attention(*args, **kwargs) -> bool:
     def get_supported_head_sizes(cls) -> list[int]:
         return []
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        return [MultipleOf(16)]
+
 
 # @torch.compile(fullgraph=True, mode="reduce-overhead")
 def physical_to_logical_mapping(
@@ -307,14 +316,9 @@ class BlockSparsityHint(NamedTuple):
 
 
 def copy_to_persistent(dst, src):
-    try:
-        dst = dst.as_strided(src.shape, src.stride())
-    except RuntimeError as e:
-        raise RuntimeError(
-            f"Fail to re-stride a persistent tensor of shape {dst.shape} for a tensor of shape {src.shape}"
-        ) from e
-    dst.copy_(src)
-    return dst
+    sliced = dst[tuple(slice(0, s) for s in src.shape)]  # leading [0:s] slice of dst along each src dim
+    sliced.copy_(src)  # in-place copy into the sliced region of dst
+    return sliced
 
 
 @dataclass
diff --git a/aphrodite/v1/attention/backends/mla/indexer.py b/aphrodite/v1/attention/backends/mla/indexer.py
index d455de0847..349cae755a 100644
--- a/aphrodite/v1/attention/backends/mla/indexer.py
+++ b/aphrodite/v1/attention/backends/mla/indexer.py
@@ -122,7 +122,7 @@ def get_name() -> str:
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
-        return [1 if current_platform.is_rocm() else 64]
+        return [1, 64] if current_platform.is_rocm() else [64]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
@@ -295,7 +295,10 @@ def __init__(self, *args, **kwargs):
                 device=self.device,
             )
         self.arange_buffer = torch.arange(
-            scheduler_config.max_num_seqs * next_n,
+            max(
+                scheduler_config.max_num_seqs * next_n,
+                scheduler_config.max_num_batched_tokens,
+            ),
             dtype=torch.int32,
             device=self.device,
         )
diff --git a/aphrodite/v1/attention/backends/mla/prefill/__init__.py b/aphrodite/v1/attention/backends/mla/prefill/__init__.py
new file mode 100644
index 0000000000..fe6c6aca9a
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/__init__.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from aphrodite.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+from aphrodite.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
+from aphrodite.v1.attention.backends.mla.prefill.selector import get_mla_prefill_backend
+
+__all__ = [
+    "MLAPrefillBackend",
+    "MLAPrefillBackendEnum",
+    "get_mla_prefill_backend",
+]
diff --git a/aphrodite/v1/attention/backends/mla/prefill/base.py b/aphrodite/v1/attention/backends/mla/prefill/base.py
new file mode 100644
index 0000000000..5909b9b3ac
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/base.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Abstract base class for MLA prefill backends."""
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, ClassVar
+
+import torch
+
+if TYPE_CHECKING:
+    from aphrodite.config import AphroditeConfig
+    from aphrodite.model_executor.layers.attention.mla_attention import (
+        MLACommonPrefillMetadata,
+    )
+    from aphrodite.platforms.interface import DeviceCapability
+    from aphrodite.v1.attention.backends.mla.prefill.selector import (
+        MLAPrefillSelectorConfig,
+    )
+
+
+class MLAPrefillBackend(ABC):
+    """Abstract base class for MLA prefill backends (capability checks plus two prefill entry points)."""
+
+    supported_dtypes: ClassVar[list[torch.dtype]] = [  # consulted by supports_dtype()
+        torch.float16,
+        torch.bfloat16,
+    ]
+    requires_r1_mla_dimensions: ClassVar[bool] = False  # True => validate_configuration requires R1 dims
+
+    @staticmethod
+    @abstractmethod
+    def get_name() -> str:
+        raise NotImplementedError  # concrete backends return a short identifier string
+
+    @classmethod
+    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
+        return True  # default: no compute-capability restriction
+
+    @classmethod
+    def supports_dtype(cls, dtype: torch.dtype) -> bool:
+        return dtype in cls.supported_dtypes  # membership test against the class-level list
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return True  # default: always reported as available
+
+    @classmethod
+    def validate_configuration(
+        cls,
+        device_capability: "DeviceCapability",
+        selector_config: "MLAPrefillSelectorConfig",
+    ) -> list[str]:
+        invalid_reasons: list[str] = []
+        # All four checks always run (no early exit), so every failure reason is reported together.
+        if not cls.supports_compute_capability(device_capability):
+            invalid_reasons.append(
+                f"compute capability {device_capability.major}.{device_capability.minor} not supported"
+            )
+
+        if not cls.supports_dtype(selector_config.dtype):
+            invalid_reasons.append(f"dtype {selector_config.dtype} not supported")
+
+        if not cls.is_available():
+            invalid_reasons.append("required dependencies not available")
+
+        if cls.requires_r1_mla_dimensions and not selector_config.is_r1_compatible:
+            invalid_reasons.append(
+                "model does not have DeepSeek R1 MLA dimensions "
+                "(qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128)"
+            )
+        # An empty list means no check failed.
+        return invalid_reasons
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        aphrodite_config: "AphroditeConfig",
+        device: torch.device,
+        layer_names: list[str] | None = None,
+    ) -> None:
+        self.num_heads = num_heads  # the constructor only records its arguments; nothing is validated here
+        self.scale = scale
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.aphrodite_config = aphrodite_config
+        self.device = device
+        self.layer_names = layer_names  # optional; stored as given (may be None)
+
+    def prepare_metadata(  # noqa: B027
+        self,
+        prefill_metadata: "MLACommonPrefillMetadata",
+    ) -> None:
+        """Prepare backend-specific metadata before the forward pass.
+
+        Called by the metadata builder after constructing the prefill metadata.
+        """
+        self._prefill_metadata = prefill_metadata  # base behavior: only remember it on the instance
+
+    @abstractmethod
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError  # per the annotation: one tensor, or a pair of tensors
+
+    @abstractmethod
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError  # per the annotation: always a pair of tensors
diff --git a/aphrodite/v1/attention/backends/mla/prefill/flash_attn.py b/aphrodite/v1/attention/backends/mla/prefill/flash_attn.py
new file mode 100644
index 0000000000..6cb4a2375f
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/flash_attn.py
@@ -0,0 +1,174 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""FlashAttention backend for MLA prefill."""
+
+import functools
+from typing import TYPE_CHECKING
+
+import torch
+
+import aphrodite.envs as envs
+from aphrodite.platforms import current_platform
+from aphrodite.v1.attention.backends.fa_utils import (
+    get_flash_attn_version,
+    is_flash_attn_varlen_func_available,
+)
+from aphrodite.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+
+if TYPE_CHECKING:
+    from aphrodite.config import AphroditeConfig
+
+if is_flash_attn_varlen_func_available():
+    from aphrodite.v1.attention.backends.fa_utils import flash_attn_varlen_func
+else:
+    flash_attn_varlen_func = None  # type: ignore[assignment]
+
+
+class FlashAttnPrefillBackend(MLAPrefillBackend):
+    """FlashAttention backend for MLA prefill (wraps ``flash_attn_varlen_func``)."""
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASH_ATTN"  # short identifier string for this backend
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return is_flash_attn_varlen_func_available()
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        aphrodite_config: "AphroditeConfig",
+        device: torch.device,
+        layer_names: list[str] | None = None,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            scale=scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            aphrodite_config=aphrodite_config,
+            device=device,
+            layer_names=layer_names,
+        )
+
+        # Handle the differences between the flash_attn_varlen from
+        # flash_attn and the one from aphrodite_flash_attn
+        assert flash_attn_varlen_func is not None, (
+            "FlashAttnPrefillBackend requires flash_attn_varlen_func. "
+            "Ensure FlashAttnPrefillBackend.is_available() is checked first."
+        )
+        qk_head_dim = qk_nope_head_dim + qk_rope_head_dim  # full per-head Q/K width (nope + rope parts)
+        self.flash_attn_varlen_func = flash_attn_varlen_func
+        self.aphrodite_flash_attn_version = get_flash_attn_version(head_size=qk_head_dim)
+        if self.aphrodite_flash_attn_version is not None:  # bind fa_version once; call sites never pass it
+            self.flash_attn_varlen_func = functools.partial(
+                flash_attn_varlen_func, fa_version=self.aphrodite_flash_attn_version
+            )
+
+        # Determine if we need to pad V
+        # For MLA the v head dim is smaller than qk head dim so we pad out
+        # v with 0s to match the qk head dim for attention backends that do
+        # not support different headdims.
+        # FA3 on Hopper (SM90) and FA4 natively handle diff headdims.
+        device_capability = current_platform.get_device_capability()
+        self.requires_v_padding = self.aphrodite_flash_attn_version is None or not (
+            (self.aphrodite_flash_attn_version == 3 and device_capability is not None and device_capability[0] == 9)
+            or self.aphrodite_flash_attn_version == 4
+        )
+
+        # Track whether we're using aphrodite's FA or upstream (for ROCm)
+        self._is_aphrodite_fa = current_platform.is_cuda() or current_platform.is_xpu()
+
+    def _flash_attn_varlen_diff_headdims(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool = False,
+        softmax_scale: float | None = None,
+        **kwargs,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        maybe_padded_v = v
+        if self.requires_v_padding:  # zero-pad V's last dim up to Q's last dim
+            maybe_padded_v = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]], value=0)
+        # The flag that requests the LSE is spelled differently by the two flash-attn builds.
+        if self._is_aphrodite_fa:
+            kwargs["return_softmax_lse"] = return_softmax_lse
+        else:
+            # ROCm leverages the upstream flash_attn, which takes a parameter
+            # called "return_attn_probs" instead of return_softmax_lse
+            kwargs["return_attn_probs"] = return_softmax_lse
+        if envs.APHRODITE_BATCH_INVARIANT:
+            kwargs["num_splits"] = 1  # batch-invariant mode: force a single split
+
+        attn_out = self.flash_attn_varlen_func(
+            q=q,
+            k=k,
+            v=maybe_padded_v,
+            softmax_scale=softmax_scale,
+            **kwargs,
+        )
+
+        # Unpack the output if there are multiple results
+        lse = None
+        if isinstance(attn_out, tuple):
+            attn_out, lse = attn_out[0], attn_out[1]
+
+        # Unpad output back to v_head_dim if we padded V
+        if self.requires_v_padding:
+            attn_out = attn_out[..., : v.shape[-1]]
+
+        # Remain consistent with old `flash_attn_varlen_func` where there
+        # is only one output tensor if `return_softmax_lse` is False.
+        if return_softmax_lse:
+            return attn_out, lse
+        return attn_out
+
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        return self._flash_attn_varlen_diff_headdims(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=self._prefill_metadata.query_start_loc,
+            cu_seqlens_k=self._prefill_metadata.query_start_loc,  # same offsets as Q
+            max_seqlen_q=self._prefill_metadata.max_query_len,
+            max_seqlen_k=self._prefill_metadata.max_query_len,  # same maximum length as Q
+            softmax_scale=self.scale,
+            causal=True,
+            return_softmax_lse=return_softmax_lse,
+        )
+
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert self._prefill_metadata.chunked_context is not None
+        return self._flash_attn_varlen_diff_headdims(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=self._prefill_metadata.query_start_loc,
+            cu_seqlens_k=self._prefill_metadata.chunked_context.cu_seq_lens[chunk_idx],
+            max_seqlen_q=self._prefill_metadata.max_query_len,
+            max_seqlen_k=self._prefill_metadata.chunked_context.max_seq_lens[chunk_idx],
+            softmax_scale=self.scale,
+            causal=False,  # Context is unmasked
+            return_softmax_lse=True,  # the LSE is always requested for context chunks
+        )
diff --git a/aphrodite/v1/attention/backends/mla/prefill/flashinfer.py b/aphrodite/v1/attention/backends/mla/prefill/flashinfer.py
new file mode 100644
index 0000000000..8388abfac2
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/flashinfer.py
@@ -0,0 +1,204 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""FlashInfer backend for MLA prefill."""
+
+from typing import TYPE_CHECKING
+
+import torch
+
+import aphrodite.envs as envs
+from aphrodite.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+from aphrodite.v1.attention.backends.utils import (
+    get_per_layer_parameters,
+    infer_global_hyperparameters,
+)
+from aphrodite.v1.worker.workspace import current_workspace_manager
+
+if TYPE_CHECKING:
+    from aphrodite.config import AphroditeConfig
+    from aphrodite.model_executor.layers.attention.mla_attention import (
+        MLACommonPrefillMetadata,
+    )
+    from aphrodite.platforms.interface import DeviceCapability
+
+try:
+    from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
+except ImportError:
+    BatchPrefillWithRaggedKVCacheWrapper = object  # type: ignore[misc,assignment]
+
+_DEFAULT_NUM_CHUNKS = 32
+
+
+class FlashInferPrefillBackend(MLAPrefillBackend):
+    """FlashInfer backend for MLA prefill, built on ``BatchPrefillWithRaggedKVCacheWrapper``."""
+
+    requires_r1_mla_dimensions = True  # class flag: this backend requires DeepSeek R1 MLA dimensions
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASHINFER"
+
+    @classmethod
+    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
+        return device_capability.major == 10  # only major version 10 is accepted
+
+    @classmethod
+    def is_available(cls) -> bool:
+        try:
+            from flashinfer import (
+                BatchPrefillWithRaggedKVCacheWrapper,  # noqa: F401
+            )
+
+            return True
+        except ImportError:
+            return False
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        aphrodite_config: "AphroditeConfig",
+        device: torch.device,
+        layer_names: list[str] | None = None,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            scale=scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            aphrodite_config=aphrodite_config,
+            device=device,
+            layer_names=layer_names,
+        )
+
+        self._prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
+        self._prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
+        if layer_names is None:
+            raise ValueError("FlashInferPrefillBackend requires layer_names to initialize global hyperparameters.")
+        # Function-scope import — presumably avoids a circular import at module load (TODO confirm).
+        from aphrodite.model_executor.layers.attention.mla_attention import (
+            MLACommonImpl,
+        )
+
+        self._global_hyperparameters = infer_global_hyperparameters(
+            get_per_layer_parameters(aphrodite_config, layer_names, MLACommonImpl)  # type: ignore[type-abstract]
+        )
+
+    def _ensure_chunks(
+        self,
+        num_chunks: int,
+        workspace_buffer: torch.Tensor,
+    ) -> None:
+        if len(self._prefill_chunks) < num_chunks:  # grow-only: existing wrappers are never dropped
+            for _ in range(len(self._prefill_chunks), num_chunks):
+                self._prefill_chunks.append(
+                    BatchPrefillWithRaggedKVCacheWrapper(workspace_buffer, "NHD", backend="cutlass")
+                )
+
+    def prepare_metadata(
+        self,
+        prefill_metadata: "MLACommonPrefillMetadata",
+    ) -> None:
+        qo_indptr = prefill_metadata.query_start_loc  # NOTE: prefill_metadata itself is never stored on self here
+        has_context = prefill_metadata.chunked_context is not None
+        (workspace_buffer,) = current_workspace_manager().get_simultaneous(
+            ((envs.APHRODITE_FLASHINFER_WORKSPACE_BUFFER_SIZE,), torch.uint8),
+        )
+        # First call only: create the main wrapper and pre-create _DEFAULT_NUM_CHUNKS chunk wrappers.
+        if self._prefill_main is None:
+            self._prefill_main = BatchPrefillWithRaggedKVCacheWrapper(workspace_buffer, "NHD", backend="cutlass")
+            self._ensure_chunks(_DEFAULT_NUM_CHUNKS, workspace_buffer)
+
+        if has_context:
+            chunked_context = prefill_metadata.chunked_context
+            assert chunked_context is not None
+            num_chunks = chunked_context.cu_seq_lens.shape[0]
+            self._ensure_chunks(num_chunks, workspace_buffer)
+
+        num_qo_heads = self.num_heads
+        num_kv_heads = num_qo_heads  # planned with equal Q and KV head counts
+
+        head_dim_qk = self.qk_nope_head_dim + self.qk_rope_head_dim
+        head_dim_vo = self.v_head_dim
+        kv_indptr = qo_indptr.clone()  # new-token pass: KV offsets are a copy of the query offsets
+
+        assert self._prefill_main is not None
+        self._prefill_main.plan(
+            qo_indptr=qo_indptr,
+            kv_indptr=kv_indptr,
+            num_qo_heads=num_qo_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim_qk=head_dim_qk,
+            head_dim_vo=head_dim_vo,
+            causal=True,
+            sm_scale=self._global_hyperparameters.sm_scale,
+            window_left=self._global_hyperparameters.window_left,
+            logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
+            q_data_type=prefill_metadata.q_data_type,
+            o_data_type=prefill_metadata.output_dtype,
+        )
+        # One non-causal plan per context chunk, each using that chunk's cu_seq_lens row as KV offsets.
+        if has_context:
+            chunked_context = prefill_metadata.chunked_context
+            assert chunked_context is not None
+            for i in range(num_chunks):
+                kv_indptr_chunk = chunked_context.cu_seq_lens[i]
+
+                self._prefill_chunks[i].plan(
+                    qo_indptr=qo_indptr,
+                    kv_indptr=kv_indptr_chunk,
+                    num_qo_heads=num_qo_heads,
+                    num_kv_heads=num_kv_heads,
+                    head_dim_qk=head_dim_qk,
+                    head_dim_vo=head_dim_vo,
+                    causal=False,
+                    sm_scale=self._global_hyperparameters.sm_scale,
+                    window_left=self._global_hyperparameters.window_left,
+                    logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
+                    q_data_type=prefill_metadata.q_data_type,
+                    o_data_type=prefill_metadata.output_dtype,
+                )
+
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        assert self._prefill_main is not None  # set by the first prepare_metadata() call
+
+        ret = self._prefill_main.run(
+            q=q,
+            k=k,
+            v=v,
+            return_lse=return_softmax_lse,
+        )
+
+        if isinstance(ret, tuple):
+            # Convert from (q_len, num_heads) to (num_heads, q_len)
+            return ret[0], ret[1].transpose(0, 1).contiguous()
+        return ret
+
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        attn_out, lse = self._prefill_chunks[chunk_idx].run(
+            q=q,
+            k=k,
+            v=v,
+            return_lse=True,
+        )
+
+        # Convert from (q_len, num_heads) to (num_heads, q_len)
+        return attn_out, lse.transpose(0, 1).contiguous()
diff --git a/aphrodite/v1/attention/backends/mla/prefill/registry.py b/aphrodite/v1/attention/backends/mla/prefill/registry.py
new file mode 100644
index 0000000000..69ac05af2d
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/registry.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Registry for MLA prefill backends.
+
+This module provides an enumeration of all available MLA prefill backends
+and utilities for loading them.
+"""
+
+from enum import Enum, EnumMeta
+from typing import TYPE_CHECKING
+
+from aphrodite.utils.import_utils import resolve_obj_by_qualname
+
+if TYPE_CHECKING:
+    from aphrodite.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+
+
+class _MLAPrefillBackendEnumMeta(EnumMeta):
+    """Enum metaclass that turns a failed name lookup into a ValueError listing the valid names."""
+
+    def __getitem__(cls, name: str):
+        try:
+            member = super().__getitem__(name)
+        except KeyError:
+            known = ", ".join(cls.__members__)
+            raise ValueError(f"Unknown MLA prefill backend: '{name}'. Valid options are: {known}") from None
+        return member
+
+
+class MLAPrefillBackendEnum(Enum, metaclass=_MLAPrefillBackendEnumMeta):
+    """Enumeration of all supported MLA prefill backends; each value is a backend class's dotted path."""
+    # Values are dotted-path strings, not classes; get_class() resolves them on demand.
+    FLASH_ATTN = "aphrodite.v1.attention.backends.mla.prefill.flash_attn.FlashAttnPrefillBackend"
+    FLASHINFER = "aphrodite.v1.attention.backends.mla.prefill.flashinfer.FlashInferPrefillBackend"
+    TRTLLM_RAGGED = "aphrodite.v1.attention.backends.mla.prefill.trtllm_ragged.TrtllmRaggedPrefillBackend"
+
+    def get_path(self) -> str:
+        """Get the fully qualified class path for this backend (the member's value)."""
+        return self.value
+
+    def get_class(self) -> "type[MLAPrefillBackend]":
+        """Lazy load and return the backend class by resolving its dotted path."""
+        return resolve_obj_by_qualname(self.get_path())
diff --git a/aphrodite/v1/attention/backends/mla/prefill/selector.py b/aphrodite/v1/attention/backends/mla/prefill/selector.py
new file mode 100644
index 0000000000..a25bb07a8b
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/selector.py
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Selector for MLA prefill backends.
+
+This module provides functions for selecting the appropriate MLA prefill
+backend based on device capabilities and configuration.
+"""
+
+from functools import cache
+from typing import TYPE_CHECKING, NamedTuple
+
+import torch
+
+from aphrodite.logger import init_logger
+from aphrodite.platforms.interface import DeviceCapability
+from aphrodite.v1.attention.backends.mla.prefill.registry import MLAPrefillBackendEnum
+
+if TYPE_CHECKING:
+    from aphrodite.config import AphroditeConfig
+    from aphrodite.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+
+logger = init_logger(__name__)
+
+
+class MLAPrefillSelectorConfig(NamedTuple):
+    """Hashable configuration for MLA prefill backend selection.
+
+    This is analogous to AttentionSelectorConfig and contains model-specific
+    configuration needed to select an MLA prefill backend, extracted from
+    AphroditeConfig into a hashable form for caching.
+    """
+
+    dtype: torch.dtype  # filled from aphrodite_config.model_config.dtype by get_mla_prefill_backend
+    is_r1_compatible: bool  # filled from is_deepseek_r1_mla_compatible(aphrodite_config)
+
+
+def is_deepseek_r1_mla_compatible(aphrodite_config: "AphroditeConfig") -> bool:
+    """Report whether the model's MLA head dimensions equal DeepSeek R1's.
+
+    The R1 layout is qk_nope_head_dim=128, qk_rope_head_dim=64 and
+    v_head_dim=128. A dimension absent from the HF text config is read as 1,
+    so it never matches. Returns False when there is no model config.
+    """
+    model_config = aphrodite_config.model_config
+    if model_config is None:
+        return False
+    text_cfg = model_config.hf_text_config
+    r1_dims = {"qk_nope_head_dim": 128, "qk_rope_head_dim": 64, "v_head_dim": 128}
+    # Read all three attributes first, then compare them in declaration order.
+    found = [getattr(text_cfg, field, 1) for field in r1_dims]
+    return found == list(r1_dims.values())
+
+
+def _get_mla_prefill_backend_priorities(
+    device_capability: DeviceCapability,
+) -> list[MLAPrefillBackendEnum]:
+    """Rank the MLA prefill backends to try for a device.
+
+    Args:
+        device_capability: Compute capability of the device; only its
+            ``major`` field is consulted.
+
+    Returns:
+        A new list of candidates, most preferred first. FlashAttention always
+        leads; TRT-LLM ragged and FlashInfer follow only when major == 10.
+    """
+    ranked = [MLAPrefillBackendEnum.FLASH_ATTN]
+    # Major 10 is Blackwell. Every other major version (Hopper SM90 and
+    # older, but equally anything else that is not 10) is offered
+    # FlashAttention only.
+    if device_capability.major == 10:
+        ranked.append(MLAPrefillBackendEnum.TRTLLM_RAGGED)
+        ranked.append(MLAPrefillBackendEnum.FLASHINFER)
+    return ranked
+
+
+def get_mla_prefill_backend(
+    aphrodite_config: "AphroditeConfig",
+) -> "type[MLAPrefillBackend]":
+    """Pick the MLA prefill backend class for this configuration and device.
+
+    An explicit ``attention_config.mla_prefill_backend`` is honoured when it
+    validates and rejected with a ValueError when it does not; with no explicit
+    choice, selection is delegated to the cached priority-based auto-selector.
+    If the platform reports no device capability, FlashAttention is returned.
+
+    Args:
+        aphrodite_config: The Aphrodite configuration.
+
+    Returns:
+        The selected prefill backend class.
+    """
+    from aphrodite.platforms import current_platform
+
+    capability = current_platform.get_device_capability()
+    if capability is None:
+        logger.info_once("Device capability not available, using FlashAttention MLA prefill backend.")
+        return MLAPrefillBackendEnum.FLASH_ATTN.get_class()
+
+    attn_cfg = aphrodite_config.attention_config
+    # Built before the explicit-backend check; both paths below need it.
+    selection_key = MLAPrefillSelectorConfig(
+        dtype=aphrodite_config.model_config.dtype,
+        is_r1_compatible=is_deepseek_r1_mla_compatible(aphrodite_config),
+    )
+    requested = attn_cfg.mla_prefill_backend
+    if requested is None:
+        # No explicit preference: fall back to priority-based selection.
+        return _auto_select_mla_prefill_backend(capability, selection_key)
+
+    # Explicit preference: validate it and raise instead of falling back.
+    chosen_cls: type[MLAPrefillBackend] | None = None
+    try:
+        chosen_cls = requested.get_class()
+        problems = chosen_cls.validate_configuration(capability, selection_key)
+    except ImportError:
+        # The backend could not be imported; report that as the reason.
+        problems = ["ImportError"]
+    if problems:
+        raise ValueError(
+            f"Selected MLA prefill backend {requested.name} "
+            f"is not valid for this configuration. "
+            f"Reason: {problems}"
+        )
+    assert chosen_cls is not None
+    logger.info("Using %s MLA prefill backend.", requested.name)
+    return chosen_cls
+
+
+@cache
+def _auto_select_mla_prefill_backend(
+    device_capability: DeviceCapability,
+    selector_config: MLAPrefillSelectorConfig,
+) -> "type[MLAPrefillBackend]":
+    """Return the first backend in priority order that validates cleanly.
+
+    Successful results are memoised per argument pair by ``functools.cache``.
+
+    Args:
+        device_capability: The device's compute capability.
+        selector_config: Hashable configuration for backend selection.
+
+    Returns:
+        The selected prefill backend class.
+
+    Raises:
+        ValueError: If every candidate backend is rejected.
+    """
+    rejected: dict[str, list[str]] = {}
+    for candidate in _get_mla_prefill_backend_priorities(device_capability):
+        candidate_cls: type[MLAPrefillBackend] | None = None
+        try:
+            candidate_cls = candidate.get_class()
+            problems = candidate_cls.validate_configuration(device_capability, selector_config)
+        except ImportError:
+            problems = ["ImportError"]
+        if problems:
+            rejected[candidate.name] = problems
+            continue
+        assert candidate_cls is not None
+        logger.info_once("Using %s MLA prefill backend.", candidate.name)
+        return candidate_cls
+
+    # Nothing validated: render "{NAME: [r1, r2], ...}" for the log line and the error.
+    per_backend = [f"{name}: [{', '.join(reasons)}]" for name, reasons in rejected.items()]
+    reasons_str = "{" + ", ".join(per_backend) + "}"
+    config_str = repr(selector_config)
+    logger.debug_once("Some MLA prefill backends are not valid with %s. Reasons: %s.", config_str, reasons_str)
+
+    raise ValueError(f"No valid MLA prefill backend found with {config_str}. Reasons: {reasons_str}.")
diff --git a/aphrodite/v1/attention/backends/mla/prefill/trtllm_ragged.py b/aphrodite/v1/attention/backends/mla/prefill/trtllm_ragged.py
new file mode 100644
index 0000000000..6cffbbfd4d
--- /dev/null
+++ b/aphrodite/v1/attention/backends/mla/prefill/trtllm_ragged.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""TRT-LLM Ragged backend for MLA prefill."""
+
+from typing import TYPE_CHECKING
+
+import torch
+
+import aphrodite.envs as envs
+from aphrodite.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
+from aphrodite.v1.worker.workspace import current_workspace_manager
+
+if TYPE_CHECKING:
+    from aphrodite.config import AphroditeConfig
+    from aphrodite.model_executor.layers.attention.mla_attention import (
+        MLACommonPrefillMetadata,
+    )
+    from aphrodite.platforms.interface import DeviceCapability
+
+
+class TrtllmRaggedPrefillBackend(MLAPrefillBackend):
+    """TRT-LLM Ragged backend for MLA prefill (calls flashinfer's ``trtllm_ragged_attention_deepseek``)."""
+
+    requires_r1_mla_dimensions = True  # class flag: this backend requires DeepSeek R1 MLA dimensions
+
+    @staticmethod
+    def get_name() -> str:
+        return "TRTLLM_RAGGED"
+
+    @classmethod
+    def supports_compute_capability(cls, device_capability: "DeviceCapability") -> bool:
+        return device_capability.major == 10  # only major version 10 is accepted
+
+    @classmethod
+    def is_available(cls) -> bool:
+        try:
+            from flashinfer.prefill import (
+                trtllm_ragged_attention_deepseek,  # noqa: F401
+            )
+
+            return True
+        except ImportError:
+            return False
+
+    def __init__(
+        self,
+        num_heads: int,
+        scale: float,
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        aphrodite_config: "AphroditeConfig",
+        device: torch.device,
+        layer_names: list[str] | None = None,
+    ) -> None:
+        super().__init__(  # pure pass-through: no backend-specific state is set up here
+            num_heads=num_heads,
+            scale=scale,
+            kv_lora_rank=kv_lora_rank,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            aphrodite_config=aphrodite_config,
+            device=device,
+            layer_names=layer_names,
+        )
+
+    def _get_workspace_buffer(self) -> torch.Tensor:
+        (workspace_buffer,) = current_workspace_manager().get_simultaneous(
+            (
+                (envs.APHRODITE_FLASHINFER_WORKSPACE_BUFFER_SIZE,),
+                torch.uint8,
+            ),
+        )
+        return workspace_buffer  # requested anew on every call; not cached on self
+
+    def prepare_metadata(
+        self,
+        prefill_metadata: "MLACommonPrefillMetadata",
+    ) -> None:
+        super().prepare_metadata(prefill_metadata)  # next line: consecutive differences of query_start_loc
+        self._query_seq_lens = prefill_metadata.query_start_loc[1:] - prefill_metadata.query_start_loc[:-1]
+
+    def run_prefill_new_tokens(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        return_softmax_lse: bool,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        from flashinfer.prefill import trtllm_ragged_attention_deepseek
+
+        workspace_buffer = self._get_workspace_buffer()
+        out = torch.empty(
+            q.shape[0],
+            q.shape[1],
+            v.shape[2],
+            device=q.device,
+            dtype=self._prefill_metadata.output_dtype,
+        )
+        # New-token pass: causal, with the query offsets reused as the KV offsets.
+        ret = trtllm_ragged_attention_deepseek(
+            query=q,
+            key=k,
+            value=v,
+            workspace_buffer=workspace_buffer,
+            seq_lens=self._query_seq_lens,
+            max_q_len=self._prefill_metadata.max_query_len,
+            max_kv_len=self._prefill_metadata.max_query_len,
+            bmm1_scale=self.scale,
+            bmm2_scale=1.0,
+            o_sf_scale=1.0,
+            batch_size=self._query_seq_lens.shape[0],
+            window_left=-1,
+            cum_seq_lens_q=self._prefill_metadata.query_start_loc,
+            cum_seq_lens_kv=self._prefill_metadata.query_start_loc,
+            enable_pdl=False,
+            is_causal=True,
+            return_lse=return_softmax_lse,
+            out=out,
+        )
+
+        if isinstance(ret, tuple):
+            # Convert from (q_len, num_heads) to (num_heads, q_len)
+            return ret[0], ret[1].transpose(0, 1).contiguous()
+        return ret
+
+    def run_prefill_context_chunk(
+        self,
+        chunk_idx: int,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        from flashinfer.prefill import trtllm_ragged_attention_deepseek
+
+        assert self._prefill_metadata.chunked_context is not None
+        assert self._prefill_metadata.chunked_context.seq_lens[chunk_idx] is not None
+        workspace_buffer = self._get_workspace_buffer()
+
+        out = torch.empty(
+            q.shape[0],
+            q.shape[1],
+            v.shape[2],
+            device=q.device,
+            dtype=self._prefill_metadata.output_dtype,
+        )
+        # Context pass: non-causal; KV lengths and offsets come from chunk chunk_idx, and the LSE is requested.
+        attn_out, lse = trtllm_ragged_attention_deepseek(
+            query=q,
+            key=k,
+            value=v,
+            workspace_buffer=workspace_buffer,
+            seq_lens=self._prefill_metadata.chunked_context.seq_lens[chunk_idx],
+            max_q_len=self._prefill_metadata.max_query_len,
+            max_kv_len=self._prefill_metadata.chunked_context.max_seq_lens[chunk_idx],
+            bmm1_scale=self.scale,
+            bmm2_scale=1.0,
+            o_sf_scale=1.0,
+            batch_size=self._prefill_metadata.chunked_context.seq_lens[chunk_idx].shape[0],
+            window_left=-1,
+            cum_seq_lens_q=self._prefill_metadata.query_start_loc,
+            cum_seq_lens_kv=self._prefill_metadata.chunked_context.cu_seq_lens[chunk_idx],
+            enable_pdl=False,
+            is_causal=False,
+            return_lse=True,
+            out=out,
+        )
+
+        # Convert from (q_len, num_heads) to (num_heads, q_len)
+        return attn_out, lse.transpose(0, 1).contiguous()
diff --git a/aphrodite/v1/attention/backends/mla/rocm_aiter_mla.py b/aphrodite/v1/attention/backends/mla/rocm_aiter_mla.py
index baeb6c197c..64d7f55f09 100644
--- a/aphrodite/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/aphrodite/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -365,6 +365,7 @@ class AiterMLAHelper:
     """
 
     _AITER_MIN_MLA_HEADS: Final = 16
+    _AITER_UNSUPPORTED_HEADS = [32]
 
     @staticmethod
     def check_num_heads_validity(num_heads: int):
@@ -388,6 +389,7 @@ def get_actual_mla_num_heads(num_heads: int) -> int:
 
     @staticmethod
     def get_mla_padded_q(num_heads: int, q: torch.Tensor) -> torch.Tensor:
+        assert num_heads not in AiterMLAHelper._AITER_UNSUPPORTED_HEADS, f"unsupported head_num: {num_heads}"
         return (
             q
             if num_heads >= AiterMLAHelper._AITER_MIN_MLA_HEADS
diff --git a/aphrodite/v1/attention/backends/mla/triton_mla.py b/aphrodite/v1/attention/backends/mla/triton_mla.py
index 51a77ad869..f917b59529 100644
--- a/aphrodite/v1/attention/backends/mla/triton_mla.py
+++ b/aphrodite/v1/attention/backends/mla/triton_mla.py
@@ -119,16 +119,6 @@ def __init__(
 
         self._sm_count = current_platform.num_compute_units()
 
-    def _flash_attn_varlen_diff_headdims(self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs):
-        return super()._flash_attn_varlen_diff_headdims(
-            q,
-            k,
-            v,
-            return_softmax_lse=return_softmax_lse,
-            softmax_scale=softmax_scale,
-            **kwargs,
-        )
-
     def forward_mqa(
         self,
         q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
diff --git a/aphrodite/v1/attention/ops/dcp_alltoall.py b/aphrodite/v1/attention/ops/dcp_alltoall.py
index 50f511d1e9..1d098dd144 100644
--- a/aphrodite/v1/attention/ops/dcp_alltoall.py
+++ b/aphrodite/v1/attention/ops/dcp_alltoall.py
@@ -9,13 +9,11 @@
 A2A exchanges partial attention outputs and their LSE values across
 ranks, then combines them with exact LSE-weighted reduction.
 
-This reduces the number of NCCL calls per attention layer from 3
-(AG for Q, AG for K metadata, RS for output) to 2 (A2A for output,
-A2A for LSE), lowering per-step communication overhead for long-context
-decode where NCCL latency is a significant fraction of step time.
+This reduces the number of NCCL calls per attention layer by exchanging
+the partial output and LSE in a single packed All-to-All payload.
 
 Usage:
-    aphrodite serve model --tp 16 --dcp 16 --dcp-comm-backend a2a
+    aphrodite run model --tp 16 --dcp 16 --dcp-comm-backend a2a
 
 Reference: https://arxiv.org/abs/2507.07120
 """
@@ -28,6 +26,10 @@
 import torch.distributed as dist
 
 from aphrodite.triton_utils import tl, triton
+from aphrodite.v1.worker.workspace import (
+    current_workspace_manager,
+    is_workspace_manager_initialized,
+)
 
 if TYPE_CHECKING:
     from aphrodite.distributed.parallel_state import GroupCoordinator
@@ -44,7 +46,6 @@ def _lse_weighted_combine(
     CPU reference implementation for LSE-weighted combination.
 
     This is a pure PyTorch implementation used for testing and validation.
-    For GPU execution, use dcp_lse_combine_triton instead.
 
     Args:
         outputs: Partial attention outputs [N, B, H, D]
@@ -102,57 +103,121 @@ def _lse_weighted_combine(
     return result
 
 
+def _dcp_a2a_lse_pack_dim(output_dtype: torch.dtype) -> int:
+    """Return how many ``output_dtype`` elements are needed to carry one fp32 LSE value."""
+    slots_by_width = {16: 2, 32: 1}
+    width = torch.finfo(output_dtype).bits
+    if width not in slots_by_width:
+        raise ValueError(f"Cannot pack fp32 LSE into output dtype {output_dtype}.")
+    return slots_by_width[width]
+
+
+def _dcp_a2a_send_recv_buffers(
+    shape: tuple[int, ...],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Return ``(send, recv)`` buffers of identical ``shape`` and ``dtype``.
+
+    Uses the current workspace manager when initialized, else fresh ``torch.empty`` tensors.
+    """
+    if not is_workspace_manager_initialized():
+        send_buffer = torch.empty(shape, device=device, dtype=dtype)
+        recv_buffer = torch.empty(shape, device=device, dtype=dtype)
+        return send_buffer, recv_buffer
+    spec = (shape, dtype)
+    send_buffer, recv_buffer = current_workspace_manager().get_simultaneous(spec, spec)
+    return send_buffer, recv_buffer
+
+
 @triton.jit
-def _dcp_lse_combine_kernel(
-    # Input pointers
-    recv_output_ptr,
-    recv_lse_ptr,
-    # Output pointers
+def _dcp_a2a_pack_send_kernel(  # packs attention output + LSE into the A2A send buffer
     out_ptr,
-    out_lse_ptr,
-    # Strides for recv_output [N, B, H_local, D]
-    ro_stride_N,
-    ro_stride_B,
-    ro_stride_H,
-    ro_stride_D,
-    # Strides for recv_lse [N, B, H_local]
-    rl_stride_N,
-    rl_stride_B,
-    rl_stride_H,
-    # Strides for output [B, H_local, D]
-    o_stride_B,
-    o_stride_H,
-    o_stride_D,
-    # Constants
+    lse_ptr,
+    send_ptr,
+    out_stride_B,
+    out_stride_H,
+    out_stride_D,
+    lse_stride_B,
+    lse_stride_H,
+    send_stride_N,
+    send_stride_B,
+    send_stride_H,
+    send_stride_D,
     N: tl.constexpr,
     HEAD_DIM: tl.constexpr,
-    IS_BASE_E: tl.constexpr,
-    RETURN_LSE: tl.constexpr,
+    H_PER_RANK: tl.constexpr,
+    LSE_PACK_DIM: tl.constexpr,
 ):
-    """
-    Triton kernel for LSE-weighted combination of partial attention outputs.
+    batch_idx = tl.program_id(0).to(tl.int64)  # grid axis 0: token index
+    local_head_idx = tl.program_id(1).to(tl.int64)  # grid axis 1: head index within one rank's share
+    d_offsets = tl.arange(0, HEAD_DIM)
+
+    for rank_idx in tl.static_range(N):  # one slice of the send buffer per rank along dim 0
+        src_head_idx = rank_idx * H_PER_RANK + local_head_idx  # head index in the unsplit input
+        send_base = rank_idx * send_stride_N + batch_idx * send_stride_B + local_head_idx * send_stride_H
 
-    After All-to-All, each rank has:
-    - recv_output [N, B, H_local, D]: partial outputs from all KV shards
-    - recv_lse [N, B, H_local]: partial LSEs from all KV shards
+        out_offsets = batch_idx * out_stride_B + src_head_idx * out_stride_H + d_offsets * out_stride_D
+        tl.store(
+            send_ptr + send_base + d_offsets * send_stride_D,
+            tl.load(out_ptr + out_offsets),
+        )
+
+        lse_val = tl.load(lse_ptr + batch_idx * lse_stride_B + src_head_idx * lse_stride_H)
+        if LSE_PACK_DIM == 1:  # LSE is value-cast into the single element at index HEAD_DIM
+            tl.store(
+                send_ptr + send_base + HEAD_DIM * send_stride_D,
+                lse_val.to(send_ptr.dtype.element_ty),
+            )
+        else:  # LSE bit pattern is split across the elements at HEAD_DIM and HEAD_DIM + 1
+            lse_bits = lse_val.to(tl.uint32, bitcast=True)  # assumes lse is a 32-bit float — TODO confirm
+            lo = (lse_bits & 0xFFFF).to(tl.uint16)  # low 16 bits
+            hi = ((lse_bits >> 16) & 0xFFFF).to(tl.uint16)  # high 16 bits
+            tl.store(
+                send_ptr + send_base + HEAD_DIM * send_stride_D,
+                lo.to(send_ptr.dtype.element_ty, bitcast=True),
+            )
+            tl.store(
+                send_ptr + send_base + (HEAD_DIM + 1) * send_stride_D,
+                hi.to(send_ptr.dtype.element_ty, bitcast=True),
+            )
 
-    This kernel computes the weighted combination locally (no communication).
 
-    Grid: (B, H_local)
-    Each program handles one (batch, head) and processes all D elements.
-    """
+@triton.jit
+def _dcp_a2a_unpack_combine_kernel(
+    recv_ptr,
+    out_ptr,
+    out_lse_ptr,
+    recv_stride_N,
+    recv_stride_B,
+    recv_stride_H,
+    recv_stride_D,
+    out_stride_B,
+    out_stride_H,
+    out_stride_D,
+    out_lse_stride_B,
+    out_lse_stride_H,
+    N: tl.constexpr,
+    HEAD_DIM: tl.constexpr,
+    IS_BASE_E: tl.constexpr,
+    RETURN_LSE: tl.constexpr,
+    LSE_PACK_DIM: tl.constexpr,
+):
     batch_idx = tl.program_id(0).to(tl.int64)
     head_idx = tl.program_id(1).to(tl.int64)
+    d_offsets = tl.arange(0, HEAD_DIM)
 
-    # Base offset for this (batch, head)
-    base_lse_offset = batch_idx * rl_stride_B + head_idx * rl_stride_H
-    base_out_offset = batch_idx * ro_stride_B + head_idx * ro_stride_H
-
-    # First pass: find max LSE for numerical stability
     lse_max = -float("inf")
-    for n in tl.static_range(N):
-        lse_offset = n * rl_stride_N + base_lse_offset
-        lse_val = tl.load(recv_lse_ptr + lse_offset)
+    for rank_idx in tl.static_range(N):
+        recv_base = rank_idx * recv_stride_N + batch_idx * recv_stride_B + head_idx * recv_stride_H
+        if LSE_PACK_DIM == 1:
+            lse_val = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D).to(tl.float32)
+        else:
+            lo_raw = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D)
+            hi_raw = tl.load(recv_ptr + recv_base + (HEAD_DIM + 1) * recv_stride_D)
+            lo = lo_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            hi = hi_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            lse_val = (lo | (hi << 16)).to(tl.float32, bitcast=True)
         lse_val = tl.where(
             (lse_val != lse_val) | (lse_val == float("inf")),
             -float("inf"),
@@ -162,11 +227,17 @@ def _dcp_lse_combine_kernel(
 
     lse_max = tl.where(lse_max == -float("inf"), 0.0, lse_max)
 
-    # Second pass: compute sum of exp(lse - max)
     lse_sum = 0.0
-    for n in tl.static_range(N):
-        lse_offset = n * rl_stride_N + base_lse_offset
-        lse_val = tl.load(recv_lse_ptr + lse_offset)
+    for rank_idx in tl.static_range(N):
+        recv_base = rank_idx * recv_stride_N + batch_idx * recv_stride_B + head_idx * recv_stride_H
+        if LSE_PACK_DIM == 1:
+            lse_val = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D).to(tl.float32)
+        else:
+            lo_raw = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D)
+            hi_raw = tl.load(recv_ptr + recv_base + (HEAD_DIM + 1) * recv_stride_D)
+            lo = lo_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            hi = hi_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            lse_val = (lo | (hi << 16)).to(tl.float32, bitcast=True)
         lse_val = tl.where(
             (lse_val != lse_val) | (lse_val == float("inf")),
             -float("inf"),
@@ -177,19 +248,22 @@ def _dcp_lse_combine_kernel(
         else:
             lse_sum += tl.exp2(lse_val - lse_max)
 
-    # Compute global LSE
     if IS_BASE_E:  # noqa: SIM108
         global_lse = tl.log(lse_sum) + lse_max
     else:
         global_lse = tl.log2(lse_sum) + lse_max
 
-    # Third pass: weighted combination across D dimension
-    d_offsets = tl.arange(0, HEAD_DIM)
     acc = tl.zeros([HEAD_DIM], dtype=tl.float32)
-
-    for n in tl.static_range(N):
-        lse_offset = n * rl_stride_N + base_lse_offset
-        lse_val = tl.load(recv_lse_ptr + lse_offset)
+    for rank_idx in tl.static_range(N):
+        recv_base = rank_idx * recv_stride_N + batch_idx * recv_stride_B + head_idx * recv_stride_H
+        if LSE_PACK_DIM == 1:
+            lse_val = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D).to(tl.float32)
+        else:
+            lo_raw = tl.load(recv_ptr + recv_base + HEAD_DIM * recv_stride_D)
+            hi_raw = tl.load(recv_ptr + recv_base + (HEAD_DIM + 1) * recv_stride_D)
+            lo = lo_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            hi = hi_raw.to(tl.uint16, bitcast=True).to(tl.uint32)
+            lse_val = (lo | (hi << 16)).to(tl.float32, bitcast=True)
         lse_val = tl.where(
             (lse_val != lse_val) | (lse_val == float("inf")),
             -float("inf"),
@@ -200,74 +274,84 @@ def _dcp_lse_combine_kernel(
         else:
             weight = tl.exp2(lse_val - global_lse)
         weight = tl.where(weight != weight, 0.0, weight)
+        acc += tl.load(recv_ptr + recv_base + d_offsets * recv_stride_D).to(tl.float32) * weight
 
-        out_offsets = n * ro_stride_N + base_out_offset + d_offsets * ro_stride_D
-        out_vals = tl.load(recv_output_ptr + out_offsets)
-        acc += out_vals.to(tl.float32) * weight
-
-    # Store result
-    final_offsets = batch_idx * o_stride_B + head_idx * o_stride_H + d_offsets * o_stride_D
+    final_offsets = batch_idx * out_stride_B + head_idx * out_stride_H + d_offsets * out_stride_D
     tl.store(out_ptr + final_offsets, acc)
 
     if RETURN_LSE:
-        tl.store(out_lse_ptr + base_lse_offset, global_lse)
+        out_lse_offset = batch_idx * out_lse_stride_B + head_idx * out_lse_stride_H
+        tl.store(out_lse_ptr + out_lse_offset, global_lse)
 
 
-def dcp_lse_combine_triton(
-    recv_output: torch.Tensor,
-    recv_lse: torch.Tensor,
-    return_lse: bool = False,
-    is_lse_base_on_e: bool = True,
-) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-    """
-    Triton-accelerated LSE-weighted combination for DCP A2A.
-
-    Args:
-        recv_output: [N, B, H_local, D] - partial outputs from all KV shards
-        recv_lse: [N, B, H_local] - partial LSEs from all KV shards
-        return_lse: If True, also return the global LSE
-        is_lse_base_on_e: If True, LSE is base e; if False, base 2
-
-    Returns:
-        Combined output [B, H_local, D]
-        If return_lse=True, also returns global_lse [B, H_local]
-    """
-    N, B, H_local, D = recv_output.shape
-
-    out = torch.empty((B, H_local, D), device=recv_output.device, dtype=recv_output.dtype)
-
-    if return_lse:
-        out_lse = torch.empty((B, H_local), device=recv_lse.device, dtype=recv_lse.dtype)
-    else:
-        out_lse = torch.empty(1, device=recv_lse.device, dtype=recv_lse.dtype)
-
-    ro_stride_N, ro_stride_B, ro_stride_H, ro_stride_D = recv_output.stride()
-    rl_stride_N, rl_stride_B, rl_stride_H = recv_lse.stride()
-    o_stride_B, o_stride_H, o_stride_D = out.stride()
+def _dcp_a2a_pack_send(
+    cp_attn_out: torch.Tensor,
+    cp_attn_lse: torch.Tensor,
+    send_buffer: torch.Tensor,
+    world_size: int,
+    h_per_rank: int,
+    head_dim: int,
+    lse_pack_dim: int,
+) -> None:
+    """Launch the pack kernel that writes attention output plus LSE into ``send_buffer``."""
+    num_tokens = cp_attn_out.shape[0]
+    # Strides are forwarded explicitly; this wrapper makes no contiguous copies.
+    out_strides = [cp_attn_out.stride(dim) for dim in range(3)]
+    lse_strides = [cp_attn_lse.stride(dim) for dim in range(2)]
+    send_strides = [send_buffer.stride(dim) for dim in range(4)]
+    launch = _dcp_a2a_pack_send_kernel[(num_tokens, h_per_rank, 1)]
+    launch(
+        cp_attn_out,
+        cp_attn_lse,
+        send_buffer,
+        *out_strides,
+        *lse_strides,
+        *send_strides,
+        N=world_size,
+        HEAD_DIM=head_dim,
+        H_PER_RANK=h_per_rank,
+        LSE_PACK_DIM=lse_pack_dim,
+    )
 
-    grid = (B, H_local, 1)
 
-    _dcp_lse_combine_kernel[grid](
-        recv_output,
-        recv_lse,
+def _dcp_a2a_unpack_combine(  # launches the unpack + LSE-weighted combine kernel on the received buffer
+    recv_buffer: torch.Tensor,
+    head_dim: int,
+    lse_pack_dim: int,
+    return_lse: bool,
+    is_lse_base_on_e: bool,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    world_size, num_tokens, h_per_rank, _ = recv_buffer.shape  # last dim unused here; head_dim is passed in
+    out = torch.empty(
+        (num_tokens, h_per_rank, head_dim),
+        device=recv_buffer.device,
+        dtype=recv_buffer.dtype,
+    )
+    out_lse = torch.empty(
+        (num_tokens, h_per_rank) if return_lse else (1, 1),  # (1, 1) placeholder when the LSE is not returned
+        device=recv_buffer.device,
+        dtype=torch.float32 if return_lse else recv_buffer.dtype,
+    )
+    grid = (num_tokens, h_per_rank, 1)  # one program per (token, local head)
+    _dcp_a2a_unpack_combine_kernel[grid](
+        recv_buffer,
         out,
         out_lse,
-        ro_stride_N,
-        ro_stride_B,
-        ro_stride_H,
-        ro_stride_D,
-        rl_stride_N,
-        rl_stride_B,
-        rl_stride_H,
-        o_stride_B,
-        o_stride_H,
-        o_stride_D,
-        N=N,
-        HEAD_DIM=D,
+        recv_buffer.stride(0),
+        recv_buffer.stride(1),
+        recv_buffer.stride(2),
+        recv_buffer.stride(3),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        out_lse.stride(0),
+        out_lse.stride(1),
+        N=world_size,
+        HEAD_DIM=head_dim,
         IS_BASE_E=is_lse_base_on_e,
         RETURN_LSE=return_lse,
+        LSE_PACK_DIM=lse_pack_dim,
     )
-
     if return_lse:
         return out, out_lse
     return out
@@ -284,17 +368,8 @@ def dcp_a2a_lse_reduce(
     """
     Combine partial attention outputs across DCP ranks using All-to-All.
 
-    Each rank holds attention output for all heads but only a local shard
-    of the KV cache. This function:
-    1. Exchanges partial outputs across ranks via All-to-All
-    2. Exchanges LSE values via All-to-All
-    3. Combines them with exact LSE-weighted reduction (Triton kernel)
-
-    Tensor flow:
-        Input:  cp_attn_out [B, H, D] - all heads, local KV shard
-        Reshape: [N, B, H/N, D] - split heads across ranks
-        A2A:    Two all_to_all_single calls (output and LSE)
-        Combine: recv [N, B, H/N, D] + lse [N, B, H/N] -> [B, H/N, D]
+    The output and fp32 LSE are packed into a single output-dtype buffer, sent
+    with one All-to-All, then unpacked and combined with exact LSE weighting.
 
     Args:
         cp_attn_out: [B, H, D] where B=num_tokens, H=total_heads, D=head_dim
@@ -315,41 +390,34 @@ def dcp_a2a_lse_reduce(
             return cp_attn_out, cp_attn_lse
         return cp_attn_out
 
-    local_output = cp_attn_out.contiguous()
-    local_lse = cp_attn_lse.contiguous()
-
-    B, H, D = local_output.shape
+    B, H, D = cp_attn_out.shape
+    if H % world_size != 0:
+        raise ValueError(f"H={H} must be divisible by DCP world size {world_size}.")
     H_per_rank = H // world_size
+    lse_pack_dim = _dcp_a2a_lse_pack_dim(cp_attn_out.dtype)
 
-    # Reshape for All-to-All: [B, H, D] -> [N, B, H/N, D]
-    # Split heads into N chunks, each destined for a different rank
-    send_output = local_output.view(B, world_size, H_per_rank, D).permute(1, 0, 2, 3).contiguous()
-    recv_output = torch.empty_like(send_output)
-
-    # Same for LSE: [B, H] -> [N, B, H/N]
-    send_lse = local_lse.view(B, world_size, H_per_rank).permute(1, 0, 2).contiguous()
-    recv_lse = torch.empty_like(send_lse)
+    send_buffer, recv_buffer = _dcp_a2a_send_recv_buffers(
+        (world_size, B, H_per_rank, D + lse_pack_dim),
+        device=cp_attn_out.device,
+        dtype=cp_attn_out.dtype,
+    )
 
-    # All-to-All for partial attention outputs and LSE values (async overlap)
-    work_output = dist.all_to_all_single(
-        recv_output.view(-1),
-        send_output.view(-1),
-        group=cp_group.device_group,
-        async_op=True,
+    _dcp_a2a_pack_send(
+        cp_attn_out,
+        cp_attn_lse,
+        send_buffer,
+        world_size,
+        H_per_rank,
+        D,
+        lse_pack_dim,
     )
-    work_lse = dist.all_to_all_single(
-        recv_lse.view(-1),
-        send_lse.view(-1),
+
+    work = dist.all_to_all_single(
+        recv_buffer.view(-1),
+        send_buffer.view(-1),
         group=cp_group.device_group,
         async_op=True,
     )
-    work_output.wait()
-    work_lse.wait()
-
-    # LSE-weighted combination via Triton kernel (local, no communication)
-    return dcp_lse_combine_triton(
-        recv_output,
-        recv_lse,
-        return_lse=return_lse,
-        is_lse_base_on_e=is_lse_base_on_e,
-    )
+    work.wait()
+
+    return _dcp_a2a_unpack_combine(recv_buffer, D, lse_pack_dim, return_lse, is_lse_base_on_e)
diff --git a/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py b/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py
index 3128b3d0fe..6d71704fc2 100644
--- a/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py
+++ b/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_compress_quant_cache.py
@@ -21,7 +21,7 @@
 
 from aphrodite.triton_utils import tl, triton
 
-from .fused_indexer_q import _e2m1_nibble
+from .fused_indexer_q import _fp32x2_to_fp4x2
 
 
 # =============================================================================
@@ -545,18 +545,16 @@ def _fused_kv_compress_norm_rope_insert_indexer_mxfp4_attn(
         tl.max(tl.abs(even_2d), axis=1),
         tl.max(tl.abs(odd_2d), axis=1),
     )
-    amax = tl.maximum(amax, 1e-4)
+    amax = tl.maximum(amax, 6.0 * (2**-126))
 
     # ue8m0 block scale: 2^ceil(log2(amax / 6.0)), stored as (exp + 127) byte.
-    log2_ratio = tl.ceil(tl.log2(amax / 6.0))
+    log2_ratio = tl.ceil(tl.log2(amax * (1.0 / 6.0)))
     log2_ratio = tl.minimum(tl.maximum(log2_ratio, -127.0), 127.0)
     inv_scale = tl.exp2(-log2_ratio)
     ue8m0 = (log2_ratio + 127.0).to(tl.uint8)  # [N_QUANT_BLOCKS]
 
     inv_scale_col = tl.reshape(inv_scale, (N_QUANT_BLOCKS, 1))
-    lo_nib = _e2m1_nibble(even_2d * inv_scale_col)  # (N_BLOCKS, HALF_BLOCK) uint8
-    hi_nib = _e2m1_nibble(odd_2d * inv_scale_col)
-    packed = lo_nib | (hi_nib << 4)
+    packed = _fp32x2_to_fp4x2(even_2d * inv_scale_col, odd_2d * inv_scale_col)  # (N_BLOCKS, HALF_BLOCK) uint8
     packed_flat = tl.reshape(packed, (TOKEN_STRIDE,))
 
     tl.store(val_ptr + tl.arange(0, TOKEN_STRIDE), packed_flat)
diff --git a/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py b/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
index 0003b1f188..acdd0e8d28 100644
--- a/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
+++ b/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_indexer_q.py
@@ -24,36 +24,22 @@ def _get_cos_sin(
 
 
 @triton.jit
-def _e2m1_nibble(x):
-    """Quantize fp32 x (already scale-divided) to E2M1 4-bit nibble in uint8.
-    Matches torch.bucketize with boundaries
-    [0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0] and right=False (each boundary
-    belongs to the lower bucket), plus sign bit."""
-    abs_x = tl.minimum(tl.abs(x), 6.0)
-    code = tl.where(
-        abs_x <= 0.25,
-        0.0,
-        tl.where(
-            abs_x <= 0.75,
-            1.0,
-            tl.where(
-                abs_x <= 1.25,
-                2.0,
-                tl.where(
-                    abs_x <= 1.75,
-                    3.0,
-                    tl.where(
-                        abs_x <= 2.5,
-                        4.0,
-                        tl.where(abs_x <= 3.5, 5.0, tl.where(abs_x <= 5.0, 6.0, 7.0)),
-                    ),
-                ),
-            ),
-        ),
-    )
-    code_u8 = code.to(tl.uint8)
-    sign = ((x < 0) & (code_u8 != 0)).to(tl.uint8)
-    return code_u8 | (sign << 3)
+def _fp32x2_to_fp4x2(x_lo, x_hi):  # converts two fp32 inputs to E2M1 and packs both codes into one uint8
+    # NOTE: $1 (x_hi) becomes the high nibble and $2 (x_lo) the low nibble of the packed byte.
+    return tl.inline_asm_elementwise(
+        """
+        {
+            .reg .b8 tmp;
+            cvt.rn.satfinite.e2m1x2.f32 tmp, $1, $2;
+            cvt.u32.u8 $0, tmp;
+        }
+        """,
+        constraints="=r,f,f",
+        args=[x_hi, x_lo],  # order matters: binds $1 = x_hi, $2 = x_lo
+        dtype=tl.uint32,
+        is_pure=True,
+        pack=1,
+    ).to(tl.uint8)  # the asm widens the packed byte to u32; narrow it back
 
 
 @triton.jit
@@ -65,17 +51,16 @@ def _quantize_mxfp4_pair(x_lo, x_hi):
         - ue8m0  : scalar uint8    (block scale = 2^(ue8m0 - 127))
     """
     amax = tl.maximum(tl.max(tl.abs(x_lo)), tl.max(tl.abs(x_hi)))
-    amax = tl.maximum(amax, 1e-4)
+    # 6 * 2^-126 is from https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/inference/kernel.py#L163
+    amax = tl.maximum(amax, 6.0 * (2**-126))
     # ue8m0 block scale: 2^ceil(log2(amax/6.0)).
-    log2_ratio = tl.math.ceil(tl.math.log2(amax / 6.0))
+    log2_ratio = tl.math.ceil(tl.math.log2(amax * (1.0 / 6.0)))
     log2_ratio = tl.minimum(tl.maximum(log2_ratio, -127.0), 127.0)
     scale = tl.math.exp2(log2_ratio)
     ue8m0 = (log2_ratio + 127.0).to(tl.uint8)
 
     inv_scale = 1.0 / scale
-    lo_nib = _e2m1_nibble(x_lo * inv_scale)
-    hi_nib = _e2m1_nibble(x_hi * inv_scale)
-    packed = lo_nib | (hi_nib << 4)
+    packed = _fp32x2_to_fp4x2(x_lo * inv_scale, x_hi * inv_scale)
     return packed, ue8m0
 
 
diff --git a/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py b/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py
index 65dea10b3d..3e44e9ca92 100644
--- a/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py
+++ b/aphrodite/v1/attention/ops/deepseek_v4_ops/fused_inv_rope_fp8_quant.py
@@ -10,6 +10,7 @@
 import torch
 
 from aphrodite.triton_utils import tl, triton
+from aphrodite.utils.torch_utils import direct_register_custom_op
 
 
 @triton.jit
@@ -156,34 +157,74 @@ def fused_inv_rope_fp8_quant(
     fp8_dtype = torch.float8_e4m3fn
     fp8_max = torch.finfo(fp8_dtype).max
 
-    fp8_buf = torch.empty(
-        (n_groups, num_tokens, d),
-        dtype=fp8_dtype,
-        device=o.device,
-    )
-
     tma_aligned_T = get_tma_aligned_size(num_tokens, 4)
     if tma_aligned_scales:
         packed_sf_k = (num_scale_blocks + 3) // 4
-        scale_buf = torch.empty(
-            n_groups * packed_sf_k * tma_aligned_T,
-            dtype=torch.int32,
-            device=o.device,
-        ).as_strided(
-            (n_groups, num_tokens, packed_sf_k),
-            (packed_sf_k * tma_aligned_T, 1, tma_aligned_T),
-        )
+        scale_inner = packed_sf_k
     else:
-        scale_buf = torch.empty(
-            n_groups * num_scale_blocks * tma_aligned_T,
-            dtype=torch.float32,
-            device=o.device,
-        ).as_strided(
-            (n_groups, num_tokens, num_scale_blocks),
-            (num_scale_blocks * tma_aligned_T, 1, tma_aligned_T),
-        )
-
-    common_args = dict(
+        scale_inner = num_scale_blocks
+
+    # Run kernel through a custom op so inductor sees an opaque boundary.
+    # It's a pytorch bug, see https://github.com/aphrodite-project/aphrodite/issues/41106
+    fp8_buf, scale_buf = torch.ops.aphrodite.fused_inv_rope_fp8_quant_kernel(
+        o,
+        positions,
+        cos_sin_cache,
+        heads_per_group,
+        quant_group_size,
+        chunks_per_head,
+        nope_dim % quant_group_size,
+        rope_dim // 2,
+        tma_aligned_scales,
+        fp8_max,
+        tma_aligned_T,
+        num_tokens,
+        n_groups,
+        d,
+        scale_inner,
+    )
+    return fp8_buf.transpose(0, 1), scale_buf.transpose(0, 1)
+
+
+def _fused_inv_rope_fp8_quant_kernel_impl(
+    o: torch.Tensor,
+    positions: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    heads_per_group: int,
+    quant_group_size: int,
+    chunks_per_head: int,
+    rope_start: int,
+    half_rope: int,
+    tma_aligned_scales: bool,
+    fp8_max: float,
+    tma_aligned_T: int,
+    num_tokens: int,
+    n_groups: int,
+    d: int,
+    scale_inner: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    fp8_buf = torch.empty(
+        (n_groups, num_tokens, d),
+        dtype=torch.float8_e4m3fn,
+        device=o.device,
+    )
+    scale_dtype = torch.int32 if tma_aligned_scales else torch.float32
+    scale_buf = torch.empty(
+        n_groups * scale_inner * tma_aligned_T,
+        dtype=scale_dtype,
+        device=o.device,
+    ).as_strided(
+        (n_groups, num_tokens, scale_inner),
+        (scale_inner * tma_aligned_T, 1, tma_aligned_T),
+    )
+    grid = (tma_aligned_T, n_groups * heads_per_group)
+    _fused_inv_rope_fp8_quant_per_head[grid](
+        o,
+        positions,
+        cos_sin_cache,
+        fp8_buf,
+        scale_buf,
+        num_tokens,
         heads_per_group=heads_per_group,
         o_stride_token=o.stride(0),
         o_stride_head=o.stride(1),
@@ -196,23 +237,52 @@ def fused_inv_rope_fp8_quant(
         eps=1e-10,
         QUANT_GROUP_SIZE=quant_group_size,
         CHUNKS_PER_HEAD=chunks_per_head,
-        ROPE_START=nope_dim % quant_group_size,
-        HALF_ROPE=rope_dim // 2,
+        ROPE_START=rope_start,
+        HALF_ROPE=half_rope,
         TMA_ALIGNED_SCALES=tma_aligned_scales,
         num_stages=1,
         launch_pdl=False,
+        num_warps=1,
     )
+    return fp8_buf, scale_buf
 
-    grid = (tma_aligned_T, n_groups * heads_per_group)
-    _fused_inv_rope_fp8_quant_per_head[grid](
-        o,
-        positions,
-        cos_sin_cache,
-        fp8_buf,
-        scale_buf,
-        num_tokens,
-        **common_args,
-        num_warps=1,
+
+def _fused_inv_rope_fp8_quant_kernel_fake(  # fake impl: only allocates the two outputs, launches no kernel
+    o: torch.Tensor,
+    positions: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    heads_per_group: int,
+    quant_group_size: int,
+    chunks_per_head: int,
+    rope_start: int,
+    half_rope: int,
+    tma_aligned_scales: bool,
+    fp8_max: float,
+    tma_aligned_T: int,
+    num_tokens: int,
+    n_groups: int,
+    d: int,
+    scale_inner: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    fp8_buf = torch.empty(
+        (n_groups, num_tokens, d),
+        dtype=torch.float8_e4m3fn,
+        device=o.device,
     )
+    scale_dtype = torch.int32 if tma_aligned_scales else torch.float32  # int32 scales when TMA-aligned, else fp32
+    scale_buf = torch.empty(
+        n_groups * scale_inner * tma_aligned_T,
+        dtype=scale_dtype,
+        device=o.device,
+    ).as_strided(  # view the flat buffer as (n_groups, num_tokens, scale_inner) with token stride 1
+        (n_groups, num_tokens, scale_inner),
+        (scale_inner * tma_aligned_T, 1, tma_aligned_T),
+    )
+    return fp8_buf, scale_buf
 
-    return fp8_buf.transpose(0, 1), scale_buf.transpose(0, 1)
+
+direct_register_custom_op(  # registers the op called as torch.ops.aphrodite.fused_inv_rope_fp8_quant_kernel
+    op_name="fused_inv_rope_fp8_quant_kernel",
+    op_func=_fused_inv_rope_fp8_quant_kernel_impl,
+    fake_impl=_fused_inv_rope_fp8_quant_kernel_fake,
+)
diff --git a/aphrodite/v1/attention/ops/rocm_aiter_mla_sparse.py b/aphrodite/v1/attention/ops/rocm_aiter_mla_sparse.py
index 2585276672..4962c192ae 100644
--- a/aphrodite/v1/attention/ops/rocm_aiter_mla_sparse.py
+++ b/aphrodite/v1/attention/ops/rocm_aiter_mla_sparse.py
@@ -13,9 +13,6 @@
 from aphrodite.v1.attention.backends.mla.indexer import DeepseekV32IndexerMetadata
 from aphrodite.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 
-if current_platform.is_cuda_alike():
-    from aphrodite import _custom_ops as ops
-
 
 @triton.jit
 def _indexer_k_quant_and_cache_kernel(
@@ -92,7 +89,8 @@ def indexer_k_quant_and_cache_triton(
     # In real layout, we store the first portion as kv cache value
     # and second portion as kv cache scale
     kv_cache = kv_cache.view(num_blocks, -1)
-    kv_cache_value = kv_cache[:, : block_size * head_dim]
+    fp8_dtype = current_platform.fp8_dtype()
+    kv_cache_value = kv_cache[:, : block_size * head_dim].view(fp8_dtype)
     kv_cache_scale = kv_cache[:, block_size * head_dim :].view(torch.float32)
     head_tile_size = head_tile_size // kv_cache.element_size()
     grid = (num_tokens,)
@@ -106,7 +104,7 @@ def indexer_k_quant_and_cache_triton(
         block_size,
         num_tokens,
         head_dim,
-        "NHD",
+        "SHUFFLE",
         block_tile_size,
         head_tile_size,
         IS_FNUZ=current_platform.fp8_dtype() == torch.float8_e4m3fnuz,
@@ -204,7 +202,7 @@ def cp_gather_indexer_k_quant_cache_triton(
         block_table_stride,
         k_cache_value.stride(0),
         k_cache_scale.stride(0),
-        "NHD",
+        "SHUFFLE",
         head_dim,
         block_tile_size,
         head_tile_size,
@@ -309,31 +307,36 @@ def rocm_fp8_paged_mqa_logits(
     from aphrodite._aiter_ops import rocm_aiter_ops
 
     aiter_paged_mqa_logits_module = None
+    # if rocm_aiter_ops.is_enabled():
+    batch_size, next_n, heads, head_dim = q_fp8.shape
+    num_blocks, block_size, _, _ = kv_cache_fp8.shape
+
     if rocm_aiter_ops.is_enabled():
         aiter_paged_mqa_logits_module = paged_mqa_logits_module()
 
     if aiter_paged_mqa_logits_module is not None:
-        deepgemm_fp8_paged_mqa_logits_stage1 = aiter_paged_mqa_logits_module.deepgemm_fp8_paged_mqa_logits_stage1
+        deepgemm_fp8_paged_mqa_logits = aiter_paged_mqa_logits_module.deepgemm_fp8_paged_mqa_logits
         batch_size, next_n, heads, _ = q_fp8.shape
-        out_qk = torch.full(
-            (heads, batch_size * next_n, max_model_len),
+        out_logits = torch.full(
+            [batch_size * next_n, max_model_len],
             float("-inf"),
             device="cuda",
             dtype=torch.float32,
         )
-        # TODO: 1. Replace _stage1 and out_qk.sum with another fused variant;
-        #       2. Remove ChunkQ when AITER PR #2891 merged
-        deepgemm_fp8_paged_mqa_logits_stage1(
+        deepgemm_fp8_paged_mqa_logits(
             q_fp8,
             kv_cache_fp8,
             weights,
-            out_qk,
+            out_logits,
             context_lens,
             block_tables,
             max_model_len,
-            ChunkQ=heads,
+            ChunkK=256,
+            Preshuffle=block_size == 64,
+            KVBlockSize=block_size,
+            WavePerEU=2,
         )
-        return out_qk.sum(dim=0)
+        return out_logits
     else:
         return fp8_paged_mqa_logits_torch(q_fp8, kv_cache_fp8, weights, context_lens, block_tables, max_model_len)
 
@@ -514,7 +517,7 @@ def rocm_aiter_sparse_attn_indexer(
     num_tokens = slot_mapping.shape[0]
     k = k[:num_tokens]
 
-    ops.indexer_k_quant_and_cache(
+    indexer_k_quant_and_cache_triton(
         k,
         kv_cache,
         slot_mapping,
@@ -537,13 +540,13 @@ def rocm_aiter_sparse_attn_indexer(
                 device=k.device,
                 dtype=torch.uint8,
             )
-
-            ops.cp_gather_indexer_k_quant_cache(
+            cp_gather_indexer_k_quant_cache_triton(
                 kv_cache,
                 k_fp8,
                 k_scale,
                 chunk.block_table,
                 chunk.cu_seq_lens,
+                chunk.token_to_seq,
             )
 
             logits = rocm_fp8_mqa_logits(
diff --git a/aphrodite/v1/attention/selector.py b/aphrodite/v1/attention/selector.py
index ec8918f0aa..b56adebcbf 100644
--- a/aphrodite/v1/attention/selector.py
+++ b/aphrodite/v1/attention/selector.py
@@ -80,9 +80,6 @@ def get_attn_backend(
     else:
         block_size = None
 
-    speculative_config = aphrodite_config.speculative_config
-    use_non_causal = speculative_config is not None and speculative_config.method == "dflash"
-
     attn_selector_config = AttentionSelectorConfig(
         head_size=head_size,
         dtype=dtype,
@@ -94,7 +91,7 @@ def get_attn_backend(
         use_mm_prefix=use_mm_prefix,
         use_per_head_quant_scales=use_per_head_quant_scales,
         attn_type=attn_type or AttentionType.DECODER,
-        use_non_causal=use_non_causal,
+        use_non_causal=aphrodite_config.attention_config.use_non_causal,
         use_batch_invariant=envs.APHRODITE_BATCH_INVARIANT,
     )
 
diff --git a/aphrodite/v1/core/kv_cache_coordinator.py b/aphrodite/v1/core/kv_cache_coordinator.py
index c63fe54552..3b5071dd1d 100644
--- a/aphrodite/v1/core/kv_cache_coordinator.py
+++ b/aphrodite/v1/core/kv_cache_coordinator.py
@@ -82,6 +82,7 @@ def get_num_blocks_to_allocate(
         num_encoder_tokens: int,
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -98,6 +99,10 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka target
                 model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, apply the recycling-aware
+                per-request admission cap (SWA / chunked-local). Set only by
+                the full-sequence admission gate; per-step allocation must
+                leave it False so the predictor matches `allocate_new_blocks`.
 
         Returns:
             The number of blocks to allocate.
@@ -108,7 +113,12 @@ def get_num_blocks_to_allocate(
                 # For cross-attention, we issue a single static allocation
                 # of blocks based on the number of encoder input tokens.
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_encoder_tokens, [], 0, num_encoder_tokens
+                    request_id,
+                    num_encoder_tokens,
+                    [],
+                    0,
+                    num_encoder_tokens,
+                    apply_admission_cap=apply_admission_cap,
                 )
             else:
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
@@ -117,6 +127,7 @@ def get_num_blocks_to_allocate(
                     new_computed_blocks[i],
                     total_computed_tokens,
                     num_tokens_main_model,
+                    apply_admission_cap=apply_admission_cap,
                 )
         return num_blocks_to_allocate
 
diff --git a/aphrodite/v1/core/kv_cache_manager.py b/aphrodite/v1/core/kv_cache_manager.py
index 1a627c08b0..869167ed6a 100644
--- a/aphrodite/v1/core/kv_cache_manager.py
+++ b/aphrodite/v1/core/kv_cache_manager.py
@@ -209,43 +209,6 @@ def get_computed_blocks(self, request: Request) -> tuple[KVCacheBlocks, int]:
 
         return self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens
 
-    def can_fit_full_sequence(
-        self,
-        request: Request,
-        num_new_computed_tokens: int = 0,
-        new_computed_blocks: KVCacheBlocks | None = None,
-        num_external_computed_tokens: int = 0,
-        num_encoder_tokens: int = 0,
-    ) -> bool:
-        """Check if the KV cache has enough free blocks to hold the full
-        sequence, accounting for prefix cache hits and sliding window.
-
-        This is used as an admission gate to prevent over-admitting requests
-        when chunked prefill would otherwise only check the first chunk.
-        """
-        if new_computed_blocks is not None:
-            new_computed_block_list = new_computed_blocks.blocks
-        else:
-            new_computed_block_list = self.empty_kv_cache_blocks.blocks
-
-        num_local_computed_tokens = request.num_computed_tokens + num_new_computed_tokens
-        total_computed_tokens = min(
-            num_local_computed_tokens + num_external_computed_tokens,
-            self.max_model_len,
-        )
-        full_num_tokens = min(request.num_tokens, self.max_model_len)
-
-        num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
-            request_id=request.request_id,
-            num_tokens=full_num_tokens,
-            new_computed_blocks=new_computed_block_list,
-            num_encoder_tokens=num_encoder_tokens,
-            total_computed_tokens=total_computed_tokens,
-            num_tokens_main_model=full_num_tokens,
-        )
-
-        return num_blocks_to_allocate <= self.block_pool.get_num_free_blocks()
-
     def allocate_slots(
         self,
         request: Request,
@@ -256,6 +219,7 @@ def allocate_slots(
         num_external_computed_tokens: int = 0,
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
+        full_sequence_must_fit: bool = False,
     ) -> KVCacheBlocks | None:
         """Add slots for a request with new tokens to append.
 
@@ -277,6 +241,10 @@ def allocate_slots(
             num_encoder_tokens: The number of encoder tokens to allocate for
                 cross-attention in encoder-decoder models(e.g., Whisper).
                 For decoder-only models, this should be 0.
+            full_sequence_must_fit: Only allocate blocks if the KV cache has enough
+                free blocks to hold the full sequence, accounting for prefix cache hits
+                and sliding window. Used as an admission gate to prevent over-admitting
+                requests when chunked prefill would otherwise only check the first chunk.
 
         Blocks layout:
         ```
@@ -345,11 +313,25 @@ def allocate_slots(
             num_local_computed_tokens + num_external_computed_tokens,
             self.max_model_len,
         )
+
+        if full_sequence_must_fit:
+            # First check and fail if the full request sequence won't fit.
+            full_num_tokens = min(request.num_tokens, self.max_model_len)
+
+            num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
+                request_id=request.request_id,
+                num_tokens=full_num_tokens,
+                new_computed_blocks=new_computed_block_list,
+                num_encoder_tokens=num_encoder_tokens,
+                total_computed_tokens=total_computed_tokens,
+                num_tokens_main_model=full_num_tokens,
+                apply_admission_cap=True,
+            )
+            if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
+                return None
+
         num_tokens_main_model = total_computed_tokens + num_new_tokens
-        num_tokens_need_slot = min(
-            num_tokens_main_model + num_lookahead_tokens,
-            self.max_model_len,
-        )
+        num_tokens_need_slot = min(num_tokens_main_model + num_lookahead_tokens, self.max_model_len)
 
         # Free the blocks that are skipped during the attention computation
         # (e.g., tokens outside the sliding window).
diff --git a/aphrodite/v1/core/kv_cache_utils.py b/aphrodite/v1/core/kv_cache_utils.py
index f007b6bfa7..5989cb6cab 100644
--- a/aphrodite/v1/core/kv_cache_utils.py
+++ b/aphrodite/v1/core/kv_cache_utils.py
@@ -835,29 +835,44 @@ def get_max_concurrency_for_kv_cache_config(aphrodite_config: AphroditeConfig, k
     return max_concurrency
 
 
-def may_override_num_blocks(aphrodite_config: AphroditeConfig, num_blocks: int, suppress_log: bool = False) -> int:
+def may_override_num_blocks(aphrodite_config: AphroditeConfig, num_blocks: int) -> int:
     """
     Override the number of kv cache blocks if `num_gpu_blocks_override` is set.
+    The override is logged once, at the call site in `get_kv_cache_configs`.
     """
     if aphrodite_config.cache_config.num_gpu_blocks_override is not None:
-        num_gpu_blocks_override = aphrodite_config.cache_config.num_gpu_blocks_override
-        if not suppress_log:
-            logger.info(
-                "Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
-                num_blocks,
-                num_gpu_blocks_override,
-            )
-        num_blocks = num_gpu_blocks_override
+        num_blocks = aphrodite_config.cache_config.num_gpu_blocks_override
 
     return num_blocks
 
 
+def _pool_bytes_per_block(kv_cache_groups: list[KVCacheGroupSpec]) -> int:
+    """
+    Bytes consumed by one block in the worker's shared KV cache pool, mirroring
+    the divisor used by `get_kv_cache_config_from_groups` to convert
+    `available_memory` into `num_blocks`. Used to compute the effective KV cache
+    capacity once `num_gpu_blocks_override` is applied.
+    """
+    if len(kv_cache_groups) == 1 and isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs):
+        return kv_cache_groups[0].kv_cache_spec.page_size_bytes
+    if all(isinstance(g.kv_cache_spec, UniformTypeKVCacheSpecs) for g in kv_cache_groups):
+        # DeepseekV4: shared layout sized by the largest per-page-size bucket.
+        full_mla_spec = cast(UniformTypeKVCacheSpecs, kv_cache_groups[0].kv_cache_spec)
+        layer_tuple_page_bytes = sum(full_mla_spec.get_page_sizes())
+        num_layer_tuples = max(
+            cast(UniformTypeKVCacheSpecs, g.kv_cache_spec).get_num_layer_tuples() for g in kv_cache_groups
+        )
+        return layer_tuple_page_bytes * num_layer_tuples
+    group_size = max(len(g.layer_names) for g in kv_cache_groups)
+    page_size = get_uniform_page_size([g.kv_cache_spec for g in kv_cache_groups])
+    return page_size * group_size
+
+
 def get_num_blocks(
     aphrodite_config: AphroditeConfig,
     num_layers: int,
     available_memory: int,
     page_size: int,
-    suppress_log: bool = False,
 ) -> int:
     """
     Get the number of kv cache blocks.
@@ -867,13 +882,10 @@ def get_num_blocks(
         num_layers: The number of layers
         available_memory: Memory available for KV cache in bytes.
         page_size: The page size of the KV cache.
-        suppress_log: Whether to suppress override log messages. Used when creating a
-            temporary/dummy KV cache config, e.g. during CG memory profiling
     """
     num_blocks = int(available_memory // page_size // num_layers)
     num_blocks = max(num_blocks, 0)
-    num_blocks = may_override_num_blocks(aphrodite_config, num_blocks, suppress_log=suppress_log)
-    return num_blocks
+    return may_override_num_blocks(aphrodite_config, num_blocks)
 
 
 def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int:
@@ -1159,7 +1171,6 @@ def get_kv_cache_config_from_groups(
     aphrodite_config: AphroditeConfig,
     kv_cache_groups: list[KVCacheGroupSpec],
     available_memory: int,
-    suppress_log: bool = False,
 ) -> KVCacheConfig:
     """
     Generate the KV cache configuration from the KV cache groups and spec
@@ -1187,7 +1198,7 @@ def get_kv_cache_config_from_groups(
         # different hidden sizes. Allocate different amount of memory for each
         # layer based on its hidden size.
         num_blocks = available_memory // kv_cache_groups[0].kv_cache_spec.page_size_bytes
-        num_blocks = may_override_num_blocks(aphrodite_config, num_blocks, suppress_log=suppress_log)
+        num_blocks = may_override_num_blocks(aphrodite_config, num_blocks)
         per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
         kv_cache_tensors = [
             KVCacheTensor(
@@ -1215,13 +1226,7 @@ def get_kv_cache_config_from_groups(
 
         page_size = get_uniform_page_size([group.kv_cache_spec for group in kv_cache_groups])
         assert group_size > 0, "group_size must be greater than 0"
-        num_blocks = get_num_blocks(
-            aphrodite_config,
-            group_size,
-            available_memory,
-            page_size,
-            suppress_log=suppress_log,
-        )
+        num_blocks = get_num_blocks(aphrodite_config, group_size, available_memory, page_size)
         kv_cache_tensors = []
         for i in range(group_size):
             shared_by = []
@@ -1289,7 +1294,9 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
                     block_size=spec.block_size,
                     num_kv_heads=spec.num_kv_heads,
                     head_size=spec.head_size,
+                    head_size_v=spec.head_size_v,
                     dtype=spec.dtype,
+                    kv_quant_mode=spec.kv_quant_mode,
                     sliding_window=spec.sliding_window,
                     page_size_padded=spec.page_size_padded,
                 )
@@ -1560,27 +1567,21 @@ def _report_kv_cache_config(aphrodite_config: AphroditeConfig, kv_cache_config:
         aphrodite_config: The global AphroditeConfig
         kv_cache_config: The resolved KV cache configuration
     """
-    min_block_size = min([group.kv_cache_spec.block_size for group in kv_cache_config.kv_cache_groups])
-
-    # Log the KV cache size and maximum concurrency.
-    num_tokens = kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) * min_block_size
-    dcp_size = aphrodite_config.parallel_config.decode_context_parallel_size
-    pcp_size = aphrodite_config.parallel_config.prefill_context_parallel_size
-    if pcp_size * dcp_size > 1:
-        num_tokens *= pcp_size * dcp_size
-        logger.info(
-            "Multiplying the GPU KV cache size by the cp_world_size %d (pcp_world_size %d * dcp_world_size %d).",
-            pcp_size * dcp_size,
-            pcp_size,
-            dcp_size,
-        )
-    num_tokens_str = f"{num_tokens:,}"
-    logger.info_once("GPU KV cache size: %s tokens", num_tokens_str)
-    max_model_len_str = f"{aphrodite_config.model_config.max_model_len:,}"
+    max_model_len = aphrodite_config.model_config.max_model_len
     max_concurrency = get_max_concurrency_for_kv_cache_config(aphrodite_config, kv_cache_config)
+    # GPU KV cache size in tokens = max_concurrency * max_model_len: the total
+    # tokens of context the pool can hold at peak utilization. Sourcing this
+    # from the concurrency calculation handles hybrid layouts correctly: SWA /
+    # chunked-local groups have a per-request block count that's capped by
+    # their window, so a naive `num_blocks // num_groups * block_size` formula
+    # underestimates capacity for these models. DCP/PCP sharding is already
+    # accounted for in each spec's `max_memory_usage_bytes`.
+    num_tokens = int(max_concurrency * max_model_len)
+
+    logger.info_once("GPU KV cache size: %s tokens", f"{num_tokens:,}")
     logger.info_once(
         "Maximum concurrency for %s tokens per request: %.2fx",
-        max_model_len_str,
+        f"{max_model_len:,}",
         max_concurrency,
     )
 
@@ -1825,6 +1826,28 @@ def get_kv_cache_configs(
         _project_kv_cache_groups_to_worker(global_kv_cache_groups, worker_spec) for worker_spec in kv_cache_specs
     ]
 
+    # If `num_gpu_blocks_override` is set, the cache size that will actually
+    # be allocated is decoupled from the profiled `available_memory`:
+    # `may_override_num_blocks` in `get_kv_cache_config_from_groups` clamps
+    # `num_blocks` to the override. Reflect that in `available_memory` here so
+    # auto-fit, the admission check, and the per-worker config builder all
+    # plan against the same effective capacity.
+    override = aphrodite_config.cache_config.num_gpu_blocks_override
+    if override is not None:
+        adjusted_memory: list[int] = []
+        for groups, avail_mem in zip(projected_groups_per_worker, available_memory):
+            if not groups:
+                adjusted_memory.append(avail_mem)
+                continue
+            bytes_per_block = _pool_bytes_per_block(groups)
+            logger.info(
+                "Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
+                avail_mem // bytes_per_block,
+                override,
+            )
+            adjusted_memory.append(override * bytes_per_block)
+        available_memory = adjusted_memory
+
     if aphrodite_config.model_config.original_max_model_len == -1:
         _auto_fit_max_model_len(aphrodite_config, projected_groups_per_worker, available_memory)
 
diff --git a/aphrodite/v1/core/sched/output.py b/aphrodite/v1/core/sched/output.py
index 78881a1c9d..babf7acfad 100644
--- a/aphrodite/v1/core/sched/output.py
+++ b/aphrodite/v1/core/sched/output.py
@@ -38,6 +38,7 @@ class NewRequestData:
     num_computed_tokens: int
     lora_request: LoRARequest | None
     prompt_embeds: "torch.Tensor | None" = None
+    prompt_is_token_ids: list[bool] | None = None
 
     # Only used for v2 model runner.
     prefill_token_ids: list[int] | None = None
@@ -59,6 +60,7 @@ def from_request(
             num_computed_tokens=request.num_computed_tokens,
             lora_request=request.lora_request,
             prompt_embeds=request.prompt_embeds,
+            prompt_is_token_ids=request.prompt_is_token_ids,
             prefill_token_ids=prefill_token_ids,
         )
 
diff --git a/aphrodite/v1/core/sched/scheduler.py b/aphrodite/v1/core/sched/scheduler.py
index 94a1235902..4f29915f1d 100644
--- a/aphrodite/v1/core/sched/scheduler.py
+++ b/aphrodite/v1/core/sched/scheduler.py
@@ -670,17 +670,6 @@ def schedule(self) -> SchedulerOutput:
                 if self.is_encoder_decoder and request.has_encoder_inputs and encoder_inputs_to_schedule:
                     num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule)
 
-                if self.scheduler_reserve_full_isl and not self.kv_cache_manager.can_fit_full_sequence(
-                    request,
-                    num_new_computed_tokens=num_new_local_computed_tokens,
-                    new_computed_blocks=new_computed_blocks,
-                    num_external_computed_tokens=num_external_computed_tokens,
-                    num_encoder_tokens=num_encoder_tokens,
-                ):
-                    if request.has_encoder_inputs:
-                        self.encoder_cache_manager.free(request)
-                    break
-
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
@@ -690,6 +679,7 @@ def schedule(self) -> SchedulerOutput:
                     num_external_computed_tokens=num_external_computed_tokens,
                     delay_cache_blocks=load_kv_async,
                     num_encoder_tokens=num_encoder_tokens,
+                    full_sequence_must_fit=self.scheduler_reserve_full_isl,
                 )
 
                 if new_blocks is None:
@@ -1313,7 +1303,7 @@ def update_from_output(
                     stopped_preempted_reqs.add(request)
 
             # Extract sample logprobs if needed.
-            if request.sampling_params is not None and request.sampling_params.logprobs is not None and logprobs:
+            if request.sampling_params is not None and request.sampling_params.num_logprobs is not None and logprobs:
                 new_logprobs = logprobs.slice_request(req_index, len(new_token_ids))
 
             if num_nans_in_logits is not None and req_id in num_nans_in_logits:
diff --git a/aphrodite/v1/core/single_type_kv_cache_manager.py b/aphrodite/v1/core/single_type_kv_cache_manager.py
index 71c8d2e6dd..162c754d9b 100644
--- a/aphrodite/v1/core/single_type_kv_cache_manager.py
+++ b/aphrodite/v1/core/single_type_kv_cache_manager.py
@@ -92,6 +92,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -107,13 +108,16 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka target
                 model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, clamp `num_required_blocks` by
+                `_max_admission_blocks_per_request` for recycling-aware specs
+                (SWA, chunked-local).
 
         Returns:
             The number of blocks to allocate.
         """
 
         num_required_blocks = cdiv(num_tokens, self.block_size)
-        if self._max_admission_blocks_per_request is not None:
+        if apply_admission_cap and self._max_admission_blocks_per_request is not None:
             # Recycling-aware specs (SWA, chunked-local) cap the per-request
             # reservation here so admission matches the startup pool sizer
             # (`SlidingWindowSpec.max_admission_blocks_per_request` / its
@@ -842,6 +846,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if len(new_computed_blocks) > 0 and new_computed_blocks[-1].block_hash in self.cached_blocks_this_step:
@@ -861,6 +866,7 @@ def get_num_blocks_to_allocate(
                 new_computed_blocks,
                 total_computed_tokens,
                 num_tokens_main_model,
+                apply_admission_cap=apply_admission_cap,
             )
         else:
             # We don't allocate blocks for lookahead tokens in align mode, because if
diff --git a/aphrodite/v1/engine/__init__.py b/aphrodite/v1/engine/__init__.py
index 3b06aa5ddc..dc0ebbff69 100644
--- a/aphrodite/v1/engine/__init__.py
+++ b/aphrodite/v1/engine/__init__.py
@@ -94,6 +94,12 @@ class EngineCoreRequest(
     data_parallel_rank: int | None
     prompt_embeds: torch.Tensor | None = None
 
+    # Per-position mask for mixed-mode inputs (e.g. chat completion with
+    # prompt_embeds content parts). `True` means the position is a real
+    # token ID; `False` means the position uses a pre-computed entry from
+    # `prompt_embeds`. `None` for pure-tokens and pure-embeds requests.
+    prompt_is_token_ids: list[bool] | None = None
+
     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
     client_index: int = 0
@@ -114,6 +120,7 @@ class EngineCoreRequest(
     external_req_id: str | None = None
 
     reasoning_ended: bool | None = None
+    reasoning_parser_kwargs: dict[str, Any] | None = None
 
     @property
     def params(self) -> SamplingParams | PoolingParams:
diff --git a/aphrodite/v1/engine/async_llm.py b/aphrodite/v1/engine/async_llm.py
index 442570579a..891f9bbbb0 100644
--- a/aphrodite/v1/engine/async_llm.py
+++ b/aphrodite/v1/engine/async_llm.py
@@ -290,6 +290,7 @@ async def add_request(
         data_parallel_rank: int | None = None,
         prompt_text: str | None = None,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> RequestOutputCollector:
         """Add new request to the AsyncLLM."""
 
@@ -306,7 +307,7 @@ async def add_request(
             )
 
         if isinstance(prompt, AsyncGenerator):
-            if reasoning_ended is not None:
+            if reasoning_ended is not None or reasoning_parser_kwargs is not None:
                 raise NotImplementedError
 
             # Streaming input case.
@@ -354,6 +355,8 @@ async def add_request(
 
         if reasoning_ended is not None:
             request.reasoning_ended = reasoning_ended
+        if reasoning_parser_kwargs is not None:
+            request.reasoning_parser_kwargs = reasoning_parser_kwargs
 
         self.input_processor.assign_request_id(request)
 
@@ -518,6 +521,7 @@ async def generate(
         priority: int = 0,
         data_parallel_rank: int | None = None,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -547,6 +551,7 @@ async def generate(
                 data_parallel_rank=data_parallel_rank,
                 prompt_text=prompt_text,
                 reasoning_ended=reasoning_ended,
+                reasoning_parser_kwargs=reasoning_parser_kwargs,
             )
 
             # The output_handler task pushes items into the queue.
diff --git a/aphrodite/v1/engine/core.py b/aphrodite/v1/engine/core.py
index 24baa4a5b2..4ed9240a6a 100644
--- a/aphrodite/v1/engine/core.py
+++ b/aphrodite/v1/engine/core.py
@@ -1455,7 +1455,8 @@ def engine_idle_callback(engine: "EngineCoreProc", future: Future[Any]) -> None:
 
         pause_state = PauseState.PAUSED_ALL if mode == "keep" else PauseState.PAUSED_NEW
         self.scheduler.set_pause_state(pause_state)
-        if not self.has_work():
+
+        if self._pause_complete():
             if clear_cache:
                 self._reset_caches()
             return None
@@ -1464,6 +1465,13 @@ def engine_idle_callback(engine: "EngineCoreProc", future: Future[Any]) -> None:
         self._idle_state_callbacks.append(partial(engine_idle_callback, future=future))
         return future
 
+    def _pause_complete(self) -> bool:
+        """Returns True if the pause has fully completed and the caller can
+        return ``None`` synchronously; False if the pause is still pending
+        and the caller should register an idle-state callback to finish it.
+        """
+        return not self.has_work()
+
     def _send_finish_outputs_to_client(
         self, req_ids: list[str], client_index: int, finish_reason: FinishReason
     ) -> None:
@@ -1510,6 +1518,14 @@ def __init__(
         self.current_wave = 0
         self.last_counts = (0, 0)
 
+        # Two-phase pause protocol state. When pending_pause is True, the
+        # engine keeps stepping (dummy batches) while waiting for all DP
+        # ranks to also set pending_pause. Once all ranks agree via
+        # all-reduce, ignore_start_dp_wave is set so that stale
+        # START_DP_WAVE messages cannot re-wake the engines.
+        self.pending_pause = False
+        self.ignore_start_dp_wave = False
+
         from aphrodite.distributed.elastic_ep.elastic_state import ElasticEPScalingState
 
         self.eep_scaling_state: ElasticEPScalingState | None = None
@@ -1539,6 +1555,7 @@ def _init_data_parallel(self, aphrodite_config: AphroditeConfig):
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
         self.dp_rank = dp_rank
+        self.dp_size = dp_size
         dp_group, dp_store = parallel_config.stateless_init_dp_group(return_store=True)
         self.dp_group, self.dp_store = dp_group, dp_store
 
@@ -1547,30 +1564,76 @@ def shutdown(self):
         if dp_group := getattr(self, "dp_group", None):
             stateless_destroy_torch_distributed_process_group(dp_group)
 
+    def _pause_complete(self) -> bool:
+        """Two-phase DP-aware pause.
+        Phase 1: Set local pause state and ``pending_pause`` flag. If the
+        engines are idle, kick-start them by setting ``engines_running`` to
+        True so ranks enter the stepping loop and reach the all-reduce
+        consensus checkpoint in ``_has_global_unfinished_reqs``.
+        Phase 2 (in ``_has_global_unfinished_reqs``): Once the all-reduce
+        confirms that **all** ranks have ``pending_pause`` set, collectively
+        stop stepping and set ``ignore_start_dp_wave`` so that stale
+        ``START_DP_WAVE`` messages cannot re-wake any engine.
+        """
+        self.pending_pause = True
+        self.engines_running = True
+
+        return False
+
     def add_request(self, request: Request, request_wave: int = 0):
         super().add_request(request, request_wave)
         if self.has_coordinator and request_wave != self.current_wave:
             if request_wave > self.current_wave:
                 self.current_wave = request_wave
             elif not self.engines_running and self.scheduler.pause_state == PauseState.UNPAUSED:
-                self.engines_running = True
                 # Request received for an already-completed wave, notify
                 # front-end that we need to start the next one.
+                self.engines_running = True
                 self.output_queue.put_nowait((-1, EngineCoreOutputs(start_wave=self.current_wave)))
 
     def resume_scheduler(self):
+        if self.pending_pause or (self.engines_running and self.ignore_start_dp_wave):
+            raise RuntimeError(
+                "resume_scheduler called while pause is still in "
+                "flight. Wait for the pause future to resolve before "
+                "resuming."
+            )
+        if self.engines_running:
+            logger.debug("Resume called while engines are not paused, ignoring.")
+            return
+
         super().resume_scheduler()
-        if self.has_coordinator and not self.engines_running and self.scheduler.has_unfinished_requests():
-            # Wake up other DP engines.
-            self.output_queue.put_nowait((-1, EngineCoreOutputs(start_wave=self.current_wave)))
+        self.ignore_start_dp_wave = False
+
+        # Barrier: wait for all DP ranks to have resumed (and cleared
+        # ignore_start_dp_wave) before any rank starts stepping. Uses
+        # the existing all-reduce which is safe because engines are
+        # stopped.
+        has_global_unfinished = ParallelConfig.has_unfinished_dp(
+            self.dp_group, self.scheduler.has_unfinished_requests()
+        )
+
+        if has_global_unfinished:
+            self.engines_running = True
+
+    def barrier(self):
+        """Blocking barrier on the DP process group (test-only utility)."""
+        import torch.distributed as dist
+
+        dist.barrier(group=self.dp_group)
 
     def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None:
         if request_type == EngineCoreRequestType.START_DP_WAVE:
+            if self.ignore_start_dp_wave:
+                return
             new_wave, exclude_eng_index = request
             if exclude_eng_index != self.engine_index and (new_wave >= self.current_wave):
                 self.current_wave = new_wave
                 if not self.engines_running:
-                    logger.debug("EngineCore starting idle loop for wave %d.", new_wave)
+                    logger.debug(
+                        "EngineCore starting idle loop for wave %d.",
+                        new_wave,
+                    )
                     self.engines_running = True
         else:
             super()._handle_client_request(request_type, request)
@@ -1644,7 +1707,18 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
         if self.step_counter % 32 != 0:
             return True
 
-        return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished)
+        has_unfinished, pause_consensus = ParallelConfig.sync_dp_state(
+            self.dp_group,
+            has_unfinished=local_unfinished,
+            pending_pause=self.pending_pause,
+        )
+
+        if pause_consensus:
+            self.ignore_start_dp_wave = True
+            self.pending_pause = False
+            logger.debug("DP pause consensus reached, ignoring START_DP_WAVE.")
+
+        return has_unfinished
 
     def reinitialize_distributed(self, reconfig_request: ReconfigureDistributedRequest) -> None:
         from copy import deepcopy
diff --git a/aphrodite/v1/engine/input_processor.py b/aphrodite/v1/engine/input_processor.py
index 986c9cdea1..68877d8582 100644
--- a/aphrodite/v1/engine/input_processor.py
+++ b/aphrodite/v1/engine/input_processor.py
@@ -270,11 +270,13 @@ def process_inputs(
 
         # Mypy can be conservative for TypedDict unions; normalize access.
         if decoder_inputs["type"] == "embeds":
-            prompt_token_ids = None
             prompt_embeds = decoder_inputs["prompt_embeds"]
+            prompt_token_ids = decoder_inputs.get("prompt_token_ids")
+            prompt_is_token_ids = decoder_inputs.get("is_token_ids")
         else:
             prompt_token_ids = decoder_inputs["prompt_token_ids"]
             prompt_embeds = None
+            prompt_is_token_ids = None
 
         sampling_params = None
         pooling_params = None
@@ -335,6 +337,7 @@ def process_inputs(
             request_id=request_id,
             prompt_token_ids=prompt_token_ids,
             prompt_embeds=prompt_embeds,
+            prompt_is_token_ids=prompt_is_token_ids,
             mm_features=mm_features,
             sampling_params=sampling_params,
             pooling_params=pooling_params,
diff --git a/aphrodite/v1/engine/logprobs.py b/aphrodite/v1/engine/logprobs.py
index 46930b20e0..c7a903c32f 100644
--- a/aphrodite/v1/engine/logprobs.py
+++ b/aphrodite/v1/engine/logprobs.py
@@ -47,7 +47,7 @@ def from_new_request(
     ) -> "LogprobsProcessor":
         sampling_params = request.sampling_params
         assert sampling_params is not None
-        num_logprobs = sampling_params.logprobs
+        num_logprobs = sampling_params.num_logprobs
         num_prompt_logprobs = sampling_params.prompt_logprobs
         return cls(
             tokenizer=tokenizer,
diff --git a/aphrodite/v1/kv_cache_interface.py b/aphrodite/v1/kv_cache_interface.py
index 455d2b6c9c..a2a21e03ff 100644
--- a/aphrodite/v1/kv_cache_interface.py
+++ b/aphrodite/v1/kv_cache_interface.py
@@ -147,6 +147,10 @@ def page_size_bytes(self) -> int:
 
     @property
     def real_page_size_bytes(self) -> int:
+        if self.kv_quant_mode.is_nvfp4:
+            # Packed layout: fp4 data + fp8 block scales per head.
+            full_dim = nvfp4_kv_cache_full_dim(self.head_size)
+            return 2 * self.block_size * self.num_kv_heads * full_dim * get_dtype_size(self.dtype)
         return 2 * self.block_size * self.num_kv_heads * self.head_size * get_dtype_size(self.dtype)
 
 
diff --git a/aphrodite/v1/kv_offload/abstract.py b/aphrodite/v1/kv_offload/abstract.py
deleted file mode 100644
index 209d729a15..0000000000
--- a/aphrodite/v1/kv_offload/abstract.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-OffloadingManager class for managing KV data offloading in Aphrodite v1
-
-This class runs in the scheduler, tracks which blocks are offloaded
-and their address.
-
-The class provides the following primitives:
-    lookup() - check whether a single block is offloaded and ready.
-    prepare_load() - prepare given blocks to be read.
-        The given blocks will be protected from eviction.
-        This function returns a LoadSpec which encapsulates
-        information required for performing the load.
-    touch() - marks the give blocks as recently used. Can be used
-        to track block's LRU. This function is separated from the
-        prepare_load function to allow setting block recency even
-        for blocks which do not need reading from the cache, such as
-        blocks that are cached by the GPU prefix cache.
-    complete_load() - mark blocks which were previously prepared to be
-        loaded as done loading. This is to re-allow their eviction.
-    prepare_store() - prepare the given blocks to be written.
-        Returns a StoreSpec encapsulating offloading information,
-        as well as a list of blocks that were evicted as a result.
-    complete_store() - marks a previous store as completed.
-        Following this call, the given blocks will become loadable.
-"""
-
-from abc import ABC, abstractmethod
-from collections.abc import Iterable
-from dataclasses import dataclass
-from typing import Any, NewType
-
-# `OffloadKey` identifies an offloaded block. It combines a block hash with
-# its KV cache group index, encoded as raw bytes to avoid tuple GC overhead.
-# Use the helper functions below to construct / decompose keys.
-OffloadKey = NewType("OffloadKey", bytes)
-
-
-def make_offload_key(block_hash: bytes, group_idx: int) -> OffloadKey:
-    """Pack a block hash and group index into an `OffloadKey`."""
-    return OffloadKey(block_hash + group_idx.to_bytes(4, "big", signed=False))
-
-
-def get_offload_block_hash(key: OffloadKey) -> bytes:
-    """Extract the block hash from an `OffloadKey`."""
-    return key[:-4]
-
-
-def get_offload_group_idx(key: OffloadKey) -> int:
-    """Extract the group index from an `OffloadKey`."""
-    return int.from_bytes(key[-4:], "big", signed=False)
-
-
-@dataclass
-class ReqContext:
-    kv_transfer_params: dict[str, Any] | None = None
-
-
-class LoadStoreSpec(ABC):
-    """
-    Abstract metadata that encapsulates information allowing a worker
-    to load, and optionally also to store, blocks of KV data.
-    """
-
-    @staticmethod
-    @abstractmethod
-    def medium() -> str:
-        """
-        Returns a string representation of the medium type
-        this store/load targets.
-        """
-        pass
-
-
-@dataclass
-class PrepareStoreOutput:
-    keys_to_store: list[OffloadKey]
-    store_spec: LoadStoreSpec
-    evicted_keys: list[OffloadKey]
-
-
-@dataclass
-class OffloadingEvent:
-    keys: list[OffloadKey]
-    medium: str
-    # True if blocks are removed, False if stored
-    removed: bool
-
-
-class OffloadingManager(ABC):
-    @abstractmethod
-    def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
-        """
-        Checks whether a single block is offloaded and ready to be read.
-
-        Args:
-            key: the key identifying the block to lookup.
-            req_context: per-request context (e.g. kv_transfer_params).
-
-        Returns:
-            True if the block is offloaded and ready, False if not,
-            or None if the lookup should be retried later.
-            Returning None will delay the request handling by the Aphrodite
-            scheduler.
-        """
-        pass
-
-    @abstractmethod
-    def prepare_load(
-        self,
-        keys: Iterable[OffloadKey],
-        req_context: ReqContext,
-    ) -> LoadStoreSpec:
-        """
-        Prepare the given blocks to be read.
-        The given blocks will be protected from eviction until
-        complete_load is called.
-        It assumes all given blocks are offloaded.
-
-        Args:
-            keys: the keys identifying the blocks.
-            req_context: per-request context (e.g. kv_transfer_params).
-
-        Returns:
-            A LoadStoreSpec that can be used by a worker to locate and load
-            the actual offloaded KV data.
-        """
-        pass
-
-    def touch(self, keys: Iterable[OffloadKey]):
-        """
-        Mark the given blocks as recently used.
-        This could in practice mean moving them to the end of an LRU list.
-
-        Args:
-            keys: the keys identifying the blocks.
-        """
-        return
-
-    def complete_load(self, keys: Iterable[OffloadKey]):
-        """
-        Marks previous blocks that were prepared to load as done loading.
-
-        Args:
-            keys: the keys identifying the blocks.
-        """
-        return
-
-    @abstractmethod
-    def prepare_store(
-        self,
-        keys: Iterable[OffloadKey],
-        req_context: ReqContext,
-    ) -> PrepareStoreOutput | None:
-        """
-        Prepare the given blocks to be offloaded.
-        The given blocks will be protected from eviction until
-        complete_store is called.
-
-        Args:
-            keys: the keys identifying the blocks.
-            req_context: per-request context (e.g. kv_transfer_params).
-
-        Returns:
-            A PrepareStoreOutput indicating which blocks need storing,
-            where to store them (LoadStoreSpec), and list of blocks that
-            were evicted as a result.
-            None is returned if the blocks cannot be stored.
-        """
-        pass
-
-    def complete_store(self, keys: Iterable[OffloadKey], success: bool = True):
-        """
-        Marks blocks which were previously prepared to be stored, as stored.
-        Following this call, the blocks become loadable.
-        If if_success is False, blocks that were not marked as stored will be
-        removed.
-
-        Args:
-            keys: the keys identifying the blocks.
-            success: whether the blocks were stored successfully.
-        """
-        return
-
-    def take_events(self) -> Iterable[OffloadingEvent]:
-        """
-        Take the offloading events from the manager.
-
-        Yields:
-            New OffloadingEvents collected since the last call.
-        """
-        return ()
-
-    def shutdown(self) -> None:
-        """Shutdown the manager and release any resources."""
-        return
diff --git a/aphrodite/v1/kv_offload/base.py b/aphrodite/v1/kv_offload/base.py
new file mode 100644
index 0000000000..6f1fad43e9
--- /dev/null
+++ b/aphrodite/v1/kv_offload/base.py
@@ -0,0 +1,371 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Core abstractions for KV cache offloading in Aphrodite v1.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections.abc import Collection, Iterable, Iterator, Sequence
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, NewType
+
+import numpy as np
+import torch
+
+from aphrodite.logger import init_logger
+
+if TYPE_CHECKING:
+    from aphrodite.config import AphroditeConfig
+    from aphrodite.v1.kv_cache_interface import KVCacheConfig
+    from aphrodite.v1.kv_offload.worker.worker import OffloadingHandler
+
+# `OffloadKey` identifies an offloaded block. It combines a block hash with
+# its KV cache group index, encoded as raw bytes to avoid tuple GC overhead.
+# Use the helper functions below to construct / decompose keys.
+OffloadKey = NewType("OffloadKey", bytes)
+
+logger = init_logger(__name__)
+
+
+def make_offload_key(block_hash: bytes, group_idx: int) -> OffloadKey:
+    """Pack a block hash and group index into an `OffloadKey`."""
+    return OffloadKey(block_hash + group_idx.to_bytes(4, "big", signed=False))
+
+
+def get_offload_block_hash(key: OffloadKey) -> bytes:
+    """Extract the block hash from an `OffloadKey`."""
+    return key[:-4]
+
+
+def get_offload_group_idx(key: OffloadKey) -> int:
+    """Extract the group index from an `OffloadKey`."""
+    return int.from_bytes(key[-4:], "big", signed=False)
+
+
+@dataclass
+class ReqContext:
+    kv_transfer_params: dict[str, Any] | None = None
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, blocks of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    keys_to_store: list[OffloadKey]
+    store_spec: LoadStoreSpec
+    evicted_keys: list[OffloadKey]
+
+
+@dataclass
+class OffloadingEvent:
+    keys: list[OffloadKey]
+    medium: str
+    # True if blocks are removed, False if stored
+    removed: bool
+
+
+"""
+OffloadingManager class for managing KV data offloading in Aphrodite v1
+This class runs in the scheduler, tracks which blocks are offloaded
+and their address.
+The class provides the following primitives:
+    lookup() - check whether a single block is offloaded and ready.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a LoadStoreSpec which encapsulates
+        information required for performing the load.
+    touch() - marks the given blocks as recently used. Can be used
+        to track block's LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a PrepareStoreOutput encapsulating offloading information,
+        as well as a list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+
+class OffloadingManager(ABC):
+    @abstractmethod
+    def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
+        """
+        Checks whether a single block is offloaded and ready to be read.
+        Args:
+            key: the key identifying the block to lookup.
+            req_context: per-request context (e.g. kv_transfer_params).
+        Returns:
+            True if the block is offloaded and ready, False if not,
+            or None if the lookup should be retried later.
+            Returning None will delay the request handling by the Aphrodite
+            scheduler.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(
+        self,
+        keys: Collection[OffloadKey],
+        req_context: ReqContext,
+    ) -> LoadStoreSpec:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+        Args:
+            keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
+        Returns:
+            A LoadStoreSpec that can be used by a worker to locate and load
+            the actual offloaded KV data.
+        """
+        pass
+
+    def touch(self, keys: Collection[OffloadKey]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+        Args:
+            keys: the keys identifying the blocks.
+        """
+        return
+
+    def complete_load(self, keys: Collection[OffloadKey]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+        Args:
+            keys: the keys identifying the blocks.
+        """
+        return
+
+    @abstractmethod
+    def prepare_store(
+        self,
+        keys: Collection[OffloadKey],
+        req_context: ReqContext,
+    ) -> PrepareStoreOutput | None:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+        Args:
+            keys: the keys identifying the blocks.
+            req_context: per-request context (e.g. kv_transfer_params).
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    def complete_store(self, keys: Collection[OffloadKey], success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If success is False, blocks that were not marked as stored will be
+        removed.
+        Args:
+            keys: the keys identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        return
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        """
+        Take the offloading events from the manager.
+        Yields:
+            New OffloadingEvents collected since the last call.
+        """
+        return ()
+
+    def shutdown(self) -> None:
+        """Shutdown the manager and release any resources."""
+        return
+
+
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+
+
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    If there are multiple KV groups, the blocks are expected to be
+    ordered by the group index.
+    In that case, group_sizes[i] determines the number of blocks
+    per the i-th KV group, and thus sum(group_sizes) == len(block_ids).
+    A single KV group is expressed as group_sizes=[len(block_ids)].
+    If block_indices is given, each group (determined by group_sizes) of block IDs
+    will correspond to logically contiguous blocks, e.g. blocks 5-10 of some request.
+    block_indices[i] will represent the block index of the first block in group #i.
+    Thus, len(block_indices) == len(group_sizes) = number of KV cache groups.
+    This information is required in order to support off/loading from offloaded blocks
+    which are larger than GPU blocks.
+    In such cases, the first GPU block per each group may be unaligned to the offloaded
+    block size, and so knowing block_indices[i] allows the worker to correctly
+    skip part of the first matching offloaded block.
+    """
+
+    def __init__(
+        self,
+        block_ids: list[int],
+        group_sizes: Sequence[int],
+        block_indices: Sequence[int],
+    ):
+        super().__init__(block_ids)
+        assert sum(group_sizes) == len(block_ids)
+        assert len(block_indices) == len(group_sizes)
+        self.group_sizes: Sequence[int] = group_sizes
+        self.block_indices: Sequence[int] = block_indices
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+@dataclass
+class CanonicalKVCacheTensor:
+    """
+    A canonicalized KV cache tensor whose first dimension is num_blocks.
+    For attention backends where the raw tensor has num_blocks at a
+    non-leading physical dimension (e.g. FlashAttention's
+    (2, num_blocks, ...) layout), the tensor is split so that each
+    resulting CanonicalKVCacheTensor starts with (num_blocks, ...).
+    """
+
+    # The KV cache tensor with shape (num_blocks, ...)
+    tensor: torch.Tensor
+    # The (possibly padded) page size per block in bytes
+    page_size_bytes: int
+
+
+@dataclass
+class CanonicalKVCacheRef:
+    """
+    Per-layer (or group of layers) reference to a specific (by index)
+    CanonicalKVCacheTensor and records the un-padded page size used by that layer.
+    """
+
+    # Index into the list of CanonicalKVCacheTensor objects
+    tensor_idx: int
+    # The un-padded page size per block in bytes
+    page_size_bytes: int
+
+
+@dataclass
+class CanonicalKVCaches:
+    """
+    Canonicalized block-level representation of the KV caches.
+    Composed of:
+        - Unique list of KV cache data tensors,
+          each with shape (num_blocks, page_size_in_bytes) and int8 dtype.
+        - Per-group data references of the tensors.
+          i.e. how each KV cache group maps to the tensors.
+    """
+
+    # Ordered list of unique block tensors, each with shape
+    # (num_blocks, ...).
+    tensors: list[CanonicalKVCacheTensor]
+    # Per-KV-cache-group list of data references that map each layer
+    # in the group to the appropriate entry in the tensors list.
+    group_data_refs: list[list[CanonicalKVCacheRef]]
+
+
+class OffloadingSpec(ABC):
+    """Spec for an offloading connector"""
+
+    def __init__(self, aphrodite_config: AphroditeConfig, kv_cache_config: KVCacheConfig):
+        logger.warning(
+            "Initializing OffloadingSpec. This API is experimental and "
+            "subject to change in the future as we iterate the design."
+        )
+        self.aphrodite_config = aphrodite_config
+        self.kv_cache_config = kv_cache_config
+
+        kv_transfer_config = aphrodite_config.kv_transfer_config
+        assert kv_transfer_config is not None
+        self.extra_config = kv_transfer_config.kv_connector_extra_config
+
+        # block size used by Aphrodite for hashing request tokens for the sake
+        # of enabling prefix caching
+        self.hash_block_size = aphrodite_config.cache_config.block_size
+        # gpu block size per group
+        self.gpu_block_size: tuple[int, ...] = tuple(
+            kv_cache_group.kv_cache_spec.block_size for kv_cache_group in kv_cache_config.kv_cache_groups
+        )
+
+        for block_size in self.gpu_block_size:
+            assert block_size % self.hash_block_size == 0, (
+                f"gpu_block_size={block_size} not divisible by "
+                f"hash_block_size={self.hash_block_size}. "
+                f"Hybrid models (e.g. Mamba+Attention) need "
+                f"--enable-prefix-caching to align block sizes."
+            )
+
+        # offloaded_block_size / gpu_block_size
+        self.block_size_factor: int = 1
+
+        offloaded_block_size = self.extra_config.get("block_size")
+        if offloaded_block_size is not None:
+            offloaded_block_size_int = int(offloaded_block_size)
+            gpu_block_sizes = set(self.gpu_block_size)
+            assert len(gpu_block_sizes) == 1, (
+                "If 'block_size' is specified in kv_connector_extra_config, "
+                "there must be at least one KV cache group, "
+                "and all groups must have the same block size."
+            )
+            gpu_block_size = gpu_block_sizes.pop()
+
+            assert offloaded_block_size_int % gpu_block_size == 0
+            self.block_size_factor = offloaded_block_size_int // gpu_block_size
+
+    @abstractmethod
+    def get_manager(self) -> OffloadingManager:
+        """
+        Get an OffloadingManager that will be used
+        by the scheduler-side offloading connector to track
+        offloaded blocks and manage evictions.
+        """
+        pass
+
+    @abstractmethod
+    def get_handlers(
+        self, kv_caches: CanonicalKVCaches
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
+        """
+        Get offloading handlers along with their respective src and dst types.
+        Args:
+            kv_caches: Canonicalized KV caches.
+        Yields:
+            Tuples of (src_type, dst_type, offloading_handler).
+        """
+        pass
diff --git a/aphrodite/v1/kv_offload/cpu/common.py b/aphrodite/v1/kv_offload/cpu/common.py
new file mode 100644
index 0000000000..06e5959bf5
--- /dev/null
+++ b/aphrodite/v1/kv_offload/cpu/common.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from aphrodite.v1.kv_offload.base import BlockIDsLoadStoreSpec
+
+
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
diff --git a/aphrodite/v1/kv_offload/worker/cpu_gpu.py b/aphrodite/v1/kv_offload/cpu/gpu_worker.py
similarity index 98%
rename from aphrodite/v1/kv_offload/worker/cpu_gpu.py
rename to aphrodite/v1/kv_offload/cpu/gpu_worker.py
index 2edc52c0d9..c6fccaddf3 100644
--- a/aphrodite/v1/kv_offload/worker/cpu_gpu.py
+++ b/aphrodite/v1/kv_offload/cpu/gpu_worker.py
@@ -11,9 +11,13 @@
 from aphrodite.logger import init_logger
 from aphrodite.utils.math_utils import cdiv
 from aphrodite.utils.platform_utils import is_pin_memory_available
+from aphrodite.v1.kv_offload.base import (
+    BlockIDsLoadStoreSpec,
+    CanonicalKVCacheRef,
+    CanonicalKVCaches,
+    GPULoadStoreSpec,
+)
 from aphrodite.v1.kv_offload.cpu.shared_offload_region import SharedOffloadRegion
-from aphrodite.v1.kv_offload.mediums import BlockIDsLoadStoreSpec, GPULoadStoreSpec
-from aphrodite.v1.kv_offload.spec import CanonicalKVCacheRef, CanonicalKVCaches
 from aphrodite.v1.kv_offload.worker.worker import (
     OffloadingHandler,
     TransferResult,
diff --git a/aphrodite/v1/kv_offload/cpu/manager.py b/aphrodite/v1/kv_offload/cpu/manager.py
index 958c39fa39..138449db4d 100644
--- a/aphrodite/v1/kv_offload/cpu/manager.py
+++ b/aphrodite/v1/kv_offload/cpu/manager.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
+from collections.abc import Collection, Iterable
 from typing import Literal
 
-from aphrodite.v1.kv_offload.abstract import (
+from aphrodite.v1.kv_offload.base import (
     LoadStoreSpec,
     OffloadingEvent,
     OffloadingManager,
@@ -11,10 +11,10 @@
     PrepareStoreOutput,
     ReqContext,
 )
-from aphrodite.v1.kv_offload.cpu.policies.abstract import BlockStatus, CachePolicy
+from aphrodite.v1.kv_offload.cpu.common import CPULoadStoreSpec
 from aphrodite.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
+from aphrodite.v1.kv_offload.cpu.policies.base import BlockStatus, CachePolicy
 from aphrodite.v1.kv_offload.cpu.policies.lru import LRUCachePolicy
-from aphrodite.v1.kv_offload.mediums import CPULoadStoreSpec
 
 _CACHE_POLICIES: dict[str, type[CachePolicy]] = {
     "lru": LRUCachePolicy,
@@ -87,7 +87,7 @@ def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
 
     def prepare_load(
         self,
-        keys: Iterable[OffloadKey],
+        keys: Collection[OffloadKey],
         req_context: ReqContext,
     ) -> LoadStoreSpec:
         blocks = []
@@ -99,10 +99,10 @@ def prepare_load(
             blocks.append(block)
         return self._get_load_store_spec(keys, blocks)
 
-    def touch(self, keys: Iterable[OffloadKey]) -> None:
+    def touch(self, keys: Collection[OffloadKey]) -> None:
         self._policy.touch(keys)
 
-    def complete_load(self, keys: Iterable[OffloadKey]) -> None:
+    def complete_load(self, keys: Collection[OffloadKey]) -> None:
         for key in keys:
             block = self._policy.get(key)
             assert block is not None, f"Block {key!r} not found"
@@ -111,13 +111,11 @@ def complete_load(self, keys: Iterable[OffloadKey]) -> None:
 
     def prepare_store(
         self,
-        keys: Iterable[OffloadKey],
+        keys: Collection[OffloadKey],
         req_context: ReqContext,
     ) -> PrepareStoreOutput | None:
-        keys_list = list(keys)
-
         # filter out blocks that are already stored
-        keys_to_store = [k for k in keys_list if self._policy.get(k) is None]
+        keys_to_store = [k for k in keys if self._policy.get(k) is None]
 
         if not keys_to_store:
             return PrepareStoreOutput(
@@ -132,7 +130,7 @@ def prepare_store(
         if num_blocks_to_evict > 0:
             # Blocks from the original input are excluded from eviction candidates:
             # a block that was already stored must remain in the cache after this call.
-            protected = set(keys_list)
+            protected = set(keys)
             evicted = self._policy.evict(num_blocks_to_evict, protected)
             if evicted is None:
                 return None
@@ -164,7 +162,7 @@ def prepare_store(
             evicted_keys=to_evict,
         )
 
-    def complete_store(self, keys: Iterable[OffloadKey], success: bool = True) -> None:
+    def complete_store(self, keys: Collection[OffloadKey], success: bool = True) -> None:
         stored_keys: list[OffloadKey] = []
 
         if success:
diff --git a/aphrodite/v1/kv_offload/cpu/policies/arc.py b/aphrodite/v1/kv_offload/cpu/policies/arc.py
index 5db7e65fbe..9cd2528309 100644
--- a/aphrodite/v1/kv_offload/cpu/policies/arc.py
+++ b/aphrodite/v1/kv_offload/cpu/policies/arc.py
@@ -3,8 +3,8 @@
 from collections import OrderedDict
 from collections.abc import Iterable
 
-from aphrodite.v1.kv_offload.abstract import OffloadKey
-from aphrodite.v1.kv_offload.cpu.policies.abstract import BlockStatus, CachePolicy
+from aphrodite.v1.kv_offload.base import OffloadKey
+from aphrodite.v1.kv_offload.cpu.policies.base import BlockStatus, CachePolicy
 
 
 class ARCCachePolicy(CachePolicy):
diff --git a/aphrodite/v1/kv_offload/cpu/policies/abstract.py b/aphrodite/v1/kv_offload/cpu/policies/base.py
similarity index 97%
rename from aphrodite/v1/kv_offload/cpu/policies/abstract.py
rename to aphrodite/v1/kv_offload/cpu/policies/base.py
index 9c60c72d9b..a9d9cb2884 100644
--- a/aphrodite/v1/kv_offload/cpu/policies/abstract.py
+++ b/aphrodite/v1/kv_offload/cpu/policies/base.py
@@ -4,7 +4,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 
-from aphrodite.v1.kv_offload.abstract import OffloadKey
+from aphrodite.v1.kv_offload.base import OffloadKey
 
 
 class BlockStatus(ctypes.Structure):
diff --git a/aphrodite/v1/kv_offload/cpu/policies/lru.py b/aphrodite/v1/kv_offload/cpu/policies/lru.py
index 3c7daa7897..87c47e04bb 100644
--- a/aphrodite/v1/kv_offload/cpu/policies/lru.py
+++ b/aphrodite/v1/kv_offload/cpu/policies/lru.py
@@ -3,8 +3,8 @@
 from collections import OrderedDict
 from collections.abc import Iterable
 
-from aphrodite.v1.kv_offload.abstract import OffloadKey
-from aphrodite.v1.kv_offload.cpu.policies.abstract import BlockStatus, CachePolicy
+from aphrodite.v1.kv_offload.base import OffloadKey
+from aphrodite.v1.kv_offload.cpu.policies.base import BlockStatus, CachePolicy
 
 
 class LRUCachePolicy(CachePolicy):
diff --git a/aphrodite/v1/kv_offload/cpu/spec.py b/aphrodite/v1/kv_offload/cpu/spec.py
index f3954d4bf4..4945d7ecff 100644
--- a/aphrodite/v1/kv_offload/cpu/spec.py
+++ b/aphrodite/v1/kv_offload/cpu/spec.py
@@ -5,12 +5,17 @@
 from aphrodite.config import AphroditeConfig
 from aphrodite.platforms import current_platform
 from aphrodite.v1.kv_cache_interface import KVCacheConfig
-from aphrodite.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from aphrodite.v1.kv_offload.base import (
+    CanonicalKVCaches,
+    GPULoadStoreSpec,
+    LoadStoreSpec,
+    OffloadingManager,
+    OffloadingSpec,
+)
+from aphrodite.v1.kv_offload.cpu.common import CPULoadStoreSpec
+from aphrodite.v1.kv_offload.cpu.gpu_worker import CpuGpuOffloadingHandlers
 from aphrodite.v1.kv_offload.cpu.manager import CPUOffloadingManager
-from aphrodite.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from aphrodite.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
-from aphrodite.v1.kv_offload.spec import CanonicalKVCaches, OffloadingSpec
-from aphrodite.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
 from aphrodite.v1.kv_offload.worker.worker import OffloadingHandler
 
 
diff --git a/aphrodite/v1/kv_offload/factory.py b/aphrodite/v1/kv_offload/factory.py
index afd84af255..3a2f1a2c2d 100644
--- a/aphrodite/v1/kv_offload/factory.py
+++ b/aphrodite/v1/kv_offload/factory.py
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING
 
 from aphrodite.logger import init_logger
-from aphrodite.v1.kv_offload.spec import OffloadingSpec
+from aphrodite.v1.kv_offload.base import OffloadingSpec
 
 if TYPE_CHECKING:
     from aphrodite.config import AphroditeConfig
diff --git a/aphrodite/v1/kv_offload/mediums.py b/aphrodite/v1/kv_offload/mediums.py
deleted file mode 100644
index 47e75c72d5..0000000000
--- a/aphrodite/v1/kv_offload/mediums.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from abc import ABC
-from collections.abc import Sequence
-
-import numpy as np
-
-from aphrodite.v1.kv_offload.abstract import LoadStoreSpec
-
-
-class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
-    """
-    Spec for loading/storing KV blocks from given block numbers.
-    """
-
-    def __init__(self, block_ids: list[int]):
-        self.block_ids = np.array(block_ids, dtype=np.int64)
-
-    def __repr__(self) -> str:
-        return repr(self.block_ids)
-
-
-class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
-    """
-    Spec for loading/storing a KV block to GPU memory.
-
-    If there are multiple KV groups, the blocks are expected to be
-    ordered by the group index.
-    In that case, group_sizes[i] determines the number of blocks
-    per the i-th KV group, and thus sum(group_sizes) == len(block_ids).
-    group_sizes=None indicates a single KV group.
-
-    If block_indices is given, each group (determined by group_sizes) of block IDs
-    will correspond to logically contiguous blocks, e.g. blocks 5-10 of a some request.
-    block_indices[i] will represent the block index of the first block in group #i.
-    Thus, len(block_indices) == len(group_sizes) = number of KV cache groups.
-    This information is required in order to support off/loading from offloaded blocks
-    which are larger than GPU blocks.
-    In such cases, the first GPU block per each group may be unaligned to the offloaded
-    block size, and so knowing block_indices[i] allows the worker to correctly
-    skip part of the first matching offloaded block.
-    """
-
-    def __init__(
-        self,
-        block_ids: list[int],
-        group_sizes: Sequence[int],
-        block_indices: Sequence[int],
-    ):
-        super().__init__(block_ids)
-        assert sum(group_sizes) == len(block_ids)
-        assert len(block_indices) == len(group_sizes)
-        self.group_sizes: Sequence[int] = group_sizes
-        self.block_indices: Sequence[int] = block_indices
-
-    @staticmethod
-    def medium() -> str:
-        return "GPU"
-
-
-class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
-    """
-    Spec for loading/storing a KV block to CPU memory.
-    """
-
-    @staticmethod
-    def medium() -> str:
-        return "CPU"
diff --git a/aphrodite/v1/kv_offload/reuse_manager.py b/aphrodite/v1/kv_offload/reuse_manager.py
index 1ba15514fb..247bff2556 100644
--- a/aphrodite/v1/kv_offload/reuse_manager.py
+++ b/aphrodite/v1/kv_offload/reuse_manager.py
@@ -8,9 +8,9 @@
 """
 
 from collections import OrderedDict
-from collections.abc import Iterable
+from collections.abc import Collection, Iterable
 
-from aphrodite.v1.kv_offload.abstract import (
+from aphrodite.v1.kv_offload.base import (
     LoadStoreSpec,
     OffloadingEvent,
     OffloadingManager,
@@ -72,14 +72,13 @@ def lookup(self, key: OffloadKey, req_context: ReqContext) -> bool | None:
             self.counts[key] = 1
         return self._backing.lookup(key, req_context)
 
-    def prepare_store(self, keys: Iterable[OffloadKey], req_context: ReqContext) -> PrepareStoreOutput | None:
+    def prepare_store(self, keys: Collection[OffloadKey], req_context: ReqContext) -> PrepareStoreOutput | None:
         """Filter out blocks below threshold, then delegate to backing.
 
         Filtering is evaluated *before* calling the backing manager's
         ``prepare_store`` so that blocks that would be skipped do not
         consume any CPU offload capacity.
         """
-        keys = list(keys)
         eligible = [key for key in keys if self.counts.get(key, 0) >= self.store_threshold]
 
         # Passing an empty list is intentional and safe — CPUOffloadingManager
@@ -91,16 +90,16 @@ def prepare_store(self, keys: Iterable[OffloadKey], req_context: ReqContext) ->
     # Delegated methods
     # ------------------------------------------------------------------
 
-    def prepare_load(self, keys: Iterable[OffloadKey], req_context: ReqContext) -> LoadStoreSpec:
+    def prepare_load(self, keys: Collection[OffloadKey], req_context: ReqContext) -> LoadStoreSpec:
         return self._backing.prepare_load(keys, req_context)
 
-    def touch(self, keys: Iterable[OffloadKey]) -> None:
+    def touch(self, keys: Collection[OffloadKey]) -> None:
         return self._backing.touch(keys)
 
-    def complete_load(self, keys: Iterable[OffloadKey]) -> None:
+    def complete_load(self, keys: Collection[OffloadKey]) -> None:
         return self._backing.complete_load(keys)
 
-    def complete_store(self, keys: Iterable[OffloadKey], success: bool = True) -> None:
+    def complete_store(self, keys: Collection[OffloadKey], success: bool = True) -> None:
         return self._backing.complete_store(keys, success)
 
     def take_events(self) -> Iterable[OffloadingEvent]:
diff --git a/aphrodite/v1/kv_offload/spec.py b/aphrodite/v1/kv_offload/spec.py
deleted file mode 100644
index dc5f2f806c..0000000000
--- a/aphrodite/v1/kv_offload/spec.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from abc import ABC, abstractmethod
-from collections.abc import Iterator
-from dataclasses import dataclass
-from typing import TYPE_CHECKING
-
-import torch
-
-from aphrodite.logger import init_logger
-from aphrodite.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
-from aphrodite.v1.kv_offload.worker.worker import OffloadingHandler
-
-if TYPE_CHECKING:
-    from aphrodite.config import AphroditeConfig
-    from aphrodite.v1.kv_cache_interface import KVCacheConfig
-
-logger = init_logger(__name__)
-
-
-@dataclass
-class CanonicalKVCacheTensor:
-    """
-    A canonicalized KV cache tensor whose first dimension is num_blocks.
-
-    For attention backends where the raw tensor has num_blocks at a
-    non-leading physical dimension (e.g. FlashAttention's
-    (2, num_blocks, ...) layout), the tensor is split so that each
-    resulting CanonicalKVCacheTensor starts with (num_blocks, ...).
-    """
-
-    # The KV cache tensor with shape (num_blocks, ...)
-    tensor: torch.Tensor
-    # The (possibly padded) page size per block in bytes
-    page_size_bytes: int
-
-
-@dataclass
-class CanonicalKVCacheRef:
-    """
-    Per-layer (or group of layers) reference to a specific (by index)
-    CanonicalKVCacheTensor and records the un-padded page size used by that layer.
-    """
-
-    # Index into the list of CanonicalKVCacheTensor objects
-    tensor_idx: int
-    # The un-padded page size per block in bytes
-    page_size_bytes: int
-
-
-@dataclass
-class CanonicalKVCaches:
-    """
-    Canonicalized block-level representation of the KV caches.
-
-    Composed of:
-        - Unique list of KV cache data tensors,
-          each with shape (num_blocks, page_size_in_bytes) and int8 dtype.
-        - Per-group data references of the tensors.
-          i.e. how each KV cache group maps to the tensors.
-    """
-
-    # Ordered list of unique block tensors, each with shape
-    # (num_blocks, ...).
-    tensors: list[CanonicalKVCacheTensor]
-    # Per-KV-cache-group list of data references that map each layer
-    # in the group to the appropriate entry in the tensors list.
-    group_data_refs: list[list[CanonicalKVCacheRef]]
-
-
-class OffloadingSpec(ABC):
-    """Spec for an offloading connector"""
-
-    def __init__(self, aphrodite_config: "AphroditeConfig", kv_cache_config: "KVCacheConfig"):
-        logger.warning(
-            "Initializing OffloadingSpec. This API is experimental and "
-            "subject to change in the future as we iterate the design."
-        )
-        self.aphrodite_config = aphrodite_config
-        self.kv_cache_config = kv_cache_config
-
-        kv_transfer_config = aphrodite_config.kv_transfer_config
-        assert kv_transfer_config is not None
-        self.extra_config = kv_transfer_config.kv_connector_extra_config
-
-        # block size used by Aphrodite for hashing request tokens for the sake
-        # of enabling prefix caching
-        self.hash_block_size = aphrodite_config.cache_config.block_size
-        # gpu block size per group
-        self.gpu_block_size: tuple[int, ...] = tuple(
-            kv_cache_group.kv_cache_spec.block_size for kv_cache_group in kv_cache_config.kv_cache_groups
-        )
-
-        for block_size in self.gpu_block_size:
-            assert block_size % self.hash_block_size == 0, (
-                f"gpu_block_size={block_size} not divisible by "
-                f"hash_block_size={self.hash_block_size}. "
-                f"Hybrid models (e.g. Mamba+Attention) need "
-                f"--enable-prefix-caching to align block sizes."
-            )
-
-        # offloaded_block_size / gpu_block_size
-        self.block_size_factor: int = 1
-
-        offloaded_block_size = self.extra_config.get("block_size")
-        if offloaded_block_size is not None:
-            offloaded_block_size_int = int(offloaded_block_size)
-            gpu_block_sizes = set(self.gpu_block_size)
-            assert len(gpu_block_sizes) == 1, (
-                "If 'block_size' is specified in kv_connector_extra_config, "
-                "there must be at least one KV cache group, "
-                "and all groups must have the same block size."
-            )
-            gpu_block_size = gpu_block_sizes.pop()
-
-            assert offloaded_block_size_int % gpu_block_size == 0
-            self.block_size_factor = offloaded_block_size_int // gpu_block_size
-
-    @abstractmethod
-    def get_manager(self) -> OffloadingManager:
-        """
-        Get an OffloadingManager that will be used
-        by the scheduler-side offloading connector to track
-        offloaded blocks and manage evictions.
-        """
-        pass
-
-    @abstractmethod
-    def get_handlers(
-        self, kv_caches: CanonicalKVCaches
-    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
-        """
-        Get offloading handlers along with their respective src and dst types.
-
-        Args:
-            kv_caches: Canonicalized KV caches.
-
-        Yields:
-            Tuples of (src_type, dst_type, offloading_handler).
-        """
-        pass
diff --git a/aphrodite/v1/kv_offload/worker/worker.py b/aphrodite/v1/kv_offload/worker/worker.py
index c447b6c338..e94d509572 100644
--- a/aphrodite/v1/kv_offload/worker/worker.py
+++ b/aphrodite/v1/kv_offload/worker/worker.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 
 from aphrodite.logger import init_logger
-from aphrodite.v1.kv_offload.abstract import LoadStoreSpec
+from aphrodite.v1.kv_offload.base import LoadStoreSpec
 
 # a single transfer spec (src_blocks_spec, dst_blocks_spec)
 TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
diff --git a/aphrodite/v1/metrics/ray_wrappers.py b/aphrodite/v1/metrics/ray_wrappers.py
index daced12bb1..27ea2ff100 100644
--- a/aphrodite/v1/metrics/ray_wrappers.py
+++ b/aphrodite/v1/metrics/ray_wrappers.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import time
 
 from aphrodite.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorProm
@@ -28,10 +29,13 @@ def _get_replica_id() -> str | None:
 
 
 class RayPrometheusMetric:
+    _is_labeled: bool = False
+
     def __init__(self):
         if ray_metrics is None:
             raise ImportError("RayPrometheusMetric requires Ray to be installed.")
         self.metric: Metric = None
+        self._tags: dict[str, str] = {"ReplicaId": _get_replica_id() or ""}
 
     @staticmethod
     def _get_tag_keys(labelnames: list[str] | None) -> tuple[str, ...]:
@@ -39,7 +43,7 @@ def _get_tag_keys(labelnames: list[str] | None) -> tuple[str, ...]:
         labels.append("ReplicaId")
         return tuple(labels)
 
-    def labels(self, *labels, **labelskwargs):
+    def _build_tags(self, *labels, **labelskwargs) -> dict[str, str]:
         if labels:
             # -1 because ReplicaId was added automatically
             expected = len(self.metric._tag_keys) - 1
@@ -51,12 +55,15 @@ def labels(self, *labels, **labelskwargs):
 
         labelskwargs["ReplicaId"] = _get_replica_id() or ""
 
-        if labelskwargs:
-            for k, v in labelskwargs.items():
-                if not isinstance(v, str):
-                    labelskwargs[k] = str(v)
-            self.metric.set_default_tags(labelskwargs)
-        return self
+        return {k: v if isinstance(v, str) else str(v) for k, v in labelskwargs.items()}
+
+    def labels(self, *labels, **labelskwargs) -> "RayPrometheusMetric":
+        if self._is_labeled:
+            raise ValueError("labels() cannot be called on an already-labeled metric.")
+        clone = copy.copy(self)
+        clone._tags = self._build_tags(*labels, **labelskwargs)
+        clone._is_labeled = True
+        return clone
 
     @staticmethod
     def _get_sanitized_opentelemetry_name(name: str) -> str:
@@ -90,6 +97,8 @@ def __init__(
         # implemented at the observability layer (Prometheus/Grafana).
         del multiprocess_mode
 
+        super().__init__()
+
         tag_keys = self._get_tag_keys(labelnames)
         name = self._get_sanitized_opentelemetry_name(name)
 
@@ -100,7 +109,7 @@ def __init__(
         )
 
     def set(self, value: int | float):
-        return self.metric.set(value)
+        return self.metric.set(value, tags=self._tags)
 
     def set_to_current_time(self):
         # ray metrics doesn't have set_to_current time, https://docs.ray.io/en/latest/_modules/ray/util/metrics.html
@@ -117,6 +126,7 @@ def __init__(
         documentation: str | None = "",
         labelnames: list[str] | None = None,
     ):
+        super().__init__()
         tag_keys = self._get_tag_keys(labelnames)
         name = self._get_sanitized_opentelemetry_name(name)
         self.metric = ray_metrics.Counter(
@@ -128,7 +138,7 @@ def __init__(
     def inc(self, value: int | float = 1.0):
         if value == 0:
             return
-        return self.metric.inc(value)
+        return self.metric.inc(value, tags=self._tags)
 
 
 class RayHistogramWrapper(RayPrometheusMetric):
@@ -142,6 +152,7 @@ def __init__(
         labelnames: list[str] | None = None,
         buckets: list[float] | None = None,
     ):
+        super().__init__()
         tag_keys = self._get_tag_keys(labelnames)
         name = self._get_sanitized_opentelemetry_name(name)
 
@@ -154,7 +165,7 @@ def __init__(
         )
 
     def observe(self, value: int | float):
-        return self.metric.observe(value)
+        return self.metric.observe(value, tags=self._tags)
 
 
 class RaySpecDecodingProm(SpecDecodingProm):
diff --git a/aphrodite/v1/request.py b/aphrodite/v1/request.py
index f3e936a3b7..891c0247ad 100644
--- a/aphrodite/v1/request.py
+++ b/aphrodite/v1/request.py
@@ -66,6 +66,7 @@ def __init__(
         client_index: int = 0,
         arrival_time: float | None = None,
         prompt_embeds: torch.Tensor | None = None,
+        prompt_is_token_ids: list[bool] | None = None,
         mm_features: list[MultiModalFeatureSpec] | None = None,
         lora_request: "LoRARequest | None" = None,
         cache_salt: str | None = None,
@@ -74,6 +75,7 @@ def __init__(
         block_hasher: Callable[["Request"], list["BlockHash"]] | None = None,
         resumable: bool = False,
         reasoning_ended: bool | None = None,
+        reasoning_parser_kwargs: dict[str, Any] | None = None,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -84,6 +86,7 @@ def __init__(
         self.structured_output_request = StructuredOutputRequest.from_sampling_params(sampling_params)
         if self.structured_output_request is not None:
             self.structured_output_request.reasoning_ended = reasoning_ended
+            self.structured_output_request.reasoning_parser_kwargs = reasoning_parser_kwargs
         self.arrival_time = arrival_time if arrival_time is not None else time.time()
 
         self.status = RequestStatus.WAITING
@@ -110,6 +113,10 @@ def __init__(
 
         self.prompt_token_ids = prompt_token_ids
         self.prompt_embeds = prompt_embeds
+        # Per-position mask used in mixed-mode (chat completion with
+        # prompt_embeds). `None` except when both `prompt_token_ids` and
+        # `prompt_embeds` are set and their positions are interleaved.
+        self.prompt_is_token_ids = prompt_is_token_ids
         # Cache per-block prompt-embed hashes to avoid rehashing the same
         # tensor slices when generating extra keys.
         self._prompt_embeds_per_block_hashes: dict[tuple[int, int], bytes] = {}
@@ -176,6 +183,7 @@ def from_engine_core_request(
             client_index=request.client_index,
             prompt_token_ids=request.prompt_token_ids,
             prompt_embeds=request.prompt_embeds,
+            prompt_is_token_ids=request.prompt_is_token_ids,
             mm_features=request.mm_features,
             sampling_params=request.sampling_params,
             pooling_params=request.pooling_params,
@@ -187,6 +195,7 @@ def from_engine_core_request(
             block_hasher=block_hasher,
             resumable=request.resumable,
             reasoning_ended=request.reasoning_ended,
+            reasoning_parser_kwargs=request.reasoning_parser_kwargs,
         )
 
     def append_output_token_ids(
diff --git a/aphrodite/v1/sample/logits_processor/__init__.py b/aphrodite/v1/sample/logits_processor/__init__.py
index 9737557c93..7d98128afa 100644
--- a/aphrodite/v1/sample/logits_processor/__init__.py
+++ b/aphrodite/v1/sample/logits_processor/__init__.py
@@ -18,7 +18,6 @@
     LogitBiasLogitsProcessor,
     MinPLogitsProcessor,
     MinTokensLogitsProcessor,
-    ThinkingTokenBudgetLogitsProcessor,
     process_dict_updates,
 )
 from aphrodite.v1.sample.logits_processor.interface import (
@@ -47,7 +46,6 @@
     MinTokensLogitsProcessor,
     LogitBiasLogitsProcessor,
     MinPLogitsProcessor,
-    ThinkingTokenBudgetLogitsProcessor,
 ]
 
 
@@ -329,5 +327,4 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
     "STR_POOLING_REJECTS_LOGITSPROCS",
     "LOGITSPROCS_GROUP",
     "AdapterLogitsProcessor",
-    "ThinkingTokenBudgetLogitsProcessor",
 ]
diff --git a/aphrodite/v1/sample/logits_processor/builtin.py b/aphrodite/v1/sample/logits_processor/builtin.py
index 2d24c794fa..52b06d8032 100644
--- a/aphrodite/v1/sample/logits_processor/builtin.py
+++ b/aphrodite/v1/sample/logits_processor/builtin.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Callable, Sequence
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, TypeVar
 
 import numpy as np
 import torch
@@ -277,236 +277,6 @@ def apply_with_spec_decode(
         return logits
 
 
-class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
-    """Limits the number of tokens allowed inside a 'thinking' section."""
-
-    def __init__(self, aphrodite_config: "AphroditeConfig", device: torch.device, is_pin_memory: bool):
-        reasoning_config = aphrodite_config.reasoning_config
-        max_num_reqs = aphrodite_config.scheduler_config.max_num_seqs
-
-        # Check if thinking is enabled
-        self.is_enabled = reasoning_config is not None and reasoning_config.enabled
-
-        self.reasoning_start_token_ids = getattr(reasoning_config, "reasoning_start_token_ids", [])
-        self.reasoning_end_token_ids = getattr(reasoning_config, "reasoning_end_token_ids", [])
-
-        self.pin_memory = is_pin_memory
-        self.device = device
-        # Per-request state tracking for thinking token management
-        # Key: request_index, Value: state dict containing:
-        # "in_think": bool - currently in thinking mode
-        # "in_end": bool - currently forcing end tokens output
-        # "check_count_down": int - steps remaining until next think
-        #                            start/end token parsing
-        # "think_count": int - number of thinking tokens generated
-        # "end_count": int - number of end tokens forced so far
-        # "thinking_token_budget": int - max allowed thinking tokens
-        # "output_tok_ids": list[int] - generated output tokens
-        # "prev_output_length": int - previous output length for
-        #                               incremental processing
-        self._state: dict[int, dict[str, Any]] = {}
-
-        # Preallocate reusable tensors
-        self.mask = torch.zeros(max_num_reqs, dtype=torch.bool, device=device)
-        self.force_token_ids = torch.full((max_num_reqs,), -1, dtype=torch.long, device=device)
-
-    @staticmethod
-    def _find_last_sequence_index(target_list: list[int], token_ids: list[int]) -> int:
-        """
-        Returns the index of the last occurrence of token_ids in target_list.
-
-        Args:
-          target_list (list[int]): The list of token IDs.
-          token_ids (list[int]): The sequence of token IDs to find.
-        """
-        if not token_ids:
-            return -1
-        for i in range(len(target_list) - len(token_ids), -1, -1):
-            if target_list[i : i + len(token_ids)] == token_ids:
-                return i
-        return -1
-
-    def _init_state_entry(self, prompt_tok_ids: list[int] | None, thinking_token_budget: int) -> dict[str, Any]:
-        """Initializes the tracking state for a given sequence index."""
-        if prompt_tok_ids is None:
-            last_start = -1
-            last_end = -1
-            in_think = False
-            think_count = 0
-        else:
-            last_start = self._find_last_sequence_index(prompt_tok_ids, self.reasoning_start_token_ids)
-            last_end = self._find_last_sequence_index(prompt_tok_ids, self.reasoning_end_token_ids)
-            in_think = last_start > last_end
-            if in_think:
-                think_count = len(prompt_tok_ids) - (last_start + len(self.reasoning_start_token_ids))
-            else:
-                think_count = 0
-
-        return {
-            "in_think": in_think,  # Currently in thinking mode
-            "in_end": in_think and thinking_token_budget == 0,
-            "check_count_down": thinking_token_budget,
-            "think_count": think_count,  # Number of tokens in thinking section
-            "end_count": 0,  # Number of end tokens forced so far
-            "prompt_tok_ids": prompt_tok_ids,
-            "output_tok_ids": [],
-            "thinking_token_budget": thinking_token_budget,
-            "prev_output_length": 0,
-            # Track previous output length for incremental updates
-        }
-
-    def _update_think_state(self, state: dict[str, Any]):
-        """Updates the state based on newly generated output tokens."""
-        if not state.get("in_end", False) and state.get("check_count_down", 0) > 0:
-            state["check_count_down"] -= 1
-            return
-
-        output = state.get("output_tok_ids", [])
-        if not output:
-            return
-
-        # Track previous output length for incremental processing
-        prev_length = state.get("prev_output_length", 0)
-        current_length = len(output)
-
-        if current_length <= prev_length:
-            return
-
-        # Process only newly added tokens
-        new_tokens = output[prev_length:]
-        state["prev_output_length"] = current_length
-
-        # Check if new tokens contain think start or end sequences
-        start_len = len(self.reasoning_start_token_ids)
-        end_len = len(self.reasoning_end_token_ids)
-
-        # Look for think sequences in recent tokens (including boundary)
-        # Check overlapping regions where sequences might span boundaries
-        check_start_idx = max(0, prev_length - max(start_len, end_len) + 1)
-        recent_tokens = output[check_start_idx:]
-
-        # Find any think start/end sequences in recent tokens
-        recent_start_pos = self._find_last_sequence_index(recent_tokens, self.reasoning_start_token_ids)
-        recent_end_pos = self._find_last_sequence_index(recent_tokens, self.reasoning_end_token_ids)
-
-        # Update state based on recent sequences
-        if not state["in_end"]:
-            if recent_start_pos >= 0 and recent_end_pos >= 0:
-                if recent_start_pos > recent_end_pos:
-                    # Case: ...<end>...<start>... - entering think mode
-                    absolute_start_pos = check_start_idx + recent_start_pos
-                    new_think_count = current_length - (absolute_start_pos + start_len)
-                    state["in_think"] = True
-                    state["think_count"] = new_think_count
-                else:
-                    # Case: ...<start>...<end>... - exiting think mode
-                    state["in_think"] = False
-                    state["think_count"] = 0
-            elif recent_start_pos >= 0:
-                # Found think start - entering think mode
-                absolute_start_pos = check_start_idx + recent_start_pos
-                new_think_count = current_length - (absolute_start_pos + start_len)
-                state["in_think"] = True
-                state["think_count"] = new_think_count
-            elif recent_end_pos >= 0:
-                # Found think end - exiting think mode
-                state["in_think"] = False
-                state["think_count"] = 0
-            elif state["in_think"]:
-                # Continue thinking mode, increment count by new tokens
-                state["think_count"] += len(new_tokens)
-
-            # Set countdown based on current state
-            if state["in_think"]:
-                remaining_budget = max(0, state["thinking_token_budget"] - state["think_count"])
-                state["check_count_down"] = max(0, remaining_budget - 1)
-            else:
-                state["check_count_down"] = state["thinking_token_budget"]
-
-            # Check if need to transition to end mode
-            if state["in_think"] and state["think_count"] >= state["thinking_token_budget"]:
-                state["in_think"] = False
-                state["in_end"] = True
-                state["end_count"] = 0
-                state["check_count_down"] = state["thinking_token_budget"]
-        else:
-            # In end mode
-            state["end_count"] += 1
-            if state["end_count"] >= len(self.reasoning_end_token_ids):
-                state.update(
-                    {
-                        "in_end": False,
-                        "end_count": 0,
-                        "check_count_down": state["thinking_token_budget"],
-                    }
-                )
-
-    def is_argmax_invariant(self) -> bool:
-        """This logits processor can change the outcome of
-        greedy sampling by forcing that the thinking section
-        ends after a certain number of tokens."""
-        return False
-
-    def update_state(self, batch_update: BatchUpdate | None):
-        if not self.is_enabled:
-            return
-        if batch_update:
-            for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
-                thinking_token_budget = params.thinking_token_budget
-
-                if thinking_token_budget is not None:
-                    self._state[index] = self._init_state_entry(prompt_tok_ids, thinking_token_budget)
-                    self._state[index]["output_tok_ids"] = output_tok_ids
-                else:
-                    # Remove state if no thinking budget
-                    self._state.pop(index, None)
-
-            for index in batch_update.removed:
-                self._state.pop(index, {})
-
-            for i1, i2, direction in batch_update.moved:
-                if direction == MoveDirectionality.SWAP:
-                    state1 = self._state.pop(i1, None)
-                    state2 = self._state.pop(i2, None)
-                    if state1 is not None:
-                        self._state[i2] = state1
-                    if state2 is not None:
-                        self._state[i1] = state2
-                else:
-                    state = self._state.pop(i1, None)
-                    if state is not None:
-                        self._state[i2] = state
-
-        for state in self._state.values():
-            self._update_think_state(state)
-
-    def apply(self, logits: torch.Tensor) -> torch.Tensor:
-        if not self.is_enabled or not self._state:
-            return logits
-
-        batch_size = logits.size(0)
-        self.mask[:batch_size] = False
-
-        for i in range(batch_size):
-            state = self._state.get(i)
-            if state and state["in_end"]:
-                self.mask[i] = True
-                self.force_token_ids[i] = self.reasoning_end_token_ids[state["end_count"]]
-
-        # Check in CPU first not to sync with GPU
-        has_active_thinking = any(state.get("in_end", False) for state in self._state.values())
-
-        if has_active_thinking:
-            current_mask = self.mask[:batch_size]
-            active_indices = current_mask.nonzero(as_tuple=False).view(-1)
-            if len(active_indices) > 0:
-                force_tokens = self.force_token_ids[active_indices]
-                # Apply a large value for the end thinking token id index
-                logits[active_indices, force_tokens] = 1e9
-
-        return logits
-
-
 def process_dict_updates(
     req_entries: dict[int, T],
     batch_update: BatchUpdate | None,
diff --git a/aphrodite/v1/sample/metadata.py b/aphrodite/v1/sample/metadata.py
index 38f0815d96..91db084719 100644
--- a/aphrodite/v1/sample/metadata.py
+++ b/aphrodite/v1/sample/metadata.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
 from dataclasses import dataclass, field
 from typing import Any
 
 import torch
 
 from aphrodite.v1.sample.logits_processor import LogitsProcessors
+from aphrodite.v1.sample.thinking_budget_state import ThinkingBudgetStateHolder
 
 
 @dataclass
@@ -105,6 +108,9 @@ class SamplingMetadata:
 
     # Speculative token ids
     spec_token_ids: list[list[int]] | None = None
+    # When non-None, use ``holder.has_tracked_requests()`` to see if this batch applies
+    # thinking-token-budget logits (holder may exist with an empty tracking set).
+    thinking_budget_state_holder: ThinkingBudgetStateHolder | None = None
 
     # Cached padded token-history tensor for GPU-side sampler ops.
     output_token_ids_tensor: torch.Tensor | None = None
diff --git a/aphrodite/v1/sample/ops/topk_topp_sampler.py b/aphrodite/v1/sample/ops/topk_topp_sampler.py
index 6e1ea48fac..a5e9188537 100644
--- a/aphrodite/v1/sample/ops/topk_topp_sampler.py
+++ b/aphrodite/v1/sample/ops/topk_topp_sampler.py
@@ -38,23 +38,35 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None:
 
                 capability = current_platform.get_device_capability()
                 assert capability is not None
-                if not FlashInferBackend.supports_compute_capability(capability):
+                if FlashInferBackend.supports_compute_capability(capability):
+                    logger.info_once(
+                        "Using FlashInfer for top-p & top-k sampling.",
+                        scope="global",
+                    )
+                    self.forward = self.forward_cuda
+                elif envs.is_set("APHRODITE_USE_FLASHINFER_SAMPLER"):
+                    # User explicitly opted in but the GPU can't run FlashInfer.
                     capability_str = capability.as_version_str()
                     raise RuntimeError(
                         "FlashInfer does not support compute capability "
                         f"{capability_str}, unset APHRODITE_USE_FLASHINFER_SAMPLER=1."
                     )
-                # Users must opt in explicitly via APHRODITE_USE_FLASHINFER_SAMPLER=1.
-                logger.info_once(
-                    "Using FlashInfer for top-p & top-k sampling.",
-                    scope="global",
-                )
-                self.forward = self.forward_cuda
+                else:
+                    # Default-on path; hardware can't run FlashInfer →
+                    # quietly fall back to the PyTorch-native sampler
+                    # instead of failing server startup.
+                    logger.warning_once(
+                        "FlashInfer top-p/top-k sampling not supported on "
+                        "compute capability %s; falling back to PyTorch-native "
+                        "sampler. Set APHRODITE_USE_FLASHINFER_SAMPLER=0 to silence.",
+                        capability.as_version_str(),
+                    )
+                    self.forward = self.forward_native
             else:
-                logger.debug_once(
-                    "FlashInfer top-p/top-k sampling is available but disabled "
-                    "by default. Set APHRODITE_USE_FLASHINFER_SAMPLER=1 to opt in "
-                    "after verifying accuracy for your workloads."
+                # User explicitly set APHRODITE_USE_FLASHINFER_SAMPLER=0.
+                logger.info_once(
+                    "FlashInfer top-p/top-k sampling disabled via "
+                    "APHRODITE_USE_FLASHINFER_SAMPLER=0; using PyTorch-native sampler."
                 )
                 self.forward = self.forward_native
 
@@ -159,9 +171,9 @@ def forward_cuda(
         p: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """More optimized implementation for top-k and top-p sampling."""
-        # We prefer `random_sample` over `flashinfer_sample` when sorting is
-        # not needed. This is because `random_sample` does not require
-        # CPU-GPU synchronization while `flashinfer_sample` does.
+        # Fall back to the PyTorch-native path when FlashInfer has nothing
+        # to do (no top-k / top-p filter) or when per-request generators
+        # are present (unsupported by FlashInfer 0.2.3+).
         if (k is None and p is None) or generators:
             if generators:
                 logger.debug_once(
@@ -395,10 +407,6 @@ def flashinfer_sample(
     NOTE: The outputs of this function do not necessarily match the outputs of
     the `random_sample` function. It only guarantees that the outputs are
     statistically equivalent.
-
-    NOTE: This function includes CPU-GPU synchronization, while `random_sample`
-    does not. Call this function at the end of the forward pass to minimize
-    the synchronization overhead.
     """
     import flashinfer
 
diff --git a/aphrodite/v1/sample/rejection_sampler.py b/aphrodite/v1/sample/rejection_sampler.py
index 2b59cfc110..6bfe34f6e9 100644
--- a/aphrodite/v1/sample/rejection_sampler.py
+++ b/aphrodite/v1/sample/rejection_sampler.py
@@ -267,16 +267,20 @@ def apply_logits_processors(
     ) -> torch.Tensor:
         has_penalties = not sampling_metadata.no_penalties
         any_penalties_or_bad_words = sampling_metadata.bad_words_token_ids or has_penalties
+        holder = sampling_metadata.thinking_budget_state_holder
+        needs_thinking = holder is not None and holder.has_tracked_requests()
 
         output_token_ids = sampling_metadata.output_token_ids
-        if any_penalties_or_bad_words:
+        if any_penalties_or_bad_words or needs_thinking:
             output_token_ids = self._combine_outputs_with_spec_tokens(
                 output_token_ids,
                 sampling_metadata.spec_token_ids,
             )
 
         # Calculate indices of target logits.
-        if sampling_metadata.allowed_token_ids_mask is not None or has_penalties:
+        repeat_indices: torch.Tensor | None = None
+        need_repeat_indices = sampling_metadata.allowed_token_ids_mask is not None or has_penalties or needs_thinking
+        if need_repeat_indices:
             num_requests = len(sampling_metadata.output_token_ids)
             num_draft_tokens = torch.tensor(metadata.num_draft_tokens, device="cpu")
             original_indices = torch.arange(num_requests, device="cpu")
@@ -292,7 +296,12 @@ def apply_logits_processors(
         # Apply bad words exclusion.
         if bad_words_token_ids := sampling_metadata.bad_words_token_ids:
             apply_bad_words_with_drafts(logits, bad_words_token_ids, output_token_ids, metadata.num_draft_tokens)
-
+        if holder is not None and holder.has_tracked_requests():
+            logits = holder.apply_to_logits(
+                logits,
+                predict_bonus_token=False,
+                spec_token_ids=sampling_metadata.spec_token_ids,
+            )
         return logits
 
     @staticmethod
diff --git a/aphrodite/v1/sample/sampler.py b/aphrodite/v1/sample/sampler.py
index 004edbe8bd..57a0267d4c 100644
--- a/aphrodite/v1/sample/sampler.py
+++ b/aphrodite/v1/sample/sampler.py
@@ -14,6 +14,7 @@
 from aphrodite.v1.outputs import LogprobsTensors, SamplerOutput
 from aphrodite.v1.sample.metadata import SamplingMetadata
 from aphrodite.v1.sample.ops import SamplingOps
+from aphrodite.v1.sample.ops.bad_words import apply_bad_words
 from aphrodite.v1.sample.ops.logprobs import batched_count_greater_than
 from aphrodite.v1.sample.ops.temperatures import apply_all_temperatures
 from aphrodite.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
@@ -501,3 +502,60 @@ def gather_logprobs(
         indices = indices.to(torch.int32)
 
         return LogprobsTensors(indices, logprobs, token_ranks)
+
+    @staticmethod
+    def _combine_outputs_with_spec_tokens(
+        output_token_ids: list[list[int]],
+        spec_token_ids: list[list[int]] | None = None,
+    ) -> list[list[int]]:
+        if spec_token_ids is None:
+            return output_token_ids
+
+        return [[*out, *spec] if spec else out for out, spec in zip(output_token_ids, spec_token_ids)]
+
+    def apply_logits_processors(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+        predict_bonus_token: bool,
+    ) -> torch.Tensor:
+        bad_words_token_ids = sampling_metadata.bad_words_token_ids
+        any_penalties_or_bad_words = bool(bad_words_token_ids) or not sampling_metadata.no_penalties
+        holder = sampling_metadata.thinking_budget_state_holder
+        needs_thinking_combine = holder is not None and holder.has_tracked_requests()
+
+        output_token_ids = sampling_metadata.output_token_ids
+        if predict_bonus_token and (any_penalties_or_bad_words or needs_thinking_combine):
+            # Combine base outputs with spec tokens when speculative decoding
+            # is enabled.
+            output_token_ids = self._combine_outputs_with_spec_tokens(
+                output_token_ids,
+                sampling_metadata.spec_token_ids,
+            )
+
+        # Apply allowed token ids.
+        if sampling_metadata.allowed_token_ids_mask is not None:
+            logits.masked_fill_(sampling_metadata.allowed_token_ids_mask, float("-inf"))
+
+        # Apply bad words exclusion.
+        if bad_words_token_ids:
+            apply_bad_words(logits, bad_words_token_ids, output_token_ids)
+
+        # Apply logits processors which can impact greedy sampling.
+        for processor in sampling_metadata.logitsprocs.non_argmax_invariant:
+            logits = processor.apply(logits)
+
+        # Apply penalties (e.g., freq_penalties).
+        logits = self.apply_penalties(logits, sampling_metadata, output_token_ids)
+        if holder is not None and holder.has_tracked_requests():
+            holder.update_state(
+                output_token_ids,
+                sampling_metadata.spec_token_ids,
+                repeat_indices=None,
+            )
+            logits = holder.apply_to_logits(
+                logits,
+                predict_bonus_token,
+                sampling_metadata.spec_token_ids,
+            )
+        return logits
diff --git a/aphrodite/v1/sample/thinking_budget_state.py b/aphrodite/v1/sample/thinking_budget_state.py
new file mode 100644
index 0000000000..7ac46d9680
--- /dev/null
+++ b/aphrodite/v1/sample/thinking_budget_state.py
@@ -0,0 +1,477 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Per-batch thinking token budget state; applied after penalties at sample time."""
+
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from aphrodite.v1.sample.logits_processor.interface import (
+    BatchUpdate,
+    MoveDirectionality,
+)
+
+if TYPE_CHECKING:
+    from aphrodite.config.reasoning import ReasoningConfig
+
+
+def maybe_create_thinking_budget_state_holder(
+    reasoning_config: "ReasoningConfig | None",
+    max_num_seqs: int,
+    num_spec_tokens: int,
+    device: torch.device,
+    is_pin_memory: bool,
+) -> "ThinkingBudgetStateHolder | None":
+    if reasoning_config is None:
+        return None
+    return ThinkingBudgetStateHolder(reasoning_config, max_num_seqs, num_spec_tokens, device, is_pin_memory)
+
+
+class ThinkingBudgetStateHolder:
+    """Tracks thinking sections and forces end tokens when budget is exceeded."""
+
+    think_start_token_ids: list[int]
+    think_end_token_ids: list[int]
+
+    def __init__(
+        self,
+        reasoning_config: "ReasoningConfig | None",
+        max_num_seqs: int,
+        num_spec_tokens: int,
+        device: torch.device,
+        is_pin_memory: bool,
+    ):
+        _ = is_pin_memory  # API parity with logits processors
+        max_num_reqs = max_num_seqs
+        self.in_spec_mode = num_spec_tokens > 0
+        self.num_spec_tokens = num_spec_tokens
+
+        # No separate enable flag: a non-``None`` ``reasoning_config`` is the switch.
+        self.is_enabled = reasoning_config is not None
+
+        if reasoning_config is None:
+            self.think_start_token_ids = []
+            self.think_end_token_ids = []
+        else:
+            rs = reasoning_config.reasoning_start_token_ids
+            re = reasoning_config.reasoning_end_token_ids
+            self.think_start_token_ids = rs if rs else []
+            self.think_end_token_ids = re if re else []
+
+        self.device = device
+        self._state: dict[int, dict[str, Any]] = {}
+        self.cu_num_tokens: dict[int, int] = {}
+
+        if self.num_spec_tokens > 0:
+            self.mask = torch.zeros(
+                max_num_reqs * (self.num_spec_tokens + 1),
+                dtype=torch.bool,
+                device=device,
+            )
+            self.force_token_ids = torch.full(
+                (max_num_reqs * (self.num_spec_tokens + 1),),
+                -1,
+                dtype=torch.long,
+                device=device,
+            )
+        else:
+            self.mask = torch.zeros(max_num_reqs, dtype=torch.bool, device=device)
+            self.force_token_ids = torch.full((max_num_reqs,), -1, dtype=torch.long, device=device)
+
+    def has_tracked_requests(self) -> bool:
+        """True when ``sync_batch`` has state for a ``thinking_token_budget`` row.
+
+        Used to decide whether sampling needs output-token rows and spec combining;
+        distinct from merely having a holder instance (reasoning may be on with no
+        budgeted requests in this batch).
+        """
+        return bool(self._state)
+
+    def sync_batch(self, batch_update: BatchUpdate | None) -> None:
+        """Add/remove/move per-request state only (no _update_think_state)."""
+        if not self.is_enabled or not batch_update:
+            return
+        for index in batch_update.removed:
+            self._state.pop(index, None)
+
+        for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
+            thinking_token_budget = params.thinking_token_budget
+            if thinking_token_budget is not None:
+                self._state[index] = self._init_state_entry(prompt_tok_ids, thinking_token_budget)
+                self._state[index]["output_tok_ids"] = output_tok_ids
+                self._state[index]["spec_token_ids"] = []
+            else:
+                self._state.pop(index, None)
+
+        for i1, i2, direction in batch_update.moved:
+            # Pop both slots before re-inserting. Reading them with ``get`` left a
+            # stale copy behind when only one side of a SWAP was tracked, so the
+            # untracked request ended up sharing the other request's state.
+            # For a unidirectional move, any old entry at the destination is
+            # dropped and the source slot is always vacated.
+            state1 = self._state.pop(i1, None)
+            state2 = self._state.pop(i2, None)
+            if state1 is not None:
+                self._state[i2] = state1
+            if state2 is not None and direction == MoveDirectionality.SWAP:
+                self._state[i1] = state2
+
+    def update_state(
+        self,
+        output_token_ids: list[list[int]],
+        spec_token_ids: list[list[int]] | None,
+        repeat_indices: torch.Tensor | None = None,
+    ) -> None:
+        """Refresh output/spec from sampling rows and recompute think state."""
+        if not self.is_enabled or not self._state:
+            return
+
+        spec_lists = spec_token_ids or []
+        last_row_for_req: dict[int, int] | None = None
+        if repeat_indices is not None:
+            last_row_for_req = {}
+            rpt = repeat_indices.cpu().tolist()
+            for batch_row, req_i in enumerate(rpt):
+                last_row_for_req[req_i] = batch_row
+
+        for seq_idx, state in list(self._state.items()):
+            if last_row_for_req is not None:
+                output_row: int | None = last_row_for_req.get(seq_idx)
+                if output_row is None or output_row >= len(output_token_ids):
+                    continue
+                state["output_tok_ids"] = output_token_ids[output_row]
+            elif seq_idx >= len(output_token_ids):
+                continue
+            else:
+                state["output_tok_ids"] = output_token_ids[seq_idx]
+            if seq_idx < len(spec_lists):
+                state["spec_token_ids"] = list(spec_lists[seq_idx])
+            else:
+                state["spec_token_ids"] = []
+            state["in_spec_mode"] = self.in_spec_mode
+            state["force_index"] = []
+            if len(state["output_tok_ids"]) > 0:
+                spec_len = len(state["spec_token_ids"])
+                # Only strip the draft suffix when there are spec tokens: ``[:-0]`` is
+                # ``[:0]`` (because ``-0 == 0``) and would empty the whole list.
+                if spec_len > 0 and len(state["output_tok_ids"]) >= spec_len:
+                    state["output_tok_ids"] = state["output_tok_ids"][:-spec_len]
+            self._update_think_state(state)
+
+    def apply_to_logits(
+        self,
+        logits: torch.Tensor,
+        predict_bonus_token: bool,
+        spec_token_ids: list[list[int]] | None,
+    ) -> torch.Tensor:
+        """Mask and bump logits for forced end-of-thinking tokens."""
+        if not self.is_enabled or not self._state:
+            return logits
+        spec_lists = spec_token_ids or []
+        return self._apply_forcing_to_logits(logits, predict_bonus_token, spec_lists)
+
+    @staticmethod
+    def _find_last_sequence_index(target_list: list[int], token_ids: list[int]) -> int:
+        if not token_ids:
+            return -1
+        for i in range(len(target_list) - len(token_ids), -1, -1):
+            if target_list[i : i + len(token_ids)] == token_ids:
+                return i
+        return -1
+
+    def _init_state_entry(self, prompt_tok_ids: list[int] | None, thinking_token_budget: int) -> dict[str, Any]:
+        if prompt_tok_ids is None:
+            last_start = -1
+            last_end = -1
+            in_think = False
+            think_count = 0
+            start_thinking = -1
+            countdown = thinking_token_budget
+            continue_thinking = False
+            in_end = False
+        else:
+            start_thinking = -1
+            countdown = thinking_token_budget
+            continue_thinking = False
+            in_end = False
+            last_start = self._find_last_sequence_index(prompt_tok_ids, self.think_start_token_ids)
+            last_end = self._find_last_sequence_index(prompt_tok_ids, self.think_end_token_ids)
+            in_think = last_start > last_end
+            # load metrics such as think count, start thinking
+            # if request is in thinking mode, already
+            if in_think:
+                think_count = len(prompt_tok_ids) - (last_start + len(self.think_start_token_ids))
+                start_thinking = len(prompt_tok_ids) - think_count - 1
+                countdown -= think_count
+                continue_thinking = True
+                # check if the token is exhausted within prompt
+                token_exhausted = thinking_token_budget - think_count
+                in_end = token_exhausted <= 0
+            else:
+                think_count = 0
+
+        return {
+            "in_think": in_think,
+            "in_end": in_end,
+            "check_count_down": countdown,
+            "think_count": think_count,
+            "end_count": 0,
+            "prompt_tok_ids": prompt_tok_ids,
+            "output_tok_ids": [],
+            "thinking_token_budget": thinking_token_budget,
+            "prev_output_length": 0,
+            "spec_token_ids": [],
+            "force_index": [],
+            "start_thinking": start_thinking,
+            "end_thinking": -1,
+            "in_spec_mode": False,
+            "bonus_token_forced": False,
+            "continue_thinking": continue_thinking,
+        }
+
+    def _update_think_state(self, state: dict[str, Any]) -> None:
+        if state.get("thinking_token_budget", -1) == -1:
+            return
+        if len(self.think_end_token_ids) == 0:
+            state["thinking_token_budget"] = -1
+            state["in_end"] = False
+            state["force_index"] = []
+            return
+
+        if state["start_thinking"] == -1:
+            start_thinking = self._find_last_sequence_index(state.get("output_tok_ids", []), self.think_start_token_ids)
+            state["start_thinking"] = start_thinking
+        if state["end_thinking"] == -1:
+            end_thinking = self._find_last_sequence_index(state.get("output_tok_ids", []), self.think_end_token_ids)
+            state["end_thinking"] = end_thinking
+
+        if state["start_thinking"] == -1:
+            return
+
+        if state["continue_thinking"]:
+            sampled_tokens_from_previous_step = len(state.get("output_tok_ids", [])) - state.get(
+                "prev_output_length", 0
+            )
+        else:
+            if state["prev_output_length"] == 0:
+                sampled_tokens_from_previous_step = len(state.get("output_tok_ids", [])) - len(
+                    self.think_start_token_ids
+                )
+            else:
+                sampled_tokens_from_previous_step = len(state.get("output_tok_ids", [])) - state["prev_output_length"]
+        current_step_countdown = state["check_count_down"] - sampled_tokens_from_previous_step
+        predicted_countdown = current_step_countdown - len(state["spec_token_ids"]) - 1
+        # We only proceed further if we have counted down the thinking budget
+        # to 0 or less and when we are in the "in think" mode.
+        if not state.get("in_end", False) and predicted_countdown >= 0 and state["start_thinking"] > -1:
+            state["check_count_down"] = current_step_countdown
+            state["prev_output_length"] = len(state.get("output_tok_ids", []))
+            return
+        output = state.get("output_tok_ids", [])
+        if not output:
+            # When in_end was set at init (budget=0, prompt already in think),
+            # we must force the first generated token to be the end token;
+            # otherwise apply() sees in_end=True but force_index=[] and
+            # allows an extra thinking token.
+            if state.get("in_end", False):
+                state["force_index"] = [0]
+            return
+
+        # Track previous output length for incremental processing
+        prev_length = state.get("prev_output_length", 0)
+        current_length = len(output)
+
+        if current_length <= prev_length:
+            if state.get("in_end", False):
+                remaining_budget = state["thinking_token_budget"] - state["think_count"]
+                spec_len = len(state["spec_token_ids"])
+                if spec_len > 0:
+                    if 0 < remaining_budget < spec_len:
+                        state["force_index"] = [remaining_budget]
+                    elif remaining_budget <= 0:
+                        state["force_index"] = [0]
+                    else:
+                        state["force_index"] = [spec_len]
+                else:
+                    state["force_index"] = [0]
+            return
+
+        state["prev_output_length"] = current_length
+
+        start_len = len(self.think_start_token_ids)
+        absolute_start_pos = state["start_thinking"]
+
+        if state["continue_thinking"] and state["end_thinking"] > -1:
+            absolute_end_pos = state["end_thinking"] + len(state.get("prompt_tok_ids") or [])
+        else:
+            absolute_end_pos = state["end_thinking"]
+        # Update state based on recent sequences
+        # This is the case where we are in end mode, but the rejection sampler
+        # rejected a token before the end token,
+        # so we need to go back to think mode and wait for the next end token
+        # eg with 999: [2,4,5,999] -> [3,-1,-1,-1]
+        if state["in_end"] and state["end_count"] == 0:
+            new_tokens = output[prev_length:]
+            stopping_thinking = self.think_end_token_ids[state["end_count"]] in new_tokens
+            if not stopping_thinking:
+                state["in_think"] = True
+                state["in_end"] = False
+                state["end_count"] = 0
+                state["bonus_token_forced"] = False
+
+        if not state["in_end"]:
+            if absolute_start_pos >= 0 and absolute_end_pos >= 0:
+                # Case: ...<end>...<start>... - entering think mode
+                if absolute_start_pos > absolute_end_pos:
+                    new_think_count = current_length - (absolute_start_pos + start_len)
+                    state["in_think"] = True
+                    state["think_count"] = new_think_count
+                else:
+                    # Case: ...<start>...<end>... - exiting think mode
+                    state["in_think"] = False
+                    state["think_count"] = 0
+
+            elif absolute_start_pos >= 0 and not state["continue_thinking"]:
+                # Found think start - entering think mode
+                new_think_count = current_length - (absolute_start_pos + start_len)
+                state["in_think"] = True
+                state["think_count"] = new_think_count
+
+            elif absolute_end_pos >= 0:
+                # Found think end - exiting think mode
+                state["in_think"] = False
+                state["think_count"] = 0
+
+            elif state["in_think"]:
+                # Continue thinking mode, increment count by new tokens
+                prompt_tok_ids = state.get("prompt_tok_ids") or []
+                think_tokens_in_prompt = len(prompt_tok_ids) - (absolute_start_pos + start_len)
+                state["think_count"] = len(state["output_tok_ids"]) + think_tokens_in_prompt
+            if state["in_think"]:
+                remaining_budget = max(0, state["thinking_token_budget"] - state["think_count"])
+                state["check_count_down"] = remaining_budget
+            else:
+                state["check_count_down"] = state["thinking_token_budget"]
+
+            total_thinking_tokens = state["think_count"] + len(state["spec_token_ids"]) + 1
+            # Check if need to transition to end mode
+            # If we have more thinking tokens than the budget,
+            # we need to transition to end mode
+            if state["in_think"] and total_thinking_tokens > state["thinking_token_budget"]:
+                # Calculate force_index: position within spec_token_ids where
+                # forcing starts. If we're already over budget without spec
+                # tokens, force from position 0. Force from the position
+                # where budget is exceeded.
+                state["in_think"] = False
+                state["in_end"] = True
+                state["end_count"] = 0
+                state["check_count_down"] = state["thinking_token_budget"]
+                remaining_budget = state["thinking_token_budget"] - state["think_count"]
+                spec_len = len(state["spec_token_ids"])
+                if 0 < remaining_budget < spec_len:
+                    state["force_index"] = [remaining_budget]
+
+                elif remaining_budget <= 0:
+                    state["force_index"] = [0]
+
+                else:
+                    # remaining_budget >= spec_len: all spec tokens are within
+                    # budget; force the bonus token position
+                    state["force_index"] = [len(state["spec_token_ids"])]
+
+        else:
+            state["force_index"] = []
+            if len(state["spec_token_ids"]) > 0:
+                for i, token_id in enumerate(state["spec_token_ids"]):
+                    if state["end_count"] + 1 < len(self.think_end_token_ids):
+                        if token_id == self.think_end_token_ids[state["end_count"] + 1]:
+                            state["end_count"] += 1
+                        else:
+                            state["end_count"] += 1
+                            state["force_index"] = [i]
+                            break
+                    else:
+                        state["end_count"] += 1
+                if len(state["force_index"]) == 0:
+                    state["end_count"] += 1
+                    state["force_index"] = [len(state["spec_token_ids"])]
+            else:
+                state["end_count"] += 1
+                state["force_index"] = [0]
+            if state["end_count"] >= len(self.think_end_token_ids):
+                state.update(
+                    {
+                        "in_end": False,
+                        "end_count": 0,
+                        "check_count_down": state["thinking_token_budget"],
+                    }
+                )
+
+    def _apply_forcing_to_logits(
+        self,
+        logits: torch.Tensor,
+        predict_bonus_token: bool,
+        spec_token_ids_for_layout: list[list[int]],
+    ) -> torch.Tensor:
+        self.mask[:] = False
+        cumulative_total = 0
+        self.cu_num_tokens.clear()
+
+        n_layout = len(spec_token_ids_for_layout)
+        if self._state:
+            n_layout = max(n_layout, max(self._state.keys()) + 1)
+
+        for index in range(n_layout):
+            self.cu_num_tokens[index] = cumulative_total
+            spec_tokens = spec_token_ids_for_layout[index] if index < len(spec_token_ids_for_layout) else []
+            if self.in_spec_mode:
+                cumulative_total += len(spec_tokens) if not predict_bonus_token else 1
+            else:
+                cumulative_total += 1
+
+        for seq_idx in sorted(self._state.keys()):
+            if seq_idx not in self.cu_num_tokens:
+                continue
+            state = self._state[seq_idx]
+            if state.get("in_end", False):
+                # logits processor in spec mode are called twice
+                # once for bonus token logits and
+                # second time for the target logits
+                # in case the force index is bonus token index
+                # we change the force index to 0
+                if predict_bonus_token:
+                    if state.get("force_index") and state["force_index"][0] < len(state["spec_token_ids"]):
+                        continue
+                    else:
+                        state["force_index"] = [0]
+                # continue enforcing the end thinking tokens
+                if state["end_count"] > 0:
+                    state["bonus_token_forced"] = False
+                if state and not state["bonus_token_forced"]:
+                    force_index = state.get("force_index", [])
+                    if len(force_index) == 0:
+                        continue
+                    end_count = state.get("end_count", 0)
+                    for force_idx in force_index:
+                        if end_count < len(self.think_end_token_ids):
+                            mask_idx = self.cu_num_tokens[seq_idx] + force_idx
+                            if mask_idx < len(self.mask) and mask_idx < logits.shape[0]:
+                                self.mask[mask_idx] = True
+                                self.force_token_ids[mask_idx] = self.think_end_token_ids[end_count]
+                            if predict_bonus_token:
+                                if state["end_count"] > 0:
+                                    state["bonus_token_forced"] = False
+                                    state["force_index"] = []
+                                else:
+                                    state["bonus_token_forced"] = True
+
+        has_active_thinking = any(state.get("in_end", False) for state in self._state.values())
+
+        if has_active_thinking:
+            active_indices = self.mask.nonzero(as_tuple=False).view(-1)
+
+            if len(active_indices) > 0:
+                force_tokens = self.force_token_ids[active_indices]
+                logits[active_indices, force_tokens] = 1e9
+
+        return logits
diff --git a/aphrodite/v1/spec_decode/dflash.py b/aphrodite/v1/spec_decode/dflash.py
index efe790c9f9..624c246713 100644
--- a/aphrodite/v1/spec_decode/dflash.py
+++ b/aphrodite/v1/spec_decode/dflash.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+from dataclasses import replace
 from typing import Any
 
 import torch
@@ -65,6 +65,17 @@ def __init__(
         # For DFlash we use the input embeddings to embed the mask token
         self.parallel_drafting_hidden_state_tensor = None
 
+    @override
+    def _create_draft_aphrodite_config(self) -> AphroditeConfig:
+        base = super()._create_draft_aphrodite_config()
+        return replace(
+            base,
+            attention_config=replace(
+                base.attention_config,
+                use_non_causal=True,
+            ),
+        )
+
     @override
     def _raise_if_multimodal(self):
         # Override to allow multimodal inputs since DFlash supports Qwen3.5 models
diff --git a/aphrodite/v1/spec_decode/llm_base_proposer.py b/aphrodite/v1/spec_decode/llm_base_proposer.py
index c6ba861637..be38cbd541 100644
--- a/aphrodite/v1/spec_decode/llm_base_proposer.py
+++ b/aphrodite/v1/spec_decode/llm_base_proposer.py
@@ -826,13 +826,12 @@ def prepare_next_token_ids_padded(
         is not sampled and comes from `request.get_token_id()` instead. This is denoted
         the "backup" token id. It also counts rejected tokens via `sampled_token_ids`.
         """
-        # Precompute get_token_id for when there is no valid next token
+        # Precompute backup token IDs for discarded requests.
         num_reqs = gpu_input_batch.num_reqs
-        seq_lens_list = (gpu_input_batch.num_tokens_no_spec[:num_reqs] - 1).tolist()
-        self.backup_next_token_ids.np[:num_reqs] = np.array(
-            [requests[gpu_input_batch.req_ids[i]].get_token_id(seq_lens_list[i]) for i in range(num_reqs)],
-            dtype=np.int32,
-        )
+        for i in range(num_reqs):
+            self.backup_next_token_ids.np[i] = requests[gpu_input_batch.req_ids[i]].get_token_id(
+                gpu_input_batch.num_tokens_no_spec[i] - 1
+            )
         self.backup_next_token_ids.copy_to_gpu(num_reqs)
         backup_tokens_gpu = self.backup_next_token_ids.gpu
 
@@ -1173,15 +1172,29 @@ def _create_draft_aphrodite_config(self) -> AphroditeConfig:
         Subclasses may override to apply additional config changes.
         """
         spec_cfg = self.speculative_config
+        base = self.aphrodite_config
+
         if spec_cfg.moe_backend is not None:
-            return replace(
-                self.aphrodite_config,
+            base = replace(
+                base,
                 kernel_config=replace(
-                    self.aphrodite_config.kernel_config,
+                    base.kernel_config,
                     moe_backend=spec_cfg.moe_backend,
                 ),
             )
-        return self.aphrodite_config
+
+        # Note (matt): Never inherit the attention backend from base, because there are
+        # many opportunities for incompatibility, so we always independently autoselect
+        # unless explicitly specified in the speculative config.
+        base = replace(
+            base,
+            attention_config=replace(
+                base.attention_config,
+                backend=spec_cfg.attention_backend,
+            ),
+        )
+
+        return base
 
     def _get_model(self) -> nn.Module:
         """
@@ -1238,6 +1251,7 @@ def load_model(self, target_model: nn.Module) -> None:
                 "Exaone4_5_ForConditionalGeneration",
                 "GlmOcrForConditionalGeneration",
                 "HunYuanVLForConditionalGeneration",
+                "MiMoV2OmniForCausalLM",
                 "Qwen2_5_VLForConditionalGeneration",
                 "Qwen3_5ForConditionalGeneration",
                 "Qwen3_5MoeForConditionalGeneration",
diff --git a/aphrodite/v1/structured_output/__init__.py b/aphrodite/v1/structured_output/__init__.py
index 43e6a15f0c..2d18fe1491 100644
--- a/aphrodite/v1/structured_output/__init__.py
+++ b/aphrodite/v1/structured_output/__init__.py
@@ -37,7 +37,10 @@ class StructuredOutputManager:
 
     def __init__(self, aphrodite_config: AphroditeConfig):
         self.backend: StructuredOutputBackend | None = None
-        self.reasoner: ReasoningParser | None = None
+        # We only store the class of the reasoner in the manager.
+        # The parser instance is request-scoped because some reasoning parsers
+        # depend on per-request chat-template kwargs.
+        self.reasoner_cls: type[ReasoningParser] | None = None
         self.aphrodite_config = aphrodite_config
 
         # When in external_launcher mode, async grammar compilation causes deadlocks
@@ -78,11 +81,25 @@ def __init__(self, aphrodite_config: AphroditeConfig):
 
             reasoning_parser = self.aphrodite_config.structured_outputs_config.reasoning_parser
             if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(reasoning_parser)
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+                self.reasoner_cls = ReasoningParserManager.get_reasoning_parser(reasoning_parser)
 
         self.enable_in_reasoning = self.aphrodite_config.structured_outputs_config.enable_in_reasoning
 
+    def _get_reasoner(self, request: "Request") -> "ReasoningParser | None":
+        structured_req = request.structured_output_request
+        if structured_req is None or self.reasoner_cls is None:
+            return None
+
+        if structured_req.reasoner is None:
+            # Lazily build the request-local parser so the structured-output
+            # gate observes the same template kwargs used by the frontend.
+            parser_kwargs = structured_req.reasoning_parser_kwargs or {}
+            structured_req.reasoner = self.reasoner_cls(
+                tokenizer=self.tokenizer,
+                **parser_kwargs,
+            )
+        return structured_req.reasoner
+
     def grammar_init(self, request: "Request") -> None:
         if request.structured_output_request is None:
             return
@@ -260,7 +277,8 @@ def should_fill_bitmask(self, request: "Request") -> bool:
         # NOTE (Hanchen) if enable_in_reasoning is True, it means that
         # the model needs to be constrained in reasoning. So we should always
         # enable the bitmask filling.
-        if self.reasoner is not None:
+        reasoner = self._get_reasoner(request)
+        if reasoner is not None:
             if self.enable_in_reasoning:
                 return True
             assert request.structured_output_request is not None
@@ -269,7 +287,7 @@ def should_fill_bitmask(self, request: "Request") -> bool:
                 # is an independent code path, it is kept for now.
                 # After unifying the `openai_gptoss` and non-`openai_gptoss` styles,
                 # it can be removed.
-                request.structured_output_request.reasoning_ended = self.reasoner.is_reasoning_end(
+                request.structured_output_request.reasoning_ended = reasoner.is_reasoning_end(
                     request.prompt_token_ids or []
                 )
             return request.structured_output_request.reasoning_ended
@@ -286,7 +304,8 @@ def should_advance(self, request: "Request") -> bool:
             assert request.structured_output_request.grammar is not None
         # by default, we should always advance
         # for cases that don't use thinking mode.
-        if self.reasoner is None:
+        reasoner = self._get_reasoner(request)
+        if reasoner is None:
             return True
 
         # if the model needs structured in reasoning, we should advance
@@ -301,7 +320,7 @@ def should_advance(self, request: "Request") -> bool:
         delta_from = request.num_computed_tokens - request.num_output_placeholders
         all_token_ids = request.all_token_ids
         start = delta_from if delta_from >= 0 else max(len(all_token_ids) + delta_from, 0)
-        if self.reasoner.is_reasoning_end_streaming(all_token_ids, itertools.islice(all_token_ids, start, None)):
+        if reasoner.is_reasoning_end_streaming(all_token_ids, itertools.islice(all_token_ids, start, None)):
             # Reasoning just ended, so we shouldn't advance til
             # next pass
             structured_req.reasoning_ended = True
diff --git a/aphrodite/v1/structured_output/request.py b/aphrodite/v1/structured_output/request.py
index 82d2076736..1ac0f5ca31 100644
--- a/aphrodite/v1/structured_output/request.py
+++ b/aphrodite/v1/structured_output/request.py
@@ -5,7 +5,7 @@
 import json
 from concurrent.futures import Future
 from concurrent.futures._base import TimeoutError
-from typing import cast
+from typing import TYPE_CHECKING, Any, cast
 
 from aphrodite.sampling_params import SamplingParams, StructuredOutputsParams
 from aphrodite.v1.structured_output.backend_types import (
@@ -14,12 +14,19 @@
     StructuredOutputOptions,
 )
 
+if TYPE_CHECKING:
+    from aphrodite.reasoning import ReasoningParser
+
 
 @dataclasses.dataclass
 class StructuredOutputRequest:
     params: StructuredOutputsParams
     _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None
     reasoning_ended: bool | None = None
+    reasoning_parser_kwargs: dict[str, Any] | None = None
+    # Cached per request; do not share reasoning parsers across requests because
+    # their behavior can depend on reasoning_parser_kwargs.
+    reasoner: "ReasoningParser | None" = None
 
     @staticmethod
     def from_sampling_params(
diff --git a/aphrodite/v1/worker/cpu_model_runner.py b/aphrodite/v1/worker/cpu_model_runner.py
index 45412b31ce..c9451f98f7 100644
--- a/aphrodite/v1/worker/cpu_model_runner.py
+++ b/aphrodite/v1/worker/cpu_model_runner.py
@@ -21,6 +21,9 @@
 
 class CPUModelRunner(GPUModelRunner):
     def __init__(self, aphrodite_config: AphroditeConfig, device: torch.device):
+        # avoid calling accelerator APIs for methods inherited from super class
+        _set_torch_accelerator_to_noop()
+
         with _torch_cuda_wrapper():
             super().__init__(aphrodite_config, device)
 
@@ -230,3 +233,11 @@ def _set_global_compilation_settings(config: AphroditeConfig):
         yield
     finally:
         torch_inductor_config.freezing = freezing_value
+
+
+def _set_torch_accelerator_to_noop() -> None:
+    def noop(*args: Any, **kwargs: Any) -> None:
+        pass
+
+    torch.accelerator.synchronize = noop
+    torch.accelerator.empty_cache = noop
diff --git a/aphrodite/v1/worker/gpu/block_table.py b/aphrodite/v1/worker/gpu/block_table.py
index 010655d7cc..5817dc88e9 100644
--- a/aphrodite/v1/worker/gpu/block_table.py
+++ b/aphrodite/v1/worker/gpu/block_table.py
@@ -47,14 +47,7 @@ def __init__(
                 device=device,
             )
             self.block_tables.append(block_table)
-        self.block_table_ptrs = self._make_ptr_tensor([b.gpu for b in self.block_tables])
-        self.block_table_strides = torch.tensor(
-            [b.gpu.stride(0) for b in self.block_tables],
-            dtype=torch.int64,
-            device=self.device,
-        )
 
-        self.block_sizes_tensor = torch.tensor(self.block_sizes, dtype=torch.int32, device=self.device)
         self.num_blocks = UvaBackedTensor(
             (self.num_kv_cache_groups, self.max_num_reqs),
             dtype=torch.int32,
@@ -63,7 +56,6 @@ def __init__(
         # Block tables used for model's forward pass.
         # num_kv_cache_groups x [max_num_reqs, max_num_blocks]
         self.input_block_tables: list[torch.Tensor] = [torch.zeros_like(b.gpu) for b in self.block_tables]
-        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
 
         self.slot_mappings = torch.zeros(
             self.num_kv_cache_groups,
@@ -72,10 +64,27 @@ def __init__(
             device=self.device,
         )
 
+        self.init_block_table_layout_tensors()
+
     def _make_ptr_tensor(self, x: Iterable[torch.Tensor]) -> torch.Tensor:
         # NOTE(woosuk): Use uint64 instead of int64 to cover all possible addresses.
         return torch.tensor([t.data_ptr() for t in x], dtype=torch.uint64, device=self.device)
 
+    def init_block_table_layout_tensors(self) -> None:
+        # Called at init and after a CuMem kv_cache wake-up. The ptr tensors
+        # cache raw data_ptr() values that go stale once the underlying tensors
+        # are reallocated on wake; block_sizes_tensor needs re-populating
+        # because its storage lives under the kv_cache pool tag and comes back
+        # with undefined contents.
+        self.block_table_ptrs = self._make_ptr_tensor([b.gpu for b in self.block_tables])
+        self.block_table_strides = torch.tensor(
+            [b.gpu.stride(0) for b in self.block_tables],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        self.block_sizes_tensor = torch.tensor(self.block_sizes, dtype=torch.int32, device=self.device)
+        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
+
     def append_block_ids(
         self,
         req_index: int,
diff --git a/aphrodite/v1/worker/gpu/cudagraph_utils.py b/aphrodite/v1/worker/gpu/cudagraph_utils.py
index 933bf9881c..0d8b6fad44 100644
--- a/aphrodite/v1/worker/gpu/cudagraph_utils.py
+++ b/aphrodite/v1/worker/gpu/cudagraph_utils.py
@@ -3,12 +3,13 @@
 from collections import defaultdict
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, NamedTuple
 
 import torch
 import torch.nn as nn
 from tqdm import tqdm
 
+from aphrodite.compilation.counter import compilation_counter
 from aphrodite.config import AphroditeConfig
 from aphrodite.config.compilation import CUDAGraphMode
 from aphrodite.distributed.parallel_state import (
@@ -32,6 +33,11 @@
 logger = init_logger(__name__)
 
 
+class CapturedAttentionState(NamedTuple):
+    attn_metadata: dict[str, Any]
+    slot_mappings: dict[str, torch.Tensor]
+
+
 @dataclass(frozen=True)
 class BatchExecutionDescriptor:
     """Describes the shape of the batch and CG mode to run; this is used to make shape
@@ -163,15 +169,19 @@ def needs_capture(self) -> bool:
     @torch.inference_mode()
     def capture(
         self,
-        create_forward_fn: Callable[[BatchExecutionDescriptor], Callable[[CUDAGraphMode], None]],
+        create_forward_fn: Callable[
+            [BatchExecutionDescriptor],
+            tuple[Callable[[CUDAGraphMode], None], CapturedAttentionState],
+        ],
         progress_bar_desc: str = "Capturing CUDA graphs",
-    ) -> None:
+    ) -> dict[BatchExecutionDescriptor, CapturedAttentionState]:
         """Capture CUDA graphs.
 
         Args:
             create_forward_fn: Factory that prepares inputs (OUTSIDE graph) and
-                returns a function that runs forward with a given CUDAGraphMode.
+                returns a tuple of (forward_fn, captured_attn_state).
         """
+        captured_attn_states: dict[BatchExecutionDescriptor, CapturedAttentionState] = {}
         with graph_capture(device=self.device):
             # Capture in order: PIECEWISE first, then FULL. PIECEWISE has larger
             # activations so FULL activations should fit in already allocated
@@ -185,7 +195,8 @@ def capture(
                     descs = tqdm(descs, desc=f"{progress_bar_desc} ({mode.name})")
                 for desc in descs:
                     # Prepare inputs and get forward function
-                    forward_fn = create_forward_fn(desc)
+                    forward_fn, attn_state = create_forward_fn(desc)
+                    captured_attn_states[desc] = attn_state
 
                     # Warmup
                     forward_fn(CUDAGraphMode.NONE)
@@ -208,7 +219,9 @@ def capture(
                             # the next forward pass.
                             get_offloader().join_after_forward()
                         self.graphs[desc] = graph
+                        compilation_counter.num_cudagraph_captured += 1
         self._graphs_captured = True
+        return captured_attn_states
 
     def dispatch(
         self,
@@ -266,13 +279,16 @@ def capture(
         has_lora: bool = False,
         use_aux_hidden_state_outputs: bool = False,
         progress_bar_desc: str = "Capturing CUDA graphs",
-    ) -> None:
+    ) -> dict[BatchExecutionDescriptor, CapturedAttentionState]:
         """Capture CUDA graphs for model forward pass."""
         self.use_aux_hidden_state_outputs = use_aux_hidden_state_outputs
 
         def create_forward_fn(
             desc: BatchExecutionDescriptor,
-        ) -> Callable[[CUDAGraphMode], None]:
+        ) -> tuple[
+            Callable[[CUDAGraphMode], None],
+            CapturedAttentionState,
+        ]:
             num_tokens = desc.num_tokens
             num_reqs = desc.num_reqs or min(num_tokens, self.max_num_reqs)
             num_tokens_across_dp = (
@@ -344,9 +360,9 @@ def forward_fn(cg_mode: CUDAGraphMode) -> None:
                     for k, v in intermediate_tensors.tensors.items():
                         self.intermediate_tensors[k][:num_tokens] = v
 
-            return forward_fn
+            return forward_fn, CapturedAttentionState(attn_metadata, slot_mappings)
 
-        super().capture(create_forward_fn, progress_bar_desc)
+        return super().capture(create_forward_fn, progress_bar_desc)
 
     def run_fullgraph(
         self, desc: BatchExecutionDescriptor
@@ -372,7 +388,7 @@ def prepare_inputs_to_capture(
     block_tables: BlockTables,
     attn_groups: list[list[AttentionGroup]],
     kv_cache_config: KVCacheConfig,
-) -> tuple[dict[str, Any], dict[str, torch.Tensor]]:
+) -> CapturedAttentionState:
     input_batch = InputBatch.make_dummy(num_reqs, num_tokens, input_buffers)
     input_block_tables = block_tables.get_dummy_block_tables(num_reqs)
     slot_mappings = block_tables.get_dummy_slot_mappings(num_tokens)
@@ -399,4 +415,4 @@ def prepare_inputs_to_capture(
         kv_cache_config,
         for_capture=True,
     )
-    return attn_metadata, slot_mappings_by_layer
+    return CapturedAttentionState(attn_metadata, slot_mappings_by_layer)
diff --git a/aphrodite/v1/worker/gpu/kv_connector.py b/aphrodite/v1/worker/gpu/kv_connector.py
index 70ad33db4c..1ae61e553a 100644
--- a/aphrodite/v1/worker/gpu/kv_connector.py
+++ b/aphrodite/v1/worker/gpu/kv_connector.py
@@ -49,7 +49,7 @@ def __init__(self, aphrodite_config: AphroditeConfig, kv_caches_dict: dict[str,
         self.kv_connector = get_kv_transfer_group()
         # Register kv caches with KV Connector if applicable.
         # TODO: support cross_layers_kv_cache
-        # (see https://github.com/vllm-project/vllm/pull/27743)
+        # (see https://github.com/vllm-project/vllm/pull/27743)
         self.kv_connector.register_kv_caches(kv_caches_dict)
         self.kv_connector.set_host_xfer_buffer_ops(copy_kv_blocks)
 
diff --git a/aphrodite/v1/worker/gpu/mm/rope.py b/aphrodite/v1/worker/gpu/mm/rope.py
index 0099dd33b2..b5f01670a1 100644
--- a/aphrodite/v1/worker/gpu/mm/rope.py
+++ b/aphrodite/v1/worker/gpu/mm/rope.py
@@ -20,7 +20,7 @@ class RopeState:
     NOTE: `positions` is implemented with one additional dummy position on
     purpose to make it non-contiguous so that it can work with torch compile.
     See detailed explanation in
-    https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
+    https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
 
     NOTE: When M-RoPE is enabled, position ids are 3D regardless of the
     modality of inputs. For text-only inputs, each dimension has identical
diff --git a/aphrodite/v1/worker/gpu/model_runner.py b/aphrodite/v1/worker/gpu/model_runner.py
index 290a02876f..82ffb05c4b 100644
--- a/aphrodite/v1/worker/gpu/model_runner.py
+++ b/aphrodite/v1/worker/gpu/model_runner.py
@@ -27,6 +27,7 @@
 import torch
 import torch.nn as nn
 
+from aphrodite.compilation.counter import compilation_counter
 from aphrodite.config import AphroditeConfig
 from aphrodite.config.compilation import CUDAGraphMode
 from aphrodite.distributed.parallel_state import (
@@ -265,8 +266,7 @@ def load_model(self, load_dummy_weights: bool = False, *args, **kwargs) -> None:
             logger.info("Loading model from scratch...")
 
             self.model = model_loader.load_model(
-                aphrodite_config=self.aphrodite_config,
-                model_config=self.aphrodite_config.model_config,
+                aphrodite_config=self.aphrodite_config, model_config=self.aphrodite_config.model_config
             )
             if self.lora_config:
                 self.model = self.load_lora_model(self.model, self.aphrodite_config, self.device)
@@ -413,7 +413,7 @@ def _dummy_run(
             # HACK(lucas): for now since the worker is shared between MRV1 and MRV2,
             # and for spec-decode with MTP we want to make sure the dummy runs use
             # 1+num_speculative_tokens we use max here, this will likely be eventually
-            # changed in the worker: https://github.com/vllm-project/vllm/pull/35243
+            # changed in the worker: https://github.com/vllm-project/vllm/pull/35243
             num_tokens = max(num_tokens, self.decode_query_len)
             num_reqs = num_tokens // self.decode_query_len
             assert num_tokens % self.decode_query_len == 0
@@ -534,6 +534,9 @@ def profile_run(self) -> None:
         del hidden_states, sample_hidden_states
         gc.collect()
 
+    def post_kv_cache_wake_up(self) -> None:
+        self.block_tables.init_block_table_layout_tensors()
+
     def reset_mm_cache(self) -> None:
         if self.encoder_cache is not None:
             self.encoder_cache.reset_mm_cache()
@@ -560,13 +563,15 @@ def capture_model(self) -> int:
             )
             return 0
 
+        compilation_counter.num_gpu_runner_capture_triggers += 1
+
         start_time = time.perf_counter()
         gc.collect()
         torch.accelerator.empty_cache()
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         with self.maybe_setup_dummy_loras(self.lora_config):
-            self.cudagraph_manager.capture(
+            captured_attn_states = self.cudagraph_manager.capture(
                 self.model,
                 self.model_state,
                 self.input_buffers,
@@ -578,7 +583,7 @@ def capture_model(self) -> int:
                 use_aux_hidden_state_outputs=self.use_aux_hidden_state_outputs,
             )
             if self.speculator is not None:
-                self.speculator.capture_model()
+                self.speculator.capture(captured_attn_states)
 
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]
@@ -779,7 +784,6 @@ def prepare_inputs(self, scheduler_output: SchedulerOutput, batch_desc: BatchExe
             out=seq_lens_cpu_upper_bound_np[:num_reqs],
         )
         seq_lens_cpu_upper_bound = torch.from_numpy(seq_lens_cpu_upper_bound_np)
-
         return InputBatch(
             req_ids=req_ids,
             num_reqs=num_reqs,
@@ -901,6 +905,8 @@ def postprocess(
         computed_prefill = self.req_states.num_computed_prefill_tokens
         computed_prefill[idx_mapping_np] += input_batch.num_scheduled_tokens
         np.minimum(computed_prefill, self.req_states.prefill_len.np, out=computed_prefill)
+        # Advance the CPU mirror optimistically (assume all scheduled accepted).
+        self.req_states.num_computed_tokens_np[idx_mapping_np] += input_batch.num_scheduled_tokens
 
     @torch.inference_mode()
     def execute_model(
@@ -1264,6 +1270,8 @@ def postprocess_pool(self, input_batch: InputBatch) -> None:
         computed_prefill = self.req_states.num_computed_prefill_tokens
         computed_prefill[idx_mapping_np] += input_batch.num_scheduled_tokens
         np.minimum(computed_prefill, self.req_states.prefill_len.np, out=computed_prefill)
+        # Advance the CPU mirror optimistically (assume all scheduled accepted).
+        self.req_states.num_computed_tokens_np[idx_mapping_np] += input_batch.num_scheduled_tokens
 
     ########### EPLB methods start ###########
     @property
diff --git a/aphrodite/v1/worker/gpu/model_states/default.py b/aphrodite/v1/worker/gpu/model_states/default.py
index 95609019de..ee483b63f4 100644
--- a/aphrodite/v1/worker/gpu/model_states/default.py
+++ b/aphrodite/v1/worker/gpu/model_states/default.py
@@ -67,9 +67,7 @@ def get_supported_generation_tasks(self) -> tuple[GenerationTask, ...]:
             supports_realtime,
             supports_transcription,
         )
-        from aphrodite.model_executor.models.interfaces_base import (
-            is_text_generation_model,
-        )
+        from aphrodite.model_executor.models.interfaces_base import is_text_generation_model
 
         supported_tasks = list[GenerationTask]()
 
diff --git a/aphrodite/v1/worker/gpu/sample/logprob.py b/aphrodite/v1/worker/gpu/sample/logprob.py
index abf7f7b1e7..08ce63d56b 100644
--- a/aphrodite/v1/worker/gpu/sample/logprob.py
+++ b/aphrodite/v1/worker/gpu/sample/logprob.py
@@ -1,10 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import numpy as np
 import torch
 
+from aphrodite.sampling_params import MAX_LOGPROB_TOKEN_IDS, SamplingParams
 from aphrodite.triton_utils import tl, triton
 from aphrodite.v1.outputs import LogprobsTensors
+from aphrodite.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
 
 
 @triton.jit
@@ -73,6 +76,9 @@ def _ranks_kernel(
 
 
 def compute_token_logprobs(logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+    # NOTE(woosuk): To save GPU memory, we do not materialize the full
+    # [batch_size, vocab_size] logprobs tensor. The kernel computes
+    # max + logsumexp per row and only emits logprobs at `token_ids`.
     batch_size, vocab_size = logits.shape
     token_ids = token_ids.to(torch.int64)
     num_logprobs = token_ids.shape[1]
@@ -95,18 +101,52 @@ def compute_topk_logprobs(
     num_logprobs: int,
     sampled_token_ids: torch.Tensor,
     cu_num_logits: list[int] | None = None,
+    logprob_token_ids_state: "LogprobTokenIdsState | None" = None,
+    expanded_idx_mapping: torch.Tensor | None = None,
+    max_per_req_token_ids: int = 0,
 ) -> LogprobsTensors:
     assert num_logprobs >= 0
     batch_size, vocab_size = logits.shape
-    logprob_token_ids = sampled_token_ids.unsqueeze(-1)
-    if num_logprobs > 0:
-        topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
-        logprob_token_ids = torch.cat((logprob_token_ids, topk_indices), dim=1)
-
-    # NOTE(woosuk): Here, to save GPU memory, we do not materialize the full
-    # logprobs tensor. Instead, we only compute and return the logprobs of
-    # the topk + 1 tokens.
-    logprobs = compute_token_logprobs(logits, logprob_token_ids)
+
+    if max_per_req_token_ids == 0:
+        # Fast path: no request asked for custom logprob_token_ids.
+        logprob_token_ids = sampled_token_ids.unsqueeze(-1)
+        if num_logprobs > 0:
+            topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
+            logprob_token_ids = torch.cat((logprob_token_ids, topk_indices), dim=1)
+        logprobs = compute_token_logprobs(logits, logprob_token_ids)
+    else:
+        # Some requests specified logprob_token_ids. Build the [batch_size,
+        # 1 + max_cols] token_ids matrix and validity mask on the GPU via a
+        # single triton kernel, overriding the topk columns with per-request
+        # tokens where applicable.
+        assert logprob_token_ids_state is not None
+        assert expanded_idx_mapping is not None
+        topk_indices = None
+        if num_logprobs > 0:
+            topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices
+
+        num_cols = max(num_logprobs, max_per_req_token_ids)
+        logprob_token_ids = sampled_token_ids.new_zeros((batch_size, 1 + num_cols))
+        valid_mask = torch.zeros_like(logprob_token_ids, dtype=torch.bool)
+        _fill_logprob_token_ids_kernel[(batch_size,)](
+            logprob_token_ids,
+            logprob_token_ids.stride(0),
+            valid_mask,
+            valid_mask.stride(0),
+            sampled_token_ids,
+            topk_indices if topk_indices is not None else logprob_token_ids,
+            topk_indices.stride(0) if topk_indices is not None else 0,
+            expanded_idx_mapping,
+            logprob_token_ids_state.num_token_ids.gpu,
+            logprob_token_ids_state.token_ids.gpu,
+            logprob_token_ids_state.token_ids.gpu.stride(0),
+            NUM_TOPK=num_logprobs,
+            PADDED_COLS=triton.next_power_of_2(num_cols),
+        )
+        logprobs = compute_token_logprobs(logits, logprob_token_ids)
+        logprobs = logprobs.masked_fill(~valid_mask, float("-inf"))
+
     token_ranks = torch.empty(batch_size, dtype=torch.int64, device=logits.device)
     _ranks_kernel[(batch_size,)](
         token_ranks,
@@ -122,3 +162,85 @@ def compute_topk_logprobs(
         selected_token_ranks=token_ranks,
         cu_num_generated_tokens=cu_num_logits,
     )
+
+
+@triton.jit
+def _fill_logprob_token_ids_kernel(
+    # [batch_size, 1 + num_cols]
+    out_token_ids_ptr,
+    out_token_ids_stride,
+    # [batch_size, 1 + num_cols]
+    out_valid_mask_ptr,
+    out_valid_mask_stride,
+    sampled_token_ids_ptr,  # [batch_size]
+    topk_indices_ptr,  # [batch_size, NUM_TOPK] (unused when NUM_TOPK == 0)
+    topk_indices_stride,
+    expanded_idx_mapping_ptr,  # [batch_size] -> req_state_idx
+    num_per_req_token_ids_ptr,  # [max_num_reqs]
+    per_req_token_ids_ptr,  # [max_num_reqs, MAX_LOGPROB_TOKEN_IDS]
+    per_req_token_ids_stride,
+    NUM_TOPK: tl.constexpr,
+    PADDED_COLS: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+
+    # Column 0: always the sampled token, always valid.
+    sampled = tl.load(sampled_token_ids_ptr + batch_idx)
+    tl.store(out_token_ids_ptr + batch_idx * out_token_ids_stride, sampled)
+    tl.store(out_valid_mask_ptr + batch_idx * out_valid_mask_stride, 1)
+
+    req_state_idx = tl.load(expanded_idx_mapping_ptr + batch_idx)
+    num_custom = tl.load(num_per_req_token_ids_ptr + req_state_idx)
+
+    col = tl.arange(0, PADDED_COLS)
+    tid_base = out_token_ids_ptr + batch_idx * out_token_ids_stride + 1
+    mask_base = out_valid_mask_ptr + batch_idx * out_valid_mask_stride + 1
+
+    if num_custom > 0:
+        # Override topk with per-request custom tokens.
+        src = per_req_token_ids_ptr + req_state_idx * per_req_token_ids_stride
+        valid = col < num_custom
+        # per_req_token_ids is int32; output is int64.
+        tokens = tl.load(src + col, mask=valid, other=0).to(tl.int64)
+    else:
+        # Fill with topk indices (no-op when NUM_TOPK == 0).
+        src = topk_indices_ptr + batch_idx * topk_indices_stride
+        valid = col < NUM_TOPK
+        tokens = tl.load(src + col, mask=valid, other=0)
+
+    tl.store(tid_base + col, tokens, mask=valid)
+    tl.store(mask_base + col, tl.full([PADDED_COLS], 1, tl.int1), mask=valid)
+
+
+class LogprobTokenIdsState:
+    """Per-request override of which token ids' logprobs to return.
+
+    See `SamplingParams.logprob_token_ids`.
+    """
+
+    def __init__(self, max_num_reqs: int, device: torch.device):
+        self.max_num_reqs = max_num_reqs
+        self.num_token_ids = UvaBackedTensor(max_num_reqs, dtype=torch.int32)
+        self.token_ids = StagedWriteTensor(
+            (max_num_reqs, MAX_LOGPROB_TOKEN_IDS),
+            dtype=torch.int32,
+            device=device,
+        )
+
+    def add_request(self, req_idx: int, sampling_params: SamplingParams) -> None:
+        token_ids = sampling_params.logprob_token_ids
+        if not token_ids:
+            self.num_token_ids.np[req_idx] = 0
+            return
+        n = len(token_ids)
+        if n > MAX_LOGPROB_TOKEN_IDS:
+            raise ValueError(f"Too many logprob_token_ids: {n}. The max is {MAX_LOGPROB_TOKEN_IDS}.")
+        self.num_token_ids.np[req_idx] = n
+        self.token_ids.stage_write(req_idx, 0, token_ids)
+
+    def apply_staged_writes(self) -> None:
+        self.num_token_ids.copy_to_uva()
+        self.token_ids.apply_write()
+
+    def max_num_token_ids(self, idx_mapping_np: np.ndarray) -> int:
+        return int(self.num_token_ids.np[idx_mapping_np].max(initial=0))
diff --git a/aphrodite/v1/worker/gpu/sample/sampler.py b/aphrodite/v1/worker/gpu/sample/sampler.py
index 685407f1d6..ddd1ecc9a8 100644
--- a/aphrodite/v1/worker/gpu/sample/sampler.py
+++ b/aphrodite/v1/worker/gpu/sample/sampler.py
@@ -12,7 +12,10 @@
 from aphrodite.v1.worker.gpu.sample.bad_words import BadWordsState
 from aphrodite.v1.worker.gpu.sample.gumbel import gumbel_sample
 from aphrodite.v1.worker.gpu.sample.logit_bias import LogitBiasState
-from aphrodite.v1.worker.gpu.sample.logprob import compute_topk_logprobs
+from aphrodite.v1.worker.gpu.sample.logprob import (
+    LogprobTokenIdsState,
+    compute_topk_logprobs,
+)
 from aphrodite.v1.worker.gpu.sample.output import SamplerOutput
 from aphrodite.v1.worker.gpu.sample.penalties import PenaltiesState
 from aphrodite.v1.worker.gpu.sample.states import NO_LOGPROBS, SamplingStates
@@ -38,6 +41,7 @@ def __init__(
         self.penalties_state = PenaltiesState(req_states)
         self.logit_bias_state = LogitBiasState(max_num_reqs, device)
         self.bad_words_state = BadWordsState(req_states)
+        self.logprob_token_ids_state = LogprobTokenIdsState(max_num_reqs, device)
         self.num_speculative_tokens = num_speculative_tokens
 
     def add_request(self, req_idx: int, prompt_len: int, sampling_params: SamplingParams) -> None:
@@ -45,12 +49,14 @@ def add_request(self, req_idx: int, prompt_len: int, sampling_params: SamplingPa
         self.penalties_state.add_request(req_idx, sampling_params)
         self.logit_bias_state.add_request(req_idx, prompt_len, sampling_params)
         self.bad_words_state.add_request(req_idx, sampling_params)
+        self.logprob_token_ids_state.add_request(req_idx, sampling_params)
 
     def apply_staged_writes(self) -> None:
         self.sampling_states.apply_staged_writes()
         self.penalties_state.apply_staged_writes()
         self.logit_bias_state.apply_staged_writes()
         self.bad_words_state.apply_staged_writes()
+        self.logprob_token_ids_state.apply_staged_writes()
 
     def __call__(
         self,
@@ -77,12 +83,22 @@ def __call__(
         )
 
         max_num_logprobs = self.sampling_states.max_num_logprobs(idx_mapping_np)
-        if max_num_logprobs != NO_LOGPROBS:
+        max_per_req_token_ids = self.logprob_token_ids_state.max_num_token_ids(idx_mapping_np)
+        if max_num_logprobs != NO_LOGPROBS or max_per_req_token_ids > 0:
             if self.logprobs_mode == "processed_logprobs":
                 logits = processed_logits
             expanded_logits = logits.shape[0] != idx_mapping_np.shape[0]
             cu_num_logits = cu_num_logits_np.tolist() if expanded_logits else None
-            logprobs_tensors = compute_topk_logprobs(logits, max_num_logprobs, sampled, cu_num_logits)
+            num_logprobs = max_num_logprobs if max_num_logprobs != NO_LOGPROBS else 0
+            logprobs_tensors = compute_topk_logprobs(
+                logits,
+                num_logprobs,
+                sampled,
+                cu_num_logits,
+                logprob_token_ids_state=self.logprob_token_ids_state,
+                expanded_idx_mapping=input_batch.expanded_idx_mapping,
+                max_per_req_token_ids=max_per_req_token_ids,
+            )
         else:
             logprobs_tensors = None
 
diff --git a/aphrodite/v1/worker/gpu/spec_decode/eagle/cudagraph.py b/aphrodite/v1/worker/gpu/spec_decode/eagle/cudagraph.py
index 4198836d44..ce1a6bfd2f 100644
--- a/aphrodite/v1/worker/gpu/spec_decode/eagle/cudagraph.py
+++ b/aphrodite/v1/worker/gpu/spec_decode/eagle/cudagraph.py
@@ -10,6 +10,7 @@
 from aphrodite.v1.worker.gpu.block_table import BlockTables
 from aphrodite.v1.worker.gpu.cudagraph_utils import (
     BatchExecutionDescriptor,
+    CapturedAttentionState,
     CudaGraphManager,
     prepare_inputs_to_capture,
 )
@@ -18,8 +19,8 @@
 from aphrodite.v1.worker.utils import AttentionGroup
 
 
-class EagleCudaGraphManager(CudaGraphManager):
-    """CudaGraphManager for Eagle speculative decoding."""
+class EagleCudaGraphManagerBase(CudaGraphManager):
+    """Base CudaGraphManager for Eagle with a dedicated graph pool."""
 
     def __init__(
         self,
@@ -37,6 +38,44 @@ def __init__(
         if cudagraph_mode:
             self.pool = torch.cuda.graph_pool_handle()
 
+
+class PrefillEagleCudaGraphManager(EagleCudaGraphManagerBase):
+    """Eagle CudaGraphManager for prefill, using pre-built attention states
+    from the target model's capture."""
+
+    def capture(
+        self,
+        forward_fn: Callable,
+        full_cg_attn_states: dict[BatchExecutionDescriptor, CapturedAttentionState],
+        progress_bar_desc: str = "Capturing CUDA graphs",
+    ) -> None:
+        def create_forward_fn(
+            desc: BatchExecutionDescriptor,
+        ) -> tuple[Callable[[CUDAGraphMode], None], CapturedAttentionState]:
+            num_tokens = desc.num_tokens
+            num_reqs = desc.num_reqs or min(num_tokens, self.max_num_reqs)
+            num_tokens_across_dp = (
+                torch.full((self.dp_size,), num_tokens, dtype=torch.int32, device="cpu") if self.dp_size > 1 else None
+            )
+            attn_state = full_cg_attn_states[desc]
+            attn_metadata, slot_mappings = attn_state
+            fwd = lambda cg_mode: forward_fn(
+                num_reqs,
+                num_tokens,
+                attn_metadata,
+                slot_mappings,
+                num_tokens_across_dp,
+                cg_mode,
+            )
+            return fwd, attn_state
+
+        super().capture(create_forward_fn, progress_bar_desc)
+
+
+class DecodeEagleCudaGraphManager(EagleCudaGraphManagerBase):
+    """Eagle CudaGraphManager for decode draft generation, building its own
+    attention metadata from scratch."""
+
     def capture(
         self,
         forward_fn: Callable,
@@ -47,17 +86,15 @@ def capture(
         kv_cache_config: KVCacheConfig,
         progress_bar_desc: str = "Capturing CUDA graphs",
     ) -> None:
-        """Capture CUDA graphs for Eagle."""
-
         def create_forward_fn(
             desc: BatchExecutionDescriptor,
-        ) -> Callable[[CUDAGraphMode], None]:
+        ) -> tuple[Callable[[CUDAGraphMode], None], CapturedAttentionState]:
             num_tokens = desc.num_tokens
             num_reqs = desc.num_reqs or min(num_tokens, self.max_num_reqs)
             num_tokens_across_dp = (
                 torch.full((self.dp_size,), num_tokens, dtype=torch.int32, device="cpu") if self.dp_size > 1 else None
             )
-            attn_metadata, slot_mappings = prepare_inputs_to_capture(
+            attn_state = prepare_inputs_to_capture(
                 num_reqs,
                 num_tokens,
                 model_state,
@@ -66,8 +103,9 @@ def create_forward_fn(
                 attn_groups,
                 kv_cache_config,
             )
+            attn_metadata, slot_mappings = attn_state
 
-            return lambda cg_mode: forward_fn(
+            fwd = lambda cg_mode: forward_fn(
                 num_reqs,
                 num_tokens,
                 attn_metadata,
@@ -75,5 +113,6 @@ def create_forward_fn(
                 num_tokens_across_dp,
                 cg_mode,
             )
+            return fwd, attn_state
 
         super().capture(create_forward_fn, progress_bar_desc)
diff --git a/aphrodite/v1/worker/gpu/spec_decode/eagle/speculator.py b/aphrodite/v1/worker/gpu/spec_decode/eagle/speculator.py
index 49b8dedc53..01a46a3595 100644
--- a/aphrodite/v1/worker/gpu/spec_decode/eagle/speculator.py
+++ b/aphrodite/v1/worker/gpu/spec_decode/eagle/speculator.py
@@ -20,6 +20,8 @@
 )
 from aphrodite.v1.worker.gpu.block_table import BlockTables
 from aphrodite.v1.worker.gpu.cudagraph_utils import (
+    BatchExecutionDescriptor,
+    CapturedAttentionState,
     get_uniform_token_count,
 )
 from aphrodite.v1.worker.gpu.dp_utils import dispatch_cg_and_sync_dp
@@ -27,7 +29,8 @@
 from aphrodite.v1.worker.gpu.model_states.interface import ModelState
 from aphrodite.v1.worker.gpu.sample.gumbel import gumbel_sample
 from aphrodite.v1.worker.gpu.spec_decode.eagle.cudagraph import (
-    EagleCudaGraphManager,
+    DecodeEagleCudaGraphManager,
+    PrefillEagleCudaGraphManager,
 )
 from aphrodite.v1.worker.gpu.spec_decode.eagle.utils import load_eagle_model
 
@@ -96,13 +99,13 @@ def __init__(self, aphrodite_config: AphroditeConfig, device: torch.device):
                 device=device,
             )
 
-        self.prefill_cudagraph_manager: EagleCudaGraphManager | None = None
-        self.decode_cudagraph_manager: EagleCudaGraphManager | None = None
+        self.prefill_cudagraph_manager: PrefillEagleCudaGraphManager | None = None
+        self.decode_cudagraph_manager: DecodeEagleCudaGraphManager | None = None
 
     def init_cudagraph_manager(self, cudagraph_mode: CUDAGraphMode) -> None:
         cudagraph_mode = self.aphrodite_config.compilation_config.cudagraph_mode
         # Initialize cudagraph manager for draft prefill (draft position 0).
-        self.prefill_cudagraph_manager = EagleCudaGraphManager(
+        self.prefill_cudagraph_manager = PrefillEagleCudaGraphManager(
             self.aphrodite_config,
             self.device,
             cudagraph_mode,
@@ -119,7 +122,7 @@ def init_cudagraph_manager(self, cudagraph_mode: CUDAGraphMode) -> None:
             cudagraph_mode = CUDAGraphMode.NONE
 
         # Initialize cudagraph manager for draft decodes (draft positions > 0).
-        self.decode_cudagraph_manager = EagleCudaGraphManager(
+        self.decode_cudagraph_manager = DecodeEagleCudaGraphManager(
             self.aphrodite_config,
             self.device,
             cudagraph_mode,
@@ -335,7 +338,10 @@ def _build_draft_attn_metadata(
         )
         return attn_metadata
 
-    def capture_model(self) -> None:
+    def capture(
+        self,
+        attn_states: dict[BatchExecutionDescriptor, CapturedAttentionState],
+    ) -> None:
         logger.info("Capturing model for Eagle speculator...")
         # Reset indices to zeros to prevent stale values from prior
         # dummy runs to cause out-of-bounds indexing during capture.
@@ -349,11 +355,7 @@ def capture_model(self) -> None:
         assert self.prefill_cudagraph_manager is not None
         self.prefill_cudagraph_manager.capture(
             self.prefill,
-            self.model_state,
-            self.input_buffers,
-            self.block_tables,
-            self.attn_groups,
-            self.kv_cache_config,
+            attn_states,
             progress_bar_desc="Capturing eagle prefill CUDA graphs",
         )
 
@@ -456,15 +458,6 @@ def propose(
         )
 
         if prefill_batch_desc.cg_mode == CUDAGraphMode.FULL:
-            # It is necessary to rebuild the attention metadata when
-            # replaying the FULL graph so that any attention metadata
-            # builder state is updated.
-            self._build_draft_attn_metadata(
-                num_reqs=num_reqs,
-                num_reqs_padded=prefill_batch_desc.num_reqs or num_reqs,
-                num_tokens_padded=prefill_batch_desc.num_tokens,
-                max_query_len=self.num_speculative_steps + 1,
-            )
             # Replay the full graph for draft prefill.
             assert self.prefill_cudagraph_manager is not None
             self.prefill_cudagraph_manager.run_fullgraph(prefill_batch_desc)
diff --git a/aphrodite/v1/worker/gpu_input_batch.py b/aphrodite/v1/worker/gpu_input_batch.py
index 33e217d490..0fdb292b04 100644
--- a/aphrodite/v1/worker/gpu_input_batch.py
+++ b/aphrodite/v1/worker/gpu_input_batch.py
@@ -8,6 +8,7 @@
 import numpy as np
 import torch
 
+from aphrodite.config.reasoning import ReasoningConfig
 from aphrodite.lora.request import LoRARequest
 from aphrodite.multimodal.inputs import MultiModalFeatureSpec
 from aphrodite.pooling_params import PoolingParams
@@ -23,6 +24,9 @@
 )
 from aphrodite.v1.sample.metadata import SamplingMetadata
 from aphrodite.v1.sample.ops.dry import init_dry_state
+from aphrodite.v1.sample.thinking_budget_state import (
+    maybe_create_thinking_budget_state_holder,
+)
 from aphrodite.v1.utils import copy_slice
 from aphrodite.v1.worker.block_table import MultiGroupBlockTable
 
@@ -49,6 +53,10 @@ class CachedRequestState:
     lora_request: LoRARequest | None = None
     prompt_embeds: torch.Tensor | None = None
 
+    # Per-position mask for mixed-mode inputs (e.g. chat completion with
+    # prompt_embeds content parts). See `Request.prompt_is_token_ids`.
+    prompt_is_token_ids: list[bool] | None = None
+
     # Used when both async_scheduling and spec_decode are enabled.
     prev_num_draft_len: int = 0
 
@@ -96,12 +104,20 @@ def __init__(
         max_num_blocks_per_req: list[int] | None = None,
         logitsprocs: LogitsProcessors | None = None,
         logitsprocs_need_output_token_ids: bool = False,
-        is_spec_decode: bool = False,
+        num_spec_tokens: int = 0,
         is_pooling_model: bool = False,
         cp_kv_cache_interleave_size: int = 1,
+        reasoning_config: ReasoningConfig | None = None,
     ):
+        self.thinking_budget_state_holder = maybe_create_thinking_budget_state_holder(
+            reasoning_config,
+            max_num_reqs,
+            num_spec_tokens,
+            device,
+            pin_memory,
+        )
+        self.thinking_token_budget_reqs: set[str] = set()
         self.is_pooling_model = is_pooling_model
-        self.is_spec_decode = is_spec_decode
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
         self.max_num_batched_tokens = max_num_batched_tokens
@@ -484,7 +500,10 @@ def add_request(
         end_idx = start_idx + len(request.output_token_ids)
         if request.prompt_token_ids is not None:
             self.token_ids_cpu[req_index, :num_prompt_tokens] = request.prompt_token_ids
-            self.is_token_ids[req_index, :num_prompt_tokens] = True
+            if request.prompt_is_token_ids is not None:
+                self.is_token_ids[req_index, :num_prompt_tokens] = request.prompt_is_token_ids
+            else:
+                self.is_token_ids[req_index, :num_prompt_tokens] = True
         else:
             self.is_token_ids[req_index, :num_prompt_tokens] = False
         if request.prompt_embeds is not None:
@@ -750,6 +769,7 @@ def remove_request(self, req_id: str) -> int | None:
             # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         self.bad_words_token_ids.pop(req_index, None)
+        self.thinking_token_budget_reqs.discard(req_id)
         self.logit_bias.pop(req_index, None)
         self.dry_sequence_breaker_ids.pop(req_index, None)
         self.persistent_data.pop(req_index, None)
@@ -1116,6 +1136,8 @@ def refresh_metadata(self):
         # reset batch update tracking.
         # Update sampling metadata if batch state is changed.
         batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
+        if self.thinking_budget_state_holder is not None and batch_update:
+            self.thinking_budget_state_holder.sync_batch(batch_update)
         for logit_proc in self.logitsprocs.all:
             logit_proc.update_state(batch_update)
         if batch_update:
@@ -1219,12 +1241,15 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
 
         # Only set output_token_ids if required by the current requests'
         # sampling parameters.
+        holder = self.thinking_budget_state_holder
+        thinking_budget_tracks_reqs = holder is not None and holder.has_tracked_requests()
         needs_output_token_ids = (
             not self.no_penalties
             or not self.no_dry
             or not self.no_no_repeat_ngram
             or bool(self.bad_words_token_ids)
             or self.logitsprocs_need_output_token_ids
+            or thinking_budget_tracks_reqs  # required only while a budget is tracked
         )
         output_token_ids = cast(list[list[int]], self.req_output_token_ids) if needs_output_token_ids else []
         output_token_ids_tensor = None
@@ -1312,6 +1337,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
             bad_words_token_ids=self.bad_words_token_ids,
             logit_bias=self.logit_bias,
             logitsprocs=self.logitsprocs,
+            thinking_budget_state_holder=self.thinking_budget_state_holder,
             temperature_last=self.temperature_last[:num_reqs],
             persistent_data=self.persistent_data,
         )
@@ -1563,6 +1589,10 @@ def no_penalties(self) -> bool:
             and len(self.repetition_penalties_reqs) == 0
         )
 
+    @property
+    def no_thinking_budget(self) -> bool:
+        return self.thinking_budget_state_holder is None or len(self.thinking_token_budget_reqs) == 0
+
     @property
     def max_num_logprobs(self) -> int | None:
         return max(self.num_logprobs.values()) if self.num_logprobs else None
diff --git a/aphrodite/v1/worker/gpu_model_runner.py b/aphrodite/v1/worker/gpu_model_runner.py
index 6397ee68b1..7e4d8530bd 100644
--- a/aphrodite/v1/worker/gpu_model_runner.py
+++ b/aphrodite/v1/worker/gpu_model_runner.py
@@ -593,7 +593,7 @@ def __init__(
             vocab_size=self.model_config.get_vocab_size(),
             block_sizes=[placeholder_block_size],
             kernel_block_sizes=[placeholder_block_size],
-            is_spec_decode=bool(self.aphrodite_config.speculative_config),
+            num_spec_tokens=self.num_spec_tokens,
             logitsprocs=build_logitsprocs(
                 self.aphrodite_config,
                 self.device,
@@ -609,6 +609,7 @@ def __init__(
             or self.aphrodite_config.reasoning_config is not None,
             is_pooling_model=self.is_pooling_model,
             cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size,
+            reasoning_config=self.aphrodite_config.reasoning_config,
         )
 
         # Separate cuda stream for overlapping transfer of sampled token ids from
@@ -814,6 +815,9 @@ def reset_encoder_cache(self) -> None:
         self.encoder_cache.clear()
         self.late_interaction_runner.clear()
 
+    def post_kv_cache_wake_up(self) -> None:
+        self.init_fp8_kv_scales()
+
     @torch.inference_mode()
     def init_fp8_kv_scales(self) -> None:
         """
@@ -1077,6 +1081,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> Callable | None
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
                 prompt_embeds=new_req_data.prompt_embeds,
+                prompt_is_token_ids=new_req_data.prompt_is_token_ids,
                 mm_features=new_req_data.mm_features,
                 sampling_params=sampling_params,
                 pooling_params=pooling_params,
@@ -1402,9 +1407,13 @@ def _init_mrope_positions(self, req_state: CachedRequestState):
         assert req_state.prompt_token_ids is not None, "M-RoPE requires prompt_token_ids to be available."
         mrope_model = cast(SupportsMRoPE, model)
 
+        # `prompt_embeds` is a passthrough modality with no grid_thw, but models'
+        # M-RoPE code assumes per-feature grid info, so filter it out. The
+        # prompt_embeds positions are treated as text positions for M-RoPE.
+        mrope_features = [f for f in req_state.mm_features if f.modality != "prompt_embeds"]
         req_state.mrope_positions, req_state.mrope_position_delta = mrope_model.get_mrope_input_positions(
             req_state.prompt_token_ids,
-            req_state.mm_features,
+            mrope_features,
         )
 
     def _init_xdrope_positions(self, req_state: CachedRequestState):
@@ -2536,6 +2545,27 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput") -> list[torch
         if not mm_kwargs:
             return []
 
+        # `prompt_embeds` is a passthrough modality: the tensor is already in
+        # the model embedding space, so no encoder runs. Inject each
+        # `prompt_embeds` tensor directly into the encoder cache here so that
+        # `_gather_mm_embeddings` can splice it via the standard `is_mm_embed`
+        # path.
+        pe_indices = [i for i, (modality, _) in enumerate(mm_kwargs) if modality == "prompt_embeds"]
+        if pe_indices:
+            for i in pe_indices:
+                pe_tensor = mm_kwargs[i][1]["embedding"].data
+                assert isinstance(pe_tensor, torch.Tensor)
+
+                self.encoder_cache[mm_hashes[i]] = pe_tensor.to(self.device)
+                self.maybe_save_ec_to_connector(self.encoder_cache, mm_hashes[i])
+            # Filter out `prompt_embeds` items from mm_kwargs/mm_hashes/mm_lora_refs
+            # since they don't require further encoder processing.
+            mm_hashes = [h for i, h in enumerate(mm_hashes) if i not in pe_indices]
+            mm_kwargs = [k for i, k in enumerate(mm_kwargs) if i not in pe_indices]
+            mm_lora_refs = [r for i, r in enumerate(mm_lora_refs) if i not in pe_indices]
+            if not mm_kwargs:
+                return []  # nothing left to encode after filtering out `prompt_embeds`
+
         should_time = bool(
             self.observability_config
             and self.observability_config.enable_mm_processor_stats
@@ -5260,6 +5290,24 @@ def _dummy_sampler_run(
         )
         try:
             sampler_output = self.sampler(logits=logits, sampling_metadata=dummy_metadata)
+            # Also warm forward_native (taken when generators dict is non-empty),
+            # but skip the extra call in 'processed_logits' / 'processed_logprobs'
+            # modes — there TopKTopPSampler binds forward = forward_native at
+            # init time, so the warmup call is redundant and only inflates peak
+            # memory during profile_run.
+            # No .clone() of logits: warmup output is discarded, so any in-place
+            # mutation by forward_native does not affect correctness.
+            if self.sampler.logprobs_mode not in (
+                "processed_logits",
+                "processed_logprobs",
+            ):
+                self.sampler(
+                    logits=logits,
+                    sampling_metadata=replace(
+                        dummy_metadata,
+                        generators={0: torch.Generator(device=self.device).manual_seed(0)},
+                    ),
+                )
         except RuntimeError as e:
             if "out of memory" in str(e):
                 raise RuntimeError(
@@ -5454,7 +5502,6 @@ def _init_minimal_kv_cache_for_profiling(self) -> None:
             self.aphrodite_config,
             kv_cache_groups,
             available_memory=0,
-            suppress_log=True,
         )
         self.cache_config.num_gpu_blocks_override = saved_override
 
@@ -6071,10 +6118,11 @@ def may_reinitialize_input_batch(self, kv_cache_config: KVCacheConfig, kernel_bl
                 block_sizes=block_sizes,
                 kernel_block_sizes=kernel_block_sizes,
                 max_num_blocks_per_req=max_num_blocks,
-                is_spec_decode=bool(self.aphrodite_config.speculative_config),
+                num_spec_tokens=self.num_spec_tokens,
                 logitsprocs=self.input_batch.logitsprocs,
                 logitsprocs_need_output_token_ids=self.input_batch.logitsprocs_need_output_token_ids,
                 is_pooling_model=self.is_pooling_model,
+                reasoning_config=self.aphrodite_config.reasoning_config,
             )
 
         assert self._init_block_sizes == block_sizes, (
diff --git a/aphrodite/v1/worker/gpu_ubatch_wrapper.py b/aphrodite/v1/worker/gpu_ubatch_wrapper.py
index 11c91b8137..c88c91d534 100644
--- a/aphrodite/v1/worker/gpu_ubatch_wrapper.py
+++ b/aphrodite/v1/worker/gpu_ubatch_wrapper.py
@@ -71,7 +71,9 @@ def __init__(
                 A function that sets the number of SMs for computation.
         """
 
-        assert current_platform.is_cuda(), "SM control is currently only supported on CUDA"
+        assert current_platform.is_cuda() or current_platform.is_rocm(), (
+            "SM/CU control is supported on CUDA and ROCm platforms"
+        )
         device = torch.accelerator.current_device_index()
         total_sms = num_compute_units(device)
 
diff --git a/aphrodite/v1/worker/gpu_worker.py b/aphrodite/v1/worker/gpu_worker.py
index a04b8be91a..aada237022 100644
--- a/aphrodite/v1/worker/gpu_worker.py
+++ b/aphrodite/v1/worker/gpu_worker.py
@@ -5,12 +5,13 @@
 import gc
 import os
 from collections.abc import Callable
-from contextlib import AbstractContextManager, nullcontext
+from contextlib import AbstractContextManager, contextmanager, nullcontext
 from datetime import timedelta
 from types import NoneType
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
+import regex as re
 import torch
 import torch.nn as nn
 
@@ -50,7 +51,7 @@
 from aphrodite.tracing import instrument
 from aphrodite.utils.mem_constants import GiB_bytes
 from aphrodite.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
-from aphrodite.utils.torch_utils import is_quantized_kv_cache, set_random_seed
+from aphrodite.utils.torch_utils import set_random_seed
 from aphrodite.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from aphrodite.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from aphrodite.v1.outputs import (
@@ -196,15 +197,8 @@ def wake_up(self, tags: list[str] | None = None) -> None:
                     buffer.data.copy_(self._sleep_saved_buffers[name].data)
             self._sleep_saved_buffers = {}
 
-        # If the KV cache has just been woken up,
-        # the internal state of cache_engine must be reset,
-        # especially the FP8 scaling factor.
-        if (
-            (tags is None or "kv_cache" in tags)
-            and is_quantized_kv_cache(self.cache_config.cache_dtype)
-            and hasattr(self.model_runner, "init_fp8_kv_scales")
-        ):
-            self.model_runner.init_fp8_kv_scales()
+        if tags is None or "kv_cache" in tags:
+            self.model_runner.post_kv_cache_wake_up()
 
     def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
         if not self.aphrodite_config.model_config.enable_sleep_mode:
@@ -217,6 +211,28 @@ def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
             assert allocator.get_current_usage() == 0, "Sleep mode can only be used for one instance per process."
         return allocator.use_memory_pool(tag=tag)
 
+    @contextmanager
+    def _scoped_allocator_max_split(self, max_split_size_mb: int):
+        """Temporarily set the allocator's max_split_size_mb to reduce fragmentation at the
+        cost of more cudaMalloc calls (negligible in practice); no-op on non-CUDA platforms.
+        On exit, restores the value from PYTORCH_CUDA_ALLOC_CONF, or no limit if it set none."""
+        if not current_platform.is_cuda():
+            yield
+            return
+
+        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        match = re.search(r"max_split_size_mb:(\d+)", conf)
+        original_value = match.group(1) if match else None
+
+        torch._C._accelerator_setAllocatorSettings(f"max_split_size_mb:{max_split_size_mb}")
+        try:
+            yield
+        finally:
+            # No value in the env var: fall back to PyTorch's default, SIZE_MAX (no limit).
+            _SIZE_MAX_MB = (2**64 - 1) // (1024 * 1024)
+            restore = original_value if original_value else str(_SIZE_MAX_MB)
+            torch._C._accelerator_setAllocatorSettings(f"max_split_size_mb:{restore}")
+
     @instrument(span_name="Init device")
     def init_device(self):
         if self.device_config.device_type == "cuda":
@@ -315,6 +331,8 @@ def load_model(self, *, load_dummy_weights: bool = False) -> None:
         with (
             self._maybe_get_memory_pool_context(tag="weights"),
             set_current_aphrodite_config(self.aphrodite_config),
+            # 20 MiB is the minimum PyTorch allows for max_split_size_mb.
+            self._scoped_allocator_max_split(max_split_size_mb=20),
         ):
             self.model_runner.load_model(load_dummy_weights=load_dummy_weights)
 
diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake
index 338a961029..6623a95a45 100644
--- a/cmake/external_projects/deepgemm.cmake
+++ b/cmake/external_projects/deepgemm.cmake
@@ -59,11 +59,26 @@ if(DEEPGEMM_ARCHS)
   # Build the _C pybind11 extension from DeepGEMM's C++ source.
   # This is a CXX-only module — CUDA kernels are JIT-compiled at runtime.
   #
-  Python_add_library(_deep_gemm_C MODULE WITH_SOABI
-    "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
+  # Free-threaded Python doesn't yet support the stable ABI, so skip USE_SABI
+  # there. (The other Aphrodite extensions get this guard for free via
+  # define_extension_target; this target uses raw Python_add_library.)
+  run_python(IS_FREETHREADED_PYTHON
+    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+    "Failed to determine whether interpreter is free-threaded")
+  if (NOT IS_FREETHREADED_PYTHON)
+    Python_add_library(_deep_gemm_C MODULE WITH_SOABI USE_SABI 3
+      "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
+  else()
+    Python_add_library(_deep_gemm_C MODULE WITH_SOABI
+      "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
+  endif()
 
   # The pybind11 module name must be _C to match DeepGEMM's Python imports.
-  set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
+  # Place the build artifact in a subdir so it doesn't collide with Aphrodite's own
+  # `_C.abi3.so` in the build tree (the install destination still differs).
+  set_target_properties(_deep_gemm_C PROPERTIES
+    OUTPUT_NAME "_C"
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm")
 
   target_compile_definitions(_deep_gemm_C PRIVATE
     "-DTORCH_EXTENSION_NAME=_C")
@@ -75,11 +90,15 @@ if(DEEPGEMM_ARCHS)
     "${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
     "${deepgemm_SOURCE_DIR}/third-party/fmt/include")
 
+  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
   target_compile_options(_deep_gemm_C PRIVATE
     $<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
     $<$<COMPILE_LANGUAGE:CXX>:-O3>
     $<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
-    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)
+    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>
+    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 
   # torch_python is required because DeepGEMM uses pybind11 type casters
   # for at::Tensor (via PYBIND11_MODULE), unlike Aphrodite's own extensions which
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index 59326df99a..9f3a40e859 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -1,5 +1,16 @@
 #include "cpu_attn_dispatch_generated.h"
 
+// Maps kv_cache_dtype string to Fp8KVCacheDataType enum.
+// "fp8"/"fp8_e4m3" -> kFp8E4M3; "fp8_e5m2" -> kFp8E5M2; otherwise kAuto(0).
+static inline cpu_attention::Fp8KVCacheDataType parse_fp8_kv_dtype(
+    const std::string& kv_cache_dtype) {
+  if (kv_cache_dtype == "fp8_e5m2")
+    return cpu_attention::Fp8KVCacheDataType::kFp8E5M2;
+  if (kv_cache_dtype == "fp8_e4m3" || kv_cache_dtype == "fp8")
+    return cpu_attention::Fp8KVCacheDataType::kFp8E4M3;
+  return cpu_attention::Fp8KVCacheDataType::kAuto;
+}
+
 torch::Tensor get_scheduler_metadata(
     const int64_t num_req, const int64_t num_heads_q,
     const int64_t num_heads_kv, const int64_t head_dim,
@@ -49,7 +60,7 @@ torch::Tensor get_scheduler_metadata(
   input.enable_kv_split = enable_kv_split;
 
   APHRODITE_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
-    CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
+    CPU_ATTN_DISPATCH(head_dim, isa, 0, [&]() {
       input.elem_size = sizeof(scalar_t);
       input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
       input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
@@ -72,7 +83,9 @@ void cpu_attn_reshape_and_cache(
         key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
     torch::Tensor&
         value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
-    const torch::Tensor& slot_mapping, const std::string& isa) {
+    const torch::Tensor& slot_mapping, const std::string& isa,
+    const double k_scale = 1.0, const double v_scale = 1.0,
+    const std::string& kv_cache_dtype = "auto") {
   TORCH_CHECK_EQ(key.dim(), 3);
   TORCH_CHECK_EQ(value.dim(), 3);
   TORCH_CHECK_EQ(key_cache.dim(), 4);
@@ -80,18 +93,30 @@ void cpu_attn_reshape_and_cache(
   TORCH_CHECK_EQ(key.stride(2), 1);
   TORCH_CHECK_EQ(value.stride(2), 1);
 
+  const int64_t kv_cache_idx =
+      static_cast<int64_t>(parse_fp8_kv_dtype(kv_cache_dtype));
+  const bool is_fp8 = (kv_cache_idx != 0);
+
+  if (is_fp8) {
+    TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+                "key_cache must be uint8 for FP8 path");
+    TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+                "value_cache must be uint8 for FP8 path");
+    TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+    TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+  }
+
+  const float k_inv = is_fp8 ? 1.0f / static_cast<float>(k_scale) : 0.0f;
+  const float v_inv = is_fp8 ? 1.0f / static_cast<float>(v_scale) : 0.0f;
+
   const int64_t token_num = key.size(0);
-  const int64_t key_token_num_stride = key.stride(0);
-  const int64_t value_token_num_stride = value.stride(0);
-  const int64_t head_num = value.size(1);
-  const int64_t key_head_num_stride = key.stride(1);
-  const int64_t value_head_num_stride = value.stride(1);
+  const int64_t head_num = key.size(1);
+  const int64_t head_dim = key.size(2);
   const int64_t num_blocks = key_cache.size(0);
   const int64_t num_blocks_stride = key_cache.stride(0);
   const int64_t cache_head_num_stride = key_cache.stride(1);
   const int64_t block_size = key_cache.size(2);
   const int64_t block_size_stride = key_cache.stride(2);
-  const int64_t head_dim = key.size(-1);
 
   cpu_attention::ISA isa_tag = [&]() {
     if (isa == "amx") {
@@ -109,16 +134,24 @@ void cpu_attn_reshape_and_cache(
     }
   }();
 
+  if (is_fp8) {
+    TORCH_CHECK(isa_tag == cpu_attention::ISA::AMX ||
+                    isa_tag == cpu_attention::ISA::VEC,
+                "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+  }
+
   APHRODITE_DISPATCH_FLOATING_TYPES(
       key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
-        CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
+        CPU_ATTN_DISPATCH(head_dim, isa_tag, kv_cache_idx, [&]() {
+          using kv_t = typename attn_impl::kv_cache_t;
           attn_impl::reshape_and_cache(
               key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-              key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
-              slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
-              value_token_num_stride, head_num, key_head_num_stride,
-              value_head_num_stride, num_blocks, num_blocks_stride,
-              cache_head_num_stride, block_size, block_size_stride);
+              reinterpret_cast<kv_t*>(key_cache.data_ptr()),
+              reinterpret_cast<kv_t*>(value_cache.data_ptr()),
+              slot_mapping.data_ptr<int64_t>(), token_num, key.stride(0),
+              value.stride(0), head_num, key.stride(1), value.stride(1),
+              num_blocks, num_blocks_stride, cache_head_num_stride, block_size,
+              block_size_stride, k_inv, v_inv);
         });
       });
 }
@@ -137,13 +170,26 @@ void cpu_attention_with_kv_cache(
     const int64_t sliding_window_left, const int64_t sliding_window_right,
     const torch::Tensor& block_table,  // [num_tokens, max_block_num]
     const double softcap, const torch::Tensor& scheduler_metadata,
-    const std::optional<torch::Tensor>& s_aux  // [num_heads]
-) {
+    const std::optional<torch::Tensor>& s_aux,  // [num_heads]
+    const double k_scale = 1.0, const double v_scale = 1.0,
+    const std::string& kv_cache_dtype = "auto") {
   TORCH_CHECK_EQ(query.dim(), 3);
   TORCH_CHECK_EQ(query.stride(2), 1);
   TORCH_CHECK_EQ(key_cache.dim(), 4);
   TORCH_CHECK_EQ(value_cache.dim(), 4);
 
+  const int64_t kv_cache_idx =
+      static_cast<int64_t>(parse_fp8_kv_dtype(kv_cache_dtype));
+  const bool is_fp8 = (kv_cache_idx != 0);
+  if (is_fp8) {
+    TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte,
+                "key_cache must be uint8 for FP8 path");
+    TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte,
+                "value_cache must be uint8 for FP8 path");
+    TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path");
+    TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path");
+  }
+
   cpu_attention::AttentionInput input;
   input.metadata = reinterpret_cast<cpu_attention::AttentionMetadata*>(
       scheduler_metadata.data_ptr());
@@ -165,25 +211,32 @@ void cpu_attention_with_kv_cache(
   input.block_table = block_table.data_ptr<int32_t>();
   input.alibi_slopes =
       alibi_slopes.has_value() ? alibi_slopes->data_ptr<float>() : nullptr;
-  // For now sink must be bf16
   input.s_aux = s_aux.has_value() ? s_aux->data_ptr<c10::BFloat16>() : nullptr;
   input.scale = scale;
   input.causal = causal;
   input.sliding_window_left = sliding_window_left;
   input.sliding_window_right = sliding_window_right;
   if (input.causal) {
-    // to make boundary calculation easier
     input.sliding_window_right = 0;
   }
-  float softcap_fp32 = softcap;
-  input.softcap = softcap_fp32;
+  input.softcap = static_cast<float>(softcap);
+
+  if (is_fp8) {
+    input.k_scale_fp8 = static_cast<float>(k_scale);
+    input.v_scale_fp8 = static_cast<float>(v_scale);
+    TORCH_CHECK(input.metadata->isa == cpu_attention::ISA::AMX ||
+                    input.metadata->isa == cpu_attention::ISA::VEC,
+                "FP8 KV cache is only supported on x86 (AMX/VEC) ISA");
+  }
 
   APHRODITE_DISPATCH_FLOATING_TYPES(
       query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
-        CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
-          TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
-          cpu_attention::AttentionMainLoop<attn_impl> mainloop;
-          mainloop(&input);
-        });
+        CPU_ATTN_DISPATCH(
+            query.size(2), input.metadata->isa, kv_cache_idx, [&]() {
+              TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment,
+                             0);
+              cpu_attention::AttentionMainLoop<attn_impl> mainloop;
+              mainloop(&input);
+            });
       });
 }
diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp
index 1c8644d523..6a0341085d 100644
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -1,6 +1,7 @@
 #ifndef CPU_ATTN_AMX_HPP
 #define CPU_ATTN_AMX_HPP
 
+#include "cpu_attn_fp8.hpp"
 #include "cpu_attn_impl.hpp"
 
 namespace cpu_attention {
@@ -21,9 +22,10 @@ typedef struct __tile_config {
 // 2-2-4 pattern, for 16 < m <= 32
 // TILE 0, 1: load A matrix, row num should be 16, m - 16
 // TILE 2, 3: load B matrix, row num should be 16
-// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
-// - 16
-template <typename kv_cache_t>
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16,
+// m - 16, m - 16
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template <typename q_buffer_t, typename kv_cache_t>
 class TileGemm224 {
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
@@ -42,13 +44,56 @@ class TileGemm224 {
   }
 };
 
-template <>
-class TileGemm224<c10::BFloat16> {
+// Expand one FP8 tile (AMX_TILE_ROW_NUM rows x 32 cols) into BF16, row by row.
+// The E4M3 / E5M2 conversion tag is picked at compile time from kv_cache_t.
+template <typename kv_cache_t>
+FORCE_INLINE void deq_tile_amx(const uint8_t* src, c10::BFloat16* dst) {
+  constexpr int kCols = 32;
+  using tag_t =
+      std::conditional_t<std::is_same_v<kv_cache_t, c10::Float8_e4m3fn>,
+                         vec_op::fp8_bf16_e4m3_tag,
+                         vec_op::fp8_bf16_e5m2_tag>;
+  for (int r = 0; r < AMX_TILE_ROW_NUM; ++r) {
+    vec_op::BF16Vec32(src + r * kCols, tag_t{}).save(dst + r * kCols);
+  }
+}
+
+// FP8: dequantize src into scratch, return scratch. Otherwise: return src.
+template <typename kv_cache_t>
+FORCE_INLINE const c10::BFloat16* prepare_b_tile(const kv_cache_t* src,
+                                                 c10::BFloat16* scratch) {
+  constexpr bool is_fp8 = std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                          std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+  if constexpr (!is_fp8) {
+    return reinterpret_cast<const c10::BFloat16*>(src);
+  } else {
+    deq_tile_amx<kv_cache_t>(reinterpret_cast<const uint8_t*>(src), scratch);
+    return scratch;
+  }
+}
+
+// Handles both BF16 and FP8 KV cache (2-2-4 pattern).
+template <typename kv_cache_t>
+class TileGemm224<c10::BFloat16, kv_cache_t> {
+  static_assert(std::is_same_v<kv_cache_t, c10::BFloat16> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e5m2>,
+                "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+
+  static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+  // BF16 path: scratch_elems=1 so the scratch array is eliminated by the
+  // compiler.
+  static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
   FORCE_INLINE static void gemm(const int32_t m_size,
                                 c10::BFloat16* __restrict__ a_tile,
-                                c10::BFloat16* __restrict__ b_tile,
+                                kv_cache_t* __restrict__ b_tile,
                                 float* __restrict__ c_tile, const int64_t lda,
                                 const int64_t ldb, const int64_t ldc,
                                 const int32_t block_size,
@@ -56,6 +101,7 @@ class TileGemm224<c10::BFloat16> {
                                 const bool accum_c) {
     const int32_t k_times =
         dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+
     c10::BFloat16* __restrict__ a_tile_0 = a_tile;
     c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
     const int64_t a_tile_stride = [&]() {
@@ -70,8 +116,8 @@ class TileGemm224<c10::BFloat16> {
       }
     }();
 
-    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
-    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+    kv_cache_t* __restrict__ b_tile_2 = b_tile;
+    kv_cache_t* __restrict__ b_tile_3 = [&]() {
       if constexpr (phase == AttentionGemmPhase::QK) {
         // k_cache is prepacked
         return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
@@ -106,11 +152,16 @@ class TileGemm224<c10::BFloat16> {
       _tile_zero(7);
     }
 
+    alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_3[scratch_elems];
     for (int32_t k = 0; k < k_times; ++k) {
+      const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+      const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
       _tile_loadd(0, a_tile_0, a_tile_stride);
-      _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+      _tile_stream_loadd(2, const_cast<c10::BFloat16*>(load_2), b_tile_stride);
       _tile_dpbf16ps(4, 0, 2);
-      _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+      _tile_stream_loadd(3, const_cast<c10::BFloat16*>(load_3), b_tile_stride);
       _tile_dpbf16ps(5, 0, 3);
       _tile_loadd(1, a_tile_1, a_tile_stride);
       _tile_dpbf16ps(6, 1, 2);
@@ -154,13 +205,13 @@ class TileGemm224<c10::BFloat16> {
 };
 
 // 1-2-2 pattern, for 0 < m <= 16
-// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
-// m, m
-// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
-// num should be 16
-// TILE 6, 7, (6, 7): store results C matrix, row num should be
-// m
-template <typename kv_cache_t>
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should
+// be m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row num
+// should be 16
+// TILE 6, 7: store results C matrix, row num should be m
+// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type.
+template <typename q_buffer_t, typename kv_cache_t>
 class TileGemm122 {
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
@@ -179,13 +230,26 @@ class TileGemm122 {
   }
 };
 
-template <>
-class TileGemm122<c10::BFloat16> {
+// Handles both BF16 and FP8 KV cache (1-2-2 pattern).
+template <typename kv_cache_t>
+class TileGemm122<c10::BFloat16, kv_cache_t> {
+  static_assert(std::is_same_v<kv_cache_t, c10::BFloat16> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                    std::is_same_v<kv_cache_t, c10::Float8_e5m2>,
+                "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2");
+
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+
+  static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16);
+  static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1;
+
  public:
   template <AttentionGemmPhase phase, int32_t k_size>
   FORCE_INLINE static void gemm(const int32_t m_size,
                                 c10::BFloat16* __restrict__ a_tile,
-                                c10::BFloat16* __restrict__ b_tile,
+                                kv_cache_t* __restrict__ b_tile,
                                 float* __restrict__ c_tile, const int64_t lda,
                                 const int64_t ldb, const int64_t ldc,
                                 const int32_t block_size,
@@ -215,21 +279,19 @@ class TileGemm122<c10::BFloat16> {
       }
     }();
 
-    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
-    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+    kv_cache_t* __restrict__ b_tile_2 = b_tile;
+    kv_cache_t* __restrict__ b_tile_3 = [&]() {
       if constexpr (phase == AttentionGemmPhase::QK) {
-        // k_cache is prepacked
         return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
       } else if constexpr (phase == AttentionGemmPhase::PV) {
-        // v_cache is prepacked
         return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
       } else {
         TORCH_CHECK(false, "Unreachable");
       }
     }();
-    c10::BFloat16* __restrict__ b_tile_4 =
+    kv_cache_t* __restrict__ b_tile_4 =
         b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
-    c10::BFloat16* __restrict__ b_tile_5 =
+    kv_cache_t* __restrict__ b_tile_5 =
         b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
     int64_t b_stride = AMX_TILE_ROW_BYTES;
 
@@ -250,16 +312,25 @@ class TileGemm122<c10::BFloat16> {
       _tile_zero(7);
     }
 
+    alignas(64) c10::BFloat16 scratch_2[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_3[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_4[scratch_elems];
+    alignas(64) c10::BFloat16 scratch_5[scratch_elems];
     for (int32_t k = 0; k < k_group_times; ++k) {
+      const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+      const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+      const c10::BFloat16* load_4 = prepare_b_tile(b_tile_4, scratch_4);
+      const c10::BFloat16* load_5 = prepare_b_tile(b_tile_5, scratch_5);
+
       _tile_loadd(0, a_tile_0, a_tile_stride);
-      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_stream_loadd(2, const_cast<c10::BFloat16*>(load_2), b_stride);
       _tile_dpbf16ps(6, 0, 2);
-      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_stream_loadd(3, const_cast<c10::BFloat16*>(load_3), b_stride);
       _tile_dpbf16ps(7, 0, 3);
       _tile_loadd(1, a_tile_1, a_tile_stride);
-      _tile_stream_loadd(4, b_tile_4, b_stride);
+      _tile_stream_loadd(4, const_cast<c10::BFloat16*>(load_4), b_stride);
       _tile_dpbf16ps(6, 1, 4);
-      _tile_stream_loadd(5, b_tile_5, b_stride);
+      _tile_stream_loadd(5, const_cast<c10::BFloat16*>(load_5), b_stride);
       _tile_dpbf16ps(7, 1, 5);
 
       // update ptrs
@@ -279,10 +350,13 @@ class TileGemm122<c10::BFloat16> {
     }
 
     if (has_tail) {
+      const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2);
+      const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3);
+
       _tile_loadd(0, a_tile_0, a_tile_stride);
-      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_stream_loadd(2, const_cast<c10::BFloat16*>(load_2), b_stride);
       _tile_dpbf16ps(6, 0, 2);
-      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_stream_loadd(3, const_cast<c10::BFloat16*>(load_3), b_stride);
       _tile_dpbf16ps(7, 0, 3);
     }
 
@@ -302,21 +376,25 @@ class TileGemm122<c10::BFloat16> {
     _tile_loadconfig(&config);
   }
 };
+
 }  // namespace
 
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::AMX, scalar_t, head_dim, kv_cache_scalar_t> {
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e5m2>;
+
  public:
   using query_t = scalar_t;
   using q_buffer_t = scalar_t;
-  using kv_cache_t = scalar_t;
+  using kv_cache_t = kv_cache_scalar_t;
   using logits_buffer_t = float;
   using partial_output_buffer_t = float;
   using prob_buffer_t = scalar_t;
 
   constexpr static int64_t BlockSizeAlignment =
-      AMX_TILE_ROW_BYTES /
-      sizeof(kv_cache_t);  // KV token num unit of QK and PV phases
+      32;  // AMX_TILE_ROW_NUM = 16 tokens/tile; 32 = 2 tiles
   constexpr static int64_t HeadDimAlignment =
       2 * (AMX_TILE_ROW_BYTES / 4);  // headdim num unit of PV phase
   constexpr static int64_t MaxQHeadNumPerIteration = 32;
@@ -324,6 +402,9 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
   constexpr static ISA ISAType = ISA::AMX;
   constexpr static bool scale_on_logits = true;
 
+  float k_scale = 1.0f;
+  float v_scale = 1.0f;
+
  public:
   AttentionImpl() : current_q_head_num_(0) {
     // Use all columns in AMX tiles
@@ -332,21 +413,50 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
 
   ~AttentionImpl() { _tile_release(); }
 
+  void init_from_input(const AttentionInput* input) {
+    if constexpr (fp8_kv) {
+      k_scale = input->k_scale_fp8;
+      v_scale = input->v_scale_fp8;
+    }
+  }
+
+  float get_output_v_scale() const noexcept {
+    if constexpr (!fp8_kv) {
+      return 1.0f;
+    } else {
+      // The AMX dequant drops the raw FP8 payload into a BF16 field (exponent
+      // bias 127); multiplying by 2^(127 - FP8_bias) restores the magnitude:
+      // 2^120 for E4M3 (bias 7), 2^112 for E5M2 (bias 15).
+      constexpr bool is_e5m2 = std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+      return v_scale * (is_e5m2 ? 0x1p112f : 0x1p120f);
+    }
+  }
+
   template <template <typename tile_gemm_t> typename attention>
   FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    if constexpr (fp8_kv) {
+      // Same bias correction as get_output_v_scale: AMX FP8→BF16 dequant
+      // shifts the exponent bias from FP8 to BF16 (127), so we multiply by
+      // 2^(127-FP8_bias) to recover the true value. E4M3: 2^120, E5M2: 2^112.
+      const float bias =
+          std::is_same_v<kv_cache_t, c10::Float8_e5m2> ? 0x1p112f : 0x1p120f;
+      scale *= k_scale * bias;
+    }
     if (q_head_num > AMX_TILE_ROW_NUM) {
       if (q_head_num != current_q_head_num_) {
         current_q_head_num_ = q_head_num;
-        TileGemm224<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+        TileGemm224<q_buffer_t, kv_cache_t>::init_tile_config(q_head_num,
+                                                              amx_tile_config_);
       }
-      attention<TileGemm224<kv_cache_t>> attention_iteration;
+      attention<TileGemm224<q_buffer_t, kv_cache_t>> attention_iteration;
       attention_iteration(CPU_ATTENTION_PARAMS);
     } else {
       if (q_head_num != current_q_head_num_) {
         current_q_head_num_ = q_head_num;
-        TileGemm122<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+        TileGemm122<q_buffer_t, kv_cache_t>::init_tile_config(q_head_num,
+                                                              amx_tile_config_);
       }
-      attention<TileGemm122<kv_cache_t>> attention_iteration;
+      attention<TileGemm122<q_buffer_t, kv_cache_t>> attention_iteration;
       attention_iteration(CPU_ATTENTION_PARAMS);
     }
   }
@@ -411,13 +521,26 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
   // reshape KV to AMX friendly layout
   static void reshape_and_cache(
       const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
-      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      kv_cache_t* __restrict__ key_cache, kv_cache_t* __restrict__ value_cache,
       const int64_t* __restrict__ slot_mapping, const int64_t token_num,
       const int64_t key_token_num_stride, const int64_t value_token_num_stride,
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float k_inv = 0.0f, const float v_inv = 0.0f) {
+    if constexpr (fp8_kv) {
+      constexpr auto qfn = select_fp8_quant_fn<kv_cache_t>();
+      reshape_and_cache_fp8_amx_impl<scalar_t, qfn>(
+          key, value, reinterpret_cast<uint8_t*>(key_cache),
+          reinterpret_cast<uint8_t*>(value_cache), slot_mapping, token_num,
+          head_num, head_dim, block_size, key_token_num_stride,
+          key_head_num_stride, value_token_num_stride, value_head_num_stride,
+          num_blocks_stride, cache_head_num_stride, num_blocks_stride,
+          cache_head_num_stride, k_inv, v_inv);
+      return;
+    }
+
     // For AMX 2D tiles, size of each line is 64 bytes
     constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
     // For AMX B matrix, N always is 16
@@ -426,6 +549,9 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
     // For now suppose block_size is divisible by amx_tile_column_num
     TORCH_CHECK_EQ(block_size % amx_b_tile_k_size, 0);
 
+    scalar_t* __restrict__ kc = reinterpret_cast<scalar_t*>(key_cache);
+    scalar_t* __restrict__ vc = reinterpret_cast<scalar_t*>(value_cache);
+
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -453,8 +579,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
           constexpr int64_t quadword_num_per_group =
               token_num_per_group * quadword_num;
           int32_t* key_cache_start_ptr =
-              reinterpret_cast<int32_t*>(key_cache +
-                                         block_idx * num_blocks_stride +
+              reinterpret_cast<int32_t*>(kc + block_idx * num_blocks_stride +
                                          head_idx * cache_head_num_stride) +
               group_idx * quadword_num_per_group + group_offset;
 
@@ -483,7 +608,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
                                             token_idx * value_token_num_stride +
                                             head_idx * value_head_num_stride;
           scalar_t* value_cache_start_ptr =
-              value_cache + block_idx * num_blocks_stride +
+              vc + block_idx * num_blocks_stride +
               head_idx * cache_head_num_stride +
               sub_group_idx * token_num_per_sub_group * amx_b_tile_n_size +
               sub_group_offset;
diff --git a/csrc/cpu/cpu_attn_fp8.hpp b/csrc/cpu/cpu_attn_fp8.hpp
new file mode 100644
index 0000000000..1479645a2d
--- /dev/null
+++ b/csrc/cpu/cpu_attn_fp8.hpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "cpu/utils.hpp"
+
+typedef uint32_t __attribute__((__may_alias__)) u32_alias_t;
+typedef uint16_t __attribute__((__may_alias__)) u16_alias_t;
+typedef float __attribute__((__may_alias__)) f32_alias_t;
+
+// Reference scalar dequant — used to verify vectorized AMX dequant.
+inline float fp8e4m3_to_float_scalar(uint8_t b, float scale) noexcept {
+  // NaN encoding in E4M3
+  if ((b & 0x7F) == 0x7F) return std::numeric_limits<float>::quiet_NaN();
+  // Put the sign at bit 31 and the 7-bit payload at fp32 bits [26:20].
+  const uint32_t b_u32 = static_cast<uint32_t>(b);
+  const uint32_t bits = ((b_u32 & 0x80) << 24) | ((b_u32 & 0x7F) << 20);
+  const float b_f32_unscaled = *reinterpret_cast<const f32_alias_t*>(&bits);
+  // Rebias by 2^120 first (exact): multiplying the tiny raw value by scale
+  // first could round or underflow in the fp32 subnormal range.
+  return b_f32_unscaled * 0x1p120f * scale;
+}
+
+inline uint8_t float_to_fp8e4m3_scalar(float v, float inv_scale) noexcept {
+  v *= inv_scale;
+  constexpr float fp8_max = 448.0f;
+  v = std::max(-fp8_max, std::min(fp8_max, v));  // saturate to max finite
+  if (v == 0.0f) return 0;
+  // Inverse of fp8e4m3_to_float_scalar: scaling by 2^-120 moves the exponent
+  // bias from fp32 (127) to E4M3 (7), so fp32 bits [26:20] are the payload.
+  float v_f32_unscaled = v * 0x1p-120f;
+  uint32_t bits = *reinterpret_cast<const u32_alias_t*>(&v_f32_unscaled);
+  uint8_t sign = static_cast<uint8_t>((bits >> 24) & 0x80);
+  // Round the 20 dropped bits to nearest, ties to even (carry may bump exp).
+  uint32_t mag = (bits & 0x7FFFFFFFu) + 0x7FFFFu + ((bits >> 20) & 1u);
+  uint8_t payload = static_cast<uint8_t>((mag >> 20) & 0x7F);
+  payload = std::min<uint8_t>(payload, 0x7E);  // keep 0x7F as NaN encoding
+  return static_cast<uint8_t>(sign | payload);
+}
+
+// ---------------------------------------------------------------------------
+// AMX reshape impl: quantises key/value with quant_fn(x, k_inv or v_inv) and
+// scatters the bytes into the uint8 FP8 KV cache; tokens with slot < 0 are
+// skipped. K: halfword-packed (2 FP8 per uint16, token_num_per_group=16).
+// V: sub-group packing (token_num_per_sub_group=2, head_elems_per_group=16).
+// block_size must be divisible by 32 (not checked in this function).
+// ---------------------------------------------------------------------------
+template <typename scalar_t, uint8_t (*quant_fn)(float, float)>
+inline void reshape_and_cache_fp8_amx_impl(
+    const scalar_t* key_ptr, const scalar_t* value_ptr, uint8_t* key_cache_ptr,
+    uint8_t* value_cache_ptr, const int64_t* slot_ptr, int64_t token_num,
+    int64_t head_num, int64_t head_dim, int64_t block_size, int64_t k_stride0,
+    int64_t k_stride1, int64_t v_stride0, int64_t v_stride1, int64_t kc_stride0,
+    int64_t kc_stride1, int64_t vc_stride0, int64_t vc_stride1, float k_inv,
+    float v_inv) {
+  constexpr int64_t token_num_per_group = 16;  // AMX_TILE_ROW_NUM
+  const int64_t halfword_num = head_dim / 2;   // 2 FP8 per uint16
+  const int64_t halfword_num_per_group = token_num_per_group * halfword_num;
+  constexpr int64_t head_elems_per_group = 16;
+  constexpr int64_t token_num_per_sub_group = 2;  // = 4 / sizeof(BF16)
+  const int64_t group_num = head_dim / head_elems_per_group;
+  const int64_t group_size = block_size * head_elems_per_group;  // V bytes
+
+#pragma omp parallel for collapse(2) schedule(static)
+  for (int64_t tok = 0; tok < token_num; ++tok) {
+    for (int64_t h = 0; h < head_num; ++h) {
+      const int64_t slot = slot_ptr[tok];
+      if (slot < 0) continue;
+      const int64_t block_idx = slot / block_size;
+      const int64_t block_offset = slot % block_size;
+
+      // Key: dim pair j is one uint16 at row j, column group_offset
+      {
+        const scalar_t* ksrc = key_ptr + tok * k_stride0 + h * k_stride1;
+        const int64_t group_idx = block_offset / token_num_per_group;
+        const int64_t group_offset = block_offset % token_num_per_group;
+        uint16_t* kdst =
+            reinterpret_cast<uint16_t*>(key_cache_ptr + block_idx * kc_stride0 +
+                                        h * kc_stride1) +
+            group_idx * halfword_num_per_group + group_offset;
+        for (int64_t j = 0; j < halfword_num; ++j) {
+          uint8_t fp8_0 = quant_fn(static_cast<float>(ksrc[j * 2]), k_inv);
+          uint8_t fp8_1 = quant_fn(static_cast<float>(ksrc[j * 2 + 1]), k_inv);
+          uint8_t bytes[2] = {fp8_0, fp8_1};  // memory order: even dim first
+          uint16_t hw = *reinterpret_cast<const u16_alias_t*>(bytes);
+          kdst[j * token_num_per_group] = hw;
+        }
+      }
+
+      // Value: per 16-dim group, byte index is pair * 32 + j * 2 + parity
+      {
+        const scalar_t* vsrc = value_ptr + tok * v_stride0 + h * v_stride1;
+        const int64_t sub_group_idx = block_offset / token_num_per_sub_group;
+        const int64_t sub_group_offset = block_offset % token_num_per_sub_group;
+        uint8_t* vdst =
+            value_cache_ptr + block_idx * vc_stride0 + h * vc_stride1 +
+            sub_group_idx * token_num_per_sub_group * head_elems_per_group +
+            sub_group_offset;
+        for (int64_t i = 0; i < group_num; ++i) {
+          for (int64_t j = 0; j < head_elems_per_group; ++j)
+            vdst[j * token_num_per_sub_group] =
+                quant_fn(static_cast<float>(vsrc[j]), v_inv);
+          vsrc += head_elems_per_group;  // next 16 head dims of this token
+          vdst += group_size;            // same slot in the next dim group
+        }
+      }
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// FP8 E5M2 scalar helpers
+// ---------------------------------------------------------------------------
+
+// Scalar reference dequant for FP8 E5M2, used to check the vectorized AMX
+// dequant. Bit layout: s[7] e[6:2] m[1:0], exponent bias 15 (same as FP16),
+// i.e. byte b equals the upper half of the FP16 pattern b << 8.
+inline float fp8e5m2_to_float_scalar(uint8_t b, float scale) noexcept {
+  const bool negative = (b & 0x80) != 0;
+  const uint32_t exponent = (b >> 2) & 0x1Fu;
+  const uint32_t mantissa = b & 0x03u;
+
+  if (exponent == 0x1Fu) {  // all-ones exponent: NaN if m != 0, else Inf
+    if (mantissa != 0) return std::numeric_limits<float>::quiet_NaN();
+    constexpr float inf = std::numeric_limits<float>::infinity();
+    return negative ? -inf : inf;  // scale is not applied to Inf/NaN
+  }
+  if (exponent == 0) {  // zero or subnormal: (-1)^s * m * 2^-16
+    if (mantissa == 0) return 0.0f;  // both +0 and -0 give +0
+    const float magnitude = mantissa * 0x1p-16f;
+    return (negative ? -magnitude : magnitude) * scale;
+  }
+
+  // Normal: rebias the exponent 15 -> 127; mantissa goes to fp32 bits 22:21.
+  const uint32_t sign_bit = negative ? 0x80000000u : 0u;
+  const uint32_t fp32_bits =
+      sign_bit | ((exponent - 15 + 127) << 23) | (mantissa << 21);
+  return *reinterpret_cast<const f32_alias_t*>(&fp32_bits) * scale;
+}
+
+inline uint8_t float_to_fp8e5m2_scalar(float v, float inv_scale) noexcept {
+  v *= inv_scale;
+  constexpr float fp8_e5m2_max = 57344.0f;
+  v = std::max(-fp8_e5m2_max, std::min(fp8_e5m2_max, v));  // saturate
+  if (v == 0.0f) return 0;
+  const uint8_t sign = v < 0.0f ? 0x80 : 0x00;
+  const uint32_t mag = *reinterpret_cast<const u32_alias_t*>(&v) & 0x7FFFFFFFu;
+  const int32_t exp_fp32 = static_cast<int32_t>(mag >> 23) - 127;
+  // Round to nearest, ties to even: add (half - 1 + kept LSB), then shift.
+  if (exp_fp32 < -14) {  // subnormal in E5M2: result is m * 2^-16
+    const int sh = 21 + (-14 - exp_fp32);  // bits dropped from significand
+    if (sh > 24) return sign;              // rounds to zero
+    const uint32_t sig = 0x800000u | (mag & 0x7FFFFFu);
+    const uint32_t m = (sig + (1u << (sh - 1)) - 1u + ((sig >> sh) & 1u)) >> sh;
+    return sign | static_cast<uint8_t>(m);  // m == 4 is the smallest normal
+  }
+  const uint32_t q = (mag + 0xFFFFFu + ((mag >> 21) & 1u)) >> 21;
+  return sign | static_cast<uint8_t>(q - (112u << 2));  // rebias 127 -> 15
+}
+
+// ---------------------------------------------------------------------------
+// Compile-time choice of the scalar FP8 quantiser for a given kv_cache_t:
+// c10::Float8_e5m2 -> float_to_fp8e5m2_scalar, anything else -> the E4M3 one.
+// The result is a plain function pointer usable as a template argument.
+// ---------------------------------------------------------------------------
+template <typename kv_cache_t>
+constexpr auto select_fp8_quant_fn() {
+  constexpr bool is_e5m2 = std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+  return is_e5m2 ? float_to_fp8e5m2_scalar : float_to_fp8e4m3_scalar;
+}
+
+// ---------------------------------------------------------------------------
+// VEC reshape impl — parameterised on the quantisation function.
+// Stores quantised keys column-major and values row-major in the uint8 FP8
+// KV cache. Kept as a standalone function so that the omp pragma stays out
+// of APHRODITE_DISPATCH_FLOATING_TYPES (no #pragma in variadic macro args).
+// ---------------------------------------------------------------------------
+template <typename scalar_t, uint8_t (*quant_fn)(float, float)>
+inline void reshape_and_cache_fp8_vec_impl(
+    const scalar_t* key_ptr, const scalar_t* value_ptr, uint8_t* key_cache_ptr,
+    uint8_t* value_cache_ptr, const int64_t* slot_ptr, int64_t token_num,
+    int64_t head_num, int64_t head_dim, int64_t block_size, int64_t k_stride0,
+    int64_t k_stride1, int64_t v_stride0, int64_t v_stride1, int64_t kc_stride0,
+    int64_t kc_stride1, int64_t vc_stride0, int64_t vc_stride1, float k_inv,
+    float v_inv) {
+#pragma omp parallel for collapse(2) schedule(static)
+  for (int64_t tok = 0; tok < token_num; ++tok) {
+    for (int64_t h = 0; h < head_num; ++h) {
+      const int64_t slot = slot_ptr[tok];
+      if (slot >= 0) {
+        const int64_t blk = slot / block_size;
+        const int64_t pos = slot % block_size;
+        // Key: element d of this token goes to byte d * block_size + pos.
+        const scalar_t* k_in = key_ptr + tok * k_stride0 + h * k_stride1;
+        uint8_t* k_out =
+            key_cache_ptr + blk * kc_stride0 + h * kc_stride1 + pos;
+        for (int64_t d = 0; d < head_dim; ++d, k_out += block_size)
+          *k_out = quant_fn(static_cast<float>(k_in[d]), k_inv);
+
+        // Value: the token's head_dim bytes are contiguous at pos * head_dim.
+        const scalar_t* v_in = value_ptr + tok * v_stride0 + h * v_stride1;
+        uint8_t* v_out = value_cache_ptr + blk * vc_stride0 +
+                         h * vc_stride1 + pos * head_dim;
+        for (int64_t d = 0; d < head_dim; ++d)
+          v_out[d] = quant_fn(static_cast<float>(v_in[d]), v_inv);
+      }
+    }
+  }
+}
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index c1974bfd0a..f5b473bd26 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -14,8 +14,22 @@
 namespace cpu_attention {
 enum class ISA { AMX, VEC, VEC16, NEON, VXE };
 
-template <ISA isa, typename scalar_t, int64_t head_dim>
-class AttentionImpl {};
+// Mirrors csrc/attention/dtype_fp8.cuh Fp8KVCacheDataType exactly.
+enum class Fp8KVCacheDataType {
+  kAuto = 0,
+  kFp8E4M3 = 1,
+  kFp8E5M2 = 2,
+};
+
+struct AttentionInput;
+
+template <ISA isa, typename scalar_t, int64_t head_dim,
+          typename kv_cache_scalar_t = scalar_t>
+class AttentionImpl {
+ public:
+  void init_from_input(const AttentionInput*) {}
+  float get_output_v_scale() const noexcept { return 1.0f; }
+};
 
 struct AttentionWorkItemGroup {
   int32_t req_id;
@@ -780,6 +794,9 @@ struct AttentionInput {
   int32_t sliding_window_left;
   int32_t sliding_window_right;
   float softcap;
+  // FP8 KV cache scales (used by FP8 attention implementations)
+  float k_scale_fp8 = 1.0f;
+  float v_scale_fp8 = 1.0f;
 };
 
 #define DEFINE_CPU_ATTENTION_PARAMS                                         \
@@ -1374,6 +1391,13 @@ class AttentionMainLoop {
       }
 
       attention_impl_t attn_impl;
+      constexpr bool fp8_kv = std::is_same_v<kv_cache_t, c10::Float8_e4m3fn> ||
+                              std::is_same_v<kv_cache_t, c10::Float8_e5m2>;
+      float output_v_scale = 1.0f;
+      if constexpr (fp8_kv) {
+        attn_impl.init_from_input(input);
+        output_v_scale = attn_impl.get_output_v_scale();
+      }
 
       // general information
       const int32_t q_head_num = input->num_heads;
@@ -1753,7 +1777,7 @@ class AttentionMainLoop {
                                reinterpret_cast<query_t*>(input->output) +
                                    output_buffer_offset,
                                sum_buffer, actual_q_heads_per_kv,
-                               actual_q_token_num, q_head_num);
+                               actual_q_token_num, q_head_num, output_v_scale);
                 } else {
                   const int32_t stride =
                       actual_q_heads_per_kv * split_kv_q_token_num_threshold;
@@ -1823,7 +1847,7 @@ class AttentionMainLoop {
               split_output_buffer,
               reinterpret_cast<query_t*>(input->output) + output_buffer_offset,
               split_sum_buffer, actual_q_heads_per_kv, curr_output_token_num,
-              q_head_num);
+              q_head_num, output_v_scale);
         }
       }
     }
@@ -1947,8 +1971,8 @@ class AttentionMainLoop {
                     query_t* __restrict__ curr_output_buffer,
                     float* __restrict__ sum_buffer,
                     const int32_t q_heads_per_kv,
-                    const int32_t actual_q_token_num,
-                    const int32_t q_head_num) {
+                    const int32_t actual_q_token_num, const int32_t q_head_num,
+                    const float v_scale = 1.0f) {
     // final output
     using output_vec_t = typename VecTypeTrait<query_t>::vec_t;
 
@@ -1962,7 +1986,7 @@ class AttentionMainLoop {
           curr_partial_output_buffer;
       query_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
       for (int32_t head_idx = 0; head_idx < q_heads_per_kv; ++head_idx) {
-        vec_op::FP32Vec16 inv_sum_scale_vec(1.0 / *curr_sum_buffer);
+        vec_op::FP32Vec16 inv_sum_scale_vec(v_scale / *curr_sum_buffer);
 
         for (int32_t i = 0; i < group_num_per_head; ++i) {
           vec_op::FP32Vec16 vec(curr_partial_output_buffer_iter);
diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp
index 3523893c38..db4c5df2e8 100644
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -248,8 +248,8 @@ class TileGemmNeonFMLA {
 }  // namespace
 
 // this is similar to "ISA::VEC" at the moment
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::NEON, scalar_t, head_dim, kv_cache_scalar_t> {
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
@@ -343,7 +343,8 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float /*k_inv*/ = 0.0f, const float /*v_inv*/ = 0.0f) {
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -388,7 +389,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
 #ifdef ARM_BF16_SUPPORT
 // For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
 template <int64_t head_dim>
-class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
+class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim, c10::BFloat16>
     : public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
                                      head_dim> {};
 #endif
diff --git a/csrc/cpu/cpu_attn_neon_bfmmla.hpp b/csrc/cpu/cpu_attn_neon_bfmmla.hpp
index fb133aa130..4e4578a74f 100644
--- a/csrc/cpu/cpu_attn_neon_bfmmla.hpp
+++ b/csrc/cpu/cpu_attn_neon_bfmmla.hpp
@@ -602,7 +602,8 @@ class AttentionImplNEONBFMMLA {
       [[maybe_unused]] const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
       const int64_t block_size,
-      [[maybe_unused]] const int64_t block_size_stride) {
+      [[maybe_unused]] const int64_t block_size_stride,
+      const float /*k_inv*/ = 0.0f, const float /*v_inv*/ = 0.0f) {
     const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
     const int64_t v_pair_stride =
         (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
diff --git a/csrc/cpu/cpu_attn_vec.hpp b/csrc/cpu/cpu_attn_vec.hpp
index f51a232ba9..61cae12d67 100644
--- a/csrc/cpu/cpu_attn_vec.hpp
+++ b/csrc/cpu/cpu_attn_vec.hpp
@@ -1,11 +1,37 @@
 #ifndef CPU_ATTN_VEC_HPP
 #define CPU_ATTN_VEC_HPP
 
+#include "cpu_attn_fp8.hpp"
 #include "cpu_attn_impl.hpp"
 
 namespace cpu_attention {
 
 namespace {
+
+// Load 32 kv_cache_t elements starting at ptr and return them as two FP32Vec16s
+// covering the lower 16 and upper 16 positions.
+// For FP8: both halves come from a single BF16Vec32 dequant of 32 bytes.
+// For BF16/FP16/FP32: two separate vector loads at ptr and ptr+16.
+template <typename kv_cache_t>
+FORCE_INLINE std::pair<vec_op::FP32Vec16, vec_op::FP32Vec16> load_b_pair_vec(
+    const kv_cache_t* ptr) {
+  if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e4m3fn>) {
+    // BF16 container, but values are in the FP16 exponent range (bias 15 not
+    // 127).
+    vec_op::BF16Vec32 bf16_b_reg(reinterpret_cast<const uint8_t*>(ptr),
+                                 vec_op::fp8_e4m3_tag{});
+    return {vec_op::FP32Vec16(bf16_b_reg, 0), vec_op::FP32Vec16(bf16_b_reg, 1)};
+  } else if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+    vec_op::BF16Vec32 bf16_b_reg(reinterpret_cast<const uint8_t*>(ptr),
+                                 vec_op::fp8_e5m2_tag{});
+    return {vec_op::FP32Vec16(bf16_b_reg, 0), vec_op::FP32Vec16(bf16_b_reg, 1)};
+  } else {
+    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
+    return {vec_op::FP32Vec16(load_vec_t(ptr)),
+            vec_op::FP32Vec16(load_vec_t(ptr + 16))};
+  }
+}
+
 // 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32]
 template <typename kv_cache_t>
 class TileGemm82 {
@@ -54,10 +80,7 @@ class TileGemm82 {
                          const int32_t block_size, const int32_t dynamic_k_size,
                          const bool accum_c) {
     static_assert(0 < M && M <= 8);
-    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
 
-    kv_cache_t* __restrict__ curr_b_0 = b_tile;
-    kv_cache_t* __restrict__ curr_b_1 = b_tile + 16;
     float* __restrict__ curr_c_0 = c_tile;
     float* __restrict__ curr_c_1 = c_tile + 16;
 
@@ -76,16 +99,14 @@ class TileGemm82 {
     }
 
     float* __restrict__ curr_a = a_tile;
+    kv_cache_t* __restrict__ curr_b = b_tile;
+
     for (int32_t k = 0; k < dynamic_k_size; ++k) {
-      load_vec_t b_0_reg(curr_b_0);
-      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
-      load_vec_t b_1_reg(curr_b_1);
-      vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
+      auto [fp32_b_0_reg, fp32_b_1_reg] = load_b_pair_vec(curr_b);
 
       float* __restrict__ curr_m_a = curr_a;
       vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
-        float v = *curr_m_a;
-        vec_op::FP32Vec16 a_reg(v);
+        vec_op::FP32Vec16 a_reg(*curr_m_a);
         c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
         c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
 
@@ -95,8 +116,7 @@ class TileGemm82 {
 
       // update
       curr_a += 1;
-      curr_b_0 += ldb;
-      curr_b_1 += ldb;
+      curr_b += ldb;
     }
 
     vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
@@ -109,15 +129,20 @@ class TileGemm82 {
     });
   }
 };
+
 }  // namespace
 
 // This is a general but naive implementation based on vector instructions
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::VEC, scalar_t, head_dim, kv_cache_scalar_t> {
+  static constexpr bool fp8_kv =
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e4m3fn> ||
+      std::is_same_v<kv_cache_scalar_t, c10::Float8_e5m2>;
+
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
-  using kv_cache_t = scalar_t;
+  using kv_cache_t = kv_cache_scalar_t;
   using logits_buffer_t = float;
   using partial_output_buffer_t = float;
   using prob_buffer_t = float;
@@ -129,11 +154,45 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
   constexpr static int64_t MaxQHeadNumPerIteration = 8;
   constexpr static int64_t HeadDim = head_dim;
   constexpr static ISA ISAType = ISA::VEC;
-  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+  constexpr static bool scale_on_logits = fp8_kv;
+
+  float k_scale = 1.0f;
+  float v_scale = 1.0f;
 
  public:
+  void init_from_input(const AttentionInput* input) {
+    if constexpr (fp8_kv) {
+      k_scale = input->k_scale_fp8;
+      v_scale = input->v_scale_fp8;
+    }
+  }
+
+  float get_output_v_scale() const noexcept {
+    if constexpr (fp8_kv) {
+      // VEC dequant unpacks FP8 into a pseudo-FP16 layout (exponent bias 15).
+      // E4M3 (bias=7) needs correction 2^(15-7) = 2^8; E5M2 bias matches FP16
+      // so no correction.
+      if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+        return v_scale;
+      } else {
+        return v_scale * 0x1p8f;
+      }
+    }
+    return 1.0f;
+  }
+
   template <template <typename tile_gemm_t> typename attention>
   FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    if constexpr (fp8_kv) {
+      // Same bias correction as get_output_v_scale: VEC FP8→pseudo-FP16 dequant
+      // uses bias 15; E4M3 (bias=7) needs ×2^8, E5M2 (bias=15) needs no
+      // correction.
+      if constexpr (std::is_same_v<kv_cache_t, c10::Float8_e5m2>) {
+        scale *= k_scale;
+      } else {
+        scale *= k_scale * 0x1p8f;
+      }
+    }
     attention<TileGemm82<kv_cache_t>> attention_iteration;
     attention_iteration(CPU_ATTENTION_PARAMS);
   }
@@ -161,17 +220,19 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
                               // row-major
   }
 
-  // Copy q to q_buffer and cast it to fp32
-  static void copy_q_heads_tile(
-      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
-      float* __restrict__ q_buffer, const int32_t q_num,
-      const int32_t q_heads_per_kv, const int64_t q_num_stride,
-      const int64_t q_head_stride, float scale) {
+  // Copy q to q_buffer and cast it to fp32.
+  // FP8: QK scale is folded into execute_attention; copy Q unscaled here.
+  void copy_q_heads_tile(scalar_t* __restrict__ src,
+                         float* __restrict__ q_buffer, const int32_t q_num,
+                         const int32_t q_heads_per_kv,
+                         const int64_t q_num_stride,
+                         const int64_t q_head_stride, float scale) {
     static_assert(head_dim % 16 == 0);
     constexpr int32_t unroll_size = head_dim / 16;
     using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
 
-    vec_op::FP32Vec16 scale_vec(scale);
+    const float effective_scale = fp8_kv ? 1.0f : scale;
+    vec_op::FP32Vec16 scale_vec(effective_scale);
     for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
       for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
         scalar_t* __restrict__ curr_q =
@@ -196,13 +257,26 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
   // reshape K as column-major and V as row-major
   static void reshape_and_cache(
       const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
-      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      kv_cache_t* __restrict__ key_cache, kv_cache_t* __restrict__ value_cache,
       const int64_t* __restrict__ slot_mapping, const int64_t token_num,
       const int64_t key_token_num_stride, const int64_t value_token_num_stride,
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float k_inv = 0.0f, const float v_inv = 0.0f) {
+    if constexpr (fp8_kv) {
+      constexpr auto qfn = select_fp8_quant_fn<kv_cache_t>();
+      reshape_and_cache_fp8_vec_impl<scalar_t, qfn>(
+          key, value, reinterpret_cast<uint8_t*>(key_cache),
+          reinterpret_cast<uint8_t*>(value_cache), slot_mapping, token_num,
+          head_num, head_dim, block_size, key_token_num_stride,
+          key_head_num_stride, value_token_num_stride, value_head_num_stride,
+          num_blocks_stride, cache_head_num_stride, num_blocks_stride,
+          cache_head_num_stride, k_inv, v_inv);
+      return;
+    }
+
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
@@ -220,8 +294,9 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
                                           token_idx * key_token_num_stride +
                                           head_idx * key_head_num_stride;
           scalar_t* key_cache_start_ptr =
-              key_cache + block_idx * num_blocks_stride +
-              head_idx * cache_head_num_stride + block_offset;
+              reinterpret_cast<scalar_t*>(key_cache) +
+              block_idx * num_blocks_stride + head_idx * cache_head_num_stride +
+              block_offset;
 
 #pragma GCC unroll 8
           for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
@@ -234,8 +309,9 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
                                             token_idx * value_token_num_stride +
                                             head_idx * value_head_num_stride;
           scalar_t* value_cache_start_ptr =
-              value_cache + block_idx * num_blocks_stride +
-              head_idx * cache_head_num_stride + block_offset * head_dim;
+              reinterpret_cast<scalar_t*>(value_cache) +
+              block_idx * num_blocks_stride + head_idx * cache_head_num_stride +
+              block_offset * head_dim;
           std::memcpy(value_cache_start_ptr, value_start_ptr,
                       sizeof(scalar_t) * head_dim);
         }
@@ -243,6 +319,7 @@ class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
     }
   }
 };
+
 }  // namespace cpu_attention
 
 #endif
diff --git a/csrc/cpu/cpu_attn_vec16.hpp b/csrc/cpu/cpu_attn_vec16.hpp
index 06e4ad7624..bc15d614a7 100644
--- a/csrc/cpu/cpu_attn_vec16.hpp
+++ b/csrc/cpu/cpu_attn_vec16.hpp
@@ -116,9 +116,9 @@ class TileGemm161 {
 }  // namespace
 
 // This is a general but naive implementation based on vector instructions
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::VEC16, scalar_t, head_dim>
-    : public AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::VEC16, scalar_t, head_dim, kv_cache_scalar_t>
+    : public AttentionImpl<ISA::VEC, scalar_t, head_dim, kv_cache_scalar_t> {
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
diff --git a/csrc/cpu/cpu_attn_vxe.hpp b/csrc/cpu/cpu_attn_vxe.hpp
index 45db4ebd73..cbfda4cf78 100644
--- a/csrc/cpu/cpu_attn_vxe.hpp
+++ b/csrc/cpu/cpu_attn_vxe.hpp
@@ -244,8 +244,8 @@ class TileGemmS390X {
 
 }  // namespace
 
-template <typename scalar_t, int64_t head_dim>
-class AttentionImpl<ISA::VXE, scalar_t, head_dim> {
+template <typename scalar_t, int64_t head_dim, typename kv_cache_scalar_t>
+class AttentionImpl<ISA::VXE, scalar_t, head_dim, kv_cache_scalar_t> {
  public:
   using query_t = scalar_t;
   using q_buffer_t = float;
@@ -342,7 +342,8 @@ class AttentionImpl<ISA::VXE, scalar_t, head_dim> {
       const int64_t head_num, const int64_t key_head_num_stride,
       const int64_t value_head_num_stride, const int64_t num_blocks,
       const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
-      const int64_t block_size, const int64_t block_size_stride) {
+      const int64_t block_size, const int64_t block_size_stride,
+      const float /*k_inv*/ = 0.0f, const float /*v_inv*/ = 0.0f) {
 #pragma omp parallel for collapse(2)
     for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
       for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index c574290af9..02becd891f 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -15,6 +15,9 @@ using namespace at::vec;
 
 namespace vec_op {
 
+struct fp8_e4m3_tag {};
+struct fp8_e5m2_tag {};
+
 #define APHRODITE_DISPATCH_CASE_FLOATING_TYPES(...)    \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
@@ -323,6 +326,9 @@ struct BF16Vec32 : public VectorizedRegWrapper<BF16Vec32, 4, c10::BFloat16> {
     reg.val[2] = vec8_data.reg.val[0];
     reg.val[3] = vec8_data.reg.val[0];
   };
+
+  explicit BF16Vec32(const uint8_t*, fp8_e4m3_tag) : Base() {}
+  explicit BF16Vec32(const uint8_t*, fp8_e5m2_tag) : Base() {}
 };
 
 struct FP32Vec4 : public VectorizedRegWrapper<FP32Vec4, 1, float> {
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index 962d67525b..46d217c0a2 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -8,6 +8,9 @@
 #include <torch/all.h>
 namespace vec_op {
 
+struct fp8_e4m3_tag {};
+struct fp8_e5m2_tag {};
+
 #define vec_neg(a) (-(a))
 #define vec_add(a, b) ((a) + (b))
 #define vec_sub(a, b) ((a) - (b))
@@ -242,6 +245,9 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   explicit BF16Vec32(const BF16Vec8& vec8_data)
       : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
 
+  explicit BF16Vec32(const uint8_t*, fp8_e4m3_tag) : reg{} {}
+  explicit BF16Vec32(const uint8_t*, fp8_e5m2_tag) : reg{} {}
+
   void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
 };
 
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 3ba89ace9e..ed694ba64b 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -11,6 +11,17 @@ static_assert(false, "AVX2 must be supported for the current implementation.");
 
 namespace vec_op {
 
+// Tags for FP8 BF16Vec32 constructors (avoid overload collision with
+// BF16Vec32(void*)).
+// VEC path (FP8 → pseudo-FP16 layout, scale correction applied later):
+struct fp8_e4m3_tag {};  // E4M3 → pseudo-FP16; FP16 value = true_E4M3 * 2^-8
+struct fp8_e5m2_tag {};  // E5M2 → FP16 bits directly (same exponent bias=15)
+// AMX path (FP8 → unscaled BF16, no FP32 round-trip):
+// BF16 value = true_E4M3 * 2^-120 (E4M3) or true_E5M2 * 2^-112 (E5M2).
+// Exponent rebiasing is folded into k/v scales by the caller.
+struct fp8_bf16_e4m3_tag {};
+struct fp8_bf16_e5m2_tag {};
+
 #define APHRODITE_DISPATCH_CASE_FLOATING_TYPES(...)       \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
@@ -177,6 +188,50 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
                                (__m128i)vec8_data.reg, 2),
             (__m128i)vec8_data.reg, 3)) {}
 
+  // Decode 32 FP8-E4M3 bytes to pseudo-FP16 layout (stored in the BF16
+  // register).  Result = true_E4M3 * 2^-8; caller applies scale * 2^8.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m512i b16 = _mm512_cvtepu8_epi16(b8);
+    __m512i sign =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x80)), 8);
+    __m512i payload =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x7F)), 7);
+    reg = _mm512_or_si512(sign, payload);
+  }
+
+  // Decode 32 FP8-E5M2 bytes to FP16 layout.
+  // E5M2 and FP16 share the same 5-bit exponent bias (15), so FP8 byte b maps
+  // directly to FP16 bits by shifting left 8 — no sign/payload reconstruction.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    reg = _mm512_slli_epi16(_mm512_cvtepu8_epi16(b8), 8);
+  }
+
+  // Direct FP8-E4M3 → unscaled BF16 for AMX (no FP32 round-trip).
+  // BF16 value = true_E4M3 * 2^-120; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m512i b16 = _mm512_cvtepu8_epi16(b8);
+    __m512i sign =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x80)), 8);
+    __m512i payload =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x7F)), 4);
+    reg = _mm512_or_si512(sign, payload);
+  }
+
+  // Direct FP8-E5M2 → unscaled BF16 for AMX (no FP32 round-trip).
+  // BF16 value = true_E5M2 * 2^-112; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m512i b16 = _mm512_cvtepu8_epi16(b8);
+    __m512i sign =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x80)), 8);
+    __m512i payload =
+        _mm512_slli_epi16(_mm512_and_si512(b16, _mm512_set1_epi16(0x7F)), 5);
+    reg = _mm512_or_si512(sign, payload);
+  }
+
   void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; }
 };
 #else
@@ -201,6 +256,77 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
             _mm256_castsi128_si256((__m128i)vec8_data.reg),
             (__m128i)vec8_data.reg, 1)) {}
 
+  // E4M3 decode (AVX2 path) — same bit-layout trick as the AVX512 variant
+  // above.  Result = true_E4M3 * 2^-8; caller applies scale * 2^8.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    __m256i b16_low = _mm256_cvtepu8_epi16(b8_low);
+    __m256i b16_high = _mm256_cvtepu8_epi16(b8_high);
+
+    __m256i sign_low = _mm256_slli_epi16(
+        _mm256_and_si256(b16_low, _mm256_set1_epi16(0x80)), 8);
+    __m256i payload_low = _mm256_slli_epi16(
+        _mm256_and_si256(b16_low, _mm256_set1_epi16(0x7F)), 7);
+    __m256i sign_high = _mm256_slli_epi16(
+        _mm256_and_si256(b16_high, _mm256_set1_epi16(0x80)), 8);
+    __m256i payload_high = _mm256_slli_epi16(
+        _mm256_and_si256(b16_high, _mm256_set1_epi16(0x7F)), 7);
+    reg_low = _mm256_or_si256(sign_low, payload_low);
+    reg_high = _mm256_or_si256(sign_high, payload_high);
+  }
+
+  // E5M2 decode (AVX2 path) — b << 8 maps to FP16 bits; see AVX512 variant
+  // above.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    reg_low = _mm256_slli_epi16(_mm256_cvtepu8_epi16(b8_low), 8);
+    reg_high = _mm256_slli_epi16(_mm256_cvtepu8_epi16(b8_high), 8);
+  }
+
+  // Direct FP8-E4M3 → unscaled BF16 for AMX (AVX2 path, no FP32 round-trip).
+  // BF16 value = true_E4M3 * 2^-120; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e4m3_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    __m256i b16_low = _mm256_cvtepu8_epi16(b8_low);
+    __m256i b16_high = _mm256_cvtepu8_epi16(b8_high);
+    reg_low = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x7F)),
+                          4));
+    reg_high = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x7F)),
+                          4));
+  }
+
+  // Direct FP8-E5M2 → unscaled BF16 for AMX (AVX2 path, no FP32 round-trip).
+  // BF16 value = true_E5M2 * 2^-112; exponent rebiasing folded into k/v scales.
+  explicit BF16Vec32(const uint8_t* ptr, fp8_bf16_e5m2_tag) {
+    __m256i b8 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+    __m128i b8_low = _mm256_extracti128_si256(b8, 0);
+    __m128i b8_high = _mm256_extracti128_si256(b8, 1);
+    __m256i b16_low = _mm256_cvtepu8_epi16(b8_low);
+    __m256i b16_high = _mm256_cvtepu8_epi16(b8_high);
+    reg_low = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_low, _mm256_set1_epi16(0x7F)),
+                          5));
+    reg_high = _mm256_or_si256(
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x80)),
+                          8),
+        _mm256_slli_epi16(_mm256_and_si256(b16_high, _mm256_set1_epi16(0x7F)),
+                          5));
+  }
+
   void save(void* ptr) const {
     _mm256_storeu_si256((__m256i*)ptr, reg_low);
     _mm256_storeu_si256((__m256i*)ptr + 1, reg_high);
@@ -391,6 +517,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
       : reg(_mm512_castsi512_ps(
             _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
 
+  // Widen FP16 lanes [16*upper, 16*upper+16); extract index must be immediate.
+  explicit FP32Vec16(const BF16Vec32& v, int upper)
+      : reg(_mm512_cvtph_ps(upper ? _mm512_extracti32x8_epi32(v.reg, 1)
+                                  : _mm512_extracti32x8_epi32(v.reg, 0))) {}
+
   explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {}
 
   explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@@ -495,6 +626,14 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   explicit FP32Vec16(const FP32Vec8& data)
       : reg_low(data.reg), reg_high(data.reg) {}
 
+  explicit FP32Vec16(const BF16Vec32& v, int upper) {
+    const __m256i& half = upper ? v.reg_high : v.reg_low;
+    __m128i lo = _mm256_extractf128_si256(half, 0);
+    __m128i hi = _mm256_extractf128_si256(half, 1);
+    reg_low = _mm256_cvtph_ps(lo);
+    reg_high = _mm256_cvtph_ps(hi);
+  }
+
   explicit FP32Vec16(const FP16Vec16& v) {
     __m128i low = _mm256_extractf128_si256(v.reg, 0);
     __m128i high = _mm256_extractf128_si256(v.reg, 1);
diff --git a/csrc/cpu/generate_cpu_attn_dispatch.py b/csrc/cpu/generate_cpu_attn_dispatch.py
index 9a889490d8..f479d27945 100644
--- a/csrc/cpu/generate_cpu_attn_dispatch.py
+++ b/csrc/cpu/generate_cpu_attn_dispatch.py
@@ -22,71 +22,93 @@
     "VXE": 4,
 }
 
+# KV cache index: 0 = auto (same as scalar_t), 1 = fp8_e4m3, 2 = fp8_e5m2
+KV_CACHE_IDX = {
+    "auto": 0,
+    "fp8_e4m3": 1,
+    "fp8_e5m2": 2,
+}
+
+# C++ type for each kv_cache index
+KV_CACHE_CPP_TYPES = {
+    "auto": "scalar_t",
+    "fp8_e4m3": "c10::Float8_e4m3fn",
+    "fp8_e5m2": "c10::Float8_e5m2",
+}
+
 # ISAs supported for head_dims divisible by 32
 ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16", "VXE"]
 
 # ISAs supported for head_dims divisible by 16 only
 ISA_FOR_16 = ["VEC16"]
 
+# ISAs that support FP8 KV cache (x86 AVX2/AVX-512 required)
+ISA_FOR_FP8 = ["AMX", "VEC"]
+
 
-def encode_params(head_dim: int, isa_type: str) -> int:
-    """Encode head_dim and ISA type into a single int64_t."""
+def encode_params(head_dim: int, isa_type: str, kv_cache: str = "auto") -> int:
+    """Encode head_dim, ISA type, and KV cache type into a single int64_t."""
     isa_val = ISA_TYPES[isa_type]
-    # Encoding: (head_dim << 8) | isa_type
-    # This allows head_dim up to 2^56 - 1 and 256 ISA types
-    return (head_dim << 8) | isa_val
+    kv_val = KV_CACHE_IDX[kv_cache]
+    # Encoding: (head_dim << 16) | (kv_cache_idx << 8) | isa_type
+    # Fits head_dim up to 2^47 - 1 (signed int64_t), 256 KV cache and 256 ISA types
+    return (head_dim << 16) | (kv_val << 8) | isa_val
+
+
+def _make_case(head_dim: int, isa: str, kv_cache: str = "auto", isa_override: str | None = None) -> str:
+    """Generate one switch case; isa_override, if set, is the ISA instantiated."""
+    encoded = encode_params(head_dim, isa, kv_cache)
+    actual_isa = isa_override if isa_override else isa
+    cpp_type = KV_CACHE_CPP_TYPES[kv_cache]
+    attn_impl = (
+        f"cpu_attention::AttentionImpl<"
+        f"cpu_attention::ISA::{actual_isa}, \\\n"
+        f"                                                       "
+        f"scalar_t, head_dim, {cpp_type}>"
+    )
+    comment = f"head_dim={head_dim}, isa={isa}"
+    if kv_cache != "auto":
+        comment += f", kv_cache={kv_cache}"
+    if isa_override:  # label keeps the requested ISA; note the one really used
+        comment += f" (using {isa_override})"
+    return (
+        f"""      case {encoded}LL: {{ """
+        f"""/* {comment} */ \\"""
+        f"""
+        constexpr size_t head_dim = {head_dim}; \\"""
+        f"""
+        using attn_impl = {attn_impl}; \\"""
+        f"""
+        return __VA_ARGS__(); \\"""
+        f"""
+      }} \\"""
+    )
 
 
-def generate_cases_for_isa_group(isa_list: list[str]) -> str:
+def generate_cases_for_isa_group(isa_list: list[str], include_fp8: bool = False) -> str:
     """Generate switch cases for a specific ISA group."""
     cases = []
 
-    # Generate cases for head_dims divisible by 32
+    # Non-FP8 cases for head_dims divisible by 32
     for head_dim in HEAD_DIMS_32:
         for isa in isa_list:
             if isa not in ISA_FOR_32:
                 continue
-            encoded = encode_params(head_dim, isa)
-            case_str = (
-                f"""      case {encoded}LL: {{ """
-                f"""/* head_dim={head_dim}, isa={isa} */ \\"""
-                f"""
-        constexpr size_t head_dim = {head_dim}; \\"""
-                f"""
-        using attn_impl = cpu_attention::AttentionImpl<"""
-                f"""cpu_attention::ISA::{isa}, \\"""
-                f"""
-                                                       """
-                f"""scalar_t, head_dim>; \\"""
-                f"""
-        return __VA_ARGS__(); \\"""
-                f"""
-      }} \\"""
-            )
-            cases.append(case_str)
+            cases.append(_make_case(head_dim, isa, "auto"))
 
-    # Generate cases for head_dims divisible by 16 only
+    # Non-FP8 cases for head_dims divisible by 16 only
     for head_dim in HEAD_DIMS_16:
         for isa in isa_list:
-            encoded = encode_params(head_dim, isa)
-            case_str = (
-                f"""      case {encoded}LL: {{ """
-                f"""/* head_dim={head_dim}, isa={isa} """
-                f"""(using VEC16) */ \\"""
-                f"""
-        constexpr size_t head_dim = {head_dim}; \\"""
-                f"""
-        using attn_impl = cpu_attention::AttentionImpl<"""
-                f"""cpu_attention::ISA::VEC16, \\"""
-                f"""
-                                                       """
-                f"""scalar_t, head_dim>; \\"""
-                f"""
-        return __VA_ARGS__(); \\"""
-                f"""
-      }} \\"""
-            )
-            cases.append(case_str)
+            cases.append(_make_case(head_dim, isa, "auto", isa_override="VEC16"))
+
+    # FP8 cases: only AMX and VEC, only head_dims divisible by 32
+    if include_fp8:
+        for fp8_type in ("fp8_e4m3", "fp8_e5m2"):
+            for head_dim in HEAD_DIMS_32:
+                for isa in isa_list:
+                    if isa not in ISA_FOR_FP8:
+                        continue
+                    cases.append(_make_case(head_dim, isa, fp8_type))
 
     return "\n".join(cases)
 
@@ -94,8 +116,9 @@ def generate_cases_for_isa_group(isa_list: list[str]) -> str:
 def generate_helper_function() -> str:
     """Generate helper function to encode parameters."""
     return """
-inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {
-  return (head_dim << 8) | static_cast<int64_t>(isa);
+inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa,
+                                      int64_t kv_cache_idx = 0) {
+  return (head_dim << 16) | (kv_cache_idx << 8) | static_cast<int64_t>(isa);
 }
 """
 
@@ -129,87 +152,74 @@ def generate_header_file() -> str:
 
     # Generate dispatch macro with conditional compilation for different ISA sets
     header += """
-// Dispatch macro using encoded parameters
-"""
-
-    # x86_64 with AMX
-    header += """#if defined(CPU_CAPABILITY_AMXBF16)
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
+// Dispatch macro using encoded parameters.
+// KV_CACHE_IDX: Fp8KVCacheDataType enum value (kAuto=0, kFp8E4M3=1, kFp8E5M2=2).
+// FP8 cases (kv_cache_idx != 0) are generated on x86 platforms with AVX2 or
+// AVX-512: BF16Vec32 FP8 constructors have both AVX-512 and AVX2 implementations
+// in cpu_types_x86.hpp. Non-x86 platforms (#else fallback) have fp8=False.
 """
-    header += generate_cases_for_isa_group(["AMX", "VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
 
-"""
-
-    # ARM64 with NEON
-    header += """#elif defined(__aarch64__)
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
-"""
-    header += generate_cases_for_isa_group(["NEON", "VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-"""
-
-    # s390x with VXE
-    header += """#elif defined(__s390x__)
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
-"""
-    header += generate_cases_for_isa_group(["VXE", "VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-"""
-
-    # Fallback: VEC and VEC16 only
-    header += """#else
-#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
-  [&] { \\
-    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
-    switch (encoded_params) { \\
-"""
-    header += generate_cases_for_isa_group(["VEC", "VEC16"])
-    header += """
-      default: { \\
-        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
-                    std::to_string(HEAD_DIM) + " isa=" + \\
-                    std::to_string(static_cast<int>(ISA_TYPE))); \\
-      } \\
-    } \\
-  }()
-
-#endif  /* CPU_CAPABILITY_AMXBF16 / __aarch64__ / __s390x__ */
-
-#endif  // CPU_ATTN_DISPATCH_GENERATED_H
-"""
+    def _macro_block(guard: str, isa_list: list[str], fp8: bool) -> str:
+        """Return one CPU_ATTN_DISPATCH macro block for a given guard."""
+        enc = "    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE, KV_CACHE_IDX); \\"
+        cases = generate_cases_for_isa_group(isa_list, include_fp8=fp8)
+        tail = (
+            "\n"
+            "      default: { \\\n"
+            "        TORCH_CHECK(false, "
+            '"Unsupported CPU attention configuration: head_dim=" + \\\n'
+            '                    std::to_string(HEAD_DIM) + " isa=" + \\\n'
+            "                    std::to_string(static_cast<int>(ISA_TYPE))"
+            " + \\\n"
+            '                    " kv_cache_idx=" + '
+            "std::to_string(KV_CACHE_IDX)); \\\n"
+            "      } \\\n"
+            "    } \\\n"
+            "  }()\n\n"
+        )
+        return (
+            f"{guard}\n"
+            "#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, KV_CACHE_IDX, ...) \\\n"
+            "  [&] { \\\n"
+            f"{enc}\n"
+            "    switch (encoded_params) { \\\n"
+            f"{cases}"
+            f"{tail}"
+        )
+
+    header += _macro_block(
+        "#if defined(CPU_CAPABILITY_AMXBF16)",
+        ["AMX", "VEC", "VEC16"],
+        fp8=True,
+    )
+    header += _macro_block(
+        "#elif defined(__aarch64__)",
+        ["NEON", "VEC", "VEC16"],
+        fp8=False,
+    )
+    header += _macro_block(
+        "#elif defined(__s390x__)",
+        ["VXE", "VEC", "VEC16"],
+        fp8=False,
+    )
+    header += _macro_block(
+        "#elif defined(__AVX512F__)",
+        ["VEC", "VEC16"],
+        fp8=True,
+    )
+    header += _macro_block(
+        "#elif defined(__AVX2__)",
+        ["VEC", "VEC16"],
+        fp8=True,
+    )
+    header += _macro_block(
+        "#else",
+        ["VEC", "VEC16"],
+        fp8=False,
+    )
+    header += (
+        "#endif  /* CPU_CAPABILITY_AMXBF16 / __aarch64__ / __s390x__ */\n\n#endif  // CPU_ATTN_DISPATCH_GENERATED_H\n"
+    )
 
     return header
 
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index 2bbc955a77..428df64269 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -101,7 +101,9 @@ void cpu_attn_reshape_and_cache(const torch::Tensor& key,
                                 torch::Tensor& key_cache,
                                 torch::Tensor& value_cache,
                                 const torch::Tensor& slot_mapping,
-                                const std::string& isa);
+                                const std::string& isa, const double k_scale,
+                                const double v_scale,
+                                const std::string& kv_cache_dtype);
 
 void cpu_attention_with_kv_cache(
     const torch::Tensor& query, const torch::Tensor& key_cache,
@@ -112,7 +114,8 @@ void cpu_attention_with_kv_cache(
     const int64_t sliding_window_left, const int64_t sliding_window_right,
     const torch::Tensor& block_table, const double softcap,
     const torch::Tensor& scheduler_metadata,
-    const std::optional<torch::Tensor>& s_aux);
+    const std::optional<torch::Tensor>& s_aux, const double k_scale,
+    const double v_scale, const std::string& kv_cache_dtype);
 
 // Note: just for avoiding importing errors
 void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
@@ -384,15 +387,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       &get_scheduler_metadata);
   ops.def(
       "cpu_attn_reshape_and_cache(Tensor key, Tensor value, Tensor(a2!) "
-      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str "
-      "isa) -> ()",
+      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str isa, "
+      "float k_scale=1.0, float v_scale=1.0, str kv_cache_dtype=\"auto\") -> "
+      "()",
       &cpu_attn_reshape_and_cache);
   ops.def(
       "cpu_attention_with_kv_cache(Tensor query, Tensor key_cache, Tensor "
       "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
       "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
       "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
-      "float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()",
+      "float softcap, Tensor scheduler_metadata, Tensor? s_aux, "
+      "float k_scale=1.0, float v_scale=1.0, str kv_cache_dtype=\"auto\") -> "
+      "()",
       &cpu_attention_with_kv_cache);
 
   // placeholders
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
index c32d3a0606..ca96b0ef3f 100644
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -96,44 +96,14 @@ struct enable_sm90_or_later : Kernel {
 };
 
 template <typename Kernel>
-struct enable_sm90_only : Kernel {
+struct enable_sm100_to_sm120 : Kernel {
   template <typename... Args>
   CUTLASS_DEVICE void operator()(Args&&... args) {
 #if defined __CUDA_ARCH__
-  #if __CUDA_ARCH__ == 900
+  #if (__CUDA_ARCH__ >= 1000 && __CUDA_ARCH__ < 1200)
     Kernel::operator()(std::forward<Args>(args)...);
   #else
-    printf("This kernel only supports sm90.\n");
-    asm("trap;");
-  #endif
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm100f_only : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__
-  #if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
-    Kernel::operator()(std::forward<Args>(args)...);
-  #else
-    printf("This kernel only supports sm100f.\n");
-    asm("trap;");
-  #endif
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm100a_only : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__
-  #if __CUDA_ARCH__ == 1000
-    Kernel::operator()(std::forward<Args>(args)...);
-  #else
-    printf("This kernel only supports sm100a.\n");
+    printf("This kernel only supports sm[100, 120).\n");
     asm("trap;");
   #endif
 #endif
@@ -148,7 +118,7 @@ struct enable_sm120_only : Kernel {
   #if __CUDA_ARCH__ == 1200
     Kernel::operator()(std::forward<Args>(args)...);
   #else
-    printf("This kernel only supports sm120.\n");
+    printf("This kernel only supports sm120a.\n");
     asm("trap;");
   #endif
 #endif
@@ -160,8 +130,13 @@ template <typename Kernel>
 struct enable_sm120_family : Kernel {
   template <typename... Args>
   CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
+#if defined __CUDA_ARCH__
+  #if (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
     Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm120f.\n");
+    asm("trap;");
+  #endif
 #endif
   }
 };
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
index 7ccebf7516..0874fe71c2 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
@@ -141,7 +141,7 @@ struct cutlass_3x_gemm_sm100 {
               sizeof(typename CollectiveEpilogue::SharedStorage))>,
           KernelSchedule>::CollectiveOp;
 
-  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+  using GemmKernel = enable_sm100_to_sm120<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };
 
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index 6e2f0b5662..11676639c8 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -125,7 +125,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
           MainloopScheduler
       >::CollectiveOp>;
 
-  using KernelType = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = enable_sm100_to_sm120<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
 
   struct GemmKernel : public KernelType {};
diff --git a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index d6ae939f6d..aecf052c7c 100644
--- a/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/libtorch_stable/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -92,7 +92,7 @@ struct cutlass_3x_gemm_sm100_fp8 {
   // -----------------------------------------------------------
   // Kernel definition
   // -----------------------------------------------------------
-  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+  using GemmKernel = enable_sm100_to_sm120<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };
 
diff --git a/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu b/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
index 1894503f46..61e084a4b1 100644
--- a/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
+++ b/csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -236,17 +236,41 @@ void per_token_group_quant_8bit(const torch::stable::Tensor& input,
 #undef LAUNCH_KERNEL
 }
 
-template <typename T, typename DST_DTYPE>
-__global__ void per_token_group_quant_8bit_packed_kernel(
+// Register-resident fast path for group_size==128.
+//
+// Each thread holds 16 source elements (32 B = uint4 x 2) in registers across
+// the absmax reduce -> scale compute -> quantize pipeline. No shared memory.
+// UE8M0 scale extracted via bit math (bit-exact with exp2f(ceilf(log2f))).
+//
+// Loads two contiguous uint4s (16 B + 16 B = 32 B) per thread; nvcc emits
+// these as two 128-bit loads (2x LDG.E.128) on sm_100 rather than fusing them.
+//
+// Constraints: GROUP_SIZE == THREADS_PER_GROUP * VEC_SIZE (static_assert); for
+// THREADS_PER_GROUP=8 and bf16/fp16 (VEC_SIZE=16), this means GROUP_SIZE=128.
+template <typename T, typename DST_DTYPE, int GROUP_SIZE>
+__global__ void per_token_group_quant_8bit_packed_register_kernel(
     const T* __restrict__ input, void* __restrict__ output_q,
-    unsigned int* __restrict__ output_s_packed, const int group_size,
-    const int num_groups_padded, const int groups_per_block,
-    const int padded_groups_per_row, const int groups_per_row, const int mn,
-    const int tma_aligned_mn, const int num_scale_elems, const float eps,
+    unsigned int* __restrict__ output_s_packed, const int64_t num_groups_padded,
+    const int groups_per_block, const int padded_groups_per_row,
+    const int groups_per_row, const int mn, const int output_q_mn_extent,
+    const int tma_aligned_mn, const int64_t num_scale_elems, const float eps,
     const float min_8bit, const float max_8bit) {
-  const int threads_per_group = 16;
-  const int64_t local_group_id = threadIdx.x / threads_per_group;
-  const int lane_id = threadIdx.x % threads_per_group;
+  static_assert(GROUP_SIZE == 128, "fast path supports GROUP_SIZE==128");
+  constexpr int THREADS_PER_GROUP = 8;
+  constexpr int VEC_SIZE = 32 / sizeof(T);  // 16 for bf16/fp16
+  static_assert(GROUP_SIZE == THREADS_PER_GROUP * VEC_SIZE,
+                "GROUP_SIZE must equal THREADS_PER_GROUP * VEC_SIZE");
+  // Each group's 8 threads must live in a single warp octet so the
+  // 0xffu << (threadIdx.x & 24u) shuffle mask selects exactly the lanes
+  // that share a group. Requires 32 % THREADS_PER_GROUP == 0 and the host
+  // to launch num_threads as a multiple of THREADS_PER_GROUP (which it does
+  // via num_threads = groups_per_block * THREADS_PER_GROUP).
+  static_assert(32 % THREADS_PER_GROUP == 0,
+                "THREADS_PER_GROUP must divide warp size for the shuffle "
+                "mask to be valid");
+
+  const int local_group_id = threadIdx.x / THREADS_PER_GROUP;
+  const int lane_id = threadIdx.x % THREADS_PER_GROUP;
 
   const int64_t block_group_id = blockIdx.x * groups_per_block;
   const int64_t global_group_id = block_group_id + local_group_id;
@@ -254,141 +278,207 @@ __global__ void per_token_group_quant_8bit_packed_kernel(
     return;
   }
 
-  // map flat group id to 2D indices (mn_idx, sf_k_idx)
   const int sf_k_idx =
       static_cast<int>(global_group_id % padded_groups_per_row);
   const int mn_idx = static_cast<int>(global_group_id / padded_groups_per_row);
-
-  // whether it is a valid group (not padding)
   const bool is_valid_group = (mn_idx < mn) && (sf_k_idx < groups_per_row);
 
-  // shared memory to cache each group's data to avoid double DRAM reads.
-  extern __shared__ __align__(16) char smem_raw[];
-  T* smem = reinterpret_cast<T*>(smem_raw);
-  T* smem_group = smem + local_group_id * group_size;
-
-  // compute scale for valid groups
-  float y_s = 0.f;
+  // Load 16 input elements (32 B) into registers as two adjacent uint4
+  // loads. nvcc keeps these as 2x LDG.E.128 on sm_100; the per-thread cost
+  // is dominated by HBM bandwidth at large MN, so a fused 256-bit load via
+  // inline PTX gave no measurable speedup.
+  // alignas(16) is required so the uint4* reinterpret_cast below is
+  // well-defined for T == bf16/fp16 (default alignof is 2).
+  alignas(16) T regs[VEC_SIZE];
+  float local_absmax = eps;
   if (is_valid_group) {
     const T* group_input =
-        input + static_cast<int64_t>(mn_idx) * groups_per_row * group_size +
-        sf_k_idx * group_size;
-    y_s = ComputeGroupScale<T, true>(group_input, smem_group, group_size,
-                                     lane_id, threads_per_group, eps, max_8bit);
+        input + static_cast<int64_t>(mn_idx) * groups_per_row * GROUP_SIZE +
+        sf_k_idx * GROUP_SIZE + lane_id * VEC_SIZE;
+    uint4* dst = reinterpret_cast<uint4*>(&regs[0]);
+    const uint4* src = reinterpret_cast<const uint4*>(group_input);
+    dst[0] = src[0];
+    dst[1] = src[1];
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float v = fabsf(static_cast<float>(regs[i]));
+      local_absmax = fmaxf(local_absmax, v);
+    }
   }
 
-  // pack 4 scales into a uint32 exponent
+  // 8-lane subgroup shuffle reduce (octet of the warp). The mask selects the
+  // 8 lanes within the warp that share a group.
+  unsigned mask = 0xffu << (threadIdx.x & 24u);
+  local_absmax = fmaxf(local_absmax, __shfl_xor_sync(mask, local_absmax, 4));
+  local_absmax = fmaxf(local_absmax, __shfl_xor_sync(mask, local_absmax, 2));
+  local_absmax = fmaxf(local_absmax, __shfl_xor_sync(mask, local_absmax, 1));
+
+  float y_s = local_absmax / max_8bit;
+  y_s = fmaxf(y_s, 1e-10f);
+  uint32_t bits = __float_as_uint(y_s);
+  uint32_t exp_bits = (bits >> 23) & 0xffu;
+  uint32_t mant_bits = bits & 0x7fffffu;
+  uint8_t exp_byte =
+      static_cast<uint8_t>(exp_bits + (mant_bits != 0u ? 1u : 0u));
+
+  // Lane 0 writes the packed scale byte.
   if (lane_id == 0) {
-    // each uint32 in output_s_packed stores 4 packed scales
     const int sf_k_pack_idx = sf_k_idx / 4;
     const int pos = sf_k_idx % 4;
     const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx;
-
     if (is_valid_group) {
-      // reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit
-      // exponent, and place it into the correct byte of the 32-bit word.
-      const unsigned int bits = __float_as_uint(y_s);
-      const uint8_t exponent = static_cast<uint8_t>((bits >> 23u) & 0xffu);
-      reinterpret_cast<uint8_t*>(output_s_packed)[out_idx * 4 + pos] = exponent;
+      reinterpret_cast<uint8_t*>(output_s_packed)[out_idx * 4 + pos] = exp_byte;
     } else if (out_idx < num_scale_elems) {
-      // write zero for padding groups if within bounds of output_s_packed
       reinterpret_cast<uint8_t*>(output_s_packed)[out_idx * 4 + pos] = 0;
     }
   }
 
-  __syncthreads();
+  // For padded mn rows that fall within output_q's allocated extent, write
+  // a uint4 of zeros to keep the buffer clean for downstream TMA loads.
+  // Skip writes for sf_k padding (those positions don't exist in output_q).
+  if (!is_valid_group) {
+    if (sf_k_idx < groups_per_row && mn_idx >= mn &&
+        mn_idx < output_q_mn_extent) {
+      DST_DTYPE* group_output =
+          static_cast<DST_DTYPE*>(output_q) +
+          static_cast<int64_t>(mn_idx) * groups_per_row * GROUP_SIZE +
+          sf_k_idx * GROUP_SIZE + lane_id * VEC_SIZE;
+      *reinterpret_cast<uint4*>(group_output) = make_uint4(0, 0, 0, 0);
+    }
+    return;
+  }
 
-  if (is_valid_group) {
-    DST_DTYPE* group_output =
-        static_cast<DST_DTYPE*>(output_q) +
-        static_cast<int64_t>(mn_idx) * groups_per_row * group_size +
-        sf_k_idx * group_size;
-    QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
-                                threads_per_group, y_s, min_8bit, max_8bit);
+  // Reconstruct y_s as a power-of-2 float and use its reciprocal.
+  float y_s_q = __uint_as_float(static_cast<uint32_t>(exp_byte) << 23);
+  float inv_y = 1.0f / y_s_q;
+
+  // Quantize and pack into 16 fp8/int8 bytes (= uint4). VEC_SIZE==16 so we
+  // fill four 32-bit words, four bytes each.
+  uint32_t packed_lo = 0;
+  uint32_t packed_lo_hi = 0;
+  uint32_t packed_hi_lo = 0;
+  uint32_t packed_hi = 0;
+#pragma unroll
+  for (int i = 0; i < VEC_SIZE; ++i) {
+    float q =
+        fminf(fmaxf(static_cast<float>(regs[i]) * inv_y, min_8bit), max_8bit);
+    DST_DTYPE qb = DST_DTYPE(q);
+    uint8_t byte = *reinterpret_cast<uint8_t*>(&qb);
+    const int shift = (i & 3) * 8;
+    if (i < 4) {
+      packed_lo |= static_cast<uint32_t>(byte) << shift;
+    } else if (i < 8) {
+      packed_lo_hi |= static_cast<uint32_t>(byte) << shift;
+    } else if (i < 12) {
+      packed_hi_lo |= static_cast<uint32_t>(byte) << shift;
+    } else {
+      packed_hi |= static_cast<uint32_t>(byte) << shift;
+    }
   }
+
+  uint4 packed_out =
+      make_uint4(packed_lo, packed_lo_hi, packed_hi_lo, packed_hi);
+  DST_DTYPE* group_output =
+      static_cast<DST_DTYPE*>(output_q) +
+      static_cast<int64_t>(mn_idx) * groups_per_row * GROUP_SIZE +
+      sf_k_idx * GROUP_SIZE + lane_id * VEC_SIZE;
+  *reinterpret_cast<uint4*>(group_output) = packed_out;
 }
 
+// Public entry point: register-resident packed quant kernel.
+// Constraints: group_size == 128 and bf16/fp16 input.
 void per_token_group_quant_8bit_packed(const torch::stable::Tensor& input,
                                        torch::stable::Tensor& output_q,
                                        torch::stable::Tensor& output_s_packed,
                                        int64_t group_size, double eps,
                                        double min_8bit, double max_8bit) {
+  STD_TORCH_CHECK(group_size == 128,
+                  "per_token_group_quant_8bit_packed only supports "
+                  "group_size==128, got ",
+                  group_size, ".");
+  const auto in_dtype = input.scalar_type();
+  STD_TORCH_CHECK(
+      in_dtype == torch::headeronly::ScalarType::Half ||
+          in_dtype == torch::headeronly::ScalarType::BFloat16,
+      "per_token_group_quant_8bit_packed only supports bf16/fp16 input.");
+
   STD_TORCH_CHECK(input.is_contiguous());
   STD_TORCH_CHECK(output_q.is_contiguous());
 
   const int64_t k = input.size(-1);
-  STD_TORCH_CHECK(k % group_size == 0, "Last dimension (", k,
-                  ") must be divisible by group_size (", group_size, ").");
+  STD_TORCH_CHECK(k % group_size == 0, "input last dim k=", k,
+                  " is not divisible by group_size=", group_size, ".");
 
   const int64_t mn = input.numel() / k;
   const int64_t groups_per_row = k / group_size;
-
-  STD_TORCH_CHECK(output_s_packed.dim() == 2,
-                  "output_s_packed must be 2D, got dim=", output_s_packed.dim(),
-                  ".");
-
   const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4;
   const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4;
 
+  // output_q may be allocated with extra padded mn rows (e.g.,
+  // (tma_aligned_mn, k)) so the kernel can zero-fill them in-line and the
+  // caller can use torch.empty instead of torch.zeros. The grid only covers
+  // up to tma_aligned_mn, so we cap the extent there.
+  const int64_t output_q_mn_actual = output_q.numel() / k;
+  STD_TORCH_CHECK(output_q_mn_actual >= mn,
+                  "output_q must have at least mn rows; got ",
+                  output_q_mn_actual, " rows for mn=", mn, ".");
+  const int64_t output_q_mn_extent =
+      output_q_mn_actual < tma_aligned_mn ? output_q_mn_actual : tma_aligned_mn;
+
   STD_TORCH_CHECK(
       output_s_packed.scalar_type() == torch::headeronly::ScalarType::Int,
-      "output_s_packed must have dtype int32 for UE8M0-packed scales.");
-  // DeepGEMM expects SFA scales in MN-major form with shape
-  // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last
-  // dimension.
+      "output_s_packed must be int32 for UE8M0-packed scales.");
   STD_TORCH_CHECK(output_s_packed.size(0) == mn &&
                       output_s_packed.size(1) == k_num_packed_sfk,
                   "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk,
-                  "], but got [", output_s_packed.size(0), ", ",
+                  "]; got [", output_s_packed.size(0), ", ",
                   output_s_packed.size(1), "].");
-  // Verify column-major TMA-aligned layout
   STD_TORCH_CHECK(output_s_packed.stride(0) == 1 &&
                       output_s_packed.stride(1) == tma_aligned_mn,
-                  "output_s_packed must have strides [1, ", tma_aligned_mn,
-                  "], but got [", output_s_packed.stride(0), ", ",
+                  "output_s_packed strides must be [1, ", tma_aligned_mn,
+                  "]; got [", output_s_packed.stride(0), ", ",
                   output_s_packed.stride(1), "].");
 
   cudaStream_t stream = get_current_cuda_stream();
 
-  constexpr int THREADS_PER_GROUP = 16;
-
-  // Expand the grid to cover MN and K padding so every byte in
-  // output_s_packed is written (padding bytes get zeroed by the kernel).
+  constexpr int THREADS_PER_GROUP = 8;
   const int64_t padded_groups_per_row = k_num_packed_sfk * 4;
   const int64_t num_groups_padded = tma_aligned_mn * padded_groups_per_row;
-  // Number of elements in output_s_packed.
   const int64_t num_scale_elems = mn + (k_num_packed_sfk - 1) * tma_aligned_mn;
-
   const int groups_per_block = GetGroupsPerBlock(num_groups_padded);
 
   auto dst_type = output_q.scalar_type();
-  const int num_blocks = num_groups_padded / groups_per_block;
+  const int64_t num_blocks = num_groups_padded / groups_per_block;
   const int num_threads = groups_per_block * THREADS_PER_GROUP;
-
-#define LAUNCH_PACKED_KERNEL(T, DST_DTYPE)                                     \
-  do {                                                                         \
-    dim3 grid(num_blocks);                                                     \
-    dim3 block(num_threads);                                                   \
-    size_t smem_bytes =                                                        \
-        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);        \
-    per_token_group_quant_8bit_packed_kernel<T, DST_DTYPE>                     \
-        <<<grid, block, smem_bytes, stream>>>(                                 \
-            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(),      \
-            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),       \
-            static_cast<int>(group_size), static_cast<int>(num_groups_padded), \
-            groups_per_block, static_cast<int>(padded_groups_per_row),         \
-            static_cast<int>(groups_per_row), static_cast<int>(mn),            \
-            static_cast<int>(tma_aligned_mn),                                  \
-            static_cast<int>(num_scale_elems), static_cast<float>(eps),        \
-            static_cast<float>(min_8bit), static_cast<float>(max_8bit));       \
+  // CUDA caps grid.x at 2^31 - 1; this fits any realistic shape but guard
+  // against pathological inputs.
+  STD_TORCH_CHECK(num_blocks <= static_cast<int64_t>(INT32_MAX),
+                  "per_token_group_quant_8bit_packed grid too large: ",
+                  num_blocks, " blocks (max ", INT32_MAX, ").");
+
+#define LAUNCH_REG_KERNEL(T, DST_DTYPE)                                   \
+  do {                                                                    \
+    dim3 grid(static_cast<unsigned int>(num_blocks));                     \
+    dim3 block(num_threads);                                              \
+    per_token_group_quant_8bit_packed_register_kernel<T, DST_DTYPE, 128>  \
+        <<<grid, block, 0, stream>>>(                                     \
+            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(), \
+            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),  \
+            num_groups_padded, groups_per_block,                          \
+            static_cast<int>(padded_groups_per_row),                      \
+            static_cast<int>(groups_per_row), static_cast<int>(mn),       \
+            static_cast<int>(output_q_mn_extent),                         \
+            static_cast<int>(tma_aligned_mn), num_scale_elems,            \
+            static_cast<float>(eps), static_cast<float>(min_8bit),        \
+            static_cast<float>(max_8bit));                                \
   } while (0)
 
-  APHRODITE_STABLE_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] {
+  APHRODITE_STABLE_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "per_token_group_quant_8bit_packed_register", ([&] {
         if (dst_type == torch::headeronly::ScalarType::Float8_e4m3fn) {
-          LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3);
+          LAUNCH_REG_KERNEL(scalar_t, __nv_fp8_e4m3);
         } else if (dst_type == torch::headeronly::ScalarType::Char) {
-          LAUNCH_PACKED_KERNEL(scalar_t, int8_t);
+          LAUNCH_REG_KERNEL(scalar_t, int8_t);
         } else {
           STD_TORCH_CHECK(
               false,
@@ -397,7 +487,7 @@ void per_token_group_quant_8bit_packed(const torch::stable::Tensor& input,
         }
       }));
 
-#undef LAUNCH_PACKED_KERNEL
+#undef LAUNCH_REG_KERNEL
 }
 
 void per_token_group_quant_fp8(const torch::stable::Tensor& input,
diff --git a/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h b/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
index d67fd2b336..6630c0dece 100644
--- a/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
+++ b/csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
@@ -8,3 +8,13 @@ void per_token_group_quant_8bit(const torch::stable::Tensor& input,
                                 torch::stable::Tensor& output_s,
                                 int64_t group_size, double eps, double min_8bit,
                                 double max_8bit, bool scale_ue8m0 = false);
+
+// Public op: register-resident packed quant for the DeepGEMM Blackwell path.
+// Restricted to group_size == 128 and bf16/fp16 input; other configurations
+// raise STD_TORCH_CHECK. The legacy shared-memory fallback was removed because
+// no production caller (deep_gemm_moe / input_quant_fp8) uses other shapes.
+void per_token_group_quant_8bit_packed(const torch::stable::Tensor& input,
+                                       torch::stable::Tensor& output_q,
+                                       torch::stable::Tensor& output_s_packed,
+                                       int64_t group_size, double eps,
+                                       double min_8bit, double max_8bit);
diff --git a/csrc/libtorch_stable/torch_bindings.cpp b/csrc/libtorch_stable/torch_bindings.cpp
index 044d9813fd..f67600b043 100644
--- a/csrc/libtorch_stable/torch_bindings.cpp
+++ b/csrc/libtorch_stable/torch_bindings.cpp
@@ -7,6 +7,10 @@
 // Note: We register under namespace "_C" so ops are accessible as
 // torch.ops._C.<op_name> for compatibility with existing code.
 STABLE_TORCH_LIBRARY_FRAGMENT(_C, ops) {
+#ifndef USE_ROCM
+  ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
+#endif
+
 #ifndef USE_ROCM
   // Compute per-token-group FP8 quantized tensor and scaling factor.
   // The dummy arguments are here so we can correctly fuse with RMSNorm.
@@ -218,6 +222,10 @@ STABLE_TORCH_LIBRARY_FRAGMENT(_C, ops) {
 }
 
 STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) {
+#ifndef USE_ROCM
+  ops.impl("permute_cols", TORCH_BOX(&permute_cols));
+#endif
+
 #ifndef USE_ROCM
   // Per-token group quantization
   ops.impl("per_token_group_fp8_quant", TORCH_BOX(&per_token_group_quant_fp8));
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 973190935d..259745b7d4 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -67,9 +67,6 @@ void shuffle_rows(const torch::Tensor& input_tensor,
                   torch::Tensor& output_tensor);
 
 #ifndef USE_ROCM
-// cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
-torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
-                                    torch::Tensor const& weight);
 
 // DeepSeek V3 optimized router GEMM kernel for SM90+
 // Computes output = mat_a @ mat_b.T where:
diff --git a/csrc/moe/router_gemm.cu b/csrc/moe/router_gemm.cu
deleted file mode 100644
index a939f8846f..0000000000
--- a/csrc/moe/router_gemm.cu
+++ /dev/null
@@ -1,52 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-// bf16 x bf16 -> fp32 router GEMM via cuBLAS.
-// Uses CUBLAS_COMPUTE_32F so bf16 operands accumulate into fp32,
-// matching TRT-LLM's cuBLAS fallback behaviour in dsv3RouterGemmOp.
-
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cublas_v2.h>
-
-// cuBLAS column-major math for row-major PyTorch tensors:
-//   weight[N,K]_row  lda=K  -> cuBLAS sees (K,N) col-major; CUBLAS_OP_T ->
-//   (N,K) input[M,K]_row   ldb=K  -> cuBLAS sees (K,M) col-major; CUBLAS_OP_N
-//   -> (K,M) out[M,N]_row     ldc=N  -> cuBLAS sees (N,M) col-major (written as
-//   output^T)
-// cuBLAS: C(N,M) = weight(N,K) @ input(K,M)  =>  C^T = output[M,N]
-// params: m=N, n=M, k=K, lda=K (weight), ldb=K (input), ldc=N (output)
-
-torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
-                                    torch::Tensor const& weight) {
-  TORCH_CHECK(input.dtype() == torch::kBFloat16,
-              "router_gemm_bf16_fp32: input must be bfloat16");
-  TORCH_CHECK(weight.dtype() == torch::kBFloat16,
-              "router_gemm_bf16_fp32: weight must be bfloat16");
-  TORCH_CHECK(input.dim() == 2 && weight.dim() == 2,
-              "router_gemm_bf16_fp32: input and weight must be 2-D");
-  TORCH_CHECK(input.size(1) == weight.size(1),
-              "router_gemm_bf16_fp32: inner dimensions must match");
-
-  int64_t const M = input.size(0);
-  int64_t const N = weight.size(0);
-  int64_t const K = input.size(1);
-
-  auto out = torch::empty({M, N}, input.options().dtype(torch::kFloat32));
-
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  TORCH_CUDABLAS_CHECK(
-      cublasSetStream(handle, at::cuda::getCurrentCUDAStream()));
-
-  float const alpha = 1.0f;
-  float const beta = 0.0f;
-
-  TORCH_CUDABLAS_CHECK(cublasGemmEx(
-      handle, CUBLAS_OP_T, CUBLAS_OP_N, static_cast<int>(N),
-      static_cast<int>(M), static_cast<int>(K), &alpha, weight.data_ptr(),
-      CUDA_R_16BF, static_cast<int>(K), input.data_ptr(), CUDA_R_16BF,
-      static_cast<int>(K), &beta, out.data_ptr(), CUDA_R_32F,
-      static_cast<int>(N), CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT));
-
-  return out;
-}
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 7bf56ba7a2..b737cb5435 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -133,10 +133,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "Tensor)");
   m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 
-  // cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
-  m.def("router_gemm_bf16_fp32(Tensor input, Tensor weight) -> Tensor");
-  m.impl("router_gemm_bf16_fp32", torch::kCUDA, &router_gemm_bf16_fp32);
-
   // DeepSeek V3 optimized router GEMM for SM90+
   m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
   // conditionally compiled so impl registration is in source file
diff --git a/csrc/persistent_topk.cuh b/csrc/persistent_topk.cuh
index 146990d645..5d111ba88a 100644
--- a/csrc/persistent_topk.cuh
+++ b/csrc/persistent_topk.cuh
@@ -887,27 +887,14 @@ __global__ void __launch_bounds__(kThreadsPerBlock, 2)
   uint32_t* shared_ordered =
       reinterpret_cast<uint32_t*>(smem_raw + kFixedSmemLarge);
 
-  // RadixRowState for multi-CTA cooperative radix
+  // RadixRowState for multi-CTA cooperative radix.
+  // Zero-initialization is done host-side via cudaMemsetAsync in topk.cu
+  // before launch. That gives a stream-ordered happens-before edge for all
+  // CTAs. The previous in-kernel init (CTA-0 only + intra-CTA __syncthreads)
+  // provided no such edge, which manifested as a race against CTA-1+'s first
+  // red_release on arrival_counter.
   RadixRowState* state = &params.row_states[group_id];
 
-  // -- Initialize RadixRowState (only needed if large rows exist) --
-  if (params.max_seq_len > RADIX_THRESHOLD) {
-    if (cta_in_group == 0) {
-      for (uint32_t buf = 0; buf < 3; buf++) {
-        for (uint32_t i = tx; i < RADIX; i += kThreadsPerBlock) {
-          state->histogram[buf][i] = 0;
-        }
-      }
-      if (tx == 0) {
-        state->remaining_k = 0;
-        state->prefix = 0;
-        state->arrival_counter = 0;
-        state->output_counter = 0;
-      }
-    }
-    __syncthreads();
-  }
-
   int barrier_phase = 0;
   const uint32_t total_iters = (params.num_rows + num_groups - 1) / num_groups;
 
diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu
index a68eab1925..05c226f11d 100644
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -7,23 +7,23 @@
 
 namespace aphrodite {
 
-template <typename scalar_t, bool IS_NEOX>
+template <typename scalar_t, typename cache_t, bool IS_NEOX>
 inline __device__ void apply_token_rotary_embedding(
-    scalar_t* __restrict__ arr, const float* __restrict__ cos_ptr,
-    const float* __restrict__ sin_ptr, int rot_offset, int embed_dim,
+    scalar_t* __restrict__ arr, const cache_t* __restrict__ cos_ptr,
+    const cache_t* __restrict__ sin_ptr, int rot_offset, int embed_dim,
     const bool inverse) {
   int x_index, y_index;
   float cos_f, sin_f;
   if (IS_NEOX) {
     x_index = rot_offset;
     y_index = embed_dim + rot_offset;
-    cos_f = APHRODITE_LDG(cos_ptr + x_index);
-    sin_f = APHRODITE_LDG(sin_ptr + x_index);
+    cos_f = static_cast<float>(APHRODITE_LDG(cos_ptr + x_index));
+    sin_f = static_cast<float>(APHRODITE_LDG(sin_ptr + x_index));
   } else {
     x_index = 2 * rot_offset;
     y_index = 2 * rot_offset + 1;
-    cos_f = APHRODITE_LDG(cos_ptr + x_index / 2);
-    sin_f = APHRODITE_LDG(sin_ptr + x_index / 2);
+    cos_f = static_cast<float>(APHRODITE_LDG(cos_ptr + x_index / 2));
+    sin_f = static_cast<float>(APHRODITE_LDG(sin_ptr + x_index / 2));
   }
   if (inverse) {
     sin_f = -sin_f;
@@ -34,7 +34,7 @@ inline __device__ void apply_token_rotary_embedding(
   arr[y_index] = static_cast<scalar_t>(y_f * cos_f + x_f * sin_f);
 }
 
-template <typename scalar_t, bool IS_NEOX>
+template <typename scalar_t, typename cache_t, bool IS_NEOX>
 inline __device__ void apply_rotary_embedding(
     scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                    // head_size] or [num_tokens, num_heads,
@@ -43,14 +43,14 @@ inline __device__ void apply_rotary_embedding(
                                    // [batch_size, seq_len, num_kv_heads,
                                    // head_size] or [num_tokens, num_kv_heads,
                                    // head_size]
-    const float* cache_ptr, const int head_size, const int num_heads,
+    const cache_t* cache_ptr, const int head_size, const int num_heads,
     const int num_kv_heads, const int rot_dim, const int token_idx,
     const int64_t query_stride, const int64_t key_stride,
     const int64_t head_stride, const int64_t rope_dim_offset,
     const bool inverse) {
   const int embed_dim = rot_dim / 2;
-  const float* cos_ptr = cache_ptr;
-  const float* sin_ptr = cache_ptr + embed_dim;
+  const cache_t* cos_ptr = cache_ptr;
+  const cache_t* sin_ptr = cache_ptr + embed_dim;
 
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
@@ -58,7 +58,7 @@ inline __device__ void apply_rotary_embedding(
     const int64_t token_head =
         token_idx * query_stride + head_idx * head_stride + rope_dim_offset;
     const int rot_offset = i % embed_dim;
-    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
+    apply_token_rotary_embedding<scalar_t, cache_t, IS_NEOX>(
         query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim, inverse);
   }
 
@@ -69,13 +69,13 @@ inline __device__ void apply_rotary_embedding(
       const int64_t token_head =
           token_idx * key_stride + head_idx * head_stride + rope_dim_offset;
       const int rot_offset = i % embed_dim;
-      apply_token_rotary_embedding<scalar_t, IS_NEOX>(
+      apply_token_rotary_embedding<scalar_t, cache_t, IS_NEOX>(
           key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim, inverse);
     }
   }
 }
 
-template <typename scalar_t, bool IS_NEOX>
+template <typename scalar_t, typename cache_t, bool IS_NEOX>
 __global__ void rotary_embedding_kernel(
     const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                             // [num_tokens]
@@ -86,15 +86,15 @@ __global__ void rotary_embedding_kernel(
                                  // [batch_size, seq_len, num_kv_heads,
                                  // head_size] or [num_tokens, num_kv_heads,
                                  // head_size]
-    const float* __restrict__ cos_sin_cache,  // [max_position, rot_dim] fp32
+    const cache_t* __restrict__ cos_sin_cache,  // [max_position, rot_dim]
     const int rot_dim, const int64_t query_stride, const int64_t key_stride,
     const int64_t head_stride, const int num_heads, const int num_kv_heads,
     const int head_size, const int64_t rope_dim_offset, const bool inverse) {
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
-  const float* cache_ptr = cos_sin_cache + pos * rot_dim;
+  const cache_t* cache_ptr = cos_sin_cache + pos * rot_dim;
 
-  apply_rotary_embedding<scalar_t, IS_NEOX>(
+  apply_rotary_embedding<scalar_t, cache_t, IS_NEOX>(
       query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
       token_idx, query_stride, key_stride, head_stride, rope_dim_offset,
       inverse);
@@ -168,25 +168,31 @@ void rotary_embedding(
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  auto cache_f32 = cos_sin_cache.to(torch::kFloat32);
   APHRODITE_DISPATCH_FLOATING_TYPES(
       query.scalar_type(), "rotary_embedding", [&] {
-        if (is_neox) {
-          aphrodite::rotary_embedding_kernel<scalar_t, true>
-              <<<grid, block, 0, stream>>>(
-                  positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-                  key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-                  cache_f32.data_ptr<float>(), rot_dim, query_stride,
-                  key_stride, head_stride, num_heads, num_kv_heads, head_size,
-                  rope_dim_offset, inverse);
-        } else {
-          aphrodite::rotary_embedding_kernel<scalar_t, false>
-              <<<grid, block, 0, stream>>>(
-                  positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-                  key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-                  cache_f32.data_ptr<float>(), rot_dim, query_stride,
-                  key_stride, head_stride, num_heads, num_kv_heads, head_size,
-                  rope_dim_offset, inverse);
-        }
+        using query_t = scalar_t;
+        APHRODITE_DISPATCH_FLOATING_TYPES(
+            cos_sin_cache.scalar_type(), "rotary_embedding_cache", [&] {
+              using cache_t = scalar_t;
+              if (is_neox) {
+                aphrodite::rotary_embedding_kernel<query_t, cache_t, true>
+                    <<<grid, block, 0, stream>>>(
+                        positions.data_ptr<int64_t>(),
+                        query.data_ptr<query_t>(),
+                        key.has_value() ? key->data_ptr<query_t>() : nullptr,
+                        cos_sin_cache.data_ptr<cache_t>(), rot_dim,
+                        query_stride, key_stride, head_stride, num_heads,
+                        num_kv_heads, head_size, rope_dim_offset, inverse);
+              } else {
+                aphrodite::rotary_embedding_kernel<query_t, cache_t, false>
+                    <<<grid, block, 0, stream>>>(
+                        positions.data_ptr<int64_t>(),
+                        query.data_ptr<query_t>(),
+                        key.has_value() ? key->data_ptr<query_t>() : nullptr,
+                        cos_sin_cache.data_ptr<cache_t>(), rot_dim,
+                        query_stride, key_stride, head_stride, num_heads,
+                        num_kv_heads, head_size, rope_dim_offset, inverse);
+              }
+            });
       });
 }
diff --git a/csrc/topk.cu b/csrc/topk.cu
index 87c1ce5ad7..f5563380af 100644
--- a/csrc/topk.cu
+++ b/csrc/topk.cu
@@ -83,22 +83,100 @@ void launch_persistent_topk(const torch::Tensor& logits,
     size_t smem_size = P::kFixedSmemLarge + chunk_size * sizeof(uint32_t);
     if (smem_size < P::kSmemMedium) smem_size = P::kSmemMedium;
 
+    // Query occupancy for the instantiation that will actually launch;
+    // overestimating it deadlocks the cooperative barrier.
     int occupancy = 1;
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &occupancy, P::persistent_topk_kernel<TopK, 4>, P::kThreadsPerBlock,
-        smem_size);
+    cudaError_t occ_err = cudaSuccess;
+    if (vec_size == 4) {
+      occ_err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &occupancy, P::persistent_topk_kernel<TopK, 4>, P::kThreadsPerBlock,
+          smem_size);
+    } else if (vec_size == 2) {
+      occ_err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &occupancy, P::persistent_topk_kernel<TopK, 2>, P::kThreadsPerBlock,
+          smem_size);
+    } else {
+      occ_err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &occupancy, P::persistent_topk_kernel<TopK, 1>, P::kThreadsPerBlock,
+          smem_size);
+    }
+    TORCH_CHECK(occ_err == cudaSuccess,
+                "persistent_topk occupancy query failed: ",
+                cudaGetErrorString(occ_err));
     if (occupancy < 1) occupancy = 1;
 
-    uint32_t max_resident_ctas = static_cast<uint32_t>(num_sms) * occupancy;
+    // The cooperative spin-wait barrier only runs when at least one row hits
+    // the radix path (seq_len > RADIX_THRESHOLD). Below that, non-CTA-0 CTAs
+    // exit early, so oversubscription can't deadlock and headroom is unneeded.
+    const bool needs_cooperative =
+        static_cast<uint32_t>(max_seq_len) > P::RADIX_THRESHOLD;
+
+    const uint32_t hw_resident_cap =
+        static_cast<uint32_t>(num_sms) * static_cast<uint32_t>(occupancy);
+    uint32_t max_resident_ctas = hw_resident_cap;
+    if (needs_cooperative) {
+      // Reserve one CTA per SM when occupancy allows; fall back to a single
+      // CTA when occupancy == 1 (the most deadlock-prone case — any straggler
+      // kernel that takes the only slot on one SM hangs the barrier). Never
+      // drop below one full group's worth.
+      uint32_t headroom = (occupancy > 1) ? static_cast<uint32_t>(num_sms) : 1u;
+      if (max_resident_ctas >= headroom + ctas_per_group) {
+        max_resident_ctas -= headroom;
+      }
+    }
     uint32_t num_groups = std::min(max_resident_ctas / ctas_per_group,
                                    static_cast<uint32_t>(num_rows));
     if (num_groups == 0) num_groups = 1;
     uint32_t total_ctas = num_groups * ctas_per_group;
 
+    // If the cooperative launch wouldn't fit, fall back to FilteredTopK
+    // instead of deadlocking. Only relevant when needs_cooperative.
+    if (needs_cooperative && total_ctas > hw_resident_cap) {
+      TORCH_CHECK(max_smem_per_block >= 128 * 1024,
+                  "persistent_topk would oversubscribe and the FilteredTopK "
+                  "fallback requires >=128KB smem per block (have ",
+                  max_smem_per_block, "). total_ctas=", total_ctas,
+                  " > num_sms*occupancy=", hw_resident_cap, " (TopK=", TopK,
+                  ", vec_size=", vec_size, ", ctas_per_group=", ctas_per_group,
+                  ", smem=", smem_size, ").");
+      cudaError_t status =
+          aphrodite::FilteredTopKRaggedTransform<float, int32_t, TopK>(
+              logits.data_ptr<float>(), output.data_ptr<int32_t>(),
+              lengths.data_ptr<int32_t>(), static_cast<uint32_t>(num_rows),
+              static_cast<uint32_t>(TopK), static_cast<uint32_t>(stride),
+              stream);
+      TORCH_CHECK(status == cudaSuccess,
+                  "FilteredTopK fallback failed: ", cudaGetErrorString(status));
+      return;
+    }
+
     size_t state_bytes = num_groups * sizeof(P::RadixRowState);
     TORCH_CHECK(workspace.size(0) >= static_cast<int64_t>(state_bytes),
                 "workspace too small, need ", state_bytes, " bytes");
 
+    // Zero the per-group RadixRowState region before launch — only when the
+    // radix path will actually run (max_seq_len > RADIX_THRESHOLD). The
+    // RadixRowState fields (arrival_counter, histograms) are only touched by
+    // radix_topk; the decode/medium paths inside the persistent kernel
+    // operate purely in shared memory and never read these globals, so a
+    // stale workspace is harmless for them.
+    //
+    // Why we need the memset (when needs_cooperative is true):
+    //   1. arrival_counter accumulates within a launch and is never reset,
+    //      so a prior call leaves it at a large positive value. Without this
+    //      reset, the very first wait_ge in the next call sees counter >>
+    //      target and returns instantly, breaking the barrier.
+    //   2. The previous in-kernel init only ran in CTA-0 with intra-CTA
+    //      __syncthreads(), so it had no happens-before edge to CTA-1+'s
+    //      first red_release. cudaMemsetAsync is stream-ordered: the zero
+    //      is globally visible before any CTA runs.
+    if (needs_cooperative) {
+      cudaError_t mz_err = cudaMemsetAsync(workspace.data_ptr<uint8_t>(), 0,
+                                           state_bytes, stream);
+      TORCH_CHECK(mz_err == cudaSuccess,
+                  "row_states memset failed: ", cudaGetErrorString(mz_err));
+    }
+
     P::PersistentTopKParams params;
     params.input = logits.data_ptr<float>();
     params.output = output.data_ptr<int32_t>();
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index d2d895011b..950487338a 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -105,8 +105,8 @@ def ReadTargets(log, show_all):
     """Reads all targets from .ninja_log file |log_file|, sorted by duration.
 
     The result is a list of Target objects."""
-    header = log.readline()
-    assert header == "# ninja log v5\n", "unrecognized ninja log version {!r}".format(header)
+    # header = log.readline()
+    # assert header == "# ninja log v5\n", "unrecognized ninja log version {!r}".format(header)
     targets_dict = {}
     last_end_seen = 0.0
     for line in log: