diff --git a/csrc/moe/fp32_router_gemm.cu b/csrc/moe/fp32_router_gemm.cu index f566460c2a3c..7f0f9be9ac06 100644 --- a/csrc/moe/fp32_router_gemm.cu +++ b/csrc/moe/fp32_router_gemm.cu @@ -78,7 +78,7 @@ __device__ __forceinline__ void load_activation<__nv_bfloat16, 8>( // Weight is always fp32; output is always fp32. // VPT = 16 / sizeof(InputT): 4 for fp32, 8 for bf16 template + int kHiddenDim, bool ENABLE_PDL> __global__ __launch_bounds__(128, 1) void fp32_router_gemm_kernel( float* out, InputT const* mat_a, float const* mat_b) { constexpr int VPT = 16 / sizeof(InputT); @@ -103,9 +103,11 @@ __global__ __launch_bounds__(128, 1) void fp32_router_gemm_kernel( k_bases[ki] = ki * k_elems_per_k_iteration + tid * VPT; } -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm volatile("griddepcontrol.launch_dependents;"); - asm volatile("griddepcontrol.wait;"); +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \ + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + if constexpr (ENABLE_PDL) { + asm volatile("griddepcontrol.wait;"); + } #endif for (int ki = 0; ki < k_iterations; ki++) { @@ -149,6 +151,14 @@ __global__ __launch_bounds__(128, 1) void fp32_router_gemm_kernel( out[m * kNumExperts + n_idx] = final_sum; } } + +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \ + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + if constexpr (ENABLE_PDL) { + __syncthreads(); + asm volatile("griddepcontrol.launch_dependents;"); + } +#endif } // --------------------------------------------------------------------------- @@ -159,20 +169,29 @@ template void invokeFp32RouterGemm(float* output, InputT const* mat_a, float const* mat_b, cudaStream_t stream) { constexpr int kBlockSize = 128; - cudaLaunchConfig_t config; - config.gridDim = kNumExperts; - config.blockDim = kBlockSize; - config.dynamicSmemBytes = 0; - config.stream = stream; - cudaLaunchAttribute attrs[1]; - attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = 1; - config.numAttrs = 1; - config.attrs = attrs; - cudaLaunchKernelEx(&config, - fp32_router_gemm_kernel, - output, mat_a, mat_b); +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + if (getEnvEnablePDL()) { + cudaLaunchConfig_t config; + config.gridDim = kNumExperts; + config.blockDim = kBlockSize; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = 1; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, + fp32_router_gemm_kernel, + output, mat_a, mat_b); + return; + } +#endif + + fp32_router_gemm_kernel + <<>>(output, mat_a, mat_b); } // --------------------------------------------------------------------------- diff --git a/csrc/moe/fp32_router_gemm_entry.cu b/csrc/moe/fp32_router_gemm_entry.cu index 8c6a7a5ad157..2ef85082eda5 100644 --- a/csrc/moe/fp32_router_gemm_entry.cu +++ b/csrc/moe/fp32_router_gemm_entry.cu @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -54,11 +55,24 @@ void fp32_router_gemm(at::Tensor& output, // [num_tokens, num_experts] const at::Tensor& mat_b // [num_experts, hidden_dim] ) { TORCH_CHECK(output.dim() == 2 && mat_a.dim() == 2 && mat_b.dim() == 2); + TORCH_CHECK(output.is_cuda() && mat_a.is_cuda() && mat_b.is_cuda(), + "fp32_router_gemm: all tensors must be CUDA tensors"); + TORCH_CHECK(output.get_device() == mat_a.get_device() && + output.get_device() == mat_b.get_device(), + "fp32_router_gemm: all tensors must be on the same CUDA device"); + TORCH_CHECK(output.is_contiguous() && mat_a.is_contiguous() && + mat_b.is_contiguous(), + "fp32_router_gemm: all tensors must be contiguous"); const int num_tokens = mat_a.size(0); const int num_experts = mat_b.size(0); const int hidden_dim = mat_a.size(1); + TORCH_CHECK(output.size(0) == num_tokens && output.size(1) == num_experts, + "fp32_router_gemm: output must have shape [num_tokens, " + "num_experts], got [", + output.size(0), ", ", output.size(1), "], expected [", + num_tokens, ", ", num_experts, "]"); TORCH_CHECK( mat_a.size(1) == mat_b.size(1), "fp32_router_gemm: mat_a and mat_b must have the same hidden_dim"); @@ -68,8 +82,8 @@ void fp32_router_gemm(at::Tensor& output, // [num_tokens, num_experts] TORCH_CHECK(num_experts == FP32_NUM_EXPERTS, "fp32_router_gemm: expected num_experts=", FP32_NUM_EXPERTS, ", got ", num_experts); - TORCH_CHECK(num_tokens >= 1 && num_tokens <= FP32_MAX_TOKENS, - "fp32_router_gemm: num_tokens must be in [1, ", FP32_MAX_TOKENS, + TORCH_CHECK(num_tokens <= FP32_MAX_TOKENS, + "fp32_router_gemm: num_tokens must be in [0, ", FP32_MAX_TOKENS, "], got ", num_tokens); TORCH_CHECK(mat_a.dtype() == at::kFloat || mat_a.dtype() == at::kBFloat16, "fp32_router_gemm: mat_a must be float32 or bfloat16"); @@ -78,6 +92,11 @@ void fp32_router_gemm(at::Tensor& output, // [num_tokens, num_experts] TORCH_CHECK(output.dtype() == at::kFloat, "fp32_router_gemm: output must be float32"); + if (num_tokens == 0) { + return; + } + + const at::cuda::OptionalCUDAGuard device_guard(device_of(mat_a)); const int sm = getSMVersion(); TORCH_CHECK(sm >= 90, "fp32_router_gemm: requires SM90+, got SM", sm); diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 869c15041058..9b058bc8e4cb 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -62,6 +62,13 @@ __device__ __forceinline__ float toFloat(T value) { } } +#ifndef USE_ROCM +inline bool supportsPdlOnCurrentDevice() { + const auto* props = at::cuda::getCurrentDeviceProperties(); + return props != nullptr && props->major >= 9; +} +#endif + // Scoring function enums enum ScoringFunc { SCORING_SOFTMAX = 0, // apply softmax @@ -315,9 +322,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ const int thread_row = warp_base_row + thread_row_in_warp; -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) +#if !defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \ + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) if constexpr (ENABLE_PDL) { - asm volatile("griddepcontrol.launch_dependents;"); asm volatile("griddepcontrol.wait;"); } #endif @@ -569,6 +576,13 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ } } +#if !defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \ + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + if constexpr (ENABLE_PDL) { + asm volatile("griddepcontrol.launch_dependents;"); + } +#endif + } namespace detail @@ -599,8 +613,8 @@ void topkGatingLauncherHelper(const InputType* input, const bool* finished, floa const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); -#ifndef USE_ROCM - if (enable_pdl) { +#if !defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + if (enable_pdl && supportsPdlOnCurrentDevice()) { cudaLaunchConfig_t config; config.gridDim = num_blocks; config.blockDim = block_dim; diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index d0aab04f4910..fc38f918ecfc 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2300,6 +2300,17 @@ def gpt_oss_router_gemm( return output +if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "fp32_router_gemm"): + + @register_fake("_moe_C::fp32_router_gemm") + def fp32_router_gemm_fake( + output: torch.Tensor, + mat_a: torch.Tensor, + mat_b: torch.Tensor, + ) -> None: + return + + def topk_softmax( topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/envs.py b/vllm/envs.py index 2944bb111d24..2bbbb3232111 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,7 @@ VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" VLLM_BATCH_INVARIANT: bool = False + TRTLLM_ENABLE_PDL: bool = False MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -500,6 +501,9 @@ def _get_or_set_default() -> str: # Enable batch-invariant mode: deterministic results regardless of # batch composition. Requires NVIDIA GPU with compute capability >= 9.0. "VLLM_BATCH_INVARIANT": lambda: bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0"))), + # Enable Programmatic Dependent Launch for supported NVIDIA MoE router + # kernels. Requires CUDA >= 12.0 and compute capability >= 9.0. + "TRTLLM_ENABLE_PDL": lambda: bool(int(os.getenv("TRTLLM_ENABLE_PDL", "0"))), # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a95481a7e6a0..4a1f8cfff724 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -233,6 +233,8 @@ class FusedMoE(CustomOp): quant_config: Quantization configure. enable_eplb: Whether to enable expert parallelism load balancer. router_logits_dtype: Data type for router logits buffers. + enable_router_pdl: Whether fused top-k routing kernels should join a + Programmatic Dependent Launch chain. """ # --8<-- [end:fused_moe] @@ -272,6 +274,7 @@ def __init__( gate: torch.nn.Module | None = None, shared_experts: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, + enable_router_pdl: bool = False, ): super().__init__() @@ -462,6 +465,7 @@ def __init__( # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: self.quant_method.topk_indices_dtype, + enable_pdl=enable_router_pdl, ) self.routing_method_type: RoutingMethodType = self.router.routing_method_type diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 2d9848cc73d6..ccfc0f40e08e 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -45,7 +45,7 @@ def vllm_topk_sigmoid( gating_output: torch.Tensor, renormalize: bool = False, e_score_correction_bias: torch.Tensor | None = None, - enable_pdl: bool = True, # FIXME + enable_pdl: bool = False, ) -> tuple[torch.Tensor, ...]: ops.topk_sigmoid( topk_weights, @@ -81,6 +81,7 @@ def fused_topk_bias( renormalize: bool, scoring_func: str = "softmax", indices_type: torch.dtype | None = None, + enable_pdl: bool = False, ): if not rocm_aiter_ops.is_fused_moe_enabled(): assert hidden_states.size(0) == gating_output.size(0), ( @@ -110,6 +111,7 @@ def fused_topk_bias( gating_output, renormalize, e_score_correction_bias, + enable_pdl, ) return topk_weights, topk_ids elif scoring_func == "sigmoid": @@ -120,6 +122,7 @@ def fused_topk_bias( gating_output, renormalize, e_score_correction_bias, + enable_pdl, ) return topk_weights, topk_ids else: @@ -186,6 +189,7 @@ def __init__( routed_scaling_factor: float = 1.0, enable_eplb: bool = False, indices_type_getter: Callable[[], torch.dtype | None] | None = None, + enable_pdl: bool = False, ): super().__init__( top_k=top_k, @@ -198,6 +202,7 @@ def __init__( self.renormalize = renormalize self.scoring_func = scoring_func self.routed_scaling_factor = routed_scaling_factor + self.enable_pdl = enable_pdl @property def routing_method_type(self) -> RoutingMethodType: @@ -224,6 +229,7 @@ def _compute_routing( renormalize=self.renormalize, scoring_func=self.scoring_func, indices_type=indices_type, + enable_pdl=self.enable_pdl, ) if self.routed_scaling_factor != 1.0: diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index 8385bdf1c51d..03000f22a6fb 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -77,6 +77,7 @@ def fused_topk( renormalize: bool, indices_type: torch.dtype | None = None, scoring_func: str = "softmax", + enable_pdl: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch" @@ -96,20 +97,30 @@ def fused_topk( ) if scoring_func == "softmax": - topk_func = dispatch_topk_softmax_func( - use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled() - ) + use_rocm_aiter = rocm_aiter_ops.is_fused_moe_enabled() + topk_func = dispatch_topk_softmax_func(use_rocm_aiter=use_rocm_aiter) + pdl_kwargs = {} if use_rocm_aiter else {"enable_pdl": enable_pdl} topk_weights, topk_ids = topk_func( - topk_weights, topk_ids, token_expert_indices, gating_output, renormalize + topk_weights, + topk_ids, + token_expert_indices, + gating_output, + renormalize, + **pdl_kwargs, ) return topk_weights, topk_ids, token_expert_indices elif scoring_func == "sigmoid": - topk_func = dispatch_topk_sigmoid_func( - use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled() - ) + use_rocm_aiter = rocm_aiter_ops.is_fused_moe_enabled() + topk_func = dispatch_topk_sigmoid_func(use_rocm_aiter=use_rocm_aiter) + pdl_kwargs = {} if use_rocm_aiter else {"enable_pdl": enable_pdl} topk_weights, topk_ids = topk_func( - topk_weights, topk_ids, token_expert_indices, gating_output, renormalize + topk_weights, + topk_ids, + token_expert_indices, + gating_output, + renormalize, + **pdl_kwargs, ) return topk_weights, topk_ids, token_expert_indices @@ -129,6 +140,7 @@ def __init__( renormalize: bool = True, enable_eplb: bool = False, indices_type_getter: Callable[[], torch.dtype | None] | None = None, + enable_pdl: bool = False, ): super().__init__( top_k=top_k, @@ -139,6 +151,7 @@ def __init__( ) self.renormalize = renormalize self.scoring_func = scoring_func + self.enable_pdl = enable_pdl @property def routing_method_type(self) -> RoutingMethodType: @@ -164,6 +177,7 @@ def _compute_routing( renormalize=self.renormalize, indices_type=indices_type, scoring_func=self.scoring_func, + enable_pdl=self.enable_pdl, ) return topk_weights, topk_ids diff --git a/vllm/model_executor/layers/fused_moe/router/gate_linear.py b/vllm/model_executor/layers/fused_moe/router/gate_linear.py index 39d7051db8ec..63274c187f1f 100644 --- a/vllm/model_executor/layers/fused_moe/router/gate_linear.py +++ b/vllm/model_executor/layers/fused_moe/router/gate_linear.py @@ -15,7 +15,8 @@ class GateLinear(ReplicatedLinear): """MoE gate linear layer with multi-tier GEMM dispatch: 1. DSV3 specialized kernel (SM90+, fp32 out, M<=16, H=7168, E=256/384) - 2. fp32 specialized kernel (SM90+, fp32 in/out, M<=32, H=3072, E=256) + 2. fp32 specialized kernel (SM90+, bf16/fp32 in, fp32 out, + M<=32, H=3072, E=256) 3. gpt-oss specialized kernel (SM90+, bf16, M<=128, H=2880, E=32/128) 4. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 weight + fp32 out_dtype) 5. F.linear via ReplicatedLinear (ultimate fallback) @@ -56,7 +57,7 @@ def __init__( ) # If fp32 compute is required and no specialized kernel is available, - # store weights in fp32 so Tier 3 computes in fp32 natively. + # store weights in fp32 so the fallback linear path computes in fp32. if force_fp32_compute and not can_use_specialized_kernels: params_dtype = torch.float32 @@ -136,7 +137,10 @@ def forward( # Tier 2: fp32 specialized kernel (H=3072, E=256, M<=32) # Dispatch is wrapped in a custom op so that torch.compile/CUDA-graph # capture does not freeze the runtime num_tokens branch. - if self.allow_fp32_router_gemm: + if self.allow_fp32_router_gemm and x.dtype in ( + torch.float32, + torch.bfloat16, + ): output = torch.ops.vllm.fp32_router_gemm_dispatch(x, self.weight) return output, None diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 11027e894bee..5cab6859d05e 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -49,6 +49,8 @@ def create_fused_moe_router( # eplb parameters enable_eplb: bool = False, eplb_state: EplbLayerState = EMPTY_EPLB_STATE, + # routing kernel parameters + enable_pdl: bool = False, ) -> FusedMoERouter: """ Factory function to create the appropriate FusedMoERouter subclass based on @@ -86,6 +88,11 @@ def create_fused_moe_router( enable_eplb: Whether EPLB is enabled eplb_state: EPLB (Expert Parallelism Load Balancing) state + Routing kernel arguments: + enable_pdl: Whether CUDA fused top-k routing kernels should participate + in a Programmatic Dependent Launch chain. This is only used on + supported NVIDIA GPUs with CUDA >= 12.0 and SM90+. + Returns: An instance of the appropriate FusedMoERouter subclass """ @@ -156,6 +163,7 @@ def create_fused_moe_router( routed_scaling_factor=routed_scaling_factor, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, + enable_pdl=enable_pdl, ) return FusedTopKRouter( @@ -166,4 +174,5 @@ def create_fused_moe_router( scoring_func=scoring_func, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, + enable_pdl=enable_pdl, ) diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 4da1c340e636..3f696ac7cf80 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -30,6 +30,7 @@ from torch import nn from transformers import PretrainedConfig +import vllm.envs as envs from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( @@ -57,6 +58,7 @@ default_weight_loader, maybe_remap_kv_scale_name, ) +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -70,6 +72,17 @@ ) +def _enable_router_pdl() -> bool: + is_hopper_or_blackwell = current_platform.is_device_capability( + (9, 0) + ) or current_platform.is_device_capability_family(100) + return ( + current_platform.is_cuda() + and is_hopper_or_blackwell + and envs.TRTLLM_ENABLE_PDL + ) + + class MiniMaxM2MoE(nn.Module): def __init__( self, @@ -108,6 +121,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.experts", router_logits_dtype=torch.float32, + enable_router_pdl=_enable_router_pdl(), ) self.gate = GateLinear(