diff --git a/csrc/moe/fp32_router_gemm.cu b/csrc/moe/fp32_router_gemm.cu
index f566460c2a3c..7f0f9be9ac06 100644
--- a/csrc/moe/fp32_router_gemm.cu
+++ b/csrc/moe/fp32_router_gemm.cu
@@ -78,7 +78,7 @@ __device__ __forceinline__ void load_activation<__nv_bfloat16, 8>(
 // Weight is always fp32; output is always fp32.
 // VPT = 16 / sizeof(InputT):  4 for fp32, 8 for bf16
 template <typename InputT, int kBlockSize, int kNumTokens, int kNumExperts,
-          int kHiddenDim>
+          int kHiddenDim, bool ENABLE_PDL>
 __global__ __launch_bounds__(128, 1) void fp32_router_gemm_kernel(
     float* out, InputT const* mat_a, float const* mat_b) {
   constexpr int VPT = 16 / sizeof(InputT);
@@ -103,9 +103,11 @@ __global__ __launch_bounds__(128, 1) void fp32_router_gemm_kernel(
     k_bases[ki] = ki * k_elems_per_k_iteration + tid * VPT;
   }
 
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-  asm volatile("griddepcontrol.launch_dependents;");
-  asm volatile("griddepcontrol.wait;");
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \
+    defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  if constexpr (ENABLE_PDL) {
+    asm volatile("griddepcontrol.wait;");
+  }
 #endif
 
   for (int ki = 0; ki < k_iterations; ki++) {
@@ -149,6 +151,14 @@ __global__ __launch_bounds__(128, 1) void fp32_router_gemm_kernel(
       out[m * kNumExperts + n_idx] = final_sum;
     }
   }
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \
+    defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  if constexpr (ENABLE_PDL) {
+    __syncthreads();
+    asm volatile("griddepcontrol.launch_dependents;");
+  }
+#endif
 }
 
 // ---------------------------------------------------------------------------
@@ -159,20 +169,29 @@ template <typename InputT, int kNumTokens, int kNumExperts, int kHiddenDim>
 void invokeFp32RouterGemm(float* output, InputT const* mat_a,
                           float const* mat_b, cudaStream_t stream) {
   constexpr int kBlockSize = 128;
-  cudaLaunchConfig_t config;
-  config.gridDim = kNumExperts;
-  config.blockDim = kBlockSize;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = 1;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     fp32_router_gemm_kernel<InputT, kBlockSize, kNumTokens,
-                                             kNumExperts, kHiddenDim>,
-                     output, mat_a, mat_b);
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
+  if (getEnvEnablePDL()) {
+    cudaLaunchConfig_t config;
+    config.gridDim = kNumExperts;
+    config.blockDim = kBlockSize;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = 1;
+    config.numAttrs = 1;
+    config.attrs = attrs;
+    cudaLaunchKernelEx(&config,
+                       fp32_router_gemm_kernel<InputT, kBlockSize, kNumTokens,
+                                               kNumExperts, kHiddenDim, true>,
+                       output, mat_a, mat_b);
+    return;
+  }
+#endif
+
+  fp32_router_gemm_kernel<InputT, kBlockSize, kNumTokens, kNumExperts,
+                          kHiddenDim, false>
+      <<<kNumExperts, kBlockSize, 0, stream>>>(output, mat_a, mat_b);
 }
 
 // ---------------------------------------------------------------------------
diff --git a/csrc/moe/fp32_router_gemm_entry.cu b/csrc/moe/fp32_router_gemm_entry.cu
index 8c6a7a5ad157..2ef85082eda5 100644
--- a/csrc/moe/fp32_router_gemm_entry.cu
+++ b/csrc/moe/fp32_router_gemm_entry.cu
@@ -3,6 +3,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include <cuda_bf16.h>
 #include <cuda_runtime.h>
@@ -54,11 +55,24 @@ void fp32_router_gemm(at::Tensor& output,       // [num_tokens, num_experts]
                       const at::Tensor& mat_b   // [num_experts, hidden_dim]
 ) {
   TORCH_CHECK(output.dim() == 2 && mat_a.dim() == 2 && mat_b.dim() == 2);
+  TORCH_CHECK(output.is_cuda() && mat_a.is_cuda() && mat_b.is_cuda(),
+              "fp32_router_gemm: all tensors must be CUDA tensors");
+  TORCH_CHECK(output.get_device() == mat_a.get_device() &&
+                  output.get_device() == mat_b.get_device(),
+              "fp32_router_gemm: all tensors must be on the same CUDA device");
+  TORCH_CHECK(output.is_contiguous() && mat_a.is_contiguous() &&
+                  mat_b.is_contiguous(),
+              "fp32_router_gemm: all tensors must be contiguous");
 
   const int num_tokens = mat_a.size(0);
   const int num_experts = mat_b.size(0);
   const int hidden_dim = mat_a.size(1);
 
+  TORCH_CHECK(output.size(0) == num_tokens && output.size(1) == num_experts,
+              "fp32_router_gemm: output must have shape [num_tokens, "
+              "num_experts], got [",
+              output.size(0), ", ", output.size(1), "], expected [",
+              num_tokens, ", ", num_experts, "]");
   TORCH_CHECK(
       mat_a.size(1) == mat_b.size(1),
       "fp32_router_gemm: mat_a and mat_b must have the same hidden_dim");
@@ -68,8 +82,8 @@ void fp32_router_gemm(at::Tensor& output,       // [num_tokens, num_experts]
   TORCH_CHECK(num_experts == FP32_NUM_EXPERTS,
               "fp32_router_gemm: expected num_experts=", FP32_NUM_EXPERTS,
               ", got ", num_experts);
-  TORCH_CHECK(num_tokens >= 1 && num_tokens <= FP32_MAX_TOKENS,
-              "fp32_router_gemm: num_tokens must be in [1, ", FP32_MAX_TOKENS,
+  TORCH_CHECK(num_tokens <= FP32_MAX_TOKENS,
+              "fp32_router_gemm: num_tokens must be in [0, ", FP32_MAX_TOKENS,
               "], got ", num_tokens);
   TORCH_CHECK(mat_a.dtype() == at::kFloat || mat_a.dtype() == at::kBFloat16,
               "fp32_router_gemm: mat_a must be float32 or bfloat16");
@@ -78,6 +92,11 @@ void fp32_router_gemm(at::Tensor& output,       // [num_tokens, num_experts]
   TORCH_CHECK(output.dtype() == at::kFloat,
               "fp32_router_gemm: output must be float32");
 
+  if (num_tokens == 0) {
+    return;
+  }
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(mat_a));
   const int sm = getSMVersion();
   TORCH_CHECK(sm >= 90, "fp32_router_gemm: requires SM90+, got SM", sm);
 
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 869c15041058..9b058bc8e4cb 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -62,6 +62,13 @@ __device__ __forceinline__ float toFloat(T value) {
     }
 }
 
+#ifndef USE_ROCM
+inline bool supportsPdlOnCurrentDevice() {
+    const auto* props = at::cuda::getCurrentDeviceProperties();
+    return props != nullptr && props->major >= 9;
+}
+#endif
+
 // Scoring function enums
 enum ScoringFunc {
   SCORING_SOFTMAX = 0, // apply softmax
@@ -315,9 +322,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
     const int thread_row = warp_base_row + thread_row_in_warp;
 
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if !defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \
+    defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
     if constexpr (ENABLE_PDL) {
-        asm volatile("griddepcontrol.launch_dependents;");
         asm volatile("griddepcontrol.wait;");
     }
 #endif
@@ -569,6 +576,13 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
         }
     }
 
+#if !defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) && \
+    defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if constexpr (ENABLE_PDL) {
+        asm volatile("griddepcontrol.launch_dependents;");
+    }
+#endif
+
 }
 
 namespace detail
@@ -599,8 +613,8 @@ void topkGatingLauncherHelper(const InputType* input, const bool* finished, floa
     const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
 
     dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
-#ifndef USE_ROCM
-    if (enable_pdl) {
+#if !defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
+    if (enable_pdl && supportsPdlOnCurrentDevice()) {
         cudaLaunchConfig_t config;
         config.gridDim = num_blocks;
         config.blockDim = block_dim;
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index d0aab04f4910..fc38f918ecfc 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2300,6 +2300,17 @@ def gpt_oss_router_gemm(
     return output
 
 
+if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "fp32_router_gemm"):
+
+    @register_fake("_moe_C::fp32_router_gemm")
+    def fp32_router_gemm_fake(
+        output: torch.Tensor,
+        mat_a: torch.Tensor,
+        mat_b: torch.Tensor,
+    ) -> None:
+        return
+
+
 def topk_softmax(
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
diff --git a/vllm/envs.py b/vllm/envs.py
index 2944bb111d24..2bbbb3232111 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -75,6 +75,7 @@
     VLLM_MAIN_CUDA_VERSION: str = "12.9"
     VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
     VLLM_BATCH_INVARIANT: bool = False
+    TRTLLM_ENABLE_PDL: bool = False
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -500,6 +501,9 @@ def _get_or_set_default() -> str:
     # Enable batch-invariant mode: deterministic results regardless of
     # batch composition. Requires NVIDIA GPU with compute capability >= 9.0.
     "VLLM_BATCH_INVARIANT": lambda: bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0"))),
+    # Enable Programmatic Dependent Launch for supported NVIDIA MoE router
+    # kernels. Requires CUDA >= 12.0 and compute capability >= 9.0.
+    "TRTLLM_ENABLE_PDL": lambda: bool(int(os.getenv("TRTLLM_ENABLE_PDL", "0"))),
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index a95481a7e6a0..4a1f8cfff724 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -233,6 +233,8 @@ class FusedMoE(CustomOp):
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.
         router_logits_dtype: Data type for router logits buffers.
+        enable_router_pdl: Whether fused top-k routing kernels should join a
+                           Programmatic Dependent Launch chain.
     """
 
     # --8<-- [end:fused_moe]
@@ -272,6 +274,7 @@ def __init__(
         gate: torch.nn.Module | None = None,
         shared_experts: torch.nn.Module | None = None,
         routed_input_transform: torch.nn.Module | None = None,
+        enable_router_pdl: bool = False,
     ):
         super().__init__()
 
@@ -462,6 +465,7 @@ def __init__(
             # TODO(bnell): once we can construct the MK at init time, we
             # can make this a value.
             indices_type_getter=lambda: self.quant_method.topk_indices_dtype,
+            enable_pdl=enable_router_pdl,
         )
         self.routing_method_type: RoutingMethodType = self.router.routing_method_type
 
diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
index 2d9848cc73d6..ccfc0f40e08e 100644
--- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
@@ -45,7 +45,7 @@ def vllm_topk_sigmoid(
     gating_output: torch.Tensor,
     renormalize: bool = False,
     e_score_correction_bias: torch.Tensor | None = None,
-    enable_pdl: bool = True,  # FIXME
+    enable_pdl: bool = False,
 ) -> tuple[torch.Tensor, ...]:
     ops.topk_sigmoid(
         topk_weights,
@@ -81,6 +81,7 @@ def fused_topk_bias(
     renormalize: bool,
     scoring_func: str = "softmax",
     indices_type: torch.dtype | None = None,
+    enable_pdl: bool = False,
 ):
     if not rocm_aiter_ops.is_fused_moe_enabled():
         assert hidden_states.size(0) == gating_output.size(0), (
@@ -110,6 +111,7 @@ def fused_topk_bias(
                 gating_output,
                 renormalize,
                 e_score_correction_bias,
+                enable_pdl,
             )
             return topk_weights, topk_ids
         elif scoring_func == "sigmoid":
@@ -120,6 +122,7 @@ def fused_topk_bias(
                 gating_output,
                 renormalize,
                 e_score_correction_bias,
+                enable_pdl,
             )
             return topk_weights, topk_ids
         else:
@@ -186,6 +189,7 @@ def __init__(
         routed_scaling_factor: float = 1.0,
         enable_eplb: bool = False,
         indices_type_getter: Callable[[], torch.dtype | None] | None = None,
+        enable_pdl: bool = False,
     ):
         super().__init__(
             top_k=top_k,
@@ -198,6 +202,7 @@ def __init__(
         self.renormalize = renormalize
         self.scoring_func = scoring_func
         self.routed_scaling_factor = routed_scaling_factor
+        self.enable_pdl = enable_pdl
 
     @property
     def routing_method_type(self) -> RoutingMethodType:
@@ -224,6 +229,7 @@ def _compute_routing(
             renormalize=self.renormalize,
             scoring_func=self.scoring_func,
             indices_type=indices_type,
+            enable_pdl=self.enable_pdl,
         )
 
         if self.routed_scaling_factor != 1.0:
diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py
index 8385bdf1c51d..03000f22a6fb 100644
--- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py
@@ -77,6 +77,7 @@ def fused_topk(
     renormalize: bool,
     indices_type: torch.dtype | None = None,
     scoring_func: str = "softmax",
+    enable_pdl: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch"
 
@@ -96,20 +97,30 @@ def fused_topk(
     )
 
     if scoring_func == "softmax":
-        topk_func = dispatch_topk_softmax_func(
-            use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled()
-        )
+        use_rocm_aiter = rocm_aiter_ops.is_fused_moe_enabled()
+        topk_func = dispatch_topk_softmax_func(use_rocm_aiter=use_rocm_aiter)
+        pdl_kwargs = {} if use_rocm_aiter else {"enable_pdl": enable_pdl}
         topk_weights, topk_ids = topk_func(
-            topk_weights, topk_ids, token_expert_indices, gating_output, renormalize
+            topk_weights,
+            topk_ids,
+            token_expert_indices,
+            gating_output,
+            renormalize,
+            **pdl_kwargs,
         )
 
         return topk_weights, topk_ids, token_expert_indices
     elif scoring_func == "sigmoid":
-        topk_func = dispatch_topk_sigmoid_func(
-            use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled()
-        )
+        use_rocm_aiter = rocm_aiter_ops.is_fused_moe_enabled()
+        topk_func = dispatch_topk_sigmoid_func(use_rocm_aiter=use_rocm_aiter)
+        pdl_kwargs = {} if use_rocm_aiter else {"enable_pdl": enable_pdl}
         topk_weights, topk_ids = topk_func(
-            topk_weights, topk_ids, token_expert_indices, gating_output, renormalize
+            topk_weights,
+            topk_ids,
+            token_expert_indices,
+            gating_output,
+            renormalize,
+            **pdl_kwargs,
         )
 
         return topk_weights, topk_ids, token_expert_indices
@@ -129,6 +140,7 @@ def __init__(
         renormalize: bool = True,
         enable_eplb: bool = False,
         indices_type_getter: Callable[[], torch.dtype | None] | None = None,
+        enable_pdl: bool = False,
     ):
         super().__init__(
             top_k=top_k,
@@ -139,6 +151,7 @@ def __init__(
         )
         self.renormalize = renormalize
         self.scoring_func = scoring_func
+        self.enable_pdl = enable_pdl
 
     @property
     def routing_method_type(self) -> RoutingMethodType:
@@ -164,6 +177,7 @@ def _compute_routing(
             renormalize=self.renormalize,
             indices_type=indices_type,
             scoring_func=self.scoring_func,
+            enable_pdl=self.enable_pdl,
         )
 
         return topk_weights, topk_ids
diff --git a/vllm/model_executor/layers/fused_moe/router/gate_linear.py b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
index 39d7051db8ec..63274c187f1f 100644
--- a/vllm/model_executor/layers/fused_moe/router/gate_linear.py
+++ b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
@@ -15,7 +15,8 @@ class GateLinear(ReplicatedLinear):
     """MoE gate linear layer with multi-tier GEMM dispatch:
 
     1. DSV3 specialized kernel (SM90+, fp32 out, M<=16, H=7168, E=256/384)
-    2. fp32 specialized kernel  (SM90+, fp32 in/out, M<=32, H=3072, E=256)
+    2. fp32 specialized kernel  (SM90+, bf16/fp32 in, fp32 out,
+       M<=32, H=3072, E=256)
     3. gpt-oss specialized kernel (SM90+, bf16, M<=128, H=2880, E=32/128)
     4. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 weight + fp32 out_dtype)
     5. F.linear via ReplicatedLinear (ultimate fallback)
@@ -56,7 +57,7 @@ def __init__(
         )
 
         # If fp32 compute is required and no specialized kernel is available,
-        # store weights in fp32 so Tier 3 computes in fp32 natively.
+        # store weights in fp32 so the fallback linear path computes in fp32.
         if force_fp32_compute and not can_use_specialized_kernels:
             params_dtype = torch.float32
 
@@ -136,7 +137,10 @@ def forward(
         # Tier 2: fp32 specialized kernel (H=3072, E=256, M<=32)
         # Dispatch is wrapped in a custom op so that torch.compile/CUDA-graph
         # capture does not freeze the runtime num_tokens branch.
-        if self.allow_fp32_router_gemm:
+        if self.allow_fp32_router_gemm and x.dtype in (
+            torch.float32,
+            torch.bfloat16,
+        ):
             output = torch.ops.vllm.fp32_router_gemm_dispatch(x, self.weight)
             return output, None
 
diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py
index 11027e894bee..5cab6859d05e 100644
--- a/vllm/model_executor/layers/fused_moe/router/router_factory.py
+++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py
@@ -49,6 +49,8 @@ def create_fused_moe_router(
     # eplb parameters
     enable_eplb: bool = False,
     eplb_state: EplbLayerState = EMPTY_EPLB_STATE,
+    # routing kernel parameters
+    enable_pdl: bool = False,
 ) -> FusedMoERouter:
     """
     Factory function to create the appropriate FusedMoERouter subclass based on
@@ -86,6 +88,11 @@ def create_fused_moe_router(
         enable_eplb: Whether EPLB is enabled
         eplb_state: EPLB (Expert Parallelism Load Balancing) state
 
+    Routing kernel arguments:
+        enable_pdl: Whether CUDA fused top-k routing kernels should participate
+            in a Programmatic Dependent Launch chain. This is only used on
+            supported NVIDIA GPUs with CUDA >= 12.0 and SM90+.
+
     Returns:
         An instance of the appropriate FusedMoERouter subclass
     """
@@ -156,6 +163,7 @@ def create_fused_moe_router(
             routed_scaling_factor=routed_scaling_factor,
             enable_eplb=enable_eplb,
             indices_type_getter=indices_type_getter,
+            enable_pdl=enable_pdl,
         )
 
     return FusedTopKRouter(
@@ -166,4 +174,5 @@ def create_fused_moe_router(
         scoring_func=scoring_func,
         enable_eplb=enable_eplb,
         indices_type_getter=indices_type_getter,
+        enable_pdl=enable_pdl,
     )
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index 4da1c340e636..3f696ac7cf80 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -30,6 +30,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 
+import vllm.envs as envs
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
@@ -57,6 +58,7 @@
     default_weight_loader,
     maybe_remap_kv_scale_name,
 )
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -70,6 +72,17 @@
 )
 
 
+def _enable_router_pdl() -> bool:
+    is_hopper_or_blackwell = current_platform.is_device_capability(
+        (9, 0)
+    ) or current_platform.is_device_capability_family(100)
+    return (
+        current_platform.is_cuda()
+        and is_hopper_or_blackwell
+        and envs.TRTLLM_ENABLE_PDL
+    )
+
+
 class MiniMaxM2MoE(nn.Module):
     def __init__(
         self,
@@ -108,6 +121,7 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
             router_logits_dtype=torch.float32,
+            enable_router_pdl=_enable_router_pdl(),
         )
 
         self.gate = GateLinear(