From f90b6e36c9b65aef7ebf08ad295dc1a63ca36456 Mon Sep 17 00:00:00 2001
From: austin1997 <18709560+austin1997@users.noreply.github.com>
Date: Wed, 22 Apr 2026 06:19:07 +0000
Subject: [PATCH 1/3] [ROCm] Route BF16 softmax through matrix kernel (MIOpen
 NOT_IMPLEMENTED)

MIOpen (as of ROCm 7.x) returns MIOPEN_STATUS_NOT_IMPLEMENTED for
miopenSoftmaxForward_V2 with miopenBFloat16, so the gpudnn softmax path
cannot be used for BF16 on HIP. When the input dim exceeds the warp
softmax cap, route BF16 through the existing matrix softmax kernel
instead of letting the call fall into the MIOpen branch.

Also gate the CUDNN_VERSION < 8100 BF16 fallback specialization on
!defined(PADDLE_WITH_HIP) -- that branch dispatched into MIOpen too and
would trip the same NOT_IMPLEMENTED failure on ROCm.
---
 paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index 4752513c483f5..c0c26949d4764 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -1325,7 +1325,7 @@ void LaunchKeMatrixSoftmaxForwardKernel(const GPUContext& dev_ctx,
       <<<N, block_dim, 0, dev_ctx.stream()>>>(out, input, dim_size);
 }
 
-#if CUDNN_VERSION < 8100
+#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION < 8100
 template <>
 inline void LaunchSoftmaxForwardCudnnKernel<phi::bfloat16>(
     const GPUContext& dev_ctx,
@@ -2811,7 +2811,16 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
                                                            dim_log2);
       }
     } else {
-      if (dim >= MATRIX_SOFTMAX_THRESHOLD) {
+      bool use_matrix_kernel = dim >= MATRIX_SOFTMAX_THRESHOLD;
+#ifdef PADDLE_WITH_HIP
+      // MIOpen (as of ROCm 7.x) returns MIOPEN_STATUS_NOT_IMPLEMENTED for
+      // miopenSoftmaxForward_V2 with miopenBFloat16. Route BF16 through the
+      // matrix softmax kernel for any dim that exceeds the warp-softmax cap.
+      if (std::is_same<T, phi::bfloat16>::value) {
+        use_matrix_kernel = true;
+      }
+#endif
+      if (use_matrix_kernel) {
         LaunchKeMatrixSoftmaxForwardKernel<T, IndexType, LogMode>(
             dev_ctx, out_data, x.data<T>(), N, dim);
       } else {

From fb70d4fe8b7c15c3b338082491d053325061eba7 Mon Sep 17 00:00:00 2001
From: austin1997 <18709560+austin1997@users.noreply.github.com>
Date: Wed, 22 Apr 2026 06:20:21 +0000
Subject: [PATCH 2/3] [ROCm] Skip cuDNN-only conv2d fusion passes on HIP

conv2d_add_fuse_pass and conv2d_add_act_fuse_pass rewrite conv2d+add[+act]
into the fused_conv2d_add_act op, which has only a cuDNN GPUDNN kernel.
On ROCm the rewrite succeeds but kernel dispatch later fails because no
HIP kernel is registered, so PaddleX currently works around this by
calling config.delete_pass("conv2d_add_act_fuse_pass") and
config.delete_pass("conv2d_add_fuse_pass") under paddle.is_compiled_with_rocm()
in paddlex/inference/models/runners/paddle_static/runner.py.

Gate both the pass registration (REGISTER_IR_PASS / USE_PIR_PASS) and the
pass-builder inclusion on PADDLE_WITH_CUDA so the rewrite never runs on
HIP builds, making the PaddleX delete_pass calls unnecessary.
---
 paddle/fluid/inference/api/paddle_pass_builder.cc           | 4 ++++
 paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc | 6 ++++++
 paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc     | 6 ++++++
 paddle/fluid/pir/transforms/passes.h                        | 2 ++
 4 files changed, 18 insertions(+)

diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 2cfab1eaf1cf3..ad7f59b11f98c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -635,8 +635,12 @@ const std::vector<std::string> kPirGpuPasses{
     // Operator fusion pass
     "silu_fuse_pass",
     "conv2d_bn_fuse_pass",
+#ifdef PADDLE_WITH_CUDA
+    // conv2d_add(_act)_fuse_pass lower to the fused_conv2d_add_act op, which
+    // only has a cuDNN GPUDNN kernel. Skip on ROCm/HIP.
     "conv2d_add_act_fuse_pass",
     "conv2d_add_fuse_pass",
+#endif
     "embedding_eltwise_layernorm_fuse_pass",
     "fused_rotary_position_embedding_pass",
     "fused_flash_attn_pass",
diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
index d81ef58c2eecd..9adfd1282ea6e 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
@@ -349,4 +349,10 @@ std::unique_ptr<Pass> CreateConv2dAddActFusePass() {
 
 }  // namespace pir
 
+// The fused_conv2d_add_act op this pass produces only has a cuDNN
+// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends.
+// Skip registration on ROCm/HIP to avoid applying the rewrite and later
+// failing at kernel-dispatch time.
+#ifdef PADDLE_WITH_CUDA
 REGISTER_IR_PASS(conv2d_add_act_fuse_pass, Conv2dAddActFusePass);
+#endif
diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
index 475eb426e1de9..1c5ff1e52bfd9 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
@@ -221,4 +221,10 @@ std::unique_ptr<Pass> CreateConv2dAddFusePass() {
 }
 }  // namespace pir
 
+// The fused_conv2d_add_act op this pass produces only has a cuDNN
+// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends.
+// Skip registration on ROCm/HIP to avoid applying the rewrite and later
+// failing at kernel-dispatch time.
+#ifdef PADDLE_WITH_CUDA
 REGISTER_IR_PASS(conv2d_add_fuse_pass, Conv2dAddFusePass);
+#endif
diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h
index 5b34175ac4f54..31f0e514cfa54 100644
--- a/paddle/fluid/pir/transforms/passes.h
+++ b/paddle/fluid/pir/transforms/passes.h
@@ -34,8 +34,10 @@ USE_PIR_PASS(matmul_add_act_fuse_pass);
 USE_PIR_PASS(silu_fuse_pass);
 USE_PIR_PASS(fc_elementwise_layernorm_fuse_pass);
 USE_PIR_PASS(conv2d_bn_fuse_pass);
+#ifdef PADDLE_WITH_CUDA
 USE_PIR_PASS(conv2d_add_fuse_pass);
 USE_PIR_PASS(conv2d_add_act_fuse_pass);
+#endif
 USE_PIR_PASS(embedding_eltwise_layernorm_fuse_pass);
 USE_PIR_PASS(add_norm_fuse_pass);
 USE_PIR_PASS(group_norm_silu_fuse_pass);

From fa80fca032a2927561815f6d9a06c8a15f6003cc Mon Sep 17 00:00:00 2001
From: austin1997 <18709560+austin1997@users.noreply.github.com>
Date: Wed, 22 Apr 2026 12:55:56 +0000
Subject: [PATCH 3/3] [ROCm] Re-enable BF16 conv kernels on HIP

Restore the BF16 registrations for conv2d / conv3d / depthwise conv kernels
and the DataType::BFLOAT16 -> miopenBFloat16 mapping originally added by
ROCm/Paddle#47 and reverted on paddle_hackthon ahead of RDNA4 enablement.

The change is gated at compile time by the existing #ifdef PADDLE_WITH_HIP
block. Deployment to archs that lack native BF16 support should be handled
via PADDLE_ROCM_OFFLOAD_ARCHS (paddle_hackthon's default list already
covers the BF16-capable set: CDNA3/gfx942, CDNA4/gfx950, RDNA3/gfx1100-
1102, RDNA4/gfx1200-1201); if a downstream target needs to strip BF16 from
the build, it can narrow the offload-arch list accordingly. No runtime
arch queries are introduced.
---
 paddle/phi/backends/gpu/rocm/miopen_desc.h    |  3 +++
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 15 ++++++++-----
 paddle/phi/kernels/gpudnn/conv_kernel.cu      | 21 ++++++++++++++-----
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h
index 15276c61ef8dd..b2119d16c4a88 100644
--- a/paddle/phi/backends/gpu/rocm/miopen_desc.h
+++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h
@@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) {
     case DataType::FLOAT32:
       type = miopenFloat;
       break;
+    case DataType::BFLOAT16:
+      type = miopenBFloat16;
+      break;
     default:
       break;
   }
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index ac390524b19a4..fd0c69da3b1be 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -1443,34 +1443,39 @@ PD_REGISTER_KERNEL(conv2d_grad,
                    ALL_LAYOUT,
                    phi::ConvCudnnGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(conv3d_grad,
                    GPUDNN,
                    ALL_LAYOUT,
                    phi::Conv3DCudnnGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 PD_REGISTER_KERNEL(conv2d_double_grad,
                    GPUDNN,
                    ALL_LAYOUT,
                    phi::ConvCudnnGradGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(conv3d_double_grad,
                    GPUDNN,
                    ALL_LAYOUT,
                    phi::Conv3DCudnnDoubleGradKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(depthwise_conv2d_double_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::DepthwiseConvDoubleGradGPUDNNKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 #else
 #if CUDNN_VERSION_MIN(8, 1, 0)
 PD_REGISTER_KERNEL(conv2d_grad,
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index fcc1a2fff7029..a6fc23d41e7f4 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -561,18 +561,29 @@ void Conv3DCudnnKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_HIP
-PD_REGISTER_KERNEL(
-    conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {}
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {}
 
-PD_REGISTER_KERNEL(
-    conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {}
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   phi::float16,
+                   phi::bfloat16) {}
 
 PD_REGISTER_KERNEL(depthwise_conv2d,
                    GPUDNN,
                    ALL_LAYOUT,
                    phi::DepthwiseConvCudnnKernel,
                    float,
-                   phi::float16) {}
+                   phi::float16,
+                   phi::bfloat16) {}
 
 #else
 #if CUDNN_VERSION_MIN(8, 1, 0)