From f90b6e36c9b65aef7ebf08ad295dc1a63ca36456 Mon Sep 17 00:00:00 2001 From: austin1997 <18709560+austin1997@users.noreply.github.com> Date: Wed, 22 Apr 2026 06:19:07 +0000 Subject: [PATCH 1/3] [ROCm] Route BF16 softmax through matrix kernel (MIOpen NOT_IMPLEMENTED) MIOpen (as of ROCm 7.x) returns MIOPEN_STATUS_NOT_IMPLEMENTED for miopenSoftmaxForward_V2 with miopenBFloat16, so the gpudnn softmax path cannot be used for BF16 on HIP. When the input dim exceeds the warp softmax cap, route BF16 through the existing matrix softmax kernel instead of letting the call fall into the MIOpen branch. Also gate the CUDNN_VERSION < 8100 BF16 fallback specialization on !defined(PADDLE_WITH_HIP) -- that branch dispatched into MIOpen too and would trip the same NOT_IMPLEMENTED failure on ROCm. --- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 4752513c483f5..c0c26949d4764 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -1325,7 +1325,7 @@ void LaunchKeMatrixSoftmaxForwardKernel(const GPUContext& dev_ctx, <<>>(out, input, dim_size); } -#if CUDNN_VERSION < 8100 +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION < 8100 template <> inline void LaunchSoftmaxForwardCudnnKernel( const GPUContext& dev_ctx, @@ -2811,7 +2811,16 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx, dim_log2); } } else { - if (dim >= MATRIX_SOFTMAX_THRESHOLD) { + bool use_matrix_kernel = dim >= MATRIX_SOFTMAX_THRESHOLD; +#ifdef PADDLE_WITH_HIP + // MIOpen (as of ROCm 7.x) returns MIOPEN_STATUS_NOT_IMPLEMENTED for + // miopenSoftmaxForward_V2 with miopenBFloat16. Route BF16 through the + // matrix softmax kernel for any dim that exceeds the warp-softmax cap. + if (std::is_same::value) { + use_matrix_kernel = true; + } +#endif + if (use_matrix_kernel) { LaunchKeMatrixSoftmaxForwardKernel( dev_ctx, out_data, x.data(), N, dim); } else { From fb70d4fe8b7c15c3b338082491d053325061eba7 Mon Sep 17 00:00:00 2001 From: austin1997 <18709560+austin1997@users.noreply.github.com> Date: Wed, 22 Apr 2026 06:20:21 +0000 Subject: [PATCH 2/3] [ROCm] Skip cuDNN-only conv2d fusion passes on HIP conv2d_add_fuse_pass and conv2d_add_act_fuse_pass rewrite conv2d+add[+act] into the fused_conv2d_add_act op, which has only a cuDNN GPUDNN kernel. On ROCm the rewrite succeeds but kernel dispatch later fails because no HIP kernel is registered, so PaddleX currently works around this by calling config.delete_pass("conv2d_add_act_fuse_pass") and config.delete_pass("conv2d_add_fuse_pass") under paddle.is_compiled_with_rocm() in paddlex/inference/models/runners/paddle_static/runner.py. Gate both the pass registration (REGISTER_IR_PASS / USE_PIR_PASS) and the pass-builder inclusion on PADDLE_WITH_CUDA so the rewrite never runs on HIP builds, making the PaddleX delete_pass calls unnecessary. --- paddle/fluid/inference/api/paddle_pass_builder.cc | 4 ++++ paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc | 6 ++++++ paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc | 6 ++++++ paddle/fluid/pir/transforms/passes.h | 2 ++ 4 files changed, 18 insertions(+) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 2cfab1eaf1cf3..ad7f59b11f98c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -635,8 +635,12 @@ const std::vector kPirGpuPasses{ // Operator fusion pass "silu_fuse_pass", "conv2d_bn_fuse_pass", +#ifdef PADDLE_WITH_CUDA + // conv2d_add(_act)_fuse_pass lower to the fused_conv2d_add_act op, which + // only has a cuDNN GPUDNN kernel. Skip on ROCm/HIP. "conv2d_add_act_fuse_pass", "conv2d_add_fuse_pass", +#endif "embedding_eltwise_layernorm_fuse_pass", "fused_rotary_position_embedding_pass", "fused_flash_attn_pass", diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc index d81ef58c2eecd..9adfd1282ea6e 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc @@ -349,4 +349,10 @@ std::unique_ptr CreateConv2dAddActFusePass() { } // namespace pir +// The fused_conv2d_add_act op this pass produces only has a cuDNN +// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends. +// Skip registration on ROCm/HIP to avoid applying the rewrite and later +// failing at kernel-dispatch time. +#ifdef PADDLE_WITH_CUDA REGISTER_IR_PASS(conv2d_add_act_fuse_pass, Conv2dAddActFusePass); +#endif diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc index 475eb426e1de9..1c5ff1e52bfd9 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc @@ -221,4 +221,10 @@ std::unique_ptr CreateConv2dAddFusePass() { } } // namespace pir +// The fused_conv2d_add_act op this pass produces only has a cuDNN +// (PADDLE_WITH_CUDA) GPUDNN kernel, so the pass is a no-op on other backends. +// Skip registration on ROCm/HIP to avoid applying the rewrite and later +// failing at kernel-dispatch time. +#ifdef PADDLE_WITH_CUDA REGISTER_IR_PASS(conv2d_add_fuse_pass, Conv2dAddFusePass); +#endif diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h index 5b34175ac4f54..31f0e514cfa54 100644 --- a/paddle/fluid/pir/transforms/passes.h +++ b/paddle/fluid/pir/transforms/passes.h @@ -34,8 +34,10 @@ USE_PIR_PASS(matmul_add_act_fuse_pass); USE_PIR_PASS(silu_fuse_pass); USE_PIR_PASS(fc_elementwise_layernorm_fuse_pass); USE_PIR_PASS(conv2d_bn_fuse_pass); +#ifdef PADDLE_WITH_CUDA USE_PIR_PASS(conv2d_add_fuse_pass); USE_PIR_PASS(conv2d_add_act_fuse_pass); +#endif USE_PIR_PASS(embedding_eltwise_layernorm_fuse_pass); USE_PIR_PASS(add_norm_fuse_pass); USE_PIR_PASS(group_norm_silu_fuse_pass); From fa80fca032a2927561815f6d9a06c8a15f6003cc Mon Sep 17 00:00:00 2001 From: austin1997 <18709560+austin1997@users.noreply.github.com> Date: Wed, 22 Apr 2026 12:55:56 +0000 Subject: [PATCH 3/3] [ROCm] Re-enable BF16 conv kernels on HIP Restore the BF16 registrations for conv2d / conv3d / depthwise conv kernels and the DataType::BFLOAT16 -> miopenBFloat16 mapping originally added by ROCm/Paddle#47 and reverted on paddle_hackthon ahead of RDNA4 enablement. The change is gated at compile time by the existing #ifdef PADDLE_WITH_HIP block. Deployment to archs that lack native BF16 support should be handled via PADDLE_ROCM_OFFLOAD_ARCHS (paddle_hackthon's default list already covers the BF16-capable set: CDNA3/gfx942, CDNA4/gfx950, RDNA3/gfx1100- 1102, RDNA4/gfx1200-1201); if a downstream target needs to strip BF16 from the build, it can narrow the offload-arch list accordingly. No runtime arch queries are introduced. --- paddle/phi/backends/gpu/rocm/miopen_desc.h | 3 +++ paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 15 ++++++++----- paddle/phi/kernels/gpudnn/conv_kernel.cu | 21 ++++++++++++++----- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h index 15276c61ef8dd..b2119d16c4a88 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_desc.h +++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h @@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) { case DataType::FLOAT32: type = miopenFloat; break; + case DataType::BFLOAT16: + type = miopenBFloat16; + break; default: break; } diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index ac390524b19a4..fd0c69da3b1be 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -1443,34 +1443,39 @@ PD_REGISTER_KERNEL(conv2d_grad, ALL_LAYOUT, phi::ConvCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_grad, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv2d_double_grad, GPUDNN, ALL_LAYOUT, phi::ConvCudnnGradGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnDoubleGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, GPU, ALL_LAYOUT, phi::DepthwiseConvDoubleGradGPUDNNKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index fcc1a2fff7029..a6fc23d41e7f4 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -561,18 +561,29 @@ void Conv3DCudnnKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL( - conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} -PD_REGISTER_KERNEL( - conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(depthwise_conv2d, GPUDNN, ALL_LAYOUT, phi::DepthwiseConvCudnnKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0)