From 8f87fdb9b3ee16726a533f178e2fb915d1d95544 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 14 Apr 2026 17:32:16 +0800
Subject: [PATCH 01/19] feat: Add CUDA 13.2 Dockerfile

---
 tools/dockerfile/manylinux/Dockerfile-132 | 71 +++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 tools/dockerfile/manylinux/Dockerfile-132

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
new file mode 100644
index 0000000000000..adc08cdfac096
--- /dev/null
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -0,0 +1,71 @@
+# Image for building Paddle binaries against CUDA 13.2.
+# The CUDA devel base image serves both CPU and GPU build environments.
+# When changing the base image, check the cuDNN runtime version it provides.
+ARG CUDA_VERSION=13.2
+ARG BASE_TARGET=cuda${CUDA_VERSION}
+
+FROM nvcr.io/nvidia/cuda:13.2.0-cudnn-devel-ubuntu24.04 AS base
+LABEL maintainer="PaddlePaddle Authors <paddle-dev@baidu.com>"
+
+
+# Build switches: WITH_GPU / WITH_AVX fall back to ON when no --build-arg is passed.
+ARG WITH_GPU
+ARG WITH_AVX
+ARG PYTHON_VERSION=3.12
+
+ENV WITH_GPU=${WITH_GPU:-ON}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV DEBIAN_FRONTEND=noninteractive
+# The CUDA_VERSION ARG declared before FROM is not visible in this stage, so the toolkit path is spelled out.
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:/usr/local/cuda-13.2/targets/x86_64-linux/lib:$LD_LIBRARY_PATH
+ENV HOME=/root
+# Install build tools, libraries and Python; point `python` at python3 and drop the apt lists in the same layer.
+RUN apt-get update --allow-unauthenticated && \
+    apt-get install -y --no-install-recommends \
+      git \
+      vim \
+      curl \
+      wget \
+      make \
+      libgl1 \
+      libglib2.0-0 \
+      libssl-dev \
+      autoconf \
+      automake \
+      libtool \
+      libmlx5-1 \
+      libibverbs-dev \
+      python${PYTHON_VERSION} \
+      python${PYTHON_VERSION}-dev \
+      python3-pip && \
+    ln -sf /usr/bin/python3 /usr/bin/python && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /home
+RUN wget -q https://cmake.org/files/v3.31/cmake-3.31.0-linux-x86_64.tar.gz && \
+    tar -zxf cmake-3.31.0-linux-x86_64.tar.gz && \
+    rm cmake-3.31.0-linux-x86_64.tar.gz && \
+    rm -rf /home/cmake-3.31.0-linux-x86_64/doc /home/cmake-3.31.0-linux-x86_64/man
+# Prepend the unpacked CMake 3.31.0 binaries to PATH.
+ENV PATH=/home/cmake-3.31.0-linux-x86_64/bin:$PATH
+
+
+ARG TMP_DIR=patchelf_tmp
+RUN rm -rf "$TMP_DIR" && git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" && \
+    cd "$TMP_DIR" && ./bootstrap.sh && \
+    ./configure && make && make install && \
+    cd .. && rm -rf "$TMP_DIR"
+# Build ccache 4.8.2 from source into /usr/local/ccache-4.8.2, link it into /usr/local/bin, then remove the sources.
+RUN wget -q https://paddle-ci.gz.bcebos.com/ccache-4.8.2.tar.gz && \
+    tar xf ccache-4.8.2.tar.gz && mkdir /usr/local/ccache-4.8.2 && cd ccache-4.8.2 && \
+    mkdir build && cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.8.2 .. && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-4.8.2/bin/ccache /usr/local/bin/ccache && \
+    cd ../../ && rm -rf ccache-4.8.2.tar.gz && rm -rf ccache-4.8.2
+# Install the packages from both requirements files; --no-cache-dir keeps pip's download cache out of the layer.
+COPY paddle/scripts/compile_requirements.txt /root
+COPY python/requirements.txt /root
+RUN pip install --no-cache-dir --break-system-packages -r /root/requirements.txt && \
+    pip install --no-cache-dir --break-system-packages -r /root/compile_requirements.txt && \
+    rm -rf /root/compile_requirements.txt /root/requirements.txt

From 40c7e79e9eaa5b806040e3420d7f2a14877df9d2 Mon Sep 17 00:00:00 2001
From: gouzil <66515297+gouzil@users.noreply.github.com>
Date: Tue, 14 Apr 2026 19:58:12 +0800
Subject: [PATCH 02/19] Update tools/dockerfile/manylinux/Dockerfile-132

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 tools/dockerfile/manylinux/Dockerfile-132 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index adc08cdfac096..a6c8ee1a79807 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -51,7 +51,7 @@ ENV PATH=/home/cmake-3.31.0-linux-x86_64/bin:$PATH
 
 
 ARG TMP_DIR=patchelf_tmp
-RUN rm -rf "$TMP_DIR" && git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" && \
+RUN rm -rf "$TMP_DIR" && git clone --depth 1 --branch 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" && \
     cd "$TMP_DIR" && ./bootstrap.sh && \
     ./configure && make && make install && \
     cd .. && rm -rf "$TMP_DIR"

From 234835acf11be7db679dd16efb126f99c7d46439 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Sat, 18 Apr 2026 22:57:48 +0800
Subject: [PATCH 03/19] add nccl

---
 tools/dockerfile/manylinux/Dockerfile-132 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index a6c8ee1a79807..c6175ace71fba 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -37,7 +37,9 @@ RUN apt-get update --allow-unauthenticated && \
       libibverbs-dev \
       python${PYTHON_VERSION} \
       python${PYTHON_VERSION}-dev \
-      python3-pip && \
+      python3-pip \
+      libnccl2=2.29.7-1+cuda13.2 \
+      libnccl-dev=2.29.7-1+cuda13.2 && \
     ln -sf /usr/bin/python3 /usr/bin/python && \
     rm -rf /var/lib/apt/lists/*
 

From 4c1032be45ade1fe02f4cce7a192fe579fb19597 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Mon, 20 Apr 2026 00:44:45 +0800
Subject: [PATCH 04/19] [CUDA] adapt Paddle build and kernels for CUDA 13.2

---
 cmake/cuda.cmake                        | 10 +++++++++
 cmake/external/warpctc.cmake            | 16 ++++++++++++-
 cmake/external/warprnnt.cmake           | 16 ++++++++++++-
 cmake/third_party.cmake                 |  7 +++++-
 paddle/phi/core/kernel_registry.h       | 30 +++++++++++++++++++------
 paddle/phi/kernels/CMakeLists.txt       | 13 +++++++++++
 paddle/phi/kernels/gpu/arange_kernel.cu |  9 ++++++--
 paddle/phi/kernels/gpu/range_kernel.cu  |  9 ++++++--
 8 files changed, 96 insertions(+), 14 deletions(-)

diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 28cd08ea5a8dd..7c36df0dfb010 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -11,6 +11,7 @@ if(WITH_NV_JETSON)
   set(paddle_known_gpu_archs10 "53 62 72")
   set(paddle_known_gpu_archs11 "53 62 72 87")
   set(paddle_known_gpu_archs12 "53 62 72 87 90 100")
+  set(paddle_known_gpu_archs13 "87 90 100")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Package")
   add_definitions(-DNEW_RELEASE_ALL)
@@ -18,6 +19,7 @@ elseif(NEW_RELEASE_ALL)
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
   set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Package")
   add_definitions(-DNEW_RELEASE_PYPI)
@@ -25,6 +27,7 @@ elseif(NEW_RELEASE_PYPI)
   set(paddle_known_gpu_archs10 "")
   set(paddle_known_gpu_archs11 "61 70 75 80")
   set(paddle_known_gpu_archs12 "61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 elseif(NEW_RELEASE_JIT)
   message("Using New Release Strategy - JIT Package")
   add_definitions(-DNEW_RELEASE_JIT)
@@ -32,11 +35,13 @@ elseif(NEW_RELEASE_JIT)
   set(paddle_known_gpu_archs10 "50 60 70 75")
   set(paddle_known_gpu_archs11 "50 60 70 75 80")
   set(paddle_known_gpu_archs12 "50 60 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 else()
   set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90 100")
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
   set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 endif()
 
 ######################################################################################
@@ -289,6 +294,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 14.0) # CUDA 13.0+
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs13})
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
 # Fix ARM NEON conflict with CUDA on aarch64 platforms.
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 17ef70b4a071c..810e1d348421e 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -101,6 +101,20 @@ else()
   set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
 
+set(WARPCTC_NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA})
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  string(
+    REGEX
+    REPLACE "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=sm_\\2"
+            "" WARPCTC_NVCC_FLAGS_EXTRA "${WARPCTC_NVCC_FLAGS_EXTRA}")
+  string(
+    REGEX
+    REPLACE
+      "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=compute_\\2"
+      "" WARPCTC_NVCC_FLAGS_EXTRA "${WARPCTC_NVCC_FLAGS_EXTRA}")
+  string(STRIP "${WARPCTC_NVCC_FLAGS_EXTRA}" WARPCTC_NVCC_FLAGS_EXTRA)
+endif()
+
 # For CMake >= 4.0.0, force policy compatibility for third-party warpctc's CMake.
 set(WARPCTC_POLICY_ARGS "")
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
@@ -134,7 +148,7 @@ ExternalProject_Add(
              -DWITH_GPU=${WITH_GPU}
              -DWITH_ROCM=${WITH_ROCM}
              -DWITH_OMP=${USE_OMP}
-             -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA}
+             -DNVCC_FLAGS_EXTRA=${WARPCTC_NVCC_FLAGS_EXTRA}
              -DWITH_TORCH=OFF
              -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
              -DBUILD_SHARED=ON
diff --git a/cmake/external/warprnnt.cmake b/cmake/external/warprnnt.cmake
index ce4b43343a4e9..3f3c69693e4b8 100644
--- a/cmake/external/warprnnt.cmake
+++ b/cmake/external/warprnnt.cmake
@@ -104,6 +104,20 @@ else()
   set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
 
+set(WARPRNNT_NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA})
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  string(
+    REGEX
+    REPLACE "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=sm_\\2"
+            "" WARPRNNT_NVCC_FLAGS_EXTRA "${WARPRNNT_NVCC_FLAGS_EXTRA}")
+  string(
+    REGEX
+    REPLACE
+      "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=compute_\\2"
+      "" WARPRNNT_NVCC_FLAGS_EXTRA "${WARPRNNT_NVCC_FLAGS_EXTRA}")
+  string(STRIP "${WARPRNNT_NVCC_FLAGS_EXTRA}" WARPRNNT_NVCC_FLAGS_EXTRA)
+endif()
+
 # For CMake >= 4.0.0, force policy compatibility for third-party warprnnt's CMake.
 set(WARPRNNT_POLICY_ARGS "")
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
@@ -135,7 +149,7 @@ ExternalProject_Add(
              -DWITH_GPU=${WITH_GPU}
              -DWITH_ROCM=${WITH_ROCM}
              -DWITH_OMP=${USE_OMP}
-             -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA}
+             -DNVCC_FLAGS_EXTRA=${WARPRNNT_NVCC_FLAGS_EXTRA}
              -DBUILD_SHARED=ON
              -DBUILD_TESTS=OFF
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 755739e533695..526d17cc204d6 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -604,7 +604,12 @@ if(WITH_GPU
    AND NOT WITH_ARM
    AND NOT WIN32
    AND NOT APPLE)
-  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
+  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 13.0)
+    message(
+      STATUS
+        "flash-attn is disabled for default CUDA 13.x builds because the bundled third_party/flashattn source build is not yet stable with this toolchain."
+    )
+  elseif(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
     foreach(arch ${NVCC_ARCH_BIN})
       if(${arch} GREATER_EQUAL 90)
         set(WITH_FLASHATTN_V3 ON)
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index cbb4fbd60c243..8660c3af41fe7 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -197,10 +197,26 @@ struct KernelRegistrar {
   ::phi::KernelArgsParseFunctor<                              \
       decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse
 
+// nvcc 13.x crashes in cudafe++ on the explicit instantiation form
+// `template decltype(fn<T, Ctx>) fn<T, Ctx>;`. Keep macro registration intact
+// by replacing it with a `used` anchor that still forces the specialization
+// to be emitted without hitting the buggy syntax.
+#if defined(__CUDACC__) && !defined(_WIN32) && \
+    defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 13)
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(id, ...)          \
+  static auto* const PD_CONCATENATE(__pd_kernel_instantiation_anchor_, id) \
+      __attribute__((used)) = &__VA_ARGS__;
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(PD_ID, __VA_ARGS__)
+#else
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
+  template decltype(__VA_ARGS__) __VA_ARGS__;
+#endif
+
 // The macro for instantiating function kernel
 #define FUNCTION_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, context) \
-  template decltype(meta_kernel_fn<cpp_dtype, context>)                   \
-      meta_kernel_fn<cpp_dtype, context>;
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(                            \
+      meta_kernel_fn<cpp_dtype, context>)
 
 /** PD_REGISTER_KERNEL
  *
@@ -1368,7 +1384,7 @@ struct KernelRegistrar {
 #if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
 #define PD_REGISTER_KERNEL_FOR_ALL_DTYPE(                                \
     kernel_name, backend, layout, kernel_fn)                             \
-  template decltype(kernel_fn) kernel_fn;                                \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                 \
   static void                                                            \
       __FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
           const ::phi::KernelKey& kernel_key UNUSED,                     \
@@ -1391,7 +1407,7 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE(                                    \
     reg_type, kernel_name, backend, layout, kernel_fn)                         \
-  template decltype(kernel_fn) kernel_fn;                                      \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                       \
   static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(    \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);              \
   static const ::phi::KernelRegistrar                                          \
@@ -1440,8 +1456,8 @@ struct KernelRegistrar {
 #if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
 #define PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(                        \
     kernel_name, layout, meta_kernel_fn)                                 \
-  template decltype(meta_kernel_fn<::phi::CustomContext>)                \
-      meta_kernel_fn<::phi::CustomContext>;                              \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(                           \
+      meta_kernel_fn<::phi::CustomContext>)                              \
   static void                                                            \
       __FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
           const ::phi::KernelKey kernel_key UNUSED,                      \
@@ -1535,7 +1551,7 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define ___PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(                           \
     reg_type, kernel_name, backend, layout, kernel_fn, args_def_fn)            \
-  template decltype(kernel_fn) kernel_fn;                                      \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                       \
   static const ::phi::KernelRegistrar                                          \
       __reg_phi_kernel_##kernel_name##_##backend##_##layout(                   \
           reg_type,                                                            \
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 52a805dc37c3b..f338c6ff8e22a 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -45,6 +45,19 @@ file(
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "gpu/*.cu" "gpu/*.cu.cc")
 
+if(WITH_GPU AND (CUDA_VERSION VERSION_GREATER_EQUAL 13.0))
+  list(
+    REMOVE_ITEM
+    kernel_cu
+    "fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu"
+    "legacy/gpu/fp8_gemm_blockwise_kernel.cu"
+    "legacy/gpu/fp8_quant_blockwise_kernel.cu")
+  message(
+    STATUS
+      "Skipping phi FP8 CUDA kernels for CUDA ${CUDA_VERSION} because nvcc 13.x triggers an internal compiler error while registering float8 kernels."
+  )
+endif()
+
 if(APPLE OR WIN32)
   list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
   list(REMOVE_ITEM kernel_cu "sparse/gpu/conv_kernel_igemm.cu")
diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
index ee7873e0110f4..69c412a329a6b 100644
--- a/paddle/phi/kernels/gpu/arange_kernel.cu
+++ b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -96,8 +96,13 @@ void ArangeKernel(const Context& dev_ctx,
       dev_ctx, start_value, end_value, step_value, out);
 }
 
-template decltype(ArangeNullaryKernel<int64_t, GPUContext>) ArangeNullaryKernel;
-template decltype(ArangeNullaryKernel<int, GPUContext>) ArangeNullaryKernel;
+template void ArangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
+                                                       const int64_t,
+                                                       const int64_t,
+                                                       const int64_t,
+                                                       DenseTensor*);
+template void ArangeNullaryKernel<int, GPUContext>(
+    const GPUContext&, const int, const int, const int, DenseTensor*);
 }  // namespace phi
 
 PD_REGISTER_KERNEL(arange_tensor,
diff --git a/paddle/phi/kernels/gpu/range_kernel.cu b/paddle/phi/kernels/gpu/range_kernel.cu
index f90f54b8cab08..84f34d0e6f319 100644
--- a/paddle/phi/kernels/gpu/range_kernel.cu
+++ b/paddle/phi/kernels/gpu/range_kernel.cu
@@ -129,8 +129,13 @@ void RangeKernel(const Context& dev_ctx,
       dev_ctx, start_value, end_value, step_value, out);
 }
 
-template decltype(RangeNullaryKernel<int64_t, GPUContext>) RangeNullaryKernel;
-template decltype(RangeNullaryKernel<int, GPUContext>) RangeNullaryKernel;
+template void RangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
+                                                      const int64_t,
+                                                      const int64_t,
+                                                      const int64_t,
+                                                      DenseTensor*);
+template void RangeNullaryKernel<int, GPUContext>(
+    const GPUContext&, const int, const int, const int, DenseTensor*);
 }  // namespace phi
 
 PD_REGISTER_KERNEL(range_tensor,

From f57a196ba650e5a2c1e9bac976545430447c3482 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Mon, 20 Apr 2026 13:34:11 +0800
Subject: [PATCH 05/19] rollback warpctc and warprnnt

---
 cmake/external/warpctc.cmake  | 16 +---------------
 cmake/external/warprnnt.cmake | 16 +---------------
 2 files changed, 2 insertions(+), 30 deletions(-)

diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 810e1d348421e..17ef70b4a071c 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -101,20 +101,6 @@ else()
   set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
 
-set(WARPCTC_NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA})
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  string(
-    REGEX
-    REPLACE "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=sm_\\2"
-            "" WARPCTC_NVCC_FLAGS_EXTRA "${WARPCTC_NVCC_FLAGS_EXTRA}")
-  string(
-    REGEX
-    REPLACE
-      "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=compute_\\2"
-      "" WARPCTC_NVCC_FLAGS_EXTRA "${WARPCTC_NVCC_FLAGS_EXTRA}")
-  string(STRIP "${WARPCTC_NVCC_FLAGS_EXTRA}" WARPCTC_NVCC_FLAGS_EXTRA)
-endif()
-
 # For CMake >= 4.0.0, force policy compatibility for third-party warpctc's CMake.
 set(WARPCTC_POLICY_ARGS "")
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
@@ -148,7 +134,7 @@ ExternalProject_Add(
              -DWITH_GPU=${WITH_GPU}
              -DWITH_ROCM=${WITH_ROCM}
              -DWITH_OMP=${USE_OMP}
-             -DNVCC_FLAGS_EXTRA=${WARPCTC_NVCC_FLAGS_EXTRA}
+             -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA}
              -DWITH_TORCH=OFF
              -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
              -DBUILD_SHARED=ON
diff --git a/cmake/external/warprnnt.cmake b/cmake/external/warprnnt.cmake
index 3f3c69693e4b8..ce4b43343a4e9 100644
--- a/cmake/external/warprnnt.cmake
+++ b/cmake/external/warprnnt.cmake
@@ -104,20 +104,6 @@ else()
   set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
 
-set(WARPRNNT_NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA})
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  string(
-    REGEX
-    REPLACE "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=sm_\\2"
-            "" WARPRNNT_NVCC_FLAGS_EXTRA "${WARPRNNT_NVCC_FLAGS_EXTRA}")
-  string(
-    REGEX
-    REPLACE
-      "(^| )-gencode arch=compute_(50|52|53|60|61|62|70|72),code=compute_\\2"
-      "" WARPRNNT_NVCC_FLAGS_EXTRA "${WARPRNNT_NVCC_FLAGS_EXTRA}")
-  string(STRIP "${WARPRNNT_NVCC_FLAGS_EXTRA}" WARPRNNT_NVCC_FLAGS_EXTRA)
-endif()
-
 # For CMake >= 4.0.0, force policy compatibility for third-party warprnnt's CMake.
 set(WARPRNNT_POLICY_ARGS "")
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
@@ -149,7 +135,7 @@ ExternalProject_Add(
              -DWITH_GPU=${WITH_GPU}
              -DWITH_ROCM=${WITH_ROCM}
              -DWITH_OMP=${USE_OMP}
-             -DNVCC_FLAGS_EXTRA=${WARPRNNT_NVCC_FLAGS_EXTRA}
+             -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA}
              -DBUILD_SHARED=ON
              -DBUILD_TESTS=OFF
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON

From d70ff8976fca508e0ff1356bc35268e89f7ac0a5 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 21 Apr 2026 12:53:01 +0800
Subject: [PATCH 06/19] clean code

---
 paddle/phi/kernels/CMakeLists.txt | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index f338c6ff8e22a..52a805dc37c3b 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -45,19 +45,6 @@ file(
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "gpu/*.cu" "gpu/*.cu.cc")
 
-if(WITH_GPU AND (CUDA_VERSION VERSION_GREATER_EQUAL 13.0))
-  list(
-    REMOVE_ITEM
-    kernel_cu
-    "fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu"
-    "legacy/gpu/fp8_gemm_blockwise_kernel.cu"
-    "legacy/gpu/fp8_quant_blockwise_kernel.cu")
-  message(
-    STATUS
-      "Skipping phi FP8 CUDA kernels for CUDA ${CUDA_VERSION} because nvcc 13.x triggers an internal compiler error while registering float8 kernels."
-  )
-endif()
-
 if(APPLE OR WIN32)
   list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
   list(REMOVE_ITEM kernel_cu "sparse/gpu/conv_kernel_igemm.cu")

From dcda4600e8f1affaccc27672b55a07e6a008d449 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 21 Apr 2026 14:43:19 +0800
Subject: [PATCH 07/19] `setup.py` add cuda 13.2

---
 python/setup.py.in | 70 ++++++++++++++++++++++++++-----------
 setup.py           | 86 +++++++++++++++++++++++++++++++---------------
 2 files changed, 109 insertions(+), 47 deletions(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index b9aaeaf831c55..6cdcd0d9fe234 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -611,6 +611,7 @@ def get_paddle_extra_install_requirements():
     #(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
     paddle_cuda_requires = []
     paddle_tensorrt_requires = []
+    cuda_major_version = None
     if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON':
         if platform.system() == 'Linux':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -721,6 +722,23 @@ def get_paddle_extra_install_requirements():
                 "nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
                 "cuda-python==13.0.3; platform_system == 'Linux'"
             ),
+            "13.2": (
+                "nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
+                "nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
+                "nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
+                "nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
+                "nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
+                "nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
+                "nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
+                "nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
+                "nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
+                "nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
+                "nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
+                "nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
+                "nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
+                "nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
+                "cuda-python==13.2.0; platform_system == 'Linux'"
+            ),
         }
         if '@WITH_CINN@' == 'ON':
             PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
@@ -741,6 +759,9 @@ def get_paddle_extra_install_requirements():
             PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
                     " | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
             )
+            PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
+                    " | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
+            )
         elif platform.system() == 'Windows':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
                 "11.8": (
@@ -811,37 +832,46 @@ def get_paddle_extra_install_requirements():
 
     if '@WITH_PIP_TENSORRT@' == 'ON':
         version_str = get_tensorrt_version()
-        version_default = int(version_str.split(".")[0])
-        if platform.system() =='Linux' or (platform.system()=='Windows' and version_default>=10):
-
+        version_default = int(version_str.split(".")[0]) if version_str else None
+        if platform.system() == 'Linux' and cuda_major_version == '13.2':
+            if not version_str and platform.machine() == 'aarch64':
+                return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
+            PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
+                "tensorrt-cu13==10.16.1.11",
+            ]
+        elif platform.system() =='Linux' or (platform.system()=='Windows' and version_default is not None and version_default>=10):
             PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
                 "tensorrt==8.5.3.1",
                 "tensorrt==8.6.0",
                 "tensorrt==8.6.1.post1",
                 "tensorrt==10.3.0",
             ]
+        else:
+            return paddle_cuda_requires, []
 
-            if not version_str:
-                return paddle_cuda_requires,[]
+        if not version_str:
+            return paddle_cuda_requires,[]
 
-            version_main = ".".join(version_str.split(".")[:3])
+        version_main = ".".join(version_str.split(".")[:3])
 
-            matched_package = None
-            for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
-                paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
-                paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
+        matched_package = None
+        for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
+            paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
+            paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
 
-                if version_main == paddle_tensorrt_main:
-                    matched_package = paddle_tensorrt_requires
-                    break
+            if version_main == paddle_tensorrt_main:
+                matched_package = paddle_tensorrt_requires
+                break
 
-            if matched_package:
-                paddle_tensorrt_requires = [matched_package]
-            else:
-                print(
-                    f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
-                )
-                return paddle_cuda_requires, []
+        if matched_package:
+            paddle_tensorrt_requires = [matched_package]
+        else:
+            print(
+                "No exact match found for TensorRT Version: "
+                f"{version_str}. We currently support TensorRT versions "
+                "8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
+            )
+            return paddle_cuda_requires, []
 
     return paddle_cuda_requires,paddle_tensorrt_requires
 
diff --git a/setup.py b/setup.py
index 28617ca82539a..6bfa0ffedf02a 100644
--- a/setup.py
+++ b/setup.py
@@ -1109,7 +1109,8 @@ def get_setup_requires():
 def get_paddle_extra_install_requirements():
     paddle_cuda_requires = []
     paddle_tensorrt_requires = []
-    # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn.
+    cuda_major_version = None
+    # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
     if env_dict.get("WITH_PIP_CUDA_LIBRARIES") == "ON":
         if platform.system() == 'Linux':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -1220,6 +1221,23 @@ def get_paddle_extra_install_requirements():
                     "nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
                     "cuda-python==13.0.3; platform_system == 'Linux'"
                 ),
+                "13.2": (
+                    "nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
+                    "nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
+                    "nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
+                    "nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
+                    "nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
+                    "nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
+                    "nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
+                    "nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
+                    "nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
+                    "nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
+                    "nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
+                    "nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
+                    "nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
+                    "nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
+                    "cuda-python==13.2.0; platform_system == 'Linux'"
+                ),
             }
             if env_dict.get("WITH_CINN") == "ON":
                 PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
@@ -1240,6 +1258,9 @@ def get_paddle_extra_install_requirements():
                 PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
                     " | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
                 )
+                PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
+                    " | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
+                )
 
         elif platform.system() == 'Windows':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -1317,43 +1338,54 @@ def get_paddle_extra_install_requirements():
 
     if env_dict.get("WITH_PIP_TENSORRT") == "ON":
         version_str = get_tensorrt_version()
-        version_default = int(version_str.split(".")[0])
-        if platform.system() == 'Linux' or (
-            platform.system() == 'Windows' and version_default >= 10
+        version_default = (
+            int(version_str.split(".")[0]) if version_str else None
+        )
+        if platform.system() == 'Linux' and cuda_major_version == '13.2':
+            if not version_str and platform.machine() == 'aarch64':
+                return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
+            PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
+                "tensorrt-cu13==10.16.1.11",
+            ]
+        elif platform.system() == 'Linux' or (
+            platform.system() == 'Windows'
+            and version_default is not None
+            and version_default >= 10
         ):
             PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
                 "tensorrt==8.5.3.1",
                 "tensorrt==8.6.0",
                 "tensorrt==8.6.1.post1",
+                "tensorrt==10.3.0",
             ]
+        else:
+            return paddle_cuda_requires, []
 
-            if not version_str:
-                return paddle_cuda_requires, []
+        if not version_str:
+            return paddle_cuda_requires, []
 
-            version_main = ".".join(version_str.split(".")[:3])
+        version_main = ".".join(version_str.split(".")[:3])
 
-            matched_package = None
-            for (
-                paddle_tensorrt_requires
-            ) in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
-                paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[
-                    1
-                ]
-                paddle_tensorrt_main = ".".join(
-                    paddle_tensorrt_version.split(".")[:3]
-                )
+        matched_package = None
+        for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
+            paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
+            paddle_tensorrt_main = ".".join(
+                paddle_tensorrt_version.split(".")[:3]
+            )
 
-                if version_main == paddle_tensorrt_main:
-                    matched_package = paddle_tensorrt_requires
-                    break
+            if version_main == paddle_tensorrt_main:
+                matched_package = paddle_tensorrt_requires
+                break
 
-            if matched_package:
-                paddle_tensorrt_requires = [matched_package]
-            else:
-                print(
-                    f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
-                )
-                return paddle_cuda_requires, []
+        if matched_package:
+            paddle_tensorrt_requires = [matched_package]
+        else:
+            print(
+                "No exact match found for TensorRT Version: "
+                f"{version_str}. We currently support TensorRT versions "
+                "8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
+            )
+            return paddle_cuda_requires, []
 
     return paddle_cuda_requires, paddle_tensorrt_requires
 

From 225a3eec9813bb1af543173429ac5ace21503040 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 21 Apr 2026 20:51:55 +0800
Subject: [PATCH 08/19] dockerfile add `global.break-system-packages`

---
 tools/dockerfile/manylinux/Dockerfile-132 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index c6175ace71fba..799946dc1a254 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -68,6 +68,7 @@ RUN wget -q https://paddle-ci.gz.bcebos.com/ccache-4.8.2.tar.gz && \
 
 COPY paddle/scripts/compile_requirements.txt /root
 COPY python/requirements.txt /root
-RUN pip install --break-system-packages -r /root/requirements.txt && \
+RUN pip config set global.break-system-packages true && \
+    pip install --break-system-packages -r /root/requirements.txt && \
     pip install --break-system-packages -r /root/compile_requirements.txt && \
     rm -rf /root/compile_requirements.txt /root/requirements.txt

From a308b5dd98079467e1b6f4477369faf8210fa19d Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Thu, 23 Apr 2026 09:18:54 +0800
Subject: [PATCH 09/19] update `LD_LIBRARY_PATH`

---
 tools/dockerfile/manylinux/Dockerfile-132 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index 799946dc1a254..71e5a4e2275b6 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -16,7 +16,7 @@ ARG PYTHON_VERSION=3.12
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV LD_LIBRARY_PATH=/usr/local/cuda-${CUDA_VERSION}/compat:/usr/local/cuda-${CUDA_VERSION}/targets/x86_64-linux/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:/usr/local/cuda-13.2/targets/x86_64-linux/lib:/usr/local/cuda-13.2/lib64:$LD_LIBRARY_PATH
 
 ENV HOME /root
 

From b2b5a8f59b1aaab9149e1b6c766db32cb072a759 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Mon, 27 Apr 2026 08:57:42 +0800
Subject: [PATCH 10/19] Update the Dockerfile to support multi-architecture
 builds and fix the CUDA 13.x compatibility issue of flash-attn

---
 cmake/third_party.cmake                   | 18 ++++++++++---
 tools/dockerfile/manylinux/Dockerfile-132 | 32 ++++++++++++++---------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 526d17cc204d6..8e9caeaddc260 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -605,10 +605,20 @@ if(WITH_GPU
    AND NOT WIN32
    AND NOT APPLE)
   if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 13.0)
-    message(
-      STATUS
-        "flash-attn is disabled for default CUDA 13.x builds because the bundled third_party/flashattn source build is not yet stable with this toolchain."
-    )
+    foreach(arch ${NVCC_ARCH_BIN})
+      if(${arch} GREATER_EQUAL 90)
+        set(WITH_FLASHATTN_V3 ON)
+        break()
+      endif()
+    endforeach()
+    foreach(arch ${NVCC_ARCH_BIN})
+      if(${arch} GREATER_EQUAL 80)
+        include(external/flashattn)
+        list(APPEND third_party_deps extern_flashattn)
+        set(WITH_FLASHATTN ON)
+        break()
+      endif()
+    endforeach()
   elseif(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
     foreach(arch ${NVCC_ARCH_BIN})
       if(${arch} GREATER_EQUAL 90)
diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index 71e5a4e2275b6..f083ef9607fba 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -12,11 +12,12 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG WITH_GPU
 ARG WITH_AVX
 ARG PYTHON_VERSION=3.12
+ARG TARGETARCH
 
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:/usr/local/cuda-13.2/targets/x86_64-linux/lib:/usr/local/cuda-13.2/lib64:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:/usr/local/cuda-13.2/targets/x86_64-linux/lib:/usr/local/cuda-13.2/targets/sbsa-linux/lib:/usr/local/cuda-13.2/targets/aarch64-linux/lib:/usr/local/cuda-13.2/lib64:$LD_LIBRARY_PATH
 
 ENV HOME /root
 
@@ -35,6 +36,7 @@ RUN apt-get update --allow-unauthenticated && \
       libtool \
       libmlx5-1 \
       libibverbs-dev \
+      ccache \
       python${PYTHON_VERSION} \
       python${PYTHON_VERSION}-dev \
       python3-pip \
@@ -44,12 +46,23 @@ RUN apt-get update --allow-unauthenticated && \
     rm -rf /var/lib/apt/lists/*
 
 WORKDIR /home
-RUN wget -q https://cmake.org/files/v3.31/cmake-3.31.0-linux-x86_64.tar.gz && \
-    tar -zxf cmake-3.31.0-linux-x86_64.tar.gz && \
-    rm cmake-3.31.0-linux-x86_64.tar.gz && \
-    rm -rf /home/cmake-3.31.0-linux-x86_64/doc /home/cmake-3.31.0-linux-x86_64/man
+RUN set -eux; \
+        if [ "${TARGETARCH:-}" = "amd64" ] || [ "$(uname -m)" = "x86_64" ]; then \
+            CMAKE_ARCH="x86_64"; \
+        elif [ "${TARGETARCH:-}" = "arm64" ] || [ "$(uname -m)" = "aarch64" ]; then \
+            CMAKE_ARCH="aarch64"; \
+        else \
+            echo "Unsupported architecture: TARGETARCH=${TARGETARCH:-unknown}, uname -m=$(uname -m)"; \
+            exit 1; \
+        fi; \
+        CMAKE_PKG="cmake-3.31.0-linux-${CMAKE_ARCH}.tar.gz"; \
+        wget -q "https://cmake.org/files/v3.31/${CMAKE_PKG}"; \
+        tar -zxf "${CMAKE_PKG}"; \
+        rm "${CMAKE_PKG}"; \
+        mv "/home/cmake-3.31.0-linux-${CMAKE_ARCH}" /home/cmake-3.31.0; \
+        rm -rf /home/cmake-3.31.0/doc /home/cmake-3.31.0/man
 
-ENV PATH=/home/cmake-3.31.0-linux-x86_64/bin:$PATH
+ENV PATH=/home/cmake-3.31.0/bin:$PATH
 
 
 ARG TMP_DIR=patchelf_tmp
@@ -58,13 +71,6 @@ RUN rm -rf "$TMP_DIR" && git clone --depth 1 --branch 0.15.0 https://github.com/
     ./configure && make && make install && \
     cd .. && rm -rf "$TMP_DIR"
 
-RUN wget -q https://paddle-ci.gz.bcebos.com/ccache-4.8.2.tar.gz && \
-    tar xf ccache-4.8.2.tar.gz && mkdir /usr/local/ccache-4.8.2 && cd ccache-4.8.2 && \
-    mkdir build && cd build && \
-    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.8.2 .. && \
-    make -j8 && make install && \
-    ln -s /usr/local/ccache-4.8.2/bin/ccache /usr/local/bin/ccache && \
-    cd ../../ && rm -rf ccache-4.8.2.tar.gz && rm -rf ccache-4.8.2
 
 COPY paddle/scripts/compile_requirements.txt /root
 COPY python/requirements.txt /root

From 5232181907a3f9fceab5b7b76c09c67e1977da5b Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Sun, 3 May 2026 21:43:38 +0800
Subject: [PATCH 11/19] Add CUDA 13.2 manylinux support

---
 tools/dockerfile/manylinux/Dockerfile         |  4 ++
 .../manylinux/common/install_cuda.sh          | 58 +++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/tools/dockerfile/manylinux/Dockerfile b/tools/dockerfile/manylinux/Dockerfile
index 63c09c674fd78..5c971699384c8 100644
--- a/tools/dockerfile/manylinux/Dockerfile
+++ b/tools/dockerfile/manylinux/Dockerfile
@@ -83,6 +83,10 @@ FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 ENV DESIRED_CUDA=12.6
 
+FROM cuda as cuda13.2
+RUN bash ./install_cuda.sh 13.2
+ENV DESIRED_CUDA=13.2
+
 
 # Install paddle
 FROM python as paddle
diff --git a/tools/dockerfile/manylinux/common/install_cuda.sh b/tools/dockerfile/manylinux/common/install_cuda.sh
index e84513a3315cc..2d1c9fdfb7fbe 100644
--- a/tools/dockerfile/manylinux/common/install_cuda.sh
+++ b/tools/dockerfile/manylinux/common/install_cuda.sh
@@ -63,6 +63,17 @@ function install_cusparselt_063 {
     rm -rf tmp_cusparselt
 }
 
+function install_cusparselt_090_cuda13 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.9.0.3_cuda13-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.9.0.3_cuda13-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.9.0.3_cuda13-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.9.0.3_cuda13-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
 function install_nccl_2162 {
     wget -q https://nccl2-deb.cdn.bcebos.com/nccl_2.16.2-1+cuda11.8_x86_64.txz --no-check-certificate --no-proxy
     tar xf nccl_2.16.2-1+cuda11.8_x86_64.txz
@@ -95,6 +106,14 @@ function install_nccl_2234 {
     rm -rf nccl_2.23.4-1+cuda12.6_x86_64 nccl_2.23.4-1+cuda12.6_x86_64.txz
 }
 
+function install_nccl_2297_cuda132 {
+    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+    yum install -y \
+        libnccl-2.29.7-1+cuda13.2 \
+        libnccl-devel-2.29.7-1+cuda13.2 \
+        libnccl-static-2.29.7-1+cuda13.2
+}
+
 function install_trt_8616 {
     wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz --no-check-certificate --no-proxy
     tar -zxf TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz -C /usr/local
@@ -109,6 +128,13 @@ function install_trt_105018 {
     rm -f TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
 }
 
+function install_trt_1016111 {
+    wget -q https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.16.1/tars/TensorRT-10.16.1.11.Linux.x86_64-gnu.cuda-13.2.tar.gz --no-proxy
+    tar -zxf TensorRT-10.16.1.11.Linux.x86_64-gnu.cuda-13.2.tar.gz -C /usr/local
+    cp -rf /usr/local/TensorRT-10.16.1.11/include/* /usr/include/ && cp -rf /usr/local/TensorRT-10.16.1.11/lib/* /usr/lib/
+    rm -f TensorRT-10.16.1.11.Linux.x86_64-gnu.cuda-13.2.tar.gz
+}
+
 function install_118 {
     CUDNN_VERSION=8.9.7.29
     NCCL_VERSION=2.16.5
@@ -264,6 +290,36 @@ function install_129 {
     ldconfig
 }
 
+function install_132 {
+    CUDNN_VERSION=9.20.0.48
+    NCCL_VERSION=2.29.7
+    TensorRT_VERSION=10.16.1.11
+    echo "Installing CUDA 13.2.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and TensorRT ${TensorRT_VERSION} and cuSparseLt-0.9.0"
+    rm -rf /usr/local/cuda-13.2 /usr/local/cuda
+    # install CUDA 13.2.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/13.2.0/local_installers/cuda_13.2.0_595.45.04_linux.run
+    chmod +x cuda_13.2.0_595.45.04_linux.run
+    ./cuda_13.2.0_595.45.04_linux.run --toolkit --driver --silent --kernel-source-path=/usr/src/kernels/4.18.0-553.34.1.el8_10.x86_64
+    rm -f cuda_13.2.0_595.45.04_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-13.2 /usr/local/cuda
+    rm -rf /usr/bin/nvidia-smi
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda13-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda13-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda13-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda13-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda13-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    install_nccl_2297_cuda132
+    install_trt_1016111
+    install_cusparselt_090_cuda13
+
+    ldconfig
+}
+
 function prune_118 {
     echo "Pruning CUDA 11.8 and cuDNN"
     #####################################################################################
@@ -408,6 +464,8 @@ do
         ;;
     12.9) install_129
         ;;
+    13.2) install_132
+        ;;
     *) echo "bad argument $1"; exit 1
         ;;
     esac

From c729efa086ce910ec9c425a162f781e712c18192 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Thu, 14 May 2026 21:07:46 +0800
Subject: [PATCH 12/19] Update the URL of the flash-attention sub-module and
 adjust the logic for checking the CUDA version

---
 .gitmodules             |  2 +-
 cmake/third_party.cmake | 17 +----------------
 third_party/flashattn   |  2 +-
 3 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 9cc275e82c7c1..6fd0eb2e63013 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -72,7 +72,7 @@
 	ignore = dirty
 [submodule "third_party/flashattn"]
 	path = third_party/flashattn
-	url = https://github.com/PaddlePaddle/flash-attention.git
+	url = https://github.com/gouzil/flash-attention.git
 	ignore = dirty
 [submodule "third_party/gtest"]
 	path = third_party/gtest
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 8e9caeaddc260..755739e533695 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -604,22 +604,7 @@ if(WITH_GPU
    AND NOT WITH_ARM
    AND NOT WIN32
    AND NOT APPLE)
-  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 13.0)
-    foreach(arch ${NVCC_ARCH_BIN})
-      if(${arch} GREATER_EQUAL 90)
-        set(WITH_FLASHATTN_V3 ON)
-        break()
-      endif()
-    endforeach()
-    foreach(arch ${NVCC_ARCH_BIN})
-      if(${arch} GREATER_EQUAL 80)
-        include(external/flashattn)
-        list(APPEND third_party_deps extern_flashattn)
-        set(WITH_FLASHATTN ON)
-        break()
-      endif()
-    endforeach()
-  elseif(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
+  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
     foreach(arch ${NVCC_ARCH_BIN})
       if(${arch} GREATER_EQUAL 90)
         set(WITH_FLASHATTN_V3 ON)
diff --git a/third_party/flashattn b/third_party/flashattn
index 0ae8b7991b721..215e9d33ae56a 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit 0ae8b7991b7219bf53a17997d0876aaa0c4e301a
+Subproject commit 215e9d33ae56ac986cc3abbb68760e7c360fded6

From 230dcc665b4631dff474af28f7d43db2a94d03fa Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 00:55:04 +0800
Subject: [PATCH 13/19] [CUDA] Restrict CUDA compiler version for flash
 attention support

---
 cmake/third_party.cmake | 3 ++-
 third_party/flashattn   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 755739e533695..3e8e7faf4984b 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -604,7 +604,8 @@ if(WITH_GPU
    AND NOT WITH_ARM
    AND NOT WIN32
    AND NOT APPLE)
-  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
+  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3
+     AND ${CMAKE_CUDA_COMPILER_VERSION} LESS_EQUAL 12.9)
     foreach(arch ${NVCC_ARCH_BIN})
       if(${arch} GREATER_EQUAL 90)
         set(WITH_FLASHATTN_V3 ON)
diff --git a/third_party/flashattn b/third_party/flashattn
index 215e9d33ae56a..bda9b377eaa26 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit 215e9d33ae56ac986cc3abbb68760e7c360fded6
+Subproject commit bda9b377eaa261158d8ee2c7c793191e6a7db245

From 40ecfcea79c620880bfef4c8fe55fece6e90f563 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 09:28:47 +0800
Subject: [PATCH 14/19] Dockerfile-132: build Python 3.12.13 from source

---
 tools/dockerfile/manylinux/Dockerfile-132 | 38 ++++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index f083ef9607fba..0acb9e65b5f8e 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -12,6 +12,7 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG WITH_GPU
 ARG WITH_AVX
 ARG PYTHON_VERSION=3.12
+ARG PYTHON_SOURCE_VERSION=3.12.13
 ARG TARGETARCH
 
 ENV WITH_GPU=${WITH_GPU:-ON}
@@ -28,6 +29,7 @@ RUN apt-get update --allow-unauthenticated && \
       curl \
       wget \
       make \
+      build-essential \
       libgl1 \
       libglib2.0-0 \
       libssl-dev \
@@ -36,15 +38,43 @@ RUN apt-get update --allow-unauthenticated && \
       libtool \
       libmlx5-1 \
       libibverbs-dev \
+      pkg-config \
+      zlib1g-dev \
+      libbz2-dev \
+      libreadline-dev \
+      libsqlite3-dev \
+      libncursesw5-dev \
+      libffi-dev \
+      libgdbm-dev \
+      libgdbm-compat-dev \
+      liblzma-dev \
+      libexpat1-dev \
+      tk-dev \
+      uuid-dev \
+      xz-utils \
       ccache \
-      python${PYTHON_VERSION} \
-      python${PYTHON_VERSION}-dev \
-      python3-pip \
       libnccl2=2.29.7-1+cuda13.2 \
       libnccl-dev=2.29.7-1+cuda13.2 && \
-    ln -sf /usr/bin/python3 /usr/bin/python && \
     rm -rf /var/lib/apt/lists/*
 
+RUN cd /tmp && \
+    wget -q https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz && \
+    tar -xf Python-${PYTHON_VERSION}.tar.xz && \
+    cd Python-${PYTHON_VERSION} && \
+    ./configure \
+      --prefix=/usr/local \
+      --enable-optimizations \
+      --with-ensurepip=install && \
+    make -j"$(nproc)" && \
+    make altinstall && \
+    ln -sf /usr/local/bin/python${PYTHON_SHORT} /usr/local/bin/python3 && \
+    ln -sf /usr/local/bin/python3 /usr/local/bin/python && \
+    ln -sf /usr/local/bin/pip${PYTHON_SHORT} /usr/local/bin/pip3 && \
+    ln -sf /usr/local/bin/pip3 /usr/local/bin/pip && \
+    python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel packaging && \
+    cd / && \
+    rm -rf /tmp/Python-${PYTHON_VERSION} /tmp/Python-${PYTHON_VERSION}.tar.xz
+
 WORKDIR /home
 RUN set -eux; \
         if [ "${TARGETARCH:-}" = "amd64" ] || [ "$(uname -m)" = "x86_64" ]; then \

From 607d54ddf9a23b61289670c7fa4ce783cac83323 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 09:41:00 +0800
Subject: [PATCH 15/19] Dockerfile-132: fetch Python via PYTHON_SOURCE_VERSION

---
 tools/dockerfile/manylinux/Dockerfile-132 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index 0acb9e65b5f8e..a3c7a502285dc 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -58,9 +58,9 @@ RUN apt-get update --allow-unauthenticated && \
     rm -rf /var/lib/apt/lists/*
 
 RUN cd /tmp && \
-    wget -q https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz && \
-    tar -xf Python-${PYTHON_VERSION}.tar.xz && \
-    cd Python-${PYTHON_VERSION} && \
+    wget -q https://www.python.org/ftp/python/${PYTHON_SOURCE_VERSION}/Python-${PYTHON_SOURCE_VERSION}.tar.xz && \
+    tar -xf Python-${PYTHON_SOURCE_VERSION}.tar.xz && \
+    cd Python-${PYTHON_SOURCE_VERSION} && \
     ./configure \
       --prefix=/usr/local \
       --enable-optimizations \
@@ -73,7 +73,7 @@ RUN cd /tmp && \
     ln -sf /usr/local/bin/pip3 /usr/local/bin/pip && \
     python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel packaging && \
     cd / && \
-    rm -rf /tmp/Python-${PYTHON_VERSION} /tmp/Python-${PYTHON_VERSION}.tar.xz
+    rm -rf /tmp/Python-${PYTHON_SOURCE_VERSION} /tmp/Python-${PYTHON_SOURCE_VERSION}.tar.xz
 
 WORKDIR /home
 RUN set -eux; \

From 14e8ecc50cb23a397efa0dcf665ab0d697eb2895 Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 10:07:18 +0800
Subject: [PATCH 16/19] Dockerfile-132: link python3/pip3 via PYTHON_VERSION

---
 tools/dockerfile/manylinux/Dockerfile-132 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index a3c7a502285dc..7b2829d73218f 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -67,9 +67,9 @@ RUN cd /tmp && \
       --with-ensurepip=install && \
     make -j"$(nproc)" && \
     make altinstall && \
-    ln -sf /usr/local/bin/python${PYTHON_SHORT} /usr/local/bin/python3 && \
+    ln -sf /usr/local/bin/python${PYTHON_VERSION} /usr/local/bin/python3 && \
     ln -sf /usr/local/bin/python3 /usr/local/bin/python && \
-    ln -sf /usr/local/bin/pip${PYTHON_SHORT} /usr/local/bin/pip3 && \
+    ln -sf /usr/local/bin/pip${PYTHON_VERSION} /usr/local/bin/pip3 && \
     ln -sf /usr/local/bin/pip3 /usr/local/bin/pip && \
     python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel packaging && \
     cd / && \

From ab055a27762d663455489d5c21da1e9348cc682f Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 16:54:48 +0800
Subject: [PATCH 17/19] Dockerfile-132: trim LD_LIBRARY_PATH, add zstd/rsync/CA certs

---
 tools/dockerfile/manylinux/Dockerfile-132 | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index 7b2829d73218f..d89ea6c44f3df 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -18,7 +18,7 @@ ARG TARGETARCH
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:/usr/local/cuda-13.2/targets/x86_64-linux/lib:/usr/local/cuda-13.2/targets/sbsa-linux/lib:/usr/local/cuda-13.2/targets/aarch64-linux/lib:/usr/local/cuda-13.2/lib64:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:$LD_LIBRARY_PATH
 
 ENV HOME /root
 
@@ -29,6 +29,9 @@ RUN apt-get update --allow-unauthenticated && \
       curl \
       wget \
       make \
+      zstd \
+      rsync \
+      ca-certificates \
       build-essential \
       libgl1 \
       libglib2.0-0 \

From b562e19d8d4432387a58fa8fa901debfb6fe6d5c Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 22:19:31 +0800
Subject: [PATCH 18/19] empty commit


From aa3920f0c43660700e1ba68e63a4ddb38fbc908b Mon Sep 17 00:00:00 2001
From: gouzi <530971494@qq.com>
Date: Tue, 26 May 2026 23:51:43 +0800
Subject: [PATCH 19/19] update flash-attention to org, Dockerfile-132 add
 gdrcopy

---
 .gitmodules                               |  2 +-
 third_party/flashattn                     |  2 +-
 tools/dockerfile/manylinux/Dockerfile-132 | 17 ++++++++++-------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 6fd0eb2e63013..9cc275e82c7c1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -72,7 +72,7 @@
 	ignore = dirty
 [submodule "third_party/flashattn"]
 	path = third_party/flashattn
-	url = https://github.com/gouzil/flash-attention.git
+	url = https://github.com/PaddlePaddle/flash-attention.git
 	ignore = dirty
 [submodule "third_party/gtest"]
 	path = third_party/gtest
diff --git a/third_party/flashattn b/third_party/flashattn
index bda9b377eaa26..1f3e4bb9aaa0d 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit bda9b377eaa261158d8ee2c7c793191e6a7db245
+Subproject commit 1f3e4bb9aaa0dbefa96181bf6026e810f3ebecc1
diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
index d89ea6c44f3df..8f684c231ee2b 100644
--- a/tools/dockerfile/manylinux/Dockerfile-132
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -5,7 +5,7 @@ ARG CUDA_VERSION=13.2
 ARG BASE_TARGET=cuda${CUDA_VERSION}
 
 FROM nvcr.io/nvidia/cuda:13.2.0-cudnn-devel-ubuntu24.04 as base
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+LABEL maintainer="paddle-dev@baidu.com"
 
 
 # ENV variables
@@ -13,12 +13,13 @@ ARG WITH_GPU
 ARG WITH_AVX
 ARG PYTHON_VERSION=3.12
 ARG PYTHON_SOURCE_VERSION=3.12.13
-ARG TARGETARCH
+ARG TMP_DIR=patchelf_tmp
 
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV DEBIAN_FRONTEND=noninteractive
 ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:$LD_LIBRARY_PATH
+ENV GDRCOPY_HOME=/usr/local/gdrcopy
 
 ENV HOME /root
 
@@ -97,17 +98,19 @@ RUN set -eux; \
 
 ENV PATH=/home/cmake-3.31.0/bin:$PATH
 
-
-ARG TMP_DIR=patchelf_tmp
 RUN rm -rf "$TMP_DIR" && git clone --depth 1 --branch 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" && \
     cd "$TMP_DIR" && ./bootstrap.sh && \
     ./configure && make && make install && \
     cd .. && rm -rf "$TMP_DIR"
 
-
 COPY paddle/scripts/compile_requirements.txt /root
 COPY python/requirements.txt /root
 RUN pip config set global.break-system-packages true && \
-    pip install --break-system-packages -r /root/requirements.txt && \
-    pip install --break-system-packages -r /root/compile_requirements.txt && \
+    pip install --no-cache-dir -r /root/requirements.txt && \
+    pip install --no-cache-dir -r /root/compile_requirements.txt && \
     rm -rf /root/compile_requirements.txt /root/requirements.txt
+
+RUN cd /usr/local && \
+    wget -q https://paddle-ci.gz.bcebos.com/gdrcopy.tar && \
+    tar xf gdrcopy.tar && \
+    rm -f gdrcopy.tar