diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 28cd08ea5a8dd..7c36df0dfb010 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -11,6 +11,7 @@ if(WITH_NV_JETSON)
   set(paddle_known_gpu_archs10 "53 62 72")
   set(paddle_known_gpu_archs11 "53 62 72 87")
   set(paddle_known_gpu_archs12 "53 62 72 87 90 100")
+  set(paddle_known_gpu_archs13 "87 90 100")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Package")
   add_definitions(-DNEW_RELEASE_ALL)
@@ -18,6 +19,7 @@ elseif(NEW_RELEASE_ALL)
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
   set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Package")
   add_definitions(-DNEW_RELEASE_PYPI)
@@ -25,6 +27,7 @@ elseif(NEW_RELEASE_PYPI)
   set(paddle_known_gpu_archs10 "")
   set(paddle_known_gpu_archs11 "61 70 75 80")
   set(paddle_known_gpu_archs12 "61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 elseif(NEW_RELEASE_JIT)
   message("Using New Release Strategy - JIT Package")
   add_definitions(-DNEW_RELEASE_JIT)
@@ -32,11 +35,13 @@ elseif(NEW_RELEASE_JIT)
   set(paddle_known_gpu_archs10 "50 60 70 75")
   set(paddle_known_gpu_archs11 "50 60 70 75 80")
   set(paddle_known_gpu_archs12 "50 60 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 else()
   set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90 100")
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
   set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 endif()
 
 ######################################################################################
@@ -289,6 +294,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 14.0) # CUDA 13.0+
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs13})
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
 # Fix ARM NEON conflict with CUDA on aarch64 platforms.
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 755739e533695..3e8e7faf4984b 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -604,7 +604,8 @@ if(WITH_GPU
    AND NOT WITH_ARM
    AND NOT WIN32
    AND NOT APPLE)
-  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3
+     AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0) # 12.3 <= CUDA < 13
     foreach(arch ${NVCC_ARCH_BIN})
       if(${arch} GREATER_EQUAL 90)
         set(WITH_FLASHATTN_V3 ON)
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index b6ab155dfaab1..f56189cd1d13a 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -197,10 +197,26 @@ struct KernelRegistrar {
   ::phi::KernelArgsParseFunctor<                              \
       decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse
 
+// nvcc 13.x crashes in cudafe++ on the explicit instantiation form
+// `template decltype(fn<T, Ctx>) fn<T, Ctx>;`. For non-Windows nvcc >= 13,
+// keep macro registration intact by storing `&fn<T, Ctx>` in a static pointer
+// (named with PD_ID, marked `used`) so the specialization is still emitted.
+#if defined(__CUDACC__) && !defined(_WIN32) && \
+    defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 13)
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(id, ...)          \
+  static auto* const PD_CONCATENATE(__pd_kernel_instantiation_anchor_, id) \
+      __attribute__((used)) = &__VA_ARGS__;
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(PD_ID, __VA_ARGS__)
+#else  // every other configuration keeps the original explicit instantiation
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
+  template decltype(__VA_ARGS__) __VA_ARGS__;
+#endif  // nvcc >= 13 instantiation workaround
+
 // The macro for instantiating function kernel
 #define FUNCTION_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, context) \
-  template decltype(meta_kernel_fn<cpp_dtype, context>)                   \
-      meta_kernel_fn<cpp_dtype, context>;
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(                            \
+      meta_kernel_fn<cpp_dtype, context>)
 
 /** PD_REGISTER_KERNEL
  *
@@ -1368,7 +1384,7 @@ struct KernelRegistrar {
 #if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
 #define PD_REGISTER_KERNEL_FOR_ALL_DTYPE(                                \
     kernel_name, backend, layout, kernel_fn)                             \
-  template decltype(kernel_fn) kernel_fn;                                \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                 \
   static void                                                            \
       __FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
           const ::phi::KernelKey& kernel_key UNUSED,                     \
@@ -1391,7 +1407,7 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE(                                    \
     reg_type, kernel_name, backend, layout, kernel_fn)                         \
-  template decltype(kernel_fn) kernel_fn;                                      \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                       \
   static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(    \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);              \
   static const ::phi::KernelRegistrar                                          \
@@ -1440,8 +1456,8 @@ struct KernelRegistrar {
 #if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
 #define PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(                        \
     kernel_name, layout, meta_kernel_fn)                                 \
-  template decltype(meta_kernel_fn<::phi::CustomContext>)                \
-      meta_kernel_fn<::phi::CustomContext>;                              \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(                           \
+      meta_kernel_fn<::phi::CustomContext>)                              \
   static void                                                            \
       __FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
           const ::phi::KernelKey kernel_key UNUSED,                      \
@@ -1535,7 +1551,7 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define ___PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(                           \
     reg_type, kernel_name, backend, layout, kernel_fn, args_def_fn)            \
-  template decltype(kernel_fn) kernel_fn;                                      \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                       \
   static const ::phi::KernelRegistrar                                          \
       __reg_phi_kernel_##kernel_name##_##backend##_##layout(                   \
           reg_type,                                                            \
diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
index 88f97b4e5c241..328cbc8bba56d 100644
--- a/paddle/phi/kernels/gpu/arange_kernel.cu
+++ b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -135,8 +135,13 @@ void ArangeKernel(const Context& dev_ctx,
       <<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
 }
 
-template decltype(ArangeNullaryKernel<int64_t, GPUContext>) ArangeNullaryKernel;
-template decltype(ArangeNullaryKernel<int, GPUContext>) ArangeNullaryKernel;
+template void ArangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
+                                                       const int64_t,
+                                                       const int64_t,
+                                                       const int64_t,
+                                                       DenseTensor*);
+template void ArangeNullaryKernel<int, GPUContext>(
+    const GPUContext&, const int, const int, const int, DenseTensor*);
 }  // namespace phi
 
 PD_REGISTER_KERNEL(arange_tensor,
diff --git a/paddle/phi/kernels/gpu/range_kernel.cu b/paddle/phi/kernels/gpu/range_kernel.cu
index 658ec208e5138..250e83b7eec5b 100644
--- a/paddle/phi/kernels/gpu/range_kernel.cu
+++ b/paddle/phi/kernels/gpu/range_kernel.cu
@@ -135,8 +135,13 @@ void RangeKernel(const Context& dev_ctx,
       <<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
 }
 
-template decltype(RangeNullaryKernel<int64_t, GPUContext>) RangeNullaryKernel;
-template decltype(RangeNullaryKernel<int, GPUContext>) RangeNullaryKernel;
+template void RangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
+                                                      const int64_t,
+                                                      const int64_t,
+                                                      const int64_t,
+                                                      DenseTensor*);
+template void RangeNullaryKernel<int, GPUContext>(
+    const GPUContext&, const int, const int, const int, DenseTensor*);
 }  // namespace phi
 
 PD_REGISTER_KERNEL(range_tensor,
diff --git a/python/setup.py.in b/python/setup.py.in
index 103329999426f..169bb448a26d6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -653,6 +653,7 @@ def get_paddle_extra_install_requirements():
     #(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
     paddle_cuda_requires = []
     paddle_tensorrt_requires = []
+    cuda_major_version = None
     if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON':
         if platform.system() == 'Linux':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -763,6 +764,23 @@ def get_paddle_extra_install_requirements():
                 "nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
                 "cuda-python==13.0.3; platform_system == 'Linux'"
             ),
+            "13.2": (
+                "nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
+                "nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
+                "nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
+                "nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
+                "nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
+                "nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
+                "nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
+                "nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
+                "nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
+                "nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
+                "nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
+                "nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
+                "nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
+                "nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
+                "cuda-python==13.2.0; platform_system == 'Linux'"
+            ),
         }
         if '@WITH_CINN@' == 'ON':
             PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
@@ -783,6 +801,9 @@ def get_paddle_extra_install_requirements():
             PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
                     " | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
             )
+            PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
+                    " | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
+            )
         elif platform.system() == 'Windows':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
                 "11.8": (
@@ -853,37 +874,46 @@ def get_paddle_extra_install_requirements():
 
     if '@WITH_PIP_TENSORRT@' == 'ON':
         version_str = get_tensorrt_version()
-        version_default = int(version_str.split(".")[0])
-        if platform.system() =='Linux' or (platform.system()=='Windows' and version_default>=10):
-
+        version_default = int(version_str.split(".")[0]) if version_str else None
+        if platform.system() == 'Linux' and cuda_major_version == '13.2':
+            if not version_str and platform.machine() == 'aarch64':
+                return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
+            PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
+                "tensorrt-cu13==10.16.1.11",
+            ]
+        elif platform.system() =='Linux' or (platform.system()=='Windows' and version_default is not None and version_default>=10):
             PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
                 "tensorrt==8.5.3.1",
                 "tensorrt==8.6.0",
                 "tensorrt==8.6.1.post1",
                 "tensorrt==10.3.0",
             ]
+        else:
+            return paddle_cuda_requires, []
 
-            if not version_str:
-                return paddle_cuda_requires,[]
+        if not version_str:
+            return paddle_cuda_requires,[]
 
-            version_main = ".".join(version_str.split(".")[:3])
+        version_main = ".".join(version_str.split(".")[:3])
 
-            matched_package = None
-            for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
-                paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
-                paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
+        matched_package = None
+        for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
+            paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
+            paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
 
-                if version_main == paddle_tensorrt_main:
-                    matched_package = paddle_tensorrt_requires
-                    break
+            if version_main == paddle_tensorrt_main:
+                matched_package = paddle_tensorrt_requires
+                break
 
-            if matched_package:
-                paddle_tensorrt_requires = [matched_package]
-            else:
-                print(
-                    f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
-                )
-                return paddle_cuda_requires, []
+        if matched_package:
+            paddle_tensorrt_requires = [matched_package]
+        else:
+            print(
+                "No exact match found for TensorRT Version: "
+                f"{version_str}. We currently support TensorRT versions "
+                "8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
+            )
+            return paddle_cuda_requires, []
 
     return paddle_cuda_requires,paddle_tensorrt_requires
 
diff --git a/setup.py b/setup.py
index acf1eee575f96..00787fa9f1186 100644
--- a/setup.py
+++ b/setup.py
@@ -1161,7 +1161,8 @@ def get_setup_requires():
 def get_paddle_extra_install_requirements():
     paddle_cuda_requires = []
     paddle_tensorrt_requires = []
-    # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn.
+    cuda_major_version = None
+    # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
     if env_dict.get("WITH_PIP_CUDA_LIBRARIES") == "ON":
         if platform.system() == 'Linux':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -1272,6 +1273,23 @@ def get_paddle_extra_install_requirements():
                     "nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
                     "cuda-python==13.0.3; platform_system == 'Linux'"
                 ),
+                "13.2": (
+                    "nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
+                    "nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
+                    "nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
+                    "nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
+                    "nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
+                    "nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
+                    "nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
+                    "nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
+                    "nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
+                    "nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
+                    "nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
+                    "nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
+                    "nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
+                    "nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
+                    "cuda-python==13.2.0; platform_system == 'Linux'"
+                ),
             }
             if env_dict.get("WITH_CINN") == "ON":
                 PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
@@ -1292,6 +1310,9 @@ def get_paddle_extra_install_requirements():
                 PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
                     " | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
                 )
+                PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
+                    " | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
+                )
 
         elif platform.system() == 'Windows':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -1369,43 +1390,54 @@ def get_paddle_extra_install_requirements():
 
     if env_dict.get("WITH_PIP_TENSORRT") == "ON":
         version_str = get_tensorrt_version()
-        version_default = int(version_str.split(".")[0])
-        if platform.system() == 'Linux' or (
-            platform.system() == 'Windows' and version_default >= 10
+        version_default = (
+            int(version_str.split(".")[0]) if version_str else None
+        )
+        if platform.system() == 'Linux' and cuda_major_version == '13.2':
+            if not version_str and platform.machine() == 'aarch64':
+                return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
+            PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
+                "tensorrt-cu13==10.16.1.11",
+            ]
+        elif platform.system() == 'Linux' or (
+            platform.system() == 'Windows'
+            and version_default is not None
+            and version_default >= 10
         ):
             PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
                 "tensorrt==8.5.3.1",
                 "tensorrt==8.6.0",
                 "tensorrt==8.6.1.post1",
+                "tensorrt==10.3.0",
             ]
+        else:
+            return paddle_cuda_requires, []
 
-            if not version_str:
-                return paddle_cuda_requires, []
+        if not version_str:
+            return paddle_cuda_requires, []
 
-            version_main = ".".join(version_str.split(".")[:3])
+        version_main = ".".join(version_str.split(".")[:3])
 
-            matched_package = None
-            for (
-                paddle_tensorrt_requires
-            ) in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
-                paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[
-                    1
-                ]
-                paddle_tensorrt_main = ".".join(
-                    paddle_tensorrt_version.split(".")[:3]
-                )
+        matched_package = None
+        for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
+            paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
+            paddle_tensorrt_main = ".".join(
+                paddle_tensorrt_version.split(".")[:3]
+            )
 
-                if version_main == paddle_tensorrt_main:
-                    matched_package = paddle_tensorrt_requires
-                    break
+            if version_main == paddle_tensorrt_main:
+                matched_package = paddle_tensorrt_requires
+                break
 
-            if matched_package:
-                paddle_tensorrt_requires = [matched_package]
-            else:
-                print(
-                    f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
-                )
-                return paddle_cuda_requires, []
+        if matched_package:
+            paddle_tensorrt_requires = [matched_package]
+        else:
+            print(
+                "No exact match found for TensorRT Version: "
+                f"{version_str}. We currently support TensorRT versions "
+                "8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
+            )
+            return paddle_cuda_requires, []
 
     return paddle_cuda_requires, paddle_tensorrt_requires
 
diff --git a/third_party/flashattn b/third_party/flashattn
index 0ae8b7991b721..1f3e4bb9aaa0d 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit 0ae8b7991b7219bf53a17997d0876aaa0c4e301a
+Subproject commit 1f3e4bb9aaa0dbefa96181bf6026e810f3ebecc1
diff --git a/tools/dockerfile/manylinux/Dockerfile b/tools/dockerfile/manylinux/Dockerfile
index 63c09c674fd78..5c971699384c8 100644
--- a/tools/dockerfile/manylinux/Dockerfile
+++ b/tools/dockerfile/manylinux/Dockerfile
@@ -83,6 +83,10 @@ FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 ENV DESIRED_CUDA=12.6
 
+FROM cuda as cuda13.2
+RUN bash ./install_cuda.sh 13.2
+ENV DESIRED_CUDA=13.2
+
 
 # Install paddle
 FROM python as paddle
diff --git a/tools/dockerfile/manylinux/Dockerfile-132 b/tools/dockerfile/manylinux/Dockerfile-132
new file mode 100644
index 0000000000000..8f684c231ee2b
--- /dev/null
+++ b/tools/dockerfile/manylinux/Dockerfile-132
@@ -0,0 +1,116 @@
+# An image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
+# When you modify it, please be aware of cudnn-runtime version
+ARG CUDA_VERSION=13.2
+ARG BASE_TARGET=cuda${CUDA_VERSION}
+
+FROM nvcr.io/nvidia/cuda:13.2.0-cudnn-devel-ubuntu24.04 AS base
+LABEL maintainer="paddle-dev@baidu.com"
+
+
+# Build arguments (overridable with --build-arg) and environment defaults
+ARG WITH_GPU
+ARG WITH_AVX
+ARG PYTHON_VERSION=3.12
+ARG PYTHON_SOURCE_VERSION=3.12.13
+ARG TMP_DIR=patchelf_tmp
+
+ENV WITH_GPU=${WITH_GPU:-ON}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/compat:$LD_LIBRARY_PATH
+ENV GDRCOPY_HOME=/usr/local/gdrcopy
+
+ENV HOME=/root
+
+RUN apt-get update --allow-unauthenticated && \
+    apt-get install -y --no-install-recommends \
+      git \
+      vim \
+      curl \
+      wget \
+      make \
+      zstd \
+      rsync \
+      ca-certificates \
+      build-essential \
+      libgl1 \
+      libglib2.0-0 \
+      libssl-dev \
+      autoconf \
+      automake \
+      libtool \
+      libmlx5-1 \
+      libibverbs-dev \
+      pkg-config \
+      zlib1g-dev \
+      libbz2-dev \
+      libreadline-dev \
+      libsqlite3-dev \
+      libncursesw5-dev \
+      libffi-dev \
+      libgdbm-dev \
+      libgdbm-compat-dev \
+      liblzma-dev \
+      libexpat1-dev \
+      tk-dev \
+      uuid-dev \
+      xz-utils \
+      ccache \
+      libnccl2=2.29.7-1+cuda13.2 \
+      libnccl-dev=2.29.7-1+cuda13.2 && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN cd /tmp && \
+    wget -q https://www.python.org/ftp/python/${PYTHON_SOURCE_VERSION}/Python-${PYTHON_SOURCE_VERSION}.tar.xz && \
+    tar -xf Python-${PYTHON_SOURCE_VERSION}.tar.xz && \
+    cd Python-${PYTHON_SOURCE_VERSION} && \
+    ./configure \
+      --prefix=/usr/local \
+      --enable-optimizations \
+      --with-ensurepip=install && \
+    make -j"$(nproc)" && \
+    make altinstall && \
+    ln -sf /usr/local/bin/python${PYTHON_VERSION} /usr/local/bin/python3 && \
+    ln -sf /usr/local/bin/python3 /usr/local/bin/python && \
+    ln -sf /usr/local/bin/pip${PYTHON_VERSION} /usr/local/bin/pip3 && \
+    ln -sf /usr/local/bin/pip3 /usr/local/bin/pip && \
+    python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel packaging && \
+    cd / && \
+    rm -rf /tmp/Python-${PYTHON_SOURCE_VERSION} /tmp/Python-${PYTHON_SOURCE_VERSION}.tar.xz
+
+WORKDIR /home
+RUN set -eux; \
+        if [ "${TARGETARCH:-}" = "amd64" ] || [ "$(uname -m)" = "x86_64" ]; then \
+            CMAKE_ARCH="x86_64"; \
+        elif [ "${TARGETARCH:-}" = "arm64" ] || [ "$(uname -m)" = "aarch64" ]; then \
+            CMAKE_ARCH="aarch64"; \
+        else \
+            echo "Unsupported architecture: TARGETARCH=${TARGETARCH:-unknown}, uname -m=$(uname -m)"; \
+            exit 1; \
+        fi; \
+        CMAKE_PKG="cmake-3.31.0-linux-${CMAKE_ARCH}.tar.gz"; \
+        wget -q "https://cmake.org/files/v3.31/${CMAKE_PKG}"; \
+        tar -zxf "${CMAKE_PKG}"; \
+        rm "${CMAKE_PKG}"; \
+        mv "/home/cmake-3.31.0-linux-${CMAKE_ARCH}" /home/cmake-3.31.0; \
+        rm -rf /home/cmake-3.31.0/doc /home/cmake-3.31.0/man
+
+ENV PATH=/home/cmake-3.31.0/bin:$PATH
+
+RUN rm -rf "$TMP_DIR" && git clone --depth 1 --branch 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" && \
+    cd "$TMP_DIR" && ./bootstrap.sh && \
+    ./configure && make && make install && \
+    cd .. && rm -rf "$TMP_DIR"
+
+COPY paddle/scripts/compile_requirements.txt /root
+COPY python/requirements.txt /root
+RUN pip config set global.break-system-packages true && \
+    pip install -r /root/requirements.txt && \
+    pip install -r /root/compile_requirements.txt && \
+    rm -rf /root/compile_requirements.txt /root/requirements.txt
+
+RUN cd /usr/local && \
+    wget -q https://paddle-ci.gz.bcebos.com/gdrcopy.tar && \
+    tar xf gdrcopy.tar && \
+    rm -f gdrcopy.tar
diff --git a/tools/dockerfile/manylinux/common/install_cuda.sh b/tools/dockerfile/manylinux/common/install_cuda.sh
index e84513a3315cc..2d1c9fdfb7fbe 100644
--- a/tools/dockerfile/manylinux/common/install_cuda.sh
+++ b/tools/dockerfile/manylinux/common/install_cuda.sh
@@ -63,6 +63,17 @@ function install_cusparselt_063 {
     rm -rf tmp_cusparselt
 }
 
+function install_cusparselt_090_cuda13 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    local pkg="libcusparse_lt-linux-x86_64-0.9.0.3_cuda13-archive"
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q "https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${pkg}.tar.xz"
+    tar xf "${pkg}.tar.xz"
+    cp -a "${pkg}"/include/* /usr/local/cuda/include/
+    cp -a "${pkg}"/lib/* /usr/local/cuda/lib64/
+    popd; rm -rf tmp_cusparselt
+}
+
 function install_nccl_2162 {
     wget -q https://nccl2-deb.cdn.bcebos.com/nccl_2.16.2-1+cuda11.8_x86_64.txz --no-check-certificate --no-proxy
     tar xf nccl_2.16.2-1+cuda11.8_x86_64.txz
@@ -95,6 +106,14 @@ function install_nccl_2234 {
     rm -rf nccl_2.23.4-1+cuda12.6_x86_64 nccl_2.23.4-1+cuda12.6_x86_64.txz
 }
 
+function install_nccl_2297_cuda132 {
+    # Register NVIDIA's RHEL8 CUDA repo, then install the pinned NCCL packages.
+    local nccl_ver="2.29.7-1+cuda13.2"
+    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+    yum install -y "libnccl-${nccl_ver}" "libnccl-devel-${nccl_ver}" \
+        "libnccl-static-${nccl_ver}"
+}
+
 function install_trt_8616 {
     wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz --no-check-certificate --no-proxy
     tar -zxf TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz -C /usr/local
@@ -109,6 +128,13 @@ function install_trt_105018 {
     rm -f TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
 }
 
+function install_trt_1016111 {  # TensorRT 10.16.1.11 (CUDA 13.2 build); TLS certificate is verified
+    wget -q https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.16.1/tars/TensorRT-10.16.1.11.Linux.x86_64-gnu.cuda-13.2.tar.gz --no-proxy
+    tar -zxf TensorRT-10.16.1.11.Linux.x86_64-gnu.cuda-13.2.tar.gz -C /usr/local
+    cp -rf /usr/local/TensorRT-10.16.1.11/include/* /usr/include/ && cp -rf /usr/local/TensorRT-10.16.1.11/lib/* /usr/lib/
+    rm -f TensorRT-10.16.1.11.Linux.x86_64-gnu.cuda-13.2.tar.gz
+}
+
 function install_118 {
     CUDNN_VERSION=8.9.7.29
     NCCL_VERSION=2.16.5
@@ -264,6 +290,36 @@ function install_129 {
     ldconfig
 }
 
+function install_132 {
+    CUDNN_VERSION=9.20.0.48
+    NCCL_VERSION=2.29.7
+    TensorRT_VERSION=10.16.1.11
+    local cuda_run=cuda_13.2.0_595.45.04_linux.run
+    local cudnn_pkg="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda13-archive"
+    echo "Installing CUDA 13.2.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and TensorRT ${TensorRT_VERSION} and cuSparseLt-0.9.0"
+    # Drop any existing toolkit, then install CUDA 13.2.0 from the runfile.
+    rm -rf /usr/local/cuda-13.2 /usr/local/cuda
+    wget -q "https://developer.download.nvidia.com/compute/cuda/13.2.0/local_installers/${cuda_run}"
+    chmod +x "${cuda_run}"
+    "./${cuda_run}" --toolkit --driver --silent --kernel-source-path=/usr/src/kernels/4.18.0-553.34.1.el8_10.x86_64
+    rm -f "${cuda_run}"
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-13.2 /usr/local/cuda
+    rm -rf /usr/bin/nvidia-smi
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${cudnn_pkg}.tar.xz" -O "${cudnn_pkg}.tar.xz"
+    tar xf "${cudnn_pkg}.tar.xz"
+    cp -a "${cudnn_pkg}"/include/* /usr/local/cuda/include/
+    cp -a "${cudnn_pkg}"/lib/* /usr/local/cuda/lib64/
+    cd ..; rm -rf tmp_cudnn
+
+    install_nccl_2297_cuda132
+    install_trt_1016111
+    install_cusparselt_090_cuda13
+    ldconfig
+}
+
 function prune_118 {
     echo "Pruning CUDA 11.8 and cuDNN"
     #####################################################################################
@@ -408,6 +464,8 @@ do
         ;;
     12.9) install_129
         ;;
+    13.2) install_132
+        ;;
     *) echo "bad argument $1"; exit 1
         ;;
     esac