Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
8f87fdb
feat: Add CUDA 13.2 Dockerfile
gouzil Apr 14, 2026
40c7e79
Update tools/dockerfile/manylinux/Dockerfile-132
gouzil Apr 14, 2026
234835a
add nccl
gouzil Apr 18, 2026
4c1032b
[CUDA] adapt Paddle build and kernels for CUDA 13.2
gouzil Apr 19, 2026
f57a196
rollback warpctc and warpnnt
gouzil Apr 20, 2026
d430605
Merge branch 'develop' of github.com:gouzil/Paddle into cuda/support_…
gouzil Apr 20, 2026
d70ff89
clean code
gouzil Apr 21, 2026
dcda460
`setup.py` add cuda 13.2
gouzil Apr 21, 2026
225a3ee
dockerfile add `global.break-system-packages`
gouzil Apr 21, 2026
a308b5d
update `LD_LIBRARY_PATH`
gouzil Apr 23, 2026
b2b5a8f
Update the Dockerfile to support multi-architecture builds and fix th…
gouzil Apr 27, 2026
0678a56
Merge branch 'develop' of github.com:gouzil/Paddle into cuda/support_…
gouzil May 3, 2026
5232181
Add CUDA 13.2 manylinux support
gouzil May 3, 2026
a823383
Merge branch 'develop' of github.com:gouzil/Paddle into cuda/support_…
gouzil May 14, 2026
c729efa
Update the URL of the flash-attention sub-module and adjust the logic…
gouzil May 14, 2026
230dcc6
[CUDA] Restrict CUDA compiler version for flash attention support
gouzil May 25, 2026
40ecfce
update Dockerfile-132 add Python 3.12.13
gouzil May 26, 2026
607d54d
fix
gouzil May 26, 2026
14e8ecc
fix py version
gouzil May 26, 2026
ab055a2
update Dockerfile-132
gouzil May 26, 2026
06697b2
Merge branch 'develop' of github.com:gouzil/Paddle into cuda/support_…
gouzil May 26, 2026
b562e19
empty commit
gouzil May 26, 2026
aa3920f
update flash-attention to org, Dockerfile-132 add gdrcopy
gouzil May 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmake/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,37 @@ if(WITH_NV_JETSON)
set(paddle_known_gpu_archs10 "53 62 72")
set(paddle_known_gpu_archs11 "53 62 72 87")
set(paddle_known_gpu_archs12 "53 62 72 87 90 100")
set(paddle_known_gpu_archs13 "87 90 100")
elseif(NEW_RELEASE_ALL)
message("Using New Release Strategy - All Arches Package")
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review from @codex (GPT-5.5 xhigh).

这里的 CUDA 13 默认 arch 集合只有 75 80 86 90 100,后面的 CUDA_ARCH_NAME=Blackwell 也仍只映射到 100。CUDA 13.2 nvcc 官方支持列表已经包含 sm_103/sm_110/sm_120/sm_121 等 targets;如果 CUDA 13.2 release wheel 仍用 CUDA_ARCH_NAME=All,这些 targets 默认不会被编进 wheel。建议补齐 CUDA 13 的 release arch 策略,或明确用 PTX/JIT 覆盖这些新架构。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个 @risemeup1 @swgu98 确认下,感觉至少 103 得加一下

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P0 优先级:P0

Follow-up:当前 head b562e19d8d4432387a58fa8fa901debfb6fe6d5c 是 empty commit,CUDA 13 arch 配置没有变化。cmake/cuda.cmake:22 仍只有 75 80 86 90 100;cmake/cuda.cmake:195 的 CUDA_ARCH_NAME=Blackwell 仍只映射到 100,因此 CUDA 13.2 release/Blackwell 默认包仍不会覆盖前面指出的新增 Blackwell targets。请补齐 CUDA 13.2 release arch 策略,或明确加入 PTX/JIT 覆盖策略。

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P0 优先级:P0

当前 head 仍未补齐 CUDA 13.2 的 release/Blackwell arch 策略:这里的 CUDA 13 release arch 仍只有 75 80 86 90 100,并且同文件 CUDA_ARCH_NAME=Blackwell 仍只映射到 100。CUDA 13.2 nvcc 已支持更多 Blackwell targets(如 sm_103/sm_110/sm_120/sm_121),因此 CUDA_ARCH_NAME=All 或 Blackwell release 包仍不会默认产出这些目标的 cubin。请补齐 CUDA 13.2 的 arch 列表,或明确加入可接受的 PTX/JIT 覆盖策略。

elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Package")
add_definitions(-DNEW_RELEASE_PYPI)
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
set(paddle_known_gpu_archs10 "")
set(paddle_known_gpu_archs11 "61 70 75 80")
set(paddle_known_gpu_archs12 "61 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
elseif(NEW_RELEASE_JIT)
message("Using New Release Strategy - JIT Package")
add_definitions(-DNEW_RELEASE_JIT)
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
set(paddle_known_gpu_archs10 "50 60 70 75")
set(paddle_known_gpu_archs11 "50 60 70 75 80")
set(paddle_known_gpu_archs12 "50 60 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
else()
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90 100")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
endif()

######################################################################################
Expand Down Expand Up @@ -289,6 +294,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 14.0) # CUDA 13.0+
set(paddle_known_gpu_archs ${paddle_known_gpu_archs13})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()

# Fix ARM NEON conflict with CUDA on aarch64 platforms.
Expand Down
3 changes: 2 additions & 1 deletion cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,8 @@ if(WITH_GPU
AND NOT WITH_ARM
AND NOT WIN32
AND NOT APPLE)
if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3
AND ${CMAKE_CUDA_COMPILER_VERSION} LESS_EQUAL 12.9)
foreach(arch ${NVCC_ARCH_BIN})
if(${arch} GREATER_EQUAL 90)
set(WITH_FLASHATTN_V3 ON)
Expand Down
30 changes: 23 additions & 7 deletions paddle/phi/core/kernel_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,26 @@ struct KernelRegistrar {
::phi::KernelArgsParseFunctor< \
decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse

// nvcc 13.x crashes in cudafe++ on the explicit instantiation form
// `template decltype(fn<T, Ctx>) fn<T, Ctx>;`. Keep macro registration intact
// by replacing it with a `used` anchor that still forces the specialization
// to be emitted without hitting the buggy syntax.
#if defined(__CUDACC__) && !defined(_WIN32) && \
defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 13)
#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(id, ...) \
static auto* const PD_CONCATENATE(__pd_kernel_instantiation_anchor_, id) \

This comment was marked as outdated.

__attribute__((used)) = &__VA_ARGS__;
#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(PD_ID, __VA_ARGS__)
#else
#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
template decltype(__VA_ARGS__) __VA_ARGS__;
#endif

// The macro for instantiating function kernel
#define FUNCTION_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, context) \
template decltype(meta_kernel_fn<cpp_dtype, context>) \
meta_kernel_fn<cpp_dtype, context>;
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION( \
meta_kernel_fn<cpp_dtype, context>)

/** PD_REGISTER_KERNEL
*
Expand Down Expand Up @@ -1368,7 +1384,7 @@ struct KernelRegistrar {
#if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
#define PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \
kernel_name, backend, layout, kernel_fn) \
template decltype(kernel_fn) kernel_fn; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn) \
static void \
__FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
const ::phi::KernelKey& kernel_key UNUSED, \
Expand All @@ -1391,7 +1407,7 @@ struct KernelRegistrar {
#ifndef _WIN32
#define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \
reg_type, kernel_name, backend, layout, kernel_fn) \
template decltype(kernel_fn) kernel_fn; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn) \
static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \
static const ::phi::KernelRegistrar \
Expand Down Expand Up @@ -1440,8 +1456,8 @@ struct KernelRegistrar {
#if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
#define PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE( \
kernel_name, layout, meta_kernel_fn) \
template decltype(meta_kernel_fn<::phi::CustomContext>) \
meta_kernel_fn<::phi::CustomContext>; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION( \
meta_kernel_fn<::phi::CustomContext>) \
static void \
__FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
const ::phi::KernelKey kernel_key UNUSED, \
Expand Down Expand Up @@ -1535,7 +1551,7 @@ struct KernelRegistrar {
#ifndef _WIN32
#define ___PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE( \
reg_type, kernel_name, backend, layout, kernel_fn, args_def_fn) \
template decltype(kernel_fn) kernel_fn; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn) \
static const ::phi::KernelRegistrar \
__reg_phi_kernel_##kernel_name##_##backend##_##layout( \
reg_type, \
Expand Down
9 changes: 7 additions & 2 deletions paddle/phi/kernels/gpu/arange_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,13 @@ void ArangeKernel(const Context& dev_ctx,
<<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}

template decltype(ArangeNullaryKernel<int64_t, GPUContext>) ArangeNullaryKernel;
template decltype(ArangeNullaryKernel<int, GPUContext>) ArangeNullaryKernel;
template void ArangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
const int64_t,
const int64_t,
const int64_t,
DenseTensor*);
template void ArangeNullaryKernel<int, GPUContext>(
const GPUContext&, const int, const int, const int, DenseTensor*);
} // namespace phi

PD_REGISTER_KERNEL(arange_tensor,
Expand Down
9 changes: 7 additions & 2 deletions paddle/phi/kernels/gpu/range_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,13 @@ void RangeKernel(const Context& dev_ctx,
<<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}

template decltype(RangeNullaryKernel<int64_t, GPUContext>) RangeNullaryKernel;
template decltype(RangeNullaryKernel<int, GPUContext>) RangeNullaryKernel;
template void RangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
const int64_t,
const int64_t,
const int64_t,
DenseTensor*);
template void RangeNullaryKernel<int, GPUContext>(
const GPUContext&, const int, const int, const int, DenseTensor*);
Comment on lines -138 to +144
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

最小复现案例,同样的 case 1,在 cuda 13.0 可以,在 cuda 13.2 会报错 internal error: assertion failed at: "types.h", line 413 in rout_type_supp

// Stand-in argument parser for the reproducer. The primary template is only
// declared; the partial specialization below is the one that is defined.
template <typename Function>
struct KernelArgsParseFunctor;

// Partial specialization chosen when Function is a function-pointer type
// `Return (*)(Args...)`. Parse is an empty stub taking two ints; CASE 5 takes
// its address as a registrar argument.
template <typename Return, typename... Args>
struct KernelArgsParseFunctor<Return (*)(Args...)> {
  static void Parse(int, int) {}
};

// Stub parameterized on a function type and a non-type template argument of
// that type. Both static members are empty; CASE 3 and CASE 5 only take their
// addresses.
template <typename Function, Function function>
struct KernelImpl {
  static void Compute() {}
  static void VariadicCompute() {}
};

// Stub registrar for the reproducer. The constructor accepts a name, two
// `void(int, int)` function pointers, a `void()` function pointer and a
// `void*`, and has an empty body.
struct KernelRegistrar {
  KernelRegistrar(const char*,
                  void (*)(int, int),
                  void (*)(int, int),
                  void (*)(),
                  void*) {}
};

// Function template that every CASE below instantiates as
// ShortKernel<float, int>. T does not appear in the parameter list; the
// parameters are (Context, int) and the body is empty.
template <typename T, typename Context>
void ShortKernel(Context, int) {}

// Each CASE selects one way of referring to ShortKernel<float, int>, chosen
// at compile time by the CASE macro.
#if CASE == 1
// Explicit instantiation whose declared type is spelled with decltype of the
// specialization itself. The report accompanying this snippet states that
// this form compiles with CUDA 13.0 but fails on CUDA 13.2 with
// `internal error: assertion failed at: "types.h", line 413 in rout_type_supp`.
template decltype(ShortKernel<float, int>) ShortKernel<float, int>;
#elif CASE == 2
// Names the function type through a decltype alias, then takes the address
// of the specialization.
using FunctionType = decltype(ShortKernel<float, int>);
FunctionType* function_ptr = &ShortKernel<float, int>;
#elif CASE == 3
// Passes &ShortKernel<float, int> as a non-type template argument to
// KernelImpl and takes the addresses of both of its static members.
using FunctionPtrType = decltype(&ShortKernel<float, int>);
static auto* compute_ptr =
    &KernelImpl<FunctionPtrType, &ShortKernel<float, int>>::Compute;
static void* variadic_ptr = reinterpret_cast<void*>(
    &KernelImpl<FunctionPtrType, &ShortKernel<float, int>>::VariadicCompute);
#elif CASE == 4
// Explicit instantiation with the signature written out instead of decltype.
template void ShortKernel<float, int>(int, int);
#elif CASE == 5
// Builds a static KernelRegistrar from the parser stub, a local args-def
// stub, and the two KernelImpl members instantiated on the kernel pointer.
static void ProbeArgsDef(int, int) {}
using RegisterFunction = decltype(&ShortKernel<float, int>);
static const KernelRegistrar probe_registrar(
    "probe",
    &KernelArgsParseFunctor<RegisterFunction>::Parse,
    &ProbeArgsDef,
    &KernelImpl<RegisterFunction, &ShortKernel<float, int>>::Compute,
    reinterpret_cast<void*>(
        &KernelImpl<RegisterFunction, &ShortKernel<float, int>>::
            VariadicCompute));
#else
// Used when CASE is none of 1-5: defines a single int and does not reference
// ShortKernel at all.
int unused = 0;
#endif

} // namespace phi

PD_REGISTER_KERNEL(range_tensor,
Expand Down
70 changes: 50 additions & 20 deletions python/setup.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,7 @@ def get_paddle_extra_install_requirements():
#(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
paddle_cuda_requires = []
paddle_tensorrt_requires = []
cuda_major_version = None

This comment was marked as outdated.

if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON':
if platform.system() == 'Linux':
PADDLE_CUDA_INSTALL_REQUIREMENTS = {
Expand Down Expand Up @@ -763,6 +764,23 @@ def get_paddle_extra_install_requirements():
"nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
"cuda-python==13.0.3; platform_system == 'Linux'"
),
"13.2": (
"nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
"nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
"nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
"nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
"nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
"nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
"nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
"nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
"cuda-python==13.2.0; platform_system == 'Linux'"
),
}
if '@WITH_CINN@' == 'ON':
PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
Expand All @@ -783,6 +801,9 @@ def get_paddle_extra_install_requirements():
PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
" | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
)
PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
" | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
)
elif platform.system() == 'Windows':
PADDLE_CUDA_INSTALL_REQUIREMENTS = {
"11.8": (
Expand Down Expand Up @@ -853,37 +874,46 @@ def get_paddle_extra_install_requirements():

if '@WITH_PIP_TENSORRT@' == 'ON':
version_str = get_tensorrt_version()
version_default = int(version_str.split(".")[0])
if platform.system() =='Linux' or (platform.system()=='Windows' and version_default>=10):

version_default = int(version_str.split(".")[0]) if version_str else None
if platform.system() == 'Linux' and cuda_major_version == '13.2':
if not version_str and platform.machine() == 'aarch64':
return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
"tensorrt-cu13==10.16.1.11",
]
elif platform.system() =='Linux' or (platform.system()=='Windows' and version_default is not None and version_default>=10):
PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
"tensorrt==8.5.3.1",
"tensorrt==8.6.0",
"tensorrt==8.6.1.post1",
"tensorrt==10.3.0",
]
else:
return paddle_cuda_requires, []

if not version_str:
return paddle_cuda_requires,[]
if not version_str:
return paddle_cuda_requires,[]

version_main = ".".join(version_str.split(".")[:3])
version_main = ".".join(version_str.split(".")[:3])

matched_package = None
for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
matched_package = None
for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])

if version_main == paddle_tensorrt_main:
matched_package = paddle_tensorrt_requires
break
if version_main == paddle_tensorrt_main:
matched_package = paddle_tensorrt_requires
break

if matched_package:
paddle_tensorrt_requires = [matched_package]
else:
print(
f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
)
return paddle_cuda_requires, []
if matched_package:
paddle_tensorrt_requires = [matched_package]
else:
print(
"No exact match found for TensorRT Version: "
f"{version_str}. We currently support TensorRT versions "
"8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
)
return paddle_cuda_requires, []

return paddle_cuda_requires,paddle_tensorrt_requires

Expand Down
Loading
Loading