-
Notifications
You must be signed in to change notification settings - Fork 6k
【Hackathon 10th Spring No.51】Environment Adaptation support Paddle on CUDA 13.2 #78720
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8f87fdb
40c7e79
234835a
4c1032b
f57a196
d430605
d70ff89
dcda460
225a3ee
a308b5d
b2b5a8f
0678a56
5232181
a823383
c729efa
230dcc6
40ecfce
607d54d
14e8ecc
ab055a2
06697b2
b562e19
aa3920f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,32 +11,37 @@ if(WITH_NV_JETSON) | |
| set(paddle_known_gpu_archs10 "53 62 72") | ||
| set(paddle_known_gpu_archs11 "53 62 72 87") | ||
| set(paddle_known_gpu_archs12 "53 62 72 87 90 100") | ||
| set(paddle_known_gpu_archs13 "87 90 100") | ||
| elseif(NEW_RELEASE_ALL) | ||
| message("Using New Release Strategy - All Arches Package") | ||
| add_definitions(-DNEW_RELEASE_ALL) | ||
| set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100") | ||
| set(paddle_known_gpu_archs10 "50 52 60 61 70 75") | ||
| set(paddle_known_gpu_archs11 "50 60 61 70 75 80") | ||
| set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90 100") | ||
| set(paddle_known_gpu_archs13 "75 80 86 90 100") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 当前 head 仍未补齐 CUDA 13.2 的 release/Blackwell arch 策略:这里的 CUDA 13 release arch 仍只有 `75 80 86 90 100`。 |
||
| elseif(NEW_RELEASE_PYPI) | ||
| message("Using New Release Strategy - Cubin Package") | ||
| add_definitions(-DNEW_RELEASE_PYPI) | ||
| set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100") | ||
| set(paddle_known_gpu_archs10 "") | ||
| set(paddle_known_gpu_archs11 "61 70 75 80") | ||
| set(paddle_known_gpu_archs12 "61 70 75 80 90 100") | ||
| set(paddle_known_gpu_archs13 "75 80 86 90 100") | ||
| elseif(NEW_RELEASE_JIT) | ||
| message("Using New Release Strategy - JIT Package") | ||
| add_definitions(-DNEW_RELEASE_JIT) | ||
| set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100") | ||
| set(paddle_known_gpu_archs10 "50 60 70 75") | ||
| set(paddle_known_gpu_archs11 "50 60 70 75 80") | ||
| set(paddle_known_gpu_archs12 "50 60 70 75 80 90 100") | ||
| set(paddle_known_gpu_archs13 "75 80 86 90 100") | ||
| else() | ||
| set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90 100") | ||
| set(paddle_known_gpu_archs10 "50 52 60 61 70 75") | ||
| set(paddle_known_gpu_archs11 "52 60 61 70 75 80") | ||
| set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90 100") | ||
| set(paddle_known_gpu_archs13 "75 80 86 90 100") | ||
| endif() | ||
|
|
||
| ###################################################################################### | ||
|
|
@@ -289,6 +294,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+ | |
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") | ||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") | ||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") | ||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 14.0) # CUDA 13.0+ | ||
| set(paddle_known_gpu_archs ${paddle_known_gpu_archs13}) | ||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") | ||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") | ||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") | ||
| endif() | ||
|
|
||
| # Fix ARM NEON conflict with CUDA on aarch64 platforms. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,8 +135,13 @@ void RangeKernel(const Context& dev_ctx, | |
| <<<grid, block, 0, stream>>>(start_value, step_value, size, out_data); | ||
| } | ||
|
|
||
| template decltype(RangeNullaryKernel<int64_t, GPUContext>) RangeNullaryKernel; | ||
| template decltype(RangeNullaryKernel<int, GPUContext>) RangeNullaryKernel; | ||
| template void RangeNullaryKernel<int64_t, GPUContext>(const GPUContext&, | ||
| const int64_t, | ||
| const int64_t, | ||
| const int64_t, | ||
| DenseTensor*); | ||
| template void RangeNullaryKernel<int, GPUContext>( | ||
| const GPUContext&, const int, const int, const int, DenseTensor*); | ||
|
Comment on lines -138 to +144
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
最小复现案例:同样的 case 1,在 CUDA 13.0 下可以编译通过,在 CUDA 13.2 下会报错。
template <typename Function>
struct KernelArgsParseFunctor;
// Partial specialization of KernelArgsParseFunctor for function-pointer types
// of the form `Return (*)(Args...)`.
// It exposes a single static no-op, Parse(int, int); the CASE == 5 variant
// further down passes the address of this member to KernelRegistrar.
template <typename Return, typename... Args>
struct KernelArgsParseFunctor<Return (*)(Args...)> {
static void Parse(int, int) {}
};
// Class template parameterized on a type `Function` and a value `function` of
// that type (a non-type template parameter).
// Both static members are empty stubs; the CASE == 3 and CASE == 5 variants
// only take their addresses and never call them.
template <typename Function, Function function>
struct KernelImpl {
static void Compute() {}
static void VariadicCompute() {}
};
// Stand-in registrar type for the repro.
// The constructor takes a C string, two `void (*)(int, int)` function
// pointers, one `void (*)()` function pointer and an opaque `void*`; all
// parameters are unnamed and the body is empty, so every argument is ignored.
struct KernelRegistrar {
KernelRegistrar(const char*,
void (*)(int, int),
void (*)(int, int),
void (*)(),
void*) {}
};
// Function template that CASE variants 1-5 instantiate or take the address of.
// T does not appear in the parameter list, so it cannot be deduced and is
// always supplied explicitly (the variants use ShortKernel<float, int>).
// The body is empty.
template <typename T, typename Context>
void ShortKernel(Context, int) {}
// Each CASE value selects one way of referring to ShortKernel<float, int>.
// The accompanying review comment reports that CASE == 1 compiles with
// CUDA 13.0 but fails with CUDA 13.2; the outcome of the other variants is
// not stated there.
#if CASE == 1
// Explicit instantiation whose declared type is spelled with decltype of the
// function specialization instead of a written-out signature.
template decltype(ShortKernel<float, int>) ShortKernel<float, int>;
#elif CASE == 2
// Names the function type through decltype, then initializes a pointer of
// that type with the address of the specialization.
using FunctionType = decltype(ShortKernel<float, int>);
FunctionType* function_ptr = &ShortKernel<float, int>;
#elif CASE == 3
// Uses the function pointer as a non-type template argument of KernelImpl and
// takes the addresses of both static members.
using FunctionPtrType = decltype(&ShortKernel<float, int>);
static auto* compute_ptr =
&KernelImpl<FunctionPtrType, &ShortKernel<float, int>>::Compute;
static void* variadic_ptr = reinterpret_cast<void*>(
&KernelImpl<FunctionPtrType, &ShortKernel<float, int>>::VariadicCompute);
#elif CASE == 4
// Explicit instantiation with the full signature written out (no decltype).
template void ShortKernel<float, int>(int, int);
#elif CASE == 5
// Builds a KernelRegistrar from the helper templates above.
// NOTE(review): presumably mirrors what the kernel registration macro expands
// to; confirm against the macro definition.
static void ProbeArgsDef(int, int) {}
using RegisterFunction = decltype(&ShortKernel<float, int>);
static const KernelRegistrar probe_registrar(
"probe",
&KernelArgsParseFunctor<RegisterFunction>::Parse,
&ProbeArgsDef,
&KernelImpl<RegisterFunction, &ShortKernel<float, int>>::Compute,
reinterpret_cast<void*>(
&KernelImpl<RegisterFunction, &ShortKernel<float, int>>::
VariadicCompute));
#else
// Fallback when CASE matches none of the values above: declares only a
// placeholder variable and does not reference ShortKernel.
int unused = 0;
#endif
|
||
| } // namespace phi | ||
|
|
||
| PD_REGISTER_KERNEL(range_tensor, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Review from @codex (GPT-5.5 xhigh).
这里的 CUDA 13 默认 arch 集合只有 `75 80 86 90 100`,后面的 `CUDA_ARCH_NAME=Blackwell` 也仍只映射到 `100`。CUDA 13.2 nvcc 官方支持列表已经包含 `sm_103`/`sm_110`/`sm_120`/`sm_121` 等 targets;如果 CUDA 13.2 release wheel 仍用 `CUDA_ARCH_NAME=All`,这些 targets 默认不会被编进 wheel。建议补齐 CUDA 13 的 release arch 策略,或明确用 PTX/JIT 覆盖这些新架构。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这个 @risemeup1 @swgu98 确认下,感觉至少 103 得加一下
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Follow-up:当前 head `b562e19d8d4432387a58fa8fa901debfb6fe6d5c` 是 empty commit,CUDA 13 arch 配置没有变化。`cmake/cuda.cmake:22` 仍只有 `75 80 86 90 100`,`cmake/cuda.cmake:195` 的 `CUDA_ARCH_NAME=Blackwell` 仍只映射到 `100`,因此 CUDA 13.2 release/Blackwell 默认包仍不会覆盖前面指出的新增 Blackwell targets。请补齐 CUDA 13.2 release arch 策略,或明确加入 PTX/JIT 覆盖策略。