Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions .devcontainer/cuda12.9-llvm22/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:26.08-cpp-llvm22-cuda12.9",
"runArgs": [
"--init",
"--name",
"${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9-llvm22"
],
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"set -euo pipefail; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/.{aws,cache,config}; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/{build,wheelhouse}; if test -z \"${localEnv:WSLENV}\"; then docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/build\" --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/wheelhouse\" --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
Comment thread
davebayer marked this conversation as resolved.
],
"postAttachCommand": [
"/bin/bash",
"-c",
"if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true",
"SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX": "cccl-preprocessor-cache",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.9-llvm22",
"DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "1",
"CCCL_CUDA_VERSION": "12.9",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "22",
"CCCL_BUILD_INFIX": "cuda12.9-llvm22",
"CCCL_CUDA_EXTENDED": "false",
"HOST_WORKSPACE": "${localWorkspaceFolder}"
},
"workspaceFolder": "/home/coder/cccl",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind",
"mounts": [
"source=/etc/timezone,target=/etc/timezone,type=bind",
"source=/etc/localtime,target=/etc/localtime,type=bind",
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind",
"source=cccl-build,target=/home/coder/cccl/build",
"source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"seaube.clangformat",
"nvidia.nsight-vscode-edition",
"ms-vscode.cmake-tools",
"timonwong.shellcheck"
],
"settings": {
"editor.defaultFormatter": "seaube.clangformat",
"editor.formatOnSave": true,
"clang-format.executable": "/usr/bin/clang-format",
"clangd.arguments": [
"--header-insertion=never",
"--compile-commands-dir=${workspaceFolder}"
],
"files.eol": "\n",
"files.trimTrailingWhitespace": true,
"shellcheck.useWorkspaceRootAsCwd": true
}
}
},
"name": "cuda12.9-llvm22"
}
73 changes: 73 additions & 0 deletions .devcontainer/cuda12.9ext-llvm22/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:26.08-cpp-llvm22-cuda12.9ext",
"runArgs": [
"--init",
"--name",
"${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9ext-llvm22"
],
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"set -euo pipefail; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/.{aws,cache,config}; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/{build,wheelhouse}; if test -z \"${localEnv:WSLENV}\"; then docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/build\" --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/wheelhouse\" --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
],
"postAttachCommand": [
"/bin/bash",
"-c",
"if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true",
"SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX": "cccl-preprocessor-cache",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.9ext-llvm22",
"DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "1",
"CCCL_CUDA_VERSION": "12.9",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "22",
"CCCL_BUILD_INFIX": "cuda12.9ext-llvm22",
"CCCL_CUDA_EXTENDED": "true",
"HOST_WORKSPACE": "${localWorkspaceFolder}"
},
"workspaceFolder": "/home/coder/cccl",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind",
"mounts": [
"source=/etc/timezone,target=/etc/timezone,type=bind",
"source=/etc/localtime,target=/etc/localtime,type=bind",
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind",
"source=cccl-build,target=/home/coder/cccl/build",
"source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"seaube.clangformat",
"nvidia.nsight-vscode-edition",
"ms-vscode.cmake-tools",
"timonwong.shellcheck"
],
"settings": {
"editor.defaultFormatter": "seaube.clangformat",
"editor.formatOnSave": true,
"clang-format.executable": "/usr/bin/clang-format",
"clangd.arguments": [
"--header-insertion=never",
"--compile-commands-dir=${workspaceFolder}"
],
"files.eol": "\n",
"files.trimTrailingWhitespace": true,
"shellcheck.useWorkspaceRootAsCwd": true
}
}
},
"name": "cuda12.9ext-llvm22"
}
14 changes: 13 additions & 1 deletion ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,9 @@ workflows:
- {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
# clang-cuda
- {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 'all', sm: '75;80;90;100;120'}
- {jobs: ['build'], cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 'all', sm: '75;80;90;100;120'}
- {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 23, sm: '75;80;90;100;120'}
- {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 23, sm: '75;80;90;100;120'}
# clang-tidy
- { jobs: ['build'], project: 'tidy', std: 'min', cxx: ['clang'], cudacxx: ['clang'], ctk: 'clang-cuda', sm: '75' }
# arch-specific and family-specific arch builds
Expand Down Expand Up @@ -324,7 +326,9 @@ workflows:
- {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
# clang-cuda
- {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 'all', sm: '75;80;90;100;120'}
- {jobs: ['build'], cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 'all', sm: '75;80;90;100;120'}
- {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 23, sm: '75;80;90;100;120'}
- {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 23, sm: '75;80;90;100;120'}
# compute-sanitizer
- {jobs: ['compute_sanitizer'], project: 'cub', std: 'max', gpu: 'rtxa6000', sm: 'gpu', cmake_options: '-DCMAKE_CUDA_FLAGS=-lineinfo'}
# clang-tidy
Expand Down Expand Up @@ -352,6 +356,8 @@ workflows:
# Clang21+CTK12.9 is currently only used for clang-cuda testing. nvcc 12.9 doesn't support clang21.
- {jobs: ['dc_ext'], ctk: [ '12.X', '13.X'], cxx: ['clang21']}
- {jobs: ['dc_ext'], ctk: [ '13.X'], cxx: ['gcc15']}
# Clang22+CTK12.9 is currently only used for clang-cuda testing. nvcc 12.9 doesn't support clang22.
- {jobs: ['dc_ext'], ctk: [ '12.X' ], cxx: ['clang_preview22']}
# 12.0 python image, pinned at gcc13 for simplicity. CTK 12.0 doesn't really play nice with gcc13, but
# that doesn't matter for running python tests.
- {jobs: ['dc'], ctk: ['12.0'], cxx: 'gcc13'}
Expand Down Expand Up @@ -388,7 +394,7 @@ all_stds: [17, 20]
# - pybuild: Selects image to use for python wheel builds' outer docker instance
ctk_versions:
12.0: { stds: [17, 20] }
12.9: { stds: [17, 20], alias: ['12.X', 'pybuild', 'clang-cuda'] }
12.9: { stds: [17, 20], alias: ['12.X', 'pybuild', 'clang-cuda', 'clang_preview-cuda'] }
13.0: { stds: [17, 20] }
13.1: { stds: [17, 20], alias: ['nvhpc-prev', 'nvhpc']}
13.3: { stds: [17, 20], alias: ['13.X'] }
Expand Down Expand Up @@ -429,6 +435,12 @@ host_compilers:
19: { stds: [17, 20] }
20: { stds: [17, 20] }
21: { stds: [17, 20], alias: 'cuda' }
clang_preview:
name: 'Clang'
container_tag: 'llvm'
exe: 'clang++'
versions:
22: { stds: [17, 20], alias: 'cuda' }
msvc:
name: 'MSVC'
container_tag: 'cl'
Expand Down
1 change: 1 addition & 0 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ struct AgentHistogramPolicy

/// Vector size for samples loading (1, 2, 4)
static constexpr int VEC_SIZE = VecSize;
static_assert(VEC_SIZE == 1 || VEC_SIZE == 2 || VEC_SIZE == 4);

/// The BlockLoad algorithm to use
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,7 @@ public:
({
histogram_policy policy{};
extract_policy_dispatch_t dispatch{policy};
MaxPolicy::Invoke(cc.get() * 10, dispatch);
_CCCL_VERIFY(MaxPolicy::Invoke(cc.get() * 10, dispatch) == cudaSuccess, "");
return policy;
}),
({ return convert_policy<typename MaxPolicy::ActivePolicy>(); }));
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/util_arch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,17 @@ struct NoScaling
# elif _CCCL_DEVICE_COMPILATION()
return ::cuda::device::current_compute_capability();
# else
// clang 22+ supports __CUDA_ARCH_LIST__ and also instantiates tuning policies inside kernels during the **host**
// pass (e.g. to compute the value for __launch_bounds__), where we rely on current_tuning_cc(), which is then passed
// to the policy selector. In the rare case that the policy selector is an adapter over a policy hub and invokes
// ChainedPolicy (e.g. test cub.test.device.histogram_custom_policy_hub.lid_0), it will fail to compile during
// constant evaluation, since it cannot find a policy for a PTX version of zero. As a workaround, we return the oldest
// CC we are compiling for during the host pass. This is only done when clang is the CUDA compiler; with any other
// CUDA compiler the host pass keeps returning a value-initialized result (`{}`).
# if _CCCL_CUDA_COMPILER(CLANG)
return ::cuda::__target_compute_capabilities().front();
# else // ^^^ _CCCL_CUDA_COMPILER(CLANG) ^^^ / vvv !_CCCL_CUDA_COMPILER(CLANG) vvv
return {};
# endif // ^^^ !_CCCL_CUDA_COMPILER(CLANG) ^^^
# endif
}

Expand Down
8 changes: 8 additions & 0 deletions cub/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,14 @@ function(

# Ensure that we test with assertions enabled
target_compile_definitions(${test_target} PRIVATE CCCL_ENABLE_ASSERTIONS)

# Disable clang-cuda >= 22 warnings regarding failed loop unrolling.
if (
"${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang"
AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL "22.0.0"
)
target_compile_options(${test_target} PRIVATE "-Wno-pass-failed")
endif()
endfunction()

# Sets out_var to launch id if the label contains launch variants
Expand Down
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/span
Original file line number Diff line number Diff line change
Expand Up @@ -618,10 +618,10 @@ _CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(const array<_Tp, _Sz>&) -> span<const _Tp,

#if _CCCL_HAS_HOST_STD_LIB()
template <class _Tp, size_t _Sz>
_CCCL_HOST_DEVICE span(::std::array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
_CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(::std::array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
Comment thread
davebayer marked this conversation as resolved.

template <class _Tp, size_t _Sz>
_CCCL_HOST_DEVICE span(const ::std::array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
_CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(const ::std::array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
#endif // _CCCL_HAS_HOST_STD_LIB()

_CCCL_TEMPLATE(class _It, class _EndOrSize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,21 @@
#include <cuda/std/concepts>
// #include <cuda/std/cstring>
#include <cuda/std/__memory_>
#include <cuda/std/type_traits>
#include <cuda/std/utility>

#include "test_macros.h"

template <class T, class Void, class... Args>
inline constexpr bool HasMakeUniqueForOverwriteImpl = false;
template <class T, class... Args>
_CCCL_CONCEPT HasMakeUniqueForOverwrite = _CCCL_REQUIRES_EXPR((T, variadic Args), T t, Args&&... args)(
(cuda::std::make_unique_for_overwrite<T>(cuda::std::forward<Args>(args)...)));
Comment thread
davebayer marked this conversation as resolved.
inline constexpr bool HasMakeUniqueForOverwriteImpl<
T,
cuda::std::void_t<decltype(cuda::std::make_unique_for_overwrite<T>(cuda::std::declval<Args>()...))>,
Args...> = true;

template <class T, class... Args>
inline constexpr bool HasMakeUniqueForOverwrite = HasMakeUniqueForOverwriteImpl<T, void, Args...>;

struct Foo
{
Expand Down
Loading