diff --git a/.devcontainer/cuda12.9-llvm22/devcontainer.json b/.devcontainer/cuda12.9-llvm22/devcontainer.json new file mode 100644 index 00000000000..3ea6540066a --- /dev/null +++ b/.devcontainer/cuda12.9-llvm22/devcontainer.json @@ -0,0 +1,73 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:26.08-cpp-llvm22-cuda12.9", + "runArgs": [ + "--init", + "--name", + "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9-llvm22" + ], + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "set -euo pipefail; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/.{aws,cache,config}; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/{build,wheelhouse}; if test -z \"${localEnv:WSLENV}\"; then docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/build\" --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/wheelhouse\" --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . 
devcontainer-utils-post-attach-command; fi" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true", + "SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX": "cccl-preprocessor-cache", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.9-llvm22", + "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "1", + "CCCL_CUDA_VERSION": "12.9", + "CCCL_HOST_COMPILER": "llvm", + "CCCL_HOST_COMPILER_VERSION": "22", + "CCCL_BUILD_INFIX": "cuda12.9-llvm22", + "CCCL_CUDA_EXTENDED": "false", + "HOST_WORKSPACE": "${localWorkspaceFolder}" + }, + "workspaceFolder": "/home/coder/cccl", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind", + "mounts": [ + "source=/etc/timezone,target=/etc/timezone,type=bind", + "source=/etc/localtime,target=/etc/localtime,type=bind", + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind", + "source=cccl-build,target=/home/coder/cccl/build", + "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "seaube.clangformat", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools", + "timonwong.shellcheck" + ], + "settings": { + "editor.defaultFormatter": "seaube.clangformat", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--header-insertion=never", + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true, + "shellcheck.useWorkspaceRootAsCwd": true + } + } + }, + "name": "cuda12.9-llvm22" +} diff --git 
a/.devcontainer/cuda12.9ext-llvm22/devcontainer.json b/.devcontainer/cuda12.9ext-llvm22/devcontainer.json new file mode 100644 index 00000000000..63d84ba149e --- /dev/null +++ b/.devcontainer/cuda12.9ext-llvm22/devcontainer.json @@ -0,0 +1,73 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:26.08-cpp-llvm22-cuda12.9ext", + "runArgs": [ + "--init", + "--name", + "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9ext-llvm22" + ], + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "set -euo pipefail; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/.{aws,cache,config}; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/{build,wheelhouse}; if test -z \"${localEnv:WSLENV}\"; then docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/build\" --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/wheelhouse\" --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . 
devcontainer-utils-post-attach-command; fi" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true", + "SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX": "cccl-preprocessor-cache", + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.9ext-llvm22", + "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "1", + "CCCL_CUDA_VERSION": "12.9", + "CCCL_HOST_COMPILER": "llvm", + "CCCL_HOST_COMPILER_VERSION": "22", + "CCCL_BUILD_INFIX": "cuda12.9ext-llvm22", + "CCCL_CUDA_EXTENDED": "true", + "HOST_WORKSPACE": "${localWorkspaceFolder}" + }, + "workspaceFolder": "/home/coder/cccl", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind", + "mounts": [ + "source=/etc/timezone,target=/etc/timezone,type=bind", + "source=/etc/localtime,target=/etc/localtime,type=bind", + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind", + "source=cccl-build,target=/home/coder/cccl/build", + "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "seaube.clangformat", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools", + "timonwong.shellcheck" + ], + "settings": { + "editor.defaultFormatter": "seaube.clangformat", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--header-insertion=never", + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true, + "shellcheck.useWorkspaceRootAsCwd": true + } + } + }, + "name": "cuda12.9ext-llvm22" +} diff --git a/ci/matrix.yaml 
b/ci/matrix.yaml index 75086f6dd7c..e786c15c6cc 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -231,7 +231,9 @@ workflows: - {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']} # clang-cuda - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 'all', sm: '75;80;90;100;120'} + - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 'all', sm: '75;80;90;100;120'} - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 23, sm: '75;80;90;100;120'} + - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 23, sm: '75;80;90;100;120'} # clang-tidy - { jobs: ['build'], project: 'tidy', std: 'min', cxx: ['clang'], cudacxx: ['clang'], ctk: 'clang-cuda', sm: '75' } # arch-specific and family-specific arch builds @@ -324,7 +326,9 @@ workflows: - {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']} # clang-cuda - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 'all', sm: '75;80;90;100;120'} + - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 'all', sm: '75;80;90;100;120'} - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 23, sm: '75;80;90;100;120'} + - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 23, sm: '75;80;90;100;120'} # compute-sanitizer - {jobs: ['compute_sanitizer'], project: 'cub', std: 'max', gpu: 'rtxa6000', sm: 'gpu', cmake_options: '-DCMAKE_CUDA_FLAGS=-lineinfo'} # clang-tidy @@ -352,6 +356,8 @@ workflows: # Clang21+CTK12.9 is currently only used for cuda-clang testing. 
nvcc 12.9 doesn't support clang21. - {jobs: ['dc_ext'], ctk: [ '12.X', '13.X'], cxx: ['clang21']} - {jobs: ['dc_ext'], ctk: [ '13.X'], cxx: ['gcc15']} + # Clang22+CTK12.9 is currently only used for clang-cuda testing. nvcc 12.9 doesn't support clang22. + - {jobs: ['dc_ext'], ctk: [ '12.X' ], cxx: ['clang_preview22']} # 12.0 python image, pinned at gcc13 for simplicity. CTK 12.0 doesn't really play nice with gcc13, but # that doesn't matter for running python tests. - {jobs: ['dc'], ctk: ['12.0'], cxx: 'gcc13'} @@ -388,7 +394,7 @@ all_stds: [17, 20] # - pybuild: Selects image to use for python wheel builds' outer docker instance ctk_versions: 12.0: { stds: [17, 20] } - 12.9: { stds: [17, 20], alias: ['12.X', 'pybuild', 'clang-cuda'] } + 12.9: { stds: [17, 20], alias: ['12.X', 'pybuild', 'clang-cuda', 'clang_preview-cuda'] } 13.0: { stds: [17, 20] } 13.1: { stds: [17, 20], alias: ['nvhpc-prev', 'nvhpc']} 13.3: { stds: [17, 20], alias: ['13.X'] } @@ -429,6 +435,12 @@ host_compilers: 19: { stds: [17, 20] } 20: { stds: [17, 20] } 21: { stds: [17, 20], alias: 'cuda' } + clang_preview: + name: 'Clang' + container_tag: 'llvm' + exe: 'clang++' + versions: + 22: { stds: [17, 20], alias: 'cuda' } msvc: name: 'MSVC' container_tag: 'cl' diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index b3cca60e673..d09b8cba00d 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -105,6 +105,7 @@ struct AgentHistogramPolicy /// Vector size for samples loading (1, 2, 4) static constexpr int VEC_SIZE = VecSize; + static_assert(VEC_SIZE == 1 || VEC_SIZE == 2 || VEC_SIZE == 4); ///< The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 2eb8f2a1a77..40282f29fe0 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ 
b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -778,7 +778,7 @@ public: ({ histogram_policy policy{}; extract_policy_dispatch_t dispatch{policy}; - MaxPolicy::Invoke(cc.get() * 10, dispatch); + _CCCL_VERIFY(MaxPolicy::Invoke(cc.get() * 10, dispatch) == cudaSuccess, ""); return policy; }), ({ return convert_policy(); })); diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index 9c915aea2bc..c5b38bb3cca 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -187,7 +187,17 @@ struct NoScaling # elif _CCCL_DEVICE_COMPILATION() return ::cuda::device::current_compute_capability(); # else + // clang 22+ supports __CUDA_ARCH_LIST__ and also instantiates tuning policies inside kernels during the **host** + // pass (e.g. to compute the value for __launch_bounds__), where we rely on current_tuning_cc(), which is then passed + // to the policy selector. In the rare case that the policy selector is an adapter over a policy hub and invokes + // ChainedPolicy (e.g. test cub.test.device.histogram_custom_policy_hub.lid_0), it will fail to compile during + // constant evaluation, since it cannot find a policy for a PTX version of zero. As a workaround, we return the oldest + // CC we are compiling for during the host pass. This is only done for clang-cuda; other compilers still return {}. +# if _CCCL_CUDA_COMPILER(CLANG) + return ::cuda::__target_compute_capabilities().front(); +# else // ^^^ _CCCL_CUDA_COMPILER(CLANG) ^^^ / vvv !_CCCL_CUDA_COMPILER(CLANG) vvv return {}; +# endif // ^^^ !_CCCL_CUDA_COMPILER(CLANG) ^^^ # endif } diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 6c7af3c8a4b..a428367f079 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -274,6 +274,14 @@ function( # Ensure that we test with assertions enabled target_compile_definitions(${test_target} PRIVATE CCCL_ENABLE_ASSERTIONS) + + # Disable clang-cuda >= 22 warnings regarding failed loop unrolling.
+ if ( + "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang" + AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL "22.0.0" + ) + target_compile_options(${test_target} PRIVATE "-Wno-pass-failed") + endif() endfunction() # Sets out_var to launch id if the label contains launch variants diff --git a/libcudacxx/include/cuda/std/span b/libcudacxx/include/cuda/std/span index 46459bf526d..ba4c33ed0f1 100644 --- a/libcudacxx/include/cuda/std/span +++ b/libcudacxx/include/cuda/std/span @@ -618,10 +618,10 @@ _CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(const array<_Tp, _Sz>&) -> span -_CCCL_HOST_DEVICE span(::std::array<_Tp, _Sz>&) -> span<_Tp, _Sz>; +_CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(::std::array<_Tp, _Sz>&) -> span<_Tp, _Sz>; template -_CCCL_HOST_DEVICE span(const ::std::array<_Tp, _Sz>&) -> span; +_CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(const ::std::array<_Tp, _Sz>&) -> span; #endif // _CCCL_HAS_HOST_STD_LIB() _CCCL_TEMPLATE(class _It, class _EndOrSize) diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.create/make_unique_for_overwrite.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.create/make_unique_for_overwrite.pass.cpp index 5392203e496..99e4a16bbdb 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.create/make_unique_for_overwrite.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.create/make_unique_for_overwrite.pass.cpp @@ -24,13 +24,21 @@ #include // #include #include +#include #include #include "test_macros.h" +template +inline constexpr bool HasMakeUniqueForOverwriteImpl = false; template -_CCCL_CONCEPT HasMakeUniqueForOverwrite = _CCCL_REQUIRES_EXPR((T, variadic Args), T t, Args&&... 
args)( - (cuda::std::make_unique_for_overwrite(cuda::std::forward(args)...))); +inline constexpr bool HasMakeUniqueForOverwriteImpl< + T, + cuda::std::void_t(cuda::std::declval()...))>, + Args...> = true; + +template +inline constexpr bool HasMakeUniqueForOverwrite = HasMakeUniqueForOverwriteImpl; struct Foo {