NVIDIA · davebayer · Jun 1, 2026
@@ -0,0 +1,73 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:26.08-cpp-llvm22-cuda12.9",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9-llvm22"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "set -euo pipefail; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/.{aws,cache,config}; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/{build,wheelhouse}; if test -z \"${localEnv:WSLENV}\"; then docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/build\" --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/wheelhouse\" --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true",
+    "SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX": "cccl-preprocessor-cache",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.9-llvm22",
+    "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "1",
+    "CCCL_CUDA_VERSION": "12.9",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "22",
+    "CCCL_BUILD_INFIX": "cuda12.9-llvm22",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind",
+  "mounts": [
+    "source=/etc/timezone,target=/etc/timezone,type=bind",
+    "source=/etc/localtime,target=/etc/localtime,type=bind",
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools",
+        "timonwong.shellcheck"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true,
+        "shellcheck.useWorkspaceRootAsCwd": true
+      }
+    }
+  },
+  "name": "cuda12.9-llvm22"
+}
@@ -0,0 +1,73 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:26.08-cpp-llvm22-cuda12.9ext",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9ext-llvm22"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "set -euo pipefail; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/.{aws,cache,config}; mkdir -m 0755 -p \"${localWorkspaceFolder}\"/{build,wheelhouse}; if test -z \"${localEnv:WSLENV}\"; then docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/build\" --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt \"device=${localWorkspaceFolder}/wheelhouse\" --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true",
+    "SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX": "cccl-preprocessor-cache",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.9ext-llvm22",
+    "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "1",
+    "CCCL_CUDA_VERSION": "12.9",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "22",
+    "CCCL_BUILD_INFIX": "cuda12.9ext-llvm22",
+    "CCCL_CUDA_EXTENDED": "true",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind",
+  "mounts": [
+    "source=/etc/timezone,target=/etc/timezone,type=bind",
+    "source=/etc/localtime,target=/etc/localtime,type=bind",
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools",
+        "timonwong.shellcheck"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true,
+        "shellcheck.useWorkspaceRootAsCwd": true
+      }
+    }
+  },
+  "name": "cuda12.9ext-llvm22"
+}
@@ -231,7 +231,9 @@ workflows:
     - {jobs: ['build'], cxx: 'nvhpc',      ctk: 'nvhpc',      std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
     # clang-cuda
     - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 'all', sm: '75;80;90;100;120'}
+    - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 'all', sm: '75;80;90;100;120'}
     - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda',  std: 23, sm: '75;80;90;100;120'}
+    - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 23, sm: '75;80;90;100;120'}
     # clang-tidy
     - { jobs: ['build'], project: 'tidy', std: 'min', cxx: ['clang'], cudacxx: ['clang'], ctk: 'clang-cuda', sm: '75' }
     # arch-specific and family-specific arch builds
@@ -324,7 +326,9 @@ workflows:
     - {jobs: ['build'], cxx: 'nvhpc',      ctk: 'nvhpc',      std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
     # clang-cuda
     - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda', std: 'all', sm: '75;80;90;100;120'}
+    - {jobs: ['build'], cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 'all', sm: '75;80;90;100;120'}
     - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang-cuda', cxx: 'clang-cuda',  std: 23, sm: '75;80;90;100;120'}
+    - {jobs: ['build'], project: 'libcudacxx', cudacxx: 'clang', ctk: 'clang_preview-cuda', cxx: 'clang_preview-cuda', std: 23, sm: '75;80;90;100;120'}
     # compute-sanitizer
     - {jobs: ['compute_sanitizer'], project: 'cub', std: 'max', gpu: 'rtxa6000', sm: 'gpu', cmake_options: '-DCMAKE_CUDA_FLAGS=-lineinfo'}
     # clang-tidy
@@ -352,6 +356,8 @@ workflows:
     # Clang21+CTK12.9 is currently only used for cuda-clang testing. nvcc 12.9 doesn't support clang21.
     - {jobs: ['dc_ext'], ctk: [        '12.X',         '13.X'], cxx: ['clang21']}
     - {jobs: ['dc_ext'], ctk: [                        '13.X'], cxx: ['gcc15']}
+    # Clang22+CTK12.9 is currently only used for clang-cuda testing. nvcc 12.9 doesn't support clang22.
+    - {jobs: ['dc_ext'], ctk: [        '12.X'                ], cxx: ['clang_preview22']}
     # 12.0 python image, pinned at gcc13 for simplicity. CTK 12.0 doesn't really play nice with gcc13, but
     # that doesn't matter for running python tests.
     - {jobs: ['dc'], ctk: ['12.0'], cxx: 'gcc13'}
@@ -388,7 +394,7 @@ all_stds: [17, 20]
 # - pybuild: Selects image to use for python wheel builds' outer docker instance
 ctk_versions:
   12.0: { stds: [17, 20] }
-  12.9: { stds: [17, 20], alias: ['12.X', 'pybuild', 'clang-cuda'] }
+  12.9: { stds: [17, 20], alias: ['12.X', 'pybuild', 'clang-cuda', 'clang_preview-cuda'] }
   13.0: { stds: [17, 20] }
   13.1: { stds: [17, 20], alias: ['nvhpc-prev', 'nvhpc']}
   13.3: { stds: [17, 20], alias: ['13.X'] }
@@ -429,6 +435,12 @@ host_compilers:
       19: { stds: [17, 20] }
       20: { stds: [17, 20] }
       21: { stds: [17, 20], alias: 'cuda' }
+  clang_preview:
+    name: 'Clang'
+    container_tag: 'llvm'
+    exe: 'clang++'
+    versions:
+      22: { stds: [17, 20], alias: 'cuda' }
   msvc:
     name: 'MSVC'
     container_tag: 'cl'

@@ -105,6 +105,7 @@ struct AgentHistogramPolicy
 
   /// Vector size for samples loading (1, 2, 4)
   static constexpr int VEC_SIZE = VecSize;
+  static_assert(VEC_SIZE == 1 || VEC_SIZE == 2 || VEC_SIZE == 4);
 
   ///< The BlockLoad algorithm to use
   static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;

@@ -778,7 +778,7 @@ public:
                       ({
                         histogram_policy policy{};
                         extract_policy_dispatch_t dispatch{policy};
-                        MaxPolicy::Invoke(cc.get() * 10, dispatch);
+                        _CCCL_VERIFY(MaxPolicy::Invoke(cc.get() * 10, dispatch) == cudaSuccess, "");
                         return policy;
                       }),
                       ({ return convert_policy<typename MaxPolicy::ActivePolicy>(); }));

@@ -187,7 +187,17 @@ struct NoScaling
 #  elif _CCCL_DEVICE_COMPILATION()
   return ::cuda::device::current_compute_capability();
 #  else
+  // clang 22+ supports __CUDA_ARCH_LIST__ and also instantiates tuning policies inside kernels during the **host**
+  // pass (e.g. to compute the value for __launch_bounds__), where we rely on current_tuning_cc(), which is then passed
+  // to the policy selector. In the rare case that the policy selector is an adapter over a policy hub and invokes
+  // ChainedPolicy (e.g. test cub.test.device.histogram_custom_policy_hub.lid_0), it will fail to compile during
+  // constant evaluation, since it cannot find a policy for a PTX version of zero. As a workaround, we return the oldest
+  // CC we are compiling for during the host pass. For consistency, we do so for all clang-cuda versions, not only 22+;
+  // other CUDA compilers keep returning `{}`.
+#    if _CCCL_CUDA_COMPILER(CLANG)
+  return ::cuda::__target_compute_capabilities().front();
+#    else // ^^^ _CCCL_CUDA_COMPILER(CLANG) ^^^ / vvv !_CCCL_CUDA_COMPILER(CLANG) vvv
   return {};
+#    endif // ^^^ !_CCCL_CUDA_COMPILER(CLANG) ^^^
 #  endif
 }
 

@@ -274,6 +274,14 @@ function(
 
   # Ensure that we test with assertions enabled
   target_compile_definitions(${test_target} PRIVATE CCCL_ENABLE_ASSERTIONS)
+
+  # Disable clang-cuda >= 22 warnings regarding failed loop unrolling.
+  if (
+    "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang"
+    AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL "22.0.0"
+  )
+    target_compile_options(${test_target} PRIVATE "-Wno-pass-failed")
+  endif()
 endfunction()
 
 # Sets out_var to launch id if the label contains launch variants

@@ -618,10 +618,10 @@ _CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(const array<_Tp, _Sz>&) -> span<const _Tp,
 
 #if _CCCL_HAS_HOST_STD_LIB()
 template <class _Tp, size_t _Sz>
-_CCCL_HOST_DEVICE span(::std::array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
+_CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(::std::array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
 
 template <class _Tp, size_t _Sz>
-_CCCL_HOST_DEVICE span(const ::std::array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
+_CCCL_DEDUCTION_GUIDE_ATTRIBUTES span(const ::std::array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
 #endif // _CCCL_HAS_HOST_STD_LIB()
 
 _CCCL_TEMPLATE(class _It, class _EndOrSize)

@@ -24,13 +24,21 @@
 #include <cuda/std/concepts>
 // #include <cuda/std/cstring>
 #include <cuda/std/__memory_>
+#include <cuda/std/type_traits>
 #include <cuda/std/utility>
 
 #include "test_macros.h"
 
+template <class T, class Void, class... Args>
+inline constexpr bool HasMakeUniqueForOverwriteImpl = false;
 template <class T, class... Args>
-_CCCL_CONCEPT HasMakeUniqueForOverwrite = _CCCL_REQUIRES_EXPR((T, variadic Args), T t, Args&&... args)(
-  (cuda::std::make_unique_for_overwrite<T>(cuda::std::forward<Args>(args)...)));
+inline constexpr bool HasMakeUniqueForOverwriteImpl<
+  T,
+  cuda::std::void_t<decltype(cuda::std::make_unique_for_overwrite<T>(cuda::std::declval<Args>()...))>,
+  Args...> = true;
+
+template <class T, class... Args>
+inline constexpr bool HasMakeUniqueForOverwrite = HasMakeUniqueForOverwriteImpl<T, void, Args...>;
 
 struct Foo
 {