rapidsai · bdice · Nov 25, 2025 · Nov 25, 2025 · Dec 1, 2025 · nirandaperera
@@ -25,6 +25,11 @@ namespace detail {
  */
 #define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000
 
+/**
+ * @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support
+ */
+#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 12060
+
 /**
  * @brief Determine at runtime if the CUDA driver supports the stream-ordered
  * memory allocator functions.
@@ -146,6 +151,31 @@ struct runtime_async_managed_alloc {
   }
 };
 
+/**
+ * @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered
+ * pinned memory allocator functions.
+ *
+ * Stream-ordered pinned memory pools were introduced in CUDA 12.6.
+ */
+struct runtime_async_pinned_alloc {
+  static bool is_supported()
+  {
+    static auto supports_async_pinned_pool{[] {
+      // Basic pool support required
+      if (not runtime_async_alloc::is_supported()) { return false; }
+      // CUDA 12.6 or higher is required for async pinned memory pools
+      int cuda_driver_version{};
+      auto driver_result = cudaDriverGetVersion(&cuda_driver_version);
+      int cuda_runtime_version{};
+      auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version);
+      return driver_result == cudaSuccess and runtime_result == cudaSuccess and
+             cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and
+             cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION;
+    }()};
+    return supports_async_pinned_pool;
+  }
+};
+
 /**
  * @brief Check if the current device is an integrated memory system.
  *

@@ -42,16 +42,21 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
    * The default managed memory pool is the pool that is created when the device is created.
    * Pool properties such as the release threshold are not modified.
    *
-   * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
-   * managed memory pool
+   * @throws rmm::logic_error if the CUDA build version is less than 13.0
+   * @throws rmm::logic_error if the CUDA runtime version does not support `cudaMallocFromPoolAsync`
+   * with managed memory pool (requires CUDA 13.0 or higher)
    */
   cuda_async_managed_memory_resource()
   {
-    // Check if managed memory pools are supported
+#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION
+    RMM_FAIL(
+      "cuda_async_managed_memory_resource requires CUDA 13.0 or higher. "
+      "This build was compiled with an older CUDA version.");
+#else
+    // Check if managed memory pools are supported at runtime
     RMM_EXPECTS(rmm::detail::runtime_async_managed_alloc::is_supported(),
                 "cuda_async_managed_memory_resource requires CUDA 13.0 or higher");
 
-#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION
     cudaMemPool_t managed_pool_handle{};
     cudaMemLocation location{.type = cudaMemLocationTypeDevice,
                              .id   = rmm::get_current_cuda_device().value()};

@@ -0,0 +1,174 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/detail/export.hpp>
+#include <rmm/detail/runtime_capabilities.hpp>
+#include <rmm/detail/thrust_namespace.h>
+#include <rmm/mr/cuda_async_view_memory_resource.hpp>
+#include <rmm/mr/device_memory_resource.hpp>
+
+#include <cuda/std/type_traits>
+#include <cuda_runtime_api.h>
+
+#include <cstddef>
+
+namespace RMM_NAMESPACE {
+namespace mr {
+/**
+ * @addtogroup memory_resources
+ * @{
+ * @file
+ */
+
+/**
+ * @brief `device_memory_resource` derived class that uses
+ * `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool
+ * for allocation/deallocation.
+ */
+class cuda_async_pinned_memory_resource final : public device_memory_resource {
+ public:
+  /**
+   * @brief Constructs a cuda_async_pinned_memory_resource with a pinned memory pool for
+   * the current device.
+   *
+   * On CUDA 12.6-12.x, creates a new pinned memory pool using cudaMemPoolCreate.
+   * On CUDA 13.0+, uses the default pinned memory pool via cudaMemGetDefaultMemPool.
+   *
+   * Pool properties such as the release threshold are not modified.
+   *
+   * @throws rmm::logic_error if the CUDA build version is less than 12.6
+   * @throws rmm::logic_error if the CUDA runtime version does not support pinned memory pools
+   * (requires CUDA 12.6 or higher)
+   */
+  cuda_async_pinned_memory_resource()
+  {
+#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
+    RMM_FAIL(
+      "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher. "
+      "This build was compiled with an older CUDA version.");
+#else
+    // Check if pinned memory pools are supported at runtime
+    RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
+                "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime");
+
+    // Use host location for pinned memory pool (id is ignored for cudaMemLocationTypeHost)
+    cudaMemLocation location{.type = cudaMemLocationTypeHost, .id = 0};
+
+#if CUDA_VERSION >= 13000
+    // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed)
+    RMM_CUDA_TRY(cudaMemGetDefaultMemPool(&pool_handle_, &location, cudaMemAllocationTypePinned));
+#else
+    // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup)
+    cudaMemPoolProps pool_props{};
+    pool_props.allocType     = cudaMemAllocationTypePinned;
+    pool_props.location.type = cudaMemLocationTypeHost;
+    pool_props.location.id   = 0;
+    RMM_CUDA_TRY(cudaMemPoolCreate(&pool_handle_, &pool_props));
+    owns_pool_ = true;
+#endif
+
+    // Enable device access to the pinned memory pool
+    cudaMemAccessDesc desc{};
+    desc.location.type = cudaMemLocationTypeDevice;
+    desc.location.id   = rmm::get_current_cuda_device().value();
+    desc.flags         = cudaMemAccessFlagsProtReadWrite;
+    RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle_, &desc, 1));
+
+    pool_ = cuda_async_view_memory_resource{pool_handle_};
+#endif
+  }
+
+  /**
+   * @brief Returns the underlying native handle to the CUDA pool
+   *
+   * @return cudaMemPool_t Handle to the underlying CUDA pool
+   */
+  [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
+
+  ~cuda_async_pinned_memory_resource() override
+  {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \
+  CUDA_VERSION < 13000
+    if (owns_pool_ && pool_handle_ != nullptr) { cudaMemPoolDestroy(pool_handle_); }
+#endif
+  }
+  cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&)            = delete;
+  cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&)                 = delete;
+  cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
+  cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&)      = delete;
+
+ private:
+  cudaMemPool_t pool_handle_{};
+  bool owns_pool_{false};
+  cuda_async_view_memory_resource pool_{};
+
+  /**
+   * @brief Allocates memory of size at least \p bytes.
+   *
+   * The returned pointer will have at minimum 256 byte alignment.
+   *
+   * @param bytes The size of the allocation
+   * @param stream Stream on which to perform allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    return pool_.allocate(stream, bytes);
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+   *
+   * @param ptr Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param stream Stream on which to perform deallocation
+   */
+  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
+  {
+    pool_.deallocate(stream, ptr, bytes);
+  }
+
+  /**
+   * @brief Compare this resource to another.
+   *
+   * @param other The other resource to compare to
+   * @return true If the two resources are equivalent
+   * @return false If the two resources are not equal
+   */
+  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
+  {
+    auto const* async_mr = dynamic_cast<cuda_async_pinned_memory_resource const*>(&other);
+    return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
+  }
+
+  friend auto get_property(cuda_async_pinned_memory_resource const&,
+                           cuda::mr::device_accessible) noexcept
+  {
+    return cuda::mr::device_accessible{};
+  }
+  friend auto get_property(cuda_async_pinned_memory_resource const&,
+                           cuda::mr::host_accessible) noexcept
+  {
+    return cuda::mr::host_accessible{};
+  }
+};
+
+// static property checks
+static_assert(rmm::detail::polyfill::resource<cuda_async_pinned_memory_resource>);
+static_assert(rmm::detail::polyfill::async_resource<cuda_async_pinned_memory_resource>);
+static_assert(rmm::detail::polyfill::resource_with<cuda_async_pinned_memory_resource,
+                                                   cuda::mr::host_accessible,
+                                                   cuda::mr::device_accessible>);
+static_assert(rmm::detail::polyfill::async_resource_with<cuda_async_pinned_memory_resource,
+                                                         cuda::mr::host_accessible,
+                                                         cuda::mr::device_accessible>);
+/** @} */  // end of group
+}  // namespace mr
+}  // namespace RMM_NAMESPACE
@@ -0,0 +1,140 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <rmm/detail/error.hpp>
+#include <rmm/detail/runtime_capabilities.hpp>
+#include <rmm/mr/cuda_async_pinned_memory_resource.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <gtest/gtest.h>
+
+namespace rmm::test {
+namespace {
+
+using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource;
+
+class AsyncPinnedMRTest : public ::testing::Test {
+ protected:
+  void SetUp() override
+  {
+    if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) {
+      GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource "
+                   << "requires CUDA 12.6 or higher and memory pool support.";
+    }
+  }
+};
+
+TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate)
+{
+  const auto alloc_size{100};
+  cuda_async_pinned_mr mr{};
+  void* ptr = mr.allocate_sync(alloc_size);
+  ASSERT_NE(nullptr, ptr);
+  mr.deallocate_sync(ptr, alloc_size);
+}
+
+TEST_F(AsyncPinnedMRTest, EqualityWithSamePool)
+{
+  // Two instances wrapping the same default pinned pool should compare equal if they
+  // ultimately refer to the same underlying pool handle. Construct two and compare.
+  cuda_async_pinned_mr mr1{};
+  cuda_async_pinned_mr mr2{};
+  EXPECT_TRUE(mr1.is_equal(mr2));
+}
+
+TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost)
+{
+  const auto alloc_size{sizeof(int) * 100};
+  cuda_async_pinned_mr mr{};
+  auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
+  ASSERT_NE(nullptr, ptr);
+
+  // Pinned memory should be accessible from host
+  // Write from host
+  EXPECT_NO_THROW({
+    for (int i = 0; i < 100; ++i) {
+      ptr[i] = i;
+    }
+  });
+
+  // Verify we can read back
+  EXPECT_EQ(ptr[0], 0);
+  EXPECT_EQ(ptr[50], 50);
+  EXPECT_EQ(ptr[99], 99);
+
+  mr.deallocate_sync(ptr, alloc_size);
+}
+
+TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible)
+{
+  const auto alloc_size{512};
+  cuda_async_pinned_mr mr{};
+
+  void* ptr1 = mr.allocate_sync(alloc_size);
+  void* ptr2 = mr.allocate_sync(alloc_size * 2);
+  void* ptr3 = mr.allocate_sync(alloc_size / 2);
+
+  ASSERT_NE(nullptr, ptr1);
+  ASSERT_NE(nullptr, ptr2);
+  ASSERT_NE(nullptr, ptr3);
+
+  // Verify all pointers are accessible from host
+  auto* typed_ptr1 = static_cast<char*>(ptr1);
+  auto* typed_ptr2 = static_cast<char*>(ptr2);
+  auto* typed_ptr3 = static_cast<char*>(ptr3);
+
+  EXPECT_NO_THROW({
+    typed_ptr1[0] = 'a';
+    typed_ptr2[0] = 'b';
+    typed_ptr3[0] = 'c';
+  });
+
+  EXPECT_EQ(typed_ptr1[0], 'a');
+  EXPECT_EQ(typed_ptr2[0], 'b');
+  EXPECT_EQ(typed_ptr3[0], 'c');
+
+  mr.deallocate_sync(ptr1, alloc_size);
+  mr.deallocate_sync(ptr2, alloc_size * 2);
+  mr.deallocate_sync(ptr3, alloc_size / 2);
+}
+
+TEST_F(AsyncPinnedMRTest, PoolHandleIsValid)
+{
+  cuda_async_pinned_mr mr{};
+  cudaMemPool_t pool_handle = mr.pool_handle();
+  EXPECT_NE(pool_handle, nullptr);
+}
+
+TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromDevice)
+{
+  const auto alloc_size{sizeof(int) * 100};
+  cuda_async_pinned_mr mr{};
+  auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
+  ASSERT_NE(nullptr, ptr);
+
+  // Initialize from host
+  for (int i = 0; i < 100; ++i) {
+    ptr[i] = i;
+  }
+
+  // Allocate device memory and copy from pinned -> device -> back to verify device access
+  int* d_ptr{};
+  EXPECT_EQ(cudaMalloc(&d_ptr, alloc_size), cudaSuccess);
+  EXPECT_EQ(cudaMemcpy(d_ptr, ptr, alloc_size, cudaMemcpyDefault), cudaSuccess);
+
+  int result[100];
+  EXPECT_EQ(cudaMemcpy(result, d_ptr, alloc_size, cudaMemcpyDefault), cudaSuccess);
+
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_EQ(result[i], i);
+  }
+
+  cudaFree(d_ptr);
+  mr.deallocate_sync(ptr, alloc_size);
+}
+
+}  // namespace
+}  // namespace rmm::test
@@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \
         cuda_async_managed_memory_resource() except +
         cudaMemPool_t pool_handle() const
 
+cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+
+    cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource):
+        cuda_async_pinned_memory_resource() except +
+        cudaMemPool_t pool_handle() const
+
 cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \
         namespace \
         "rmm::mr::cuda_async_memory_resource" \