From e671b344415fb9f6d22063dc452c0e16bf21ff3b Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 25 Nov 2025 17:33:48 -0600
Subject: [PATCH 1/3] Add experimental cuda_async_pinned_memory_resource

Adds a new cuda_async_pinned_memory_resource that provides stream-ordered
pinned (page-locked) host memory allocation using CUDA 13.0's
cudaMemGetDefaultMemPool API with cudaMemAllocationTypePinned.

This parallels the cuda_async_managed_memory_resource added in #2056 and
addresses part of #2054.

Key features:
- Uses default pinned memory pool for stream-ordered allocation
- Accessible from both host and device
- Requires CUDA 13.0+ (matches managed version for consistency)

Implementation includes:
- C++ header and implementation in cuda_async_pinned_memory_resource.hpp
- Runtime capability check in runtime_capabilities.hpp
- C++ tests in cuda_async_pinned_mr_tests.cpp
- Python bindings in experimental module
- Python tests in test_cuda_async_pinned_memory_resource.py
---
 .../rmm/detail/runtime_capabilities.hpp       |  31 ++++
 .../mr/cuda_async_pinned_memory_resource.hpp  | 143 ++++++++++++++++++
 cpp/tests/mr/cuda_async_pinned_mr_tests.cpp   | 112 ++++++++++++++
 python/rmm/rmm/librmm/memory_resource.pxd     |   7 +
 python/rmm/rmm/mr/experimental.py             |   2 +
 .../pylibrmm/memory_resource/experimental.pxd |   3 +
 .../pylibrmm/memory_resource/experimental.pyi |   4 +
 .../pylibrmm/memory_resource/experimental.pyx |  35 ++++-
 .../test_cuda_async_pinned_memory_resource.py |  75 +++++++++
 python/rmm/rmm/tests/test_helpers.py          |   8 +
 10 files changed, 419 insertions(+), 1 deletion(-)
 create mode 100644 cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
 create mode 100644 cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
 create mode 100644 python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py

diff --git a/cpp/include/rmm/detail/runtime_capabilities.hpp b/cpp/include/rmm/detail/runtime_capabilities.hpp
index fe86769d5..43b90ef61 100644
--- a/cpp/include/rmm/detail/runtime_capabilities.hpp
+++ b/cpp/include/rmm/detail/runtime_capabilities.hpp
@@ -25,6 +25,11 @@ namespace detail {
  */
 #define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000
 
+/**
+ * @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support
+ */
+#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 13000
+
 /**
  * @brief Determine at runtime if the CUDA driver supports the stream-ordered
  * memory allocator functions.
@@ -146,6 +151,32 @@ struct runtime_async_managed_alloc {
   }
 };
 
+/*
+ * @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered
+ * pinned memory allocator functions.
+ *
+ * Stream-ordered pinned memory pools were introduced in CUDA 12.6 but our
+ * implementation requires features from CUDA 13.0 or higher.
+ */
+struct runtime_async_pinned_alloc {
+  static bool is_supported()
+  {
+    static auto supports_async_pinned_pool{[] {
+      // Basic pool support required
+      if (not runtime_async_alloc::is_supported()) { return false; }
+      // CUDA 13.0 or higher is required for async pinned memory pools
+      int cuda_driver_version{};
+      auto driver_result = cudaDriverGetVersion(&cuda_driver_version);
+      int cuda_runtime_version{};
+      auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version);
+      return driver_result == cudaSuccess and runtime_result == cudaSuccess and
+             cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and
+             cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION;
+    }()};
+    return supports_async_pinned_pool;
+  }
+};
+
 /**
  * @brief Check if the current device is an integrated memory system.
  *
diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
new file mode 100644
index 000000000..261b5f3ac
--- /dev/null
+++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
@@ -0,0 +1,143 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/detail/export.hpp>
+#include <rmm/detail/runtime_capabilities.hpp>
+#include <rmm/detail/thrust_namespace.h>
+#include <rmm/mr/cuda_async_view_memory_resource.hpp>
+#include <rmm/mr/device_memory_resource.hpp>
+
+#include <cuda/std/type_traits>
+#include <cuda_runtime_api.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+
+namespace RMM_NAMESPACE {
+namespace mr {
+/**
+ * @addtogroup memory_resources
+ * @{
+ * @file
+ */
+
+/**
+ * @brief `device_memory_resource` derived class that uses
+ * `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool
+ * for allocation/deallocation.
+ */
+class cuda_async_pinned_memory_resource final : public device_memory_resource {
+ public:
+  /**
+   * @brief Constructs a cuda_async_pinned_memory_resource with the default pinned memory pool for
+   * the current device.
+   *
+   * The default pinned memory pool is the pool that is created when the device is created.
+   * Pool properties such as the release threshold are not modified.
+   *
+   * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
+   * pinned memory pool
+   */
+  cuda_async_pinned_memory_resource()
+  {
+    // Check if pinned memory pools are supported
+    RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
+                "cuda_async_pinned_memory_resource requires CUDA 13.0 or higher");
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
+    cudaMemPool_t pinned_pool_handle{};
+    cudaMemLocation location{.type = cudaMemLocationTypeDevice,
+                             .id   = rmm::get_current_cuda_device().value()};
+    RMM_CUDA_TRY(
+      cudaMemGetDefaultMemPool(&pinned_pool_handle, &location, cudaMemAllocationTypePinned));
+    pool_ = cuda_async_view_memory_resource{pinned_pool_handle};
+#endif
+  }
+
+  /**
+   * @brief Returns the underlying native handle to the CUDA pool
+   *
+   * @return cudaMemPool_t Handle to the underlying CUDA pool
+   */
+  [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
+
+  ~cuda_async_pinned_memory_resource() override {}
+  cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&)            = delete;
+  cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&)                 = delete;
+  cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
+  cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&)      = delete;
+
+ private:
+  cuda_async_view_memory_resource pool_{};
+
+  /**
+   * @brief Allocates memory of size at least \p bytes.
+   *
+   * The returned pointer will have at minimum 256 byte alignment.
+   *
+   * @param bytes The size of the allocation
+   * @param stream Stream on which to perform allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    return pool_.allocate(stream, bytes);
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+   *
+   * @param ptr Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param stream Stream on which to perform deallocation
+   */
+  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
+  {
+    pool_.deallocate(stream, ptr, bytes);
+  }
+
+  /**
+   * @brief Compare this resource to another.
+   *
+   * @param other The other resource to compare to
+   * @return true If the two resources are equivalent
+   * @return false If the two resources are not equal
+   */
+  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
+  {
+    auto const* async_mr = dynamic_cast<cuda_async_pinned_memory_resource const*>(&other);
+    return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
+  }
+
+  friend auto get_property(cuda_async_pinned_memory_resource const&,
+                           cuda::mr::device_accessible) noexcept
+  {
+    return cuda::mr::device_accessible{};
+  }
+  friend auto get_property(cuda_async_pinned_memory_resource const&,
+                           cuda::mr::host_accessible) noexcept
+  {
+    return cuda::mr::host_accessible{};
+  }
+};
+
+// static property checks
+static_assert(rmm::detail::polyfill::resource<cuda_async_pinned_memory_resource>);
+static_assert(rmm::detail::polyfill::async_resource<cuda_async_pinned_memory_resource>);
+static_assert(rmm::detail::polyfill::resource_with<cuda_async_pinned_memory_resource,
+                                                   cuda::mr::host_accessible,
+                                                   cuda::mr::device_accessible>);
+static_assert(rmm::detail::polyfill::async_resource_with<cuda_async_pinned_memory_resource,
+                                                         cuda::mr::host_accessible,
+                                                         cuda::mr::device_accessible>);
+/** @} */  // end of group
+}  // namespace mr
+}  // namespace RMM_NAMESPACE
diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
new file mode 100644
index 000000000..690684d13
--- /dev/null
+++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
@@ -0,0 +1,112 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <rmm/detail/error.hpp>
+#include <rmm/detail/runtime_capabilities.hpp>
+#include <rmm/mr/cuda_async_pinned_memory_resource.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <gtest/gtest.h>
+
+namespace rmm::test {
+namespace {
+
+using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource;
+
+class AsyncPinnedMRTest : public ::testing::Test {
+ protected:
+  void SetUp() override
+  {
+    if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) {
+      GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource "
+                   << "requires CUDA 13.0 or higher and memory pool support.";
+    }
+  }
+};
+
+TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate)
+{
+  const auto alloc_size{100};
+  cuda_async_pinned_mr mr{};
+  void* ptr = mr.allocate_sync(alloc_size);
+  ASSERT_NE(nullptr, ptr);
+  mr.deallocate_sync(ptr, alloc_size);
+}
+
+TEST_F(AsyncPinnedMRTest, EqualityWithSamePool)
+{
+  // Two instances wrapping the same default pinned pool should compare equal if they
+  // ultimately refer to the same underlying pool handle. Construct two and compare.
+  cuda_async_pinned_mr mr1{};
+  cuda_async_pinned_mr mr2{};
+  EXPECT_TRUE(mr1.is_equal(mr2));
+}
+
+TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost)
+{
+  const auto alloc_size{sizeof(int) * 100};
+  cuda_async_pinned_mr mr{};
+  auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
+  ASSERT_NE(nullptr, ptr);
+
+  // Pinned memory should be accessible from host
+  // Write from host
+  EXPECT_NO_THROW({
+    for (int i = 0; i < 100; ++i) {
+      ptr[i] = i;
+    }
+  });
+
+  // Verify we can read back
+  EXPECT_EQ(ptr[0], 0);
+  EXPECT_EQ(ptr[50], 50);
+  EXPECT_EQ(ptr[99], 99);
+
+  mr.deallocate_sync(ptr, alloc_size);
+}
+
+TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible)
+{
+  const auto alloc_size{512};
+  cuda_async_pinned_mr mr{};
+
+  void* ptr1 = mr.allocate_sync(alloc_size);
+  void* ptr2 = mr.allocate_sync(alloc_size * 2);
+  void* ptr3 = mr.allocate_sync(alloc_size / 2);
+
+  ASSERT_NE(nullptr, ptr1);
+  ASSERT_NE(nullptr, ptr2);
+  ASSERT_NE(nullptr, ptr3);
+
+  // Verify all pointers are accessible from host
+  auto* typed_ptr1 = static_cast<char*>(ptr1);
+  auto* typed_ptr2 = static_cast<char*>(ptr2);
+  auto* typed_ptr3 = static_cast<char*>(ptr3);
+
+  EXPECT_NO_THROW({
+    typed_ptr1[0] = 'a';
+    typed_ptr2[0] = 'b';
+    typed_ptr3[0] = 'c';
+  });
+
+  EXPECT_EQ(typed_ptr1[0], 'a');
+  EXPECT_EQ(typed_ptr2[0], 'b');
+  EXPECT_EQ(typed_ptr3[0], 'c');
+
+  mr.deallocate_sync(ptr1, alloc_size);
+  mr.deallocate_sync(ptr2, alloc_size * 2);
+  mr.deallocate_sync(ptr3, alloc_size / 2);
+}
+
+TEST_F(AsyncPinnedMRTest, PoolHandleIsValid)
+{
+  cuda_async_pinned_mr mr{};
+  cudaMemPool_t pool_handle = mr.pool_handle();
+  EXPECT_NE(pool_handle, nullptr);
+}
+
+}  // namespace
+}  // namespace rmm::test
diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd
index 55790a475..71a7b551a 100644
--- a/python/rmm/rmm/librmm/memory_resource.pxd
+++ b/python/rmm/rmm/librmm/memory_resource.pxd
@@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \
         cuda_async_managed_memory_resource() except +
         cudaMemPool_t pool_handle() const
 
+cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+
+    cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource):
+        cuda_async_pinned_memory_resource() except +
+        cudaMemPool_t pool_handle() const
+
 cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \
         namespace \
         "rmm::mr::cuda_async_memory_resource" \
diff --git a/python/rmm/rmm/mr/experimental.py b/python/rmm/rmm/mr/experimental.py
index 1b422a3d3..857ed9d06 100644
--- a/python/rmm/rmm/mr/experimental.py
+++ b/python/rmm/rmm/mr/experimental.py
@@ -5,8 +5,10 @@
 
 from rmm.pylibrmm.memory_resource.experimental import (
     CudaAsyncManagedMemoryResource,
+    CudaAsyncPinnedMemoryResource,
 )
 
 __all__ = [
     "CudaAsyncManagedMemoryResource",
+    "CudaAsyncPinnedMemoryResource",
 ]
diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd
index 2ec0914c7..461d45ddf 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd
+++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd
@@ -7,3 +7,6 @@ from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource
 
 cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
     pass
+
+cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
+    pass
diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi
index 6c098f574..6470fa120 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi
+++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi
@@ -6,3 +6,7 @@ from rmm.pylibrmm.memory_resource._memory_resource import DeviceMemoryResource
 class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
     def __init__(self) -> None: ...
     def pool_handle(self) -> int: ...
+
+class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
+    def __init__(self) -> None: ...
+    def pool_handle(self) -> int: ...
diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx
index 18287a387..890fa0ddd 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx
+++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx
@@ -5,7 +5,10 @@
 
 from libc.stdint cimport uintptr_t
 
-from rmm.librmm.memory_resource cimport cuda_async_managed_memory_resource
+from rmm.librmm.memory_resource cimport (
+    cuda_async_managed_memory_resource,
+    cuda_async_pinned_memory_resource,
+)
 # import from the private _memory_resource to avoid a circular import
 from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource
 
@@ -38,3 +41,33 @@ cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
         cdef cuda_async_managed_memory_resource* c_mr = \
             <cuda_async_managed_memory_resource*>self.c_obj.get()
         return <uintptr_t>c_mr.pool_handle()
+
+
+cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
+    """
+    Memory resource that uses ``cudaMallocFromPoolAsync``/``cudaFreeAsync`` for
+    allocation/deallocation with a pinned memory pool.
+
+    This resource uses the default pinned memory pool for the current device.
+    Pinned memory is page-locked host memory that can be accessed from both
+    the host and device. This provides fast host-device transfers.
+
+    Requires CUDA 13.0 or higher.
+    """
+    def __cinit__(self):
+        self.c_obj.reset(
+            new cuda_async_pinned_memory_resource()
+        )
+
+    def pool_handle(self):
+        """
+        Returns the underlying CUDA memory pool handle.
+
+        Returns
+        -------
+        int
+            Handle to the underlying CUDA memory pool
+        """
+        cdef cuda_async_pinned_memory_resource* c_mr = \
+            <cuda_async_pinned_memory_resource*>self.c_obj.get()
+        return <uintptr_t>c_mr.pool_handle()
diff --git a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py
new file mode 100644
index 000000000..b08a1c9c3
--- /dev/null
+++ b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for CudaAsyncPinnedMemoryResource."""
+
+import numpy as np
+import pytest
+from test_helpers import (
+    _ASYNC_PINNED_MEMORY_SUPPORTED,
+    _allocs,
+    _dtypes,
+    _nelems,
+    array_tester,
+)
+
+import rmm
+from rmm.pylibrmm.stream import Stream
+
+
+@pytest.mark.skipif(
+    not _ASYNC_PINNED_MEMORY_SUPPORTED,
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+)
+@pytest.mark.parametrize("dtype", _dtypes)
+@pytest.mark.parametrize("nelem", _nelems)
+@pytest.mark.parametrize("alloc", _allocs)
+def test_cuda_async_pinned_memory_resource(dtype, nelem, alloc):
+    mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource()
+    rmm.mr.set_current_device_resource(mr)
+    assert rmm.mr.get_current_device_resource_type() is type(mr)
+    array_tester(dtype, nelem, alloc)
+
+
+@pytest.mark.skipif(
+    not _ASYNC_PINNED_MEMORY_SUPPORTED,
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+)
+@pytest.mark.parametrize("nelems", _nelems)
+def test_cuda_async_pinned_memory_resource_stream(nelems):
+    mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource()
+    rmm.mr.set_current_device_resource(mr)
+    stream = Stream()
+    expected = np.full(nelems, 5, dtype="u1")
+    dbuf = rmm.DeviceBuffer.to_device(expected, stream=stream)
+    result = np.asarray(dbuf.copy_to_host())
+    np.testing.assert_equal(expected, result)
+
+
+@pytest.mark.skipif(
+    not _ASYNC_PINNED_MEMORY_SUPPORTED,
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+)
+def test_cuda_async_pinned_memory_resource_pool_handle():
+    mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource()
+    pool_handle = mr.pool_handle()
+    assert isinstance(pool_handle, int)
+    assert pool_handle != 0
+
+
+@pytest.mark.skipif(
+    not _ASYNC_PINNED_MEMORY_SUPPORTED,
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+)
+def test_cuda_async_pinned_memory_resource_host_access():
+    """Test that pinned memory allocated by the resource is accessible from host."""
+    mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource()
+    rmm.mr.set_current_device_resource(mr)
+
+    # Allocate a buffer
+    expected = np.full(100, 42, dtype="i4")
+    dbuf = rmm.DeviceBuffer.to_device(expected)
+
+    # Verify host can access the data
+    result = np.asarray(dbuf.copy_to_host())
+    np.testing.assert_equal(expected, result)
diff --git a/python/rmm/rmm/tests/test_helpers.py b/python/rmm/rmm/tests/test_helpers.py
index c7a9fba57..369b91e17 100644
--- a/python/rmm/rmm/tests/test_helpers.py
+++ b/python/rmm/rmm/tests/test_helpers.py
@@ -32,6 +32,14 @@
     and rmm._cuda.gpu.runtimeGetVersion() >= 13000
 )
 
+_ASYNC_PINNED_MEMORY_SUPPORTED = (
+    rmm._cuda.gpu.getDeviceAttribute(
+        runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported,
+        rmm._cuda.gpu.getDevice(),
+    )
+    and rmm._cuda.gpu.runtimeGetVersion() >= 13000
+)
+
 _MEMORY_POOL_HANDLE_TYPES_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
     runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolSupportedHandleTypes,
     rmm._cuda.gpu.getDevice(),

From 26c2a67c5314efe0cc4b410540d827e4c9320be3 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 25 Nov 2025 17:54:06 -0600
Subject: [PATCH 2/3] Add CUDA 12.6+ support for
 cuda_async_pinned_memory_resource

Enables pinned memory pool support on CUDA 12.6+ using cudaMemPoolCreate
for CUDA 12.6-12.x and cudaMemGetDefaultMemPool for CUDA 13.0+. Uses
unique_ptr with a deleter for automatic pool cleanup.

Updates version requirements: 12.6+ for pinned.
---
 .../rmm/detail/runtime_capabilities.hpp       |  9 ++-
 .../mr/cuda_async_managed_memory_resource.hpp | 13 +++--
 .../mr/cuda_async_pinned_memory_resource.hpp  | 58 +++++++++++++++----
 cpp/tests/mr/cuda_async_pinned_mr_tests.cpp   |  2 +-
 .../test_cuda_async_pinned_memory_resource.py | 10 ++--
 python/rmm/rmm/tests/test_helpers.py          |  2 +-
 6 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/cpp/include/rmm/detail/runtime_capabilities.hpp b/cpp/include/rmm/detail/runtime_capabilities.hpp
index 43b90ef61..8b6345d28 100644
--- a/cpp/include/rmm/detail/runtime_capabilities.hpp
+++ b/cpp/include/rmm/detail/runtime_capabilities.hpp
@@ -28,7 +28,7 @@ namespace detail {
 /**
  * @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support
  */
-#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 13000
+#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 12060
 
 /**
  * @brief Determine at runtime if the CUDA driver supports the stream-ordered
@@ -151,12 +151,11 @@ struct runtime_async_managed_alloc {
   }
 };
 
-/*
+/**
  * @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered
  * pinned memory allocator functions.
  *
- * Stream-ordered pinned memory pools were introduced in CUDA 12.6 but our
- * implementation requires features from CUDA 13.0 or higher.
+ * Stream-ordered pinned memory pools were introduced in CUDA 12.6.
  */
 struct runtime_async_pinned_alloc {
   static bool is_supported()
@@ -164,7 +163,7 @@ struct runtime_async_pinned_alloc {
     static auto supports_async_pinned_pool{[] {
       // Basic pool support required
       if (not runtime_async_alloc::is_supported()) { return false; }
-      // CUDA 13.0 or higher is required for async pinned memory pools
+      // CUDA 12.6 or higher is required for async pinned memory pools
       int cuda_driver_version{};
       auto driver_result = cudaDriverGetVersion(&cuda_driver_version);
       int cuda_runtime_version{};
diff --git a/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp
index bd2baadf4..d73898b82 100644
--- a/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp
+++ b/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp
@@ -42,16 +42,21 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
    * The default managed memory pool is the pool that is created when the device is created.
    * Pool properties such as the release threshold are not modified.
    *
-   * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
-   * managed memory pool
+   * @throws rmm::logic_error if the CUDA build version is less than 13.0
+   * @throws rmm::logic_error if the CUDA runtime version does not support `cudaMallocFromPoolAsync`
+   * with managed memory pool (requires CUDA 13.0 or higher)
    */
   cuda_async_managed_memory_resource()
   {
-    // Check if managed memory pools are supported
+#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION
+    RMM_FAIL(
+      "cuda_async_managed_memory_resource requires CUDA 13.0 or higher. "
+      "This build was compiled with an older CUDA version.");
+#else
+    // Check if managed memory pools are supported at runtime
     RMM_EXPECTS(rmm::detail::runtime_async_managed_alloc::is_supported(),
                 "cuda_async_managed_memory_resource requires CUDA 13.0 or higher");
 
-#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION
     cudaMemPool_t managed_pool_handle{};
     cudaMemLocation location{.type = cudaMemLocationTypeDevice,
                              .id   = rmm::get_current_cuda_device().value()};
diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
index 261b5f3ac..b7adeea96 100644
--- a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
+++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
@@ -36,28 +36,47 @@ namespace mr {
 class cuda_async_pinned_memory_resource final : public device_memory_resource {
  public:
   /**
-   * @brief Constructs a cuda_async_pinned_memory_resource with the default pinned memory pool for
+   * @brief Constructs a cuda_async_pinned_memory_resource with a pinned memory pool for
    * the current device.
    *
-   * The default pinned memory pool is the pool that is created when the device is created.
+   * On CUDA 12.6-12.x, creates a new pinned memory pool using cudaMemPoolCreate.
+   * On CUDA 13.0+, uses the default pinned memory pool via cudaMemGetDefaultMemPool.
+   *
    * Pool properties such as the release threshold are not modified.
    *
-   * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
-   * pinned memory pool
+   * @throws rmm::logic_error if the CUDA build version is less than 12.6
+   * @throws rmm::logic_error if the CUDA runtime version does not support pinned memory pools
+   * (requires CUDA 12.6 or higher)
    */
   cuda_async_pinned_memory_resource()
   {
-    // Check if pinned memory pools are supported
+#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
+    RMM_FAIL(
+      "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher. "
+      "This build was compiled with an older CUDA version.");
+#else
+    // Check if pinned memory pools are supported at runtime
     RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
-                "cuda_async_pinned_memory_resource requires CUDA 13.0 or higher");
+                "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime");
+
+    pool_handle_.reset(new cudaMemPool_t{});
 
-#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
-    cudaMemPool_t pinned_pool_handle{};
+#if CUDA_VERSION >= 13000
+    // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed)
     cudaMemLocation location{.type = cudaMemLocationTypeDevice,
                              .id   = rmm::get_current_cuda_device().value()};
     RMM_CUDA_TRY(
-      cudaMemGetDefaultMemPool(&pinned_pool_handle, &location, cudaMemAllocationTypePinned));
-    pool_ = cuda_async_view_memory_resource{pinned_pool_handle};
+      cudaMemGetDefaultMemPool(pool_handle_.get(), &location, cudaMemAllocationTypePinned));
+#else
+    // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup)
+    cudaMemPoolProps pool_props{};
+    pool_props.allocType     = cudaMemAllocationTypePinned;
+    pool_props.location.type = cudaMemLocationTypeDevice;
+    pool_props.location.id   = rmm::get_current_cuda_device().value();
+    RMM_CUDA_TRY(cudaMemPoolCreate(pool_handle_.get(), &pool_props));
+#endif
+
+    pool_ = cuda_async_view_memory_resource{*pool_handle_};
 #endif
   }
 
@@ -68,13 +87,30 @@ class cuda_async_pinned_memory_resource final : public device_memory_resource {
    */
   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
 
-  ~cuda_async_pinned_memory_resource() override {}
+  ~cuda_async_pinned_memory_resource() override                                          = default;
   cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&)            = delete;
   cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&)                 = delete;
   cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
   cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&)      = delete;
 
  private:
+  // Inline deleter: cleanup on CUDA 12.6-12.x, no-op on CUDA 13.0+
+  struct pool_deleter {
+    void operator()(cudaMemPool_t* pool) const
+    {
+      if (pool != nullptr && *pool != nullptr) {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \
+  CUDA_VERSION < 13000
+        // CUDA 12.6-12.x: Destroy the pool we created
+        cudaMemPoolDestroy(*pool);  // Ignore errors during destruction
+#endif
+        // CUDA 13.0+: Do nothing (using default pool managed by CUDA runtime)
+      }
+      delete pool;
+    }
+  };
+
+  std::unique_ptr<cudaMemPool_t, pool_deleter> pool_handle_;
   cuda_async_view_memory_resource pool_{};
 
   /**
diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
index 690684d13..4362e8725 100644
--- a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
+++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
@@ -22,7 +22,7 @@ class AsyncPinnedMRTest : public ::testing::Test {
   {
     if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) {
       GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource "
-                   << "requires CUDA 13.0 or higher and memory pool support.";
+                   << "requires CUDA 12.6 or higher and memory pool support.";
     }
   }
 };
diff --git a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py
index b08a1c9c3..0bb40b16f 100644
--- a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py
+++ b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py
@@ -19,7 +19,7 @@
 
 @pytest.mark.skipif(
     not _ASYNC_PINNED_MEMORY_SUPPORTED,
-    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+",
 )
 @pytest.mark.parametrize("dtype", _dtypes)
 @pytest.mark.parametrize("nelem", _nelems)
@@ -33,7 +33,7 @@ def test_cuda_async_pinned_memory_resource(dtype, nelem, alloc):
 
 @pytest.mark.skipif(
     not _ASYNC_PINNED_MEMORY_SUPPORTED,
-    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+",
 )
 @pytest.mark.parametrize("nelems", _nelems)
 def test_cuda_async_pinned_memory_resource_stream(nelems):
@@ -48,7 +48,7 @@ def test_cuda_async_pinned_memory_resource_stream(nelems):
 
 @pytest.mark.skipif(
     not _ASYNC_PINNED_MEMORY_SUPPORTED,
-    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+",
 )
 def test_cuda_async_pinned_memory_resource_pool_handle():
     mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource()
@@ -59,7 +59,7 @@ def test_cuda_async_pinned_memory_resource_pool_handle():
 
 @pytest.mark.skipif(
     not _ASYNC_PINNED_MEMORY_SUPPORTED,
-    reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+",
+    reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+",
 )
 def test_cuda_async_pinned_memory_resource_host_access():
     """Test that pinned memory allocated by the resource is accessible from host."""
@@ -67,7 +67,7 @@ def test_cuda_async_pinned_memory_resource_host_access():
     rmm.mr.set_current_device_resource(mr)
 
     # Allocate a buffer
-    expected = np.full(100, 42, dtype="i4")
+    expected = np.full(100, 42, dtype="u1")
     dbuf = rmm.DeviceBuffer.to_device(expected)
 
     # Verify host can access the data
diff --git a/python/rmm/rmm/tests/test_helpers.py b/python/rmm/rmm/tests/test_helpers.py
index 369b91e17..c9041bcf7 100644
--- a/python/rmm/rmm/tests/test_helpers.py
+++ b/python/rmm/rmm/tests/test_helpers.py
@@ -37,7 +37,7 @@
         runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported,
         rmm._cuda.gpu.getDevice(),
     )
-    and rmm._cuda.gpu.runtimeGetVersion() >= 13000
+    and rmm._cuda.gpu.runtimeGetVersion() >= 12060
 )
 
 _MEMORY_POOL_HANDLE_TYPES_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(

From 837dd551606b8fa3805527b51af9764d1697436c Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 1 Dec 2025 15:13:53 -0600
Subject: [PATCH 3/3] Review feedback

---
 .../mr/cuda_async_pinned_memory_resource.hpp  | 53 +++++++++----------
 cpp/tests/mr/cuda_async_pinned_mr_tests.cpp   | 28 ++++++++++
 2 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
index b7adeea96..b2f632c0d 100644
--- a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
+++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
@@ -17,8 +17,6 @@
 #include <cuda_runtime_api.h>
 
 #include <cstddef>
-#include <cstdint>
-#include <optional>
 
 namespace RMM_NAMESPACE {
 namespace mr {
@@ -59,24 +57,30 @@ class cuda_async_pinned_memory_resource final : public device_memory_resource {
     RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
                 "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime");
 
-    pool_handle_.reset(new cudaMemPool_t{});
+    // Use host location for pinned memory pool (id is ignored for cudaMemLocationTypeHost)
+    cudaMemLocation location{.type = cudaMemLocationTypeHost, .id = 0};
 
 #if CUDA_VERSION >= 13000
     // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed)
-    cudaMemLocation location{.type = cudaMemLocationTypeDevice,
-                             .id   = rmm::get_current_cuda_device().value()};
-    RMM_CUDA_TRY(
-      cudaMemGetDefaultMemPool(pool_handle_.get(), &location, cudaMemAllocationTypePinned));
+    RMM_CUDA_TRY(cudaMemGetDefaultMemPool(&pool_handle_, &location, cudaMemAllocationTypePinned));
 #else
     // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup)
     cudaMemPoolProps pool_props{};
     pool_props.allocType     = cudaMemAllocationTypePinned;
-    pool_props.location.type = cudaMemLocationTypeDevice;
-    pool_props.location.id   = rmm::get_current_cuda_device().value();
-    RMM_CUDA_TRY(cudaMemPoolCreate(pool_handle_.get(), &pool_props));
+    pool_props.location.type = cudaMemLocationTypeHost;
+    pool_props.location.id   = 0;
+    RMM_CUDA_TRY(cudaMemPoolCreate(&pool_handle_, &pool_props));
+    owns_pool_ = true;
 #endif
 
-    pool_ = cuda_async_view_memory_resource{*pool_handle_};
+    // Enable device access to the pinned memory pool
+    cudaMemAccessDesc desc{};
+    desc.location.type = cudaMemLocationTypeDevice;
+    desc.location.id   = rmm::get_current_cuda_device().value();
+    desc.flags         = cudaMemAccessFlagsProtReadWrite;
+    RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle_, &desc, 1));
+
+    pool_ = cuda_async_view_memory_resource{pool_handle_};
 #endif
   }
 
@@ -87,30 +91,21 @@ class cuda_async_pinned_memory_resource final : public device_memory_resource {
    */
   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
 
-  ~cuda_async_pinned_memory_resource() override                                          = default;
+  ~cuda_async_pinned_memory_resource() override
+  {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \
+  CUDA_VERSION < 13000
+    if (owns_pool_ && pool_handle_ != nullptr) { cudaMemPoolDestroy(pool_handle_); }
+#endif
+  }
   cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&)            = delete;
   cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&)                 = delete;
   cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
   cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&)      = delete;
 
  private:
-  // Inline deleter: cleanup on CUDA 12.6-12.x, no-op on CUDA 13.0+
-  struct pool_deleter {
-    void operator()(cudaMemPool_t* pool) const
-    {
-      if (pool != nullptr && *pool != nullptr) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \
-  CUDA_VERSION < 13000
-        // CUDA 12.6-12.x: Destroy the pool we created
-        cudaMemPoolDestroy(*pool);  // Ignore errors during destruction
-#endif
-        // CUDA 13.0+: Do nothing (using default pool managed by CUDA runtime)
-      }
-      delete pool;
-    }
-  };
-
-  std::unique_ptr<cudaMemPool_t, pool_deleter> pool_handle_;
+  cudaMemPool_t pool_handle_{};
+  bool owns_pool_{false};
   cuda_async_view_memory_resource pool_{};
 
   /**
diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
index 4362e8725..f5c75fd2c 100644
--- a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
+++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
@@ -108,5 +108,33 @@ TEST_F(AsyncPinnedMRTest, PoolHandleIsValid)
   EXPECT_NE(pool_handle, nullptr);
 }
 
+TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromDevice)
+{
+  const auto alloc_size{sizeof(int) * 100};
+  cuda_async_pinned_mr mr{};
+  auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
+  ASSERT_NE(nullptr, ptr);
+
+  // Initialize from host
+  for (int i = 0; i < 100; ++i) {
+    ptr[i] = i;
+  }
+
+  // Allocate device memory and copy from pinned -> device -> back to verify device access
+  int* d_ptr{};
+  EXPECT_EQ(cudaMalloc(&d_ptr, alloc_size), cudaSuccess);
+  EXPECT_EQ(cudaMemcpy(d_ptr, ptr, alloc_size, cudaMemcpyDefault), cudaSuccess);
+
+  int result[100];
+  EXPECT_EQ(cudaMemcpy(result, d_ptr, alloc_size, cudaMemcpyDefault), cudaSuccess);
+
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_EQ(result[i], i);
+  }
+
+  cudaFree(d_ptr);
+  mr.deallocate_sync(ptr, alloc_size);
+}
+
 }  // namespace
 }  // namespace rmm::test