From e671b344415fb9f6d22063dc452c0e16bf21ff3b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 25 Nov 2025 17:33:48 -0600 Subject: [PATCH 1/3] Add experimental cuda_async_pinned_memory_resource Adds a new cuda_async_pinned_memory_resource that provides stream-ordered pinned (page-locked) host memory allocation using CUDA 13.0's cudaMemGetDefaultMemPool API with cudaMemAllocationTypePinned. This parallels the cuda_async_managed_memory_resource added in #2056 and addresses part of #2054. Key features: - Uses default pinned memory pool for stream-ordered allocation - Accessible from both host and device - Requires CUDA 13.0+ (matches managed version for consistency) Implementation includes: - C++ header and implementation in cuda_async_pinned_memory_resource.hpp - Runtime capability check in runtime_capabilities.hpp - C++ tests in cuda_async_pinned_mr_tests.cpp - Python bindings in experimental module - Python tests in test_cuda_async_pinned_memory_resource.py --- .../rmm/detail/runtime_capabilities.hpp | 31 ++++ .../mr/cuda_async_pinned_memory_resource.hpp | 143 ++++++++++++++++++ cpp/tests/mr/cuda_async_pinned_mr_tests.cpp | 112 ++++++++++++++ python/rmm/rmm/librmm/memory_resource.pxd | 7 + python/rmm/rmm/mr/experimental.py | 2 + .../pylibrmm/memory_resource/experimental.pxd | 3 + .../pylibrmm/memory_resource/experimental.pyi | 4 + .../pylibrmm/memory_resource/experimental.pyx | 35 ++++- .../test_cuda_async_pinned_memory_resource.py | 75 +++++++++ python/rmm/rmm/tests/test_helpers.py | 8 + 10 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp create mode 100644 cpp/tests/mr/cuda_async_pinned_mr_tests.cpp create mode 100644 python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py diff --git a/cpp/include/rmm/detail/runtime_capabilities.hpp b/cpp/include/rmm/detail/runtime_capabilities.hpp index fe86769d5..43b90ef61 100644 --- a/cpp/include/rmm/detail/runtime_capabilities.hpp +++ b/cpp/include/rmm/detail/runtime_capabilities.hpp @@ -25,6 +25,11 @@ namespace detail { */ #define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000 +/** + * @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support + */ +#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 13000 + /** * @brief Determine at runtime if the CUDA driver supports the stream-ordered * memory allocator functions. @@ -146,6 +151,32 @@ struct runtime_async_managed_alloc { } }; +/* + * @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered + * pinned memory allocator functions. + * + * Stream-ordered pinned memory pools were introduced in CUDA 12.6 but our + * implementation requires features from CUDA 13.0 or higher. + */ +struct runtime_async_pinned_alloc { + static bool is_supported() + { + static auto supports_async_pinned_pool{[] { + // Basic pool support required + if (not runtime_async_alloc::is_supported()) { return false; } + // CUDA 13.0 or higher is required for async pinned memory pools + int cuda_driver_version{}; + auto driver_result = cudaDriverGetVersion(&cuda_driver_version); + int cuda_runtime_version{}; + auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version); + return driver_result == cudaSuccess and runtime_result == cudaSuccess and + cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and + cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION; + }()}; + return supports_async_pinned_pool; + } +}; + /** * @brief Check if the current device is an integrated memory system. * diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp new file mode 100644 index 000000000..261b5f3ac --- /dev/null +++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp @@ -0,0 +1,143 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace RMM_NAMESPACE { +namespace mr { +/** + * @addtogroup memory_resources + * @{ + * @file + */ + +/** + * @brief `device_memory_resource` derived class that uses + * `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool + * for allocation/deallocation. + */ +class cuda_async_pinned_memory_resource final : public device_memory_resource { + public: + /** + * @brief Constructs a cuda_async_pinned_memory_resource with the default pinned memory pool for + * the current device. + * + * The default pinned memory pool is the pool that is created when the device is created. + * Pool properties such as the release threshold are not modified. + * + * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with + * pinned memory pool + */ + cuda_async_pinned_memory_resource() + { + // Check if pinned memory pools are supported + RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(), + "cuda_async_pinned_memory_resource requires CUDA 13.0 or higher"); + +#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION + cudaMemPool_t pinned_pool_handle{}; + cudaMemLocation location{.type = cudaMemLocationTypeDevice, + .id = rmm::get_current_cuda_device().value()}; + RMM_CUDA_TRY( + cudaMemGetDefaultMemPool(&pinned_pool_handle, &location, cudaMemAllocationTypePinned)); + pool_ = cuda_async_view_memory_resource{pinned_pool_handle}; +#endif + } + + /** + * @brief Returns the underlying native handle to the CUDA pool + * + * @return cudaMemPool_t Handle to the underlying CUDA pool + */ + [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } + + ~cuda_async_pinned_memory_resource() override {} + cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete; + cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete; + cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete; + cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete; + + private: + cuda_async_view_memory_resource pool_{}; + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + { + return pool_.allocate(stream, bytes); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform deallocation + */ + void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override + { + pool_.deallocate(stream, ptr, bytes); + } + + /** + * @brief Compare this resource to another. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + auto const* async_mr = dynamic_cast(&other); + return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle()); + } + + friend auto get_property(cuda_async_pinned_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + return cuda::mr::device_accessible{}; + } + friend auto get_property(cuda_async_pinned_memory_resource const&, + cuda::mr::host_accessible) noexcept + { + return cuda::mr::host_accessible{}; + } +}; + +// static property checks +static_assert(rmm::detail::polyfill::resource); +static_assert(rmm::detail::polyfill::async_resource); +static_assert(rmm::detail::polyfill::resource_with); +static_assert(rmm::detail::polyfill::async_resource_with); +/** @} */ // end of group +} // namespace mr +} // namespace RMM_NAMESPACE diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp new file mode 100644 index 000000000..690684d13 --- /dev/null +++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp @@ -0,0 +1,112 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include + +#include + +namespace rmm::test { +namespace { + +using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource; + +class AsyncPinnedMRTest : public ::testing::Test { + protected: + void SetUp() override + { + if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) { + GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource " + << "requires CUDA 13.0 or higher and memory pool support."; + } + } +}; + +TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate) +{ + const auto alloc_size{100}; + cuda_async_pinned_mr mr{}; + void* ptr = mr.allocate_sync(alloc_size); + ASSERT_NE(nullptr, ptr); + mr.deallocate_sync(ptr, alloc_size); +} + +TEST_F(AsyncPinnedMRTest, EqualityWithSamePool) +{ + // Two instances wrapping the same default pinned pool should compare equal if they + // ultimately refer to the same underlying pool handle. Construct two and compare. + cuda_async_pinned_mr mr1{}; + cuda_async_pinned_mr mr2{}; + EXPECT_TRUE(mr1.is_equal(mr2)); +} + +TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost) +{ + const auto alloc_size{sizeof(int) * 100}; + cuda_async_pinned_mr mr{}; + auto* ptr = static_cast(mr.allocate_sync(alloc_size)); + ASSERT_NE(nullptr, ptr); + + // Pinned memory should be accessible from host + // Write from host + EXPECT_NO_THROW({ + for (int i = 0; i < 100; ++i) { + ptr[i] = i; + } + }); + + // Verify we can read back + EXPECT_EQ(ptr[0], 0); + EXPECT_EQ(ptr[50], 50); + EXPECT_EQ(ptr[99], 99); + + mr.deallocate_sync(ptr, alloc_size); +} + +TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible) +{ + const auto alloc_size{512}; + cuda_async_pinned_mr mr{}; + + void* ptr1 = mr.allocate_sync(alloc_size); + void* ptr2 = mr.allocate_sync(alloc_size * 2); + void* ptr3 = mr.allocate_sync(alloc_size / 2); + + ASSERT_NE(nullptr, ptr1); + ASSERT_NE(nullptr, ptr2); + ASSERT_NE(nullptr, ptr3); + + // Verify all pointers are accessible from host + auto* typed_ptr1 = static_cast(ptr1); + auto* typed_ptr2 = static_cast(ptr2); + auto* typed_ptr3 = static_cast(ptr3); + + EXPECT_NO_THROW({ + typed_ptr1[0] = 'a'; + typed_ptr2[0] = 'b'; + typed_ptr3[0] = 'c'; + }); + + EXPECT_EQ(typed_ptr1[0], 'a'); + EXPECT_EQ(typed_ptr2[0], 'b'); + EXPECT_EQ(typed_ptr3[0], 'c'); + + mr.deallocate_sync(ptr1, alloc_size); + mr.deallocate_sync(ptr2, alloc_size * 2); + mr.deallocate_sync(ptr3, alloc_size / 2); +} + +TEST_F(AsyncPinnedMRTest, PoolHandleIsValid) +{ + cuda_async_pinned_mr mr{}; + cudaMemPool_t pool_handle = mr.pool_handle(); + EXPECT_NE(pool_handle, nullptr); +} + +} // namespace +} // namespace rmm::test diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 55790a475..71a7b551a 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \ cuda_async_managed_memory_resource() except + cudaMemPool_t pool_handle() const +cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + + cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource): + cuda_async_pinned_memory_resource() except + + cudaMemPool_t pool_handle() const + cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \ namespace \ "rmm::mr::cuda_async_memory_resource" \ diff --git a/python/rmm/rmm/mr/experimental.py b/python/rmm/rmm/mr/experimental.py index 1b422a3d3..857ed9d06 100644 --- a/python/rmm/rmm/mr/experimental.py +++ b/python/rmm/rmm/mr/experimental.py @@ -5,8 +5,10 @@ from rmm.pylibrmm.memory_resource.experimental import ( CudaAsyncManagedMemoryResource, + CudaAsyncPinnedMemoryResource, ) __all__ = [ "CudaAsyncManagedMemoryResource", + "CudaAsyncPinnedMemoryResource", ] diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd index 2ec0914c7..461d45ddf 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd +++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd @@ -7,3 +7,6 @@ from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource): pass + +cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource): + pass diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi index 6c098f574..6470fa120 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi +++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi @@ -6,3 +6,7 @@ from rmm.pylibrmm.memory_resource._memory_resource import DeviceMemoryResource class CudaAsyncManagedMemoryResource(DeviceMemoryResource): def __init__(self) -> None: ... def pool_handle(self) -> int: ... + +class CudaAsyncPinnedMemoryResource(DeviceMemoryResource): + def __init__(self) -> None: ... + def pool_handle(self) -> int: ... diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx index 18287a387..890fa0ddd 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx @@ -5,7 +5,10 @@ from libc.stdint cimport uintptr_t -from rmm.librmm.memory_resource cimport cuda_async_managed_memory_resource +from rmm.librmm.memory_resource cimport ( + cuda_async_managed_memory_resource, + cuda_async_pinned_memory_resource, +) # import from the private _memory_resource to avoid a circular import from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource @@ -38,3 +41,33 @@ cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource): cdef cuda_async_managed_memory_resource* c_mr = \ self.c_obj.get() return c_mr.pool_handle() + + +cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource): + """ + Memory resource that uses ``cudaMallocFromPoolAsync``/``cudaFreeAsync`` for + allocation/deallocation with a pinned memory pool. + + This resource uses the default pinned memory pool for the current device. + Pinned memory is page-locked host memory that can be accessed from both + the host and device. This provides fast host-device transfers. + + Requires CUDA 13.0 or higher. + """ + def __cinit__(self): + self.c_obj.reset( + new cuda_async_pinned_memory_resource() + ) + + def pool_handle(self): + """ + Returns the underlying CUDA memory pool handle. + + Returns + ------- + int + Handle to the underlying CUDA memory pool + """ + cdef cuda_async_pinned_memory_resource* c_mr = \ + self.c_obj.get() + return c_mr.pool_handle() diff --git a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py new file mode 100644 index 000000000..b08a1c9c3 --- /dev/null +++ b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for CudaAsyncPinnedMemoryResource.""" + +import numpy as np +import pytest +from test_helpers import ( + _ASYNC_PINNED_MEMORY_SUPPORTED, + _allocs, + _dtypes, + _nelems, + array_tester, +) + +import rmm +from rmm.pylibrmm.stream import Stream + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", +) +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +def test_cuda_async_pinned_memory_resource(dtype, nelem, alloc): + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", +) +@pytest.mark.parametrize("nelems", _nelems) +def test_cuda_async_pinned_memory_resource_stream(nelems): + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + rmm.mr.set_current_device_resource(mr) + stream = Stream() + expected = np.full(nelems, 5, dtype="u1") + dbuf = rmm.DeviceBuffer.to_device(expected, stream=stream) + result = np.asarray(dbuf.copy_to_host()) + np.testing.assert_equal(expected, result) + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", +) +def test_cuda_async_pinned_memory_resource_pool_handle(): + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + pool_handle = mr.pool_handle() + assert isinstance(pool_handle, int) + assert pool_handle != 0 + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", +) +def test_cuda_async_pinned_memory_resource_host_access(): + """Test that pinned memory allocated by the resource is accessible from host.""" + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Allocate a buffer + expected = np.full(100, 42, dtype="i4") + dbuf = rmm.DeviceBuffer.to_device(expected) + + # Verify host can access the data + result = np.asarray(dbuf.copy_to_host()) + np.testing.assert_equal(expected, result) diff --git a/python/rmm/rmm/tests/test_helpers.py b/python/rmm/rmm/tests/test_helpers.py index c7a9fba57..369b91e17 100644 --- a/python/rmm/rmm/tests/test_helpers.py +++ b/python/rmm/rmm/tests/test_helpers.py @@ -32,6 +32,14 @@ and rmm._cuda.gpu.runtimeGetVersion() >= 13000 ) +_ASYNC_PINNED_MEMORY_SUPPORTED = ( + rmm._cuda.gpu.getDeviceAttribute( + runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, + rmm._cuda.gpu.getDevice(), + ) + and rmm._cuda.gpu.runtimeGetVersion() >= 13000 +) + _MEMORY_POOL_HANDLE_TYPES_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolSupportedHandleTypes, rmm._cuda.gpu.getDevice(), From 26c2a67c5314efe0cc4b410540d827e4c9320be3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 25 Nov 2025 17:54:06 -0600 Subject: [PATCH 2/3] Add CUDA 12.6+ support for cuda_async_pinned_memory_resource Enables pinned memory pool support on CUDA 12.6+ using cudaMemPoolCreate for CUDA 12.6-12.x and cudaMemGetDefaultMemPool for CUDA 13.0+. Uses unique_ptr with a deleter for automatic pool cleanup. Updates version requirements: 12.6+ for pinned. --- .../rmm/detail/runtime_capabilities.hpp | 9 ++- .../mr/cuda_async_managed_memory_resource.hpp | 13 +++-- .../mr/cuda_async_pinned_memory_resource.hpp | 58 +++++++++++++++---- cpp/tests/mr/cuda_async_pinned_mr_tests.cpp | 2 +- .../test_cuda_async_pinned_memory_resource.py | 10 ++-- python/rmm/rmm/tests/test_helpers.py | 2 +- 6 files changed, 67 insertions(+), 27 deletions(-) diff --git a/cpp/include/rmm/detail/runtime_capabilities.hpp b/cpp/include/rmm/detail/runtime_capabilities.hpp index 43b90ef61..8b6345d28 100644 --- a/cpp/include/rmm/detail/runtime_capabilities.hpp +++ b/cpp/include/rmm/detail/runtime_capabilities.hpp @@ -28,7 +28,7 @@ namespace detail { /** * @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support */ -#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 13000 +#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 12060 /** * @brief Determine at runtime if the CUDA driver supports the stream-ordered @@ -151,12 +151,11 @@ struct runtime_async_managed_alloc { } }; -/* +/** * @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered * pinned memory allocator functions. * - * Stream-ordered pinned memory pools were introduced in CUDA 12.6 but our - * implementation requires features from CUDA 13.0 or higher. + * Stream-ordered pinned memory pools were introduced in CUDA 12.6. */ struct runtime_async_pinned_alloc { static bool is_supported() @@ -164,7 +163,7 @@ struct runtime_async_pinned_alloc { static auto supports_async_pinned_pool{[] { // Basic pool support required if (not runtime_async_alloc::is_supported()) { return false; } - // CUDA 13.0 or higher is required for async pinned memory pools + // CUDA 12.6 or higher is required for async pinned memory pools int cuda_driver_version{}; auto driver_result = cudaDriverGetVersion(&cuda_driver_version); int cuda_runtime_version{}; diff --git a/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp index bd2baadf4..d73898b82 100644 --- a/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp +++ b/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp @@ -42,16 +42,21 @@ class cuda_async_managed_memory_resource final : public device_memory_resource { * The default managed memory pool is the pool that is created when the device is created. * Pool properties such as the release threshold are not modified. * - * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with - * managed memory pool + * @throws rmm::logic_error if the CUDA build version is less than 13.0 + * @throws rmm::logic_error if the CUDA runtime version does not support `cudaMallocFromPoolAsync` + * with managed memory pool (requires CUDA 13.0 or higher) */ cuda_async_managed_memory_resource() { - // Check if managed memory pools are supported +#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION + RMM_FAIL( + "cuda_async_managed_memory_resource requires CUDA 13.0 or higher. " + "This build was compiled with an older CUDA version."); +#else + // Check if managed memory pools are supported at runtime RMM_EXPECTS(rmm::detail::runtime_async_managed_alloc::is_supported(), "cuda_async_managed_memory_resource requires CUDA 13.0 or higher"); -#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION cudaMemPool_t managed_pool_handle{}; cudaMemLocation location{.type = cudaMemLocationTypeDevice, .id = rmm::get_current_cuda_device().value()}; diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp index 261b5f3ac..b7adeea96 100644 --- a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp +++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp @@ -36,28 +36,47 @@ namespace mr { class cuda_async_pinned_memory_resource final : public device_memory_resource { public: /** - * @brief Constructs a cuda_async_pinned_memory_resource with the default pinned memory pool for + * @brief Constructs a cuda_async_pinned_memory_resource with a pinned memory pool for * the current device. * - * The default pinned memory pool is the pool that is created when the device is created. + * On CUDA 12.6-12.x, creates a new pinned memory pool using cudaMemPoolCreate. + * On CUDA 13.0+, uses the default pinned memory pool via cudaMemGetDefaultMemPool. + * * Pool properties such as the release threshold are not modified. * - * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with - * pinned memory pool + * @throws rmm::logic_error if the CUDA build version is less than 12.6 + * @throws rmm::logic_error if the CUDA runtime version does not support pinned memory pools + * (requires CUDA 12.6 or higher) */ cuda_async_pinned_memory_resource() { - // Check if pinned memory pools are supported +#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION + RMM_FAIL( + "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher. " + "This build was compiled with an older CUDA version."); +#else + // Check if pinned memory pools are supported at runtime RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(), - "cuda_async_pinned_memory_resource requires CUDA 13.0 or higher"); + "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime"); + + pool_handle_.reset(new cudaMemPool_t{}); -#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION - cudaMemPool_t pinned_pool_handle{}; +#if CUDA_VERSION >= 13000 + // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed) cudaMemLocation location{.type = cudaMemLocationTypeDevice, .id = rmm::get_current_cuda_device().value()}; RMM_CUDA_TRY( - cudaMemGetDefaultMemPool(&pinned_pool_handle, &location, cudaMemAllocationTypePinned)); - pool_ = cuda_async_view_memory_resource{pinned_pool_handle}; + cudaMemGetDefaultMemPool(pool_handle_.get(), &location, cudaMemAllocationTypePinned)); +#else + // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup) + cudaMemPoolProps pool_props{}; + pool_props.allocType = cudaMemAllocationTypePinned; + pool_props.location.type = cudaMemLocationTypeDevice; + pool_props.location.id = rmm::get_current_cuda_device().value(); + RMM_CUDA_TRY(cudaMemPoolCreate(pool_handle_.get(), &pool_props)); +#endif + + pool_ = cuda_async_view_memory_resource{*pool_handle_}; #endif } @@ -68,13 +87,30 @@ class cuda_async_pinned_memory_resource final : public device_memory_resource { */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } - ~cuda_async_pinned_memory_resource() override {} + ~cuda_async_pinned_memory_resource() override = default; cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete; cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete; cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete; cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete; private: + // Inline deleter: cleanup on CUDA 12.6-12.x, no-op on CUDA 13.0+ + struct pool_deleter { + void operator()(cudaMemPool_t* pool) const + { + if (pool != nullptr && *pool != nullptr) { +#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \ + CUDA_VERSION < 13000 + // CUDA 12.6-12.x: Destroy the pool we created + cudaMemPoolDestroy(*pool); // Ignore errors during destruction +#endif + // CUDA 13.0+: Do nothing (using default pool managed by CUDA runtime) + } + delete pool; + } + }; + + std::unique_ptr pool_handle_; cuda_async_view_memory_resource pool_{}; /** diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp index 690684d13..4362e8725 100644 --- a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp +++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp @@ -22,7 +22,7 @@ class AsyncPinnedMRTest : public ::testing::Test { { if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) { GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource " - << "requires CUDA 13.0 or higher and memory pool support."; + << "requires CUDA 12.6 or higher and memory pool support."; } } }; diff --git a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py index b08a1c9c3..0bb40b16f 100644 --- a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py +++ b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py @@ -19,7 +19,7 @@ @pytest.mark.skipif( not _ASYNC_PINNED_MEMORY_SUPPORTED, - reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", ) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("nelem", _nelems) @@ -33,7 +33,7 @@ def test_cuda_async_pinned_memory_resource(dtype, nelem, alloc): @pytest.mark.skipif( not _ASYNC_PINNED_MEMORY_SUPPORTED, - reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", ) @pytest.mark.parametrize("nelems", _nelems) def test_cuda_async_pinned_memory_resource_stream(nelems): @@ -48,7 +48,7 @@ def test_cuda_async_pinned_memory_resource_stream(nelems): @pytest.mark.skipif( not _ASYNC_PINNED_MEMORY_SUPPORTED, - reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", ) def test_cuda_async_pinned_memory_resource_pool_handle(): mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() @@ -59,7 +59,7 @@ def test_cuda_async_pinned_memory_resource_pool_handle(): @pytest.mark.skipif( not _ASYNC_PINNED_MEMORY_SUPPORTED, - reason="CudaAsyncPinnedMemoryResource requires CUDA 13.0+", + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", ) def test_cuda_async_pinned_memory_resource_host_access(): """Test that pinned memory allocated by the resource is accessible from host.""" @@ -67,7 +67,7 @@ def test_cuda_async_pinned_memory_resource_host_access(): rmm.mr.set_current_device_resource(mr) # Allocate a buffer - expected = np.full(100, 42, dtype="i4") + expected = np.full(100, 42, dtype="u1") dbuf = rmm.DeviceBuffer.to_device(expected) # Verify host can access the data diff --git a/python/rmm/rmm/tests/test_helpers.py b/python/rmm/rmm/tests/test_helpers.py index 369b91e17..c9041bcf7 100644 --- a/python/rmm/rmm/tests/test_helpers.py +++ b/python/rmm/rmm/tests/test_helpers.py @@ -37,7 +37,7 @@ runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, rmm._cuda.gpu.getDevice(), ) - and rmm._cuda.gpu.runtimeGetVersion() >= 13000 + and rmm._cuda.gpu.runtimeGetVersion() >= 12060 ) _MEMORY_POOL_HANDLE_TYPES_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( From 837dd551606b8fa3805527b51af9764d1697436c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 1 Dec 2025 15:13:53 -0600 Subject: [PATCH 3/3] Review feedback --- .../mr/cuda_async_pinned_memory_resource.hpp | 53 +++++++++---------- cpp/tests/mr/cuda_async_pinned_mr_tests.cpp | 28 ++++++++++ 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp index b7adeea96..b2f632c0d 100644 --- a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp +++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp @@ -17,8 +17,6 @@ #include #include -#include -#include namespace RMM_NAMESPACE { namespace mr { @@ -59,24 +57,30 @@ class cuda_async_pinned_memory_resource final : public device_memory_resource { RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(), "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime"); - pool_handle_.reset(new cudaMemPool_t{}); + // Use host location for pinned memory pool (id is ignored for cudaMemLocationTypeHost) + cudaMemLocation location{.type = cudaMemLocationTypeHost, .id = 0}; #if CUDA_VERSION >= 13000 // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed) - cudaMemLocation location{.type = cudaMemLocationTypeDevice, - .id = rmm::get_current_cuda_device().value()}; - RMM_CUDA_TRY( - cudaMemGetDefaultMemPool(pool_handle_.get(), &location, cudaMemAllocationTypePinned)); + RMM_CUDA_TRY(cudaMemGetDefaultMemPool(&pool_handle_, &location, cudaMemAllocationTypePinned)); #else // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup) cudaMemPoolProps pool_props{}; pool_props.allocType = cudaMemAllocationTypePinned; - pool_props.location.type = cudaMemLocationTypeDevice; - pool_props.location.id = rmm::get_current_cuda_device().value(); - RMM_CUDA_TRY(cudaMemPoolCreate(pool_handle_.get(), &pool_props)); + pool_props.location.type = cudaMemLocationTypeHost; + pool_props.location.id = 0; + RMM_CUDA_TRY(cudaMemPoolCreate(&pool_handle_, &pool_props)); + owns_pool_ = true; #endif - pool_ = cuda_async_view_memory_resource{*pool_handle_}; + // Enable device access to the pinned memory pool + cudaMemAccessDesc desc{}; + desc.location.type = cudaMemLocationTypeDevice; + desc.location.id = rmm::get_current_cuda_device().value(); + desc.flags = cudaMemAccessFlagsProtReadWrite; + RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle_, &desc, 1)); + + pool_ = cuda_async_view_memory_resource{pool_handle_}; #endif } @@ -87,30 +91,21 @@ class cuda_async_pinned_memory_resource final : public device_memory_resource { */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } - ~cuda_async_pinned_memory_resource() override = default; + ~cuda_async_pinned_memory_resource() override + { +#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \ + CUDA_VERSION < 13000 + if (owns_pool_ && pool_handle_ != nullptr) { cudaMemPoolDestroy(pool_handle_); } +#endif + } cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete; cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete; cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete; cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete; private: - // Inline deleter: cleanup on CUDA 12.6-12.x, no-op on CUDA 13.0+ - struct pool_deleter { - void operator()(cudaMemPool_t* pool) const - { - if (pool != nullptr && *pool != nullptr) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \ - CUDA_VERSION < 13000 - // CUDA 12.6-12.x: Destroy the pool we created - cudaMemPoolDestroy(*pool); // Ignore errors during destruction -#endif - // CUDA 13.0+: Do nothing (using default pool managed by CUDA runtime) - } - delete pool; - } - }; - - std::unique_ptr pool_handle_; + cudaMemPool_t pool_handle_{}; + bool owns_pool_{false}; cuda_async_view_memory_resource pool_{}; /** diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp index 4362e8725..f5c75fd2c 100644 --- a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp +++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp @@ -108,5 +108,33 @@ TEST_F(AsyncPinnedMRTest, PoolHandleIsValid) EXPECT_NE(pool_handle, nullptr); } +TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromDevice) +{ + const auto alloc_size{sizeof(int) * 100}; + cuda_async_pinned_mr mr{}; + auto* ptr = static_cast(mr.allocate_sync(alloc_size)); + ASSERT_NE(nullptr, ptr); + + // Initialize from host + for (int i = 0; i < 100; ++i) { + ptr[i] = i; + } + + // Allocate device memory and copy from pinned -> device -> back to verify device access + int* d_ptr{}; + EXPECT_EQ(cudaMalloc(&d_ptr, alloc_size), cudaSuccess); + EXPECT_EQ(cudaMemcpy(d_ptr, ptr, alloc_size, cudaMemcpyDefault), cudaSuccess); + + int result[100]; + EXPECT_EQ(cudaMemcpy(result, d_ptr, alloc_size, cudaMemcpyDefault), cudaSuccess); + + for (int i = 0; i < 100; ++i) { + EXPECT_EQ(result[i], i); + } + + cudaFree(d_ptr); + mr.deallocate_sync(ptr, alloc_size); +} + } // namespace } // namespace rmm::test