diff --git a/cpp/include/rmm/detail/runtime_capabilities.hpp b/cpp/include/rmm/detail/runtime_capabilities.hpp index fe86769d5..8b6345d28 100644 --- a/cpp/include/rmm/detail/runtime_capabilities.hpp +++ b/cpp/include/rmm/detail/runtime_capabilities.hpp @@ -25,6 +25,11 @@ namespace detail { */ #define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000 +/** + * @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support + */ +#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 12060 + /** * @brief Determine at runtime if the CUDA driver supports the stream-ordered * memory allocator functions. @@ -146,6 +151,31 @@ struct runtime_async_managed_alloc { } }; +/** + * @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered + * pinned memory allocator functions. + * + * Stream-ordered pinned memory pools were introduced in CUDA 12.6. + */ +struct runtime_async_pinned_alloc { + static bool is_supported() + { + static auto supports_async_pinned_pool{[] { + // Basic pool support required + if (not runtime_async_alloc::is_supported()) { return false; } + // CUDA 12.6 or higher is required for async pinned memory pools + int cuda_driver_version{}; + auto driver_result = cudaDriverGetVersion(&cuda_driver_version); + int cuda_runtime_version{}; + auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version); + return driver_result == cudaSuccess and runtime_result == cudaSuccess and + cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and + cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION; + }()}; + return supports_async_pinned_pool; + } +}; + /** * @brief Check if the current device is an integrated memory system. * diff --git a/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp index bd2baadf4..d73898b82 100644 --- a/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp +++ b/cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp @@ -42,16 +42,21 @@ class cuda_async_managed_memory_resource final : public device_memory_resource { * The default managed memory pool is the pool that is created when the device is created. * Pool properties such as the release threshold are not modified. * - * @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with - * managed memory pool + * @throws rmm::logic_error if the CUDA build version is less than 13.0 + * @throws rmm::logic_error if the CUDA runtime version does not support `cudaMallocFromPoolAsync` + * with managed memory pool (requires CUDA 13.0 or higher) */ cuda_async_managed_memory_resource() { - // Check if managed memory pools are supported +#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION + RMM_FAIL( + "cuda_async_managed_memory_resource requires CUDA 13.0 or higher. " + "This build was compiled with an older CUDA version."); +#else + // Check if managed memory pools are supported at runtime RMM_EXPECTS(rmm::detail::runtime_async_managed_alloc::is_supported(), "cuda_async_managed_memory_resource requires CUDA 13.0 or higher"); -#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION cudaMemPool_t managed_pool_handle{}; cudaMemLocation location{.type = cudaMemLocationTypeDevice, .id = rmm::get_current_cuda_device().value()}; diff --git a/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp new file mode 100644 index 000000000..b2f632c0d --- /dev/null +++ b/cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp @@ -0,0 +1,174 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace RMM_NAMESPACE { +namespace mr { +/** + * @addtogroup memory_resources + * @{ + * @file + */ + +/** + * @brief `device_memory_resource` derived class that uses + * `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool + * for allocation/deallocation. + */ +class cuda_async_pinned_memory_resource final : public device_memory_resource { + public: + /** + * @brief Constructs a cuda_async_pinned_memory_resource with a pinned memory pool for + * the current device. + * + * On CUDA 12.6-12.x, creates a new pinned memory pool using cudaMemPoolCreate. + * On CUDA 13.0+, uses the default pinned memory pool via cudaMemGetDefaultMemPool. + * + * Pool properties such as the release threshold are not modified. + * + * @throws rmm::logic_error if the CUDA build version is less than 12.6 + * @throws rmm::logic_error if the CUDA runtime version does not support pinned memory pools + * (requires CUDA 12.6 or higher) + */ + cuda_async_pinned_memory_resource() + { +#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION + RMM_FAIL( + "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher. " + "This build was compiled with an older CUDA version."); +#else + // Check if pinned memory pools are supported at runtime + RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(), + "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime"); + + // Use host location for pinned memory pool (id is ignored for cudaMemLocationTypeHost) + cudaMemLocation location{.type = cudaMemLocationTypeHost, .id = 0}; + +#if CUDA_VERSION >= 13000 + // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed) + RMM_CUDA_TRY(cudaMemGetDefaultMemPool(&pool_handle_, &location, cudaMemAllocationTypePinned)); +#else + // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup) + cudaMemPoolProps pool_props{}; + pool_props.allocType = cudaMemAllocationTypePinned; + pool_props.location.type = cudaMemLocationTypeHost; + pool_props.location.id = 0; + RMM_CUDA_TRY(cudaMemPoolCreate(&pool_handle_, &pool_props)); + owns_pool_ = true; +#endif + + // Enable device access to the pinned memory pool + cudaMemAccessDesc desc{}; + desc.location.type = cudaMemLocationTypeDevice; + desc.location.id = rmm::get_current_cuda_device().value(); + desc.flags = cudaMemAccessFlagsProtReadWrite; + RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle_, &desc, 1)); + + pool_ = cuda_async_view_memory_resource{pool_handle_}; +#endif + } + + /** + * @brief Returns the underlying native handle to the CUDA pool + * + * @return cudaMemPool_t Handle to the underlying CUDA pool + */ + [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } + + ~cuda_async_pinned_memory_resource() override + { +#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \ + CUDA_VERSION < 13000 + if (owns_pool_ && pool_handle_ != nullptr) { cudaMemPoolDestroy(pool_handle_); } +#endif + } + cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete; + cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete; + cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete; + cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete; + + private: + cudaMemPool_t pool_handle_{}; + bool owns_pool_{false}; + cuda_async_view_memory_resource pool_{}; + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + { + return pool_.allocate(stream, bytes); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform deallocation + */ + void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override + { + pool_.deallocate(stream, ptr, bytes); + } + + /** + * @brief Compare this resource to another. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + auto const* async_mr = dynamic_cast(&other); + return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle()); + } + + friend auto get_property(cuda_async_pinned_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + return cuda::mr::device_accessible{}; + } + friend auto get_property(cuda_async_pinned_memory_resource const&, + cuda::mr::host_accessible) noexcept + { + return cuda::mr::host_accessible{}; + } +}; + +// static property checks +static_assert(rmm::detail::polyfill::resource); +static_assert(rmm::detail::polyfill::async_resource); +static_assert(rmm::detail::polyfill::resource_with); +static_assert(rmm::detail::polyfill::async_resource_with); +/** @} */ // end of group +} // namespace mr +} // namespace RMM_NAMESPACE diff --git a/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp new file mode 100644 index 000000000..f5c75fd2c --- /dev/null +++ b/cpp/tests/mr/cuda_async_pinned_mr_tests.cpp @@ -0,0 +1,140 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include + +#include + +namespace rmm::test { +namespace { + +using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource; + +class AsyncPinnedMRTest : public ::testing::Test { + protected: + void SetUp() override + { + if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) { + GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource " + << "requires CUDA 12.6 or higher and memory pool support."; + } + } +}; + +TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate) +{ + const auto alloc_size{100}; + cuda_async_pinned_mr mr{}; + void* ptr = mr.allocate_sync(alloc_size); + ASSERT_NE(nullptr, ptr); + mr.deallocate_sync(ptr, alloc_size); +} + +TEST_F(AsyncPinnedMRTest, EqualityWithSamePool) +{ + // Two instances wrapping the same default pinned pool should compare equal if they + // ultimately refer to the same underlying pool handle. Construct two and compare. + cuda_async_pinned_mr mr1{}; + cuda_async_pinned_mr mr2{}; + EXPECT_TRUE(mr1.is_equal(mr2)); +} + +TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost) +{ + const auto alloc_size{sizeof(int) * 100}; + cuda_async_pinned_mr mr{}; + auto* ptr = static_cast(mr.allocate_sync(alloc_size)); + ASSERT_NE(nullptr, ptr); + + // Pinned memory should be accessible from host + // Write from host + EXPECT_NO_THROW({ + for (int i = 0; i < 100; ++i) { + ptr[i] = i; + } + }); + + // Verify we can read back + EXPECT_EQ(ptr[0], 0); + EXPECT_EQ(ptr[50], 50); + EXPECT_EQ(ptr[99], 99); + + mr.deallocate_sync(ptr, alloc_size); +} + +TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible) +{ + const auto alloc_size{512}; + cuda_async_pinned_mr mr{}; + + void* ptr1 = mr.allocate_sync(alloc_size); + void* ptr2 = mr.allocate_sync(alloc_size * 2); + void* ptr3 = mr.allocate_sync(alloc_size / 2); + + ASSERT_NE(nullptr, ptr1); + ASSERT_NE(nullptr, ptr2); + ASSERT_NE(nullptr, ptr3); + + // Verify all pointers are accessible from host + auto* typed_ptr1 = static_cast(ptr1); + auto* typed_ptr2 = static_cast(ptr2); + auto* typed_ptr3 = static_cast(ptr3); + + EXPECT_NO_THROW({ + typed_ptr1[0] = 'a'; + typed_ptr2[0] = 'b'; + typed_ptr3[0] = 'c'; + }); + + EXPECT_EQ(typed_ptr1[0], 'a'); + EXPECT_EQ(typed_ptr2[0], 'b'); + EXPECT_EQ(typed_ptr3[0], 'c'); + + mr.deallocate_sync(ptr1, alloc_size); + mr.deallocate_sync(ptr2, alloc_size * 2); + mr.deallocate_sync(ptr3, alloc_size / 2); +} + +TEST_F(AsyncPinnedMRTest, PoolHandleIsValid) +{ + cuda_async_pinned_mr mr{}; + cudaMemPool_t pool_handle = mr.pool_handle(); + EXPECT_NE(pool_handle, nullptr); +} + +TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromDevice) +{ + const auto alloc_size{sizeof(int) * 100}; + cuda_async_pinned_mr mr{}; + auto* ptr = static_cast(mr.allocate_sync(alloc_size)); + ASSERT_NE(nullptr, ptr); + + // Initialize from host + for (int i = 0; i < 100; ++i) { + ptr[i] = i; + } + + // Allocate device memory and copy from pinned -> device -> back to verify device access + int* d_ptr{}; + EXPECT_EQ(cudaMalloc(&d_ptr, alloc_size), cudaSuccess); + EXPECT_EQ(cudaMemcpy(d_ptr, ptr, alloc_size, cudaMemcpyDefault), cudaSuccess); + + int result[100]; + EXPECT_EQ(cudaMemcpy(result, d_ptr, alloc_size, cudaMemcpyDefault), cudaSuccess); + + for (int i = 0; i < 100; ++i) { + EXPECT_EQ(result[i], i); + } + + cudaFree(d_ptr); + mr.deallocate_sync(ptr, alloc_size); +} + +} // namespace +} // namespace rmm::test diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 55790a475..71a7b551a 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \ cuda_async_managed_memory_resource() except + cudaMemPool_t pool_handle() const +cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + + cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource): + cuda_async_pinned_memory_resource() except + + cudaMemPool_t pool_handle() const + cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \ namespace \ "rmm::mr::cuda_async_memory_resource" \ diff --git a/python/rmm/rmm/mr/experimental.py b/python/rmm/rmm/mr/experimental.py index 1b422a3d3..857ed9d06 100644 --- a/python/rmm/rmm/mr/experimental.py +++ b/python/rmm/rmm/mr/experimental.py @@ -5,8 +5,10 @@ from rmm.pylibrmm.memory_resource.experimental import ( CudaAsyncManagedMemoryResource, + CudaAsyncPinnedMemoryResource, ) __all__ = [ "CudaAsyncManagedMemoryResource", + "CudaAsyncPinnedMemoryResource", ] diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd index 2ec0914c7..461d45ddf 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd +++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd @@ -7,3 +7,6 @@ from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource): pass + +cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource): + pass diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi index 6c098f574..6470fa120 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi +++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi @@ -6,3 +6,7 @@ from rmm.pylibrmm.memory_resource._memory_resource import DeviceMemoryResource class CudaAsyncManagedMemoryResource(DeviceMemoryResource): def __init__(self) -> None: ... def pool_handle(self) -> int: ... + +class CudaAsyncPinnedMemoryResource(DeviceMemoryResource): + def __init__(self) -> None: ... + def pool_handle(self) -> int: ... diff --git a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx index 18287a387..890fa0ddd 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx @@ -5,7 +5,10 @@ from libc.stdint cimport uintptr_t -from rmm.librmm.memory_resource cimport cuda_async_managed_memory_resource +from rmm.librmm.memory_resource cimport ( + cuda_async_managed_memory_resource, + cuda_async_pinned_memory_resource, +) # import from the private _memory_resource to avoid a circular import from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource @@ -38,3 +41,33 @@ cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource): cdef cuda_async_managed_memory_resource* c_mr = \ self.c_obj.get() return c_mr.pool_handle() + + +cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource): + """ + Memory resource that uses ``cudaMallocFromPoolAsync``/``cudaFreeAsync`` for + allocation/deallocation with a pinned memory pool. + + This resource uses the default pinned memory pool for the current device. + Pinned memory is page-locked host memory that can be accessed from both + the host and device. This provides fast host-device transfers. + + Requires CUDA 13.0 or higher. + """ + def __cinit__(self): + self.c_obj.reset( + new cuda_async_pinned_memory_resource() + ) + + def pool_handle(self): + """ + Returns the underlying CUDA memory pool handle. + + Returns + ------- + int + Handle to the underlying CUDA memory pool + """ + cdef cuda_async_pinned_memory_resource* c_mr = \ + self.c_obj.get() + return c_mr.pool_handle() diff --git a/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py new file mode 100644 index 000000000..0bb40b16f --- /dev/null +++ b/python/rmm/rmm/tests/test_cuda_async_pinned_memory_resource.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for CudaAsyncPinnedMemoryResource.""" + +import numpy as np +import pytest +from test_helpers import ( + _ASYNC_PINNED_MEMORY_SUPPORTED, + _allocs, + _dtypes, + _nelems, + array_tester, +) + +import rmm +from rmm.pylibrmm.stream import Stream + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", +) +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +def test_cuda_async_pinned_memory_resource(dtype, nelem, alloc): + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", +) +@pytest.mark.parametrize("nelems", _nelems) +def test_cuda_async_pinned_memory_resource_stream(nelems): + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + rmm.mr.set_current_device_resource(mr) + stream = Stream() + expected = np.full(nelems, 5, dtype="u1") + dbuf = rmm.DeviceBuffer.to_device(expected, stream=stream) + result = np.asarray(dbuf.copy_to_host()) + np.testing.assert_equal(expected, result) + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", +) +def test_cuda_async_pinned_memory_resource_pool_handle(): + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + pool_handle = mr.pool_handle() + assert isinstance(pool_handle, int) + assert pool_handle != 0 + + +@pytest.mark.skipif( + not _ASYNC_PINNED_MEMORY_SUPPORTED, + reason="CudaAsyncPinnedMemoryResource requires CUDA 12.6+", +) +def test_cuda_async_pinned_memory_resource_host_access(): + """Test that pinned memory allocated by the resource is accessible from host.""" + mr = rmm.mr.experimental.CudaAsyncPinnedMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Allocate a buffer + expected = np.full(100, 42, dtype="u1") + dbuf = rmm.DeviceBuffer.to_device(expected) + + # Verify host can access the data + result = np.asarray(dbuf.copy_to_host()) + np.testing.assert_equal(expected, result) diff --git a/python/rmm/rmm/tests/test_helpers.py b/python/rmm/rmm/tests/test_helpers.py index c7a9fba57..c9041bcf7 100644 --- a/python/rmm/rmm/tests/test_helpers.py +++ b/python/rmm/rmm/tests/test_helpers.py @@ -32,6 +32,14 @@ and rmm._cuda.gpu.runtimeGetVersion() >= 13000 ) +_ASYNC_PINNED_MEMORY_SUPPORTED = ( + rmm._cuda.gpu.getDeviceAttribute( + runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, + rmm._cuda.gpu.getDevice(), + ) + and rmm._cuda.gpu.runtimeGetVersion() >= 12060 +) + _MEMORY_POOL_HANDLE_TYPES_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolSupportedHandleTypes, rmm._cuda.gpu.getDevice(),