Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions cpp/include/rmm/detail/runtime_capabilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ namespace detail {
*/
#define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000

/**
* @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support
*/
#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 12060

/**
* @brief Determine at runtime if the CUDA driver supports the stream-ordered
* memory allocator functions.
Expand Down Expand Up @@ -146,6 +151,31 @@ struct runtime_async_managed_alloc {
}
};

/**
* @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered
* pinned memory allocator functions.
*
* Stream-ordered pinned memory pools were introduced in CUDA 12.6.
*/
struct runtime_async_pinned_alloc {
static bool is_supported()
{
static auto supports_async_pinned_pool{[] {
// Basic pool support required
if (not runtime_async_alloc::is_supported()) { return false; }
// CUDA 12.6 or higher is required for async pinned memory pools
int cuda_driver_version{};
auto driver_result = cudaDriverGetVersion(&cuda_driver_version);
int cuda_runtime_version{};
auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version);
return driver_result == cudaSuccess and runtime_result == cudaSuccess and
cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and
cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION;
}()};
return supports_async_pinned_pool;
}
};

/**
* @brief Check if the current device is an integrated memory system.
*
Expand Down
13 changes: 9 additions & 4 deletions cpp/include/rmm/mr/cuda_async_managed_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,21 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
* The default managed memory pool is the pool that is created when the device is created.
* Pool properties such as the release threshold are not modified.
*
* @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
* managed memory pool
* @throws rmm::logic_error if the CUDA build version is less than 13.0
* @throws rmm::logic_error if the CUDA runtime version does not support `cudaMallocFromPoolAsync`
* with managed memory pool (requires CUDA 13.0 or higher)
*/
cuda_async_managed_memory_resource()
{
// Check if managed memory pools are supported
#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION
RMM_FAIL(
"cuda_async_managed_memory_resource requires CUDA 13.0 or higher. "
"This build was compiled with an older CUDA version.");
#else
// Check if managed memory pools are supported at runtime
RMM_EXPECTS(rmm::detail::runtime_async_managed_alloc::is_supported(),
"cuda_async_managed_memory_resource requires CUDA 13.0 or higher");

#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION
cudaMemPool_t managed_pool_handle{};
cudaMemLocation location{.type = cudaMemLocationTypeDevice,
.id = rmm::get_current_cuda_device().value()};
Expand Down
174 changes: 174 additions & 0 deletions cpp/include/rmm/mr/cuda_async_pinned_memory_resource.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/runtime_capabilities.hpp>
#include <rmm/detail/thrust_namespace.h>
#include <rmm/mr/cuda_async_view_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>

#include <cuda/std/type_traits>
#include <cuda_runtime_api.h>

#include <cstddef>

namespace RMM_NAMESPACE {
namespace mr {
/**
* @addtogroup memory_resources
* @{
* @file
*/

/**
* @brief `device_memory_resource` derived class that uses
* `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool
* for allocation/deallocation.
*/
class cuda_async_pinned_memory_resource final : public device_memory_resource {
public:
/**
* @brief Constructs a cuda_async_pinned_memory_resource with a pinned memory pool for
* the current device.
*
* On CUDA 12.6-12.x, creates a new pinned memory pool using cudaMemPoolCreate.
* On CUDA 13.0+, uses the default pinned memory pool via cudaMemGetDefaultMemPool.
*
* Pool properties such as the release threshold are not modified.
*
* @throws rmm::logic_error if the CUDA build version is less than 12.6
* @throws rmm::logic_error if the CUDA runtime version does not support pinned memory pools
* (requires CUDA 12.6 or higher)
*/
cuda_async_pinned_memory_resource()
{
#if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
RMM_FAIL(
"cuda_async_pinned_memory_resource requires CUDA 12.6 or higher. "
"This build was compiled with an older CUDA version.");
#else
// Check if pinned memory pools are supported at runtime
RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
"cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime");

// Use host location for pinned memory pool (id is ignored for cudaMemLocationTypeHost)
cudaMemLocation location{.type = cudaMemLocationTypeHost, .id = 0};

#if CUDA_VERSION >= 13000
// CUDA 13.0+: Use the default pinned memory pool (no cleanup needed)
RMM_CUDA_TRY(cudaMemGetDefaultMemPool(&pool_handle_, &location, cudaMemAllocationTypePinned));
#else
// CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup)
cudaMemPoolProps pool_props{};
pool_props.allocType = cudaMemAllocationTypePinned;
pool_props.location.type = cudaMemLocationTypeHost;
pool_props.location.id = 0;
RMM_CUDA_TRY(cudaMemPoolCreate(&pool_handle_, &pool_props));
owns_pool_ = true;
#endif

// Enable device access to the pinned memory pool
cudaMemAccessDesc desc{};
desc.location.type = cudaMemLocationTypeDevice;
desc.location.id = rmm::get_current_cuda_device().value();
desc.flags = cudaMemAccessFlagsProtReadWrite;
RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle_, &desc, 1));

pool_ = cuda_async_view_memory_resource{pool_handle_};
#endif
}

/**
* @brief Returns the underlying native handle to the CUDA pool
*
* @return cudaMemPool_t Handle to the underlying CUDA pool
*/
[[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }

~cuda_async_pinned_memory_resource() override
{
#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \
CUDA_VERSION < 13000
if (owns_pool_ && pool_handle_ != nullptr) { cudaMemPoolDestroy(pool_handle_); }
#endif
}
cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete;
cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete;
cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete;

private:
cudaMemPool_t pool_handle_{};
bool owns_pool_{false};
cuda_async_view_memory_resource pool_{};

/**
* @brief Allocates memory of size at least \p bytes.
*
* The returned pointer will have at minimum 256 byte alignment.
*
* @param bytes The size of the allocation
* @param stream Stream on which to perform allocation
* @return void* Pointer to the newly allocated memory
*/
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
{
return pool_.allocate(stream, bytes);
}

/**
* @brief Deallocate memory pointed to by \p p.
*
* @param ptr Pointer to be deallocated
* @param bytes The size in bytes of the allocation. This must be equal to the
* value of `bytes` that was passed to the `allocate` call that returned `p`.
* @param stream Stream on which to perform deallocation
*/
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
{
pool_.deallocate(stream, ptr, bytes);
}

/**
* @brief Compare this resource to another.
*
* @param other The other resource to compare to
* @return true If the two resources are equivalent
* @return false If the two resources are not equal
*/
[[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
{
auto const* async_mr = dynamic_cast<cuda_async_pinned_memory_resource const*>(&other);
return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
}

friend auto get_property(cuda_async_pinned_memory_resource const&,
cuda::mr::device_accessible) noexcept
{
return cuda::mr::device_accessible{};
}
friend auto get_property(cuda_async_pinned_memory_resource const&,
cuda::mr::host_accessible) noexcept
{
return cuda::mr::host_accessible{};
}
};

// static property checks
static_assert(rmm::detail::polyfill::resource<cuda_async_pinned_memory_resource>);
static_assert(rmm::detail::polyfill::async_resource<cuda_async_pinned_memory_resource>);
static_assert(rmm::detail::polyfill::resource_with<cuda_async_pinned_memory_resource,
cuda::mr::host_accessible,
cuda::mr::device_accessible>);
static_assert(rmm::detail::polyfill::async_resource_with<cuda_async_pinned_memory_resource,
cuda::mr::host_accessible,
cuda::mr::device_accessible>);
/** @} */ // end of group
} // namespace mr
} // namespace RMM_NAMESPACE
140 changes: 140 additions & 0 deletions cpp/tests/mr/cuda_async_pinned_mr_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include <rmm/detail/error.hpp>
#include <rmm/detail/runtime_capabilities.hpp>
#include <rmm/mr/cuda_async_pinned_memory_resource.hpp>

#include <cuda_runtime_api.h>

#include <gtest/gtest.h>

namespace rmm::test {
namespace {

using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource;

class AsyncPinnedMRTest : public ::testing::Test {
protected:
void SetUp() override
{
if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) {
GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource "
<< "requires CUDA 12.6 or higher and memory pool support.";
}
}
};

TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like all the test cases can be parameterized/ templated for both sync and async allocation and deallocation operations

{
const auto alloc_size{100};
cuda_async_pinned_mr mr{};
void* ptr = mr.allocate_sync(alloc_size);
Comment thread
bdice marked this conversation as resolved.
ASSERT_NE(nullptr, ptr);
mr.deallocate_sync(ptr, alloc_size);
}

TEST_F(AsyncPinnedMRTest, EqualityWithSamePool)
{
// Two instances wrapping the same default pinned pool should compare equal if they
// ultimately refer to the same underlying pool handle. Construct two and compare.
cuda_async_pinned_mr mr1{};
cuda_async_pinned_mr mr2{};
EXPECT_TRUE(mr1.is_equal(mr2));
}

TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost)
{
const auto alloc_size{sizeof(int) * 100};
cuda_async_pinned_mr mr{};
auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
ASSERT_NE(nullptr, ptr);

// Pinned memory should be accessible from host
// Write from host
EXPECT_NO_THROW({
for (int i = 0; i < 100; ++i) {
ptr[i] = i;
}
});

// Verify we can read back
EXPECT_EQ(ptr[0], 0);
EXPECT_EQ(ptr[50], 50);
Comment on lines +55 to +65
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to test that memory is accessible from device too (via some kernel probably, or maybe DtoD memcpy?)

EXPECT_EQ(ptr[99], 99);

mr.deallocate_sync(ptr, alloc_size);
}

TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible)
{
const auto alloc_size{512};
cuda_async_pinned_mr mr{};

void* ptr1 = mr.allocate_sync(alloc_size);
void* ptr2 = mr.allocate_sync(alloc_size * 2);
void* ptr3 = mr.allocate_sync(alloc_size / 2);

ASSERT_NE(nullptr, ptr1);
ASSERT_NE(nullptr, ptr2);
ASSERT_NE(nullptr, ptr3);

// Verify all pointers are accessible from host
auto* typed_ptr1 = static_cast<char*>(ptr1);
auto* typed_ptr2 = static_cast<char*>(ptr2);
auto* typed_ptr3 = static_cast<char*>(ptr3);

EXPECT_NO_THROW({
typed_ptr1[0] = 'a';
typed_ptr2[0] = 'b';
typed_ptr3[0] = 'c';
});

EXPECT_EQ(typed_ptr1[0], 'a');
EXPECT_EQ(typed_ptr2[0], 'b');
EXPECT_EQ(typed_ptr3[0], 'c');

mr.deallocate_sync(ptr1, alloc_size);
mr.deallocate_sync(ptr2, alloc_size * 2);
mr.deallocate_sync(ptr3, alloc_size / 2);
}

TEST_F(AsyncPinnedMRTest, PoolHandleIsValid)
{
cuda_async_pinned_mr mr{};
cudaMemPool_t pool_handle = mr.pool_handle();
EXPECT_NE(pool_handle, nullptr);
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also add a device -> pinned host stream ordered copy? Maybe using a device_vector and checking if the copy results in the same

TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromDevice)
{
const auto alloc_size{sizeof(int) * 100};
cuda_async_pinned_mr mr{};
auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
ASSERT_NE(nullptr, ptr);

// Initialize from host
for (int i = 0; i < 100; ++i) {
ptr[i] = i;
}

// Allocate device memory and copy from pinned -> device -> back to verify device access
int* d_ptr{};
EXPECT_EQ(cudaMalloc(&d_ptr, alloc_size), cudaSuccess);
EXPECT_EQ(cudaMemcpy(d_ptr, ptr, alloc_size, cudaMemcpyDefault), cudaSuccess);

int result[100];
EXPECT_EQ(cudaMemcpy(result, d_ptr, alloc_size, cudaMemcpyDefault), cudaSuccess);

for (int i = 0; i < 100; ++i) {
EXPECT_EQ(result[i], i);
}

cudaFree(d_ptr);
mr.deallocate_sync(ptr, alloc_size);
}

} // namespace
} // namespace rmm::test
7 changes: 7 additions & 0 deletions python/rmm/rmm/librmm/memory_resource.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \
cuda_async_managed_memory_resource() except +
cudaMemPool_t pool_handle() const

cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \
namespace "rmm::mr" nogil:

cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource):
cuda_async_pinned_memory_resource() except +
cudaMemPool_t pool_handle() const

cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \
namespace \
"rmm::mr::cuda_async_memory_resource" \
Expand Down
Loading
Loading