-
Notifications
You must be signed in to change notification settings - Fork 249
Add experimental cuda_async_pinned_memory_resource #2164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,174 @@ | ||
| /* | ||
| * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
| #pragma once | ||
|
|
||
| #include <rmm/cuda_device.hpp> | ||
| #include <rmm/cuda_stream_view.hpp> | ||
| #include <rmm/detail/error.hpp> | ||
| #include <rmm/detail/export.hpp> | ||
| #include <rmm/detail/runtime_capabilities.hpp> | ||
| #include <rmm/detail/thrust_namespace.h> | ||
| #include <rmm/mr/cuda_async_view_memory_resource.hpp> | ||
| #include <rmm/mr/device_memory_resource.hpp> | ||
|
|
||
| #include <cuda/std/type_traits> | ||
| #include <cuda_runtime_api.h> | ||
|
|
||
| #include <cstddef> | ||
|
|
||
| namespace RMM_NAMESPACE { | ||
| namespace mr { | ||
| /** | ||
| * @addtogroup memory_resources | ||
| * @{ | ||
| * @file | ||
| */ | ||
|
|
||
| /** | ||
| * @brief `device_memory_resource` derived class that uses | ||
| * `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool | ||
| * for allocation/deallocation. | ||
| */ | ||
| class cuda_async_pinned_memory_resource final : public device_memory_resource { | ||
| public: | ||
| /** | ||
| * @brief Constructs a cuda_async_pinned_memory_resource with a pinned memory pool for | ||
| * the current device. | ||
| * | ||
| * On CUDA 12.6-12.x, creates a new pinned memory pool using cudaMemPoolCreate. | ||
| * On CUDA 13.0+, uses the default pinned memory pool via cudaMemGetDefaultMemPool. | ||
| * | ||
| * Pool properties such as the release threshold are not modified. | ||
| * | ||
| * @throws rmm::logic_error if the CUDA build version is less than 12.6 | ||
| * @throws rmm::logic_error if the CUDA runtime version does not support pinned memory pools | ||
| * (requires CUDA 12.6 or higher) | ||
| */ | ||
| cuda_async_pinned_memory_resource() | ||
| { | ||
| #if !defined(CUDA_VERSION) || CUDA_VERSION < RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION | ||
| RMM_FAIL( | ||
| "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher. " | ||
| "This build was compiled with an older CUDA version."); | ||
| #else | ||
| // Check if pinned memory pools are supported at runtime | ||
| RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(), | ||
| "cuda_async_pinned_memory_resource requires CUDA 12.6 or higher runtime"); | ||
|
|
||
| // Use host location for pinned memory pool (id is ignored for cudaMemLocationTypeHost) | ||
| cudaMemLocation location{.type = cudaMemLocationTypeHost, .id = 0}; | ||
|
|
||
| #if CUDA_VERSION >= 13000 | ||
| // CUDA 13.0+: Use the default pinned memory pool (no cleanup needed) | ||
| RMM_CUDA_TRY(cudaMemGetDefaultMemPool(&pool_handle_, &location, cudaMemAllocationTypePinned)); | ||
| #else | ||
| // CUDA 12.6-12.x: Create a new pinned memory pool (needs cleanup) | ||
| cudaMemPoolProps pool_props{}; | ||
| pool_props.allocType = cudaMemAllocationTypePinned; | ||
| pool_props.location.type = cudaMemLocationTypeHost; | ||
| pool_props.location.id = 0; | ||
| RMM_CUDA_TRY(cudaMemPoolCreate(&pool_handle_, &pool_props)); | ||
| owns_pool_ = true; | ||
| #endif | ||
|
|
||
| // Enable device access to the pinned memory pool | ||
| cudaMemAccessDesc desc{}; | ||
| desc.location.type = cudaMemLocationTypeDevice; | ||
| desc.location.id = rmm::get_current_cuda_device().value(); | ||
| desc.flags = cudaMemAccessFlagsProtReadWrite; | ||
| RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle_, &desc, 1)); | ||
|
|
||
| pool_ = cuda_async_view_memory_resource{pool_handle_}; | ||
| #endif | ||
| } | ||
|
|
||
| /** | ||
| * @brief Returns the underlying native handle to the CUDA pool | ||
| * | ||
| * @return cudaMemPool_t Handle to the underlying CUDA pool | ||
| */ | ||
| [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } | ||
|
|
||
| ~cuda_async_pinned_memory_resource() override | ||
| { | ||
| #if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION && \ | ||
| CUDA_VERSION < 13000 | ||
| if (owns_pool_ && pool_handle_ != nullptr) { cudaMemPoolDestroy(pool_handle_); } | ||
| #endif | ||
| } | ||
| cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete; | ||
| cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete; | ||
| cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete; | ||
| cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete; | ||
|
|
||
| private: | ||
| cudaMemPool_t pool_handle_{}; | ||
| bool owns_pool_{false}; | ||
| cuda_async_view_memory_resource pool_{}; | ||
|
|
||
| /** | ||
| * @brief Allocates memory of size at least \p bytes. | ||
| * | ||
| * The returned pointer will have at minimum 256 byte alignment. | ||
| * | ||
| * @param bytes The size of the allocation | ||
| * @param stream Stream on which to perform allocation | ||
| * @return void* Pointer to the newly allocated memory | ||
| */ | ||
| void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override | ||
| { | ||
| return pool_.allocate(stream, bytes); | ||
| } | ||
|
|
||
| /** | ||
| * @brief Deallocate memory pointed to by \p p. | ||
| * | ||
| * @param ptr Pointer to be deallocated | ||
| * @param bytes The size in bytes of the allocation. This must be equal to the | ||
| * value of `bytes` that was passed to the `allocate` call that returned `p`. | ||
| * @param stream Stream on which to perform deallocation | ||
| */ | ||
| void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override | ||
| { | ||
| pool_.deallocate(stream, ptr, bytes); | ||
| } | ||
|
|
||
| /** | ||
| * @brief Compare this resource to another. | ||
| * | ||
| * @param other The other resource to compare to | ||
| * @return true If the two resources are equivalent | ||
| * @return false If the two resources are not equal | ||
| */ | ||
| [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override | ||
| { | ||
| auto const* async_mr = dynamic_cast<cuda_async_pinned_memory_resource const*>(&other); | ||
| return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle()); | ||
| } | ||
|
|
||
| friend auto get_property(cuda_async_pinned_memory_resource const&, | ||
| cuda::mr::device_accessible) noexcept | ||
| { | ||
| return cuda::mr::device_accessible{}; | ||
| } | ||
| friend auto get_property(cuda_async_pinned_memory_resource const&, | ||
| cuda::mr::host_accessible) noexcept | ||
| { | ||
| return cuda::mr::host_accessible{}; | ||
| } | ||
| }; | ||
|
|
||
| // static property checks | ||
| static_assert(rmm::detail::polyfill::resource<cuda_async_pinned_memory_resource>); | ||
| static_assert(rmm::detail::polyfill::async_resource<cuda_async_pinned_memory_resource>); | ||
| static_assert(rmm::detail::polyfill::resource_with<cuda_async_pinned_memory_resource, | ||
| cuda::mr::host_accessible, | ||
| cuda::mr::device_accessible>); | ||
| static_assert(rmm::detail::polyfill::async_resource_with<cuda_async_pinned_memory_resource, | ||
| cuda::mr::host_accessible, | ||
| cuda::mr::device_accessible>); | ||
| /** @} */ // end of group | ||
| } // namespace mr | ||
| } // namespace RMM_NAMESPACE |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,140 @@ | ||
| /* | ||
| * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| #include <rmm/detail/error.hpp> | ||
| #include <rmm/detail/runtime_capabilities.hpp> | ||
| #include <rmm/mr/cuda_async_pinned_memory_resource.hpp> | ||
|
|
||
| #include <cuda_runtime_api.h> | ||
|
|
||
| #include <gtest/gtest.h> | ||
|
|
||
| namespace rmm::test { | ||
| namespace { | ||
|
|
||
| using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource; | ||
|
|
||
| class AsyncPinnedMRTest : public ::testing::Test { | ||
| protected: | ||
| void SetUp() override | ||
| { | ||
| if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) { | ||
| GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource " | ||
| << "requires CUDA 12.6 or higher and memory pool support."; | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate) | ||
| { | ||
| const auto alloc_size{100}; | ||
| cuda_async_pinned_mr mr{}; | ||
| void* ptr = mr.allocate_sync(alloc_size); | ||
|
bdice marked this conversation as resolved.
|
||
| ASSERT_NE(nullptr, ptr); | ||
| mr.deallocate_sync(ptr, alloc_size); | ||
| } | ||
|
|
||
| TEST_F(AsyncPinnedMRTest, EqualityWithSamePool) | ||
| { | ||
| // Two instances wrapping the same default pinned pool should compare equal if they | ||
| // ultimately refer to the same underlying pool handle. Construct two and compare. | ||
| cuda_async_pinned_mr mr1{}; | ||
| cuda_async_pinned_mr mr2{}; | ||
| EXPECT_TRUE(mr1.is_equal(mr2)); | ||
| } | ||
|
|
||
| TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost) | ||
| { | ||
| const auto alloc_size{sizeof(int) * 100}; | ||
| cuda_async_pinned_mr mr{}; | ||
| auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size)); | ||
| ASSERT_NE(nullptr, ptr); | ||
|
|
||
| // Pinned memory should be accessible from host | ||
| // Write from host | ||
| EXPECT_NO_THROW({ | ||
| for (int i = 0; i < 100; ++i) { | ||
| ptr[i] = i; | ||
| } | ||
| }); | ||
|
|
||
| // Verify we can read back | ||
| EXPECT_EQ(ptr[0], 0); | ||
| EXPECT_EQ(ptr[50], 50); | ||
|
Comment on lines
+55
to
+65
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to test that memory is accessible from device too (via some kernel probably, or maybe DtoD memcpy?) |
||
| EXPECT_EQ(ptr[99], 99); | ||
|
|
||
| mr.deallocate_sync(ptr, alloc_size); | ||
| } | ||
|
|
||
| TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible) | ||
| { | ||
| const auto alloc_size{512}; | ||
| cuda_async_pinned_mr mr{}; | ||
|
|
||
| void* ptr1 = mr.allocate_sync(alloc_size); | ||
| void* ptr2 = mr.allocate_sync(alloc_size * 2); | ||
| void* ptr3 = mr.allocate_sync(alloc_size / 2); | ||
|
|
||
| ASSERT_NE(nullptr, ptr1); | ||
| ASSERT_NE(nullptr, ptr2); | ||
| ASSERT_NE(nullptr, ptr3); | ||
|
|
||
| // Verify all pointers are accessible from host | ||
| auto* typed_ptr1 = static_cast<char*>(ptr1); | ||
| auto* typed_ptr2 = static_cast<char*>(ptr2); | ||
| auto* typed_ptr3 = static_cast<char*>(ptr3); | ||
|
|
||
| EXPECT_NO_THROW({ | ||
| typed_ptr1[0] = 'a'; | ||
| typed_ptr2[0] = 'b'; | ||
| typed_ptr3[0] = 'c'; | ||
| }); | ||
|
|
||
| EXPECT_EQ(typed_ptr1[0], 'a'); | ||
| EXPECT_EQ(typed_ptr2[0], 'b'); | ||
| EXPECT_EQ(typed_ptr3[0], 'c'); | ||
|
|
||
| mr.deallocate_sync(ptr1, alloc_size); | ||
| mr.deallocate_sync(ptr2, alloc_size * 2); | ||
| mr.deallocate_sync(ptr3, alloc_size / 2); | ||
| } | ||
|
|
||
| TEST_F(AsyncPinnedMRTest, PoolHandleIsValid) | ||
| { | ||
| cuda_async_pinned_mr mr{}; | ||
| cudaMemPool_t pool_handle = mr.pool_handle(); | ||
| EXPECT_NE(pool_handle, nullptr); | ||
| } | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also add a device -> pinned host stream ordered copy? Maybe using a device_vector and checking if the copy results in the same |
||
| TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromDevice) | ||
| { | ||
| const auto alloc_size{sizeof(int) * 100}; | ||
| cuda_async_pinned_mr mr{}; | ||
| auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size)); | ||
| ASSERT_NE(nullptr, ptr); | ||
|
|
||
| // Initialize from host | ||
| for (int i = 0; i < 100; ++i) { | ||
| ptr[i] = i; | ||
| } | ||
|
|
||
| // Allocate device memory and copy from pinned -> device -> back to verify device access | ||
| int* d_ptr{}; | ||
| EXPECT_EQ(cudaMalloc(&d_ptr, alloc_size), cudaSuccess); | ||
| EXPECT_EQ(cudaMemcpy(d_ptr, ptr, alloc_size, cudaMemcpyDefault), cudaSuccess); | ||
|
|
||
| int result[100]; | ||
| EXPECT_EQ(cudaMemcpy(result, d_ptr, alloc_size, cudaMemcpyDefault), cudaSuccess); | ||
|
|
||
| for (int i = 0; i < 100; ++i) { | ||
| EXPECT_EQ(result[i], i); | ||
| } | ||
|
|
||
| cudaFree(d_ptr); | ||
| mr.deallocate_sync(ptr, alloc_size); | ||
| } | ||
|
|
||
| } // namespace | ||
| } // namespace rmm::test | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like all the test cases can be parameterized/ templated for both sync and async allocation and deallocation operations