Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,18 @@
#include <cuda/memory_resource>

#include <cstddef>
#include <memory>
#include <mutex>
#include <utility>
#include <vector>

namespace RMM_NAMESPACE {
namespace mr {

// Forward declarations for friend access from detail::fixed_size_memory_resource_impl
class fixed_size_memory_resource;
class multiple_blocks_allocation;

namespace detail {

/**
Expand Down Expand Up @@ -79,6 +85,12 @@ class fixed_size_memory_resource_impl final
std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks);

private:
friend class RMM_NAMESPACE::mr::multiple_blocks_allocation;

// Caller must hold get_mutex().
[[nodiscard]] cudaError_t deallocate_blocks_async_unsafe(std::vector<std::byte*>&& blocks,
cuda_stream_view stream);

free_list blocks_from_upstream(cuda_stream_view stream);

void release();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource> {
bool operator<(stream_event_pair const& rhs) const
{
  // Ordering is defined by the `event` member alone.
  return event < rhs.event;
}
};

private:
protected:
/**
* @brief get a unique CUDA event (possibly new) associated with `stream`
*
Expand Down
148 changes: 148 additions & 0 deletions cpp/include/rmm/mr/fixed_size_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@
#include <rmm/resource_ref.hpp>

#include <cuda/memory_resource>
#include <cuda/std/span>
#include <cuda/stream_ref>

#include <cstddef>
#include <memory>
#include <vector>

namespace RMM_NAMESPACE {
namespace mr {
Expand Down Expand Up @@ -84,6 +88,150 @@ class RMM_EXPORT fixed_size_memory_resource
static_assert(cuda::mr::resource_with<fixed_size_memory_resource, cuda::mr::device_accessible>,
"fixed_size_memory_resource does not satisfy the cuda::mr::resource concept");

/**
* @brief RAII handle for an allocation that may span multiple fixed-size blocks from a
* `fixed_size_memory_resource`.
*
* When destroyed, all blocks are returned to the memory resource on the same stream used for
* allocation. Copy is disabled to prevent double deallocation; move transfers ownership of the
* blocks. Holds a `fixed_size_memory_resource` (which has shared, refcounted ownership of the
* underlying pool) so the pool outlives the handle.
*/
class RMM_EXPORT multiple_blocks_allocation {
public:
/**
* @brief Allocate device memory spanning one or more fixed-size blocks, stream-ordered on a
Comment thread
wence- marked this conversation as resolved.
* non-PTDS stream.
*
* Use this for allocations larger than a single block. The allocation is ordered on
* `stream`; deallocation (when the returned handle is destroyed) is also ordered on
* the same stream. A single event is recorded for the whole allocation, so there is no
* per-block event overhead.
*
* @param mr The `fixed_size_memory_resource` that supplies blocks. Copied by value since
* `fixed_size_memory_resource` has refcounted shared ownership.
* @param size Minimum number of bytes to allocate. Will be rounded up to a multiple of
* block size (see `mr.get_block_size()`).
* @param stream A non-PTDS CUDA stream on which the allocation is ordered.
* @return Unique handle to the allocation; destroying the handle deallocates the blocks. A
* zero-size request returns a valid handle with size 0 and no blocks.
* @throw rmm::invalid_argument if `stream` is a per-thread default stream.
* @throw Any exception from allocating blocks. Blocks successfully taken from the pool
* before the failure are returned to the pool on `stream` (same ordering as normal
* deallocation).
*/
[[nodiscard]] static std::unique_ptr<multiple_blocks_allocation> make_async(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need a factory instead of a normal constructor?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was previously inside the fixed-size MR class, where I thought a factory method was the best fit. When I pulled it out, I left it as is because I felt it's more idiomatic: we can validate arguments and throw cleanly (I should remove the RMM_EXPECTS statements in the constructor). But I am fine either way. WDYT @bdice ?

fixed_size_memory_resource mr, std::size_t size, cuda::stream_ref stream);

/**
* @brief Destroy this handle and return any held blocks to the pool.
*
* `noexcept`. Uses `deallocate_blocks_async_unsafe` under the pool mutex; CUDA errors are
* logged with `RMM_LOG_ERROR` and other exceptions during teardown are caught and logged.
*/
~multiple_blocks_allocation() noexcept;

multiple_blocks_allocation(multiple_blocks_allocation const&) = delete;
multiple_blocks_allocation& operator=(multiple_blocks_allocation const&) = delete;

/**
* @brief Move-constructor
*
* @param other Source handle to move from.
*/
multiple_blocks_allocation(multiple_blocks_allocation&& other) noexcept;

/**
* @brief Move-assignment
* @param other Source handle to move from.
* @return Reference to `*this`.
* @throw rmm::cuda_error if returning the current blocks to the pool fails during `clear()`.
*/
multiple_blocks_allocation& operator=(multiple_blocks_allocation&& other);

/**
* @brief Number of bytes requested for this allocation.
*
* @return Requested size in bytes.
*/
[[nodiscard]] constexpr std::size_t size() const noexcept
{
  // Byte count stored at construction; set to 0 by clear() and when this handle is moved from.
  return size_;
}

/**
* @brief Total capacity in bytes (number of blocks × block size).
*
* @return Capacity in bytes; always >= size().
*/
[[nodiscard]] std::size_t capacity() const noexcept
{
  // A handle with no blocks has zero capacity whatever the block size is. Answering here
  // avoids going through mr_, which has been moved out of when this handle was moved from.
  // NOTE(review): assumes a moved-from fixed_size_memory_resource must not be dereferenced —
  // confirm against its definition.
  if (blocks_.empty()) { return 0; }
  return block_size() * blocks_.size();
}

/**
* @brief Size in bytes of each block in this allocation.
*
* @return Block size (same as the memory resource's get_block_size()).
*/
[[nodiscard]] std::size_t block_size() const noexcept
{
  // Forwarded from the memory resource held by this handle.
  return mr_->get_block_size();
}

/**
* @brief Non-owning view of the underlying block pointers.
*
* @return Span of device pointers, one per block; each block has size block_size().
*/
[[nodiscard]] cuda::std::span<std::byte* const> get_blocks() const noexcept
{
  // View over the stored pointer array; the handle keeps ownership of the blocks.
  return cuda::std::span<std::byte* const>{blocks_.data(), blocks_.size()};
}

/**
* @brief Span over the i-th block's bytes.
*
* @param i Block index in [0, get_blocks().size()).
* @return Span of std::byte over the i-th block.
*/
[[nodiscard]] cuda::std::span<std::byte> operator[](std::size_t i) const
{
  // Unchecked lookup of the i-th block pointer; the span length is the resource's block size.
  std::byte* const block_begin = blocks_[i];
  return cuda::std::span<std::byte>{block_begin, mr_->get_block_size()};
}

/**
* @brief Span over the i-th block's bytes with bounds checking.
*
* @param i Block index.
* @return Span of std::byte over the i-th block.
* @throws std::out_of_range if i >= number of blocks.
*/
[[nodiscard]] cuda::std::span<std::byte> at(std::size_t i) const
{
  // std::vector::at performs the bounds check and throws std::out_of_range on a bad index.
  std::byte* const block_begin = blocks_.at(i);
  return cuda::std::span<std::byte>{block_begin, mr_->get_block_size()};
}

/**
* @brief Stream on which this allocation is ordered.
*
* @return The stream passed to make_async.
*/
[[nodiscard]] constexpr cuda::stream_ref stream() const noexcept
{
  // The stream reference stored at construction.
  return stream_;
}

/**
* @brief Return all blocks to the pool on `stream()`, then leave this handle empty.
*
* Same ordering as destruction: stream-ordered deallocation on the stream passed to
* `make_async`. After `clear()`, `size()` is 0 and `get_blocks()` is empty.
*
* @throw rmm::cuda_error if the event recording fails.
*/
void clear();

private:
multiple_blocks_allocation(std::size_t size,
std::vector<std::byte*> buffers,
cuda::stream_ref stream,
fixed_size_memory_resource mr) noexcept;

std::vector<std::byte*> blocks_;
std::size_t size_;
cuda::stream_ref stream_;
fixed_size_memory_resource mr_;
};
Comment thread
nirandaperera marked this conversation as resolved.

/** @} */ // end of group
} // namespace mr
} // namespace RMM_NAMESPACE
20 changes: 20 additions & 0 deletions cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/mr/detail/fixed_size_memory_resource_impl.hpp>
#include <rmm/mr/detail/stream_ordered_memory_resource.hpp>

#include <cuda/iterator>
#include <cuda/std/algorithm>
#include <cuda_runtime_api.h>

#include <cstddef>
#include <mutex>
Expand Down Expand Up @@ -107,6 +110,23 @@ std::pair<std::size_t, std::size_t> fixed_size_memory_resource_impl::free_list_s
: std::make_pair(block_size_, blocks.size() * block_size_);
}

/**
 * @brief Return a batch of blocks to the pool, ordered on `stream`.
 *
 * The caller must already hold get_mutex(). Every pointer in `blocks` is wrapped with
 * `free_block` and gathered into a temporary free list. The event obtained from
 * `get_event(stream)` is then recorded on `stream`, and only if that succeeds is the list
 * handed to `insert_blocks`.
 *
 * @param blocks Block pointers to return. On success the vector is emptied, so a caller that
 *               keeps using the same vector cannot hand the same blocks back a second time.
 *               On failure it is left untouched.
 * @param stream Stream on which the event is recorded and the blocks are inserted.
 * @return `cudaSuccess`, or the error reported by `cudaEventRecord` (in which case no blocks
 *         were inserted into the pool).
 */
cudaError_t fixed_size_memory_resource_impl::deallocate_blocks_async_unsafe(
  std::vector<std::byte*>&& blocks, cuda_stream_view stream)
{
  if (blocks.empty()) { return cudaSuccess; }

  auto const block_size = get_block_size();
  free_list blocks_free_list;
  for (std::byte* ptr : blocks) {
    blocks_free_list.insert(this->free_block(ptr, block_size));
  }

  auto stream_event       = get_event(stream);
  cudaError_t const error = cudaEventRecord(stream_event.event, stream.value());
  if (cudaSuccess != error) { return error; }
  this->insert_blocks(std::move(blocks_free_list), stream);

  // The parameter is an rvalue reference that was only iterated, never moved from. Empty it
  // explicitly so ownership is visibly transferred and the caller cannot double-return.
  blocks.clear();
  return cudaSuccess;
}

#ifdef RMM_DEBUG_PRINT
void fixed_size_memory_resource_impl::print()
{
Expand Down
93 changes: 93 additions & 0 deletions cpp/src/mr/fixed_size_memory_resource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,17 @@
* SPDX-License-Identifier: Apache-2.0
*/

#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/fixed_size_memory_resource.hpp>

#include <cuda/cmath>

#include <cstddef>
#include <mutex>
#include <vector>

namespace RMM_NAMESPACE {
namespace mr {
Expand All @@ -29,5 +37,90 @@ std::size_t fixed_size_memory_resource::get_block_size() const noexcept
return get().get_block_size();
}

// multiple_blocks_allocation

// Private constructor used by make_async(): stores the requested byte count, takes ownership
// of the already-acquired block pointers, and keeps the stream and memory-resource handle that
// clear() later uses to return the blocks. Performs no allocation or validation itself.
multiple_blocks_allocation::multiple_blocks_allocation(std::size_t size,
std::vector<std::byte*> buffers,
cuda::stream_ref stream,
fixed_size_memory_resource mr) noexcept
: blocks_(std::move(buffers)), size_(size), stream_(stream), mr_(std::move(mr))
{
}

// Move constructor: takes over `other`'s block pointers and memory-resource handle, and copies
// its requested size and stream.
multiple_blocks_allocation::multiple_blocks_allocation(multiple_blocks_allocation&& other) noexcept
: blocks_(std::move(other.blocks_)),
size_(other.size_),
stream_(other.stream_),
mr_(std::move(other.mr_))
{
// The source no longer owns any bytes; make its size() report 0.
other.size_ = 0;
}

/**
 * @brief Return every held block to the pool on `stream_` and leave the handle empty.
 *
 * Takes the pool mutex and calls `deallocate_blocks_async_unsafe`. After a successful return
 * both `blocks_` and `size_` are reset, so a second `clear()` (including the one issued by the
 * destructor) is a no-op instead of returning the same blocks again.
 *
 * @throw rmm::cuda_error (via RMM_CUDA_TRY) if the deallocation reports a CUDA error. In that
 *        case the handle still holds its blocks and size.
 */
void multiple_blocks_allocation::clear()
{
  if (!blocks_.empty()) {
    std::lock_guard<std::mutex> lock(mr_->get_mutex());
    RMM_CUDA_TRY(mr_->deallocate_blocks_async_unsafe(std::move(blocks_), stream_));
    // std::move is only a cast and the callee merely iterates the vector, so the pointers
    // would otherwise still be here. Drop them now that the pool owns the blocks again.
    blocks_.clear();
  }
  size_ = 0;
}

// Move assignment: returns the blocks currently held (which may throw from clear()), then
// takes over `other`'s blocks, size, stream and memory-resource handle.
multiple_blocks_allocation& multiple_blocks_allocation::operator=(
  multiple_blocks_allocation&& other)
{
  // Self-assignment leaves the handle exactly as it is.
  if (this == &other) { return *this; }

  // Give back whatever this handle owns before overwriting its members.
  clear();

  blocks_ = std::move(other.blocks_);
  size_   = other.size_;
  stream_ = other.stream_;
  mr_     = std::move(other.mr_);

  // The source no longer owns any bytes.
  other.size_ = 0;
  return *this;
}

// Destructor: returns any held blocks by calling clear(). Because the destructor is noexcept,
// anything clear() throws is caught here and reported through RMM_LOG_ERROR instead of
// propagating.
multiple_blocks_allocation::~multiple_blocks_allocation() noexcept
{
try {
clear();
} catch (...) {
// Swallow deliberately: letting an exception escape a noexcept destructor would terminate.
RMM_LOG_ERROR(
"multiple_blocks_allocation: exception while releasing device blocks in destructor");
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

/**
 * @brief Acquire enough fixed-size blocks from `mr` to cover `size` bytes, ordered on `stream`.
 *
 * Rejects per-thread default streams. A zero-size request returns a handle with no blocks
 * without taking the pool mutex. Otherwise the pool mutex is held while one event is obtained
 * for `stream` and `ceil_div(size, block_size)` blocks are taken from the pool.
 *
 * If taking a block throws, the blocks acquired so far are handed back to the pool on `stream`
 * and the original exception is rethrown. A failure while handing them back is logged rather
 * than thrown, so it can never replace the exception that describes the real failure.
 *
 * @param mr     Memory resource supplying the blocks; moved into the returned handle.
 * @param size   Minimum number of bytes to cover.
 * @param stream Non-PTDS stream on which allocation and later deallocation are ordered.
 * @return Owning handle to the acquired blocks.
 * @throw rmm::invalid_argument if `stream` is a per-thread default stream.
 * @throw Any exception raised while acquiring blocks.
 */
std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_async(
  fixed_size_memory_resource mr, std::size_t size, cuda::stream_ref stream)
{
  RMM_EXPECTS(!cuda_stream_view{stream}.is_per_thread_default(),
              "stream must not be a per-thread default stream",
              rmm::invalid_argument);

  if (size == 0) {
    return std::unique_ptr<multiple_blocks_allocation>(
      new multiple_blocks_allocation(0, {}, stream, std::move(mr)));
  }

  auto& self = *mr;
  std::lock_guard<std::mutex> lock(self.get_mutex());

  auto stream_event            = self.get_event(stream);
  std::size_t const block_size = self.get_block_size();
  std::size_t const num_blocks = cuda::ceil_div(size, block_size);
  std::vector<std::byte*> blocks;
  blocks.reserve(num_blocks);
  try {
    for (std::size_t i = 0; i < num_blocks; ++i) {
      blocks.push_back(static_cast<std::byte*>(self.get_block(block_size, stream_event).pointer()));
    }
  } catch (...) {
    // Best-effort cleanup. It must not throw: an exception escaping from here would replace
    // the allocation failure that the caller needs to see.
    try {
      if (cudaSuccess != self.deallocate_blocks_async_unsafe(std::move(blocks), stream)) {
        RMM_LOG_ERROR(
          "multiple_blocks_allocation: failed to return partially allocated blocks to the pool");
      }
    } catch (...) {
      RMM_LOG_ERROR(
        "multiple_blocks_allocation: exception while returning partially allocated blocks to the "
        "pool");
    }
    throw;  // rethrows the original allocation exception
  }

  // The constructor is private, so std::make_unique cannot be used here.
  return std::unique_ptr<multiple_blocks_allocation>(
    new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

} // namespace mr
} // namespace RMM_NAMESPACE
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ ConfigureTest(SYSTEM_MR_REF_TEST mr/mr_ref_system_tests.cpp)
ConfigureTest(PINNED_MR_REF_TEST mr/mr_ref_pinned_tests.cpp)
ConfigureTest(LOGGING_MR_REF_TEST mr/mr_ref_logging_tests.cpp)
ConfigureTest(FIXED_SIZE_MR_REF_TEST mr/mr_ref_fixed_size_tests.cpp)
ConfigureTest(FIXED_SIZE_MR_TEST mr/fixed_size_mr_test.cpp)
ConfigureTest(DEFAULT_MR_REF_TEST mr/mr_ref_default_tests.cpp)

# general adaptor tests
Expand Down
Loading
Loading