rapidsai · nirandaperera · Apr 24, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
@@ -12,12 +12,18 @@
 #include <cuda/memory_resource>
 
 #include <cstddef>
+#include <memory>
 #include <mutex>
 #include <utility>
 #include <vector>
 
 namespace RMM_NAMESPACE {
 namespace mr {
+
+// Forward declarations for friend access from detail::fixed_size_memory_resource_impl
+class fixed_size_memory_resource;
+class multiple_blocks_allocation;
+
 namespace detail {
 
 /**
@@ -79,6 +85,12 @@ class fixed_size_memory_resource_impl final
   std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks);
 
  private:
+  friend class RMM_NAMESPACE::mr::multiple_blocks_allocation;
+
+  // Caller must hold get_mutex().
+  [[nodiscard]] cudaError_t deallocate_blocks_async_unsafe(std::vector<std::byte*>&& blocks,
+                                                           cuda_stream_view stream);
+
   free_list blocks_from_upstream(cuda_stream_view stream);
 
   void release();

@@ -290,7 +290,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource> {
     bool operator<(stream_event_pair const& rhs) const { return event < rhs.event; }
   };
 
- private:
+ protected:
   /**
    * @brief get a unique CUDA event (possibly new) associated with `stream`
    *

@@ -9,8 +9,12 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/memory_resource>
+#include <cuda/std/span>
+#include <cuda/stream_ref>
 
 #include <cstddef>
+#include <memory>
+#include <vector>
 
 namespace RMM_NAMESPACE {
 namespace mr {
@@ -84,6 +88,150 @@ class RMM_EXPORT fixed_size_memory_resource
 static_assert(cuda::mr::resource_with<fixed_size_memory_resource, cuda::mr::device_accessible>,
               "fixed_size_memory_resource does not satisfy the cuda::mr::resource concept");
 
+/**
+ * @brief RAII handle for an allocation that may span multiple fixed-size blocks from a
+ *        `fixed_size_memory_resource`.
+ *
+ * When destroyed, all blocks are returned to the memory resource on the same stream used for
+ * allocation. Copy is disabled to prevent double deallocation; move transfers ownership of the
+ * blocks. Holds a `fixed_size_memory_resource` (which has shared, refcounted ownership of the
+ * underlying pool) so the pool outlives the handle.
+ *
+ * The blocks are exposed as independent pointers (see `get_blocks()`). NOTE(review): nothing in
+ * this interface indicates the blocks are contiguous in memory, so callers should presumably
+ * address each block separately — confirm.
+ *
+ * Only a `cuda::stream_ref` is stored for the stream. NOTE(review): the stream presumably has to
+ * remain valid until the handle is cleared or destroyed — confirm.
+ */
+class RMM_EXPORT multiple_blocks_allocation {
+ public:
+  /**
+   * @brief Allocate device memory spanning one or more fixed-size blocks, stream-ordered on a
+   * non-PTDS stream.
+   *
+   * Use this for allocations larger than a single block. The allocation is ordered on
+   * `stream`; deallocation (when the returned handle is destroyed) is also ordered on
+   * the same stream. A single event is recorded for the whole allocation, so there is no
+   * per-block event overhead.
+   *
+   * @param mr The `fixed_size_memory_resource` that supplies blocks. Copied by value since
+   *        `fixed_size_memory_resource` has refcounted shared ownership.
+   * @param size Requested number of bytes. Enough whole blocks are taken to cover `size`, so
+   *        `capacity()` is `size` rounded up to a multiple of the block size (see
+   *        `mr.get_block_size()`), while `size()` keeps reporting the requested value.
+   * @param stream A non-PTDS CUDA stream on which the allocation is ordered.
+   * @return Unique handle to the allocation; destroys to deallocate. Empty (zero-size)
+   *         allocation returns a valid handle with size 0 and no blocks.
+   * @throw rmm::invalid_argument if `stream` is a per-thread default stream.
+   * @throw Any exception from allocating blocks. Blocks successfully taken from the pool
+   *        before the failure are returned to the pool on `stream` (same ordering as normal
+   *        deallocation).
+   */
+  [[nodiscard]] static std::unique_ptr<multiple_blocks_allocation> make_async(
+    fixed_size_memory_resource mr, std::size_t size, cuda::stream_ref stream);
+
+  /**
+   * @brief Destroy this handle and return any held blocks to the pool.
+   *
+   * `noexcept`. Uses `deallocate_blocks_async_unsafe` under the pool mutex; CUDA errors are
+   * logged with `RMM_LOG_ERROR` and other exceptions during teardown are caught and logged.
+   */
+  ~multiple_blocks_allocation() noexcept;
+
+  multiple_blocks_allocation(multiple_blocks_allocation const&)            = delete;
+  multiple_blocks_allocation& operator=(multiple_blocks_allocation const&) = delete;
+
+  /**
+   * @brief Move-constructor
+   *
+   * Takes over the blocks, requested size, stream and memory resource of `other`. Afterwards
+   * `other` holds no blocks and reports `size() == 0`.
+   *
+   * NOTE(review): the memory resource member of `other` is moved from as well; whether
+   * `block_size()`, `capacity()`, `operator[]` and `at()` may still be called on `other` depends
+   * on the moved-from state of `fixed_size_memory_resource` — confirm before relying on it.
+   *
+   * @param other Source handle to move from.
+   */
+  multiple_blocks_allocation(multiple_blocks_allocation&& other) noexcept;
+
+  /**
+   * @brief Move-assignment
+   *
+   * Blocks currently held by `*this` are first returned to the pool via `clear()`, then the
+   * state of `other` is taken over. Self-assignment is a no-op.
+   *
+   * @param other Source handle to move from.
+   * @return Reference to `*this`.
+   * @throw rmm::cuda_error if returning the current blocks to the pool fails during `clear()`.
+   */
+  multiple_blocks_allocation& operator=(multiple_blocks_allocation&& other);
+
+  /**
+   * @brief Number of bytes requested for this allocation.
+   *
+   * This is the value passed to `make_async`, not the rounded-up capacity.
+   *
+   * @return Requested size in bytes.
+   */
+  [[nodiscard]] constexpr std::size_t size() const noexcept { return size_; }
+
+  /**
+   * @brief Total capacity in bytes (number of blocks × block size).
+   *
+   * @return Capacity in bytes; always >= size().
+   */
+  [[nodiscard]] std::size_t capacity() const noexcept { return block_size() * blocks_.size(); }
+
+  /**
+   * @brief Size in bytes of each block in this allocation.
+   *
+   * @return Block size (same as the memory resource's get_block_size()).
+   */
+  [[nodiscard]] std::size_t block_size() const noexcept { return mr_->get_block_size(); }
+
+  /**
+   * @brief Non-owning view of the underlying block pointers.
+   *
+   * The view refers to storage owned by this handle, so it must not be used after the handle
+   * is cleared, moved from, or destroyed.
+   *
+   * @return Span of device pointers, one per block; each block has size block_size().
+   */
+  [[nodiscard]] cuda::std::span<std::byte* const> get_blocks() const noexcept
+  {
+    return {blocks_.data(), blocks_.size()};
+  }
+
+  /**
+   * @brief Span over the i-th block's bytes.
+   *
+   * No bounds checking is performed; use `at()` for a checked variant.
+   *
+   * @param i Block index in [0, get_blocks().size()).
+   * @return Span of std::byte over the i-th block.
+   */
+  [[nodiscard]] cuda::std::span<std::byte> operator[](std::size_t i) const
+  {
+    return {blocks_[i], mr_->get_block_size()};
+  }
+
+  /**
+   * @brief Span over the i-th block's bytes with bounds checking.
+   *
+   * @param i Block index.
+   * @return Span of std::byte over the i-th block.
+   * @throws std::out_of_range if i >= number of blocks.
+   */
+  [[nodiscard]] cuda::std::span<std::byte> at(std::size_t i) const
+  {
+    return {blocks_.at(i), mr_->get_block_size()};
+  }
+
+  /**
+   * @brief Stream on which this allocation is ordered.
+   *
+   * @return The stream passed to make_async.
+   */
+  [[nodiscard]] constexpr cuda::stream_ref stream() const noexcept { return stream_; }
+
+  /**
+   * @brief Return all blocks to the pool on `stream()`, then leave this handle empty.
+   *
+   * Same ordering as destruction: stream-ordered deallocation on the stream passed to
+   * `make_async`. After `clear()`, `size()` is 0 and `get_blocks()` is empty.
+   *
+   * @throw rmm::cuda_error if the event recording fails.
+   */
+  void clear();
+
+ private:
+  multiple_blocks_allocation(std::size_t size,
+                             std::vector<std::byte*> buffers,
+                             cuda::stream_ref stream,
+                             fixed_size_memory_resource mr) noexcept;
+
+  std::vector<std::byte*> blocks_;  ///< One pointer per held block; empty when nothing is held
+  std::size_t size_;                ///< Requested size in bytes (not rounded to block size)
+  cuda::stream_ref stream_;         ///< Stream used for allocation and for returning the blocks
+  fixed_size_memory_resource mr_;   ///< Resource the blocks came from and are returned to
+};
+
 /** @} */  // end of group
 }  // namespace mr
 }  // namespace RMM_NAMESPACE
diff --git a/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp b/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
@@ -7,8 +7,11 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/logging_assert.hpp>
 #include <rmm/mr/detail/fixed_size_memory_resource_impl.hpp>
+#include <rmm/mr/detail/stream_ordered_memory_resource.hpp>
 
 #include <cuda/iterator>
+#include <cuda/std/algorithm>
+#include <cuda_runtime_api.h>
 
 #include <cstddef>
 #include <mutex>
@@ -107,6 +110,23 @@ std::pair<std::size_t, std::size_t> fixed_size_memory_resource_impl::free_list_s
                            : std::make_pair(block_size_, blocks.size() * block_size_);
 }
 
+/**
+ * @brief Return a batch of blocks to this resource's free list, ordered on `stream`.
+ *
+ * The caller must already hold `get_mutex()`; this function does not lock.
+ *
+ * On success the blocks have been handed to `insert_blocks()` and `blocks` is left empty, so
+ * the caller cannot accidentally return the same pointers a second time. On failure `blocks`
+ * is left untouched and nothing is inserted into the free list.
+ *
+ * @param blocks Pointers previously obtained from this resource, one per block. Emptied on
+ *        success.
+ * @param stream Stream on which the event for the returned blocks is recorded.
+ * @return `cudaSuccess` (also for an empty `blocks`), otherwise the error reported by
+ *         `cudaEventRecord`.
+ */
+cudaError_t fixed_size_memory_resource_impl::deallocate_blocks_async_unsafe(
+  std::vector<std::byte*>&& blocks, cuda_stream_view stream)
+{
+  if (blocks.empty()) { return cudaSuccess; }
+
+  // Wrap every raw pointer as a free block of the resource's fixed block size.
+  free_list blocks_free_list;
+  for (std::byte* ptr : blocks) {
+    blocks_free_list.insert(this->free_block(ptr, get_block_size()));
+  }
+
+  // A single event covers the whole batch.
+  auto stream_event       = get_event(stream);
+  cudaError_t const error = cudaEventRecord(stream_event.event, stream.value());
+  if (cudaSuccess != error) { return error; }
+  this->insert_blocks(std::move(blocks_free_list), stream);
+
+  // Binding to an rvalue reference does not move anything out of the caller's vector, so
+  // consume it explicitly: stale pointers left behind could otherwise be returned twice.
+  blocks.clear();
+  return cudaSuccess;
+}
+
 #ifdef RMM_DEBUG_PRINT
 void fixed_size_memory_resource_impl::print()
 {

diff --git a/cpp/src/mr/fixed_size_memory_resource.cpp b/cpp/src/mr/fixed_size_memory_resource.cpp
@@ -3,9 +3,17 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/detail/logging_assert.hpp>
+#include <rmm/logger.hpp>
 #include <rmm/mr/fixed_size_memory_resource.hpp>
 
+#include <cuda/cmath>
+
 #include <cstddef>
+#include <mutex>
+#include <vector>
 
 namespace RMM_NAMESPACE {
 namespace mr {
@@ -29,5 +37,90 @@ std::size_t fixed_size_memory_resource::get_block_size() const noexcept
   return get().get_block_size();
 }
 
+// multiple_blocks_allocation
+
+// Private constructor used by make_async(): adopts blocks that were already taken from `mr`
+// and remembers the requested size and the stream they are ordered on.
+multiple_blocks_allocation::multiple_blocks_allocation(std::size_t size,
+                                                       std::vector<std::byte*> buffers,
+                                                       cuda::stream_ref stream,
+                                                       fixed_size_memory_resource mr) noexcept
+  : blocks_(std::move(buffers)),
+    size_(size),
+    stream_(stream),
+    mr_(std::move(mr))
+{
+}
+
+multiple_blocks_allocation::multiple_blocks_allocation(multiple_blocks_allocation&& other) noexcept
+  : blocks_(std::move(other.blocks_)),
+    size_(other.size_),
+    stream_(other.stream_),
+    mr_(std::move(other.mr_))
+{
+  other.size_ = 0;
+}
+
+// Return every held block to the pool on `stream_` and leave this handle empty.
+//
+// Throws rmm::cuda_error (via RMM_CUDA_TRY) if the pool reports a CUDA error; in that case
+// the block list and the requested size are left unchanged. When no blocks are held only the
+// requested size is reset.
+void multiple_blocks_allocation::clear()
+{
+  if (!blocks_.empty()) {
+    // deallocate_blocks_async_unsafe requires the pool mutex to be held by the caller.
+    std::lock_guard<std::mutex> lock(mr_->get_mutex());
+    RMM_CUDA_TRY(mr_->deallocate_blocks_async_unsafe(std::move(blocks_), stream_));
+    // Binding `blocks_` to the callee's rvalue-reference parameter does not by itself empty
+    // it. Drop the pointers explicitly so get_blocks() is empty afterwards and a later
+    // clear() or the destructor cannot hand the same blocks back to the pool twice.
+    blocks_.clear();
+  }
+  size_ = 0;
+}
+
+// Move assignment: releases the blocks currently held (clear() may throw), then adopts the
+// state of `other` and resets its requested size.
+multiple_blocks_allocation& multiple_blocks_allocation::operator=(
+  multiple_blocks_allocation&& other)
+{
+  // Assigning a handle to itself must not release its own blocks.
+  if (this == &other) { return *this; }
+
+  clear();
+
+  blocks_ = std::move(other.blocks_);
+  size_   = other.size_;
+  stream_ = other.stream_;
+  mr_     = std::move(other.mr_);
+
+  other.size_ = 0;
+  return *this;
+}
+
+// Destructor: hands any held blocks back to the pool. clear() can throw, but a destructor
+// must not, so every exception is swallowed here and reported through the logger instead.
+multiple_blocks_allocation::~multiple_blocks_allocation() noexcept
+{
+  try {
+    this->clear();
+  } catch (...) {
+    RMM_LOG_ERROR(
+      "multiple_blocks_allocation: exception while releasing device blocks in destructor");
+  }
+}
+
+// Allocate enough fixed-size blocks from `mr` to cover `size` bytes, ordered on `stream`.
+//
+// - Rejects per-thread default streams with rmm::invalid_argument.
+// - A zero `size` yields a handle without blocks and does not take the pool mutex.
+// - Otherwise the pool mutex is held while one stream event is obtained and
+//   ceil(size / block size) blocks are taken from the pool.
+// - If taking a block throws, the blocks gathered so far are handed back to the pool on a
+//   best-effort basis and the ORIGINAL exception is rethrown. A failure during that rollback
+//   is logged rather than thrown, so it can never replace the allocation error the caller
+//   needs to see.
+std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_async(
+  fixed_size_memory_resource mr, std::size_t size, cuda::stream_ref stream)
+{
+  RMM_EXPECTS(!cuda_stream_view{stream}.is_per_thread_default(),
+              "stream must not be a per-thread default stream",
+              rmm::invalid_argument);
+
+  if (size == 0) {
+    return std::unique_ptr<multiple_blocks_allocation>(
+      new multiple_blocks_allocation(0, {}, stream, std::move(mr)));
+  }
+
+  auto& self = *mr;
+  std::lock_guard<std::mutex> lock(self.get_mutex());
+
+  // One event is shared by every block of this allocation.
+  auto stream_event            = self.get_event(stream);
+  std::size_t const block_size = self.get_block_size();
+  std::size_t const num_blocks = cuda::ceil_div(size, block_size);
+  std::vector<std::byte*> blocks;
+  blocks.reserve(num_blocks);
+  try {
+    for (std::size_t i = 0; i < num_blocks; ++i) {
+      blocks.push_back(static_cast<std::byte*>(self.get_block(block_size, stream_event).pointer()));
+    }
+  } catch (...) {
+    // Roll back without letting a secondary failure mask the exception being handled.
+    try {
+      cudaError_t const status = self.deallocate_blocks_async_unsafe(std::move(blocks), stream);
+      if (cudaSuccess != status) {
+        // Reset the last-error state so this failure is not re-reported by a later,
+        // unrelated CUDA call.
+        static_cast<void>(cudaGetLastError());
+        RMM_LOG_ERROR(
+          "multiple_blocks_allocation: CUDA error while returning partially allocated blocks");
+      }
+    } catch (...) {
+      RMM_LOG_ERROR(
+        "multiple_blocks_allocation: exception while returning partially allocated blocks");
+    }
+    throw;
+  }
+
+  return std::unique_ptr<multiple_blocks_allocation>(
+    new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
+}
+
 }  // namespace mr
 }  // namespace RMM_NAMESPACE
@@ -145,6 +145,7 @@ ConfigureTest(SYSTEM_MR_REF_TEST mr/mr_ref_system_tests.cpp)
 ConfigureTest(PINNED_MR_REF_TEST mr/mr_ref_pinned_tests.cpp)
 ConfigureTest(LOGGING_MR_REF_TEST mr/mr_ref_logging_tests.cpp)
 ConfigureTest(FIXED_SIZE_MR_REF_TEST mr/mr_ref_fixed_size_tests.cpp)
+ConfigureTest(FIXED_SIZE_MR_TEST mr/fixed_size_mr_test.cpp)
 ConfigureTest(DEFAULT_MR_REF_TEST mr/mr_ref_default_tests.cpp)
 
 # general adaptor tests