From 2ebec2bdfa700df06b6131412b5990f942fa372b Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 23 Apr 2026 18:08:27 -0700
Subject: [PATCH 1/6] adding multiblock allocations

Signed-off-by: niranda perera <niranda.perera@gmail.com>
---
 .../fixed_size_memory_resource_impl.hpp       |  11 ++
 .../detail/stream_ordered_memory_resource.hpp |   2 +-
 .../rmm/mr/fixed_size_memory_resource.hpp     | 117 +++++++++++++++++
 .../fixed_size_memory_resource_impl.cpp       |  19 +++
 cpp/src/mr/fixed_size_memory_resource.cpp     |  55 ++++++++
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/mr/fixed_size_mr_test.cpp           | 118 ++++++++++++++++++
 7 files changed, 322 insertions(+), 1 deletion(-)
 create mode 100644 cpp/tests/mr/fixed_size_mr_test.cpp
diff --git a/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp b/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
index c651f5077..49580d182 100644
--- a/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
+++ b/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
@@ -12,12 +12,18 @@
 #include <cuda/memory_resource>
 
 #include <cstddef>
+#include <memory>
 #include <mutex>
 #include <utility>
 #include <vector>
 
 namespace RMM_NAMESPACE {
 namespace mr {
+
+// Forward declarations for friend access from detail::fixed_size_memory_resource_impl
+class fixed_size_memory_resource;
+class multiple_blocks_allocation;
+
 namespace detail {
 
 /**
@@ -79,6 +85,11 @@ class fixed_size_memory_resource_impl final
   std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks);
 
  private:
+  friend class RMM_NAMESPACE::mr::multiple_blocks_allocation;
+
+  // Caller must hold get_mutex().
+  void deallocate_blocks_async_unsafe(std::vector<std::byte*>&& blocks, cuda_stream_view stream);
+
   free_list blocks_from_upstream(cuda_stream_view stream);
 
   void release();
diff --git a/cpp/include/rmm/mr/detail/stream_ordered_memory_resource.hpp b/cpp/include/rmm/mr/detail/stream_ordered_memory_resource.hpp
index e8de8b65e..8da377a16 100644
--- a/cpp/include/rmm/mr/detail/stream_ordered_memory_resource.hpp
+++ b/cpp/include/rmm/mr/detail/stream_ordered_memory_resource.hpp
@@ -290,7 +290,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource> {
     bool operator<(stream_event_pair const& rhs) const { return event < rhs.event; }
   };
 
- private:
+ protected:
   /**
    * @brief get a unique CUDA event (possibly new) associated with `stream`
    *
diff --git a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
index 5ba97813d..140c1fd0e 100644
--- a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
+++ b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
@@ -4,13 +4,17 @@
  */
 #pragma once
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/export.hpp>
 #include <rmm/mr/detail/fixed_size_memory_resource_impl.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/memory_resource>
+#include <cuda/std/span>
 
 #include <cstddef>
+#include <memory>
+#include <vector>
 
 namespace RMM_NAMESPACE {
 namespace mr {
@@ -84,6 +88,119 @@ class RMM_EXPORT fixed_size_memory_resource
 static_assert(cuda::mr::resource_with<fixed_size_memory_resource, cuda::mr::device_accessible>,
               "fixed_size_memory_resource does not satisfy the cuda::mr::resource concept");
 
+/**
+ * @brief RAII handle for an allocation that may span multiple fixed-size blocks from a
+ *        `fixed_size_memory_resource`.
+ *
+ * Returned by `allocate_blocks_async`. When destroyed, all blocks are returned to the
+ * memory resource on the same stream used for allocation. Move and copy are disabled to
+ * prevent double deallocation. Holds a `fixed_size_memory_resource` (which has shared,
+ * refcounted ownership of the underlying pool) so the pool outlives the handle.
+ */
+class RMM_EXPORT multiple_blocks_allocation {
+ public:
+  /**
+   * @brief Allocate device memory spanning one or more fixed-size blocks, stream-ordered.
+   *
+   * Use this for allocations larger than a single block. The allocation is ordered on
+   * `stream`; deallocation (when the returned handle is destroyed) is also ordered on
+   * the same stream. A single event is recorded for the whole allocation, so there is no
+   * per-block event overhead.
+   *
+   * @param mr The `fixed_size_memory_resource` that supplies blocks. Copied by value since
+   *        `fixed_size_memory_resource` has refcounted shared ownership.
+   * @param size Minimum number of bytes to allocate. Will be rounded up to a multiple of
+   *        block size (see `get_block_size()` on `*mr`).
+   * @param stream CUDA stream on which the allocation is ordered.
+   * @return Unique handle to the allocation; destroys to deallocate. Empty (zero-size)
+   *         allocation returns a valid handle with size 0 and no blocks.
+   * @throw Any exception from allocating blocks. Blocks successfully taken from the pool
+   *        before the failure are returned to the pool on `stream` (same ordering as normal
+   *        deallocation).
+   */
+  [[nodiscard]] static std::unique_ptr<multiple_blocks_allocation> make_async(
+    fixed_size_memory_resource mr, std::size_t size, cuda_stream_view stream);
+
+  ~multiple_blocks_allocation();
+
+  multiple_blocks_allocation(multiple_blocks_allocation const&)            = delete;
+  multiple_blocks_allocation& operator=(multiple_blocks_allocation const&) = delete;
+  multiple_blocks_allocation(multiple_blocks_allocation&&)                 = delete;
+  multiple_blocks_allocation& operator=(multiple_blocks_allocation&&)      = delete;
+
+  /**
+   * @brief Number of bytes requested for this allocation.
+   *
+   * @return Requested size in bytes.
+   */
+  constexpr std::size_t size() const noexcept { return size_; }
+
+  /**
+   * @brief Total capacity in bytes (number of blocks × block size).
+   *
+   * @return Capacity in bytes; always >= size().
+   */
+  std::size_t capacity() const noexcept { return block_size() * blocks_.size(); }
+
+  /**
+   * @brief Size in bytes of each block in this allocation.
+   *
+   * @return Block size (same as the memory resource's get_block_size()).
+   */
+  std::size_t block_size() const noexcept { return mr_->get_block_size(); }
+
+  /**
+   * @brief Non-owning view of the underlying block pointers.
+   *
+   * @return Span of device pointers, one per block; each block has size block_size().
+   */
+  [[nodiscard]] cuda::std::span<std::byte* const> get_blocks() const noexcept
+  {
+    return {blocks_.data(), blocks_.size()};
+  }
+
+  /**
+   * @brief Span over the i-th block's bytes.
+   *
+   * @param i Block index in [0, get_blocks().size()).
+   * @return Span of std::byte over the i-th block.
+   */
+  cuda::std::span<std::byte> operator[](std::size_t i) const
+  {
+    return {blocks_[i], mr_->get_block_size()};
+  }
+
+  /**
+   * @brief Span over the i-th block's bytes with bounds checking.
+   *
+   * @param i Block index.
+   * @return Span of std::byte over the i-th block.
+   * @throws std::out_of_range if i >= number of blocks.
+   */
+  cuda::std::span<std::byte> at(std::size_t i) const
+  {
+    return {blocks_.at(i), mr_->get_block_size()};
+  }
+
+  /**
+   * @brief Stream on which this allocation is ordered.
+   *
+   * @return The stream passed to make_async.
+   */
+  constexpr cuda_stream_view stream() const noexcept { return stream_; }
+
+ private:
+  multiple_blocks_allocation(std::size_t size,
+                             std::vector<std::byte*> buffers,
+                             cuda_stream_view stream,
+                             fixed_size_memory_resource mr);
+
+  std::vector<std::byte*> blocks_;
+  std::size_t const size_;
+  cuda_stream_view stream_;
+  fixed_size_memory_resource mr_;
+};
+
 /** @} */  // end of group
 }  // namespace mr
 }  // namespace RMM_NAMESPACE
diff --git a/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp b/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
index e17f6ecdd..27ba9ee62 100644
--- a/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
+++ b/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
@@ -5,10 +5,14 @@
 
 #include <rmm/aligned.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
 #include <rmm/detail/logging_assert.hpp>
 #include <rmm/mr/detail/fixed_size_memory_resource_impl.hpp>
+#include <rmm/mr/detail/stream_ordered_memory_resource.hpp>
 
 #include <cuda/iterator>
+#include <cuda/std/algorithm>
+#include <cuda_runtime_api.h>
 
 #include <cstddef>
 #include <mutex>
@@ -107,6 +111,21 @@ std::pair<std::size_t, std::size_t> fixed_size_memory_resource_impl::free_list_s
                            : std::make_pair(block_size_, blocks.size() * block_size_);
 }
 
+void fixed_size_memory_resource_impl::deallocate_blocks_async_unsafe(
+  std::vector<std::byte*>&& blocks, cuda_stream_view stream)
+{
+  if (blocks.empty()) { return; }
+
+  free_list blocks_free_list;
+  cuda::std::ranges::for_each(blocks, [this, &blocks_free_list](std::byte* ptr) {
+    blocks_free_list.insert(this->free_block(ptr, get_block_size()));
+  });
+
+  auto stream_event = get_event(stream);
+  RMM_ASSERT_CUDA_SUCCESS(cudaEventRecord(stream_event.event, stream.value()));
+  this->insert_blocks(std::move(blocks_free_list), stream);
+}
+
 #ifdef RMM_DEBUG_PRINT
 void fixed_size_memory_resource_impl::print()
 {
diff --git a/cpp/src/mr/fixed_size_memory_resource.cpp b/cpp/src/mr/fixed_size_memory_resource.cpp
index 6f2275d2a..da0cf40d7 100644
--- a/cpp/src/mr/fixed_size_memory_resource.cpp
+++ b/cpp/src/mr/fixed_size_memory_resource.cpp
@@ -3,9 +3,14 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <rmm/detail/logging_assert.hpp>
 #include <rmm/mr/fixed_size_memory_resource.hpp>
 
+#include <cuda/cmath>
+
 #include <cstddef>
+#include <mutex>
+#include <vector>
 
 namespace RMM_NAMESPACE {
 namespace mr {
@@ -29,5 +34,55 @@ std::size_t fixed_size_memory_resource::get_block_size() const noexcept
   return get().get_block_size();
 }
 
+// multiple_blocks_allocation
+
+multiple_blocks_allocation::multiple_blocks_allocation(std::size_t size,
+                                                       std::vector<std::byte*> buffers,
+                                                       cuda_stream_view stream,
+                                                       fixed_size_memory_resource mr)
+  : blocks_(std::move(buffers)), size_(size), stream_(stream), mr_(std::move(mr))
+{
+  RMM_LOGGING_ASSERT(size_ <= mr_->get_block_size() * blocks_.size());
+  RMM_LOGGING_ASSERT(blocks_.empty() ||
+                     blocks_.size() == cuda::ceil_div(size_, mr_->get_block_size()));
+}
+
+multiple_blocks_allocation::~multiple_blocks_allocation()
+{
+  if (!blocks_.empty()) {
+    std::lock_guard<std::mutex> lock(mr_->get_mutex());
+    mr_->deallocate_blocks_async_unsafe(std::move(blocks_), stream_);
+  }
+}
+
+std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_async(
+  fixed_size_memory_resource mr, std::size_t size, cuda_stream_view stream)
+{
+  if (size == 0) {
+    return std::unique_ptr<multiple_blocks_allocation>(
+      new multiple_blocks_allocation(0, {}, stream, std::move(mr)));
+  }
+
+  auto& self = *mr;
+  std::lock_guard<std::mutex> lock(self.get_mutex());
+
+  auto stream_event            = self.get_event(stream);
+  std::size_t const num_blocks = cuda::ceil_div(size, self.get_block_size());
+  std::vector<std::byte*> blocks;
+  blocks.reserve(num_blocks);
+  try {
+    for (std::size_t i = 0; i < num_blocks; ++i) {
+      blocks.push_back(
+        static_cast<std::byte*>(self.get_block(self.get_block_size(), stream_event).pointer()));
+    }
+  } catch (...) {
+    self.deallocate_blocks_async_unsafe(std::move(blocks), stream);
+    throw;
+  }
+
+  return std::unique_ptr<multiple_blocks_allocation>(
+    new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
+}
+
 }  // namespace mr
 }  // namespace RMM_NAMESPACE
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d56a5a258..07e3cd1fe 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -145,6 +145,7 @@ ConfigureTest(SYSTEM_MR_REF_TEST mr/mr_ref_system_tests.cpp)
 ConfigureTest(PINNED_MR_REF_TEST mr/mr_ref_pinned_tests.cpp)
 ConfigureTest(LOGGING_MR_REF_TEST mr/mr_ref_logging_tests.cpp)
 ConfigureTest(FIXED_SIZE_MR_REF_TEST mr/mr_ref_fixed_size_tests.cpp)
+ConfigureTest(FIXED_SIZE_MR_TEST mr/fixed_size_mr_test.cpp)
 ConfigureTest(DEFAULT_MR_REF_TEST mr/mr_ref_default_tests.cpp)
 
 # general adaptor tests
diff --git a/cpp/tests/mr/fixed_size_mr_test.cpp b/cpp/tests/mr/fixed_size_mr_test.cpp
new file mode 100644
index 000000000..420d3c814
--- /dev/null
+++ b/cpp/tests/mr/fixed_size_mr_test.cpp
@@ -0,0 +1,118 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../byte_literals.hpp"
+
+#include <rmm/aligned.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/cuda_async_memory_resource.hpp>
+#include <rmm/mr/cuda_memory_resource.hpp>
+#include <rmm/mr/fixed_size_memory_resource.hpp>
+#include <rmm/mr/statistics_resource_adaptor.hpp>
+
+#include <cuda/cmath>
+#include <cuda/memory_resource>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+namespace rmm::test {
+namespace {
+
+using stats_mr = rmm::mr::statistics_resource_adaptor;
+
+struct FixedSizeMRTestParam {
+  cuda::mr::any_resource<cuda::mr::device_accessible> upstream;
+  std::string name;
+  std::size_t block_size{0};
+  std::size_t size{0};
+};
+
+std::vector<FixedSizeMRTestParam> make_fixed_size_mr_test_params()
+{
+  std::vector<FixedSizeMRTestParam> params;
+
+  auto add_params = [&](cuda::mr::any_resource<cuda::mr::device_accessible> upstream,
+                        std::string const& name) {
+    for (std::size_t block_sz : {std::size_t{256}, std::size_t{1_KiB}}) {
+      for (std::size_t size : {std::size_t{0}, block_sz / 2, block_sz, std::size_t{3_KiB}}) {
+        params.emplace_back(upstream, name, block_sz, size);
+      }
+    }
+  };
+
+  add_params(rmm::mr::cuda_memory_resource{}, "Cuda");
+  add_params(rmm::mr::cuda_async_memory_resource{}, "CudaAsync");
+
+  return params;
+}
+
+class FixedSizeMRTest : public ::testing::TestWithParam<FixedSizeMRTestParam> {};
+
+TEST_P(FixedSizeMRTest, AllocateBlocksAsyncUpstreamCountedDeallocateDoesNotReturnToUpstream)
+{
+  auto const& param                           = GetParam();
+  constexpr std::size_t blocks_to_preallocate = 1;
+
+  // statistics_resource_adaptor is itself a shared_resource: copying it shares the same pool.
+  auto counting = stats_mr(param.upstream);
+
+  using fixed_size_mr = rmm::mr::fixed_size_memory_resource;
+
+  {
+    auto fixed_mr = fixed_size_mr(counting, param.block_size, blocks_to_preallocate);
+
+    rmm::cuda_stream_pool stream_pool{4};
+    std::vector<std::unique_ptr<rmm::mr::multiple_blocks_allocation>> handles;
+
+    std::size_t const alloc_size  = param.size;
+    constexpr int num_allocations = 4;
+
+    std::size_t const actual_block_size = fixed_mr.get_block_size();
+
+    std::size_t const expected_blocks = [&]() {
+      if (alloc_size == 0) { return std::size_t{0}; }
+      return cuda::ceil_div(alloc_size, actual_block_size);
+    }();
+
+    for (int i = 0; i < num_allocations; ++i) {
+      rmm::cuda_stream_view stream = stream_pool.get_stream();
+      auto const& handle           = handles.emplace_back(
+        rmm::mr::multiple_blocks_allocation::make_async(fixed_mr, alloc_size, stream));
+
+      EXPECT_EQ(handle->size(), alloc_size);
+      EXPECT_EQ(handle->capacity(), expected_blocks * actual_block_size);
+    }
+    auto const bytes_after_alloc = counting.get_bytes_counter().value;
+
+    if (expected_blocks > 0) {
+      EXPECT_GE(bytes_after_alloc, static_cast<std::int64_t>(expected_blocks * num_allocations *
+                                                              actual_block_size));
+    }
+    handles.clear();
+
+    EXPECT_EQ(counting.get_bytes_counter().value, bytes_after_alloc)
+      << "After deallocate, upstream bytes must be unchanged until fixed_size_mr is destroyed";
+  }
+
+  EXPECT_EQ(counting.get_bytes_counter().value, 0)
+    << "After fixed_size pool destruction, upstream should have released all memory";
+}
+
+INSTANTIATE_TEST_SUITE_P(FixedSizeMRTests,
+                         FixedSizeMRTest,
+                         ::testing::ValuesIn(make_fixed_size_mr_test_params()),
+                         [](testing::TestParamInfo<FixedSizeMRTestParam> const& info) {
+                           return info.param.name + "_bs" +
+                                  std::to_string(info.param.block_size) + "_sz" +
+                                  std::to_string(info.param.size);
+                         });
+
+}  // namespace
+}  // namespace rmm::test

From 0e68a53d30f9bf2ddfe0d78b7e3b97212d92ebd0 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 27 Apr 2026 17:22:39 -0700
Subject: [PATCH 2/6] addressing comments and extending the test suite

Signed-off-by: niranda perera <niranda.perera@gmail.com>
---
 .../rmm/mr/fixed_size_memory_resource.hpp     |   6 +-
 cpp/src/mr/fixed_size_memory_resource.cpp     |  10 +-
 cpp/tests/mr/fixed_size_mr_test.cpp           | 134 +++++++++++++-----
 3 files changed, 109 insertions(+), 41 deletions(-)

diff --git a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
index 140c1fd0e..e251b8181 100644
--- a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
+++ b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
@@ -100,7 +100,8 @@ static_assert(cuda::mr::resource_with<fixed_size_memory_resource, cuda::mr::devi
 class RMM_EXPORT multiple_blocks_allocation {
  public:
   /**
-   * @brief Allocate device memory spanning one or more fixed-size blocks, stream-ordered.
+   * @brief Allocate device memory spanning one or more fixed-size blocks, stream-ordered on a
+   * non-PTDS stream.
    *
    * Use this for allocations larger than a single block. The allocation is ordered on
    * `stream`; deallocation (when the returned handle is destroyed) is also ordered on
@@ -111,9 +112,10 @@ class RMM_EXPORT multiple_blocks_allocation {
    *        `fixed_size_memory_resource` has refcounted shared ownership.
    * @param size Minimum number of bytes to allocate. Will be rounded up to a multiple of
    *        block size (see `get_block_size()` on `*mr`).
-   * @param stream CUDA stream on which the allocation is ordered.
+   * @param stream A non-PTDS CUDA stream on which the allocation is ordered.
    * @return Unique handle to the allocation; destroys to deallocate. Empty (zero-size)
    *         allocation returns a valid handle with size 0 and no blocks.
+   * @throw rmm::invalid_argument if `stream` is a per-thread default stream.
    * @throw Any exception from allocating blocks. Blocks successfully taken from the pool
    *        before the failure are returned to the pool on `stream` (same ordering as normal
    *        deallocation).
diff --git a/cpp/src/mr/fixed_size_memory_resource.cpp b/cpp/src/mr/fixed_size_memory_resource.cpp
index da0cf40d7..348bd31b0 100644
--- a/cpp/src/mr/fixed_size_memory_resource.cpp
+++ b/cpp/src/mr/fixed_size_memory_resource.cpp
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <rmm/detail/error.hpp>
 #include <rmm/detail/logging_assert.hpp>
 #include <rmm/mr/fixed_size_memory_resource.hpp>
 
@@ -58,6 +59,10 @@ multiple_blocks_allocation::~multiple_blocks_allocation()
 std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_async(
   fixed_size_memory_resource mr, std::size_t size, cuda_stream_view stream)
 {
+  RMM_EXPECTS(!stream.is_per_thread_default(),
+              "stream must not be a per-thread default stream",
+              rmm::invalid_argument);
+
   if (size == 0) {
     return std::unique_ptr<multiple_blocks_allocation>(
       new multiple_blocks_allocation(0, {}, stream, std::move(mr)));
@@ -75,13 +80,14 @@ std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_asy
       blocks.push_back(
         static_cast<std::byte*>(self.get_block(self.get_block_size(), stream_event).pointer()));
     }
+    return std::unique_ptr<multiple_blocks_allocation>(
+      new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
   } catch (...) {
     self.deallocate_blocks_async_unsafe(std::move(blocks), stream);
     throw;
   }
 
-  return std::unique_ptr<multiple_blocks_allocation>(
-    new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
+  return nullptr;  // unreachable
 }
 
 }  // namespace mr
diff --git a/cpp/tests/mr/fixed_size_mr_test.cpp b/cpp/tests/mr/fixed_size_mr_test.cpp
index 420d3c814..c77d46ade 100644
--- a/cpp/tests/mr/fixed_size_mr_test.cpp
+++ b/cpp/tests/mr/fixed_size_mr_test.cpp
@@ -15,23 +15,26 @@
 
 #include <cuda/cmath>
 #include <cuda/memory_resource>
+#include <cuda/std/algorithm>
 
 #include <gtest/gtest.h>
 
 #include <cstddef>
+#include <future>
+#include <mutex>
 #include <string>
 #include <vector>
 
 namespace rmm::test {
 namespace {
 
-using stats_mr = rmm::mr::statistics_resource_adaptor;
-
 struct FixedSizeMRTestParam {
   cuda::mr::any_resource<cuda::mr::device_accessible> upstream;
-  std::string name;
-  std::size_t block_size{0};
-  std::size_t size{0};
+  std::string name;           ///< Name of the memory resource.
+  std::size_t block_size{0};  ///< Block size in bytes.
+  std::size_t size{0};        ///< Allocation size in bytes.
+  std::size_t n_threads{1};   ///< Number of threads to use for the test.
+  std::size_t n_streams{1};   ///< Number of streams to use for the test.
 };
 
 std::vector<FixedSizeMRTestParam> make_fixed_size_mr_test_params()
@@ -42,7 +45,11 @@ std::vector<FixedSizeMRTestParam> make_fixed_size_mr_test_params()
                         std::string const& name) {
     for (std::size_t block_sz : {std::size_t{256}, std::size_t{1_KiB}}) {
       for (std::size_t size : {std::size_t{0}, block_sz / 2, block_sz, std::size_t{3_KiB}}) {
-        params.emplace_back(upstream, name, block_sz, size);
+        for (std::size_t n_threads : std::vector<std::size_t>{1, 2, 4}) {
+          for (std::size_t n_streams : std::vector<std::size_t>{1, 2, 4}) {
+            params.emplace_back(upstream, name, block_sz, size, n_threads, n_streams);
+          }
+        }
       }
     }
   };
@@ -53,9 +60,23 @@ std::vector<FixedSizeMRTestParam> make_fixed_size_mr_test_params()
   return params;
 }
 
-class FixedSizeMRTest : public ::testing::TestWithParam<FixedSizeMRTestParam> {};
+}  // namespace
+
+class FixedSizeMRTest : public ::testing::TestWithParam<FixedSizeMRTestParam> {
+ protected:
+  using stats_mr          = rmm::mr::statistics_resource_adaptor;
+  using fixed_size_mr     = rmm::mr::fixed_size_memory_resource;
+  using multi_block_alloc = rmm::mr::multiple_blocks_allocation;
 
-TEST_P(FixedSizeMRTest, AllocateBlocksAsyncUpstreamCountedDeallocateDoesNotReturnToUpstream)
+  std::size_t expected_blocks()
+  {
+    auto const& param = GetParam();
+    if (param.size == 0) { return std::size_t{0}; }
+    return cuda::ceil_div(param.size, param.block_size);
+  }
+};
+
+TEST_P(FixedSizeMRTest, AllocAndDeallocBlocksAsync)
 {
   auto const& param                           = GetParam();
   constexpr std::size_t blocks_to_preallocate = 1;
@@ -63,42 +84,82 @@ TEST_P(FixedSizeMRTest, AllocateBlocksAsyncUpstreamCountedDeallocateDoesNotRetur
   // statistics_resource_adaptor is itself a shared_resource: copying it shares the same pool.
   auto counting = stats_mr(param.upstream);
 
-  using fixed_size_mr = rmm::mr::fixed_size_memory_resource;
-
   {
     auto fixed_mr = fixed_size_mr(counting, param.block_size, blocks_to_preallocate);
 
-    rmm::cuda_stream_pool stream_pool{4};
-    std::vector<std::unique_ptr<rmm::mr::multiple_blocks_allocation>> handles;
-
-    std::size_t const alloc_size  = param.size;
-    constexpr int num_allocations = 4;
-
-    std::size_t const actual_block_size = fixed_mr.get_block_size();
+    rmm::cuda_stream_pool stream_pool{param.n_streams, cuda_stream::flags::non_blocking};
+
+    std::size_t const alloc_size = param.size;
+    std::size_t const n_threads  = param.n_threads;
+    std::size_t const block_size = fixed_mr.get_block_size();
+    EXPECT_EQ(block_size, param.block_size);
+
+    constexpr int num_allocations = 16;
+    std::mutex handles_mutex;
+    std::vector<std::unique_ptr<multi_block_alloc>> handles;
+
+    std::vector<std::future<void>> alloc_futs;
+    alloc_futs.reserve(param.n_threads);
+
+    // each thread allocates num_allocations allocations
+    for (std::size_t i = 0; i < n_threads; ++i) {
+      alloc_futs.emplace_back(std::async(std::launch::async, [&] {
+        for (int i = 0; i < num_allocations; ++i) {
+          rmm::cuda_stream_view stream = stream_pool.get_stream();
+          auto handle = multi_block_alloc::make_async(fixed_mr, alloc_size, stream);
+          EXPECT_EQ(handle->size(), alloc_size);
+          EXPECT_EQ(handle->capacity(), expected_blocks() * block_size);
+
+          // enqueue a dummy cudamemsetasync
+          int dummy = 0;
+          cuda::std::ranges::for_each(handle->get_blocks(), [&](auto& block) {
+            RMM_CUDA_TRY(cudaMemsetAsync(block, (dummy++) & 0xFF, block_size, stream.value()));
+          });
+
+          {
+            std::lock_guard lock(handles_mutex);
+            handles.emplace_back(std::move(handle));
+          }
+        }
+      }));
+    }
 
-    std::size_t const expected_blocks = [&]() {
-      if (alloc_size == 0) { return std::size_t{0}; }
-      return cuda::ceil_div(alloc_size, actual_block_size);
-    }();
+    // wait for all allocations to complete
+    cuda::std::ranges::for_each(alloc_futs, [](auto& fut) { fut.get(); });
 
-    for (int i = 0; i < num_allocations; ++i) {
-      rmm::cuda_stream_view stream = stream_pool.get_stream();
-      auto const& handle           = handles.emplace_back(
-        rmm::mr::multiple_blocks_allocation::make_async(fixed_mr, alloc_size, stream));
+    // Note that stream pool is not sync'ed. The counter & driver should be able to account for the
+    // allocations without sync.
+    auto const bytes_after_alloc = counting.get_bytes_counter().value;
 
-      EXPECT_EQ(handle->size(), alloc_size);
-      EXPECT_EQ(handle->capacity(), expected_blocks * actual_block_size);
+    if (expected_blocks() > 0) {
+      EXPECT_GE(
+        bytes_after_alloc,
+        static_cast<std::int64_t>(expected_blocks() * num_allocations * block_size * n_threads));
     }
-    auto const bytes_after_alloc = counting.get_bytes_counter().value;
 
-    if (expected_blocks > 0) {
-      EXPECT_GE(bytes_after_alloc, static_cast<std::int64_t>(expected_blocks * num_allocations *
-                                                              actual_block_size));
+    // deallocate using multiple threads
+    std::vector<std::future<void>> dealloc_futs;
+    dealloc_futs.reserve(param.n_threads);
+    for (std::size_t i = 0; i < n_threads; ++i) {
+      dealloc_futs.emplace_back(std::async(std::launch::async, [&] {
+        while (true) {
+          std::lock_guard lock(handles_mutex);
+          if (handles.empty()) { break; }
+          handles.pop_back();
+        }
+      }));
     }
-    handles.clear();
+
+    // wait for all deallocations to complete
+    cuda::std::ranges::for_each(dealloc_futs, [](auto& fut) { fut.get(); });
 
     EXPECT_EQ(counting.get_bytes_counter().value, bytes_after_alloc)
       << "After deallocate, upstream bytes must be unchanged until fixed_size_mr is destroyed";
+
+    // finally sync the stream pool
+    for (size_t i = 0; i < param.n_streams; ++i) {
+      stream_pool.get_stream(i).synchronize();
+    }
   }
 
   EXPECT_EQ(counting.get_bytes_counter().value, 0)
@@ -109,10 +170,9 @@ INSTANTIATE_TEST_SUITE_P(FixedSizeMRTests,
                          FixedSizeMRTest,
                          ::testing::ValuesIn(make_fixed_size_mr_test_params()),
                          [](testing::TestParamInfo<FixedSizeMRTestParam> const& info) {
-                           return info.param.name + "_bs" +
-                                  std::to_string(info.param.block_size) + "_sz" +
-                                  std::to_string(info.param.size);
+                           return info.param.name + "_bs" + std::to_string(info.param.block_size) +
+                                  "_sz" + std::to_string(info.param.size) + "_nt" +
+                                  std::to_string(info.param.n_threads) + "_ns" +
+                                  std::to_string(info.param.n_streams);
                          });
-
-}  // namespace
 }  // namespace rmm::test

From c754376c196e208d634c6203c0f5529033f8b972 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Tue, 28 Apr 2026 16:23:10 -0700
Subject: [PATCH 3/6] addressing comments

Signed-off-by: niranda perera <niranda.perera@gmail.com>
---
 .../fixed_size_memory_resource_impl.hpp       |  3 +-
 .../rmm/mr/fixed_size_memory_resource.hpp     | 59 ++++++++++++++-----
 .../fixed_size_memory_resource_impl.cpp       | 11 ++--
 cpp/src/mr/fixed_size_memory_resource.cpp     | 51 ++++++++++++----
 4 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp b/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
index 49580d182..8951af77a 100644
--- a/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
+++ b/cpp/include/rmm/mr/detail/fixed_size_memory_resource_impl.hpp
@@ -88,7 +88,8 @@ class fixed_size_memory_resource_impl final
   friend class RMM_NAMESPACE::mr::multiple_blocks_allocation;
 
   // Caller must hold get_mutex().
-  void deallocate_blocks_async_unsafe(std::vector<std::byte*>&& blocks, cuda_stream_view stream);
+  [[nodiscard]] cudaError_t deallocate_blocks_async_unsafe(std::vector<std::byte*>&& blocks,
+                                                           cuda_stream_view stream);
 
   free_list blocks_from_upstream(cuda_stream_view stream);
 
diff --git a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
index e251b8181..36b905e59 100644
--- a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
+++ b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
@@ -92,10 +92,10 @@ static_assert(cuda::mr::resource_with<fixed_size_memory_resource, cuda::mr::devi
  * @brief RAII handle for an allocation that may span multiple fixed-size blocks from a
  *        `fixed_size_memory_resource`.
  *
- * Returned by `allocate_blocks_async`. When destroyed, all blocks are returned to the
- * memory resource on the same stream used for allocation. Move and copy are disabled to
- * prevent double deallocation. Holds a `fixed_size_memory_resource` (which has shared,
- * refcounted ownership of the underlying pool) so the pool outlives the handle.
+ * When destroyed, all blocks are returned to the memory resource on the same stream used for
+ * allocation. Copy is disabled to prevent double deallocation; move transfers ownership of the
+ * blocks. Holds a `fixed_size_memory_resource` (which has shared, refcounted ownership of the
+ * underlying pool) so the pool outlives the handle.
  */
 class RMM_EXPORT multiple_blocks_allocation {
  public:
@@ -123,33 +123,52 @@ class RMM_EXPORT multiple_blocks_allocation {
   [[nodiscard]] static std::unique_ptr<multiple_blocks_allocation> make_async(
     fixed_size_memory_resource mr, std::size_t size, cuda_stream_view stream);
 
-  ~multiple_blocks_allocation();
+  /**
+   * @brief Destroy this handle and return any held blocks to the pool.
+   *
+   * `noexcept`. Uses `deallocate_blocks_async_unsafe` under the pool mutex; CUDA errors are
+   * logged with `RMM_LOG_ERROR` and other exceptions during teardown are caught and logged.
+   */
+  ~multiple_blocks_allocation() noexcept;
 
   multiple_blocks_allocation(multiple_blocks_allocation const&)            = delete;
   multiple_blocks_allocation& operator=(multiple_blocks_allocation const&) = delete;
-  multiple_blocks_allocation(multiple_blocks_allocation&&)                 = delete;
-  multiple_blocks_allocation& operator=(multiple_blocks_allocation&&)      = delete;
+
+  /**
+   * @brief Move-constructor
+   *
+   * @param other Source handle to move from.
+   */
+  multiple_blocks_allocation(multiple_blocks_allocation&& other) noexcept;
+
+  /**
+   * @brief Move-assignment
+   * @param other Source handle to move from.
+   * @return Reference to `*this`.
+   * @throw rmm::cuda_error if returning the current blocks to the pool fails during `clear()`.
+   */
+  multiple_blocks_allocation& operator=(multiple_blocks_allocation&& other);
 
   /**
    * @brief Number of bytes requested for this allocation.
    *
    * @return Requested size in bytes.
    */
-  constexpr std::size_t size() const noexcept { return size_; }
+  [[nodiscard]] constexpr std::size_t size() const noexcept { return size_; }
 
   /**
    * @brief Total capacity in bytes (number of blocks × block size).
    *
    * @return Capacity in bytes; always >= size().
    */
-  std::size_t capacity() const noexcept { return block_size() * blocks_.size(); }
+  [[nodiscard]] std::size_t capacity() const noexcept { return block_size() * blocks_.size(); }
 
   /**
    * @brief Size in bytes of each block in this allocation.
    *
    * @return Block size (same as the memory resource's get_block_size()).
    */
-  std::size_t block_size() const noexcept { return mr_->get_block_size(); }
+  [[nodiscard]] std::size_t block_size() const noexcept { return mr_->get_block_size(); }
 
   /**
    * @brief Non-owning view of the underlying block pointers.
@@ -167,7 +186,7 @@ class RMM_EXPORT multiple_blocks_allocation {
    * @param i Block index in [0, get_blocks().size()).
    * @return Span of std::byte over the i-th block.
    */
-  cuda::std::span<std::byte> operator[](std::size_t i) const
+  [[nodiscard]] cuda::std::span<std::byte> operator[](std::size_t i) const
   {
     return {blocks_[i], mr_->get_block_size()};
   }
@@ -179,7 +198,7 @@ class RMM_EXPORT multiple_blocks_allocation {
    * @return Span of std::byte over the i-th block.
    * @throws std::out_of_range if i >= number of blocks.
    */
-  cuda::std::span<std::byte> at(std::size_t i) const
+  [[nodiscard]] cuda::std::span<std::byte> at(std::size_t i) const
   {
     return {blocks_.at(i), mr_->get_block_size()};
   }
@@ -189,16 +208,26 @@ class RMM_EXPORT multiple_blocks_allocation {
    *
    * @return The stream passed to make_async.
    */
-  constexpr cuda_stream_view stream() const noexcept { return stream_; }
+  [[nodiscard]] constexpr cuda_stream_view stream() const noexcept { return stream_; }
+
+  /**
+   * @brief Return all blocks to the pool on `stream()`, then leave this handle empty.
+   *
+   * Same ordering as destruction: stream-ordered deallocation on the stream passed to
+   * `make_async`. After `clear()`, `size()` is 0 and `get_blocks()` is empty.
+   *
+   * @throw rmm::cuda_error if the event recording fails.
+   */
+  void clear();
 
  private:
   multiple_blocks_allocation(std::size_t size,
                              std::vector<std::byte*> buffers,
                              cuda_stream_view stream,
-                             fixed_size_memory_resource mr);
+                             fixed_size_memory_resource mr) noexcept;
 
   std::vector<std::byte*> blocks_;
-  std::size_t const size_;
+  std::size_t size_;
   cuda_stream_view stream_;
   fixed_size_memory_resource mr_;
 };
diff --git a/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp b/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
index 27ba9ee62..ac9b74f63 100644
--- a/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
+++ b/cpp/src/mr/detail/fixed_size_memory_resource_impl.cpp
@@ -5,7 +5,6 @@
 
 #include <rmm/aligned.hpp>
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/detail/error.hpp>
 #include <rmm/detail/logging_assert.hpp>
 #include <rmm/mr/detail/fixed_size_memory_resource_impl.hpp>
 #include <rmm/mr/detail/stream_ordered_memory_resource.hpp>
@@ -111,19 +110,21 @@ std::pair<std::size_t, std::size_t> fixed_size_memory_resource_impl::free_list_s
                            : std::make_pair(block_size_, blocks.size() * block_size_);
 }
 
-void fixed_size_memory_resource_impl::deallocate_blocks_async_unsafe(
+cudaError_t fixed_size_memory_resource_impl::deallocate_blocks_async_unsafe(
   std::vector<std::byte*>&& blocks, cuda_stream_view stream)
 {
-  if (blocks.empty()) { return; }
+  if (blocks.empty()) { return cudaSuccess; }
 
   free_list blocks_free_list;
   cuda::std::ranges::for_each(blocks, [this, &blocks_free_list](std::byte* ptr) {
     blocks_free_list.insert(this->free_block(ptr, get_block_size()));
   });
 
-  auto stream_event = get_event(stream);
-  RMM_ASSERT_CUDA_SUCCESS(cudaEventRecord(stream_event.event, stream.value()));
+  auto stream_event       = get_event(stream);
+  cudaError_t const error = cudaEventRecord(stream_event.event, stream.value());
+  if (cudaSuccess != error) { return error; }
   this->insert_blocks(std::move(blocks_free_list), stream);
+  return cudaSuccess;
 }
 
 #ifdef RMM_DEBUG_PRINT
diff --git a/cpp/src/mr/fixed_size_memory_resource.cpp b/cpp/src/mr/fixed_size_memory_resource.cpp
index 348bd31b0..6b234a103 100644
--- a/cpp/src/mr/fixed_size_memory_resource.cpp
+++ b/cpp/src/mr/fixed_size_memory_resource.cpp
@@ -5,6 +5,7 @@
 
 #include <rmm/detail/error.hpp>
 #include <rmm/detail/logging_assert.hpp>
+#include <rmm/logger.hpp>
 #include <rmm/mr/fixed_size_memory_resource.hpp>
 
 #include <cuda/cmath>
@@ -40,19 +41,50 @@ std::size_t fixed_size_memory_resource::get_block_size() const noexcept
 multiple_blocks_allocation::multiple_blocks_allocation(std::size_t size,
                                                        std::vector<std::byte*> buffers,
                                                        cuda_stream_view stream,
-                                                       fixed_size_memory_resource mr)
+                                                       fixed_size_memory_resource mr) noexcept
   : blocks_(std::move(buffers)), size_(size), stream_(stream), mr_(std::move(mr))
 {
-  RMM_LOGGING_ASSERT(size_ <= mr_->get_block_size() * blocks_.size());
-  RMM_LOGGING_ASSERT(blocks_.empty() ||
-                     blocks_.size() == cuda::ceil_div(size_, mr_->get_block_size()));
 }
 
-multiple_blocks_allocation::~multiple_blocks_allocation()
+multiple_blocks_allocation::multiple_blocks_allocation(multiple_blocks_allocation&& other) noexcept
+  : blocks_(std::move(other.blocks_)),
+    size_(other.size_),
+    stream_(other.stream_),
+    mr_(std::move(other.mr_))
+{
+  other.size_ = 0;
+}
+
+void multiple_blocks_allocation::clear()
 {
   if (!blocks_.empty()) {
     std::lock_guard<std::mutex> lock(mr_->get_mutex());
-    mr_->deallocate_blocks_async_unsafe(std::move(blocks_), stream_);
+    RMM_CUDA_TRY(mr_->deallocate_blocks_async_unsafe(std::move(blocks_), stream_));
+  }
+  size_ = 0;
+}
+
+multiple_blocks_allocation& multiple_blocks_allocation::operator=(
+  multiple_blocks_allocation&& other)
+{
+  if (this != &other) {
+    clear();
+    blocks_     = std::move(other.blocks_);
+    size_       = other.size_;
+    stream_     = other.stream_;
+    mr_         = std::move(other.mr_);
+    other.size_ = 0;
+  }
+  return *this;
+}
+
+multiple_blocks_allocation::~multiple_blocks_allocation() noexcept
+{
+  try {
+    clear();
+  } catch (...) {
+    RMM_LOG_ERROR(
+      "multiple_blocks_allocation: exception while releasing device blocks in destructor");
   }
 }
 
@@ -80,14 +112,13 @@ std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_asy
       blocks.push_back(
         static_cast<std::byte*>(self.get_block(self.get_block_size(), stream_event).pointer()));
     }
-    return std::unique_ptr<multiple_blocks_allocation>(
-      new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
   } catch (...) {
-    self.deallocate_blocks_async_unsafe(std::move(blocks), stream);
+    RMM_CUDA_TRY(self.deallocate_blocks_async_unsafe(std::move(blocks), stream));
     throw;
   }
 
-  return nullptr;  // unreachable
+  return std::unique_ptr<multiple_blocks_allocation>(
+    new multiple_blocks_allocation(size, std::move(blocks), stream, std::move(mr)));
 }
 
 }  // namespace mr

From a5e3f5502782fa0228376fd15d88bfdbb6399690 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Tue, 28 Apr 2026 16:25:19 -0700
Subject: [PATCH 4/6] remove test ns

Signed-off-by: niranda perera <niranda.perera@gmail.com>
---
 cpp/tests/mr/fixed_size_mr_test.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/tests/mr/fixed_size_mr_test.cpp b/cpp/tests/mr/fixed_size_mr_test.cpp
index c77d46ade..eb04fe640 100644
--- a/cpp/tests/mr/fixed_size_mr_test.cpp
+++ b/cpp/tests/mr/fixed_size_mr_test.cpp
@@ -25,7 +25,6 @@
 #include <string>
 #include <vector>
 
-namespace rmm::test {
 namespace {
 
 struct FixedSizeMRTestParam {
@@ -175,4 +174,3 @@ INSTANTIATE_TEST_SUITE_P(FixedSizeMRTests,
                                   std::to_string(info.param.n_threads) + "_ns" +
                                   std::to_string(info.param.n_streams);
                          });
-}  // namespace rmm::test

From 4333c99ae8f7516964e2858c396acb927961f04a Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Tue, 28 Apr 2026 16:44:14 -0700
Subject: [PATCH 5/6] addressing comments

Signed-off-by: niranda perera <niranda.perera@gmail.com>
---
 cpp/include/rmm/mr/fixed_size_memory_resource.hpp | 10 +++++-----
 cpp/src/mr/fixed_size_memory_resource.cpp         |  7 ++++---
 cpp/tests/mr/fixed_size_mr_test.cpp               |  4 +++-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
index 36b905e59..459f8f4b2 100644
--- a/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
+++ b/cpp/include/rmm/mr/fixed_size_memory_resource.hpp
@@ -4,13 +4,13 @@
  */
 #pragma once
 
-#include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/export.hpp>
 #include <rmm/mr/detail/fixed_size_memory_resource_impl.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/memory_resource>
 #include <cuda/std/span>
+#include <cuda/stream_ref>
 
 #include <cstddef>
 #include <memory>
@@ -121,7 +121,7 @@ class RMM_EXPORT multiple_blocks_allocation {
    *        deallocation).
    */
   [[nodiscard]] static std::unique_ptr<multiple_blocks_allocation> make_async(
-    fixed_size_memory_resource mr, std::size_t size, cuda_stream_view stream);
+    fixed_size_memory_resource mr, std::size_t size, cuda::stream_ref stream);
 
   /**
    * @brief Destroy this handle and return any held blocks to the pool.
@@ -208,7 +208,7 @@ class RMM_EXPORT multiple_blocks_allocation {
    *
    * @return The stream passed to make_async.
    */
-  [[nodiscard]] constexpr cuda_stream_view stream() const noexcept { return stream_; }
+  [[nodiscard]] constexpr cuda::stream_ref stream() const noexcept { return stream_; }
 
   /**
    * @brief Return all blocks to the pool on `stream()`, then leave this handle empty.
@@ -223,12 +223,12 @@ class RMM_EXPORT multiple_blocks_allocation {
  private:
   multiple_blocks_allocation(std::size_t size,
                              std::vector<std::byte*> buffers,
-                             cuda_stream_view stream,
+                             cuda::stream_ref stream,
                              fixed_size_memory_resource mr) noexcept;
 
   std::vector<std::byte*> blocks_;
   std::size_t size_;
-  cuda_stream_view stream_;
+  cuda::stream_ref stream_;
   fixed_size_memory_resource mr_;
 };
 
diff --git a/cpp/src/mr/fixed_size_memory_resource.cpp b/cpp/src/mr/fixed_size_memory_resource.cpp
index 6b234a103..b97c23436 100644
--- a/cpp/src/mr/fixed_size_memory_resource.cpp
+++ b/cpp/src/mr/fixed_size_memory_resource.cpp
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/detail/logging_assert.hpp>
 #include <rmm/logger.hpp>
@@ -40,7 +41,7 @@ std::size_t fixed_size_memory_resource::get_block_size() const noexcept
 
 multiple_blocks_allocation::multiple_blocks_allocation(std::size_t size,
                                                        std::vector<std::byte*> buffers,
-                                                       cuda_stream_view stream,
+                                                       cuda::stream_ref stream,
                                                        fixed_size_memory_resource mr) noexcept
   : blocks_(std::move(buffers)), size_(size), stream_(stream), mr_(std::move(mr))
 {
@@ -89,9 +90,9 @@ multiple_blocks_allocation::~multiple_blocks_allocation() noexcept
 }
 
 std::unique_ptr<multiple_blocks_allocation> multiple_blocks_allocation::make_async(
-  fixed_size_memory_resource mr, std::size_t size, cuda_stream_view stream)
+  fixed_size_memory_resource mr, std::size_t size, cuda::stream_ref stream)
 {
-  RMM_EXPECTS(!stream.is_per_thread_default(),
+  RMM_EXPECTS(!cuda_stream_view{stream}.is_per_thread_default(),
               "stream must not be a per-thread default stream",
               rmm::invalid_argument);
 
diff --git a/cpp/tests/mr/fixed_size_mr_test.cpp b/cpp/tests/mr/fixed_size_mr_test.cpp
index eb04fe640..8b47061b1 100644
--- a/cpp/tests/mr/fixed_size_mr_test.cpp
+++ b/cpp/tests/mr/fixed_size_mr_test.cpp
@@ -25,6 +25,8 @@
 #include <string>
 #include <vector>
 
+using namespace rmm::test;
+
 namespace {
 
 struct FixedSizeMRTestParam {
@@ -86,7 +88,7 @@ TEST_P(FixedSizeMRTest, AllocAndDeallocBlocksAsync)
   {
     auto fixed_mr = fixed_size_mr(counting, param.block_size, blocks_to_preallocate);
 
-    rmm::cuda_stream_pool stream_pool{param.n_streams, cuda_stream::flags::non_blocking};
+    rmm::cuda_stream_pool stream_pool{param.n_streams, rmm::cuda_stream::flags::non_blocking};
 
     std::size_t const alloc_size = param.size;
     std::size_t const n_threads  = param.n_threads;

From 47472d6e5430ed4e4be3615dcaf3f3c488632acc Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 29 Apr 2026 10:17:49 -0700
Subject: [PATCH 6/6] coderabbit review

---
 cpp/tests/mr/fixed_size_mr_test.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/cpp/tests/mr/fixed_size_mr_test.cpp b/cpp/tests/mr/fixed_size_mr_test.cpp
index 8b47061b1..ae70187ef 100644
--- a/cpp/tests/mr/fixed_size_mr_test.cpp
+++ b/cpp/tests/mr/fixed_size_mr_test.cpp
@@ -144,9 +144,14 @@ TEST_P(FixedSizeMRTest, AllocAndDeallocBlocksAsync)
     for (std::size_t i = 0; i < n_threads; ++i) {
       dealloc_futs.emplace_back(std::async(std::launch::async, [&] {
         while (true) {
-          std::lock_guard lock(handles_mutex);
-          if (handles.empty()) { break; }
-          handles.pop_back();
+          std::unique_ptr<multi_block_alloc> handle;
+          {
+            std::lock_guard lock(handles_mutex);
+            if (handles.empty()) { break; }
+            handle = std::move(handles.back());
+            handles.pop_back();
+          }
+          handle.reset();
         }
       }));
     }