rapidsai · rapids-bot · May 13, 2026 · Feb 16, 2026 · Feb 18, 2026 · Feb 19, 2026
@@ -694,14 +694,15 @@ void kmeans_fit(
 
   rmm::device_uvector<char> batch_workspace(streaming_batch_size, stream);
 
-  cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> data_batches(
-    X.data_handle(), n_samples, n_features, streaming_batch_size, stream);
+  auto data_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
+    handle, X.data_handle(), n_samples, n_features, streaming_batch_size, stream);
   // Host-path weight batches: only materialized when weights are provided and
   // the data resides on host
-  std::optional<cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT>> weight_batches;
+  std::optional<cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>> weight_batches;
   if constexpr (!data_on_device) {
     if (weight_ptr != nullptr) {
-      weight_batches.emplace(weight_ptr, n_samples, 1, streaming_batch_size, stream);
+      weight_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
+        handle, weight_ptr, n_samples, IndexT{1}, streaming_batch_size, stream);
     } else {
       raft::matrix::fill(handle, batch_weights_buf.view(), DataT{1});
     }
@@ -833,7 +834,7 @@ void kmeans_fit(
         raft::make_device_matrix_view<DataT, IndexT>(new_centroids_ptr, n_clusters, n_features);
 
       data_batches.reset();
-      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT>;
+      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
       std::optional<wt_iter_t> wt_it;
       if (weight_batches.has_value()) {
         weight_batches->reset();
@@ -932,7 +933,7 @@ void kmeans_fit(
 
       iter_inertia = DataT{0};
       data_batches.reset();
-      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT>;
+      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
       std::optional<wt_iter_t> wt_it;
       if (weight_batches.has_value()) {
         weight_batches->reset();

@@ -84,10 +84,11 @@ void add_node_core(
   auto host_neighbor_indices =
     raft::make_host_matrix<IdxT, std::int64_t>(max_search_batch_size, base_degree);
 
-  cuvs::spatial::knn::detail::utils::batch_load_iterator<T> additional_dataset_batch(
+  auto additional_dataset_batch = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<T>(
+    handle,
     additional_dataset_view.data_handle(),
-    num_add,
-    additional_dataset_view.stride(0),
+    static_cast<std::int64_t>(num_add),
+    static_cast<std::int64_t>(additional_dataset_view.stride(0)),
     max_search_batch_size,
     raft::resource::get_cuda_stream(handle),
     mr);

@@ -831,29 +831,34 @@ inline std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
     mst_host += (graph_degree - 1) * (graph_degree - 1) * index_size;  // iB_candidates
   }
 
+  // Batch size used by both the prune and combine stages
+  size_t batch_size = std::min(static_cast<size_t>(256 * 1024), n_rows);
+
   // Prune stage memory
   // We neglect 8 bytes (both on host and device) for stats
-  size_t prune_host = n_rows * intermediate_degree * sizeof(uint8_t);  // detour count
-
-  size_t prune_dev = n_rows * intermediate_degree * 1;     // detour count (uint8_t)
-  prune_dev += n_rows * sizeof(uint32_t);                  // d_num_detour_edges
-  prune_dev += n_rows * intermediate_degree * index_size;  // d_input_graph
+  size_t prune_dev = batch_size * intermediate_degree * 1;  // detour count (uint8_t)
+  prune_dev += batch_size * sizeof(uint32_t);               // d_num_detour_edges
+  prune_dev += n_rows * intermediate_degree * index_size;   // d_input_graph
+  prune_dev += 2 * batch_size * graph_degree * index_size;  // d_output_graph(2*batch)
 
   // Reverse graph stage memory
-  size_t rev_host = n_rows * graph_degree * index_size;  // rev_graph
-  rev_host += n_rows * sizeof(uint32_t);                 // rev_graph_count
-  rev_host += n_rows * index_size;                       // dest_nodes
-
   size_t rev_dev = n_rows * graph_degree * index_size;  // d_rev_graph
   rev_dev += n_rows * sizeof(uint32_t);                 // d_rev_graph_count
-  rev_dev += n_rows * sizeof(uint32_t);                 // d_dest_nodes
+  rev_dev += n_rows * index_size;                       // d_dest_nodes
 
-  // Memory for merging graphs (host only)
+  // Memory for merging graphs (host only, optional)
   size_t combine_host =
     n_rows * sizeof(uint32_t) + graph_degree * sizeof(uint32_t);  // in_edge_count + hist
 
-  size_t total_host = mst_host + std::max({prune_host, rev_host, combine_host});
-  size_t total_dev  = std::max(prune_dev, rev_dev);
+  // additional memory for combine stage on device (3 batches)
+  size_t combine_dev = 2 * batch_size * graph_degree * index_size;  // d_output_graph(2*batch)
+  if (mst_optimize) {
+    combine_dev += 2 * batch_size * graph_degree * index_size;  // d_mst_graph(2*batch)
+    combine_dev += 2 * batch_size * sizeof(uint32_t);           // d_mst_graph_num_edges(2*batch)
+  }
+
+  size_t total_host = mst_host + combine_host;
+  size_t total_dev  = std::max(prune_dev, rev_dev + combine_dev);
 
   return std::make_pair(total_host, total_dev);
 }
@@ -1725,11 +1730,12 @@ void build_knn_graph(
   bool first                    = true;
   const auto start_clock        = std::chrono::system_clock::now();
 
-  cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(
+  auto vec_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
+    res,
     dataset.data_handle(),
-    dataset.extent(0),
-    dataset.extent(1),
-    static_cast<int64_t>(max_queries),
+    static_cast<int64_t>(dataset.extent(0)),
+    static_cast<int64_t>(dataset.extent(1)),
+    static_cast<size_t>(max_queries),
     raft::resource::get_cuda_stream(res),
     workspace_mr);
 
@@ -2110,10 +2116,11 @@ auto iterative_build_graph(
 
     // Search.
     // Since there are many queries, divide them into batches and search them.
-    cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
+    auto query_batch = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<T>(
+      res,
       dev_query_view.data_handle(),
-      curr_query_size,
-      dev_query_view.extent(1),
+      static_cast<int64_t>(curr_query_size),
+      static_cast<int64_t>(dev_query_view.extent(1)),
       max_chunk_size,
       raft::resource::get_cuda_stream(res),
       raft::resource::get_workspace_resource_ref(res));