rapidsai · tfeher · Jan 21, 2026 · Feb 23, 2026 · Feb 23, 2026 · Mar 30, 2026
@@ -1010,6 +1010,7 @@ if(NOT BUILD_CPU_ONLY)
     ${iface_pq_inst_files}
     src/neighbors/detail/cagra/topk_for_cagra/topk.cu
     ${cuvs_cagra_search_cuda_inst_files}
+    src/neighbors/detail/cagra/cagra_helpers.cpp
     src/neighbors/dynamic_batching.cu
     src/neighbors/composite/index.cu
     $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/cagra.cpp>

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -42,36 +42,7 @@ auto parse_build_param(const nlohmann::json& conf) ->
   ::parse_build_param<T, IdxT>(conf, cagra_params);
   // If the users provides parameter M, we can use the CAGRA-HNSW heuristics to find optimal
   // parameters for the dataset and HNSW reference.
-  if (conf.contains("M")) {
-    // Postpone the parsing of the CAGRA build params until the dataset extents are known.
-    // We the default parameters depend on the dataset extents; and we still would like to be able
-    // to override them.
-    cagra_params.cagra_params = [conf, hnsw_params](raft::matrix_extent<int64_t> extents,
-                                                    cuvs::distance::DistanceType dist_type) {
-      auto ps = cuvs::neighbors::cagra::index_params::from_hnsw_params(
-        extents,
-        conf.at("M"),
-        hnsw_params.ef_construction,
-        cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT,
-        dist_type);
-      ps.metric = dist_type;
-      // Parse ACE parameters if provided
-      if (conf.contains("npartitions") || conf.contains("build_dir") ||
-          conf.contains("ef_construction") || conf.contains("use_disk")) {
-        auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params();
-        if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); }
-        if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); }
-        if (conf.contains("ef_construction")) {
-          ace_params.ef_construction = conf.at("ef_construction");
-        }
-        if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); }
-        ps.graph_build_params = ace_params;
-      }
-      // NB: above, we only provide the defaults. Below we parse the explicit parameters as usual.
-      ::parse_build_param<T, uint32_t>(conf, ps);
-      return ps;
-    };
-  }
+  if (conf.contains("M")) { hnsw_params.M = conf.at("M"); }
   return param;
 }
 

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -85,38 +85,9 @@ void cuvs_cagra_hnswlib<T, IdxT>::build(const T* dataset, size_t nrow)
   // when the data set is on host, we can pass it directly to HNSW
   bool dataset_is_on_host = raft::get_device_for_address(dataset) == -1;
 
-  // re-use the CAGRA wrapper to parse build params
-  auto bps = build_param_.cagra_build_params;
-  // Not very conveniently, the CAGRA wrapper resolves parameters after the dataset shape is known,
-  // so it takes a lambda to do it. Even though we know the shape, we want to use the wrapper as-is,
-  // so we just modify that lambda.
-  bps.cagra_params = [dataset_is_on_host, orig_cagra_params = bps.cagra_params](
-                       auto dataset_extents, auto metric) {
-    auto params                    = orig_cagra_params(dataset_extents, metric);
-    params.attach_dataset_on_build = !dataset_is_on_host;
-    return params;
-  };
-  cuvs_cagra<T, IdxT> cagra_wrapper{this->metric_, this->dim_, bps};
-
-  // build the CAGRA index
-  cagra_wrapper.build(dataset, nrow);
-  auto& cagra_index = *cagra_wrapper.get_index();
-
-  // pass the dataset directly to HNSW if it's on the host
-  std::optional<raft::host_matrix_view<const T, int64_t>> opt_dataset_view = std::nullopt;
-  if (dataset_is_on_host) {
-    opt_dataset_view.emplace(
-      raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_));
-  }
-
+  auto dataset_view = raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_);
   // convert the index to HNSW format
-  hnsw_index_ = cuvs::neighbors::hnsw::from_cagra(
-    handle_, build_param_.hnsw_index_params, cagra_index, opt_dataset_view);
-
-  // special treatment in save/serialize step
-  if (cagra_index.dataset_fd().has_value() && cagra_index.graph_fd().has_value()) {
-    cagra_ace_build_ = true;
-  }
+  hnsw_index_ = cuvs::neighbors::hnsw::build(handle_, build_param_.hnsw_index_params, dataset_view);
 }
 
 template <typename T, typename IdxT>

@@ -3241,6 +3241,35 @@ namespace neighbors {
 namespace cagra {
 namespace helpers {
 
+  /** Calculates the workspace for graph optimization
+ *
+ * @param[in] n_rows number of rows in the dataset (or number of points in the graph)
+ * @param[in] graph_degree degree of the output graph
+ * @param[in] intermediate_degree degree of the input graph for the optimization process
+ * @param[in] index_size size in bytes of the graph index type (e.g. sizeof(IdxT))
+ * @param[in] mst_optimize whether to use MST optimization to guarantee graph connectivity
+ * @return pair of [host_size, device_size] memory sizes in bytes
+ */
+std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
+                                                  size_t graph_degree,
+                                                  size_t intermediate_degree,
+                                                  size_t index_size,
+                                                  bool mst_optimize = false);
-  /** Calculates the workspace for graph optimization
- *
- * @param[in] n_rows number of rows in the dataset (or number of points in the grapt)
- * @param[in] graph_degree degree of the output graph
- * @param[in] intermediate_graph_degree degree of the input graph for the optimization process
- * @param[in] index_size
- * @return pair of [host_size, device_size] memory sizes in bytes
- */
-std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
-                                                  size_t graph_degree,
-                                                  size_t intermediate_degree,
-                                                  size_t index_size,
-                                                  bool mst_optimize = false);
+  /** Calculates the workspace for graph optimization
+ *
+ * @param[in] n_rows number of rows in the dataset (or number of points in the graph)
+ * @param[in] graph_degree degree of the output graph
+ * @param[in] intermediate_degree degree of the input graph for the optimization process
+ * @param[in] index_size size in bytes of the graph index type (e.g. sizeof(IdxT))
+ * @param[in] mst_optimize whether to use MST optimization to guarantee graph connectivity
+ * @return pair of [host_size, device_size] memory sizes in bytes
+ */
+std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
+                                                  size_t graph_degree,
+                                                  size_t intermediate_degree,
+                                                  size_t index_size,
+                                                  bool mst_optimize = false);
-  /** Calculates the workspace for graph optimization
- *
- * @param[in] n_rows number of rows in the dataset (or number of points in the grapt)
- * @param[in] graph_degree degree of the output graph
- * @param[in] intermediate_graph_degree degree of the input graph for the optimization process
- * @param[in] index_size
- * @return pair of [host_size, device_size] memory sizes in bytes
- */
-std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
-                                                  size_t graph_degree,
-                                                  size_t intermediate_degree,
-                                                  size_t index_size,
-                                                  bool mst_optimize = false);
+  /** Calculates the workspace for graph optimization
+ *
+ * @param[in] n_rows number of rows in the dataset (or number of points in the graph)
+ * @param[in] graph_degree degree of the output graph
+ * @param[in] intermediate_degree degree of the input graph for the optimization process
+ * @param[in] index_size size in bytes of the graph index type (e.g. sizeof(IdxT))
+ * @param[in] mst_optimize whether to use MST optimization to guarantee graph connectivity
+ * @return pair of [host_size, device_size] memory sizes in bytes
+ */
+std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
+                                                  size_t graph_degree,
+                                                  size_t intermediate_degree,
+                                                  size_t index_size,
+                                                  bool mst_optimize = false);
+
+/**
+ * Calculate memory usage of CAGRA build.
+ *
+ * @param[in] res raft resource
+ * @param[in] dataset shape of the dataset
+ * @param[in] dtype_size size of dataset datatype in bytes
+ * @param[in] cparams CAGRA index building parameters
+ *
+ * @return pair of [host_size, device_size] memory sizes in bytes
+ */
+std::pair<size_t, size_t> cagra_build_mem_usage(raft::resources const& res,
+                                                raft::matrix_extent<int64_t> dataset,
+                                                size_t dtype_size,
+                                                cuvs::neighbors::cagra::index_params cparams);
+
 /**
  * @brief Optimize a KNN graph into a CAGRA graph.
  *

@@ -3294,6 +3294,18 @@ void make_rotation_matrix(
   raft::device_matrix_view<float, uint32_t, raft::row_major> rotation_matrix,
   bool force_random_rotation);
 
+/** Calculate the size of the compressed dataset.
+ *
+ * @param[in] res raft resource
+ * @param[in] dataset shape of the dataset
+ * @param[in] params ivf-pq compression params
+ *
+ * @return compressed dataset size in bytes
+ */
+size_t compressed_dataset_size(raft::resources const& res,
+                               raft::matrix_extent<int64_t> dataset,
+                               cuvs::neighbors::ivf_pq::index_params params);
+
 /**
  * @brief Resize an IVF-PQ list with flat layout.
  *
@@ -3355,6 +3367,7 @@ void resize_list(raft::resources const& res,
                  const list_spec_interleaved<uint32_t, int64_t>& spec,
                  uint32_t new_used_size,
                  uint32_t old_used_size);
+
 /**
  * @}
  */

@@ -814,55 +814,6 @@ constexpr double usable_cpu_memory_fraction = 0.8;
 constexpr double usable_gpu_memory_fraction = 0.8;
 constexpr double imbalance_factor           = 3.0;
 
-// Calculate CAGRA optimize workspace memory requirements.
-// This is the working memory on top of the input/output memory usage.
-inline std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
-                                                         size_t graph_degree,
-                                                         size_t intermediate_degree,
-                                                         size_t index_size,
-                                                         bool mst_optimize = false)
-{
-  // MST optimization memory (host only)
-  size_t mst_host = n_rows * index_size;  // mst_graph_num_edges
-  if (mst_optimize) {
-    mst_host += n_rows * graph_degree * index_size;  // mst_graph allocated in optimize
-    mst_host += n_rows * graph_degree * index_size;  // mst_graph allocated in mst_optimize
-    mst_host += n_rows * index_size * 7;             // vectors with _max_edges suffix
-    mst_host += (graph_degree - 1) * (graph_degree - 1) * index_size;  // iB_candidates
-  }
-
-  // batchsize for both prune and combine stages
-  size_t batch_size = std::min(static_cast<size_t>(256 * 1024), n_rows);
-
-  // Prune stage memory
-  // We neglect 8 bytes (both on host and device) for stats
-  size_t prune_dev = batch_size * intermediate_degree * 1;  // detour count (uint8_t)
-  prune_dev += batch_size * sizeof(uint32_t);               // d_num_detour_edges
-  prune_dev += n_rows * intermediate_degree * index_size;   // d_input_graph
-  prune_dev += 2 * batch_size * graph_degree * index_size;  // d_output_graph(2*batch)
-
-  // Reverse graph stage memory
-  size_t rev_dev = n_rows * graph_degree * index_size;  // d_rev_graph
-  rev_dev += n_rows * sizeof(uint32_t);                 // d_rev_graph_count
-  rev_dev += n_rows * index_size;                       // d_dest_nodes
-
-  // Memory for merging graphs (host only optional)
-  size_t combine_host =
-    n_rows * sizeof(uint32_t) + graph_degree * sizeof(uint32_t);  // in_edge_count + hist
-
-  // additional memory for combine stage on device (3 batches)
-  size_t combine_dev = 2 * batch_size * graph_degree * index_size;  // d_output_graph(2*batch)
-  if (mst_optimize) {
-    combine_dev += 2 * batch_size * graph_degree * index_size;  // d_mst_graph(2*batch)
-    combine_dev += 2 * batch_size * sizeof(uint32_t);           // d_mst_graph_num_edges(2*batch)
-  }
-
-  size_t total_host = mst_host + combine_host;
-  size_t total_dev  = std::max(prune_dev, rev_dev + combine_dev);
-
-  return std::make_pair(total_host, total_dev);
-}
-
 // Check if disk mode should be used for ACE based on memory constraints
 template <typename T, typename IdxT>
 bool ace_check_use_disk_mode(bool use_disk,
@@ -994,7 +945,7 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
   // Compute optimize workspace requirements
   size_t sub_partition_size =
     static_cast<size_t>(imbalance_factor * 2 * (dataset_size / n_partitions));
-  auto [host_workspace_size, gpu_workspace_size] = optimize_workspace_size(
+  auto [host_workspace_size, gpu_workspace_size] = helpers::optimize_workspace_size(
     sub_partition_size, graph_degree, intermediate_degree, sizeof(IdxT), guarantee_connectivity);
 
   // Check host memory requirements
@@ -1081,11 +1032,12 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
 
     size_t new_sub_partition_size =
       static_cast<size_t>(imbalance_factor * 2 * (dataset_size / n_partitions));
-    auto [new_opt_host_ws, new_opt_dev_ws] = optimize_workspace_size(new_sub_partition_size,
-                                                                     graph_degree,
-                                                                     intermediate_degree,
-                                                                     sizeof(IdxT),
-                                                                     guarantee_connectivity);
+    auto [new_opt_host_ws, new_opt_dev_ws] =
+      helpers::optimize_workspace_size(new_sub_partition_size,
+                                       graph_degree,
+                                       intermediate_degree,
+                                       sizeof(IdxT),
+                                       guarantee_connectivity);
 
     RAFT_LOG_INFO(
       "ACE: Updated per-partition memory estimates: dataset %.2f GiB, graph %.2f GiB, "
@@ -1645,7 +1597,7 @@ void build_knn_graph(
     return std::string(model_name);
   }();
 
-  RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str());
+  RAFT_LOG_INFO("# Building IVF-PQ index %s", model_name.c_str());
   auto index = cuvs::neighbors::ivf_pq::build(res, pq.build_params, dataset);
 
   //
@@ -1704,7 +1656,7 @@ void build_knn_graph(
     use_large_workspace ? raft::resource::get_large_workspace_resource_ref(res)
                         : raft::resource::get_workspace_resource_ref(res);
 
-  RAFT_LOG_DEBUG(
+  RAFT_LOG_INFO(
     "IVF-PQ search node_degree: %d, top_k: %d,  gpu_top_k: %d,  max_batch_size:: %d, n_probes: %u",
     node_degree,
     top_k,
@@ -1729,6 +1681,7 @@ void build_knn_graph(
   std::size_t num_self_included = 0;
   bool first                    = true;
   const auto start_clock        = std::chrono::system_clock::now();
+  auto last_tick                = start_clock;
 
   auto vec_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
     res,
@@ -1841,14 +1794,17 @@ void build_knn_graph(
 
     size_t num_queries_done = batch.offset() + batch.size();
     const auto end_clock    = std::chrono::system_clock::now();
-    if (batch.offset() > next_report_offset) {
+    if (batch.offset() > next_report_offset &&
+        std::chrono::duration_cast<std::chrono::seconds>(end_clock - last_tick) >
+          std::chrono::seconds(10)) {
       next_report_offset += d_report_offset;
       const auto time =
         std::chrono::duration_cast<std::chrono::microseconds>(end_clock - start_clock).count() *
         1e-6;
       const auto throughput = num_queries_done / time;
+      last_tick             = end_clock;
 
-      RAFT_LOG_DEBUG(
+      RAFT_LOG_INFO(
         "# Search %12lu / %12lu (%3.2f %%), %e queries/sec, %.2f minutes ETA, self included = "
         "%3.2f %%    \r",
         num_queries_done,