Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,7 @@ if(NOT BUILD_CPU_ONLY)
${iface_pq_inst_files}
src/neighbors/detail/cagra/topk_for_cagra/topk.cu
${cuvs_cagra_search_cuda_inst_files}
src/neighbors/detail/cagra/cagra_helpers.cpp
src/neighbors/dynamic_batching.cu
src/neighbors/composite/index.cu
$<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/cagra.cpp>
Expand Down
33 changes: 2 additions & 31 deletions cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand Down Expand Up @@ -42,36 +42,7 @@ auto parse_build_param(const nlohmann::json& conf) ->
::parse_build_param<T, IdxT>(conf, cagra_params);
// If the user provides parameter M, we can use the CAGRA-HNSW heuristics to find optimal
// parameters for the dataset and HNSW reference.
if (conf.contains("M")) {
// Postpone the parsing of the CAGRA build params until the dataset extents are known.
// The default parameters depend on the dataset extents, and we would still like to be able
// to override them.
cagra_params.cagra_params = [conf, hnsw_params](raft::matrix_extent<int64_t> extents,
cuvs::distance::DistanceType dist_type) {
auto ps = cuvs::neighbors::cagra::index_params::from_hnsw_params(
extents,
conf.at("M"),
hnsw_params.ef_construction,
cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT,
dist_type);
ps.metric = dist_type;
// Parse ACE parameters if provided
if (conf.contains("npartitions") || conf.contains("build_dir") ||
conf.contains("ef_construction") || conf.contains("use_disk")) {
auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params();
if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); }
if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); }
if (conf.contains("ef_construction")) {
ace_params.ef_construction = conf.at("ef_construction");
}
if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); }
ps.graph_build_params = ace_params;
}
// NB: above, we only provide the defaults. Below we parse the explicit parameters as usual.
::parse_build_param<T, uint32_t>(conf, ps);
return ps;
};
}
if (conf.contains("M")) { hnsw_params.M = conf.at("M"); }
return param;
}

Expand Down
35 changes: 3 additions & 32 deletions cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
Expand Down Expand Up @@ -85,38 +85,9 @@ void cuvs_cagra_hnswlib<T, IdxT>::build(const T* dataset, size_t nrow)
// when the data set is on host, we can pass it directly to HNSW
bool dataset_is_on_host = raft::get_device_for_address(dataset) == -1;

// re-use the CAGRA wrapper to parse build params
auto bps = build_param_.cagra_build_params;
// Not very conveniently, the CAGRA wrapper resolves parameters after the dataset shape is known,
// so it takes a lambda to do it. Even though we know the shape, we want to use the wrapper as-is,
// so we just modify that lambda.
bps.cagra_params = [dataset_is_on_host, orig_cagra_params = bps.cagra_params](
auto dataset_extents, auto metric) {
auto params = orig_cagra_params(dataset_extents, metric);
params.attach_dataset_on_build = !dataset_is_on_host;
return params;
};
cuvs_cagra<T, IdxT> cagra_wrapper{this->metric_, this->dim_, bps};

// build the CAGRA index
cagra_wrapper.build(dataset, nrow);
auto& cagra_index = *cagra_wrapper.get_index();

// pass the dataset directly to HNSW if it's on the host
std::optional<raft::host_matrix_view<const T, int64_t>> opt_dataset_view = std::nullopt;
if (dataset_is_on_host) {
opt_dataset_view.emplace(
raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_));
}

auto dataset_view = raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the data expected to always reside in host memory?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ACE only supports host memory right now. The main reason is that we expect the data size to be large and memory-mapped. Further, we do the partitioning and reordering on the host since there is no benefit of moving it to the GPU only to write it to disk afterwards.

Anyways, I think we can support device datasets easily since these should not end up using ACE with this heuristic. @tfeher What do you think?

// convert the index to HNSW format
hnsw_index_ = cuvs::neighbors::hnsw::from_cagra(
handle_, build_param_.hnsw_index_params, cagra_index, opt_dataset_view);

// special treatment in save/serialize step
if (cagra_index.dataset_fd().has_value() && cagra_index.graph_fd().has_value()) {
cagra_ace_build_ = true;
}
hnsw_index_ = cuvs::neighbors::hnsw::build(handle_, build_param_.hnsw_index_params, dataset_view);
Comment on lines +88 to +90
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Preserve ACE/file-backed build state after direct hnsw::build().

Line 90 can now return an ACE-backed HNSW index, but this path never updates cagra_ace_build_. save() still keys off that flag, so memory-constrained builds will always take the serialize branch and skip the existing file-backed save flow.

🛠️ Proposed fix
   auto dataset_view = raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_);
   // convert the index to HNSW format
   hnsw_index_ = cuvs::neighbors::hnsw::build(handle_, build_param_.hnsw_index_params, dataset_view);
+  cagra_ace_build_ = !hnsw_index_->file_path().empty();
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h` around lines 88 - 90,
The direct call to hnsw::build(...) can return an ACE/file-backed index but the
wrapper never updates cagra_ace_build_, so save() will incorrectly choose the
serialize path; after assigning hnsw_index_ from
cuvs::neighbors::hnsw::build(handle_, build_param_.hnsw_index_params,
dataset_view) detect whether the returned index is ACE/file-backed (e.g., by
querying its ACE-backend flag/interface or a cast to the ACE-backed index type)
and set cagra_ace_build_ = true and capture any backing-file path/metadata from
build_param_/the index so the existing file-backed save flow is used; ensure
this update occurs in the same code path that assigns hnsw_index_ so state
remains consistent for save().

}

template <typename T, typename IdxT>
Expand Down
29 changes: 29 additions & 0 deletions cpp/include/cuvs/neighbors/cagra.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3241,6 +3241,35 @@ namespace neighbors {
namespace cagra {
namespace helpers {

/** Calculates the workspace for graph optimization
*
* @param[in] n_rows number of rows in the dataset (or number of points in the graph)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* @param[in] n_rows number of rows in the dataset (or number of points in the grapt)
* @param[in] n_rows number of rows in the dataset (or number of points in the graph)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also mst_optimize is not documented

* @param[in] graph_degree degree of the output graph
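* @param[in] intermediate_degree degree of the input graph for the optimization process
* @param[in] index_size size in bytes of one graph index element (e.g. sizeof(IdxT))
* @param[in] mst_optimize whether to include the extra workspace needed by MST optimization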
* @return pair of [host_size, device_size] memory sizes in bytes
*/
std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
size_t graph_degree,
size_t intermediate_degree,
size_t index_size,
bool mst_optimize = false);
Comment on lines +3244 to +3256
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Complete the parameter documentation and fix typo.

The documentation is incomplete and contains a typo:

  1. The mst_optimize parameter is not documented (as previously noted in review comments)
  2. Line 3246 has a typo: "grapt" should be "graph"
📝 Proposed documentation fix
 /** Calculates the workspace for graph optimization
  *
- * `@param`[in] n_rows number of rows in the dataset (or number of points in the grapt)
+ * `@param`[in] n_rows number of rows in the dataset (or number of points in the graph)
  * `@param`[in] graph_degree degree of the output graph
  * `@param`[in] intermediate_graph_degree degree of the input graph for the optimization process
  * `@param`[in] index_size
+ * `@param`[in] mst_optimize whether to use MST optimization to guarantee graph connectivity
  * `@return` pair of [host_size, device_size] memory sizes in bytes
  */

As per coding guidelines, public API functions must include complete Doxygen documentation describing parameters, return values, and any side effects.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
/** Calculates the workspace for graph optimization
*
* @param[in] n_rows number of rows in the dataset (or number of points in the grapt)
* @param[in] graph_degree degree of the output graph
* @param[in] intermediate_graph_degree degree of the input graph for the optimization process
* @param[in] index_size
* @return pair of [host_size, device_size] memory sizes in bytes
*/
std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
size_t graph_degree,
size_t intermediate_degree,
size_t index_size,
bool mst_optimize = false);
/** Calculates the workspace for graph optimization
*
* `@param`[in] n_rows number of rows in the dataset (or number of points in the graph)
* `@param`[in] graph_degree degree of the output graph
* `@param`[in] intermediate_graph_degree degree of the input graph for the optimization process
* `@param`[in] index_size
* `@param`[in] mst_optimize whether to use MST optimization to guarantee graph connectivity
* `@return` pair of [host_size, device_size] memory sizes in bytes
*/
std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
size_t graph_degree,
size_t intermediate_degree,
size_t index_size,
bool mst_optimize = false);
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/include/cuvs/neighbors/cagra.hpp` around lines 3244 - 3256, Update the
Doxygen for optimize_workspace_size to correct the typo "grapt" → "graph" and to
document the missing mst_optimize parameter: describe what true/false means for
MST-based optimization and any effects on required workspace sizes; ensure all
params (n_rows, graph_degree, intermediate_degree, index_size, mst_optimize)
have short descriptions and the `@return` clearly states the pair meaning
([host_size, device_size] in bytes) and any side effects or preconditions.


/**
* @brief Calculate memory usage of CAGRA build.
*
* Only the shape of the dataset is passed in (as extents), not the data itself.
*
* @param[in] res raft resource
* @param[in] dataset shape (extents) of the dataset
* @param[in] dtype_size size of one dataset element (the dataset datatype) in bytes
* @param[in] cparams CAGRA index building parameters
*
* @return pair of [host_size, device_size] memory sizes in bytes
*/
std::pair<size_t, size_t> cagra_build_mem_usage(raft::resources const& res,
raft::matrix_extent<int64_t> dataset,
size_t dtype_size,
cuvs::neighbors::cagra::index_params cparams);

/**
* @brief Optimize a KNN graph into a CAGRA graph.
*
Expand Down
13 changes: 13 additions & 0 deletions cpp/include/cuvs/neighbors/ivf_pq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3294,6 +3294,18 @@ void make_rotation_matrix(
raft::device_matrix_view<float, uint32_t, raft::row_major> rotation_matrix,
bool force_random_rotation);

/** Calculate the size of the compressed dataset.
*
* @param[in] res raft resource
* @param[in] dataset shape of the dataset
* @param[in] params ivf-pq compression params
*
* @return compressed dataset size in bytes
*/
size_t compressed_dataset_size(raft::resources const& res,
raft::matrix_extent<int64_t> dataset,
cuvs::neighbors::ivf_pq::index_params params);

/**
* @brief Resize an IVF-PQ list with flat layout.
*
Expand Down Expand Up @@ -3355,6 +3367,7 @@ void resize_list(raft::resources const& res,
const list_spec_interleaved<uint32_t, int64_t>& spec,
uint32_t new_used_size,
uint32_t old_used_size);

/**
* @}
*/
Expand Down
74 changes: 15 additions & 59 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -814,55 +814,6 @@ constexpr double usable_cpu_memory_fraction = 0.8;
constexpr double usable_gpu_memory_fraction = 0.8;
constexpr double imbalance_factor = 3.0;

// Estimate the scratch memory required by the CAGRA graph optimization step.
// The result is the working memory on top of the input/output memory usage and is
// returned as {host bytes, device bytes}.
inline std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
                                                         size_t graph_degree,
                                                         size_t intermediate_degree,
                                                         size_t index_size,
                                                         bool mst_optimize = false)
{
  // Prune and combine stages both process the graph in batches of at most this many rows.
  constexpr size_t kMaxBatchRows = 256 * 1024;
  const size_t rows_per_batch    = std::min(kMaxBatchRows, n_rows);

  // Recurring buffer sizes.
  const size_t per_row_index_bytes = n_rows * index_size;                 // one index per row
  const size_t full_graph_bytes    = n_rows * graph_degree * index_size;  // whole output graph
  const size_t two_batch_graph_bytes =
    2 * rows_per_batch * graph_degree * index_size;  // double-buffered batch of output rows

  // ---- Host side ----
  // MST optimization: mst_graph_num_edges is always counted.
  size_t host_mst = per_row_index_bytes;
  if (mst_optimize) {
    host_mst += 2 * full_graph_bytes;     // mst_graph in optimize + mst_graph in mst_optimize
    host_mst += 7 * per_row_index_bytes;  // vectors with _max_edges suffix
    host_mst += (graph_degree - 1) * (graph_degree - 1) * index_size;  // iB_candidates
  }
  // Graph merging: in_edge_count + hist.
  const size_t host_combine = (n_rows + graph_degree) * sizeof(uint32_t);

  // ---- Device side ----
  // Prune stage (8 bytes of stats are neglected on both host and device).
  const size_t dev_prune = rows_per_batch * intermediate_degree       // detour count (uint8_t)
                           + rows_per_batch * sizeof(uint32_t)         // d_num_detour_edges
                           + n_rows * intermediate_degree * index_size // d_input_graph
                           + two_batch_graph_bytes;                    // d_output_graph(2*batch)

  // Reverse graph stage.
  const size_t dev_reverse = full_graph_bytes             // d_rev_graph
                             + n_rows * sizeof(uint32_t)  // d_rev_graph_count
                             + per_row_index_bytes;       // d_dest_nodes

  // Combine stage, which runs on top of the reverse-graph buffers.
  size_t dev_combine = two_batch_graph_bytes;  // d_output_graph(2*batch)
  if (mst_optimize) {
    dev_combine += two_batch_graph_bytes;                  // d_mst_graph(2*batch)
    dev_combine += 2 * rows_per_batch * sizeof(uint32_t);  // d_mst_graph_num_edges(2*batch)
  }

  // Prune and reverse+combine do not overlap, so the device peak is the larger of the two.
  return {host_mst + host_combine, std::max(dev_prune, dev_reverse + dev_combine)};
}

// Check if disk mode should be used for ACE based on memory constraints
template <typename T, typename IdxT>
bool ace_check_use_disk_mode(bool use_disk,
Expand Down Expand Up @@ -994,7 +945,7 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
// Compute optimize workspace requirements
size_t sub_partition_size =
static_cast<size_t>(imbalance_factor * 2 * (dataset_size / n_partitions));
auto [host_workspace_size, gpu_workspace_size] = optimize_workspace_size(
auto [host_workspace_size, gpu_workspace_size] = helpers::optimize_workspace_size(
sub_partition_size, graph_degree, intermediate_degree, sizeof(IdxT), guarantee_connectivity);

// Check host memory requirements
Expand Down Expand Up @@ -1081,11 +1032,12 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,

size_t new_sub_partition_size =
static_cast<size_t>(imbalance_factor * 2 * (dataset_size / n_partitions));
auto [new_opt_host_ws, new_opt_dev_ws] = optimize_workspace_size(new_sub_partition_size,
graph_degree,
intermediate_degree,
sizeof(IdxT),
guarantee_connectivity);
auto [new_opt_host_ws, new_opt_dev_ws] =
helpers::optimize_workspace_size(new_sub_partition_size,
graph_degree,
intermediate_degree,
sizeof(IdxT),
guarantee_connectivity);

RAFT_LOG_INFO(
"ACE: Updated per-partition memory estimates: dataset %.2f GiB, graph %.2f GiB, "
Expand Down Expand Up @@ -1645,7 +1597,7 @@ void build_knn_graph(
return std::string(model_name);
}();

RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str());
RAFT_LOG_INFO("# Building IVF-PQ index %s", model_name.c_str());
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were this and the following logging changes intentional? Logging every 10 seconds might write a lot of output on a large run.

auto index = cuvs::neighbors::ivf_pq::build(res, pq.build_params, dataset);

//
Expand Down Expand Up @@ -1704,7 +1656,7 @@ void build_knn_graph(
use_large_workspace ? raft::resource::get_large_workspace_resource_ref(res)
: raft::resource::get_workspace_resource_ref(res);

RAFT_LOG_DEBUG(
RAFT_LOG_INFO(
"IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u",
node_degree,
top_k,
Expand All @@ -1729,6 +1681,7 @@ void build_knn_graph(
std::size_t num_self_included = 0;
bool first = true;
const auto start_clock = std::chrono::system_clock::now();
auto last_tick = start_clock;

auto vec_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
res,
Expand Down Expand Up @@ -1841,14 +1794,17 @@ void build_knn_graph(

size_t num_queries_done = batch.offset() + batch.size();
const auto end_clock = std::chrono::system_clock::now();
if (batch.offset() > next_report_offset) {
if (batch.offset() > next_report_offset &&
std::chrono::duration_cast<std::chrono::seconds>(end_clock - last_tick) >
std::chrono::seconds(10)) {
next_report_offset += d_report_offset;
const auto time =
std::chrono::duration_cast<std::chrono::microseconds>(end_clock - start_clock).count() *
1e-6;
const auto throughput = num_queries_done / time;
last_tick = end_clock;

RAFT_LOG_DEBUG(
RAFT_LOG_INFO(
"# Search %12lu / %12lu (%3.2f %%), %e queries/sec, %.2f minutes ETA, self included = "
"%3.2f %% \r",
num_queries_done,
Expand Down
Loading
Loading