From c386f7602fc94c08e59cdbb74f749670ea09b2bb Mon Sep 17 00:00:00 2001 From: magdalendobson <58752279+magdalendobson@users.noreply.github.com> Date: Wed, 24 Sep 2025 16:03:25 -0400 Subject: [PATCH] Revert "Merge range search" --- algorithms/bench/Makefile | 3 + algorithms/utils/beamSearch.h | 316 ++++++++++------------- algorithms/utils/check_nn_recall.h | 14 +- algorithms/utils/check_range_recall.h | 152 +++-------- algorithms/utils/doublingSearch.h | 128 --------- algorithms/utils/earlyStopping.h | 30 --- algorithms/utils/euclidian_point.h | 14 +- algorithms/utils/filtered_hashset.h | 85 ------ algorithms/utils/graph.h | 1 - algorithms/utils/mips_point.h | 15 +- algorithms/utils/rangeSearch.h | 278 -------------------- algorithms/utils/stats.h | 10 +- algorithms/utils/types.h | 68 ++--- algorithms/vamana/Makefile | 2 +- algorithms/vamana/neighbors.h | 44 +++- algorithms/vamana/scripts/nytimes | 2 +- data_tools/compute_range_groundtruth.cpp | 39 ++- docs/README.md | 14 - docs/rangesearch.md | 31 --- parlaylib | 2 +- rangeSearch/bench/parallelDefsANN | 2 +- rangeSearch/bench/rangeTime.C | 188 ++++---------- rangeSearch/vamanaRange/Makefile | 2 +- rangeSearch/vamanaRange/range.h | 61 +---- 24 files changed, 341 insertions(+), 1160 deletions(-) delete mode 100644 algorithms/utils/doublingSearch.h delete mode 100644 algorithms/utils/earlyStopping.h delete mode 100644 algorithms/utils/filtered_hashset.h delete mode 100644 algorithms/utils/rangeSearch.h delete mode 100644 docs/rangesearch.md diff --git a/algorithms/bench/Makefile b/algorithms/bench/Makefile index 0b3d44ce..75e1034b 100644 --- a/algorithms/bench/Makefile +++ b/algorithms/bench/Makefile @@ -10,6 +10,9 @@ INCLUDE = -Icommon %.o : %.C $(COMMON) $(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@ +timeDistance : timeDistance.C $(COMMON) + $(CC) $(CFLAGS) $(INCLUDE) -o timeDistance timeDistance.C + # $(BNCHMRK)Check : $(CHECKFILES) # $(CC) $(LFLAGS) -o $@ $(CHECKFILES) diff --git a/algorithms/utils/beamSearch.h b/algorithms/utils/beamSearch.h index e2859434..182907f5 100644 --- a/algorithms/utils/beamSearch.h +++ b/algorithms/utils/beamSearch.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -16,147 +15,12 @@ #include "types.h" #include "graph.h" #include "stats.h" -#include "filtered_hashset.h" namespace parlayANN { - struct EarlyStopping { - template - bool operator () (const PointInfo& frontier, - const PointInfo& unvisited_frontier, - const PointInfo& visited, - const QueryParams& QP) { return false;} - }; - - // main beam search -template -std::pair>, - parlay::sequence>>, - size_t> -priority_first_search(const GT &G, - const Point p, const PointRange &Points, - const QPoint qp, const QPointRange &Q_Points, - const parlay::sequence starting_points, - const QueryParams &QP) { - auto is_match = [] (indexType i) { return (parlay::hash()(i) % 2) == 0;}; - - using dtype = typename Point::distanceType; - using id_dist = std::pair; - int beamSize = QP.beamSize; - int max_degree = QP.degree_limit; - - std::vector result; - hashset has_been_seen(2 * (10 + beamSize) * max_degree); - - long distance_comparisons = 0; - auto grt = [] (id_dist a, id_dist b) {return a.second > b.second;}; - std::priority_queue, decltype(grt)> pq(grt); - - for (auto v : starting_points) { - if (has_been_seen(v)) continue; - pq.push(std::pair(v, Points[v].distance(p))); - } - - long position = 0; - std::vector unseen; - while (pq.size() > 0 && result.size() < beamSize + 10) { - auto nxt = pq.top(); - pq.pop(); - if (is_match(nxt.first)) result.push_back(nxt); - unseen.clear(); - for (long i = 0; i < G[nxt.first].size(); i++) { - auto v = G[nxt.first][i]; - if (has_been_seen(v)) continue; - distance_comparisons++; - unseen.push_back(v); - Points[v].prefetch(); - } - for (auto v : unseen) { - auto d = Points[v].distance(p); - pq.push(std::pair(v, d)); - } - } - - auto less = [] (id_dist a, id_dist b) {return a.second < b.second;}; - std::sort(result.begin(), result.end(), less); - parlay::sequence r; - for (int i = 0; i < beamSize; i++) r.push_back(result[i]); - return std::pair(std::pair(std::move(r), parlay::to_sequence(result)), - distance_comparisons); -} - - // main beam search -template -std::pair>, - parlay::sequence>>, - size_t> -priority_first_search_(const GT &G, - const Point p, const PointRange &Points, - const QPoint qp, const QPointRange &Q_Points, - const parlay::sequence starting_points, - const QueryParams &QP, - bool use_filtering = false, - ES early_stop = ES{} - ) { - using dtype = typename Point::distanceType; - using id_dist = std::pair; - int beamSize = QP.beamSize; - int max_degree = QP.degree_limit; - - std::vector result; - hashset has_been_seen(2 * (10 + beamSize) * max_degree); - - long distance_comparisons = 0; - auto grt = [] (id_dist a, id_dist b) {return a.second > b.second;}; - auto less = [] (id_dist a, id_dist b) {return a.second < b.second;}; - std::priority_queue, decltype(grt)> pq1(grt); - std::priority_queue, decltype(grt)> pq2(grt); - - for (auto v : starting_points) { - if (has_been_seen(v)) continue; - pq1.push(std::pair(v, Points[v].distance(p))); - } - - long position = 0; - std::vector unseen; - while (pq1.size() > 0 && result.size() < beamSize + 10) { - auto nxt = pq1.top(); - pq1.pop(); - result.push_back(nxt); - unseen.clear(); - for (long i = 0; i < G[nxt.first].size(); i++) { - auto v = G[nxt.first][i]; - if (has_been_seen(v)) continue; - distance_comparisons++; - unseen.push_back(v); - Points[v].prefetch(); - } - for (auto v : unseen) { - auto d = Q_Points[v].distance(qp); - pq2.push(std::pair(v, d)); - } - while (10 + pq2.size() > 4 * pq1.size()) { - indexType v = pq2.top().first; - auto d = Points[v].distance(p); - pq1.push(std::pair(v, d)); - pq2.pop(); - } - } - - std::sort(result.begin(), result.end(), less); - - parlay::sequence r; - for (int i = 0; i < beamSize; i++) r.push_back(result[i]); - - return std::pair(std::pair(std::move(r), parlay::sequence()), - distance_comparisons); -} - // main beam search template + typename QPoint, typename QPointRange, class GT> std::pair>, parlay::sequence>>, size_t> @@ -165,20 +29,15 @@ filtered_beam_search(const GT &G, const QPoint qp, const QPointRange &Q_Points, const parlay::sequence starting_points, const QueryParams &QP, - bool use_filtering = false, - ES early_stop = ES{} + bool use_filtering = false ) { using dtype = typename Point::distanceType; using id_dist = std::pair; int beamSize = QP.beamSize; - int max_degree = QP.degree_limit; if (starting_points.size() == 0) { std::cout << "beam search expects at least one start point" << std::endl; abort(); - } else if (starting_points.size() > beamSize) { - std::cout << "beam search has more starting points than beam size" << std::endl; - abort(); } // compare two (node_id,distance) pairs, first by distance and then id if @@ -188,8 +47,17 @@ filtered_beam_search(const GT &G, return a.second < b.second || (a.second == b.second && a.first < b.first); }; - hashset has_been_seen(2 * (10 + beamSize) * max_degree); - + // used as a hash filter (can give false negative -- i.e. can say + // not in table when it is) + int bits = std::max(10, std::ceil(std::log2(beamSize * beamSize)) - 2); + std::vector hash_filter(1 << bits, -1); + auto has_been_seen = [&](indexType a) -> bool { + int loc = parlay::hash64_2(a) & ((1 << bits) - 1); + if (hash_filter[loc] == a) return true; + hash_filter[loc] = a; + return false; + }; + // Frontier maintains the closest points found so far and its size // is always at most beamSize. Each entry is a (id,distance) pair. // Initialized with starting points and kept sorted by distance. @@ -239,12 +107,8 @@ filtered_beam_search(const GT &G, // The main loop. Terminate beam search when the entire frontier // has been visited or have reached max_visit. while (remain > offset && num_visited < QP.limit) { - // the next node to visit is the unvisited frontier node that is closest to p id_dist current = unvisited_frontier[offset]; - if (early_stop(frontier, unvisited_frontier, visited, QP)) - break; - G[current.first].prefetch(); // add to visited set auto position = std::upper_bound(visited.begin(), visited.end(), current, less); @@ -255,8 +119,7 @@ filtered_beam_search(const GT &G, // if using filtering based on lower quality distances measure, then maintain the average // of low quality distance to the last point in the frontier (if frontier is full) if (use_filtering && frontier_full) { - //constexpr int width = 5; - int width = frontier.size(); + constexpr int width = 5; indexType id = frontier.back().first; if (filter_threshold_count == 0 || filter_id != id) { filter_tail_mean = 0.0; @@ -309,8 +172,7 @@ filtered_beam_search(const GT &G, // This iproves performance for higher accuracies (e.g. beam sizes of 100+) if (candidates.size() == 0 || (QP.limit >= 2 * beamSize && - //candidates.size() < beamSize/8 && - candidates.size() < QP.batch_factor * beamSize && + candidates.size() < beamSize/8 && offset + 1 < remain)) { offset++; continue; @@ -362,23 +224,13 @@ filtered_beam_search(const GT &G, full_dist_cmps); } - struct EStop { - template - bool operator () (const PointInfo& frontier, - const PointInfo& unvisited_frontier, - const PointInfo& visited, - const QueryParams& QP) { return false;} - }; - - // version without filtering - template // = EarlyStopping> +template std::pair>, parlay::sequence>>, size_t> beam_search(const Point p, const Graph &G, const PointRange &Points, - const parlay::sequence starting_points, const QueryParams &QP - ) { - return filtered_beam_search(G,p, Points, p, Points, starting_points, QP, false); //early_stop); + const parlay::sequence starting_points, const QueryParams &QP) { + return filtered_beam_search(G, p, Points, p, Points, starting_points, QP, false); } // backward compatibility (for hnsw) @@ -399,6 +251,71 @@ beam_search(const Point p, const Graph &G, const PointRange &Points, return beam_search(p, G, Points, start_points, QP); } +// a range search that first finds a close point using a beam search, +// and then uses BFS to find all points within the range +template +std::pair, typename Point::distanceType> +range_search(Point p, GT &G, PointRange &Points, + parlay::sequence starting_points, + typename Point::distanceType radius, + typename Point::distanceType radius_2, + QueryParams &QP, bool use_existing = false) { + // first search for a starting point within the radius + + std::vector result; + std::unordered_set seen; + //std::vector starting_points; + long distance_comparisons = 0; + + // if (use_existing) { + // for (indexType i=0; i 0 || Points[v].same_as(p)) continue; + distance_comparisons++; + if (p.distance(Points[v]) > radius_2 ) continue; + result.push_back(v); + seen.insert(v); + } + + // now do a BFS over all vertices with distance less than radius + long position = 0; + while (position < result.size()) { + indexType next = result[position++]; + std::vector unseen; + for (long i = 0; i < G[next].size(); i++) { + auto v = G[next][i]; + if (seen.count(v) > 0 || Points[v].same_as(p)) + continue; // skip if already seen + unseen.push_back(v); + seen.insert(v); + Points[v].prefetch(); + } + for (auto v : unseen) { + distance_comparisons++; + if (Points[v].distance(p) <= radius_2) + result.push_back(v); + } + } + + + // std::vector result1; + // for (auto v : result) { + // if (p.distance(Points[v]) > radius ) continue; + // result1.push_back(v); + // } + + return std::pair(result, distance_comparisons); +} + // searches every element in q starting from a randomly selected point template parlay::sequence> @@ -502,23 +419,13 @@ beam_search_rerank(const Point &p, bool use_rerank = (Base_Points.params.num_bytes() != Q_Base_Points.params.num_bytes()); bool use_filtering = (Q_Base_Points.params.num_bytes() != QQ_Base_Points.params.num_bytes()); - std::pair, parlay::sequence>, size_t> r; - if (QP.batch_factor == .125) - r = filtered_beam_search(G, - qp, Q_Base_Points, - qqp, QQ_Base_Points, - starting_points, QPP, use_filtering); - else - r = priority_first_search(G, - qp, Q_Base_Points, - qqp, QQ_Base_Points, - starting_points, QPP); - auto [pairElts, dist_cmps] = r; + auto [pairElts, dist_cmps] = filtered_beam_search(G, + qp, Q_Base_Points, + qqp, QQ_Base_Points, + starting_points, QPP, use_filtering); auto [beamElts, visitedElts] = pairElts; if (beamElts.size() < QP.k) { - std::cout << "Error: for point id " << p.id() - << " beam search returned " << beamElts.size() - << " elements, which is less than k = " << QP.k << std::endl; + std::cout << "Error: for point id " << p.id() << " beam search returned " << beamElts.size() << " elements, which is less than k = " << QP.k << std::endl; abort(); } @@ -687,7 +594,54 @@ qsearchAll(const PointRange &Query_Points, return all_neighbors; } +// template +// parlay::sequence> +// RangeSearch(PointRange& Query_Points, +// Graph &G, PointRange &Base_Points, stats &QueryStats, +// indexType starting_point, RangeParams &QP) { +// parlay::sequence start_points = {starting_point}; +// return RangeSearch(Query_Points, G, Base_Points, QueryStats, start_points, QP); +// } +// template +// parlay::sequence> +// RangeSearch(PointRange &Query_Points, +// Graph &G, PointRange &Base_Points, stats &QueryStats, +// parlay::sequence starting_points, +// RangeParams &RP) { + +// parlay::sequence> all_neighbors(Query_Points.size()); +// // parlay::sequence second_round(Query_Points.size(), 0); +// parlay::parallel_for(0, Query_Points.size(), [&](size_t i) { +// parlay::sequence neighbors; +// QueryParams QP(RP.initial_beam, RP.initial_beam, 0.0, G.size(), G.max_degree()); +// auto [pairElts, dist_cmps] = beam_search(Query_Points[i], G, Base_Points, starting_points, QP); +// auto [beamElts, visitedElts] = pairElts; +// for (indexType j = 0; j < beamElts.size(); j++) { +// if(beamElts[j].second <= RP.rad) neighbors.push_back(beamElts[j].first); +// } +// all_neighbors[i] = neighbors; +// // if(neighbors.size() < RP.initial_beam){ +// // all_neighbors[i] = neighbors; +// // } else{ +// // auto [in_range, dist_cmps] = range_search(Query_Points[i], G, Base_Points, neighbors, RP); +// // parlay::sequence ans; +// // for (auto r : in_range) ans.push_back(r.first); +// // if(in_range.size() > neighbors.size()) std::cout << "Range search found additional candidates" << std::endl; +// // all_neighbors[i] = ans; +// // second_round[i] = 1; +// // QueryStats.increment_visited(i, in_range.size()); +// // QueryStats.increment_dist(i, dist_cmps); +// // } + +// QueryStats.increment_visited(i, visitedElts.size()); +// QueryStats.increment_dist(i, dist_cmps); +// }); + +// // std::cout << parlay::reduce(second_round) << " elements advanced to round two" << std::endl; + +// return all_neighbors; +// } } // end namespace diff --git a/algorithms/utils/check_nn_recall.h b/algorithms/utils/check_nn_recall.h index ea06bbbd..63994b92 100644 --- a/algorithms/utils/check_nn_recall.h +++ b/algorithms/utils/check_nn_recall.h @@ -195,8 +195,7 @@ void search_and_parse(Graph_ G_, indexType start_point = 0, bool verbose = false, long fixed_beam_width = 0, - double rerank_factor = 100, - double batch_factor = .125) { + int rerank_factor = 100) { parlay::sequence results; std::vector beams; std::vector allr; @@ -211,7 +210,10 @@ void search_and_parse(Graph_ G_, random, start_point, k, QP, verbose);}; - QueryParams QP(k, 0, 1.0, G.size(), G.max_degree(), rerank_factor, batch_factor); + QueryParams QP; + QP.limit = (long) G.size(); + QP.rerank_factor = rerank_factor; + QP.degree_limit = (long) G.max_degree(); beams = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 45, 50, 55, 60, 65, 70, 80, 90, 100, 120, 140, 160, 180, 200, 225, 250, 275, 300, 375, 500, 750, 1000}; @@ -245,8 +247,7 @@ void search_and_parse(Graph_ G_, //calculate_limits(results[0].avg_visited); //parlay::sequence degree_limits = calculate_limits(G.max_degree()); //degree_limits.push_back(G.max_degree()); - QP = QueryParams(r, r, 1.35, (long) G.size(), (long) G.max_degree(), - rerank_factor, batch_factor); + QP = QueryParams(r, r, 1.35, (long) G.size(), (long) G.max_degree()); for(long l : limits){ QP.limit = l; QP.beamSize = std::max(l, r); @@ -255,8 +256,7 @@ void search_and_parse(Graph_ G_, results.push_back(check(r, QP)); } // check "best accuracy" - QP = QueryParams((long) 100, (long) 1000, (double) 10.0, (long) G.size(), - (long) G.max_degree(), rerank_factor, batch_factor); + QP = QueryParams((long) 100, (long) 1000, (double) 10.0, (long) G.size(), (long) G.max_degree()); results.push_back(check(r, QP)); parlay::sequence buckets = {.1, .2, .3, .4, .5, .6, .7, .75, .8, .85, diff --git a/algorithms/utils/check_range_recall.h b/algorithms/utils/check_range_recall.h index 2e41a541..341c661f 100644 --- a/algorithms/utils/check_range_recall.h +++ b/algorithms/utils/check_range_recall.h @@ -1,9 +1,10 @@ +#ifndef ALGORITHMS_CHECK_RANGE_RECALL +#define ALGORITHMS_CHECK_RANGE_RECALL + #include #include #include "beamSearch.h" -#include "doublingSearch.h" -#include "rangeSearch.h" #include "csvfile.h" #include "parse_results.h" #include "parlay/parallel.h" @@ -13,148 +14,71 @@ namespace parlayANN { -template +template void checkRangeRecall( Graph &G, - PointRange &Base_Points, PointRange &Query_Points, - QPointRange &Q_Base_Points, QPointRange &Q_Query_Points, - RangeGroundTruth GT, QueryParams QP, - long start_point,parlay::sequence &active_indices) { + PointRange &Base_Points, + PointRange &Query_Points, + RangeGroundTruth GT, + RangeParams RP, + long start_point) { - if(QP.range_query_type == Doubling) { - - parlay::internal::timer t; - float query_time; - stats QueryStats(Query_Points.size()); - parlay::sequence start_points = {static_cast(start_point)}; - - auto [all_rr,timings] = DoubleBeamRangeSearch(G, - Query_Points, Base_Points, - Q_Query_Points, Q_Base_Points, - QueryStats, start_points, QP, active_indices); - query_time = t.next_time(); - auto [beam_search_time, other_time] = timings; - - float pointwise_recall = 0.0; - float reported_results = 0.0; - float total_results = 0.0; - float num_nonzero = 0.0; - //since distances are exact, just have to cross-check number of results - size_t n = Query_Points.size(); - for (indexType i = 0; i < n; i++) { - float num_reported_results = all_rr[i].size(); - float num_actual_results = GT[i].size(); - reported_results += num_reported_results; - total_results += num_actual_results; - if(num_actual_results != 0) {pointwise_recall += num_reported_results/num_actual_results; num_nonzero++;} - } - - pointwise_recall /= num_nonzero; - float cumulative_recall = reported_results/total_results; - - float QPS = Query_Points.size() / query_time; - auto stats_ = {QueryStats.dist_stats(), QueryStats.visited_stats()}; - std::cout << "For "; - QP.print(); - std::cout << ", Point Recall=" << pointwise_recall - << ", Cum Recall=" << cumulative_recall - << ", Comparisons=" << QueryStats.dist_stats()[0] - << ", Visited=" << QueryStats.visited_stats()[0] - << ", QPS=" << QPS - << ", ctime=" << (1e9 / (QPS * QueryStats.dist_stats()[0])) - << ", timings= [" << beam_search_time<< ","<< other_time <<"]" - << std::endl; - - } else if (QP.range_query_type == Greedy || QP.range_query_type == Beam) { + parlay::sequence> all_rr; + parlay::internal::timer t; float query_time; stats QueryStats(Query_Points.size()); - parlay::sequence start_points = {static_cast(start_point)}; - parlay::internal::timer t; - - auto [all_rr, timings] = RangeSearch(G, - Query_Points, Base_Points, - Q_Query_Points, Q_Base_Points, - QueryStats, start_point, QP); - auto [beam_search_time, other_time] = timings; + + all_rr = RangeSearch(Query_Points, G, Base_Points, QueryStats, start_point, RP); query_time = t.next_time(); + float pointwise_recall = 0.0; float reported_results = 0.0; float total_results = 0.0; float num_nonzero = 0.0; - //since distances are exact, just have to cross-check number of results - size_t n = Query_Points.size(); - for (indexType i = 0; i < n; i++) { - float num_reported_results = all_rr[i].size(); - float num_actual_results = GT[i].size(); - reported_results += num_reported_results; - total_results += num_actual_results; - if(num_actual_results != 0) {pointwise_recall += num_reported_results/num_actual_results; num_nonzero++;} - } + //since distances are exact, just have to cross-check number of results + size_t n = Query_Points.size(); + for (indexType i = 0; i < n; i++) { + float num_reported_results = all_rr[i].size(); + float num_actual_results = GT[i].size(); + reported_results += num_reported_results; + total_results += num_actual_results; + if(num_actual_results != 0) {pointwise_recall += num_reported_results/num_actual_results; num_nonzero++;} + } - pointwise_recall /= num_nonzero; - float cumulative_recall = reported_results/total_results; + pointwise_recall /= num_nonzero; + float cumulative_recall = reported_results/total_results; float QPS = Query_Points.size() / query_time; auto stats_ = {QueryStats.dist_stats(), QueryStats.visited_stats()}; + std::cout << "For "; - QP.print(); - std::cout << ", Point Recall=" << pointwise_recall - << ", Cum Recall=" << cumulative_recall - << ", Comparisons=" << QueryStats.dist_stats()[0] - << ", Visited=" << QueryStats.visited_stats()[0] - << ", QPS=" << QPS - << ", ctime=" << (1e9 / (QPS * QueryStats.dist_stats()[0])) - << ", timings= [" << beam_search_time<< ","<< other_time <<"]" - << std::endl; - } - else { - std::cout << "Error: No beam search type provided, -seach_mode should be one of [doubling, greedy, beam]" << std::endl; - } + RP.print(); + std::cout << ", Pointwise Recall = " << pointwise_recall << ", Cumulative Recall = " << cumulative_recall << ", QPS = " << QPS << std::endl; + + } -template -void range_search_wrapper(Graph &G, - PointRange &Base_Points, PointRange &Query_Points, - QPointRange &Q_Base_Points, QPointRange &Q_Query_Points, - RangeGroundTruth GT, indexType start_point=0, - bool is_early_stopping = false, double esr = 0.0, - rangeQueryType rtype = None, double rad = 0.0) { +template +void range_search_wrapper(Graph &G, PointRange &Base_Points, + PointRange &Query_Points, + RangeGroundTruth GT, double rad, + indexType start_point=0){ std::vector beams; beams = {10, 20, 30, 40, 50, 100, 1000, 2000, 3000}; - - long es = 0; - - parlay::sequence all = parlay::tabulate(Query_Points.size(), [&] (indexType i){return i;}); - parlay::sequence cumulative_recall; - parlay::sequence> timings; - parlay::sequence beam_size; - - for(long b: beams){ - if (is_early_stopping) - es = std::max((long)10, b/4); - - QueryParams QP(b, b, 0.0, G.size(), G.max_degree(), - is_early_stopping, esr, es, rtype, rad); - - - checkRangeRecall(G, - Base_Points, Query_Points, - Q_Base_Points, Q_Query_Points, - GT, QP, start_point, all); - + RangeParams RP(rad, b); + checkRangeRecall(G, Base_Points, Query_Points, GT, RP, start_point); } - - } } // end namespace +#endif // ALGORITHMS_CHECK_RANGE_RECALL diff --git a/algorithms/utils/doublingSearch.h b/algorithms/utils/doublingSearch.h deleted file mode 100644 index f0be6700..00000000 --- a/algorithms/utils/doublingSearch.h +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "parlay/io.h" -#include "parlay/parallel.h" -#include "parlay/primitives.h" -#include "parlay/random.h" -#include "parlay/worker_specific.h" -#include "types.h" -#include "graph.h" -#include "stats.h" -#include "beamSearch.h" -#include "earlyStopping.h" - -namespace parlayANN{ - template -std::pair>,std::pair> -DoubleBeamRangeSearch(Graph &G, - PointRange &Query_Points, PointRange &Base_Points, - QPointRange &Q_Query_Points, QPointRange &Q_Base_Points, - stats &QueryStats, - parlay::sequence starting_points, - QueryParams &QP, parlay::sequence active_indices) { - parlay::sequence> all_neighbors(active_indices.size()); - parlay::WorkerSpecific first_round_time; - parlay::WorkerSpecific second_round_time; - bool use_rerank = (Base_Points.params.num_bytes() != Q_Base_Points.params.num_bytes()); - - parlay::parallel_for(0, active_indices.size(), [&](size_t i) { - parlay::sequence neighbors; - parlay::internal::timer t_search_first("first round time"); - parlay::internal::timer t_search_other("after first round"); - t_search_first.stop(); - t_search_other.stop(); - - t_search_first.start(); - auto P = Query_Points[active_indices[i]]; - auto Q_P = Q_Query_Points[active_indices[i]]; - using dtype = typename decltype(Query_Points[0])::distanceType; - using id_dist = std::pair; - QueryParams QP1(QP.beamSize, QP.beamSize, 0.0, - G.size(), G.max_degree(), - QP.is_early_stop, Q_P.translate_distance(QP.early_stopping_radius), - QP.early_stopping_count, - QP.range_query_type, QP.radius); - - auto [pairElts, dist_cmps] = filtered_beam_search(G, Q_P, Q_Base_Points, Q_P, Q_Base_Points, - starting_points, QP1, false, - early_stopping>); - auto [beamElts, visitedElts] = pairElts; - - QueryStats.increment_visited(i, visitedElts.size()); - QueryStats.increment_dist(i, dist_cmps); - - // rerank and filter out results not within the radius - for (auto b : beamElts){ - double dist; - if (use_rerank) {dist = P.distance(Base_Points[b.first]);} - else {dist = b.second;} - if (dist <= QP.radius) neighbors.push_back(b.first); - } - - bool results_smaller_than_beam = false; - if (neighbors.size() < QP.beamSize) - results_smaller_than_beam = true; - - all_neighbors[i] = std::move(neighbors); - - size_t initial_beam = QP.beamSize * 2; - // Initialize starting points - parlay::sequence starting_points_idx; - - for (auto s : beamElts) - starting_points_idx.push_back(s.first); - t_search_first.stop(); - - t_search_other.start(); - while(!results_smaller_than_beam){ - parlay::sequence neighbors; - - QueryParams QP2(initial_beam, initial_beam, 0.0, G.size(), G.max_degree()); - auto [pairElts, dist_cmps] = beam_search(Q_P, G, Q_Base_Points, starting_points_idx, QP2); - auto [beamElts, visitedElts] = pairElts; - - starting_points_idx.clear(); - for (auto v : beamElts) - starting_points_idx.push_back(v.first); - - // rerank and filter out results not within the radius - for (auto b : beamElts){ - double dist; - if (use_rerank) {dist = P.distance(Base_Points[b.first]);} - else {dist = b.second;} - if (dist <= QP.radius) neighbors.push_back(b.first); - } - - if (neighbors.size() < initial_beam) - results_smaller_than_beam = true; - - all_neighbors[i] = neighbors; - - QueryStats.increment_visited(i, visitedElts.size()); - QueryStats.increment_dist(i, dist_cmps); - initial_beam *= 2; - neighbors.clear(); - - } - t_search_other.stop(); - *first_round_time += t_search_first.total_time(); - *second_round_time += t_search_other.total_time(); - - }); - - - double total_time_first = 0; - double total_time_second = 0; - for (auto x : first_round_time) total_time_first += x; - for (auto y: second_round_time) total_time_second += y; - - return std::make_pair(all_neighbors,std::make_pair(total_time_first,total_time_second)); -} -} diff --git a/algorithms/utils/earlyStopping.h b/algorithms/utils/earlyStopping.h deleted file mode 100644 index 4f0099d7..00000000 --- a/algorithms/utils/earlyStopping.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include - -#include "parlay/io.h" -#include "parlay/parallel.h" -#include "parlay/primitives.h" -#include "parlay/random.h" -#include "beamSearch.h" -#include "types.h" -#include "graph.h" -#include "stats.h" - -namespace parlayANN{ - template - bool early_stopping(const PointInfo& frontier, - const PointInfo& unvisited_frontier, - const PointInfo& visited, - const QueryParams& QP){ - bool has_visited_enough = (visited.size() >= QP.early_stopping_count); - bool early_stop = (QP.early_stopping_count > 0); - bool has_found_candidate = (frontier[0].second <= QP.radius); - bool within_early_stop_rad = (unvisited_frontier[0].second <= QP.early_stopping_radius); - return early_stop && has_visited_enough && !has_found_candidate && !within_early_stop_rad; - } -} diff --git a/algorithms/utils/euclidian_point.h b/algorithms/utils/euclidian_point.h index 79e0dd92..baeb07a0 100644 --- a/algorithms/utils/euclidian_point.h +++ b/algorithms/utils/euclidian_point.h @@ -31,7 +31,7 @@ #include "parlay/internal/file_map.h" #include "types.h" -#include "NSGDist.h" +//#include "NSGDist.h" // #include "common/time_loop.h" #include @@ -80,9 +80,9 @@ inline float euclidian_distance(const int8_t *p, const int8_t *q, unsigned d) { return (float)result; } -float euclidian_distance(const float *p, const float *q, unsigned d) { - // efanna2e::DistanceL2 distfunc; - // return distfunc.compare(p, q, d); +inline float euclidian_distance(const float *p, const float *q, unsigned d) { + //efanna2e::DistanceL2 distfunc; + //return distfunc.compare(p, q, d); float result = 0.0; for (int i = 0; i < d; i++) result += (q[i] - p[i]) * (q[i] - p[i]); @@ -116,12 +116,6 @@ struct Euclidian_Point { return euclidian_distance(this->values, x.values, params.dims); } - float translate_distance(double r) const { - if constexpr (sizeof(T) == 2) - return r * params.slope * params.slope / 256; - else return r * params.slope * params.slope; - } - void normalize() { double norm = 0.0; for (int j = 0; j < params.dims; j++) diff --git a/algorithms/utils/filtered_hashset.h b/algorithms/utils/filtered_hashset.h deleted file mode 100644 index 7222f825..00000000 --- a/algorithms/utils/filtered_hashset.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef ALGORITHMS_ANN_HASHSET_H_ -#define ALGORITHMS_ANN_HASHSET_H_ - -#include -#include - -// a hashset that enters integer keys and can give a false negative -// grows as needed -// filtered_hashset x(n); : creates an empty hashset x of initial capacity n -// x(i) : returns true if i in set, otherwise adds i to set and returns false -template -struct filtered_hashset { - int bits; - std::vector filter; - size_t mask; - long num_entries = 0; - size_t hash(intT const& k) const noexcept { - return k * UINT64_C(0xbf58476d1ce4e5b9); } - - bool operator () (intT a) { - int loc = hash(a) & mask; - if (filter[loc] == a) return true; - if (num_entries > filter.size()/2) { - bits = bits + 1; - std::vector new_filter(1ul << bits, -1); - mask = new_filter.size() - 1; - for (auto x : filter) - new_filter[hash(x) & mask] = x; - loc = hash(a) & mask; - filter = new_filter; - } - filter[loc] = a; - num_entries++; - return false; - }; - filtered_hashset(long n) : - bits(std::ceil(std::log2(n))), - filter(std::vector(1ul << bits, -1)), - mask(filter.size() - 1) - {} -}; - -template -struct hashset { - int bits; - std::vector filter; - size_t mask; - long num_entries = 0; - size_t hash(intT const& k) const noexcept { - return k * UINT64_C(0xbf58476d1ce4e5b9); } - - bool operator () (intT a) { - int loc = hash(a) & mask; - if (filter[loc] == a) return true; - if (filter[loc] != -1) { - loc = (loc + 1) & mask; - while (filter[loc] != -1 && filter[loc] != a) - loc = (loc + 1) & mask; - if (filter[loc] == a) return true; - } - if (num_entries > filter.size()/2) { - bits = bits + 1; - std::vector new_filter(1ul << bits, -1); - mask = new_filter.size() - 1; - for (auto x : filter) { - int loc = hash(x) & mask; - while (new_filter[loc] != -1) - loc = (loc + 1) & mask; - new_filter[loc] = x; - } - loc = hash(a) & mask; - filter = new_filter; - } - filter[loc] = a; - num_entries++; - return false; - }; - hashset(long n) : - bits(std::ceil(std::log2(n))), - filter(std::vector((1ul << bits), -1)), - mask(filter.size() - 1) - {} -}; - -#endif // ALGORITHMS_ANN_HASHSET_H_ diff --git a/algorithms/utils/graph.h b/algorithms/utils/graph.h index dc850660..bbaa8979 100644 --- a/algorithms/utils/graph.h +++ b/algorithms/utils/graph.h @@ -206,7 +206,6 @@ struct Graph{ void save(char* oFile) { std::cout << "Writing graph with " << n << " points and max degree " << maxDeg - << " to " << oFile << std::endl; parlay::sequence preamble = {static_cast(n), static_cast(maxDeg)}; diff --git a/algorithms/utils/mips_point.h b/algorithms/utils/mips_point.h index a85d5fd1..5b667bb9 100644 --- a/algorithms/utils/mips_point.h +++ b/algorithms/utils/mips_point.h @@ -37,7 +37,6 @@ #include #include #include -#include "NSGDist.h" namespace parlayANN { @@ -87,10 +86,6 @@ struct Mips_Point { return mips_distance(this->values, x.values, params.dims); } - float translate_distance(float r) const { - return r; - } - void prefetch() const { int l = (params.dims * sizeof(T) - 1)/64 + 1; for (int i=0; i < l; i++) @@ -293,18 +288,17 @@ struct Mips_Point { template struct Quantized_Mips_Point{ using T = int16_t; - using distanceType = float; + using distanceType = int64_t; //float; using byte = uint8_t; struct parameters { float max_val; int dims; - float scale; int num_bytes() const {return (dims * bits - 1) / 8 + 1;} parameters() : max_val(1), dims(0) {} parameters(int dims) : max_val(1), dims(dims) {} parameters(float max_val, int dims) - : max_val(max_val), dims(dims), scale((range/2) / max_val) {} + : max_val(max_val), dims(dims) {} }; static bool is_metric() {return false;} @@ -371,9 +365,6 @@ struct Quantized_Mips_Point{ } } - float translate_distance(float r) const { - return r * params.scale * params.scale; - } void prefetch() const { int l = (params.num_bytes() - 1)/64 + 1; @@ -426,7 +417,7 @@ struct Quantized_Mips_Point{ static void translate_point(byte* byte_values, const Point& p, const parameters& params) { for (int j = 0; j < params.dims; j++) { float mv = params.max_val; - float scale = params.scale; //(range/2) / mv; + float scale = (range/2) / mv; float pj = p[j]; // cap if underflow or overflow if (pj < -mv) assign(byte_values, j, - range/2); // - 1); diff --git a/algorithms/utils/rangeSearch.h b/algorithms/utils/rangeSearch.h deleted file mode 100644 index 9161b5c0..00000000 --- a/algorithms/utils/rangeSearch.h +++ /dev/null @@ -1,278 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "parlay/io.h" -#include "parlay/parallel.h" -#include "parlay/primitives.h" -#include "parlay/random.h" -#include "beamSearch.h" -#include "earlyStopping.h" -#include "types.h" -#include "graph.h" -#include "stats.h" -#include "filtered_hashset.h" - -namespace parlayANN { - - template - std::pair, long> -greedy_search(Point p, Graph &G, PointRange &Points, - std::vector> &starting_points, - double radius) { - std::vector result; - hashset has_been_seen(2 * starting_points.size() * 64); - long distance_comparisons = 0; - - for (auto [v,d] : starting_points) { - if (has_been_seen(v) || d > radius) continue; - result.push_back(v); - } - - // now do a BFS over all vertices with distance less than radius - long position = 0; - std::vector unseen; - while (position < result.size()) { - indexType next = result[position++]; - unseen.clear(); - for (long i = 0; i < G[next].size(); i++) { - auto v = G[next][i]; - if (has_been_seen(v)) continue; - unseen.push_back(v); - Points[v].prefetch(); - } - for (auto v : unseen) { - distance_comparisons++; - if (Points[v].distance(p) <= radius) - result.push_back(v); - } - } - - return std::pair(std::move(result), distance_comparisons); -} - - // Does a priority-first search up to the radius given - template - std::pair, long> -greedy_search_pq(Point p, Graph &G, PointRange &Points, - std::vector> &starting_points, - double radius) { - - std::vector result; - hashset has_been_seen(2 * starting_points.size() * 64); - - long distance_comparisons = 0; - using did = std::pair; - auto cmp = [] (did a, did b) {return a.first > b.first;}; - std::priority_queue, decltype(cmp)> pq(cmp); - - for (auto [v,d] : starting_points) { - if (has_been_seen(v)) continue; - if (d > radius ) continue; - pq.push(std::pair(d,v)); - } - - long position = 0; - std::vector unseen; - while (pq.top().first <= radius) { - auto nxt = pq.top().second; - pq.pop(); - result.push_back(nxt); - unseen.clear(); - for (long i = 0; i < G[nxt].size(); i++) { - auto v = G[nxt][i]; - if (has_been_seen(v)) continue; - unseen.push_back(v); - Points[v].prefetch(); - } - for (auto v : unseen) { - distance_comparisons++; - pq.push(std::pair(Points[v].distance(p), v)); - } - } - - return std::pair(std::move(result), distance_comparisons); -} - - //a variant specialized for range searching -template -std::pair, size_t> -greedy_search_old(Point p, Graph &G, PointRange &Points, - parlay::sequence> &starting_points, - double radius, - parlay::sequence> &already_visited) { - // compare two (node_id,distance) pairs, first by distance and then id if - // equal - using distanceType = typename Point::distanceType; - auto less = [&](std::pair a, std::pair b) { - return a.second < b.second || (a.second == b.second && a.first < b.first); - }; - - //need to use an unordered map for a dynamically sized hash table - std::unordered_set has_been_seen; - - //Insert everything from visited list into has_been_seen - for(auto v : already_visited){ - if(!has_been_seen.count(v.first) > 0) has_been_seen.insert(v.first); - } - - // Frontier maintains the points within radius found so far - // Each entry is a (id,distance) pair. - // Initialized with starting points - std::queue frontier; - for (auto q : starting_points){ - if (!has_been_seen.count(q.first) > 0) has_been_seen.insert(q.first); - frontier.push(q.first); - } - - - // maintains set of visited vertices (id-distance pairs) - std::vector visited; - - // counters - size_t dist_cmps = starting_points.size(); - int remain = 1; - int num_visited = 0; - double total; - - // used as temporaries in the loop - std::vector keep; - keep.reserve(G.max_degree()); - - // The main loop. Terminate beam search when the entire frontier - // has been visited or have reached max_visit. - while (frontier.size() > 0) { - // the next node to visit is the unvisited frontier node that is closest to - // p - indexType current = frontier.front(); - frontier.pop(); - G[current].prefetch(); - // add to visited set - visited.push_back(current); - num_visited++; - - // keep neighbors that have not been visited (using approximate - // hash). Note that if a visited node is accidentally kept due to - // approximate hash it will be removed below by the union or will - // not bump anyone else. - keep.clear(); - for (indexType i=0; i 0) continue; // skip if already seen - keep.push_back(a); - Points[a].prefetch(); - has_been_seen.insert(a); - } - - for (auto a : keep) { - distanceType dist = Points[a].distance(p); - dist_cmps++; - // filter out if not within radius - if (dist > radius) continue; - frontier.push(a); - } - } - - return std::make_pair(visited, dist_cmps); -} - - template - std::pair>,std::pair> -RangeSearch(Graph &G, - PointRange &Query_Points, PointRange &Base_Points, - QPointRange& Q_Query_Points, QPointRange &Q_Base_Points, - stats &QueryStats, - indexType starting_point, - QueryParams &QP) { - - parlay::sequence starting_points = {starting_point}; - parlay::sequence> all_neighbors(Query_Points.size()); - parlay::WorkerSpecific beam_time; - parlay::WorkerSpecific other_time; - bool use_rerank = (Base_Points.params.num_bytes() != Q_Base_Points.params.num_bytes()); - parlay::parallel_for(0, Query_Points.size(), [&](size_t i) { - parlay::internal::timer t_search_beam("beam search time"); - parlay::internal::timer t_search_other("other time"); - t_search_beam.stop(); - t_search_other.stop(); - std::vector neighbors; - std::vector> neighbors_with_distance; - t_search_beam.start(); - using dtype = typename Point::distanceType; - using id_dist = std::pair; - QueryParams QP1(QP.beamSize, QP.beamSize, 0.0, G.size(), G.max_degree(), - QP.is_early_stop, Q_Query_Points[i].translate_distance(QP.early_stopping_radius), - QP.early_stopping_count, - QP.range_query_type, Q_Query_Points[i].translate_distance(QP.radius)); - - auto [pairElts, dist_cmps_beam] = - filtered_beam_search(G, Q_Query_Points[i], Q_Base_Points, - Q_Query_Points[i], Q_Base_Points, - starting_points, QP1, false, - early_stopping>); - t_search_beam.stop(); - auto [beamElts, visitedElts] = pairElts; - for (auto b : beamElts) { - double dist; - if (use_rerank) { - dist = Query_Points[i].distance(Base_Points[b.first]); - } else { - dist = b.second; - } - if (dist <= QP.radius) { - neighbors.push_back(b.first); - neighbors_with_distance.push_back(b); - } - } - if (neighbors.size() < QP.beamSize || QP.range_query_type == Beam){ - all_neighbors[i] = std::move(neighbors); - } else{ - // if using quantization then use slightly larger radius - t_search_other.start(); - double pad_factor = (QP1.radius > 0) ? 1.05 : .975; - double radius = use_rerank ? pad_factor * QP1.radius : QP1.radius; - auto [in_range, dist_cmps_greedy] = - greedy_search(Q_Query_Points[i], G, Q_Base_Points, - neighbors_with_distance, radius); - - std::vector ans; - - //#define EndWithBeam -#ifdef EndWithBeam - int beamSize = in_range.size() * 1.1; - QueryParams QP2(beamSize, beamSize, 0.0, G.size(), G.max_degree()); - auto [pairElts, dist_cmps2] = beam_search(Q_Query_Points[i], G, Q_Base_Points, in_range, QP2); - for (auto r : pairElts.first) - if (Query_Points[i].distance(Base_Points[r.first]) <= QP.radius) - ans.push_back(r.first); -#else - for (auto r : in_range) - if (!use_rerank || Query_Points[i].distance(Base_Points[r]) <= QP.radius) - ans.push_back(r); -#endif - all_neighbors[i] = std::move(ans); - QueryStats.increment_visited(i, in_range.size()); - QueryStats.increment_dist(i, dist_cmps_greedy); - t_search_other.stop(); - } - - - *beam_time += t_search_beam.total_time(); - *other_time += t_search_other.total_time(); - QueryStats.increment_visited(i, visitedElts.size()); - QueryStats.increment_dist(i, dist_cmps_beam); - - }); - - double total_time_beam = 0; - double total_time_other = 0; - for (auto x : beam_time) total_time_beam += x; - for (auto y: other_time) total_time_other += y; - return std::make_pair(all_neighbors,std::make_pair(total_time_beam,total_time_other)); -} - -} \ No newline at end of file diff --git a/algorithms/utils/stats.h b/algorithms/utils/stats.h index 8ee97670..7443bec3 100644 --- a/algorithms/utils/stats.h +++ b/algorithms/utils/stats.h @@ -81,10 +81,12 @@ struct stats{ distances = parlay::sequence(n, 0); } - static parlay::sequence statistics(parlay::sequence s){ - auto sl = parlay::map(s, [] (long x) { return x;}); - indexType avg = (indexType) (parlay::reduce(sl) / s.size()); - indexType tail = parlay::sort(s)[.99 * ((float)s.size())]; + parlay::sequence statistics(parlay::sequence s){ + parlay::sequence stats = parlay::tabulate(s.size(), [&](size_t i) { return s[i];}); + parlay::sort_inplace(stats); + indexType avg= (int)parlay::reduce(stats) / ((double)s.size()); + indexType tail_index = .99 * ((float)s.size()); + indexType tail = stats[tail_index]; auto result = {avg, tail}; return result; } diff --git a/algorithms/utils/types.h b/algorithms/utils/types.h index 61700e9a..b7f7e060 100644 --- a/algorithms/utils/types.h +++ b/algorithms/utils/types.h @@ -150,8 +150,7 @@ struct RangeGroundTruth{ size_t matches(){return num_matches;} }; -enum rangeQueryType {None, Greedy, Doubling, Beam}; - + struct BuildParams{ long R; //vamana and pynnDescent long L; //vamana @@ -164,38 +163,27 @@ struct BuildParams{ long MST_deg; //HCNNG double delta; //pyNNDescent + bool verbose; int quantize = 0; // use quantization for build and query (0 = none, 1 = one-level, 2 = two-level) + double radius; // for radius search + double radius_2; // for radius search bool self; + bool range; int single_batch; //vamana long Q = 0; //beam width to pass onto query (0 indicates none specified) double trim = 0.0; // for quantization double rerank_factor = 100; // for reranking, k * factor = to rerank - double batch_factor = 1.0; - bool is_early_stop = false; - double early_stopping_radius; // for radius search - rangeQueryType range_query_type = None; - double radius; // for radius search std::string alg_type; - BuildParams(long R, long L, double a, int num_passes, long nc, long cs, long mst, double de, - bool verbose = false, int quantize = 0, - bool self = false, int single_batch = 0, - long Q = 0, double trim = 0.0, - double rerank_factor = 100, double batch_factor = 1.0, - bool is_early_stop = false, double early_stopping_radius = 0.0, - rangeQueryType range_query_type = None, double radius = 0.0) - : R(R), L(L), alpha(a), num_passes(num_passes), num_clusters(nc), - cluster_size(cs), MST_deg(mst), delta(de), - verbose(verbose), quantize(quantize), - self(self), single_batch(single_batch), - Q(Q), trim(trim), - rerank_factor(rerank_factor), batch_factor(batch_factor), - is_early_stop(is_early_stop), early_stopping_radius(early_stopping_radius), - range_query_type(range_query_type), radius(radius) { + bool verbose = false, int quantize = 0, double radius = 0.0, double radius_2 = 0.0, + bool self = false, bool range = false, int single_batch = 0, long Q = 0, double trim = 0.0, + int rerank_factor = 100) + : R(R), L(L), alpha(a), num_passes(num_passes), num_clusters(nc), cluster_size(cs), MST_deg(mst), delta(de), + verbose(verbose), quantize(quantize), radius(radius), radius_2(radius_2), self(self), range(range), single_batch(single_batch), Q(Q), trim(trim), rerank_factor(rerank_factor) { if(R != 0 && L != 0 && alpha != 0){alg_type = m_l>0? "HNSW": "Vamana";} else if(num_clusters != 0 && cluster_size != 0 && MST_deg != 0){alg_type = "HCNNG";} else if(R != 0 && alpha != 0 && num_clusters != 0 && cluster_size != 0 && delta != 0){alg_type = "pyNNDescent";} @@ -233,38 +221,30 @@ struct QueryParams{ double cut; long limit; long degree_limit; - double rerank_factor = 100; - double batch_factor = .125; - bool is_early_stop = false; - double early_stopping_radius; - double early_stopping_count; - rangeQueryType range_query_type = None; - double radius; - + int rerank_factor = 100; float pad = 1.0; - QueryParams(long k, long Q, double cut, long limit, long dg, - double rerank_factor = 100, - double batch_factor = 1.0) - : k(k), beamSize(Q), cut(cut), limit(limit), degree_limit(dg), - rerank_factor(rerank_factor), batch_factor(batch_factor) {} - - QueryParams(long k, long Q, double cut, long limit, long dg, - long es, double esr, long esc, - rangeQueryType range_query_type, double radius) - : k(k), beamSize(Q), cut(cut), limit(limit), degree_limit(dg), - is_early_stop(es), early_stopping_radius(esr), early_stopping_count(esc), - range_query_type(range_query_type), radius(radius) {} + QueryParams(long k, long Q, double cut, long limit, long dg, double rerank_factor = 100) : k(k), beamSize(Q), cut(cut), limit(limit), degree_limit(dg), rerank_factor(rerank_factor) {} QueryParams() {} +}; + +struct RangeParams{ + double rad; + long initial_beam; + + RangeParams(double rad, long ib) : rad(rad), initial_beam(ib) {} + + RangeParams() {} + void print(){ - std::cout << "Beam: " << beamSize; + std::cout << "Beam: " << initial_beam; } - }; + template class Desc_HNSW{ public: diff --git a/algorithms/vamana/Makefile b/algorithms/vamana/Makefile index f4cb02bb..9fae95fc 100644 --- a/algorithms/vamana/Makefile +++ b/algorithms/vamana/Makefile @@ -1,6 +1,6 @@ include ../bench/parallelDefsANN -REQUIRE = ../utils/beamSearch.h index.h ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h ../utils/euclidian_point.h ../utils/mips_point.h ../utils/jl_point.h ../utils/filtered_hashset.h ../utils/types.h +REQUIRE = ../utils/beamSearch.h index.h ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h ../utils/euclidian_point.h ../utils/mips_point.h ../utils/jl_point.h BENCH = neighbors include ../bench/MakeBench diff --git a/algorithms/vamana/neighbors.h b/algorithms/vamana/neighbors.h index 06ea0894..9d2d7122 100644 --- a/algorithms/vamana/neighbors.h +++ b/algorithms/vamana/neighbors.h @@ -57,7 +57,7 @@ void ANN_Quantized(Graph &G, long k, BuildParams &BP, start_point = 0; } else{ I.build_index(G, Q_Points, QQ_Points, BuildStats); - start_point = 0; // I.get_start(); + start_point = I.get_start(); idx_time = t.next_time(); } std::cout << "start index = " << start_point << std::endl; @@ -82,7 +82,30 @@ void ANN_Quantized(Graph &G, long k, BuildParams &BP, QQ_Points, QQ_Query_Points, GT, res_file, k, false, start_point, - verbose, BP.Q, BP.rerank_factor, BP.batch_factor); + verbose, BP.Q, BP.rerank_factor); + } else if (BP.self) { + if (BP.range) { + parlay::internal::timer t_range("range search time"); + double radius = BP.radius; + double radius_2 = BP.radius_2; + std::cout << "radius = " << radius << " radius_2 = " << radius_2 << std::endl; + QueryParams QP; + long n = Points.size(); + parlay::sequence counts(n); + parlay::sequence distance_comps(n); + parlay::parallel_for(0, G.size(), [&] (long i) { + parlay::sequence pts; + pts.push_back(Points[i].id()); + auto [r, dc] = range_search(Points[i], G, Points, pts, radius, radius_2, QP, true); + counts[i] = r.size(); + distance_comps[i] = dc;}); + t_range.total(); + long range_num_distances = parlay::reduce(distance_comps); + + std::cout << "edges within range: " << parlay::reduce(counts) << std::endl; + std::cout << "distance comparisons during build = " << build_num_distances << std::endl; + std::cout << "distance comparisons during range = " << range_num_distances << std::endl; + } } } @@ -102,15 +125,14 @@ void ANN(Graph &G, long k, BuildParams &BP, if (BP.quantize == 1) { ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, Q_Query_Points, GT, res_file, graph_built, Points, Q_Points, Q_Points); - } - // } else if (BP.quantize == 2) { - // using QQPoint = Euclidean_Bit_Point; - // using QQPR = PointRange; - // QQPR QQ_Points(Points); - // QQPR QQ_Query_Points(Query_Points, QQ_Points.params); - // ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, QQ_Query_Points, - // GT, res_file, graph_built, Points, Q_Points, QQ_Points); - else if (BP.quantize == 3) { + } else if (BP.quantize == 2) { + using QQPoint = Euclidean_Bit_Point; + using QQPR = PointRange; + QQPR QQ_Points(Points); + QQPR QQ_Query_Points(Query_Points, QQ_Points.params); + ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, QQ_Query_Points, + GT, res_file, graph_built, Points, Q_Points, QQ_Points); + } else if (BP.quantize == 3) { using QQPoint = Euclidean_JL_Sparse_Point<1024>; using QQPR = PointRange; QQPR QQ_Points(Points); diff --git a/algorithms/vamana/scripts/nytimes b/algorithms/vamana/scripts/nytimes index ae23e886..ce8bb5ef 100644 --- a/algorithms/vamana/scripts/nytimes +++ b/algorithms/vamana/scripts/nytimes @@ -1,6 +1,6 @@ # bash BUILD_ARGS="-R 130 -L 260 -alpha .85 -num_passes 2 -quantize_bits 8 -verbose" -QUERY_ARGS="-quantize_bits 16 -quantize_mode 5 -verbose" +QUERY_ARGS="-quantize_bits 16 -quantize_mode 3 -verbose" TYPE_ARGS="-data_type float -dist_func mips -normalize -file_type bin" PATH=data/nytimes-256-angular diff --git a/data_tools/compute_range_groundtruth.cpp b/data_tools/compute_range_groundtruth.cpp index eddcaaa1..331dcde8 100644 --- a/data_tools/compute_range_groundtruth.cpp +++ b/data_tools/compute_range_groundtruth.cpp @@ -7,9 +7,8 @@ #include "utils/euclidian_point.h" #include "utils/mips_point.h" #include "utils/point_range.h" -#include "../algorithms/bench/parse_command_line.h" -using namespace parlayANN; + template parlay::sequence> compute_range_groundtruth(PointRange &B, @@ -120,35 +119,35 @@ int main(int argc, char* argv[]) { if(tp == "float"){ std::cout << "Detected float coordinates" << std::endl; if(df == "Euclidian"){ - PointRange> B = PointRange>(bFile); - PointRange> Q = PointRange>(qFile); - answers = compute_range_groundtruth>>(B, Q, r); + PointRange> B = PointRange>(bFile); + PointRange> Q = PointRange>(qFile); + answers = compute_range_groundtruth>>(B, Q, r); } else if(df == "mips"){ - PointRange> B = PointRange>(bFile); - PointRange> Q = PointRange>(qFile); - answers = compute_range_groundtruth>>(B, Q, r); + PointRange> B = PointRange>(bFile); + PointRange> Q = PointRange>(qFile); + answers = compute_range_groundtruth>>(B, Q, r); } }else if(tp == "uint8"){ std::cout << "Detected uint8 coordinates" << std::endl; if(df == "Euclidian"){ - PointRange> B = PointRange>(bFile); - PointRange> Q = PointRange>(qFile); - answers = compute_range_groundtruth>>(B, Q, r); + PointRange> B = PointRange>(bFile); + PointRange> Q = PointRange>(qFile); + answers = compute_range_groundtruth>>(B, Q, r); } else if(df == "mips"){ - PointRange> B = PointRange>(bFile); - PointRange> Q = PointRange>(qFile); - answers = compute_range_groundtruth>>(B, Q, r); + PointRange> B = PointRange>(bFile); + PointRange> Q = PointRange>(qFile); + answers = compute_range_groundtruth>>(B, Q, r); } }else if(tp == "int8"){ std::cout << "Detected int8 coordinates" << std::endl; if(df == "Euclidian"){ - PointRange> B = PointRange>(bFile); - PointRange> Q = PointRange>(qFile); - answers = compute_range_groundtruth>>(B, Q, r); + PointRange> B = PointRange>(bFile); + PointRange> Q = PointRange>(qFile); + answers = compute_range_groundtruth>>(B, Q, r); } else if(df == "mips"){ - PointRange> B = PointRange>(bFile); - PointRange> Q = PointRange>(qFile); - answers = compute_range_groundtruth>>(B, Q, r); + PointRange> B = PointRange>(bFile); + PointRange> Q = PointRange>(qFile); + answers = compute_range_groundtruth>>(B, Q, r); } } write_rangeres(answers, std::string(gFile)); diff --git a/docs/README.md b/docs/README.md index cff70fd2..df8f3e4e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -34,17 +34,3 @@ This repository was built for our paper [Scaling Graph-Based ANNS Algorithms to series = {PPoPP '24} } ``` - -The range search algorithms are from our paper [Range Retrieval with Graph-Based Indices](https://arxiv.org/abs/2502.13245). If you use this repository for your own work, please cite us: - -```bibtex -@misc{manohar2025range, - title={Range Retrieval with Graph-Based Indices}, - author={Magdalen Dobson Manohar and Taekseung Kim and Guy E. Blelloch}, - year={2025}, - eprint={2502.13245}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - url={https://arxiv.org/abs/2502.13245}, -} -``` \ No newline at end of file diff --git a/docs/rangesearch.md b/docs/rangesearch.md deleted file mode 100644 index d3f36129..00000000 --- a/docs/rangesearch.md +++ /dev/null @@ -1,31 +0,0 @@ -# Range Search - -Range search is defined as finding every point within a specified radius of a query point with respect to some dataset. This repository contains the algorithms introduced in the paper [Range Retrieval with Graph-Based Indices](https://arxiv.org/abs/2502.13245). - -## Sample commandline and parameters - -Range groundtruth file should be computed before running these commands. These tools are provided in data_tools library. For further explanation, see [Data Tools](https://cmuparlay.github.io/ParlayANN/data_tools) - -An example commandline for generating range ground truth is shown below. This example is also explained in the [Quickstart](https://cmuparlay.github.io/ParlayANN/quickstart) guide. In this case, the **SIFT dataset** refers to the BIGANN dataset, as described in the [Quickstart](https://cmuparlay.github.io/ParlayANN/quickstart). - - -``` -cd ../data_tools -make compute_range_groundtruth -./compute_range_groundtruth -base_path ../data/sift/sift_learn.fbin -query_path ../data/sift/sift_query.fbin -data_type float -k 100 -dist_func Euclidian -gt_path ../data/sift/sift-100K -``` - -To run a range search on sift run: -``` -cd rangeSearch/vamanaRange -R=../../data/sift -make -./range -alpha 1.15 -R 64 -L 128 -r 10000 -base_path $R/sift_learn.fbin -data_type uint8 -dist_func Euclidian -query_path $R/sift_query.fbin -gt_path $R/range_gt_1M_10000 -search_mode beamSearch -early_stop -graph_path $R/graph1M -early_stopping_radius 30000 -``` - -All other parameters are same as in [Algorithms](https://cmuparlay.github.io/ParlayANN/algorithms). Here we add descriptions for parameters that are new. - -1. **-r**(`double`): Range search radius -2. **-search_mode**(`string`): The search mode to use can be specified. Possible options are ['doubling', 'greedy', 'beam'], corresponding to the three algorithms introduced in our paper. The default option is beam search. -3. **-early_stop**(optional): Flag for early stopping. With this flag on, range search would stop early based on early stopping radius. -4. **-early_stopping_radius**(`double`): Radius for early stopping. Typically larger than the range search radius. diff --git a/parlaylib b/parlaylib index c655b8f3..7cdb4cae 160000 --- a/parlaylib +++ b/parlaylib @@ -1 +1 @@ -Subproject commit c655b8f3c127bd3c93918d1548db0ca5989cffa0 +Subproject commit 7cdb4cae8f020525f5eb4ad82e2565d1e38cfbc3 diff --git a/rangeSearch/bench/parallelDefsANN b/rangeSearch/bench/parallelDefsANN index c58f719d..17e8eed7 100644 --- a/rangeSearch/bench/parallelDefsANN +++ b/rangeSearch/bench/parallelDefsANN @@ -6,7 +6,7 @@ JEMALLOC = -L$(JEMALLOCLD) -ljemalloc endif CCFLAGS = -mcx16 -O3 -std=c++17 -march=native -DNDEBUG -I . -CLFLAGS = -ldl $(JEMALLOC) +CLFLAGS = -ldl $(JEMALLOC) OMPFLAGS = -DPARLAY_OPENMP -fopenmp CILKFLAGS = -DPARLAY_CILK -fcilkplus diff --git a/rangeSearch/bench/rangeTime.C b/rangeSearch/bench/rangeTime.C index 5ca56f25..709c1f3c 100644 --- a/rangeSearch/bench/rangeTime.C +++ b/rangeSearch/bench/rangeTime.C @@ -32,13 +32,16 @@ #include "../utils/mips_point.h" #include "../utils/graph.h" + + #include #include #include #include #include -using namespace parlayANN; + + // ************************************************************* // TIMING @@ -46,25 +49,36 @@ using namespace parlayANN; using uint = unsigned int; + template void timeRange(Graph &G, - PointRange &Query_Points, long k, - BuildParams &BP, char* outFile, - RangeGroundTruth GT, char* res_file, bool graph_built, - PointRange &Points) + PointRange &Query_Points, double rad, + BuildParams &BP, char* outFile, + RangeGroundTruth GT, char* res_file, bool graph_built, PointRange &Points) { - RNG(G, BP, Query_Points, GT, res_file, - graph_built, Points); - if(outFile != NULL) G.save(outFile); + + + time_loop(1, 0, + [&] () {}, + [&] () { + RNG(G, rad, BP, Query_Points, GT, res_file, graph_built, Points); + }, + [&] () {}); + + if(outFile != NULL) { + G.save(outFile); + } + + } int main(int argc, char* argv[]) { commandLine P(argc,argv, "[-a ] [-d ] [-R ]" - "[-L ] [-k ] [-gt_path ] [-query_path ]" - "[-graph_path ] [-graph_outfile ] [-res_path ]" "[-num_passes ]" - "[-memory_flag ] [-mst_deg ] [-num_clusters ] [-cluster_size ]" - "[-data_type ] [-dist_func ] [-base_path ] "); + "[-L ] [-r ] [-gt_path ] [-query_path ]" + "[-graph_path ] [-graph_outfile ] [-res_path ]" + "[-memory_flag ] [-mst_deg ] [num_clusters ] [cluster_size ]" + "[-data_type ] [-dist_func ][-base_path ] "); char* iFile = P.getOptionValue("-base_path"); char* oFile = P.getOptionValue("-graph_outfile"); @@ -73,7 +87,6 @@ int main(int argc, char* argv[]) { char* cFile = P.getOptionValue("-gt_path"); char* rFile = P.getOptionValue("-res_path"); char* vectype = P.getOptionValue("-data_type"); - long Q = P.getOptionIntValue("-Q", 0); long R = P.getOptionIntValue("-R", 0); if(R<0) P.badArgument(); long L = P.getOptionIntValue("-L", 0); @@ -84,61 +97,19 @@ int main(int argc, char* argv[]) { if(num_clusters<0) P.badArgument(); long cluster_size = P.getOptionIntValue("-cluster_size", 0); if(cluster_size<0) P.badArgument(); - double radius = P.getOptionDoubleValue("-radius", 0.0); - long k = P.getOptionIntValue("-k", 0); - if (k > 1000 || k < 0) P.badArgument(); - double alpha = P.getOptionDoubleValue("-alpha", 1.0); - int num_passes = P.getOptionIntValue("-num_passes", 1); + double r = P.getOptionDoubleValue("-r", 0); + double alpha = P.getOptionDoubleValue("-alpha", 0); int two_pass = P.getOptionIntValue("-two_pass", 0); if(two_pass > 1 | two_pass < 0) P.badArgument(); - if (two_pass == 1) num_passes = 2; + bool pass = (two_pass == 1); double delta = P.getOptionDoubleValue("-delta", 0); if(delta<0) P.badArgument(); char* dfc = P.getOptionValue("-dist_func"); - int quantize = P.getOptionIntValue("-quantize_bits", 0); - int quantize_build = P.getOptionIntValue("-quantize_mode", 0); - bool verbose = P.getOption("-verbose"); - bool normalize = P.getOption("-normalize"); - double trim = P.getOptionDoubleValue("-trim", 0.0); // not used - bool self = P.getOption("-self"); - int rerank_factor = P.getOptionIntValue("-rerank_factor", 100); - bool range = P.getOption("-range"); - bool is_early_stop = P.getOption("-early_stop"); - char* sm = P.getOptionValue("-search_mode"); - double esr = P.getOptionDoubleValue("-early_stopping_radius", 0); - double rad = P.getOptionDoubleValue("-r", 0.0); - double batch_factor = P.getOptionDoubleValue("-batch_factor", .125); - - // this integer represents the number of random edges to start with for - // inserting in a single batch per round - int single_batch = P.getOptionIntValue("-single_batch", 0); - + std::string df = std::string(dfc); std::string tp = std::string(vectype); - std::string searchType = std::string(sm); - rangeQueryType rtype = Beam; - - if (searchType == "doubling") { - rtype = Doubling; - std::cout << "Using doubling range search" << std::endl; - } else if (searchType == "greedy") { - rtype = Greedy; - std::cout << "Using greedy range search" << std::endl; - } - else if (searchType == "beam") { - rtype = Beam; - std::cout << "Using beam range search" << std::endl; - } - else rtype = None; - - BuildParams BP = BuildParams(R, L, alpha, num_passes, num_clusters, cluster_size, MST_deg, delta, - verbose, quantize_build, - self, single_batch, - Q, trim, - rerank_factor, batch_factor, - is_early_stop, esr, - rtype, rad); + BuildParams BP = BuildParams(R, L, alpha, pass, num_clusters, cluster_size, MST_deg, delta); long maxDeg = BP.max_degree(); if((tp != "uint8") && (tp != "int8") && (tp != "float")){ @@ -157,112 +128,59 @@ int main(int argc, char* argv[]) { if(tp == "float"){ if(df == "Euclidian"){ - PointRange> Points(iFile); - PointRange> Query_Points(qFile); - if (normalize) { - std::cout << "normalizing data" << std::endl; - for (int i=0; i < Points.size(); i++) - Points[i].normalize(); - for (int i=0; i < Query_Points.size(); i++) - Query_Points[i].normalize(); - } + PointRange> Points = PointRange>(iFile); + PointRange> Query_Points = PointRange>(qFile); Graph G; if(gFile == NULL) G = Graph(maxDeg, Points.size()); else G = Graph(gFile); - if (quantize == 8) { - std::cout << "quantizing data to 1 byte" << std::endl; - using QT = uint8_t; - using QPoint = Euclidian_Point; - using PR = PointRange; - PR Points_(Points); - PR Query_Points_(Query_Points, Points_.params); - timeRange(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_); - } else if (quantize == 16) { - std::cout << "quantizing data to 2 bytes" << std::endl; - using Point = Euclidian_Point; - using PR = PointRange; - PR Points_(Points); - PR Query_Points_(Query_Points, Points_.params); - timeRange(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_); - } else { - using Point = Euclidian_Point; - using PR = PointRange; - timeRange(G, Query_Points, k, BP, oFile, GT, rFile, graph_built, Points); - } + timeRange, PointRange>, uint>(G, Query_Points, r, BP, + oFile, GT, rFile, graph_built, Points); } else if(df == "mips"){ - PointRange> Points(iFile); - PointRange> Query_Points(qFile); - if (normalize) { - std::cout << "normalizing data" << std::endl; - for (int i=0; i < Points.size(); i++) - Points[i].normalize(); - for (int i=0; i < Query_Points.size(); i++) - Query_Points[i].normalize(); - } + PointRange> Points = PointRange>(iFile); + PointRange> Query_Points = PointRange>(qFile); Graph G; if(gFile == NULL) G = Graph(maxDeg, Points.size()); else G = Graph(gFile); - if (quantize == 8) { - std::cout << "quantizing data to 1 byte" << std::endl; - using QT = int8_t; - using Point = Quantized_Mips_Point<8>; - using PR = PointRange; - PR Points_(Points); - PR Query_Points_(Query_Points, Points_.params); - timeRange(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_); - } else if (quantize == 16) { - std::cout << "quantizing data to 2 bytes" << std::endl; - using QT = int16_t; - using Point = Quantized_Mips_Point<16>; - using PR = PointRange; - PR Points_(Points); - PR Query_Points_(Query_Points, Points_.params); - timeRange(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_); - } else { - using Point = Mips_Point; - using PR = PointRange; - timeRange(G, Query_Points, k, BP, oFile, GT, rFile, graph_built, Points); - } + timeRange, PointRange>, uint>(G, Query_Points, r, BP, + oFile, GT, rFile, graph_built, Points); } + } else if(tp == "uint8"){ if(df == "Euclidian"){ - PointRange> Points(iFile); - PointRange> Query_Points(qFile); + PointRange> Points = PointRange>(iFile); + PointRange> Query_Points = PointRange>(qFile); Graph G; if(gFile == NULL) G = Graph(maxDeg, Points.size()); else G = Graph(gFile); - timeRange, PointRange>, uint>(G, Query_Points, k, BP, + timeRange, PointRange>, uint>(G, Query_Points, r, BP, oFile, GT, rFile, graph_built, Points); } else if(df == "mips"){ - PointRange> Points(iFile); - PointRange> Query_Points(qFile); + PointRange> Points = PointRange>(iFile); + PointRange> Query_Points = PointRange>(qFile); Graph G; if(gFile == NULL) G = Graph(maxDeg, Points.size()); else G = Graph(gFile); - timeRange, PointRange>, uint>(G, Query_Points, k, BP, + timeRange, PointRange>, uint>(G, Query_Points, r, BP, oFile, GT, rFile, graph_built, Points); } } else if(tp == "int8"){ if(df == "Euclidian"){ - PointRange> Points(iFile); - PointRange> Query_Points(qFile); + PointRange> Points = PointRange>(iFile); + PointRange> Query_Points = PointRange>(qFile); Graph G; if(gFile == NULL) G = Graph(maxDeg, Points.size()); else G = Graph(gFile); - timeRange, PointRange>, uint>(G, Query_Points, k, BP, + timeRange, PointRange>, uint>(G, Query_Points, r, BP, oFile, GT, rFile, graph_built, Points); } else if(df == "mips"){ - PointRange> Points(iFile); - PointRange> Query_Points(qFile); + PointRange> Points = PointRange>(iFile); + PointRange> Query_Points = PointRange>(qFile); Graph G; if(gFile == NULL) G = Graph(maxDeg, Points.size()); else G = Graph(gFile); - timeRange, PointRange>, uint>(G, Query_Points, k, BP, + timeRange, PointRange>, uint>(G, Query_Points, r, BP, oFile, GT, rFile, graph_built, Points); } } - return 0; -} - - +} \ No newline at end of file diff --git a/rangeSearch/vamanaRange/Makefile b/rangeSearch/vamanaRange/Makefile index dce33494..36c465f1 100644 --- a/rangeSearch/vamanaRange/Makefile +++ b/rangeSearch/vamanaRange/Makefile @@ -1,6 +1,6 @@ include ../bench/parallelDefsANN -REQUIRE = ../utils/beamSearch.h ../utils/doublingSearch.h ../../algorithms/vamana/index.h ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h ../utils/check_range_recall.h ../utils/earlyStopping.h ../utils/rangeSearch.h ../utils/types.h ../utils/stats.h +REQUIRE = ../utils/beamSearch.h ../../algorithms/vamana/index.h ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h BENCH = range include ../bench/MakeBench diff --git a/rangeSearch/vamanaRange/range.h b/rangeSearch/vamanaRange/range.h index 667607b6..643b01c9 100644 --- a/rangeSearch/vamanaRange/range.h +++ b/rangeSearch/vamanaRange/range.h @@ -30,37 +30,30 @@ #include "../utils/stats.h" #include "../utils/types.h" #include "../utils/graph.h" -#include "../utils/mips_point.h" -#include "../utils/euclidian_point.h" #include "../../algorithms/vamana/index.h" #include "parlay/parallel.h" #include "parlay/primitives.h" #include "parlay/random.h" -namespace parlayANN{ -template -void RNG(Graph &G, BuildParams &BP, - PointRange_ &Query_Points, +template +void RNG(Graph &G, double rad, BuildParams &BP, + PointRange &Query_Points, RangeGroundTruth GT, - char* res_file, bool graph_built, PointRange_ &Points) { + char* res_file, bool graph_built, PointRange &Points) { parlay::internal::timer t("ANN"); - using findex = knn_index; + using findex = knn_index; findex I(BP); double idx_time; - indexType start_point; - stats BuildStats(G.size()); if(graph_built){ idx_time = 0; - start_point = 1; } else{ - I.build_index(G, Points, Points, BuildStats); - start_point = 1; // I.get_start(); + I.build_index(G, Points, BuildStats); idx_time = t.next_time(); } - + indexType start_point = I.get_start(); std::string name = "Vamana"; std::string params = "R = " + std::to_string(BP.R) + ", L = " + std::to_string(BP.L); @@ -70,40 +63,8 @@ void RNG(Graph &G, BuildParams &BP, << std::endl; Graph_ G_(name, params, G.size(), avg_deg, max_deg, idx_time); G_.print(); - double esr = BP.early_stopping_radius; - double rad = BP.radius; - if(Query_Points.size() != 0) { - if (BP.quantize != 0) { - std::cout << "quantizing build and first pass of search to 1 byte" << std::endl; - if (Point::is_metric()) { - using QT = uint8_t; - using QPoint = Euclidian_Point; - using QPR = PointRange; - QPR Q_Points(Points); // quantized to one byte - QPR Q_Query_Points(Query_Points, Q_Points.params); - range_search_wrapper(G, - Points, Query_Points, - Q_Points, Q_Query_Points, - GT, start_point, - BP.is_early_stop, esr, BP.range_query_type, rad); - } else { - using QPoint = Quantized_Mips_Point<8,true,255>; - using QPR = PointRange; - QPR Q_Points(Points); - QPR Q_Query_Points(Query_Points, Q_Points.params); - range_search_wrapper(G, - Points, Query_Points, - Q_Points, Q_Query_Points, - GT, start_point, - BP.is_early_stop, esr, BP.range_query_type, rad); - } - } else { - range_search_wrapper(G, - Points, Query_Points, - Points, Query_Points, - GT, start_point, - BP.is_early_stop, esr, BP.range_query_type, rad); - } - } -} + if(Query_Points.size() != 0) range_search_wrapper(G, Points, Query_Points, GT, rad, start_point); } + + +