diff --git a/README.md b/README.md index e75b8a2b2..69d1427a4 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith ### Major Updates +__02/06/2026__ Removed legacy `static_map` implementation from `cuco::legacy` namespace + __02/03/2026__ Modernized `dynamic_map`: promoted `cuco::experimental::dynamic_map` to `cuco::dynamic_map` and removed the legacy implementation __01/30/2026__ Removed legacy `static_multimap` implementation and promoted `cuco::experimental::static_multimap` to `cuco::static_multimap` @@ -25,8 +27,6 @@ __06/04/2025__ Removed CUDA 11 support __11/01/2024__ Refined the term `window` as `bucket` -__01/02/2024__ Moved the legacy `static_map` to `cuco::legacy` namespace - ## Getting cuCollections diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl deleted file mode 100644 index 5dfb5fca6..000000000 --- a/include/cuco/detail/static_map.inl +++ /dev/null @@ -1,889 +0,0 @@ -/* - * Copyright (c) 2020-2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace cuco::legacy { - -template -static_map::static_map(std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - Allocator const& alloc, - cudaStream_t stream) - : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) - empty_key_sentinel_{empty_key_sentinel.value}, - empty_value_sentinel_{empty_value_sentinel.value}, - erased_key_sentinel_{empty_key_sentinel.value}, - slot_allocator_{alloc}, - counter_allocator_{alloc} -{ - slots_ = slot_allocator_.allocate(capacity_, cuda::stream_ref{stream}); - num_successes_ = counter_allocator_.allocate(1, cuda::stream_ref{stream}); - - auto constexpr block_size = 256; - auto constexpr stride = 4; - auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); - detail::initialize - <<>>( - slots_, empty_key_sentinel_, empty_value_sentinel_, capacity_); -} - -template -static_map::static_map(std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel, - Allocator const& alloc, - cudaStream_t stream) - : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) - empty_key_sentinel_{empty_key_sentinel.value}, - empty_value_sentinel_{empty_value_sentinel.value}, - erased_key_sentinel_{erased_key_sentinel.value}, - slot_allocator_{alloc}, - counter_allocator_{alloc} -{ - CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, - "The empty key sentinel and erased key sentinel cannot be the same value.", - std::runtime_error); - - slots_ = slot_allocator_.allocate(capacity_, cuda::stream_ref{stream}); - num_successes_ = counter_allocator_.allocate(1, cuda::stream_ref{stream}); - - auto constexpr block_size = 256; - auto constexpr stride = 4; - auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); - detail::initialize - <<>>( - slots_, empty_key_sentinel_, empty_value_sentinel_, capacity_); -} - -template -static_map::~static_map() -{ - slot_allocator_.deallocate(slots_, capacity_, cuda::stream_ref{cudaStream_t{nullptr}}); - counter_allocator_.deallocate(num_successes_, 1, cuda::stream_ref{cudaStream_t{nullptr}}); -} - -template -template -void static_map::insert( - InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); - - // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); - std::size_t h_num_successes; - - detail::insert - <<>>(first, num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, - num_successes_, - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - cuda::stream_ref{stream})); - - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated - - size_ += h_num_successes; -} - -template -template -void static_map::insert_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto constexpr tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); - - // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); - std::size_t h_num_successes; - - detail::insert_if_n<<>>( - first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); - CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, - num_successes_, - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - cuda::stream_ref{stream})); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - - size_ += h_num_successes; -} - -template -template -void static_map::erase( - InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) -{ - CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), - "You must provide a unique erased key sentinel value at map construction.", - std::runtime_error); - - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto constexpr tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); - - // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); - std::size_t h_num_successes; - - detail::erase - <<>>(first, num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, - num_successes_, - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - cuda::stream_ref{stream})); - - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated - - size_ -= h_num_successes; -} - -template -template -void static_map::find(InputIt first, - InputIt last, - OutputIt output_begin, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_view(); - - detail::find - <<>>(first, num_keys, output_begin, view, hash, key_equal); -} - -template -template -std::pair static_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const -{ - static_assert(sizeof(pair_atomic_type) == sizeof(value_type)); - auto slots_begin = reinterpret_cast(slots_); - - auto begin = - thrust::make_transform_iterator(slots_begin, cuco::detail::slot_to_tuple{}); - auto filled = cuco::detail::slot_is_filled{get_empty_key_sentinel()}; - auto zipped_out_begin = thrust::make_zip_iterator(cuda::std::tuple{keys_out, values_out}); - - std::size_t temp_storage_bytes = 0; - using temp_allocator_type = - typename std::allocator_traits::template rebind_alloc; - auto temp_allocator = temp_allocator_type{slot_allocator_}; - auto d_num_out = reinterpret_cast( - temp_allocator.allocate(sizeof(std::size_t), cuda::stream_ref{stream})); - cub::DeviceSelect::If(nullptr, - temp_storage_bytes, - begin, - zipped_out_begin, - d_num_out, - get_capacity(), - filled, - stream); - - // Allocate temporary storage - auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, cuda::stream_ref{stream}); - - cub::DeviceSelect::If(d_temp_storage, - temp_storage_bytes, - begin, - zipped_out_begin, - d_num_out, - get_capacity(), - filled, - stream); - - std::size_t h_num_out; - CUCO_CUDA_TRY(cuco::detail::memcpy_async( - &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream})); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - temp_allocator.deallocate( - reinterpret_cast(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream}); - temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, cuda::stream_ref{stream}); - - return std::make_pair(keys_out + h_num_out, values_out + h_num_out); -} - -template -template -void static_map::contains(InputIt first, - InputIt last, - OutputIt output_begin, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) const -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_view(); - - detail::contains - <<>>(first, num_keys, output_begin, view, hash, key_equal); -} - -template -template -__device__ static_map::device_mutable_view::insert_result -static_map::device_mutable_view::packed_cas( - iterator current_slot, - value_type const& insert_pair, - KeyEqual key_equal, - Key expected_key) noexcept -{ - auto expected_value = this->get_empty_value_sentinel(); - - cuco::detail::pair_converter expected_pair{ - cuco::make_pair(expected_key, expected_value)}; - cuco::detail::pair_converter new_pair{insert_pair}; - - auto slot = - reinterpret_cast::packed_type>*>( - current_slot); - - bool success = slot->compare_exchange_strong( - expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed); - if (success) { - return insert_result::SUCCESS; - } - // duplicate present during insert - else if (key_equal(insert_pair.first, expected_pair.pair.first)) { - return insert_result::DUPLICATE; - } - - return insert_result::CONTINUE; -} - -template -template -__device__ static_map::device_mutable_view::insert_result -static_map::device_mutable_view::back_to_back_cas( - iterator current_slot, - value_type const& insert_pair, - KeyEqual key_equal, - Key expected_key) noexcept -{ - using cuda::std::memory_order_relaxed; - - auto expected_value = this->get_empty_value_sentinel(); - - // Back-to-back CAS for 8B/8B key/value pairs - auto& slot_key = current_slot->first; - auto& slot_value = current_slot->second; - - bool key_success = - slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); - bool value_success = - slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed); - - if (key_success) { - while (not value_success) { - value_success = - slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(), - insert_pair.second, - memory_order_relaxed); - } - return insert_result::SUCCESS; - } else if (value_success) { - slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed); - } - - // our key was already present in the slot, so our key is a duplicate - if (key_equal(insert_pair.first, expected_key)) { return insert_result::DUPLICATE; } - - return insert_result::CONTINUE; -} - -template -template -__device__ static_map::device_mutable_view::insert_result -static_map::device_mutable_view::cas_dependent_write( - iterator current_slot, - value_type const& insert_pair, - KeyEqual key_equal, - Key expected_key) noexcept -{ - using cuda::std::memory_order_relaxed; - - auto& slot_key = current_slot->first; - - auto const key_success = - slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); - - if (key_success) { - auto& slot_value = current_slot->second; - slot_value.store(insert_pair.second, memory_order_relaxed); - return insert_result::SUCCESS; - } - - // our key was already present in the slot, so our key is a duplicate - if (key_equal(insert_pair.first, expected_key)) { return insert_result::DUPLICATE; } - - return insert_result::CONTINUE; -} - -template -template -__device__ bool static_map::device_mutable_view::insert( - value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept -{ - auto current_slot{this->initial_slot(insert_pair.first, hash)}; - - while (true) { - key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_available = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or - cuco::detail::bitwise_compare(existing_key, this->get_erased_key_sentinel()); - - // the key we are trying to insert is already in the map, so we return with failure to insert - if (not slot_is_available and key_equal(existing_key, insert_pair.first)) { return false; } - - if (slot_is_available) { - auto const status = [&]() { - // One single CAS operation if `value_type` is packable - if constexpr (cuco::detail::is_packable()) { - return packed_cas(current_slot, insert_pair, key_equal, existing_key); - } - - if constexpr (not cuco::detail::is_packable()) { -#if (__CUDA_ARCH__ < 700) - return cas_dependent_write(current_slot, insert_pair, key_equal, existing_key); -#else - return back_to_back_cas(current_slot, insert_pair, key_equal, existing_key); -#endif - } - }(); - - // successful insert - if (status == insert_result::SUCCESS) { return true; } - // duplicate present during insert - if (status == insert_result::DUPLICATE) { return false; } - } - - // if we couldn't insert the key, but it wasn't a duplicate, then there must - // have been some other key there, so we keep looking for a slot - current_slot = this->next_slot(current_slot); - } -} - -template -template -__device__ - cuda::std::pair::device_mutable_view::iterator, - bool> - static_map::device_mutable_view::insert_and_find( - value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept -{ -#if __CUDA_ARCH__ < 700 - // Spinning to ensure that the write to the value part took place requires - // independent thread scheduling introduced with the Volta architecture. - static_assert(cuco::detail::is_packable(), - "insert_and_find is not supported for unpackable data on pre-Volta GPUs."); -#endif - - auto current_slot{this->initial_slot(insert_pair.first, hash)}; - - while (true) { - key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_available = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or - cuco::detail::bitwise_compare(existing_key, this->get_erased_key_sentinel()); - - // the key we are trying to insert is already in the map, so we return with failure to insert - if (not slot_is_available and key_equal(existing_key, insert_pair.first)) { - // If we cannot use a single CAS operation, ensure that the write to - // the value part also took place. - if constexpr (not cuco::detail::is_packable()) { - auto& slot_value = current_slot->second; - auto const empty_value = this->get_empty_value_sentinel(); - while (cuco::detail::bitwise_compare(slot_value.load(cuda::std::memory_order_relaxed), - empty_value)) { - // spin - } - } - - return cuda::std::pair{current_slot, false}; - } - - if (slot_is_available) { - auto const status = [&]() { - // One single CAS operation if `value_type` is packable - if constexpr (cuco::detail::is_packable()) { - return packed_cas(current_slot, insert_pair, key_equal, existing_key); - } - - if constexpr (not cuco::detail::is_packable()) { - // Only use cas_dependent_write; for back_to_back_cas we cannot - // guarantee that we get a valid iterator: Consider the case of two - // threads inserting the same key, and one gets the key while the - // other gets the value. For a third thread, the entry looks valid, - // but the second thread will first reset the value to the empty - // sentinel to signal that the first thread can write its value. - // This ambiguity cannot be solved for the third thread, so we have - // to avoid it. - return cas_dependent_write(current_slot, insert_pair, key_equal, existing_key); - } - }(); - - // successful insert - if (status == insert_result::SUCCESS) { - // This thread did the insertion, so the iterator is guaranteed to be - // valid without any special care. - return cuda::std::pair{current_slot, true}; - } - // duplicate present during insert - if (status == insert_result::DUPLICATE) { - // If we cannot use a single CAS operation, ensure that the write to - // the value part also took place. - if constexpr (not cuco::detail::is_packable()) { - auto& slot_value = current_slot->second; - auto const empty_value = this->get_empty_value_sentinel(); - while (cuco::detail::bitwise_compare(slot_value.load(cuda::std::memory_order_relaxed), - empty_value)) { - // spin - } - } - - return cuda::std::pair{current_slot, false}; - } - } - - // if we couldn't insert the key, but it wasn't a duplicate, then there must - // have been some other key there, so we keep looking for a slot - current_slot = this->next_slot(current_slot); - } -} - -template -template -__device__ bool static_map::device_mutable_view::insert( - CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept -{ - auto current_slot = this->initial_slot(g, insert_pair.first, hash); - - while (true) { - key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_available = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or - cuco::detail::bitwise_compare(existing_key, this->get_erased_key_sentinel()); - - // the key we are trying to insert is already in the map, so we return with failure to insert - if (g.any(not slot_is_available and key_equal(existing_key, insert_pair.first))) { - return false; - } - - auto const bucket_contains_available = g.ballot(slot_is_available); - - // we found an empty slot, but not the key we are inserting, so this must - // be an empty slot into which we can insert the key - if (bucket_contains_available) { - // the first lane in the group with an empty slot will attempt the insert - insert_result status{insert_result::CONTINUE}; - uint32_t src_lane = __ffs(bucket_contains_available) - 1; - - if (g.thread_rank() == src_lane) { - // One single CAS operation if `value_type` is packable - if constexpr (cuco::detail::is_packable()) { - status = packed_cas(current_slot, insert_pair, key_equal, existing_key); - } - // Otherwise, two back-to-back CAS operations - else { -#if (__CUDA_ARCH__ < 700) - status = cas_dependent_write(current_slot, insert_pair, key_equal, existing_key); -#else - status = back_to_back_cas(current_slot, insert_pair, key_equal, existing_key); -#endif - } - } - - uint32_t res_status = g.shfl(static_cast(status), src_lane); - status = static_cast(res_status); - - // successful insert - if (status == insert_result::SUCCESS) { return true; } - // duplicate present during insert - if (status == insert_result::DUPLICATE) { return false; } - // if we've gotten this far, a different key took our spot - // before we could insert. We need to retry the insert on the - // same bucket - } - // if there are no empty slots in the current bucket, - // we move onto the next bucket - else { - current_slot = this->next_slot(g, current_slot); - } - } -} - -template -template -__device__ bool static_map::device_mutable_view::erase( - key_type const& k, Hash hash, KeyEqual key_equal) noexcept -{ - auto current_slot{this->initial_slot(k, hash)}; - auto const init_slot = current_slot; - - value_type const insert_pair = - make_pair(this->get_erased_key_sentinel(), this->get_empty_value_sentinel()); - - while (true) { - static_assert(sizeof(Key) == sizeof(atomic_key_type)); - static_assert(sizeof(Value) == sizeof(atomic_mapped_type)); - // TODO: Replace reinterpret_cast with atomic ref when available. - value_type slot_contents = *reinterpret_cast(current_slot); - auto existing_key = slot_contents.first; - auto existing_value = slot_contents.second; - - // Key doesn't exist, return false - if (cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) { - return false; - } - - // Key exists, return true if successfully deleted - if (key_equal(existing_key, k)) { - if constexpr (cuco::detail::is_packable()) { - auto slot = reinterpret_cast< - cuda::atomic::packed_type>*>( - current_slot); - cuco::detail::pair_converter expected_pair{ - cuco::make_pair(existing_key, existing_value)}; - cuco::detail::pair_converter new_pair{insert_pair}; - - return slot->compare_exchange_strong( - expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed); - } - if constexpr (not cuco::detail::is_packable()) { - current_slot->second.compare_exchange_strong( - existing_value, insert_pair.second, cuda::std::memory_order_relaxed); - return current_slot->first.compare_exchange_strong( - existing_key, insert_pair.first, cuda::std::memory_order_relaxed); - } - } - - current_slot = this->next_slot(current_slot); - // if all keys in this map has been erased, return false - if (current_slot == init_slot) { return false; } - } -} - -template -template -__device__ bool static_map::device_mutable_view::erase( - CG g, key_type const& k, Hash hash, KeyEqual key_equal) noexcept -{ - auto current_slot = this->initial_slot(g, k, hash); - auto const init_slot = current_slot; - value_type const insert_pair = - make_pair(this->get_erased_key_sentinel(), this->get_empty_value_sentinel()); - - while (true) { - static_assert(sizeof(Key) == sizeof(atomic_key_type)); - static_assert(sizeof(Value) == sizeof(atomic_mapped_type)); - // TODO: Replace reinterpret_cast with atomic ref when available. - value_type slot_contents = *reinterpret_cast(current_slot); - auto existing_key = slot_contents.first; - auto existing_value = slot_contents.second; - - auto const slot_is_empty = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); - - auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); - - // Key exists, return true if successfully deleted - if (exists) { - uint32_t src_lane = __ffs(exists) - 1; - - bool status; - if (g.thread_rank() == src_lane) { - if constexpr (cuco::detail::is_packable()) { - auto slot = reinterpret_cast< - cuda::atomic::packed_type>*>( - current_slot); - cuco::detail::pair_converter expected_pair{ - cuco::make_pair(existing_key, existing_value)}; - cuco::detail::pair_converter new_pair{insert_pair}; - - status = slot->compare_exchange_strong( - expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed); - } - if constexpr (not cuco::detail::is_packable()) { - current_slot->second.compare_exchange_strong( - existing_value, insert_pair.second, cuda::std::memory_order_relaxed); - status = current_slot->first.compare_exchange_strong( - existing_key, insert_pair.first, cuda::std::memory_order_relaxed); - } - } - - uint32_t res_status = g.shfl(static_cast(status), src_lane); - return static_cast(res_status); - } - - // empty slot found, but key not found, must not be in the map - if (g.ballot(slot_is_empty)) { return false; } - - current_slot = this->next_slot(g, current_slot); - if (current_slot == init_slot) { return false; } - } -} - -template -template -__device__ typename static_map::device_view::iterator -static_map::device_view::find(Key const& k, - Hash hash, - KeyEqual key_equal) noexcept -{ - auto current_slot = this->initial_slot(k, hash); - auto const init_slot = current_slot; - - while (true) { - auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - // Key doesn't exist, return end() - if (cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) { - return this->end(); - } - - // Key exists, return iterator to location - if (key_equal(existing_key, k)) { return current_slot; } - - current_slot = this->next_slot(current_slot); - if (current_slot == init_slot) { return this->end(); } - } -} - -template -template -__device__ typename static_map::device_view::const_iterator -static_map::device_view::find(Key const& k, - Hash hash, - KeyEqual key_equal) const noexcept -{ - auto current_slot = this->initial_slot(k, hash); - auto const init_slot = current_slot; - - while (true) { - auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - // Key doesn't exist, return end() - if (cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) { - return this->end(); - } - - // Key exists, return iterator to location - if (key_equal(existing_key, k)) { return current_slot; } - - current_slot = this->next_slot(current_slot); - if (current_slot == init_slot) { return this->end(); } - } -} - -template -template -__device__ typename static_map::device_view::iterator -static_map::device_view::find(CG g, - Key const& k, - Hash hash, - KeyEqual key_equal) noexcept -{ - auto current_slot = this->initial_slot(g, k, hash); - auto const init_slot = current_slot; - - while (true) { - auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as - // the sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_empty = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); - - // the key we were searching for was found by one of the threads, - // so we return an iterator to the entry - auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); - if (exists) { - uint32_t src_lane = __ffs(exists) - 1; - // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the - // current_slot and shuffle that instead. - intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); - return reinterpret_cast(res_slot); - } - - // we found an empty slot, meaning that the key we're searching for isn't present - if (g.ballot(slot_is_empty)) { return this->end(); } - - // otherwise, all slots in the current bucket are full with other keys, so we move onto the - // next bucket - current_slot = this->next_slot(g, current_slot); - if (current_slot == init_slot) { return this->end(); } - } -} - -template -template -__device__ typename static_map::device_view::const_iterator -static_map::device_view::find(CG g, - Key const& k, - Hash hash, - KeyEqual key_equal) const noexcept -{ - auto current_slot = this->initial_slot(g, k, hash); - auto const init_slot = current_slot; - - while (true) { - auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as - // the sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_empty = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); - - // the key we were searching for was found by one of the threads, so we return an iterator to - // the entry - auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); - if (exists) { - uint32_t src_lane = __ffs(exists) - 1; - // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the - // current_slot and shuffle that instead. - intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); - return reinterpret_cast(res_slot); - } - - // we found an empty slot, meaning that the key we're searching - // for isn't in this submap, so we should move onto the next one - if (g.ballot(slot_is_empty)) { return this->end(); } - - // otherwise, all slots in the current bucket are full with other keys, - // so we move onto the next bucket in the current submap - - current_slot = this->next_slot(g, current_slot); - if (current_slot == init_slot) { return this->end(); } - } -} - -template -template -__device__ bool static_map::device_view::contains( - ProbeKey const& k, Hash hash, KeyEqual key_equal) const noexcept -{ - auto current_slot = this->initial_slot(k, hash); - auto const init_slot = current_slot; - - while (true) { - auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - - if (cuco::detail::bitwise_compare(existing_key, this->empty_key_sentinel_)) { return false; } - - if (key_equal(existing_key, k)) { return true; } - - current_slot = this->next_slot(current_slot); - if (current_slot == init_slot) { return false; } - } -} - -template -template -__device__ cuda::std::enable_if_t, bool> -static_map::device_view::contains(CG g, - ProbeKey const& k, - Hash hash, - KeyEqual key_equal) const noexcept -{ - auto current_slot = this->initial_slot(g, k, hash); - auto const init_slot = current_slot; - - while (true) { - key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as - // the sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_empty = - cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); - - // the key we were searching for was found by one of the threads, so we return an iterator to - // the entry - if (g.ballot(not slot_is_empty and key_equal(existing_key, k))) { return true; } - - // we found an empty slot, meaning that the key we're searching for isn't present - if (g.ballot(slot_is_empty)) { return false; } - - // otherwise, all slots in the current bucket are full with other keys, so we move onto the - // next bucket - current_slot = this->next_slot(g, current_slot); - if (current_slot == init_slot) { return false; } - } -} -} // namespace cuco::legacy diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh deleted file mode 100644 index fd50f73df..000000000 --- a/include/cuco/detail/static_map_kernels.cuh +++ /dev/null @@ -1,586 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include - -#include - -namespace cuco::legacy::detail { -namespace cg = cooperative_groups; - -CUCO_SUPPRESS_KERNEL_WARNINGS -/** - * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. - * - * Each space in `slots` that can hold a key value pair is initialized to a - * `pair_atomic_type` containing the key `k` and the value `v`. - * - * @tparam atomic_key_type Type of the `Key` atomic container - * @tparam atomic_mapped_type Type of the `Value` atomic container - * @tparam Key key type - * @tparam Value value type - * @tparam pair_atomic_type key/value pair type - * - * @param slots Pointer to flat storage for the map's key/value pairs - * @param k Key to which all keys in `slots` are initialized - * @param v Value to which all values in `slots` are initialized - * @param size Size of the storage pointed to by `slots` - */ -template -CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) -{ - int64_t const loop_stride = gridDim.x * block_size; - int64_t idx = block_size * blockIdx.x + threadIdx.x; - while (idx < size) { - new (&slots[idx].first) atomic_key_type{k}; - new (&slots[idx].second) atomic_mapped_type{v}; - idx += loop_stride; - } -} - -/** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam block_size - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of key/value pairs - * @param n Number of the key/value pairs to insert - * @param num_successes The number of successfully inserted key/value pairs - * @param view Mutable device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function used to compare two keys for equality - */ -template -CUCO_KERNEL void insert( - InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - int64_t const loop_stride = gridDim.x * block_size; - int64_t idx = block_size * blockIdx.x + threadIdx.x; - - while (idx < n) { - typename viewT::value_type const insert_pair{*(first + idx)}; - if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } -} - -/** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups - * of multiple threads to perform each key/value insertion. This provides a - * significant boost in throughput compared to the non Cooperative Group - * `insert` at moderate to high load factors. - * - * @tparam block_size - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of key/value pairs - * @param n Number of the key/value pairs to insert - * @param num_successes The number of successfully inserted key/value pairs - * @param view Mutable device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function used to compare two keys for equality - */ -template -CUCO_KERNEL void insert( - InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - while (idx < n) { - // force conversion to value_type - typename viewT::value_type const insert_pair{*(first + idx)}; - if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { - thread_num_successes++; - } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } -} - -/** - * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, its slot is erased and made available for future - * insertions. - * Else, no effect. - * - * @tparam block_size The size of the thread block - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param num_successes The number of successfully erased key/value pairs - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void erase( - InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) -{ - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - const int64_t loop_stride = gridDim.x * block_size; - int64_t idx = block_size * blockIdx.x + threadIdx.x; - - while (idx < n) { - if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, its slot is erased and made available for future - * insertions. - * Else, no effect. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param num_successes The number of successfully erased key/value pairs - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void erase( - InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - while (idx < n) { - if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { - thread_num_successes++; - } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred` of the - * corresponding stencil returns true. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform insert - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` - * and argument type is convertible from `std::iterator_traits::value_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of key/value pairs - * @param n Number of elements to insert - * @param num_successes The number of successfully inserted key/value pairs - * @param view Mutable device view used to access the hash map's slot storage - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[s, s + n)` - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function used to compare two keys for equality - */ -template -CUCO_KERNEL void insert_if_n(InputIt first, - int64_t n, - atomicT* num_successes, - viewT view, - StencilIt stencil, - Predicate pred, - Hash hash, - KeyEqual key_equal) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - while (idx < n) { - if (pred(*(stencil + idx))) { - typename viewT::value_type const insert_pair{*(first + idx)}; - if (view.insert(tile, insert_pair, hash, key_equal) and tile.thread_rank() == 0) { - thread_num_successes++; - } - } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. - * Else, copies the empty value sentinel. - * @tparam block_size The size of the thread block - * @tparam Value The type of the mapped value for the map - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param n Number of keys to query - * @param output_begin Beginning of the sequence of values retrieved for each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void find( - InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - int64_t const loop_stride = gridDim.x * block_size; - int64_t idx = block_size * blockIdx.x + threadIdx.x; - __shared__ Value writeBuffer[block_size]; - - while (idx < n) { - auto key = *(first + idx); - auto found = view.find(key, hash, key_equal); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - writeBuffer[threadIdx.x] = found == view.end() - ? view.get_empty_value_sentinel() - : found->second.load(cuda::std::memory_order_relaxed); - __syncthreads(); - *(output_begin + idx) = writeBuffer[threadIdx.x]; - idx += loop_stride; - } -} - -/** - * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. - * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups - * of multiple threads to find each key. This provides a significant boost in throughput compared - * to the non Cooperative Group `find` at moderate to high load factors. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam Value The type of the mapped value for the map - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param n Number of keys to query - * @param output_begin Beginning of the sequence of values retrieved for each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void find( - InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; -#pragma nv_diagnostic push -#pragma nv_diag_suppress static_var_with_dynamic_init - // Get rid of a false-positive build warning with ARM - __shared__ Value writeBuffer[block_size / tile_size]; -#pragma nv_diagnostic pop - - while (idx < n) { - auto key = *(first + idx); - auto found = view.find(tile, key, hash, key_equal); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { - writeBuffer[threadIdx.x / tile_size] = - found == view.end() ? view.get_empty_value_sentinel() - : found->second.load(cuda::std::memory_order_relaxed); - } - __syncthreads(); - if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += loop_stride; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. - * - * @tparam block_size The size of the thread block - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param n Number of keys to query - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void contains( - InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - int64_t const loop_stride = gridDim.x * block_size; - int64_t idx = block_size * blockIdx.x + threadIdx.x; - __shared__ bool writeBuffer[block_size]; - - while (idx < n) { - auto key = *(first + idx); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); - __syncthreads(); - *(output_begin + idx) = writeBuffer[threadIdx.x]; - idx += loop_stride; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. - * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the - * contains operation for each key. This provides a significant boost in throughput compared - * to the non Cooperative Group `contains` at moderate to high load factors. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param n Number of keys to query - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void contains( - InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - __shared__ bool writeBuffer[block_size / tile_size]; - - while (idx < n) { - auto key = *(first + idx); - auto found = view.contains(tile, key, hash, key_equal); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } - __syncthreads(); - if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += loop_stride; - } -} - -} // namespace cuco::legacy::detail diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index f11f9bed9..d8afe8c13 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -16,12 +16,10 @@ #pragma once #include -#include #include #include #include -#include #include namespace cuco { @@ -37,53 +35,6 @@ __device__ __forceinline__ cuda::std::int32_t count_least_significant_bits(cuda: return __popc(x & (1 << n) - 1); } -/** - * @brief Converts pair to `cuda::std::tuple` to allow assigning to a zip iterator. - * - * @tparam Key The slot key type - * @tparam Value The slot value type - */ -template -struct slot_to_tuple { - /** - * @brief Converts a pair to a `cuda::std::tuple`. - * - * @tparam S The slot type - * - * @param s The slot to convert - * @return A cuda::std::tuple containing `s.first` and `s.second` - */ - template - __device__ cuda::std::tuple operator()(S const& s) - { - return cuda::std::tuple(s.first, s.second); - } -}; - -/** - * @brief Device functor returning whether the input slot `s` is filled. - * - * @tparam Key The slot key type - */ -template -struct slot_is_filled { - Key empty_key_sentinel_; ///< The value of the empty key sentinel - - /** - * @brief Indicates if the target slot `s` is filled. - * - * @tparam S The slot type - * - * @param s The slot to query - * @return `true` if slot `s` is filled - */ - template - __device__ bool operator()(S const& s) - { - return not cuco::detail::bitwise_compare(cuda::std::get<0>(s), empty_key_sentinel_); - } -}; - template __host__ __device__ constexpr SizeType to_positive(HashType hash) { diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index dde5b249e..b70fa593a 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -1249,1394 +1248,6 @@ class static_map { mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload }; -template -class dynamic_map; - -namespace legacy { - -/** - * @brief A GPU-accelerated, unordered, associative container of key-value - * pairs with unique keys. - * - * Allows constant time concurrent inserts or concurrent find operations from threads in device - * code. Concurrent insert and find are supported only if the pair type is packable (see - * `cuco::detail::is_packable` constexpr). - * - * Current limitations: - * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true - * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. - * - Capacity is fixed and will not grow automatically - * - Requires the user to specify sentinel values for both key and mapped value to indicate empty - * slots - * - Conditionally support concurrent insert and find operations - * - * The `static_map` supports two types of operations: - * - Host-side "bulk" operations - * - Device-side "singular" operations - * - * The host-side bulk operations include `insert`, `erase`, `find`, and `contains`. These - * APIs should be used when there are a large number of keys to insert, erase or lookup - * in the map. For example, given a range of keys specified by device-accessible - * iterators, the bulk `insert` function will insert all keys into the map. Note that in order - * for a `static_map` instance to support `erase`, the user must provide an `erased_key_sentinel` - * which is distinct from the `empty_key_sentinel` at construction. If `erase` is called on a - * `static_map` which was not constructed in this way, a runtime error will be generated. - * - * The singular device-side operations allow individual threads to perform - * independent insert or find/contains operations from device code. These - * operations are accessed through non-owning, trivially copyable "view" types: - * `device_view` and `device_mutable_view`. The `device_view` class is an - * immutable view that allows only non-modifying operations such as `find` or - * `contains`. The `device_mutable_view` class only allows `insert` and `erase` operations. - * The two types are separate to prevent erroneous concurrent insert/erase/find - * operations. Note that the device-side `erase` may only be called if the corresponding - * `device_mutable_view` was constructed with a user-provided `erased_key_sentinel`. It is - * up to the user to ensure this condition is met. - * - * Example: - * \code{.cpp} - * int empty_key_sentinel = -1; - * int empty_value_sentinel = -1; - * int erased_key_sentinel = -2; - * - * // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value - * // sentinels. The supplied erased key sentinel of -2 must be a different value from the empty - * // key sentinel. If erase functionality is not needed, you may elect to not supply an erased - * // key sentinel to the constructor. Note the capacity is chosen knowing we will insert 50,000 - * // keys, for an load factor of 50%. - * static_map m{100'000, empty_key_sentinel, empty_value_sentinel, erased_value_sentinel}; - * - * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}} - * thrust::device_vector> pairs(50,000); - * thrust::transform(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(pairs.size()), - * pairs.begin(), - * []__device__(auto i){ return cuco::pair{i,i}; }; - * - * - * // Inserts all pairs into the map - * m.insert(pairs.begin(), pairs.end()); - * - * // Get a `device_view` and passes it to a kernel where threads may perform - * // `find/contains` lookups - * kernel<<<...>>>(m.get_device_view()); - * \endcode - * - * - * @tparam Key Arithmetic type used for key - * @tparam Value Type of the mapped values - * @tparam Scope The scope in which insert/find operations will be performed by - * individual threads. - * @tparam Allocator Type of allocator used for device storage - */ -template > -class static_map { - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - - static_assert(cuco::is_bitwise_comparable_v, - "Value type must have unique object representations or have been explicitly " - "declared as safe for bitwise comparison via specialization of " - "cuco::is_bitwise_comparable_v."); - - public: - using value_type = cuco::pair; ///< Type of key/value pairs - using key_type = Key; ///< Key type - using mapped_type = Value; ///< Type of mapped values - using atomic_key_type = cuda::atomic; ///< Type of atomic keys - using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values - using pair_atomic_type = - cuco::pair; ///< Pair type of atomic key and atomic mapped value - using slot_type = pair_atomic_type; ///< Type of hash map slots - using atomic_ctr_type = cuda::atomic; ///< Atomic counter type - using allocator_type = Allocator; ///< Allocator type - using slot_allocator_type = typename std::allocator_traits::template rebind_alloc< - pair_atomic_type>; ///< Type of the allocator to (de)allocate slots - using counter_allocator_type = typename std::allocator_traits::template rebind_alloc< - atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters - -#if !defined(CUCO_HAS_INDEPENDENT_THREADS) - static_assert(atomic_key_type::is_always_lock_free, - "A key type larger than 8B is supported for only sm_70 and up."); - static_assert(atomic_mapped_type::is_always_lock_free, - "A value type larger than 8B is supported for only sm_70 and up."); -#endif - - static_map(static_map const&) = delete; - static_map(static_map&&) = delete; - - static_map& operator=(static_map const&) = delete; - static_map& operator=(static_map&&) = delete; - - /** - * @brief Indicates if concurrent insert/find is supported for the key/value types. - * - * @return Boolean indicating if concurrent insert/find is supported. - */ - __host__ __device__ static constexpr bool supports_concurrent_insert_find() noexcept - { - return cuco::detail::is_packable(); - } - - /** - * @brief Constructs a statically sized map with the specified number of slots - * and sentinel values. - * - * The capacity of the map is fixed. Insert operations will not automatically - * grow the map. Attempting to insert equal to or more unique keys than the capacity - * of the map results in undefined behavior (there should be at least one empty slot). - * - * Performance begins to degrade significantly beyond a load factor of ~70%. - * For best performance, choose a capacity that will keep the load factor - * below 70%. E.g., if inserting `N` unique keys, choose a capacity of - * `N * (1/0.7)`. - * - * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and - * undefined behavior results from attempting to insert any key/value pair - * that contains either. - * - * @param capacity The total number of slots in the map - * @param empty_key_sentinel The reserved key value for empty slots - * @param empty_value_sentinel The reserved mapped value for empty slots - * @param alloc Allocator used for allocating device storage - * @param stream Stream used for executing the kernels - */ - static_map(std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); - - /** - * @brief Constructs a fixed-size map with erase capability. - * empty_key_sentinel and erased_key_sentinel must be different values. - * - * @throw std::runtime error if the empty key sentinel and erased key sentinel - * are the same value - * - * @param capacity The total number of slots in the map - * @param empty_key_sentinel The reserved key value for empty slots - * @param empty_value_sentinel The reserved mapped value for empty slots - * @param erased_key_sentinel The reserved value to denote erased slots - * @param alloc Allocator used for allocating device storage - * @param stream Stream used for executing the kernels - */ - static_map(std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel, - Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); - - /** - * @brief Destroys the map and frees its contents. - * - */ - ~static_map(); - - /** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * This function synchronizes `stream`. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - * @param stream Stream used for executing the kernels - */ - template , - typename KeyEqual = cuda::std::equal_to> - void insert(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); - - /** - * @brief Inserts key/value pairs in the range `[first, last)` if `pred` - * of the corresponding stencil returns true. - * - * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * - * @tparam InputIt Device accessible random access iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from std::iterator_traits::value_type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + - * std::distance(first, last))` - * @param hash The unary function to hash each key - * @param key_equal The binary function to compare two keys for equality - * @param stream CUDA stream used for insert - */ - template , - typename KeyEqual = cuda::std::equal_to> - void insert_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); - - /** - * @brief Erases keys in the range `[first, last)`. - * - * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's - * associated value from the map. Else, no effect. - * - * Side-effects: - * - `contains(k) == false` - * - `find(k) == end()` - * - `insert({k,v}) == true` - * - `get_size()` is reduced by the total number of erased keys - * - * This function synchronizes `stream`. - * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - * @param stream Stream used for executing the kernels - * - * @throw std::runtime_error if a unique erased key sentinel value was not - * provided at construction - */ - template , - typename KeyEqual = cuda::std::equal_to> - void erase(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); - - /** - * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + - * i)`. Else, copies the empty value sentinel. - * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of values retrieved for each key - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - * @param stream Stream used for executing the kernels - */ - template , - typename KeyEqual = cuda::std::equal_to> - void find(InputIt first, - InputIt last, - OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); - - /** - * @brief Retrieves all of the keys and their associated values. - * - * The order in which keys are returned is implementation defined and not guaranteed to be - * consistent between subsequent calls to `retrieve_all`. - * - * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than - * `get_size()` - * - * @tparam KeyOut Device accessible random access output iterator whose `value_type` is - * convertible from `key_type`. - * @tparam ValueOut Device accessible random access output iterator whose `value_type` is - * convertible from `mapped_type`. - * @param keys_out Beginning output iterator for keys - * @param values_out Beginning output iterator for values - * @param stream CUDA stream used for this operation - * @return Pair of iterators indicating the last elements in the output - */ - template - std::pair retrieve_all(KeyOut keys_out, - ValueOut values_out, - cudaStream_t stream = 0) const; - - /** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. - * - * Hash should be callable with both std::iterator_traits::value_type and Key - * type. std::invoke_result::value_type, Key> - * must be well-formed. - * - * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from `bool` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - * @param stream Stream used for executing the kernels - */ - template , - typename KeyEqual = cuda::std::equal_to> - void contains(InputIt first, - InputIt last, - OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0) const; - - private: - class device_view_base { - protected: - // Import member type definitions from `static_map` - using value_type = value_type; - using key_type = Key; - using mapped_type = Value; - using iterator = pair_atomic_type*; - using const_iterator = pair_atomic_type const*; - using slot_type = slot_type; - - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Key erased_key_sentinel_{}; ///< Key value that represents an erased slot - Value empty_value_sentinel_{}; ///< Initial Value of empty slot - pair_atomic_type* slots_{}; ///< Pointer to flat slots storage - std::size_t capacity_{}; ///< Total number of slots - - __host__ __device__ device_view_base(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - : slots_{slots}, - capacity_{capacity}, - empty_key_sentinel_{empty_key_sentinel.value}, - erased_key_sentinel_{empty_key_sentinel.value}, - empty_value_sentinel_{empty_value_sentinel.value} - { - } - - __host__ __device__ device_view_base(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel) noexcept - : slots_{slots}, - capacity_{capacity}, - empty_key_sentinel_{empty_key_sentinel.value}, - erased_key_sentinel_{erased_key_sentinel.value}, - empty_value_sentinel_{empty_value_sentinel.value} - { - } - - /** - * @brief Returns the initial slot for a given key `k` - * - * @tparam ProbeKey Probe key type - * @tparam Hash Unary callable type - * - * @param k The key to get the slot for - * @param hash The unary callable used to hash the key - * @return Pointer to the initial slot for `k` - */ - template - __device__ iterator initial_slot(ProbeKey const& k, Hash hash) noexcept - { - return &slots_[hash(k) % capacity_]; - } - - /** - * @brief Returns the initial slot for a given key `k` - * - * @tparam ProbeKey Probe key type - * @tparam Hash Unary callable type - * - * @param k The key to get the slot for - * @param hash The unary callable used to hash the key - * @return Pointer to the initial slot for `k` - */ - template - __device__ const_iterator initial_slot(ProbeKey const& k, Hash hash) const noexcept - { - return &slots_[hash(k) % capacity_]; - } - - /** - * @brief Returns the initial slot for a given key `k` - * - * To be used for Cooperative Group based probing. - * - * @tparam CG Cooperative Group type - * @tparam ProbeKey Probe key type - * @tparam Hash Unary callable type - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @param hash The unary callable used to hash the key - * @return Pointer to the initial slot for `k` - */ - template - __device__ iterator initial_slot(CG g, ProbeKey const& k, Hash hash) noexcept - { - return &slots_[(hash(k) + g.thread_rank()) % capacity_]; - } - - /** - * @brief Returns the initial slot for a given key `k` - * - * To be used for Cooperative Group based probing. - * - * @tparam CG Cooperative Group type - * @tparam ProbeKey Probe key type - * @tparam Hash Unary callable type - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @param hash The unary callable used to hash the key - * @return Pointer to the initial slot for `k` - */ - template - __device__ const_iterator initial_slot(CG g, ProbeKey const& k, Hash hash) const noexcept - { - return &slots_[(hash(k) + g.thread_rank()) % capacity_]; - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ iterator next_slot(iterator s) noexcept { return (++s < end()) ? s : begin_slot(); } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ const_iterator next_slot(const_iterator s) const noexcept - { - return (++s < end()) ? s : begin_slot(); - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. To - * be used for Cooperative Group based probing. - * - * @tparam CG The Cooperative Group type - * @param g The Cooperative Group for which the next slot is needed - * @param s The slot to advance - * @return The next slot after `s` - */ - template - __device__ iterator next_slot(CG g, iterator s) noexcept - { - uint32_t index = s - slots_; - return &slots_[(index + g.size()) % capacity_]; - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. To - * be used for Cooperative Group based probing. - * - * @tparam CG The Cooperative Group type - * @param g The Cooperative Group for which the next slot is needed - * @param s The slot to advance - * @return The next slot after `s` - */ - template - __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept - { - uint32_t index = s - slots_; - return &slots_[(index + g.size()) % capacity_]; - } - - /** - * @brief Initializes the given array of slots to the specified values given by `k` and `v` - * using the threads in the group `g`. - * - * @note This function synchronizes the group `g`. - * - * @tparam CG The type of the cooperative thread group - * @param g The cooperative thread group used to initialize the slots - * @param slots Pointer to the array of slots to initialize - * @param num_slots Number of slots to initialize - * @param k The desired key value for each slot - * @param v The desired mapped value for each slot - */ - - template - __device__ static void initialize_slots( - CG g, pair_atomic_type* slots, std::size_t num_slots, Key k, Value v) - { - auto tid = g.thread_rank(); - while (tid < num_slots) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += g.size(); - } - g.sync(); - } - - public: - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __host__ __device__ pair_atomic_type* get_slots() noexcept { return slots_; } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __host__ __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } - - /** - * @brief Gets the maximum number of elements the hash map can hold. - * - * @return The maximum number of elements the hash map can hold - */ - __host__ __device__ std::size_t get_capacity() const noexcept { return capacity_; } - - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - __host__ __device__ Value get_empty_value_sentinel() const noexcept - { - return empty_value_sentinel_; - } - - __host__ __device__ Key get_erased_key_sentinel() const noexcept - { - return erased_key_sentinel_; - } - - /** - * @brief Returns iterator to the first slot. - * - * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first - * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage. - * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those - * both empty and filled. - * - * There is no `begin()` iterator to avoid confusion as it is not possible to provide an - * iterator over only the filled slots. - * - * @return Iterator to the first slot - */ - __device__ iterator begin_slot() noexcept { return slots_; } - - /** - * @brief Returns iterator to the first slot. - * - * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first - * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage. - * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those - * both empty and filled. - * - * There is no `begin()` iterator to avoid confusion as it is not possible to provide an - * iterator over only the filled slots. - * - * @return Iterator to the first slot - */ - __device__ const_iterator begin_slot() const noexcept { return slots_; } - - /** - * @brief Returns a const_iterator to one past the last slot. - * - * @return A const_iterator to one past the last slot - */ - __host__ __device__ const_iterator end_slot() const noexcept { return slots_ + capacity_; } - - /** - * @brief Returns an iterator to one past the last slot. - * - * @return An iterator to one past the last slot - */ - __host__ __device__ iterator end_slot() noexcept { return slots_ + capacity_; } - - /** - * @brief Returns a const_iterator to one past the last slot. - * - * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking - * an iterator returned from `find()` against the `end()` iterator. - * - * @return A const_iterator to one past the last slot - */ - __host__ __device__ const_iterator end() const noexcept { return end_slot(); } - - /** - * @brief Returns an iterator to one past the last slot. - * - * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking - * an iterator returned from `find()` against the `end()` iterator. - * - * @return An iterator to one past the last slot - */ - __host__ __device__ iterator end() noexcept { return end_slot(); } - }; - - public: - /** - * @brief Mutable, non-owning view-type that may be used in device code to - * perform singular inserts into the map. - * - * `device_mutable_view` is trivially-copyable and is intended to be passed by - * value. - * - * Example: - * \code{.cpp} - * cuco::static_map m{100'000, -1, -1}; - * - * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}} - * thrust::for_each(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(50'000), - * [map = m.get_device_mutable_view()] - * __device__ (auto i) mutable { - * map.insert(cuco::pair{i,i}); - * }); - * \endcode - */ - class device_mutable_view : public device_view_base { - public: - using value_type = typename device_view_base::value_type; ///< Type of key/value pairs - using key_type = typename device_view_base::key_type; ///< Key type - using mapped_type = typename device_view_base::mapped_type; ///< Type of the mapped values - using iterator = - typename device_view_base::iterator; ///< Type of the forward iterator to `value_type` - using const_iterator = - typename device_view_base::const_iterator; ///< Type of the forward iterator to `const - ///< value_type` - using slot_type = typename device_view_base::slot_type; ///< Type of hash map slots - - /** - * @brief Constructs a mutable view of the first `capacity` slots of the - * slots array pointed to by `slots`. - * - * @param slots Pointer to beginning of initialized slots array - * @param capacity The number of slots viewed by this object - * @param empty_key_sentinel The reserved value for keys to represent empty slots - * @param empty_value_sentinel The reserved value for mapped values to - * represent empty slots - */ - __host__ __device__ device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Constructs a mutable view of the first `capacity` slots of the - * slots array pointed to by `slots`. - * - * @param slots Pointer to beginning of initialized slots array - * @param capacity The number of slots viewed by this object - * @param empty_key_sentinel The reserved value for keys to represent empty slots - * @param empty_value_sentinel The reserved value for mapped values to represent empty slots - * @param erased_key_sentinel The reserved value for keys to represent erased slots - */ - __host__ __device__ device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel) noexcept - : device_view_base{ - slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} - { - } - - private: - /** - * @brief Enumeration of the possible results of attempting to insert into a hash slot. - */ - enum class insert_result { - CONTINUE, ///< Insert did not succeed, continue trying to insert - SUCCESS, ///< New pair inserted successfully - DUPLICATE ///< Insert did not succeed, key is already present - }; - - /** - * @brief Inserts the specified key/value pair with one single CAS operation. - * - * @tparam KeyEqual Binary callable type - * @param current_slot The slot to insert - * @param insert_pair The pair to insert - * @param key_equal The binary callable used to compare two keys for - * equality - * @param expected_key The expected value of the key in the target slot - * @return An insert result from the `insert_resullt` enumeration. - */ - template - __device__ insert_result packed_cas(iterator current_slot, - value_type const& insert_pair, - KeyEqual key_equal, - Key expected_key) noexcept; - - /** - * @brief Inserts the specified key/value pair with two back-to-back CAS operations. - * - * @tparam KeyEqual Binary callable type - * @param current_slot The slot to insert - * @param insert_pair The pair to insert - * @param key_equal The binary callable used to compare two keys for - * equality - * @param expected_key The expected value of the key in the target slot - * @return An insert result from the `insert_resullt` enumeration. - */ - template - __device__ insert_result back_to_back_cas(iterator current_slot, - value_type const& insert_pair, - KeyEqual key_equal, - Key expected_key) noexcept; - - /** - * @brief Inserts the specified key/value pair with a CAS of the key and a dependent write of - * the value. - * - * @tparam KeyEqual Binary callable type - * @param current_slot The slot to insert - * @param insert_pair The pair to insert - * @param key_equal The binary callable used to compare two keys for - * equality - * @param expected_key The expected value of the key in the target slot - * @return An insert result from the `insert_resullt` enumeration. - */ - template - __device__ insert_result cas_dependent_write(iterator current_slot, - value_type const& insert_pair, - KeyEqual key_equal, - Key expected_key) noexcept; - - public: - /** - * @brief Given a slot pointer `slots`, initializes the first `capacity` slots with the given - * sentinel values and returns a `device_mutable_view` object of those slots. - * - * @tparam CG The type of the cooperative thread group - * - * @param g The cooperative thread group used to copy the slots - * @param slots Pointer to the hash map slots - * @param capacity The total number of slots in the map - * @param empty_key_sentinel The reserved value for keys to represent empty slots - * @param empty_value_sentinel The reserved value for mapped values to represent empty slots - * @return A device_mutable_view object based on the given parameters - */ - template - __device__ static device_mutable_view make_from_uninitialized_slots( - CG g, - pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - { - device_view_base::initialize_slots( - g, slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value); - return device_mutable_view{slots, - capacity, - empty_key_sentinel, - empty_value_sentinel, - erased_key{empty_key_sentinel.value}}; - } - - /** - * @brief Given a slot pointer `slots`, initializes the first `capacity` slots with the given - * sentinel values and returns a `device_mutable_view` object of those slots. - * - * @tparam CG The type of the cooperative thread group - * - * @param g The cooperative thread group used to copy the slots - * @param slots Pointer to the hash map slots - * @param capacity The total number of slots in the map - * @param empty_key_sentinel The reserved value for keys to represent empty slots - * @param empty_value_sentinel The reserved value for mapped values to represent empty slots - * @param erased_key_sentinel The reserved value for keys to represent erased slots - * @return A device_mutable_view object based on the given parameters - */ - template - __device__ static device_mutable_view make_from_uninitialized_slots( - CG g, - pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel) noexcept - { - device_view_base::initialize_slots( - g, slots, capacity, empty_key_sentinel, empty_value_sentinel); - return device_mutable_view{ - slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel}; - } - - /** - * @brief Inserts the specified key/value pair into the map. - * - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param insert_pair The pair to insert - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys for - * equality - * @return `true` if the insert was successful, `false` otherwise. - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ bool insert(value_type const& insert_pair, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Inserts the specified key/value pair into the map. - * - * Returns a pair consisting of an iterator to the inserted element (or to - * the element that prevented the insertion) and a `bool` denoting whether - * the insertion took place. - * - * Note: In order to guarantee the validity of the returned iterator, - * `insert_and_find` may be less efficient than `insert` in some situations. - * Prefer using `insert` unless the returned iterator is required. - * - * Note: `insert_and_find` may only be used concurrently with `insert`, - * `find`, and `erase` when `supports_concurrent_insert_find()` returns - * true. - * - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param insert_pair The pair to insert - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys for - * equality - * @return a pair consisting of an iterator to the element and a bool, - * either `true` if the insert was successful, `false` otherwise. - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ cuda::std::pair insert_and_find( - value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Inserts the specified key/value pair into the map. - * - * Uses the CUDA Cooperative Groups API to to leverage multiple threads to - * perform a single insert. This provides a significant boost in throughput - * compared to the non Cooperative Group `insert` at moderate to high load - * factors. - * - * @tparam CG Cooperative Group type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param g The Cooperative Group that performs the insert - * @param insert_pair The pair to insert - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys for - * equality - * @return `true` if the insert was successful, `false` otherwise. - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ bool insert(CG g, - value_type const& insert_pair, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Erases the specified key across the map. - * - * Behavior is undefined if `empty_key_sentinel_` equals to `erased_key_sentinel_`. - * - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param k The key to be erased - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys for - * equality - * @return `true` if the erasure was successful, `false` otherwise. - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ bool erase(key_type const& k, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Erases the specified key across the map. - * - * Behavior is undefined if `empty_key_sentinel_` equals to `erased_key_sentinel_`. - * - * @tparam CG Cooperative Group type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param g The Cooperative Group that performs the erasure - * @param k The key to be erased - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys for - * equality - * @return `true` if the erasure was successful, `false` otherwise. - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ bool erase(CG g, - key_type const& k, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; - - }; // class device mutable view - - /** - * @brief Non-owning view-type that may be used in device code to - * perform singular find and contains operations for the map. - * - * `device_view` is trivially-copyable and is intended to be passed by - * value. - * - */ - class device_view : public device_view_base { - public: - using value_type = typename device_view_base::value_type; ///< Type of key/value pairs - using key_type = typename device_view_base::key_type; ///< Key type - using mapped_type = typename device_view_base::mapped_type; ///< Type of the mapped values - using iterator = - typename device_view_base::iterator; ///< Type of the forward iterator to `value_type` - using const_iterator = - typename device_view_base::const_iterator; ///< Type of the forward iterator to `const - ///< value_type` - using slot_type = typename device_view_base::slot_type; ///< Type of hash map slots - - /** - * @brief Construct a view of the first `capacity` slots of the - * slots array pointed to by `slots`. - * - * @param slots Pointer to beginning of initialized slots array - * @param capacity The number of slots viewed by this object - * @param empty_key_sentinel The reserved value for keys to represent empty slots - * @param empty_value_sentinel The reserved value for mapped values to represent empty slots - */ - __host__ __device__ device_view(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Construct a view of the first `capacity` slots of the - * slots array pointed to by `slots`. - * - * @param slots Pointer to beginning of initialized slots array - * @param capacity The number of slots viewed by this object - * @param empty_key_sentinel The reserved value for keys to represent empty slots - * @param empty_value_sentinel The reserved value for mapped values to represent empty slots - * @param erased_key_sentinel The reserved value for keys to represent erased slots - */ - __host__ __device__ device_view(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel) noexcept - : device_view_base{ - slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} - { - } - - /** - * @brief Construct a `device_view` from a `device_mutable_view` object - * - * @param mutable_map object of type `device_mutable_view` - */ - __host__ __device__ explicit device_view(device_mutable_view mutable_map) - : device_view_base{mutable_map.get_slots(), - mutable_map.get_capacity(), - empty_key{mutable_map.get_empty_key_sentinel()}, - empty_value{mutable_map.get_empty_value_sentinel()}, - erased_key{mutable_map.get_erased_key_sentinel()}} - { - } - - /** - * @brief Makes a copy of given `device_view` using non-owned memory. - * - * This function is intended to be used to create shared memory copies of small static maps, - * although global memory can be used as well. - * - * Example: - * @code{.cpp} - * template - * __global__ void use_device_view(const typename MapType::device_view device_view, - * map_key_t const* const keys_to_search, - * map_value_t* const values_found, - * const size_t number_of_elements) - * { - * const size_t index = blockIdx.x * blockDim.x + threadIdx.x; - * - * __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; - * - * auto g = cg::this_thread_block(); - * - * const map_t::device_view sm_static_map = device_view.make_copy(g, - * sm_buffer); - * - * for (size_t i = g.thread_rank(); i < number_of_elements; i += g.size()) - * { - * values_found[i] = sm_static_map.find(keys_to_search[i])->second; - * } - * } - * @endcode - * - * @tparam CG The type of the cooperative thread group - * @param g The cooperative thread group used to copy the slots - * @param source_device_view `device_view` to copy from - * @param memory_to_use Array large enough to support `capacity` elements. Object does not take - * the ownership of the memory - * @return Copy of passed `device_view` - */ - template - __device__ static device_view make_copy(CG g, - pair_atomic_type* const memory_to_use, - device_view source_device_view) noexcept - { -#if defined(CUCO_HAS_CUDA_BARRIER) - __shared__ cuda::barrier barrier; - if (g.thread_rank() == 0) { init(&barrier, g.size()); } - g.sync(); - - cuda::memcpy_async(g, - memory_to_use, - source_device_view.get_slots(), - sizeof(pair_atomic_type) * source_device_view.get_capacity(), - barrier); - - barrier.arrive_and_wait(); -#else - pair_atomic_type const* const slots_ptr = source_device_view.get_slots(); - for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) { - new (&memory_to_use[i].first) - atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)}; - new (&memory_to_use[i].second) - atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)}; - } - g.sync(); -#endif - - return device_view(memory_to_use, - source_device_view.get_capacity(), - empty_key{source_device_view.get_empty_key_sentinel()}, - empty_value{source_device_view.get_empty_value_sentinel()}, - erased_key{source_device_view.get_erased_key_sentinel()}); - } - - /** - * @brief Finds the value corresponding to the key `k`. - * - * Returns an iterator to the pair whose key is equivalent to `k`. - * If no such pair exists, returns `end()`. - * - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param k The key to search for - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys - * for equality - * @return An iterator to the position at which the key/value pair - * containing `k` was inserted - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ iterator find(Key const& k, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** @brief Finds the value corresponding to the key `k`. - * - * Returns a const_iterator to the pair whose key is equivalent to `k`. - * If no such pair exists, returns `end()`. - * - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param k The key to search for - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys - * for equality - * @return An iterator to the position at which the key/value pair - * containing `k` was inserted - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ const_iterator find(Key const& k, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) const noexcept; - - /** - * @brief Finds the value corresponding to the key `k`. - * - * Returns an iterator to the pair whose key is equivalent to `k`. - * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to - * to leverage multiple threads to perform a single find. This provides a - * significant boost in throughput compared to the non Cooperative Group - * `find` at moderate to high load factors. - * - * @tparam CG Cooperative Group type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param g The Cooperative Group used to perform the find - * @param k The key to search for - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys - * for equality - * @return An iterator to the position at which the key/value pair - * containing `k` was inserted - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ iterator - find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Finds the value corresponding to the key `k`. - * - * Returns a const_iterator to the pair whose key is equivalent to `k`. - * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to - * to leverage multiple threads to perform a single find. This provides a - * significant boost in throughput compared to the non Cooperative Group - * `find` at moderate to high load factors. - * - * @tparam CG Cooperative Group type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param g The Cooperative Group used to perform the find - * @param k The key to search for - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys - * for equality - * @return An iterator to the position at which the key/value pair - * containing `k` was inserted - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ const_iterator - find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; - - /** - * @brief Indicates whether the key `k` was inserted into the map. - * - * If the key `k` was inserted into the map, find returns - * true. Otherwise, it returns false. - * - * Hash should be callable with both ProbeKey and Key type. `std::invoke_result` must be well-formed. - * - * If `key_equal(probe_key, slot_key)` returns true, `hash(probe_key) == hash(slot_key)` must - * also be true. - * - * @tparam ProbeKey Probe key type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param k The key to search for - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys - * for equality - * @return A boolean indicating whether the key/value pair - * containing `k` was inserted - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ bool contains(ProbeKey const& k, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) const noexcept; - - /** - * @brief Indicates whether the key `k` was inserted into the map. - * - * If the key `k` was inserted into the map, find returns true. Otherwise, it returns false. - * Uses the CUDA Cooperative Groups API to to leverage multiple threads to perform a single - * contains operation. This provides a significant boost in throughput compared to the non - * Cooperative Group `contains` at moderate to high load factors. - * - * Hash should be callable with both ProbeKey and Key type. `std::invoke_result` must be well-formed. - * - * If `key_equal(probe_key, slot_key)` returns true, `hash(probe_key) == hash(slot_key)` must - * also be true. - * - * @tparam CG Cooperative Group type - * @tparam ProbeKey Probe key type - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param g The Cooperative Group used to perform the contains operation - * @param k The key to search for - * @param hash The unary callable used to hash the key - * @param key_equal The binary callable used to compare two keys - * for equality - * @return A boolean indicating whether the key/value pair - * containing `k` was inserted - */ - template , - typename KeyEqual = cuda::std::equal_to> - __device__ cuda::std::enable_if_t, bool> contains( - CG g, ProbeKey const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; - }; // class device_view - - /** - * @brief Gets the maximum number of elements the hash map can hold. - * - * @return The maximum number of elements the hash map can hold - */ - std::size_t get_capacity() const noexcept { return capacity_; } - - /** - * @brief Gets the number of elements in the hash map. - * - * @return The number of elements in the map - */ - std::size_t get_size() const noexcept { return size_; } - - /** - * @brief Gets the load factor of the hash map. - * - * @return The load factor of the hash map - */ - float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } - - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; } - - /** - * @brief Gets the sentinel value used to represent an erased value slot. - * - * @return The sentinel value used to represent an erased value slot - */ - Key get_erased_key_sentinel() const noexcept { return erased_key_sentinel_; } - - /** - * @brief Constructs a device_view object based on the members of the `static_map` object. - * - * @return A device_view object based on the members of the `static_map` object - */ - device_view get_device_view() const noexcept - { - return device_view(slots_, - capacity_, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}, - erased_key{erased_key_sentinel_}); - } - - /** - * @brief Constructs a device_mutable_view object based on the members of the `static_map` object - * - * @return A device_mutable_view object based on the members of the `static_map` object - */ - device_mutable_view get_device_mutable_view() const noexcept - { - return device_mutable_view(slots_, - capacity_, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}, - erased_key{erased_key_sentinel_}); - } - - private: - pair_atomic_type* slots_{}; ///< Pointer to flat slots storage - std::size_t capacity_{}; ///< Total number of slots - std::size_t size_{}; ///< Number of keys in map - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial value of empty slot - Key erased_key_sentinel_{}; ///< Key value that represents an erased slot - atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert - slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots - counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` -}; -} // namespace legacy } // namespace cuco -#include #include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a4787f432..205b0100b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -78,7 +78,6 @@ ConfigureTest(STATIC_SET_TEST ConfigureTest(STATIC_MAP_TEST static_map/capacity_test.cu static_map/contains_test.cu - static_map/custom_type_test.cu static_map/duplicate_keys_test.cu static_map/erase_test.cu static_map/find_test.cu diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu deleted file mode 100644 index 7860e87e4..000000000 --- a/tests/static_map/custom_type_test.cu +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -// User-defined key type -template -struct key_pair_type { - T a; - T b; - - __host__ __device__ key_pair_type() {} - __host__ __device__ key_pair_type(T x) : a{x}, b{x} {} - - // Device equality operator is mandatory due to libcudacxx bug: - // https://github.com/NVIDIA/libcudacxx/issues/223 - __device__ bool operator==(key_pair_type const& other) const - { - return a == other.a and b == other.b; - } -}; - -// User-defined key type -template -struct large_key_type { - T a; - T b; - T c; - - __host__ __device__ large_key_type() {} - __host__ __device__ large_key_type(T x) : a{x}, b{x}, c{x} {} - - // Device equality operator is mandatory due to libcudacxx bug: - // https://github.com/NVIDIA/libcudacxx/issues/223 - __device__ bool operator==(large_key_type const& other) const - { - return a == other.a and b == other.b and c == other.c; - } -}; - -// User-defined value type -template -struct value_pair_type { - T f; - T s; - - __host__ __device__ value_pair_type() {} - __host__ __device__ value_pair_type(T x) : f{x}, s{x} {} - - __device__ bool operator==(value_pair_type const& other) const - { - return f == other.f and s == other.s; - } -}; - -// User-defined device hasher -struct hash_custom_key { - template - __device__ uint32_t operator()(custom_type k) - { - return thrust::raw_reference_cast(k).a; - }; -}; - -// User-defined device key equality -struct custom_key_equals { - template - __device__ bool operator()(lhs_type lhs, rhs_type rhs) - { - return lhs == static_cast(rhs); - } -}; - -TEMPLATE_TEST_CASE_SIG("static_map custom key and value type tests", - "", - ((typename Key, typename Value), Key, Value), -#if defined(CUCO_HAS_INDEPENDENT_THREADS) // Key type larger than 8B only supported for sm_70 and - // up - (key_pair_type, value_pair_type), - (key_pair_type, value_pair_type), - (large_key_type, value_pair_type), -#endif - (key_pair_type, value_pair_type)) -{ - auto const sentinel_key = Key{-1}; - auto const sentinel_value = Value{-1}; - - constexpr std::size_t num = 100; - constexpr std::size_t capacity = num * 2; - cuco::legacy::static_map map{ - capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - - auto insert_keys = thrust::make_transform_iterator( - thrust::counting_iterator(0), - cuda::proclaim_return_type([] __device__(auto i) { return Key{i}; })); - - auto insert_values = thrust::make_transform_iterator( - thrust::counting_iterator(0), - cuda::proclaim_return_type([] __device__(auto i) { return Value{i}; })); - - auto insert_pairs = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type>( - [] __device__(auto i) { return cuco::pair(i, i); })); - - SECTION("All inserted keys-value pairs should be correctly recovered during find") - { - thrust::device_vector found_values(num); - map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); - - REQUIRE(num == map.get_size()); - - map.find( - insert_keys, insert_keys + num, found_values.begin(), hash_custom_key{}, custom_key_equals{}); - - REQUIRE(cuco::test::equal(insert_values, - insert_values + num, - found_values.begin(), - cuda::proclaim_return_type([] __device__(Value lhs, Value rhs) { - return cuda::std::tie(lhs.f, lhs.s) == cuda::std::tie(rhs.f, rhs.s); - }))); - } - - SECTION("All inserted keys-value pairs should be contained") - { - thrust::device_vector contained(num); - map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); - map.contains( - insert_keys, insert_keys + num, contained.begin(), hash_custom_key{}, custom_key_equals{}); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - SECTION("All conditionally inserted keys-value pairs should be contained") - { - thrust::device_vector contained(num); - map.insert_if( - insert_pairs, - insert_pairs + num, - thrust::counting_iterator(0), - cuda::proclaim_return_type([] __device__(auto const& key) { return (key % 2) == 0; }), - hash_custom_key{}, - custom_key_equals{}); - - REQUIRE(num / 2 == map.get_size()); - - map.contains( - insert_keys, insert_keys + num, contained.begin(), hash_custom_key{}, custom_key_equals{}); - - REQUIRE(cuco::test::equal( - contained.begin(), - contained.end(), - thrust::counting_iterator(0), - cuda::proclaim_return_type([] __device__(auto const& idx_contained, auto const& idx) { - return ((idx % 2) == 0) == idx_contained; - }))); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - thrust::device_vector contained(num); - map.contains( - insert_keys, insert_keys + num, contained.begin(), hash_custom_key{}, custom_key_equals{}); - REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - SECTION("All inserted keys-value pairs should be contained") - { - thrust::device_vector contained(num); - map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); - auto view = map.get_device_view(); - REQUIRE(cuco::test::all_of( - insert_pairs, - insert_pairs + num, - cuda::proclaim_return_type([view] __device__(cuco::pair const& pair) { - return view.contains(pair.first, hash_custom_key{}, custom_key_equals{}); - }))); - } - - SECTION("Inserting unique keys should return insert success.") - { - auto m_view = map.get_device_mutable_view(); - REQUIRE(cuco::test::all_of(insert_pairs, - insert_pairs + num, - cuda::proclaim_return_type( - [m_view] __device__(cuco::pair const& pair) mutable { - return m_view.insert( - pair, hash_custom_key{}, custom_key_equals{}); - }))); - } - - SECTION("Cannot find any key in an empty hash map") - { - SECTION("non-const view") - { - auto view = map.get_device_view(); - REQUIRE(cuco::test::all_of( - insert_pairs, - insert_pairs + num, - cuda::proclaim_return_type( - [view] __device__(cuco::pair const& pair) mutable { - return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); - }))); - } - - SECTION("const view") - { - auto const view = map.get_device_view(); - REQUIRE(cuco::test::all_of( - insert_pairs, - insert_pairs + num, - cuda::proclaim_return_type([view] __device__(cuco::pair const& pair) { - return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); - }))); - } - } -}