diff --git a/README.md b/README.md
index e75b8a2b2..69d1427a4 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith
 
 ### Major Updates
 
+__02/06/2026__ Removed legacy `static_map` implementation from `cuco::legacy` namespace
+
 __02/03/2026__ Modernized `dynamic_map`: promoted `cuco::experimental::dynamic_map` to `cuco::dynamic_map` and removed the legacy implementation
 
 __01/30/2026__ Removed legacy `static_multimap` implementation and promoted `cuco::experimental::static_multimap` to `cuco::static_multimap`
@@ -25,8 +27,6 @@ __06/04/2025__ Removed CUDA 11 support
 
 __11/01/2024__ Refined the term `window` as `bucket`
 
-__01/02/2024__ Moved the legacy `static_map` to `cuco::legacy` namespace
-
 
 ## Getting cuCollections
 
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
deleted file mode 100644
index 5dfb5fca6..000000000
--- a/include/cuco/detail/static_map.inl
+++ /dev/null
@@ -1,889 +0,0 @@
-/*
- * Copyright (c) 2020-2026, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuco/detail/bitwise_compare.cuh>
-#include <cuco/detail/error.hpp>
-#include <cuco/detail/utility/memcpy_async.hpp>
-#include <cuco/detail/utils.cuh>
-#include <cuco/detail/utils.hpp>
-
-#include <cub/device/device_select.cuh>
-#include <cuda/std/tuple>
-#include <cuda/std/utility>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-
-namespace cuco::legacy {
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-static_map<Key, Value, Scope, Allocator>::static_map(std::size_t capacity,
-                                                     empty_key<Key> empty_key_sentinel,
-                                                     empty_value<Value> empty_value_sentinel,
-                                                     Allocator const& alloc,
-                                                     cudaStream_t stream)
-  : capacity_{std::max(capacity, std::size_t{1})},  // to avoid dereferencing a nullptr (Issue #72)
-    empty_key_sentinel_{empty_key_sentinel.value},
-    empty_value_sentinel_{empty_value_sentinel.value},
-    erased_key_sentinel_{empty_key_sentinel.value},
-    slot_allocator_{alloc},
-    counter_allocator_{alloc}
-{
-  slots_         = slot_allocator_.allocate(capacity_, cuda::stream_ref{stream});
-  num_successes_ = counter_allocator_.allocate(1, cuda::stream_ref{stream});
-
-  auto constexpr block_size = 256;
-  auto constexpr stride     = 4;
-  auto const grid_size      = (capacity_ + stride * block_size - 1) / (stride * block_size);
-  detail::initialize<block_size, atomic_key_type, atomic_mapped_type>
-    <<<grid_size, block_size, 0, stream>>>(
-      slots_, empty_key_sentinel_, empty_value_sentinel_, capacity_);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-static_map<Key, Value, Scope, Allocator>::static_map(std::size_t capacity,
-                                                     empty_key<Key> empty_key_sentinel,
-                                                     empty_value<Value> empty_value_sentinel,
-                                                     erased_key<Key> erased_key_sentinel,
-                                                     Allocator const& alloc,
-                                                     cudaStream_t stream)
-  : capacity_{std::max(capacity, std::size_t{1})},  // to avoid dereferencing a nullptr (Issue #72)
-    empty_key_sentinel_{empty_key_sentinel.value},
-    empty_value_sentinel_{empty_value_sentinel.value},
-    erased_key_sentinel_{erased_key_sentinel.value},
-    slot_allocator_{alloc},
-    counter_allocator_{alloc}
-{
-  CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_,
-               "The empty key sentinel and erased key sentinel cannot be the same value.",
-               std::runtime_error);
-
-  slots_         = slot_allocator_.allocate(capacity_, cuda::stream_ref{stream});
-  num_successes_ = counter_allocator_.allocate(1, cuda::stream_ref{stream});
-
-  auto constexpr block_size = 256;
-  auto constexpr stride     = 4;
-  auto const grid_size      = (capacity_ + stride * block_size - 1) / (stride * block_size);
-  detail::initialize<block_size, atomic_key_type, atomic_mapped_type>
-    <<<grid_size, block_size, 0, stream>>>(
-      slots_, empty_key_sentinel_, empty_value_sentinel_, capacity_);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-static_map<Key, Value, Scope, Allocator>::~static_map()
-{
-  slot_allocator_.deallocate(slots_, capacity_, cuda::stream_ref{cudaStream_t{nullptr}});
-  counter_allocator_.deallocate(num_successes_, 1, cuda::stream_ref{cudaStream_t{nullptr}});
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename Hash, typename KeyEqual>
-void static_map<Key, Value, Scope, Allocator>::insert(
-  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view             = get_device_mutable_view();
-
-  // TODO: memset an atomic variable is unsafe
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-  CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_num_successes;
-
-  detail::insert<block_size, tile_size>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
-                                           num_successes_,
-                                           sizeof(atomic_ctr_type),
-                                           cudaMemcpyDeviceToHost,
-                                           cuda::stream_ref{stream}));
-
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
-
-  size_ += h_num_successes;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt,
-          typename StencilIt,
-          typename Predicate,
-          typename Hash,
-          typename KeyEqual>
-void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
-                                                         InputIt last,
-                                                         StencilIt stencil,
-                                                         Predicate pred,
-                                                         Hash hash,
-                                                         KeyEqual key_equal,
-                                                         cudaStream_t stream)
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto constexpr tile_size  = 4;
-  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view            = get_device_mutable_view();
-
-  // TODO: memset an atomic variable is unsafe
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-  CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_num_successes;
-
-  detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
-    first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
-  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
-                                           num_successes_,
-                                           sizeof(atomic_ctr_type),
-                                           cudaMemcpyDeviceToHost,
-                                           cuda::stream_ref{stream}));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-
-  size_ += h_num_successes;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename Hash, typename KeyEqual>
-void static_map<Key, Value, Scope, Allocator>::erase(
-  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
-{
-  CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(),
-               "You must provide a unique erased key sentinel value at map construction.",
-               std::runtime_error);
-
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto constexpr tile_size  = 4;
-  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view            = get_device_mutable_view();
-
-  // TODO: memset an atomic variable is unsafe
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-  CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_num_successes;
-
-  detail::erase<block_size, tile_size>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
-                                           num_successes_,
-                                           sizeof(atomic_ctr_type),
-                                           cudaMemcpyDeviceToHost,
-                                           cuda::stream_ref{stream}));
-
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
-
-  size_ -= h_num_successes;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void static_map<Key, Value, Scope, Allocator>::find(InputIt first,
-                                                    InputIt last,
-                                                    OutputIt output_begin,
-                                                    Hash hash,
-                                                    KeyEqual key_equal,
-                                                    cudaStream_t stream)
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view             = get_device_view();
-
-  detail::find<block_size, tile_size, Value>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, output_begin, view, hash, key_equal);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename KeyOut, typename ValueOut>
-std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_all(
-  KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const
-{
-  static_assert(sizeof(pair_atomic_type) == sizeof(value_type));
-  auto slots_begin = reinterpret_cast<value_type*>(slots_);
-
-  auto begin =
-    thrust::make_transform_iterator(slots_begin, cuco::detail::slot_to_tuple<Key, Value>{});
-  auto filled           = cuco::detail::slot_is_filled<Key>{get_empty_key_sentinel()};
-  auto zipped_out_begin = thrust::make_zip_iterator(cuda::std::tuple{keys_out, values_out});
-
-  std::size_t temp_storage_bytes = 0;
-  using temp_allocator_type =
-    typename std::allocator_traits<Allocator>::template rebind_alloc<char>;
-  auto temp_allocator = temp_allocator_type{slot_allocator_};
-  auto d_num_out      = reinterpret_cast<std::size_t*>(
-    temp_allocator.allocate(sizeof(std::size_t), cuda::stream_ref{stream}));
-  cub::DeviceSelect::If(nullptr,
-                        temp_storage_bytes,
-                        begin,
-                        zipped_out_begin,
-                        d_num_out,
-                        get_capacity(),
-                        filled,
-                        stream);
-
-  // Allocate temporary storage
-  auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, cuda::stream_ref{stream});
-
-  cub::DeviceSelect::If(d_temp_storage,
-                        temp_storage_bytes,
-                        begin,
-                        zipped_out_begin,
-                        d_num_out,
-                        get_capacity(),
-                        filled,
-                        stream);
-
-  std::size_t h_num_out;
-  CUCO_CUDA_TRY(cuco::detail::memcpy_async(
-    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-  temp_allocator.deallocate(
-    reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
-  temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, cuda::stream_ref{stream});
-
-  return std::make_pair(keys_out + h_num_out, values_out + h_num_out);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void static_map<Key, Value, Scope, Allocator>::contains(InputIt first,
-                                                        InputIt last,
-                                                        OutputIt output_begin,
-                                                        Hash hash,
-                                                        KeyEqual key_equal,
-                                                        cudaStream_t stream) const
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view             = get_device_view();
-
-  detail::contains<block_size, tile_size>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, output_begin, view, hash, key_equal);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename KeyEqual>
-__device__ static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert_result
-static_map<Key, Value, Scope, Allocator>::device_mutable_view::packed_cas(
-  iterator current_slot,
-  value_type const& insert_pair,
-  KeyEqual key_equal,
-  Key expected_key) noexcept
-{
-  auto expected_value = this->get_empty_value_sentinel();
-
-  cuco::detail::pair_converter<value_type> expected_pair{
-    cuco::make_pair(expected_key, expected_value)};
-  cuco::detail::pair_converter<value_type> new_pair{insert_pair};
-
-  auto slot =
-    reinterpret_cast<cuda::atomic<typename cuco::detail::pair_converter<value_type>::packed_type>*>(
-      current_slot);
-
-  bool success = slot->compare_exchange_strong(
-    expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed);
-  if (success) {
-    return insert_result::SUCCESS;
-  }
-  // duplicate present during insert
-  else if (key_equal(insert_pair.first, expected_pair.pair.first)) {
-    return insert_result::DUPLICATE;
-  }
-
-  return insert_result::CONTINUE;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename KeyEqual>
-__device__ static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert_result
-static_map<Key, Value, Scope, Allocator>::device_mutable_view::back_to_back_cas(
-  iterator current_slot,
-  value_type const& insert_pair,
-  KeyEqual key_equal,
-  Key expected_key) noexcept
-{
-  using cuda::std::memory_order_relaxed;
-
-  auto expected_value = this->get_empty_value_sentinel();
-
-  // Back-to-back CAS for 8B/8B key/value pairs
-  auto& slot_key   = current_slot->first;
-  auto& slot_value = current_slot->second;
-
-  bool key_success =
-    slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed);
-  bool value_success =
-    slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed);
-
-  if (key_success) {
-    while (not value_success) {
-      value_success =
-        slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(),
-                                           insert_pair.second,
-                                           memory_order_relaxed);
-    }
-    return insert_result::SUCCESS;
-  } else if (value_success) {
-    slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed);
-  }
-
-  // our key was already present in the slot, so our key is a duplicate
-  if (key_equal(insert_pair.first, expected_key)) { return insert_result::DUPLICATE; }
-
-  return insert_result::CONTINUE;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename KeyEqual>
-__device__ static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert_result
-static_map<Key, Value, Scope, Allocator>::device_mutable_view::cas_dependent_write(
-  iterator current_slot,
-  value_type const& insert_pair,
-  KeyEqual key_equal,
-  Key expected_key) noexcept
-{
-  using cuda::std::memory_order_relaxed;
-
-  auto& slot_key = current_slot->first;
-
-  auto const key_success =
-    slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed);
-
-  if (key_success) {
-    auto& slot_value = current_slot->second;
-    slot_value.store(insert_pair.second, memory_order_relaxed);
-    return insert_result::SUCCESS;
-  }
-
-  // our key was already present in the slot, so our key is a duplicate
-  if (key_equal(insert_pair.first, expected_key)) { return insert_result::DUPLICATE; }
-
-  return insert_result::CONTINUE;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename Hash, typename KeyEqual>
-__device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert(
-  value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept
-{
-  auto current_slot{this->initial_slot(insert_pair.first, hash)};
-
-  while (true) {
-    key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-    // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the
-    // sentinel is not a valid key value. Therefore, first check for the sentinel
-    auto const slot_is_available =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or
-      cuco::detail::bitwise_compare(existing_key, this->get_erased_key_sentinel());
-
-    // the key we are trying to insert is already in the map, so we return with failure to insert
-    if (not slot_is_available and key_equal(existing_key, insert_pair.first)) { return false; }
-
-    if (slot_is_available) {
-      auto const status = [&]() {
-        // One single CAS operation if `value_type` is packable
-        if constexpr (cuco::detail::is_packable<value_type>()) {
-          return packed_cas(current_slot, insert_pair, key_equal, existing_key);
-        }
-
-        if constexpr (not cuco::detail::is_packable<value_type>()) {
-#if (__CUDA_ARCH__ < 700)
-          return cas_dependent_write(current_slot, insert_pair, key_equal, existing_key);
-#else
-          return back_to_back_cas(current_slot, insert_pair, key_equal, existing_key);
-#endif
-        }
-      }();
-
-      // successful insert
-      if (status == insert_result::SUCCESS) { return true; }
-      // duplicate present during insert
-      if (status == insert_result::DUPLICATE) { return false; }
-    }
-
-    // if we couldn't insert the key, but it wasn't a duplicate, then there must
-    // have been some other key there, so we keep looking for a slot
-    current_slot = this->next_slot(current_slot);
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename Hash, typename KeyEqual>
-__device__
-  cuda::std::pair<typename static_map<Key, Value, Scope, Allocator>::device_mutable_view::iterator,
-                  bool>
-  static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert_and_find(
-    value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept
-{
-#if __CUDA_ARCH__ < 700
-  // Spinning to ensure that the write to the value part took place requires
-  // independent thread scheduling introduced with the Volta architecture.
-  static_assert(cuco::detail::is_packable<value_type>(),
-                "insert_and_find is not supported for unpackable data on pre-Volta GPUs.");
-#endif
-
-  auto current_slot{this->initial_slot(insert_pair.first, hash)};
-
-  while (true) {
-    key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-    // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the
-    // sentinel is not a valid key value. Therefore, first check for the sentinel
-    auto const slot_is_available =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or
-      cuco::detail::bitwise_compare(existing_key, this->get_erased_key_sentinel());
-
-    // the key we are trying to insert is already in the map, so we return with failure to insert
-    if (not slot_is_available and key_equal(existing_key, insert_pair.first)) {
-      // If we cannot use a single CAS operation, ensure that the write to
-      // the value part also took place.
-      if constexpr (not cuco::detail::is_packable<value_type>()) {
-        auto& slot_value       = current_slot->second;
-        auto const empty_value = this->get_empty_value_sentinel();
-        while (cuco::detail::bitwise_compare(slot_value.load(cuda::std::memory_order_relaxed),
-                                             empty_value)) {
-          // spin
-        }
-      }
-
-      return cuda::std::pair{current_slot, false};
-    }
-
-    if (slot_is_available) {
-      auto const status = [&]() {
-        // One single CAS operation if `value_type` is packable
-        if constexpr (cuco::detail::is_packable<value_type>()) {
-          return packed_cas(current_slot, insert_pair, key_equal, existing_key);
-        }
-
-        if constexpr (not cuco::detail::is_packable<value_type>()) {
-          // Only use cas_dependent_write; for back_to_back_cas we cannot
-          // guarantee that we get a valid iterator: Consider the case of two
-          // threads inserting the same key, and one gets the key while the
-          // other gets the value. For a third thread, the entry looks valid,
-          // but the second thread will first reset the value to the empty
-          // sentinel to signal that the first thread can write its value.
-          // This ambiguity cannot be solved for the third thread, so we have
-          // to avoid it.
-          return cas_dependent_write(current_slot, insert_pair, key_equal, existing_key);
-        }
-      }();
-
-      // successful insert
-      if (status == insert_result::SUCCESS) {
-        // This thread did the insertion, so the iterator is guaranteed to be
-        // valid without any special care.
-        return cuda::std::pair{current_slot, true};
-      }
-      // duplicate present during insert
-      if (status == insert_result::DUPLICATE) {
-        // If we cannot use a single CAS operation, ensure that the write to
-        // the value part also took place.
-        if constexpr (not cuco::detail::is_packable<value_type>()) {
-          auto& slot_value       = current_slot->second;
-          auto const empty_value = this->get_empty_value_sentinel();
-          while (cuco::detail::bitwise_compare(slot_value.load(cuda::std::memory_order_relaxed),
-                                               empty_value)) {
-            // spin
-          }
-        }
-
-        return cuda::std::pair{current_slot, false};
-      }
-    }
-
-    // if we couldn't insert the key, but it wasn't a duplicate, then there must
-    // have been some other key there, so we keep looking for a slot
-    current_slot = this->next_slot(current_slot);
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename CG, typename Hash, typename KeyEqual>
-__device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert(
-  CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept
-{
-  auto current_slot = this->initial_slot(g, insert_pair.first, hash);
-
-  while (true) {
-    key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-
-    // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the
-    // sentinel is not a valid key value. Therefore, first check for the sentinel
-    auto const slot_is_available =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or
-      cuco::detail::bitwise_compare(existing_key, this->get_erased_key_sentinel());
-
-    // the key we are trying to insert is already in the map, so we return with failure to insert
-    if (g.any(not slot_is_available and key_equal(existing_key, insert_pair.first))) {
-      return false;
-    }
-
-    auto const bucket_contains_available = g.ballot(slot_is_available);
-
-    // we found an empty slot, but not the key we are inserting, so this must
-    // be an empty slot into which we can insert the key
-    if (bucket_contains_available) {
-      // the first lane in the group with an empty slot will attempt the insert
-      insert_result status{insert_result::CONTINUE};
-      uint32_t src_lane = __ffs(bucket_contains_available) - 1;
-
-      if (g.thread_rank() == src_lane) {
-        // One single CAS operation if `value_type` is packable
-        if constexpr (cuco::detail::is_packable<value_type>()) {
-          status = packed_cas(current_slot, insert_pair, key_equal, existing_key);
-        }
-        // Otherwise, two back-to-back CAS operations
-        else {
-#if (__CUDA_ARCH__ < 700)
-          status = cas_dependent_write(current_slot, insert_pair, key_equal, existing_key);
-#else
-          status = back_to_back_cas(current_slot, insert_pair, key_equal, existing_key);
-#endif
-        }
-      }
-
-      uint32_t res_status = g.shfl(static_cast<uint32_t>(status), src_lane);
-      status              = static_cast<insert_result>(res_status);
-
-      // successful insert
-      if (status == insert_result::SUCCESS) { return true; }
-      // duplicate present during insert
-      if (status == insert_result::DUPLICATE) { return false; }
-      // if we've gotten this far, a different key took our spot
-      // before we could insert. We need to retry the insert on the
-      // same bucket
-    }
-    // if there are no empty slots in the current bucket,
-    // we move onto the next bucket
-    else {
-      current_slot = this->next_slot(g, current_slot);
-    }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename Hash, typename KeyEqual>
-__device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::erase(
-  key_type const& k, Hash hash, KeyEqual key_equal) noexcept
-{
-  auto current_slot{this->initial_slot(k, hash)};
-  auto const init_slot = current_slot;
-
-  value_type const insert_pair =
-    make_pair<Key, Value>(this->get_erased_key_sentinel(), this->get_empty_value_sentinel());
-
-  while (true) {
-    static_assert(sizeof(Key) == sizeof(atomic_key_type));
-    static_assert(sizeof(Value) == sizeof(atomic_mapped_type));
-    // TODO: Replace reinterpret_cast with atomic ref when available.
-    value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-    auto existing_key        = slot_contents.first;
-    auto existing_value      = slot_contents.second;
-
-    // Key doesn't exist, return false
-    if (cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) {
-      return false;
-    }
-
-    // Key exists, return true if successfully deleted
-    if (key_equal(existing_key, k)) {
-      if constexpr (cuco::detail::is_packable<value_type>()) {
-        auto slot = reinterpret_cast<
-          cuda::atomic<typename cuco::detail::pair_converter<value_type>::packed_type>*>(
-          current_slot);
-        cuco::detail::pair_converter<value_type> expected_pair{
-          cuco::make_pair(existing_key, existing_value)};
-        cuco::detail::pair_converter<value_type> new_pair{insert_pair};
-
-        return slot->compare_exchange_strong(
-          expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed);
-      }
-      if constexpr (not cuco::detail::is_packable<value_type>()) {
-        current_slot->second.compare_exchange_strong(
-          existing_value, insert_pair.second, cuda::std::memory_order_relaxed);
-        return current_slot->first.compare_exchange_strong(
-          existing_key, insert_pair.first, cuda::std::memory_order_relaxed);
-      }
-    }
-
-    current_slot = this->next_slot(current_slot);
-    // if all keys in this map has been erased, return false
-    if (current_slot == init_slot) { return false; }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename CG, typename Hash, typename KeyEqual>
-__device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::erase(
-  CG g, key_type const& k, Hash hash, KeyEqual key_equal) noexcept
-{
-  auto current_slot    = this->initial_slot(g, k, hash);
-  auto const init_slot = current_slot;
-  value_type const insert_pair =
-    make_pair<Key, Value>(this->get_erased_key_sentinel(), this->get_empty_value_sentinel());
-
-  while (true) {
-    static_assert(sizeof(Key) == sizeof(atomic_key_type));
-    static_assert(sizeof(Value) == sizeof(atomic_mapped_type));
-    // TODO: Replace reinterpret_cast with atomic ref when available.
-    value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-    auto existing_key        = slot_contents.first;
-    auto existing_value      = slot_contents.second;
-
-    auto const slot_is_empty =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel());
-
-    auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k));
-
-    // Key exists, return true if successfully deleted
-    if (exists) {
-      uint32_t src_lane = __ffs(exists) - 1;
-
-      bool status;
-      if (g.thread_rank() == src_lane) {
-        if constexpr (cuco::detail::is_packable<value_type>()) {
-          auto slot = reinterpret_cast<
-            cuda::atomic<typename cuco::detail::pair_converter<value_type>::packed_type>*>(
-            current_slot);
-          cuco::detail::pair_converter<value_type> expected_pair{
-            cuco::make_pair(existing_key, existing_value)};
-          cuco::detail::pair_converter<value_type> new_pair{insert_pair};
-
-          status = slot->compare_exchange_strong(
-            expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed);
-        }
-        if constexpr (not cuco::detail::is_packable<value_type>()) {
-          current_slot->second.compare_exchange_strong(
-            existing_value, insert_pair.second, cuda::std::memory_order_relaxed);
-          status = current_slot->first.compare_exchange_strong(
-            existing_key, insert_pair.first, cuda::std::memory_order_relaxed);
-        }
-      }
-
-      uint32_t res_status = g.shfl(static_cast<uint32_t>(status), src_lane);
-      return static_cast<bool>(res_status);
-    }
-
-    // empty slot found, but key not found, must not be in the map
-    if (g.ballot(slot_is_empty)) { return false; }
-
-    current_slot = this->next_slot(g, current_slot);
-    if (current_slot == init_slot) { return false; }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename Hash, typename KeyEqual>
-__device__ typename static_map<Key, Value, Scope, Allocator>::device_view::iterator
-static_map<Key, Value, Scope, Allocator>::device_view::find(Key const& k,
-                                                            Hash hash,
-                                                            KeyEqual key_equal) noexcept
-{
-  auto current_slot    = this->initial_slot(k, hash);
-  auto const init_slot = current_slot;
-
-  while (true) {
-    auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-    // Key doesn't exist, return end()
-    if (cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) {
-      return this->end();
-    }
-
-    // Key exists, return iterator to location
-    if (key_equal(existing_key, k)) { return current_slot; }
-
-    current_slot = this->next_slot(current_slot);
-    if (current_slot == init_slot) { return this->end(); }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename Hash, typename KeyEqual>
-__device__ typename static_map<Key, Value, Scope, Allocator>::device_view::const_iterator
-static_map<Key, Value, Scope, Allocator>::device_view::find(Key const& k,
-                                                            Hash hash,
-                                                            KeyEqual key_equal) const noexcept
-{
-  auto current_slot    = this->initial_slot(k, hash);
-  auto const init_slot = current_slot;
-
-  while (true) {
-    auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-    // Key doesn't exist, return end()
-    if (cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) {
-      return this->end();
-    }
-
-    // Key exists, return iterator to location
-    if (key_equal(existing_key, k)) { return current_slot; }
-
-    current_slot = this->next_slot(current_slot);
-    if (current_slot == init_slot) { return this->end(); }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename CG, typename Hash, typename KeyEqual>
-__device__ typename static_map<Key, Value, Scope, Allocator>::device_view::iterator
-static_map<Key, Value, Scope, Allocator>::device_view::find(CG g,
-                                                            Key const& k,
-                                                            Hash hash,
-                                                            KeyEqual key_equal) noexcept
-{
-  auto current_slot    = this->initial_slot(g, k, hash);
-  auto const init_slot = current_slot;
-
-  while (true) {
-    auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-
-    // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as
-    // the sentinel is not a valid key value. Therefore, first check for the sentinel
-    auto const slot_is_empty =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel());
-
-    // the key we were searching for was found by one of the threads,
-    // so we return an iterator to the entry
-    auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k));
-    if (exists) {
-      uint32_t src_lane = __ffs(exists) - 1;
-      // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the
-      // current_slot and shuffle that instead.
-      intptr_t res_slot = g.shfl(reinterpret_cast<intptr_t>(current_slot), src_lane);
-      return reinterpret_cast<iterator>(res_slot);
-    }
-
-    // we found an empty slot, meaning that the key we're searching for isn't present
-    if (g.ballot(slot_is_empty)) { return this->end(); }
-
-    // otherwise, all slots in the current bucket are full with other keys, so we move onto the
-    // next bucket
-    current_slot = this->next_slot(g, current_slot);
-    if (current_slot == init_slot) { return this->end(); }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename CG, typename Hash, typename KeyEqual>
-__device__ typename static_map<Key, Value, Scope, Allocator>::device_view::const_iterator
-static_map<Key, Value, Scope, Allocator>::device_view::find(CG g,
-                                                            Key const& k,
-                                                            Hash hash,
-                                                            KeyEqual key_equal) const noexcept
-{
-  auto current_slot    = this->initial_slot(g, k, hash);
-  auto const init_slot = current_slot;
-
-  while (true) {
-    auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-
-    // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as
-    // the sentinel is not a valid key value. Therefore, first check for the sentinel
-    auto const slot_is_empty =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel());
-
-    // the key we were searching for was found by one of the threads, so we return an iterator to
-    // the entry
-    auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k));
-    if (exists) {
-      uint32_t src_lane = __ffs(exists) - 1;
-      // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the
-      // current_slot and shuffle that instead.
-      intptr_t res_slot = g.shfl(reinterpret_cast<intptr_t>(current_slot), src_lane);
-      return reinterpret_cast<const_iterator>(res_slot);
-    }
-
-    // we found an empty slot, meaning that the key we're searching
-    // for isn't in this submap, so we should move onto the next one
-    if (g.ballot(slot_is_empty)) { return this->end(); }
-
-    // otherwise, all slots in the current bucket are full with other keys,
-    // so we move onto the next bucket in the current submap
-
-    current_slot = this->next_slot(g, current_slot);
-    if (current_slot == init_slot) { return this->end(); }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename ProbeKey, typename Hash, typename KeyEqual>
-__device__ bool static_map<Key, Value, Scope, Allocator>::device_view::contains(
-  ProbeKey const& k, Hash hash, KeyEqual key_equal) const noexcept
-{
-  auto current_slot    = this->initial_slot(k, hash);
-  auto const init_slot = current_slot;
-
-  while (true) {
-    auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-
-    if (cuco::detail::bitwise_compare(existing_key, this->empty_key_sentinel_)) { return false; }
-
-    if (key_equal(existing_key, k)) { return true; }
-
-    current_slot = this->next_slot(current_slot);
-    if (current_slot == init_slot) { return false; }
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename CG, typename ProbeKey, typename Hash, typename KeyEqual>
-__device__ cuda::std::enable_if_t<std::is_invocable_v<KeyEqual, ProbeKey, Key>, bool>
-static_map<Key, Value, Scope, Allocator>::device_view::contains(CG g,
-                                                                ProbeKey const& k,
-                                                                Hash hash,
-                                                                KeyEqual key_equal) const noexcept
-{
-  auto current_slot    = this->initial_slot(g, k, hash);
-  auto const init_slot = current_slot;
-
-  while (true) {
-    key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
-
-    // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as
-    // the sentinel is not a valid key value. Therefore, first check for the sentinel
-    auto const slot_is_empty =
-      cuco::detail::bitwise_compare(existing_key, this->get_empty_key_sentinel());
-
-    // the key we were searching for was found by one of the threads, so we return an iterator to
-    // the entry
-    if (g.ballot(not slot_is_empty and key_equal(existing_key, k))) { return true; }
-
-    // we found an empty slot, meaning that the key we're searching for isn't present
-    if (g.ballot(slot_is_empty)) { return false; }
-
-    // otherwise, all slots in the current bucket are full with other keys, so we move onto the
-    // next bucket
-    current_slot = this->next_slot(g, current_slot);
-    if (current_slot == init_slot) { return false; }
-  }
-}
-}  // namespace cuco::legacy
diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh
deleted file mode 100644
index fd50f73df..000000000
--- a/include/cuco/detail/static_map_kernels.cuh
+++ /dev/null
@@ -1,586 +0,0 @@
-/*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuco/detail/utility/cuda.cuh>
-
-#include <cub/block/block_reduce.cuh>
-#include <cuda/std/atomic>
-
-#include <cooperative_groups.h>
-
-namespace cuco::legacy::detail {
-namespace cg = cooperative_groups;
-
-CUCO_SUPPRESS_KERNEL_WARNINGS
-/**
- * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`.
- *
- * Each space in `slots` that can hold a key value pair is initialized to a
- * `pair_atomic_type` containing the key `k` and the value `v`.
- *
- * @tparam atomic_key_type Type of the `Key` atomic container
- * @tparam atomic_mapped_type Type of the `Value` atomic container
- * @tparam Key key type
- * @tparam Value value type
- * @tparam pair_atomic_type key/value pair type
- *
- * @param slots Pointer to flat storage for the map's key/value pairs
- * @param k Key to which all keys in `slots` are initialized
- * @param v Value to which all values in `slots` are initialized
- * @param size Size of the storage pointed to by `slots`
- */
-template <std::size_t block_size,
-          typename atomic_key_type,
-          typename atomic_mapped_type,
-          typename Key,
-          typename Value,
-          typename pair_atomic_type>
-CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size)
-{
-  int64_t const loop_stride = gridDim.x * block_size;
-  int64_t idx               = block_size * blockIdx.x + threadIdx.x;
-  while (idx < size) {
-    new (&slots[idx].first) atomic_key_type{k};
-    new (&slots[idx].second) atomic_mapped_type{v};
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Inserts all key/value pairs in the range `[first, last)`.
- *
- * If multiple keys in `[first, last)` compare equal, it is unspecified which
- * element is inserted.
- *
- * @tparam block_size
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `value_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param n Number of the key/value pairs to insert
- * @param num_successes The number of successfully inserted key/value pairs
- * @param view Mutable device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function used to compare two keys for equality
- */
-template <std::size_t block_size,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void insert(
-  InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
-{
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  int64_t const loop_stride = gridDim.x * block_size;
-  int64_t idx               = block_size * blockIdx.x + threadIdx.x;
-
-  while (idx < n) {
-    typename viewT::value_type const insert_pair{*(first + idx)};
-    if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
-}
-
-/**
- * @brief Inserts all key/value pairs in the range `[first, last)`.
- *
- * If multiple keys in `[first, last)` compare equal, it is unspecified which
- * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups
- * of multiple threads to perform each key/value insertion. This provides a
- * significant boost in throughput compared to the non Cooperative Group
- * `insert` at moderate to high load factors.
- *
- * @tparam block_size
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform
- * inserts
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `value_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param n Number of the key/value pairs to insert
- * @param num_successes The number of successfully inserted key/value pairs
- * @param view Mutable device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function used to compare two keys for equality
- */
-template <std::size_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void insert(
-  InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
-{
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  while (idx < n) {
-    // force conversion to value_type
-    typename viewT::value_type const insert_pair{*(first + idx)};
-    if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) {
-      thread_num_successes++;
-    }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
-}
-
-/**
- * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
- * insertions.
- * Else, no effect.
- *
- * @tparam block_size The size of the thread block
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param num_successes The number of successfully erased key/value pairs
- * @param view Device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <std::size_t block_size,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void erase(
-  InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
-{
-  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  const int64_t loop_stride = gridDim.x * block_size;
-  int64_t idx               = block_size * blockIdx.x + threadIdx.x;
-
-  while (idx < n) {
-    if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
- * insertions.
- * Else, no effect.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param num_successes The number of successfully erased key/value pairs
- * @param view Device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <std::size_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void erase(
-  InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
-{
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  while (idx < n) {
-    if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) {
-      thread_num_successes++;
-    }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred` of the
- * corresponding stencil returns true.
- *
- * If multiple keys in `[first, last)` compare equal, it is unspecified which
- * element is inserted.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform insert
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `value_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam StencilIt Device accessible random access iterator whose value_type is
- * convertible to Predicate's argument type
- * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
- * and argument type is convertible from `std::iterator_traits<StencilIt>::value_type`
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param n Number of elements to insert
- * @param num_successes The number of successfully inserted key/value pairs
- * @param view Mutable device view used to access the hash map's slot storage
- * @param stencil Beginning of the stencil sequence
- * @param pred Predicate to test on every element in the range `[s, s + n)`
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function used to compare two keys for equality
- */
-template <std::size_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename StencilIt,
-          typename Predicate,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void insert_if_n(InputIt first,
-                             int64_t n,
-                             atomicT* num_successes,
-                             viewT view,
-                             StencilIt stencil,
-                             Predicate pred,
-                             Hash hash,
-                             KeyEqual key_equal)
-{
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  while (idx < n) {
-    if (pred(*(stencil + idx))) {
-      typename viewT::value_type const insert_pair{*(first + idx)};
-      if (view.insert(tile, insert_pair, hash, key_equal) and tile.thread_rank() == 0) {
-        thread_num_successes++;
-      }
-    }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Finds the values corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`.
- * Else, copies the empty value sentinel.
- * @tparam block_size The size of the thread block
- * @tparam Value The type of the mapped value for the map
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of keys to query
- * @param output_begin Beginning of the sequence of values retrieved for each key
- * @param view Device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <std::size_t block_size,
-          typename Value,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void find(
-  InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
-{
-  int64_t const loop_stride = gridDim.x * block_size;
-  int64_t idx               = block_size * blockIdx.x + threadIdx.x;
-  __shared__ Value writeBuffer[block_size];
-
-  while (idx < n) {
-    auto key   = *(first + idx);
-    auto found = view.find(key, hash, key_equal);
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    writeBuffer[threadIdx.x] = found == view.end()
-                                 ? view.get_empty_value_sentinel()
-                                 : found->second.load(cuda::std::memory_order_relaxed);
-    __syncthreads();
-    *(output_begin + idx) = writeBuffer[threadIdx.x];
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Finds the values corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`.
- * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups
- * of multiple threads to find each key. This provides a significant boost in throughput compared
- * to the non Cooperative Group `find` at moderate to high load factors.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform
- * inserts
- * @tparam Value The type of the mapped value for the map
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of keys to query
- * @param output_begin Beginning of the sequence of values retrieved for each key
- * @param view Device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <std::size_t block_size,
-          uint32_t tile_size,
-          typename Value,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void find(
-  InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress static_var_with_dynamic_init
-  // Get rid of a false-positive build warning with ARM
-  __shared__ Value writeBuffer[block_size / tile_size];
-#pragma nv_diagnostic pop
-
-  while (idx < n) {
-    auto key   = *(first + idx);
-    auto found = view.find(tile, key, hash, key_equal);
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    if (tile.thread_rank() == 0) {
-      writeBuffer[threadIdx.x / tile_size] =
-        found == view.end() ? view.get_empty_value_sentinel()
-                            : found->second.load(cuda::std::memory_order_relaxed);
-    }
-    __syncthreads();
-    if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; }
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
- *
- * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
- *
- * @tparam block_size The size of the thread block
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of keys to query
- * @param output_begin Beginning of the sequence of booleans for the presence of each key
- * @param view Device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <std::size_t block_size,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void contains(
-  InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
-{
-  int64_t const loop_stride = gridDim.x * block_size;
-  int64_t idx               = block_size * blockIdx.x + threadIdx.x;
-  __shared__ bool writeBuffer[block_size];
-
-  while (idx < n) {
-    auto key = *(first + idx);
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal);
-    __syncthreads();
-    *(output_begin + idx) = writeBuffer[threadIdx.x];
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
- *
- * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the
- * contains operation for each key. This provides a significant boost in throughput compared
- * to the non Cooperative Group `contains` at moderate to high load factors.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform
- * inserts
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of keys to query
- * @param output_begin Beginning of the sequence of booleans for the presence of each key
- * @param view Device view used to access the hash map's slot storage
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <std::size_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void contains(
-  InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-  __shared__ bool writeBuffer[block_size / tile_size];
-
-  while (idx < n) {
-    auto key   = *(first + idx);
-    auto found = view.contains(tile, key, hash, key_equal);
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; }
-    __syncthreads();
-    if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; }
-    idx += loop_stride;
-  }
-}
-
-}  // namespace cuco::legacy::detail
diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh
index f11f9bed9..d8afe8c13 100644
--- a/include/cuco/detail/utils.cuh
+++ b/include/cuco/detail/utils.cuh
@@ -16,12 +16,10 @@
 #pragma once
 
 #include <cuco/detail/__config>
-#include <cuco/detail/bitwise_compare.cuh>
 
 #include <cuda/std/array>
 #include <cuda/std/cmath>
 #include <cuda/std/cstdint>
-#include <cuda/std/tuple>
 #include <cuda/std/type_traits>
 
 namespace cuco {
@@ -37,53 +35,6 @@ __device__ __forceinline__ cuda::std::int32_t count_least_significant_bits(cuda:
   return __popc(x & (1 << n) - 1);
 }
 
-/**
- * @brief Converts pair to `cuda::std::tuple` to allow assigning to a zip iterator.
- *
- * @tparam Key The slot key type
- * @tparam Value The slot value type
- */
-template <typename Key, typename Value>
-struct slot_to_tuple {
-  /**
-   * @brief Converts a pair to a `cuda::std::tuple`.
-   *
-   * @tparam S The slot type
-   *
-   * @param s The slot to convert
-   * @return A cuda::std::tuple containing `s.first` and `s.second`
-   */
-  template <typename S>
-  __device__ cuda::std::tuple<Key, Value> operator()(S const& s)
-  {
-    return cuda::std::tuple<Key, Value>(s.first, s.second);
-  }
-};
-
-/**
- * @brief Device functor returning whether the input slot `s` is filled.
- *
- * @tparam Key The slot key type
- */
-template <typename Key>
-struct slot_is_filled {
-  Key empty_key_sentinel_;  ///< The value of the empty key sentinel
-
-  /**
-   * @brief Indicates if the target slot `s` is filled.
-   *
-   * @tparam S The slot type
-   *
-   * @param s The slot to query
-   * @return `true` if slot `s` is filled
-   */
-  template <typename S>
-  __device__ bool operator()(S const& s)
-  {
-    return not cuco::detail::bitwise_compare(cuda::std::get<0>(s), empty_key_sentinel_);
-  }
-};
-
 template <typename SizeType, typename HashType>
 __host__ __device__ constexpr SizeType to_positive(HashType hash)
 {
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index dde5b249e..b70fa593a 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -18,7 +18,6 @@
 
 #include <cuco/detail/__config>
 #include <cuco/detail/open_addressing/open_addressing_impl.cuh>
-#include <cuco/detail/static_map_kernels.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/pair.cuh>
 #include <cuco/static_map_ref.cuh>
@@ -1249,1394 +1248,6 @@ class static_map {
   mapped_type empty_value_sentinel_;  ///< Sentinel value that indicates an empty payload
 };
 
-template <class Key,
-          class T,
-          class Extent,
-          cuda::thread_scope Scope,
-          class KeyEqual,
-          class ProbingScheme,
-          class Allocator,
-          class Storage>
-class dynamic_map;
-
-namespace legacy {
-
-/**
- * @brief A GPU-accelerated, unordered, associative container of key-value
- * pairs with unique keys.
- *
- * Allows constant time concurrent inserts or concurrent find operations from threads in device
- * code. Concurrent insert and find are supported only if the pair type is packable (see
- * `cuco::detail::is_packable` constexpr).
- *
- * Current limitations:
- * - Requires keys and values that where `cuco::is_bitwise_comparable_v<T>` is true
- *    - Comparisons against the "sentinel" values will always be done with bitwise comparisons.
- * - Capacity is fixed and will not grow automatically
- * - Requires the user to specify sentinel values for both key and mapped value to indicate empty
- * slots
- * - Conditionally support concurrent insert and find operations
- *
- * The `static_map` supports two types of operations:
- * - Host-side "bulk" operations
- * - Device-side "singular" operations
- *
- * The host-side bulk operations include `insert`, `erase`, `find`, and `contains`. These
- * APIs should be used when there are a large number of keys to insert, erase or lookup
- * in the map. For example, given a range of keys specified by device-accessible
- * iterators, the bulk `insert` function will insert all keys into the map. Note that in order
- * for a `static_map` instance to support `erase`, the user must provide an `erased_key_sentinel`
- * which is distinct from the `empty_key_sentinel` at construction. If `erase` is called on a
- * `static_map` which was not constructed in this way, a runtime error will be generated.
- *
- * The singular device-side operations allow individual threads to perform
- * independent insert or find/contains operations from device code. These
- * operations are accessed through non-owning, trivially copyable "view" types:
- * `device_view` and `device_mutable_view`. The `device_view` class is an
- * immutable view that allows only non-modifying operations such as `find` or
- * `contains`. The `device_mutable_view` class only allows `insert` and `erase` operations.
- * The two types are separate to prevent erroneous concurrent insert/erase/find
- * operations. Note that the device-side `erase` may only be called if the corresponding
- * `device_mutable_view` was constructed with a user-provided `erased_key_sentinel`. It is
- * up to the user to ensure this condition is met.
- *
- * Example:
- * \code{.cpp}
- * int empty_key_sentinel = -1;
- * int empty_value_sentinel = -1;
- * int erased_key_sentinel = -2;
- *
- * // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value
- * // sentinels. The supplied erased key sentinel of -2 must be a different value from the empty
- * // key sentinel. If erase functionality is not needed, you may elect to not supply an erased
- * // key sentinel to the constructor. Note the capacity is chosen knowing we will insert 50,000
- * // keys, for an load factor of 50%.
- * static_map<int, int> m{100'000, empty_key_sentinel, empty_value_sentinel, erased_value_sentinel};
- *
- * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}}
- * thrust::device_vector<cuda::std::pair<int,int>> pairs(50,000);
- * thrust::transform(thrust::make_counting_iterator(0),
- *                   thrust::make_counting_iterator(pairs.size()),
- *                   pairs.begin(),
- *                   []__device__(auto i){ return cuco::pair{i,i}; };
- *
- *
- * // Inserts all pairs into the map
- * m.insert(pairs.begin(), pairs.end());
- *
- * // Get a `device_view` and passes it to a kernel where threads may perform
- * // `find/contains` lookups
- * kernel<<<...>>>(m.get_device_view());
- * \endcode
- *
- *
- * @tparam Key Arithmetic type used for key
- * @tparam Value Type of the mapped values
- * @tparam Scope The scope in which insert/find operations will be performed by
- * individual threads.
- * @tparam Allocator Type of allocator used for device storage
- */
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope = cuda::thread_scope_device,
-          typename Allocator       = cuco::cuda_allocator<char>>
-class static_map {
-  static_assert(
-    cuco::is_bitwise_comparable_v<Key>,
-    "Key type must have unique object representations or have been explicitly declared as safe for "
-    "bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Key>.");
-
-  static_assert(cuco::is_bitwise_comparable_v<Value>,
-                "Value type must have unique object representations or have been explicitly "
-                "declared as safe for bitwise comparison via specialization of "
-                "cuco::is_bitwise_comparable_v<Value>.");
-
- public:
-  using value_type         = cuco::pair<Key, Value>;            ///< Type of key/value pairs
-  using key_type           = Key;                               ///< Key type
-  using mapped_type        = Value;                             ///< Type of mapped values
-  using atomic_key_type    = cuda::atomic<key_type, Scope>;     ///< Type of atomic keys
-  using atomic_mapped_type = cuda::atomic<mapped_type, Scope>;  ///< Type of atomic mapped values
-  using pair_atomic_type =
-    cuco::pair<atomic_key_type,
-               atomic_mapped_type>;  ///< Pair type of atomic key and atomic mapped value
-  using slot_type           = pair_atomic_type;                  ///< Type of hash map slots
-  using atomic_ctr_type     = cuda::atomic<std::size_t, Scope>;  ///< Atomic counter type
-  using allocator_type      = Allocator;                         ///< Allocator type
-  using slot_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
-    pair_atomic_type>;  ///< Type of the allocator to (de)allocate slots
-  using counter_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
-    atomic_ctr_type>;  ///< Type of the allocator to (de)allocate atomic counters
-
-#if !defined(CUCO_HAS_INDEPENDENT_THREADS)
-  static_assert(atomic_key_type::is_always_lock_free,
-                "A key type larger than 8B is supported for only sm_70 and up.");
-  static_assert(atomic_mapped_type::is_always_lock_free,
-                "A value type larger than 8B is supported for only sm_70 and up.");
-#endif
-
-  static_map(static_map const&) = delete;
-  static_map(static_map&&)      = delete;
-
-  static_map& operator=(static_map const&) = delete;
-  static_map& operator=(static_map&&)      = delete;
-
-  /**
-   * @brief Indicates if concurrent insert/find is supported for the key/value types.
-   *
-   * @return Boolean indicating if concurrent insert/find is supported.
-   */
-  __host__ __device__ static constexpr bool supports_concurrent_insert_find() noexcept
-  {
-    return cuco::detail::is_packable<value_type>();
-  }
-
-  /**
-   * @brief Constructs a statically sized map with the specified number of slots
-   * and sentinel values.
-   *
-   * The capacity of the map is fixed. Insert operations will not automatically
-   * grow the map. Attempting to insert equal to or more unique keys than the capacity
-   * of the map results in undefined behavior (there should be at least one empty slot).
-   *
-   * Performance begins to degrade significantly beyond a load factor of ~70%.
-   * For best performance, choose a capacity that will keep the load factor
-   * below 70%. E.g., if inserting `N` unique keys, choose a capacity of
-   * `N * (1/0.7)`.
-   *
-   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
-   * undefined behavior results from attempting to insert any key/value pair
-   * that contains either.
-   *
-   * @param capacity The total number of slots in the map
-   * @param empty_key_sentinel The reserved key value for empty slots
-   * @param empty_value_sentinel The reserved mapped value for empty slots
-   * @param alloc Allocator used for allocating device storage
-   * @param stream Stream used for executing the kernels
-   */
-  static_map(std::size_t capacity,
-             empty_key<Key> empty_key_sentinel,
-             empty_value<Value> empty_value_sentinel,
-             Allocator const& alloc = Allocator{},
-             cudaStream_t stream    = 0);
-
-  /**
-   * @brief Constructs a fixed-size map with erase capability.
-   * empty_key_sentinel and erased_key_sentinel must be different values.
-   *
-   * @throw std::runtime error if the empty key sentinel and erased key sentinel
-   * are the same value
-   *
-   * @param capacity The total number of slots in the map
-   * @param empty_key_sentinel The reserved key value for empty slots
-   * @param empty_value_sentinel The reserved mapped value for empty slots
-   * @param erased_key_sentinel The reserved value to denote erased slots
-   * @param alloc Allocator used for allocating device storage
-   * @param stream Stream used for executing the kernels
-   */
-  static_map(std::size_t capacity,
-             empty_key<Key> empty_key_sentinel,
-             empty_value<Value> empty_value_sentinel,
-             erased_key<Key> erased_key_sentinel,
-             Allocator const& alloc = Allocator{},
-             cudaStream_t stream    = 0);
-
-  /**
-   * @brief Destroys the map and frees its contents.
-   *
-   */
-  ~static_map();
-
-  /**
-   * @brief Inserts all key/value pairs in the range `[first, last)`.
-   *
-   * This function synchronizes `stream`.
-   *
-   * If multiple keys in `[first, last)` compare equal, it is unspecified which
-   * element is inserted.
-   *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `value_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
-   * @param stream Stream used for executing the kernels
-   */
-  template <typename InputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
-  void insert(InputIt first,
-              InputIt last,
-              Hash hash           = Hash{},
-              KeyEqual key_equal  = KeyEqual{},
-              cudaStream_t stream = 0);
-
-  /**
-   * @brief Inserts key/value pairs in the range `[first, last)` if `pred`
-   * of the corresponding stencil returns true.
-   *
-   * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
-   *
-   * @tparam InputIt Device accessible random access iterator whose `value_type` is
-   * convertible to the map's `value_type`
-   * @tparam StencilIt Device accessible random access iterator whose value_type is
-   * convertible to Predicate's argument type
-   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
-   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param stencil Beginning of the stencil sequence
-   * @param pred Predicate to test on every element in the range `[stencil, stencil +
-   * std::distance(first, last))`
-   * @param hash The unary function to hash each key
-   * @param key_equal The binary function to compare two keys for equality
-   * @param stream CUDA stream used for insert
-   */
-  template <typename InputIt,
-            typename StencilIt,
-            typename Predicate,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
-  void insert_if(InputIt first,
-                 InputIt last,
-                 StencilIt stencil,
-                 Predicate pred,
-                 Hash hash           = Hash{},
-                 KeyEqual key_equal  = KeyEqual{},
-                 cudaStream_t stream = 0);
-
-  /**
-   * @brief Erases keys in the range `[first, last)`.
-   *
-   * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's
-   * associated value from the map. Else, no effect.
-   *
-   *  Side-effects:
-   *  - `contains(k) == false`
-   *  - `find(k) == end()`
-   *  - `insert({k,v}) == true`
-   *  - `get_size()` is reduced by the total number of erased keys
-   *
-   * This function synchronizes `stream`.
-   *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `value_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
-   * @param stream Stream used for executing the kernels
-   *
-   * @throw std::runtime_error if a unique erased key sentinel value was not
-   * provided at construction
-   */
-  template <typename InputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
-  void erase(InputIt first,
-             InputIt last,
-             Hash hash           = Hash{},
-             KeyEqual key_equal  = KeyEqual{},
-             cudaStream_t stream = 0);
-
-  /**
-   * @brief Finds the values corresponding to all keys in the range `[first, last)`.
-   *
-   * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin +
-   * i)`. Else, copies the empty value sentinel.
-   *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `key_type`
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * convertible to the map's `mapped_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param output_begin Beginning of the sequence of values retrieved for each key
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
-   * @param stream Stream used for executing the kernels
-   */
-  template <typename InputIt,
-            typename OutputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
-  void find(InputIt first,
-            InputIt last,
-            OutputIt output_begin,
-            Hash hash           = Hash{},
-            KeyEqual key_equal  = KeyEqual{},
-            cudaStream_t stream = 0);
-
-  /**
-   * @brief Retrieves all of the keys and their associated values.
-   *
-   * The order in which keys are returned is implementation defined and not guaranteed to be
-   * consistent between subsequent calls to `retrieve_all`.
-   *
-   * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than
-   * `get_size()`
-   *
-   * @tparam KeyOut Device accessible random access output iterator whose `value_type` is
-   * convertible from `key_type`.
-   * @tparam ValueOut Device accessible random access output iterator whose `value_type` is
-   * convertible from `mapped_type`.
-   * @param keys_out Beginning output iterator for keys
-   * @param values_out Beginning output iterator for values
-   * @param stream CUDA stream used for this operation
-   * @return Pair of iterators indicating the last elements in the output
-   */
-  template <typename KeyOut, typename ValueOut>
-  std::pair<KeyOut, ValueOut> retrieve_all(KeyOut keys_out,
-                                           ValueOut values_out,
-                                           cudaStream_t stream = 0) const;
-
-  /**
-   * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
-   *
-   * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
-   *
-   * Hash should be callable with both <tt>std::iterator_traits<InputIt>::value_type</tt> and Key
-   * type. <tt>std::invoke_result<KeyEqual, std::iterator_traits<InputIt>::value_type, Key></tt>
-   * must be well-formed.
-   *
-   * @tparam InputIt Device accessible input iterator
-   * @tparam OutputIt Device accessible output iterator assignable from `bool`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
-   *
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param output_begin Beginning of the sequence of booleans for the presence of each key
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
-   * @param stream Stream used for executing the kernels
-   */
-  template <typename InputIt,
-            typename OutputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
-  void contains(InputIt first,
-                InputIt last,
-                OutputIt output_begin,
-                Hash hash           = Hash{},
-                KeyEqual key_equal  = KeyEqual{},
-                cudaStream_t stream = 0) const;
-
- private:
-  class device_view_base {
-   protected:
-    // Import member type definitions from `static_map`
-    using value_type     = value_type;
-    using key_type       = Key;
-    using mapped_type    = Value;
-    using iterator       = pair_atomic_type*;
-    using const_iterator = pair_atomic_type const*;
-    using slot_type      = slot_type;
-
-    Key empty_key_sentinel_{};      ///< Key value that represents an empty slot
-    Key erased_key_sentinel_{};     ///< Key value that represents an erased slot
-    Value empty_value_sentinel_{};  ///< Initial Value of empty slot
-    pair_atomic_type* slots_{};     ///< Pointer to flat slots storage
-    std::size_t capacity_{};        ///< Total number of slots
-
-    __host__ __device__ device_view_base(pair_atomic_type* slots,
-                                         std::size_t capacity,
-                                         empty_key<Key> empty_key_sentinel,
-                                         empty_value<Value> empty_value_sentinel) noexcept
-      : slots_{slots},
-        capacity_{capacity},
-        empty_key_sentinel_{empty_key_sentinel.value},
-        erased_key_sentinel_{empty_key_sentinel.value},
-        empty_value_sentinel_{empty_value_sentinel.value}
-    {
-    }
-
-    __host__ __device__ device_view_base(pair_atomic_type* slots,
-                                         std::size_t capacity,
-                                         empty_key<Key> empty_key_sentinel,
-                                         empty_value<Value> empty_value_sentinel,
-                                         erased_key<Key> erased_key_sentinel) noexcept
-      : slots_{slots},
-        capacity_{capacity},
-        empty_key_sentinel_{empty_key_sentinel.value},
-        erased_key_sentinel_{erased_key_sentinel.value},
-        empty_value_sentinel_{empty_value_sentinel.value}
-    {
-    }
-
-    /**
-     * @brief Returns the initial slot for a given key `k`
-     *
-     * @tparam ProbeKey Probe key type
-     * @tparam Hash Unary callable type
-     *
-     * @param k The key to get the slot for
-     * @param hash The unary callable used to hash the key
-     * @return Pointer to the initial slot for `k`
-     */
-    template <typename ProbeKey, typename Hash>
-    __device__ iterator initial_slot(ProbeKey const& k, Hash hash) noexcept
-    {
-      return &slots_[hash(k) % capacity_];
-    }
-
-    /**
-     * @brief Returns the initial slot for a given key `k`
-     *
-     * @tparam ProbeKey Probe key type
-     * @tparam Hash Unary callable type
-     *
-     * @param k The key to get the slot for
-     * @param hash The unary callable used to hash the key
-     * @return Pointer to the initial slot for `k`
-     */
-    template <typename ProbeKey, typename Hash>
-    __device__ const_iterator initial_slot(ProbeKey const& k, Hash hash) const noexcept
-    {
-      return &slots_[hash(k) % capacity_];
-    }
-
-    /**
-     * @brief Returns the initial slot for a given key `k`
-     *
-     * To be used for Cooperative Group based probing.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam ProbeKey Probe key type
-     * @tparam Hash Unary callable type
-     *
-     * @param g the Cooperative Group for which the initial slot is needed
-     * @param k The key to get the slot for
-     * @param hash The unary callable used to hash the key
-     * @return Pointer to the initial slot for `k`
-     */
-    template <typename CG, typename ProbeKey, typename Hash>
-    __device__ iterator initial_slot(CG g, ProbeKey const& k, Hash hash) noexcept
-    {
-      return &slots_[(hash(k) + g.thread_rank()) % capacity_];
-    }
-
-    /**
-     * @brief Returns the initial slot for a given key `k`
-     *
-     * To be used for Cooperative Group based probing.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam ProbeKey Probe key type
-     * @tparam Hash Unary callable type
-     *
-     * @param g the Cooperative Group for which the initial slot is needed
-     * @param k The key to get the slot for
-     * @param hash The unary callable used to hash the key
-     * @return Pointer to the initial slot for `k`
-     */
-    template <typename CG, typename ProbeKey, typename Hash>
-    __device__ const_iterator initial_slot(CG g, ProbeKey const& k, Hash hash) const noexcept
-    {
-      return &slots_[(hash(k) + g.thread_rank()) % capacity_];
-    }
-
-    /**
-     * @brief Given a slot `s`, returns the next slot.
-     *
-     * If `s` is the last slot, wraps back around to the first slot.
-     *
-     * @param s The slot to advance
-     * @return The next slot after `s`
-     */
-    __device__ iterator next_slot(iterator s) noexcept { return (++s < end()) ? s : begin_slot(); }
-
-    /**
-     * @brief Given a slot `s`, returns the next slot.
-     *
-     * If `s` is the last slot, wraps back around to the first slot.
-     *
-     * @param s The slot to advance
-     * @return The next slot after `s`
-     */
-    __device__ const_iterator next_slot(const_iterator s) const noexcept
-    {
-      return (++s < end()) ? s : begin_slot();
-    }
-
-    /**
-     * @brief Given a slot `s`, returns the next slot.
-     *
-     * If `s` is the last slot, wraps back around to the first slot. To
-     * be used for Cooperative Group based probing.
-     *
-     * @tparam CG The Cooperative Group type
-     * @param g The Cooperative Group for which the next slot is needed
-     * @param s The slot to advance
-     * @return The next slot after `s`
-     */
-    template <typename CG>
-    __device__ iterator next_slot(CG g, iterator s) noexcept
-    {
-      uint32_t index = s - slots_;
-      return &slots_[(index + g.size()) % capacity_];
-    }
-
-    /**
-     * @brief Given a slot `s`, returns the next slot.
-     *
-     * If `s` is the last slot, wraps back around to the first slot. To
-     * be used for Cooperative Group based probing.
-     *
-     * @tparam CG The Cooperative Group type
-     * @param g The Cooperative Group for which the next slot is needed
-     * @param s The slot to advance
-     * @return The next slot after `s`
-     */
-    template <typename CG>
-    __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept
-    {
-      uint32_t index = s - slots_;
-      return &slots_[(index + g.size()) % capacity_];
-    }
-
-    /**
-     * @brief Initializes the given array of slots to the specified values given by `k` and `v`
-     * using the threads in the group `g`.
-     *
-     * @note This function synchronizes the group `g`.
-     *
-     * @tparam CG The type of the cooperative thread group
-     * @param g The cooperative thread group used to initialize the slots
-     * @param slots Pointer to the array of slots to initialize
-     * @param num_slots Number of slots to initialize
-     * @param k The desired key value for each slot
-     * @param v The desired mapped value for each slot
-     */
-
-    template <typename CG>
-    __device__ static void initialize_slots(
-      CG g, pair_atomic_type* slots, std::size_t num_slots, Key k, Value v)
-    {
-      auto tid = g.thread_rank();
-      while (tid < num_slots) {
-        new (&slots[tid].first) atomic_key_type{k};
-        new (&slots[tid].second) atomic_mapped_type{v};
-        tid += g.size();
-      }
-      g.sync();
-    }
-
-   public:
-    /**
-     * @brief Gets slots array.
-     *
-     * @return Slots array
-     */
-    __host__ __device__ pair_atomic_type* get_slots() noexcept { return slots_; }
-
-    /**
-     * @brief Gets slots array.
-     *
-     * @return Slots array
-     */
-    __host__ __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; }
-
-    /**
-     * @brief Gets the maximum number of elements the hash map can hold.
-     *
-     * @return The maximum number of elements the hash map can hold
-     */
-    __host__ __device__ std::size_t get_capacity() const noexcept { return capacity_; }
-
-    /**
-     * @brief Gets the sentinel value used to represent an empty key slot.
-     *
-     * @return The sentinel value used to represent an empty key slot
-     */
-    __host__ __device__ Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; }
-
-    /**
-     * @brief Gets the sentinel value used to represent an empty value slot.
-     *
-     * @return The sentinel value used to represent an empty value slot
-     */
-    __host__ __device__ Value get_empty_value_sentinel() const noexcept
-    {
-      return empty_value_sentinel_;
-    }
-
-    __host__ __device__ Key get_erased_key_sentinel() const noexcept
-    {
-      return erased_key_sentinel_;
-    }
-
-    /**
-     * @brief Returns iterator to the first slot.
-     *
-     * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first
-     * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage.
-     * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those
-     * both empty and filled.
-     *
-     * There is no `begin()` iterator to avoid confusion as it is not possible to provide an
-     * iterator over only the filled slots.
-     *
-     * @return Iterator to the first slot
-     */
-    __device__ iterator begin_slot() noexcept { return slots_; }
-
-    /**
-     * @brief Returns iterator to the first slot.
-     *
-     * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first
-     * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage.
-     * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those
-     * both empty and filled.
-     *
-     * There is no `begin()` iterator to avoid confusion as it is not possible to provide an
-     * iterator over only the filled slots.
-     *
-     * @return Iterator to the first slot
-     */
-    __device__ const_iterator begin_slot() const noexcept { return slots_; }
-
-    /**
-     * @brief Returns a const_iterator to one past the last slot.
-     *
-     * @return A const_iterator to one past the last slot
-     */
-    __host__ __device__ const_iterator end_slot() const noexcept { return slots_ + capacity_; }
-
-    /**
-     * @brief Returns an iterator to one past the last slot.
-     *
-     * @return An iterator to one past the last slot
-     */
-    __host__ __device__ iterator end_slot() noexcept { return slots_ + capacity_; }
-
-    /**
-     * @brief Returns a const_iterator to one past the last slot.
-     *
-     * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking
-     * an iterator returned from `find()` against the `end()` iterator.
-     *
-     * @return A const_iterator to one past the last slot
-     */
-    __host__ __device__ const_iterator end() const noexcept { return end_slot(); }
-
-    /**
-     * @brief Returns an iterator to one past the last slot.
-     *
-     * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking
-     * an iterator returned from `find()` against the `end()` iterator.
-     *
-     * @return An iterator to one past the last slot
-     */
-    __host__ __device__ iterator end() noexcept { return end_slot(); }
-  };
-
- public:
-  /**
-   * @brief Mutable, non-owning view-type that may be used in device code to
-   * perform singular inserts into the map.
-   *
-   * `device_mutable_view` is trivially-copyable and is intended to be passed by
-   * value.
-   *
-   * Example:
-   * \code{.cpp}
-   * cuco::static_map<int,int> m{100'000, -1, -1};
-   *
-   * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}}
-   * thrust::for_each(thrust::make_counting_iterator(0),
-   *                  thrust::make_counting_iterator(50'000),
-   *                  [map = m.get_device_mutable_view()]
-   *                  __device__ (auto i) mutable {
-   *                     map.insert(cuco::pair{i,i});
-   *                  });
-   * \endcode
-   */
-  class device_mutable_view : public device_view_base {
-   public:
-    using value_type  = typename device_view_base::value_type;   ///< Type of key/value pairs
-    using key_type    = typename device_view_base::key_type;     ///< Key type
-    using mapped_type = typename device_view_base::mapped_type;  ///< Type of the mapped values
-    using iterator =
-      typename device_view_base::iterator;  ///< Type of the forward iterator to `value_type`
-    using const_iterator =
-      typename device_view_base::const_iterator;  ///< Type of the forward iterator to `const
-                                                  ///< value_type`
-    using slot_type = typename device_view_base::slot_type;  ///< Type of hash map slots
-
-    /**
-     * @brief Constructs a mutable view of the first `capacity` slots of the
-     * slots array pointed to by `slots`.
-     *
-     * @param slots Pointer to beginning of initialized slots array
-     * @param capacity The number of slots viewed by this object
-     * @param empty_key_sentinel The reserved value for keys to represent empty slots
-     * @param empty_value_sentinel The reserved value for mapped values to
-     * represent empty slots
-     */
-    __host__ __device__ device_mutable_view(pair_atomic_type* slots,
-                                            std::size_t capacity,
-                                            empty_key<Key> empty_key_sentinel,
-                                            empty_value<Value> empty_value_sentinel) noexcept
-      : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel}
-    {
-    }
-
-    /**
-     * @brief Constructs a mutable view of the first `capacity` slots of the
-     * slots array pointed to by `slots`.
-     *
-     * @param slots Pointer to beginning of initialized slots array
-     * @param capacity The number of slots viewed by this object
-     * @param empty_key_sentinel The reserved value for keys to represent empty slots
-     * @param empty_value_sentinel The reserved value for mapped values to represent empty slots
-     * @param erased_key_sentinel The reserved value for keys to represent erased slots
-     */
-    __host__ __device__ device_mutable_view(pair_atomic_type* slots,
-                                            std::size_t capacity,
-                                            empty_key<Key> empty_key_sentinel,
-                                            empty_value<Value> empty_value_sentinel,
-                                            erased_key<Key> erased_key_sentinel) noexcept
-      : device_view_base{
-          slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel}
-    {
-    }
-
-   private:
-    /**
-     * @brief Enumeration of the possible results of attempting to insert into a hash slot.
-     */
-    enum class insert_result {
-      CONTINUE,  ///< Insert did not succeed, continue trying to insert
-      SUCCESS,   ///< New pair inserted successfully
-      DUPLICATE  ///< Insert did not succeed, key is already present
-    };
-
-    /**
-     * @brief Inserts the specified key/value pair with one single CAS operation.
-     *
-     * @tparam KeyEqual Binary callable type
-     * @param current_slot The slot to insert
-     * @param insert_pair The pair to insert
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @param expected_key The expected value of the key in the target slot
-     * @return An insert result from the `insert_resullt` enumeration.
-     */
-    template <typename KeyEqual>
-    __device__ insert_result packed_cas(iterator current_slot,
-                                        value_type const& insert_pair,
-                                        KeyEqual key_equal,
-                                        Key expected_key) noexcept;
-
-    /**
-     * @brief Inserts the specified key/value pair with two back-to-back CAS operations.
-     *
-     * @tparam KeyEqual Binary callable type
-     * @param current_slot The slot to insert
-     * @param insert_pair The pair to insert
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @param expected_key The expected value of the key in the target slot
-     * @return An insert result from the `insert_resullt` enumeration.
-     */
-    template <typename KeyEqual>
-    __device__ insert_result back_to_back_cas(iterator current_slot,
-                                              value_type const& insert_pair,
-                                              KeyEqual key_equal,
-                                              Key expected_key) noexcept;
-
-    /**
-     * @brief Inserts the specified key/value pair with a CAS of the key and a dependent write of
-     * the value.
-     *
-     * @tparam KeyEqual Binary callable type
-     * @param current_slot The slot to insert
-     * @param insert_pair The pair to insert
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @param expected_key The expected value of the key in the target slot
-     * @return An insert result from the `insert_resullt` enumeration.
-     */
-    template <typename KeyEqual>
-    __device__ insert_result cas_dependent_write(iterator current_slot,
-                                                 value_type const& insert_pair,
-                                                 KeyEqual key_equal,
-                                                 Key expected_key) noexcept;
-
-   public:
-    /**
-     * @brief Given a slot pointer `slots`, initializes the first `capacity` slots with the given
-     * sentinel values and returns a `device_mutable_view` object of those slots.
-     *
-     * @tparam CG The type of the cooperative thread group
-     *
-     * @param g The cooperative thread group used to copy the slots
-     * @param slots Pointer to the hash map slots
-     * @param capacity The total number of slots in the map
-     * @param empty_key_sentinel The reserved value for keys to represent empty slots
-     * @param empty_value_sentinel The reserved value for mapped values to represent empty slots
-     * @return A device_mutable_view object based on the given parameters
-     */
-    template <typename CG>
-    __device__ static device_mutable_view make_from_uninitialized_slots(
-      CG g,
-      pair_atomic_type* slots,
-      std::size_t capacity,
-      empty_key<Key> empty_key_sentinel,
-      empty_value<Value> empty_value_sentinel) noexcept
-    {
-      device_view_base::initialize_slots(
-        g, slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value);
-      return device_mutable_view{slots,
-                                 capacity,
-                                 empty_key_sentinel,
-                                 empty_value_sentinel,
-                                 erased_key<Key>{empty_key_sentinel.value}};
-    }
-
-    /**
-     * @brief Given a slot pointer `slots`, initializes the first `capacity` slots with the given
-     * sentinel values and returns a `device_mutable_view` object of those slots.
-     *
-     * @tparam CG The type of the cooperative thread group
-     *
-     * @param g The cooperative thread group used to copy the slots
-     * @param slots Pointer to the hash map slots
-     * @param capacity The total number of slots in the map
-     * @param empty_key_sentinel The reserved value for keys to represent empty slots
-     * @param empty_value_sentinel The reserved value for mapped values to represent empty slots
-     * @param erased_key_sentinel The reserved value for keys to represent erased slots
-     * @return A device_mutable_view object based on the given parameters
-     */
-    template <typename CG>
-    __device__ static device_mutable_view make_from_uninitialized_slots(
-      CG g,
-      pair_atomic_type* slots,
-      std::size_t capacity,
-      empty_key<Key> empty_key_sentinel,
-      empty_value<Value> empty_value_sentinel,
-      erased_key<Key> erased_key_sentinel) noexcept
-    {
-      device_view_base::initialize_slots(
-        g, slots, capacity, empty_key_sentinel, empty_value_sentinel);
-      return device_mutable_view{
-        slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel};
-    }
-
-    /**
-     * @brief Inserts the specified key/value pair into the map.
-     *
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     * @param insert_pair The pair to insert
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @return `true` if the insert was successful, `false` otherwise.
-     */
-    template <typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool insert(value_type const& insert_pair,
-                           Hash hash          = Hash{},
-                           KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Inserts the specified key/value pair into the map.
-     *
-     * Returns a pair consisting of an iterator to the inserted element (or to
-     * the element that prevented the insertion) and a `bool` denoting whether
-     * the insertion took place.
-     *
-     * Note: In order to guarantee the validity of the returned iterator,
-     * `insert_and_find` may be less efficient than `insert` in some situations.
-     * Prefer using `insert` unless the returned iterator is required.
-     *
-     * Note: `insert_and_find` may only be used concurrently with `insert`,
-     * `find`, and `erase` when `supports_concurrent_insert_find()` returns
-     * true.
-     *
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     *
-     * @param insert_pair The pair to insert
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @return a pair consisting of an iterator to the element and a bool,
-     * either `true` if the insert was successful, `false` otherwise.
-     */
-    template <typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ cuda::std::pair<iterator, bool> insert_and_find(
-      value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Inserts the specified key/value pair into the map.
-     *
-     * Uses the CUDA Cooperative Groups API to to leverage multiple threads to
-     * perform a single insert. This provides a significant boost in throughput
-     * compared to the non Cooperative Group `insert` at moderate to high load
-     * factors.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     *
-     * @param g The Cooperative Group that performs the insert
-     * @param insert_pair The pair to insert
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @return `true` if the insert was successful, `false` otherwise.
-     */
-    template <typename CG,
-              typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool insert(CG g,
-                           value_type const& insert_pair,
-                           Hash hash          = Hash{},
-                           KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Erases the specified key across the map.
-     *
-     * Behavior is undefined if `empty_key_sentinel_` equals to `erased_key_sentinel_`.
-     *
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     *
-     * @param k The key to be erased
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @return `true` if the erasure was successful, `false` otherwise.
-     */
-    template <typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool erase(key_type const& k,
-                          Hash hash          = Hash{},
-                          KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Erases the specified key across the map.
-     *
-     * Behavior is undefined if `empty_key_sentinel_` equals to `erased_key_sentinel_`.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     *
-     * @param g The Cooperative Group that performs the erasure
-     * @param k The key to be erased
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys for
-     * equality
-     * @return `true` if the erasure was successful, `false` otherwise.
-     */
-    template <typename CG,
-              typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool erase(CG g,
-                          key_type const& k,
-                          Hash hash          = Hash{},
-                          KeyEqual key_equal = KeyEqual{}) noexcept;
-
-  };  // class device mutable view
-
-  /**
-   * @brief Non-owning view-type that may be used in device code to
-   * perform singular find and contains operations for the map.
-   *
-   * `device_view` is trivially-copyable and is intended to be passed by
-   * value.
-   *
-   */
-  class device_view : public device_view_base {
-   public:
-    using value_type  = typename device_view_base::value_type;   ///< Type of key/value pairs
-    using key_type    = typename device_view_base::key_type;     ///< Key type
-    using mapped_type = typename device_view_base::mapped_type;  ///< Type of the mapped values
-    using iterator =
-      typename device_view_base::iterator;  ///< Type of the forward iterator to `value_type`
-    using const_iterator =
-      typename device_view_base::const_iterator;  ///< Type of the forward iterator to `const
-                                                  ///< value_type`
-    using slot_type = typename device_view_base::slot_type;  ///< Type of hash map slots
-
-    /**
-     * @brief Construct a view of the first `capacity` slots of the
-     * slots array pointed to by `slots`.
-     *
-     * @param slots Pointer to beginning of initialized slots array
-     * @param capacity The number of slots viewed by this object
-     * @param empty_key_sentinel The reserved value for keys to represent empty slots
-     * @param empty_value_sentinel The reserved value for mapped values to represent empty slots
-     */
-    __host__ __device__ device_view(pair_atomic_type* slots,
-                                    std::size_t capacity,
-                                    empty_key<Key> empty_key_sentinel,
-                                    empty_value<Value> empty_value_sentinel) noexcept
-      : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel}
-    {
-    }
-
-    /**
-     * @brief Construct a view of the first `capacity` slots of the
-     * slots array pointed to by `slots`.
-     *
-     * @param slots Pointer to beginning of initialized slots array
-     * @param capacity The number of slots viewed by this object
-     * @param empty_key_sentinel The reserved value for keys to represent empty slots
-     * @param empty_value_sentinel The reserved value for mapped values to represent empty slots
-     * @param erased_key_sentinel The reserved value for keys to represent erased slots
-     */
-    __host__ __device__ device_view(pair_atomic_type* slots,
-                                    std::size_t capacity,
-                                    empty_key<Key> empty_key_sentinel,
-                                    empty_value<Value> empty_value_sentinel,
-                                    erased_key<Key> erased_key_sentinel) noexcept
-      : device_view_base{
-          slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel}
-    {
-    }
-
-    /**
-     * @brief Construct a `device_view` from a `device_mutable_view` object
-     *
-     * @param mutable_map object of type `device_mutable_view`
-     */
-    __host__ __device__ explicit device_view(device_mutable_view mutable_map)
-      : device_view_base{mutable_map.get_slots(),
-                         mutable_map.get_capacity(),
-                         empty_key<Key>{mutable_map.get_empty_key_sentinel()},
-                         empty_value<Value>{mutable_map.get_empty_value_sentinel()},
-                         erased_key<Key>{mutable_map.get_erased_key_sentinel()}}
-    {
-    }
-
-    /**
-     * @brief Makes a copy of given `device_view` using non-owned memory.
-     *
-     * This function is intended to be used to create shared memory copies of small static maps,
-     * although global memory can be used as well.
-     *
-     * Example:
-     * @code{.cpp}
-     * template <typename MapType, int CAPACITY>
-     * __global__ void use_device_view(const typename MapType::device_view device_view,
-     *                                 map_key_t const* const keys_to_search,
-     *                                 map_value_t* const values_found,
-     *                                 const size_t number_of_elements)
-     * {
-     *     const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
-     *
-     *     __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY];
-     *
-     *     auto g = cg::this_thread_block();
-     *
-     *     const map_t::device_view sm_static_map = device_view.make_copy(g,
-     *                                                                    sm_buffer);
-     *
-     *     for (size_t i = g.thread_rank(); i < number_of_elements; i += g.size())
-     *     {
-     *         values_found[i] = sm_static_map.find(keys_to_search[i])->second;
-     *     }
-     * }
-     * @endcode
-     *
-     * @tparam CG The type of the cooperative thread group
-     * @param g The cooperative thread group used to copy the slots
-     * @param source_device_view `device_view` to copy from
-     * @param memory_to_use Array large enough to support `capacity` elements. Object does not take
-     * the ownership of the memory
-     * @return Copy of passed `device_view`
-     */
-    template <typename CG>
-    __device__ static device_view make_copy(CG g,
-                                            pair_atomic_type* const memory_to_use,
-                                            device_view source_device_view) noexcept
-    {
-#if defined(CUCO_HAS_CUDA_BARRIER)
-      __shared__ cuda::barrier<cuda::thread_scope::thread_scope_block> barrier;
-      if (g.thread_rank() == 0) { init(&barrier, g.size()); }
-      g.sync();
-
-      cuda::memcpy_async(g,
-                         memory_to_use,
-                         source_device_view.get_slots(),
-                         sizeof(pair_atomic_type) * source_device_view.get_capacity(),
-                         barrier);
-
-      barrier.arrive_and_wait();
-#else
-      pair_atomic_type const* const slots_ptr = source_device_view.get_slots();
-      for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) {
-        new (&memory_to_use[i].first)
-          atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)};
-        new (&memory_to_use[i].second)
-          atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)};
-      }
-      g.sync();
-#endif
-
-      return device_view(memory_to_use,
-                         source_device_view.get_capacity(),
-                         empty_key<Key>{source_device_view.get_empty_key_sentinel()},
-                         empty_value<Value>{source_device_view.get_empty_value_sentinel()},
-                         erased_key<Key>{source_device_view.get_erased_key_sentinel()});
-    }
-
-    /**
-     * @brief Finds the value corresponding to the key `k`.
-     *
-     * Returns an iterator to the pair whose key is equivalent to `k`.
-     * If no such pair exists, returns `end()`.
-     *
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     * @param k The key to search for
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return An iterator to the position at which the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ iterator find(Key const& k,
-                             Hash hash          = Hash{},
-                             KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /** @brief Finds the value corresponding to the key `k`.
-     *
-     * Returns a const_iterator to the pair whose key is equivalent to `k`.
-     * If no such pair exists, returns `end()`.
-     *
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     * @param k The key to search for
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return An iterator to the position at which the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ const_iterator find(Key const& k,
-                                   Hash hash          = Hash{},
-                                   KeyEqual key_equal = KeyEqual{}) const noexcept;
-
-    /**
-     * @brief Finds the value corresponding to the key `k`.
-     *
-     * Returns an iterator to the pair whose key is equivalent to `k`.
-     * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to
-     * to leverage multiple threads to perform a single find. This provides a
-     * significant boost in throughput compared to the non Cooperative Group
-     * `find` at moderate to high load factors.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     * @param g The Cooperative Group used to perform the find
-     * @param k The key to search for
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return An iterator to the position at which the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename CG,
-              typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ iterator
-    find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Finds the value corresponding to the key `k`.
-     *
-     * Returns a const_iterator to the pair whose key is equivalent to `k`.
-     * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to
-     * to leverage multiple threads to perform a single find. This provides a
-     * significant boost in throughput compared to the non Cooperative Group
-     * `find` at moderate to high load factors.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     * @param g The Cooperative Group used to perform the find
-     * @param k The key to search for
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return An iterator to the position at which the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename CG,
-              typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ const_iterator
-    find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept;
-
-    /**
-     * @brief Indicates whether the key `k` was inserted into the map.
-     *
-     * If the key `k` was inserted into the map, find returns
-     * true. Otherwise, it returns false.
-     *
-     * Hash should be callable with both ProbeKey and Key type. `std::invoke_result<KeyEqual,
-     * ProbeKey, Key>` must be well-formed.
-     *
-     * If `key_equal(probe_key, slot_key)` returns true, `hash(probe_key) == hash(slot_key)` must
-     * also be true.
-     *
-     * @tparam ProbeKey Probe key type
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     *
-     * @param k The key to search for
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return A boolean indicating whether the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename ProbeKey,
-              typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool contains(ProbeKey const& k,
-                             Hash hash          = Hash{},
-                             KeyEqual key_equal = KeyEqual{}) const noexcept;
-
-    /**
-     * @brief Indicates whether the key `k` was inserted into the map.
-     *
-     * If the key `k` was inserted into the map, find returns true. Otherwise, it returns false.
-     * Uses the CUDA Cooperative Groups API to to leverage multiple threads to perform a single
-     * contains operation. This provides a significant boost in throughput compared to the non
-     * Cooperative Group `contains` at moderate to high load factors.
-     *
-     * Hash should be callable with both ProbeKey and Key type. `std::invoke_result<KeyEqual,
-     * ProbeKey, Key>` must be well-formed.
-     *
-     * If `key_equal(probe_key, slot_key)` returns true, `hash(probe_key) == hash(slot_key)` must
-     * also be true.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam ProbeKey Probe key type
-     * @tparam Hash Unary callable type
-     * @tparam KeyEqual Binary callable type
-     *
-     * @param g The Cooperative Group used to perform the contains operation
-     * @param k The key to search for
-     * @param hash The unary callable used to hash the key
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return A boolean indicating whether the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename CG,
-              typename ProbeKey,
-              typename Hash     = cuco::default_hash_function<key_type>,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ cuda::std::enable_if_t<std::is_invocable_v<KeyEqual, ProbeKey, Key>, bool> contains(
-      CG g, ProbeKey const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept;
-  };  // class device_view
-
-  /**
-   * @brief Gets the maximum number of elements the hash map can hold.
-   *
-   * @return The maximum number of elements the hash map can hold
-   */
-  std::size_t get_capacity() const noexcept { return capacity_; }
-
-  /**
-   * @brief Gets the number of elements in the hash map.
-   *
-   * @return The number of elements in the map
-   */
-  std::size_t get_size() const noexcept { return size_; }
-
-  /**
-   * @brief Gets the load factor of the hash map.
-   *
-   * @return The load factor of the hash map
-   */
-  float get_load_factor() const noexcept { return static_cast<float>(size_) / capacity_; }
-
-  /**
-   * @brief Gets the sentinel value used to represent an empty key slot.
-   *
-   * @return The sentinel value used to represent an empty key slot
-   */
-  Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; }
-
-  /**
-   * @brief Gets the sentinel value used to represent an empty value slot.
-   *
-   * @return The sentinel value used to represent an empty value slot
-   */
-  Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; }
-
-  /**
-   * @brief Gets the sentinel value used to represent an erased value slot.
-   *
-   * @return The sentinel value used to represent an erased value slot
-   */
-  Key get_erased_key_sentinel() const noexcept { return erased_key_sentinel_; }
-
-  /**
-   * @brief Constructs a device_view object based on the members of the `static_map` object.
-   *
-   * @return A device_view object based on the members of the `static_map` object
-   */
-  device_view get_device_view() const noexcept
-  {
-    return device_view(slots_,
-                       capacity_,
-                       empty_key<Key>{empty_key_sentinel_},
-                       empty_value<Value>{empty_value_sentinel_},
-                       erased_key<Key>{erased_key_sentinel_});
-  }
-
-  /**
-   * @brief Constructs a device_mutable_view object based on the members of the `static_map` object
-   *
-   * @return A device_mutable_view object based on the members of the `static_map` object
-   */
-  device_mutable_view get_device_mutable_view() const noexcept
-  {
-    return device_mutable_view(slots_,
-                               capacity_,
-                               empty_key<Key>{empty_key_sentinel_},
-                               empty_value<Value>{empty_value_sentinel_},
-                               erased_key<Key>{erased_key_sentinel_});
-  }
-
- private:
-  pair_atomic_type* slots_{};                   ///< Pointer to flat slots storage
-  std::size_t capacity_{};                      ///< Total number of slots
-  std::size_t size_{};                          ///< Number of keys in map
-  Key empty_key_sentinel_{};                    ///< Key value that represents an empty slot
-  Value empty_value_sentinel_{};                ///< Initial value of empty slot
-  Key erased_key_sentinel_{};                   ///< Key value that represents an erased slot
-  atomic_ctr_type* num_successes_{};            ///< Number of successfully inserted keys on insert
-  slot_allocator_type slot_allocator_{};        ///< Allocator used to allocate slots
-  counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
-};
-}  // namespace legacy
 }  // namespace cuco
 
-#include <cuco/detail/static_map.inl>
 #include <cuco/detail/static_map/static_map.inl>
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a4787f432..205b0100b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -78,7 +78,6 @@ ConfigureTest(STATIC_SET_TEST
 ConfigureTest(STATIC_MAP_TEST
     static_map/capacity_test.cu
     static_map/contains_test.cu
-    static_map/custom_type_test.cu
     static_map/duplicate_keys_test.cu
     static_map/erase_test.cu
     static_map/find_test.cu
diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu
deleted file mode 100644
index 7860e87e4..000000000
--- a/tests/static_map/custom_type_test.cu
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <test_utils.hpp>
-
-#include <cuco/static_map.cuh>
-
-#include <cuda/functional>
-#include <cuda/std/functional>
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/transform.h>
-
-#include <catch2/catch_template_test_macros.hpp>
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-// User-defined key type
-template <typename T>
-struct key_pair_type {
-  T a;
-  T b;
-
-  __host__ __device__ key_pair_type() {}
-  __host__ __device__ key_pair_type(T x) : a{x}, b{x} {}
-
-  // Device equality operator is mandatory due to libcudacxx bug:
-  // https://github.com/NVIDIA/libcudacxx/issues/223
-  __device__ bool operator==(key_pair_type const& other) const
-  {
-    return a == other.a and b == other.b;
-  }
-};
-
-// User-defined key type
-template <typename T>
-struct large_key_type {
-  T a;
-  T b;
-  T c;
-
-  __host__ __device__ large_key_type() {}
-  __host__ __device__ large_key_type(T x) : a{x}, b{x}, c{x} {}
-
-  // Device equality operator is mandatory due to libcudacxx bug:
-  // https://github.com/NVIDIA/libcudacxx/issues/223
-  __device__ bool operator==(large_key_type const& other) const
-  {
-    return a == other.a and b == other.b and c == other.c;
-  }
-};
-
-// User-defined value type
-template <typename T>
-struct value_pair_type {
-  T f;
-  T s;
-
-  __host__ __device__ value_pair_type() {}
-  __host__ __device__ value_pair_type(T x) : f{x}, s{x} {}
-
-  __device__ bool operator==(value_pair_type const& other) const
-  {
-    return f == other.f and s == other.s;
-  }
-};
-
-// User-defined device hasher
-struct hash_custom_key {
-  template <typename custom_type>
-  __device__ uint32_t operator()(custom_type k)
-  {
-    return thrust::raw_reference_cast(k).a;
-  };
-};
-
-// User-defined device key equality
-struct custom_key_equals {
-  template <typename lhs_type, typename rhs_type>
-  __device__ bool operator()(lhs_type lhs, rhs_type rhs)
-  {
-    return lhs == static_cast<lhs_type>(rhs);
-  }
-};
-
-TEMPLATE_TEST_CASE_SIG("static_map custom key and value type tests",
-                       "",
-                       ((typename Key, typename Value), Key, Value),
-#if defined(CUCO_HAS_INDEPENDENT_THREADS)  // Key type larger than 8B only supported for sm_70 and
-                                           // up
-                       (key_pair_type<int64_t>, value_pair_type<int32_t>),
-                       (key_pair_type<int64_t>, value_pair_type<int64_t>),
-                       (large_key_type<int32_t>, value_pair_type<int32_t>),
-#endif
-                       (key_pair_type<int32_t>, value_pair_type<int32_t>))
-{
-  auto const sentinel_key   = Key{-1};
-  auto const sentinel_value = Value{-1};
-
-  constexpr std::size_t num      = 100;
-  constexpr std::size_t capacity = num * 2;
-  cuco::legacy::static_map<Key, Value> map{
-    capacity, cuco::empty_key<Key>{sentinel_key}, cuco::empty_value<Value>{sentinel_value}};
-
-  auto insert_keys = thrust::make_transform_iterator(
-    thrust::counting_iterator<int>(0),
-    cuda::proclaim_return_type<Key>([] __device__(auto i) { return Key{i}; }));
-
-  auto insert_values = thrust::make_transform_iterator(
-    thrust::counting_iterator<int>(0),
-    cuda::proclaim_return_type<Value>([] __device__(auto i) { return Value{i}; }));
-
-  auto insert_pairs = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<int>(0),
-    cuda::proclaim_return_type<cuco::pair<Key, Value>>(
-      [] __device__(auto i) { return cuco::pair<Key, Value>(i, i); }));
-
-  SECTION("All inserted keys-value pairs should be correctly recovered during find")
-  {
-    thrust::device_vector<Value> found_values(num);
-    map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{});
-
-    REQUIRE(num == map.get_size());
-
-    map.find(
-      insert_keys, insert_keys + num, found_values.begin(), hash_custom_key{}, custom_key_equals{});
-
-    REQUIRE(cuco::test::equal(insert_values,
-                              insert_values + num,
-                              found_values.begin(),
-                              cuda::proclaim_return_type<bool>([] __device__(Value lhs, Value rhs) {
-                                return cuda::std::tie(lhs.f, lhs.s) == cuda::std::tie(rhs.f, rhs.s);
-                              })));
-  }
-
-  SECTION("All inserted keys-value pairs should be contained")
-  {
-    thrust::device_vector<bool> contained(num);
-    map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{});
-    map.contains(
-      insert_keys, insert_keys + num, contained.begin(), hash_custom_key{}, custom_key_equals{});
-    REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));
-  }
-
-  SECTION("All conditionally inserted keys-value pairs should be contained")
-  {
-    thrust::device_vector<bool> contained(num);
-    map.insert_if(
-      insert_pairs,
-      insert_pairs + num,
-      thrust::counting_iterator<int>(0),
-      cuda::proclaim_return_type<bool>([] __device__(auto const& key) { return (key % 2) == 0; }),
-      hash_custom_key{},
-      custom_key_equals{});
-
-    REQUIRE(num / 2 == map.get_size());
-
-    map.contains(
-      insert_keys, insert_keys + num, contained.begin(), hash_custom_key{}, custom_key_equals{});
-
-    REQUIRE(cuco::test::equal(
-      contained.begin(),
-      contained.end(),
-      thrust::counting_iterator<int>(0),
-      cuda::proclaim_return_type<bool>([] __device__(auto const& idx_contained, auto const& idx) {
-        return ((idx % 2) == 0) == idx_contained;
-      })));
-  }
-
-  SECTION("Non-inserted keys-value pairs should not be contained")
-  {
-    thrust::device_vector<bool> contained(num);
-    map.contains(
-      insert_keys, insert_keys + num, contained.begin(), hash_custom_key{}, custom_key_equals{});
-    REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), cuda::std::identity{}));
-  }
-
-  SECTION("All inserted keys-value pairs should be contained")
-  {
-    thrust::device_vector<bool> contained(num);
-    map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{});
-    auto view = map.get_device_view();
-    REQUIRE(cuco::test::all_of(
-      insert_pairs,
-      insert_pairs + num,
-      cuda::proclaim_return_type<bool>([view] __device__(cuco::pair<Key, Value> const& pair) {
-        return view.contains(pair.first, hash_custom_key{}, custom_key_equals{});
-      })));
-  }
-
-  SECTION("Inserting unique keys should return insert success.")
-  {
-    auto m_view = map.get_device_mutable_view();
-    REQUIRE(cuco::test::all_of(insert_pairs,
-                               insert_pairs + num,
-                               cuda::proclaim_return_type<bool>(
-                                 [m_view] __device__(cuco::pair<Key, Value> const& pair) mutable {
-                                   return m_view.insert(
-                                     pair, hash_custom_key{}, custom_key_equals{});
-                                 })));
-  }
-
-  SECTION("Cannot find any key in an empty hash map")
-  {
-    SECTION("non-const view")
-    {
-      auto view = map.get_device_view();
-      REQUIRE(cuco::test::all_of(
-        insert_pairs,
-        insert_pairs + num,
-        cuda::proclaim_return_type<bool>(
-          [view] __device__(cuco::pair<Key, Value> const& pair) mutable {
-            return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end();
-          })));
-    }
-
-    SECTION("const view")
-    {
-      auto const view = map.get_device_view();
-      REQUIRE(cuco::test::all_of(
-        insert_pairs,
-        insert_pairs + num,
-        cuda::proclaim_return_type<bool>([view] __device__(cuco::pair<Key, Value> const& pair) {
-          return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end();
-        })));
-    }
-  }
-}