diff --git a/src/modules/cm-precompute.ts b/src/modules/cm-precompute.ts index 3ee35c829..a0e55602b 100644 --- a/src/modules/cm-precompute.ts +++ b/src/modules/cm-precompute.ts @@ -205,7 +205,7 @@ const listCandidateCount = (ncList: NameCount.List): number => { }; export const preCompute = (first: number, last: number, args: any): boolean => { - const maxSum = last; + const maxSum = args.load_max; const merge_only = args.merge_only || false; const begin = new Date(); diff --git a/src/native-modules/experiment/filter-stream.cpp b/src/native-modules/experiment/filter-stream.cpp index 3343a2c54..62264630f 100644 --- a/src/native-modules/experiment/filter-stream.cpp +++ b/src/native-modules/experiment/filter-stream.cpp @@ -14,7 +14,7 @@ extern __constant__ FilterStreamData::Device stream_data_[kMaxStreams]; void FilterStreamData::Device::init(FilterStream& stream, FilterData& mfd) { if (!stream.host.device_initialized) { assert(stream.host.stream_idx() < kMaxStreams); - alloc_buffers(mfd, stream.cuda_stream); + alloc_buffers(mfd, stream.stride, stream.cuda_stream); copy_to_symbol(stream.host.stream_idx(), stream.cuda_stream); stream.host.device_initialized = true; } @@ -32,10 +32,10 @@ void FilterStreamData::Device::alloc_copy_source_index_list(const Host& host, } cuda_malloc_async((void**)&src_idx_list, num_bytes, stream, "device.src_idx_list"); - num_src_idx = num_host_src_idx; - // device members updated; need to (re)copy to symbol - copy_to_symbol(host.stream_idx(), stream); } + num_src_idx = num_host_src_idx; + // num_src_idx may shrink between sums, so always refresh constant memory. + copy_to_symbol(host.stream_idx(), stream); // copy source indices auto err = cudaMemcpyAsync(src_idx_list, host.src_idx_list.data(), num_bytes, cudaMemcpyHostToDevice, stream); @@ -43,8 +43,9 @@ void FilterStreamData::Device::alloc_copy_source_index_list(const Host& host, } void FilterStreamData::Device::alloc_buffers(FilterData& mfd, - cudaStream_t stream) { - const auto [grid_size, _] = get_filter_kernel_grid_block_sizes(); + size_t max_active_sources, cudaStream_t stream) { + const auto [grid_size, _] = + get_filter_kernel_grid_block_sizes(max_active_sources); if (mfd.device_xor.num_unique_variations) { const auto num_bytes = mfd.device_xor.num_unique_variations * grid_size * sizeof(index_t); diff --git a/src/native-modules/experiment/filter-stream.h b/src/native-modules/experiment/filter-stream.h index 47713d31e..74b94c668 100644 --- a/src/native-modules/experiment/filter-stream.h +++ b/src/native-modules/experiment/filter-stream.h @@ -27,6 +27,13 @@ struct FilterStreamData { struct Device { void init() { src_idx_list = nullptr; + xor_src_compat_uv_indices = nullptr; + or_xor_compat_uv_indices = nullptr; +#ifdef VARIATIONS_RESULTS + variations_compat_results = nullptr; + num_variations_results_per_block = 0; +#endif + or_src_bits_compat_results = nullptr; num_src_idx = 0; } void init(FilterStream& stream, FilterData& mfd); @@ -61,7 +68,8 @@ struct FilterStreamData { index_t num_src_idx; private: - void alloc_buffers(FilterData& fd, cudaStream_t stream); + void alloc_buffers(FilterData& fd, size_t max_active_sources, + cudaStream_t stream); void copy_to_symbol(index_t idx, cudaStream_t stream); }; // struct Device }; // struct FilterStreamData diff --git a/src/native-modules/experiment/filter-support.cpp b/src/native-modules/experiment/filter-support.cpp index dc1032e45..4c09176df 100644 --- a/src/native-modules/experiment/filter-support.cpp +++ b/src/native-modules/experiment/filter-support.cpp @@ -500,8 +500,6 @@ filter_sources(FilterSwarm& swarm, const FilterParams& params, // TODO: FilterStreamData::Host, resize_results() std::vector results(num_results); - // TODO: this is dumb - swarm.init(stride); auto t = util::Timer::start_timer(); const auto [num_processed, num_compat] = // run_concurrent_filter_kernels(params.sum, swarm, params.threads_per_block, @@ -655,6 +653,8 @@ filter_task_result_t filter_task(FilterData& mfd, FilterParams params) { auto& swarm = swarm_pool_.acquire(); const auto num_streams = params.num_streams ? params.num_streams : 1; swarm.ensure_streams(num_streams); // set_num_streams might be better + const auto stride = params.stride ? params.stride : int(idx_lists.size()); + swarm.init(stride); const auto& stream = swarm.at(0); stream.copy_start.record(stream.cuda_stream); diff --git a/src/native-modules/experiment/filter.cu b/src/native-modules/experiment/filter.cu index 2bb39f6ab..d28e55d89 100644 --- a/src/native-modules/experiment/filter.cu +++ b/src/native-modules/experiment/filter.cu @@ -778,13 +778,15 @@ __global__ void filter_kernel(result_t* RESTRICT results, index_t swarm_idx, } // anonymous namespace -std::pair get_filter_kernel_grid_block_sizes() { +std::pair get_filter_kernel_grid_block_sizes(size_t num_sources) { // hard-code 64 due to cub::BlockScan const auto block_size = 64; // threads_per_block ? threads_per_block : 64; const auto max_threads_per_sm = CudaDevice::get().max_threads_per_sm(); const auto blocks_per_sm = max_threads_per_sm / block_size; assert(blocks_per_sm * block_size == max_threads_per_sm); - const auto grid_size = CudaDevice::get().num_sm() * blocks_per_sm; + const auto max_grid_size = CudaDevice::get().num_sm() * blocks_per_sm; + assert(num_sources > 0); + const auto grid_size = std::min(max_grid_size, int(num_sources)); return std::make_pair(grid_size, block_size); } @@ -805,7 +807,8 @@ void run_filter_kernel(int /*threads_per_block*/, index_t swarm_idx, FilterStream& stream, result_t* device_results) { stream.is_running = true; stream.increment_sequence_num(); - const auto [grid_size, block_size] = get_filter_kernel_grid_block_sizes(); + const auto [grid_size, block_size] = + get_filter_kernel_grid_block_sizes(stream.host.src_idx_list.size()); // xor_results could probably be moved to global const auto shared_bytes = kSharedIndexCount * sizeof(shared_index_t) // indices diff --git a/src/native-modules/experiment/filter.cuh b/src/native-modules/experiment/filter.cuh index e7f191bd1..31c73c5bf 100644 --- a/src/native-modules/experiment/filter.cuh +++ b/src/native-modules/experiment/filter.cuh @@ -20,7 +20,7 @@ void run_get_compatible_sources_kernel( size_t num_src_desc_pairs, result_t* device_resultsy, cudaStream_t sync_stream, cudaStream_t stream); -std::pair get_filter_kernel_grid_block_sizes(); +std::pair get_filter_kernel_grid_block_sizes(size_t num_sources); void copy_filter_data_to_symbols(const FilterData& mfd, cudaStream_t stream); @@ -42,4 +42,3 @@ inline constexpr void dump_compat_src_indices( #endif } // namespace cm -