Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/modules/cm-precompute.ts
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ const listCandidateCount = (ncList: NameCount.List): number => {
};

export const preCompute = (first: number, last: number, args: any): boolean => {
const maxSum = last;
const maxSum = args.load_max;
const merge_only = args.merge_only || false;

const begin = new Date();
Expand Down
13 changes: 7 additions & 6 deletions src/native-modules/experiment/filter-stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ extern __constant__ FilterStreamData::Device stream_data_[kMaxStreams];
void FilterStreamData::Device::init(FilterStream& stream, FilterData& mfd) {
if (!stream.host.device_initialized) {
assert(stream.host.stream_idx() < kMaxStreams);
alloc_buffers(mfd, stream.cuda_stream);
alloc_buffers(mfd, stream.stride, stream.cuda_stream);
copy_to_symbol(stream.host.stream_idx(), stream.cuda_stream);
stream.host.device_initialized = true;
}
Expand All @@ -32,19 +32,20 @@ void FilterStreamData::Device::alloc_copy_source_index_list(const Host& host,
}
cuda_malloc_async((void**)&src_idx_list, num_bytes, stream,
"device.src_idx_list");
num_src_idx = num_host_src_idx;
// device members updated; need to (re)copy to symbol
copy_to_symbol(host.stream_idx(), stream);
}
num_src_idx = num_host_src_idx;
// num_src_idx may shrink between sums, so always refresh constant memory.
copy_to_symbol(host.stream_idx(), stream);
// copy source indices
auto err = cudaMemcpyAsync(src_idx_list, host.src_idx_list.data(), num_bytes,
cudaMemcpyHostToDevice, stream);
assert_cuda_success(err, "copy device.src_idx_list");
}

void FilterStreamData::Device::alloc_buffers(FilterData& mfd,
cudaStream_t stream) {
const auto [grid_size, _] = get_filter_kernel_grid_block_sizes();
size_t max_active_sources, cudaStream_t stream) {
const auto [grid_size, _] =
get_filter_kernel_grid_block_sizes(max_active_sources);
if (mfd.device_xor.num_unique_variations) {
const auto num_bytes = mfd.device_xor.num_unique_variations * grid_size
* sizeof(index_t);
Expand Down
10 changes: 9 additions & 1 deletion src/native-modules/experiment/filter-stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ struct FilterStreamData {
struct Device {
void init() {
src_idx_list = nullptr;
xor_src_compat_uv_indices = nullptr;
or_xor_compat_uv_indices = nullptr;
#ifdef VARIATIONS_RESULTS
variations_compat_results = nullptr;
num_variations_results_per_block = 0;
#endif
or_src_bits_compat_results = nullptr;
num_src_idx = 0;
}
void init(FilterStream& stream, FilterData& mfd);
Expand Down Expand Up @@ -61,7 +68,8 @@ struct FilterStreamData {
index_t num_src_idx;

private:
void alloc_buffers(FilterData& fd, cudaStream_t stream);
void alloc_buffers(FilterData& fd, size_t max_active_sources,
cudaStream_t stream);
void copy_to_symbol(index_t idx, cudaStream_t stream);
}; // struct Device
}; // struct FilterStreamData
Expand Down
4 changes: 2 additions & 2 deletions src/native-modules/experiment/filter-support.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,8 +500,6 @@ filter_sources(FilterSwarm& swarm, const FilterParams& params,

// TODO: FilterStreamData::Host, resize_results()
std::vector<result_t> results(num_results);
// TODO: this is dumb
swarm.init(stride);
auto t = util::Timer::start_timer();
const auto [num_processed, num_compat] = //
run_concurrent_filter_kernels(params.sum, swarm, params.threads_per_block,
Expand Down Expand Up @@ -655,6 +653,8 @@ filter_task_result_t filter_task(FilterData& mfd, FilterParams params) {
auto& swarm = swarm_pool_.acquire();
const auto num_streams = params.num_streams ? params.num_streams : 1;
swarm.ensure_streams(num_streams); // set_num_streams might be better
const auto stride = params.stride ? params.stride : int(idx_lists.size());
swarm.init(stride);

const auto& stream = swarm.at(0);
stream.copy_start.record(stream.cuda_stream);
Expand Down
9 changes: 6 additions & 3 deletions src/native-modules/experiment/filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -778,13 +778,15 @@ __global__ void filter_kernel(result_t* RESTRICT results, index_t swarm_idx,

} // anonymous namespace

std::pair<int, int> get_filter_kernel_grid_block_sizes() {
std::pair<int, int> get_filter_kernel_grid_block_sizes(size_t num_sources) {
// hard-code 64 due to cub::BlockScan
const auto block_size = 64; // threads_per_block ? threads_per_block : 64;
const auto max_threads_per_sm = CudaDevice::get().max_threads_per_sm();
const auto blocks_per_sm = max_threads_per_sm / block_size;
assert(blocks_per_sm * block_size == max_threads_per_sm);
const auto grid_size = CudaDevice::get().num_sm() * blocks_per_sm;
const auto max_grid_size = CudaDevice::get().num_sm() * blocks_per_sm;
assert(num_sources > 0);
const auto grid_size = std::min<int>(max_grid_size, int(num_sources));
return std::make_pair(grid_size, block_size);
}

Expand All @@ -805,7 +807,8 @@ void run_filter_kernel(int /*threads_per_block*/, index_t swarm_idx,
FilterStream& stream, result_t* device_results) {
stream.is_running = true;
stream.increment_sequence_num();
const auto [grid_size, block_size] = get_filter_kernel_grid_block_sizes();
const auto [grid_size, block_size] =
get_filter_kernel_grid_block_sizes(stream.host.src_idx_list.size());
// xor_results could probably be moved to global
const auto shared_bytes = kSharedIndexCount
* sizeof(shared_index_t) // indices
Expand Down
3 changes: 1 addition & 2 deletions src/native-modules/experiment/filter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ void run_get_compatible_sources_kernel(
size_t num_src_desc_pairs, result_t* device_resultsy,
cudaStream_t sync_stream, cudaStream_t stream);

std::pair<int, int> get_filter_kernel_grid_block_sizes();
std::pair<int, int> get_filter_kernel_grid_block_sizes(size_t num_sources);

void copy_filter_data_to_symbols(const FilterData& mfd, cudaStream_t stream);

Expand All @@ -42,4 +42,3 @@ inline constexpr void dump_compat_src_indices(
#endif

} // namespace cm