From c8747af8c1e90c534afae4a08ba5cb71feae1602 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sun, 14 Jun 2026 14:32:59 +0200 Subject: [PATCH 1/4] correctly identify cooperative dimensions in ILS/OLS --- opt/src/transformations/in_local_storage.cpp | 222 ++++++++++------ opt/src/transformations/out_local_storage.cpp | 245 +++++++++++------- .../transformations/in_local_storage_test.cpp | 5 +- .../out_local_storage_test.cpp | 12 +- 4 files changed, 312 insertions(+), 172 deletions(-) diff --git a/opt/src/transformations/in_local_storage.cpp b/opt/src/transformations/in_local_storage.cpp index 8411a344..7110d2cf 100644 --- a/opt/src/transformations/in_local_storage.cpp +++ b/opt/src/transformations/in_local_storage.cpp @@ -1,5 +1,6 @@ #include "sdfg/transformations/in_local_storage.h" +#include #include #include #include @@ -218,30 +219,114 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An // Create local buffer name local_name_ = builder.find_new_name("__daisy_in_local_storage_" + this->container_); - // Collect varying dimensions (extent > 1) and compute buffer layout + // Collect varying dimensions (extent > 1) and their sizes std::vector varying_dims; - std::vector dim_sizes; + std::vector varying_dim_sizes; for (size_t d = 0; d < tile_info_.dimensions.size(); d++) { auto& dim_size = tile_info_.dimensions.at(d); if (!symbolic::eq(dim_size, symbolic::integer(1))) { varying_dims.push_back(d); - dim_sizes.push_back(dim_size); + varying_dim_sizes.push_back(dim_size); } } - // Compute total buffer size - symbolic::Expression total_size = symbolic::integer(1); - for (auto& ds : dim_sizes) { - total_size = symbolic::mul(total_size, ds); + // GPU classification: each ancestor GPU Map is either + // - per-thread (its Map indvar appears in tile.bases — each thread sees a + // distinct slice along that dim, so the shared buffer gets its own + // per-thread slot indexed by the within-block thread_idx), or + // - cooperative (Map indvar not in bases — all threads along that dim + // cooperatively load the same shared tile, strided by thread_idx). + struct GpuDim { + gpu::GPUDimension dim; + symbolic::Symbol map_indvar; // global thread index (== thread_idx + blockIdx * blockDim) + symbolic::Symbol thread_idx; // within-block thread index (NV_Symbol) + symbolic::Integer block_size; + bool is_per_thread; + }; + std::vector per_thread_dims; // populated only on GPU path + std::vector coop_dims; // populated only on GPU path + bool is_rocm = false; + + if (storage_type_.is_nv_shared()) { + auto ancestors = ControlFlowNode::parent_chain(loop_); + for (auto* node : ancestors) { + auto* m = dynamic_cast(node); + if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue; + if (m->schedule_type().value() == "ROCM") { + is_rocm = true; + break; + } + } + const std::string prefix = is_rocm ? "__daisy_hip_thread_idx_" : "__daisy_cuda_thread_idx_"; + auto suffix = [](gpu::GPUDimension d) -> std::string { + switch (d) { + case gpu::GPUDimension::X: + return "x"; + case gpu::GPUDimension::Y: + return "y"; + case gpu::GPUDimension::Z: + return "z"; + } + return "?"; + }; + for (auto* node : ancestors) { + auto* m = dynamic_cast(node); + if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue; + GpuDim gd; + gd.dim = gpu::gpu_dimension(m->schedule_type()); + gd.map_indvar = m->indvar(); + gd.thread_idx = symbolic::symbol(prefix + suffix(gd.dim)); + gd.block_size = gpu::gpu_block_size(m->schedule_type()); + gd.is_per_thread = false; + for (auto& base : tile_info_.bases) { + if (symbolic::uses(base, m->indvar())) { + gd.is_per_thread = true; + break; + } + } + (gd.is_per_thread ? per_thread_dims : coop_dims).push_back(gd); + } + auto by_dim = [](const GpuDim& a, const GpuDim& b) { + return static_cast(a.dim) < static_cast(b.dim); + }; + std::sort(per_thread_dims.begin(), per_thread_dims.end(), by_dim); + std::sort(coop_dims.begin(), coop_dims.end(), by_dim); + + // Ensure within-block thread_idx containers exist. Codegen recognises + // NV_Symbol-typed scalars and substitutes them with threadIdx.{x,y,z} + // (CUDA) or the ROCm equivalent at emission time. + auto ensure_idx = [&](const symbolic::Symbol& sym) { + if (!sdfg.exists(sym->get_name())) { + types::Scalar idx_type(types::PrimitiveType::Int32); + idx_type.storage_type(types::StorageType::NV_Symbol()); + builder.add_container(sym->get_name(), idx_type); + } + }; + for (auto& gd : per_thread_dims) ensure_idx(gd.thread_idx); + for (auto& gd : coop_dims) ensure_idx(gd.thread_idx); } - // Helper: build linearized local index from per-dimension symbolic expressions + // Buffer dim sizes: [per-thread block sizes (X, Y, Z canonical order)] ++ + // [varying tile dim sizes (original access-dim order)] + std::vector buf_dim_sizes; + for (auto& gd : per_thread_dims) buf_dim_sizes.push_back(gd.block_size); + for (auto& s : varying_dim_sizes) buf_dim_sizes.push_back(s); + + // Total buffer size (number of scalar slots) + symbolic::Expression total_size = symbolic::integer(1); + for (auto& s : buf_dim_sizes) total_size = symbolic::mul(total_size, s); + + // Per-thread index prefix (each thread's fixed buffer coords) + std::vector per_thread_indices; + for (auto& gd : per_thread_dims) per_thread_indices.push_back(gd.thread_idx); + + // Row-major linearization over buf_dim_sizes (leftmost dim = outermost stride) auto linearize_exprs = [&](const std::vector& indices) -> symbolic::Expression { symbolic::Expression linear_idx = symbolic::integer(0); symbolic::Expression stride = symbolic::integer(1); - for (int i = indices.size() - 1; i >= 0; i--) { + for (int i = static_cast(indices.size()) - 1; i >= 0; i--) { linear_idx = symbolic::add(linear_idx, symbolic::mul(indices[i], stride)); - stride = symbolic::mul(stride, dim_sizes[i]); + stride = symbolic::mul(stride, buf_dim_sizes[i]); } return linear_idx; }; @@ -278,114 +363,93 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An // ============================================================ // GPU COOPERATIVE PATH // ============================================================ - auto ancestors = ControlFlowNode::parent_chain(loop_); + // Each thread owns a fixed slot along per-thread buffer dims and + // strides through the varying-flat range with the other threads + // sharing that slot (i.e. threads in cooperative dims only). - // Collect cooperative GPU dimensions (indvar not in tile bases) - struct CoopDim { - symbolic::Symbol indvar; - symbolic::Integer block_size; - gpu::GPUDimension dimension; - }; - std::vector coop_dims; + // Total cooperative-thread count (= 1 if no cooperative dims) + symbolic::Expression total_coop_threads = symbolic::integer(1); + for (auto& cd : coop_dims) { + total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size); + } - for (auto* node : ancestors) { - if (auto* ancestor_map = dynamic_cast(node)) { - if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) { - continue; - } - bool appears_in_bases = false; - for (auto& base : tile_info_.bases) { - if (symbolic::uses(base, ancestor_map->indvar())) { - appears_in_bases = true; - break; - } - } - if (!appears_in_bases) { - coop_dims.push_back( - {ancestor_map->indvar(), - gpu::gpu_block_size(ancestor_map->schedule_type()), - gpu::gpu_dimension(ancestor_map->schedule_type())} - ); - } + // Flat within-block index over cooperative dims only (= 0 if none). + // Row-major: X is least-significant when present. + symbolic::Expression coop_flat = symbolic::integer(0); + { + symbolic::Expression stride = symbolic::integer(1); + for (auto it = coop_dims.rbegin(); it != coop_dims.rend(); ++it) { + coop_flat = symbolic::add(coop_flat, symbolic::mul(it->thread_idx, stride)); + stride = symbolic::mul(stride, it->block_size); } } - // Compute total cooperative thread count - symbolic::Expression total_coop_threads = symbolic::integer(1); - for (auto& cd : coop_dims) { - total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size); + // Varying-flat size = product of tile dim extents (excluding extent==1). + // This is the address range each thread cooperatively walks within its + // per-thread slot. + symbolic::Expression varying_flat_size = symbolic::integer(1); + for (auto& s : varying_dim_sizes) { + varying_flat_size = symbolic::mul(varying_flat_size, s); } // Create the local buffer with NV_Shared storage types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size); builder.add_container(local_name_, buffer_type); - // Emit: barrier → guarded cooperative copy → barrier → loop + // Emit: barrier → cooperative copy loop → barrier → main loop // 1. Barrier before copy auto& barrier_block1 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info()); builder.add_library_node(barrier_block1, {}); - // 2. Cooperative copy with if_else guard - // Flatten cooperative thread index: coop_flat = sum(indvar[i] * product(block_size[j] for j>i)) - symbolic::Expression coop_flat = symbolic::integer(0); - symbolic::Expression coop_stride = symbolic::integer(1); - for (int i = coop_dims.size() - 1; i >= 0; i--) { - coop_flat = symbolic::add(coop_flat, symbolic::mul(coop_dims[i].indvar, coop_stride)); - coop_stride = symbolic::mul(coop_stride, coop_dims[i].block_size); - } - - // Each thread loads elements strided by total_coop_threads - // Thread t loads elements: t, t + total_threads, t + 2*total_threads, ... - // We emit a loop: for (idx = coop_flat; idx < total_size; idx += total_coop_threads) + // 2. Cooperative copy: for (idx = coop_flat; idx < varying_flat_size; idx += total_coop_threads) auto idx_name = builder.find_new_name("__daisy_ils_coop_" + this->container_); types::Scalar idx_type(types::PrimitiveType::UInt64); builder.add_container(idx_name, idx_type); auto idx_var = symbolic::symbol(idx_name); - auto copy_init = coop_flat; - auto copy_condition = symbolic::Lt(idx_var, total_size); - auto copy_update = symbolic::add(idx_var, total_coop_threads); - auto& copy_loop = builder.add_map_before( *parent, loop_, idx_var, - copy_condition, - copy_init, - copy_update, + symbolic::Lt(idx_var, varying_flat_size), + coop_flat, + symbolic::add(idx_var, total_coop_threads), structured_control_flow::ScheduleType_Sequential::create(), {}, loop_.debug_info() ); - // Decompose flat idx back into per-dimension indices for source subset - // idx maps to varying_dims in row-major order auto& copy_scope = copy_loop.root(); auto& copy_block = builder.add_block(copy_scope); auto& copy_src = builder.add_access(copy_block, this->container_); auto& copy_dst = builder.add_access(copy_block, local_name_); auto& copy_tasklet = builder.add_tasklet(copy_block, data_flow::TaskletCode::assign, "_out", {"_in"}); - // Decompose idx_var into per-dim indices - std::vector copy_indices; + // Decompose idx_var into per-varying-dim indices (row-major). + // For a single varying dim this is just idx_var. + std::vector varying_decomp; symbolic::Expression remainder = idx_var; - for (size_t i = 0; i < dim_sizes.size(); i++) { - if (i < dim_sizes.size() - 1) { - // integer division: idx / (product of remaining dims) + for (size_t i = 0; i < varying_dim_sizes.size(); i++) { + if (i + 1 < varying_dim_sizes.size()) { symbolic::Expression divisor = symbolic::integer(1); - for (size_t j = i + 1; j < dim_sizes.size(); j++) { - divisor = symbolic::mul(divisor, dim_sizes[j]); + for (size_t j = i + 1; j < varying_dim_sizes.size(); j++) { + divisor = symbolic::mul(divisor, varying_dim_sizes[j]); } - auto quotient = symbolic::div(remainder, divisor); - copy_indices.push_back(quotient); + varying_decomp.push_back(symbolic::div(remainder, divisor)); remainder = symbolic::mod(remainder, divisor); } else { - copy_indices.push_back(remainder); + varying_decomp.push_back(remainder); } } - auto copy_src_subset = build_original_subset(copy_indices); - data_flow::Subset copy_dst_subset = {idx_var}; + // Source = original container at (bases — which already use the global + // Map indvars — plus the varying decomposition along each varying dim). + auto copy_src_subset = build_original_subset(varying_decomp); + + // Destination = buffer at (per_thread_indices ++ varying_decomp) linearized. + std::vector dest_indices = per_thread_indices; + for (auto& v : varying_decomp) dest_indices.push_back(v); + data_flow::Subset copy_dst_subset = {linearize_exprs(dest_indices)}; builder.add_computational_memlet(copy_block, copy_src, copy_tasklet, "_in", copy_src_subset, pointer_type); builder.add_computational_memlet(copy_block, copy_tasklet, "_out", copy_dst, copy_dst_subset, buffer_type); @@ -413,7 +477,7 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An copy_indvars.push_back(indvar); auto init = symbolic::integer(0); - auto condition = symbolic::Lt(indvar, dim_sizes[i]); + auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]); auto update = symbolic::add(indvar, symbolic::integer(1)); auto& copy_loop = builder.add_map( @@ -481,7 +545,8 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An } auto* acc = mla.access(memlet); if (acc && acc->subset.size() == tile_info_.dimensions.size()) { - std::vector local_indices; + // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]] + std::vector local_indices = per_thread_indices; for (size_t d = 0; d < tile_info_.dimensions.size(); d++) { if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) { local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d))); @@ -498,7 +563,8 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An } auto* acc = mla.access(memlet); if (acc && acc->subset.size() == tile_info_.dimensions.size()) { - std::vector local_indices; + // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]] + std::vector local_indices = per_thread_indices; for (size_t d = 0; d < tile_info_.dimensions.size(); d++) { if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) { local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d))); diff --git a/opt/src/transformations/out_local_storage.cpp b/opt/src/transformations/out_local_storage.cpp index ff16ab11..4e6c598f 100644 --- a/opt/src/transformations/out_local_storage.cpp +++ b/opt/src/transformations/out_local_storage.cpp @@ -1,5 +1,6 @@ #include "sdfg/transformations/out_local_storage.h" +#include #include #include #include @@ -206,39 +207,123 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A local_name_ = builder.find_new_name("__daisy_out_local_storage_" + this->container_); - // Collect varying dimensions (extent > 1) and compute buffer layout. + // Collect varying dimensions (extent > 1) and their sizes. // Extent-1 dimensions are degenerate (no loop is needed) and must be // skipped when sizing the buffer, when creating copy indvars, and when // linearizing into the local buffer. The bookkeeping must match what // `build_original_subset` expects (it indexes copy_indices by varying // dimension only). std::vector varying_dims; - std::vector dim_sizes; + std::vector varying_dim_sizes; for (size_t d = 0; d < tile_info_.dimensions.size(); d++) { auto& dim_size = tile_info_.dimensions.at(d); if (!symbolic::eq(dim_size, symbolic::integer(1))) { varying_dims.push_back(d); - dim_sizes.push_back(dim_size); + varying_dim_sizes.push_back(dim_size); } } - // Compute total buffer size - symbolic::Expression total_size = symbolic::integer(1); - for (auto& ds : dim_sizes) { - total_size = symbolic::mul(total_size, ds); + // GPU classification: each ancestor GPU Map is either + // - per-thread (its Map indvar appears in tile.bases — each thread sees a + // distinct slice along that dim, so the shared buffer gets its own + // per-thread slot indexed by the within-block thread_idx), or + // - cooperative (Map indvar not in bases — all threads along that dim + // cooperatively load/store the same shared tile, strided by thread_idx). + struct GpuDim { + gpu::GPUDimension dim; + symbolic::Symbol map_indvar; // global thread index (== thread_idx + blockIdx * blockDim) + symbolic::Symbol thread_idx; // within-block thread index (NV_Symbol) + symbolic::Integer block_size; + bool is_per_thread; + }; + std::vector per_thread_dims; // populated only on GPU path + std::vector coop_dims; // populated only on GPU path + bool is_rocm = false; + + if (storage_type_.is_nv_shared()) { + auto ancestors = ControlFlowNode::parent_chain(loop_); + for (auto* node : ancestors) { + auto* m = dynamic_cast(node); + if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue; + if (m->schedule_type().value() == "ROCM") { + is_rocm = true; + break; + } + } + const std::string prefix = is_rocm ? "__daisy_hip_thread_idx_" : "__daisy_cuda_thread_idx_"; + auto suffix = [](gpu::GPUDimension d) -> std::string { + switch (d) { + case gpu::GPUDimension::X: + return "x"; + case gpu::GPUDimension::Y: + return "y"; + case gpu::GPUDimension::Z: + return "z"; + } + return "?"; + }; + for (auto* node : ancestors) { + auto* m = dynamic_cast(node); + if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue; + GpuDim gd; + gd.dim = gpu::gpu_dimension(m->schedule_type()); + gd.map_indvar = m->indvar(); + gd.thread_idx = symbolic::symbol(prefix + suffix(gd.dim)); + gd.block_size = gpu::gpu_block_size(m->schedule_type()); + gd.is_per_thread = false; + for (auto& base : tile_info_.bases) { + if (symbolic::uses(base, m->indvar())) { + gd.is_per_thread = true; + break; + } + } + (gd.is_per_thread ? per_thread_dims : coop_dims).push_back(gd); + } + auto by_dim = [](const GpuDim& a, const GpuDim& b) { + return static_cast(a.dim) < static_cast(b.dim); + }; + std::sort(per_thread_dims.begin(), per_thread_dims.end(), by_dim); + std::sort(coop_dims.begin(), coop_dims.end(), by_dim); + + // Ensure within-block thread_idx containers exist. Codegen recognises + // NV_Symbol-typed scalars and substitutes them with threadIdx.{x,y,z} + // (CUDA) or the ROCm equivalent at emission time. + auto ensure_idx = [&](const symbolic::Symbol& sym) { + if (!sdfg.exists(sym->get_name())) { + types::Scalar idx_type(types::PrimitiveType::Int32); + idx_type.storage_type(types::StorageType::NV_Symbol()); + builder.add_container(sym->get_name(), idx_type); + } + }; + for (auto& gd : per_thread_dims) ensure_idx(gd.thread_idx); + for (auto& gd : coop_dims) ensure_idx(gd.thread_idx); } + // Buffer dim sizes: [per-thread block sizes (X, Y, Z canonical order)] ++ + // [varying tile dim sizes (original access-dim order)] + std::vector buf_dim_sizes; + for (auto& gd : per_thread_dims) buf_dim_sizes.push_back(gd.block_size); + for (auto& s : varying_dim_sizes) buf_dim_sizes.push_back(s); + + // Total buffer size (number of scalar slots) + symbolic::Expression total_size = symbolic::integer(1); + for (auto& s : buf_dim_sizes) total_size = symbolic::mul(total_size, s); + + // Per-thread index prefix (each thread's fixed buffer coords) + std::vector per_thread_indices; + for (auto& gd : per_thread_dims) per_thread_indices.push_back(gd.thread_idx); + // Create the local buffer with specified storage type types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size); builder.add_container(local_name_, buffer_type); - // Helper: build linearized local index from per-dimension expressions + // Row-major linearization over buf_dim_sizes (leftmost dim = outermost stride) auto linearize_exprs = [&](const std::vector& indices) -> symbolic::Expression { symbolic::Expression linear_idx = symbolic::integer(0); symbolic::Expression stride = symbolic::integer(1); - for (int i = indices.size() - 1; i >= 0; i--) { + for (int i = static_cast(indices.size()) - 1; i >= 0; i--) { linear_idx = symbolic::add(linear_idx, symbolic::mul(indices[i], stride)); - stride = symbolic::mul(stride, dim_sizes[i]); + stride = symbolic::mul(stride, buf_dim_sizes[i]); } return linear_idx; }; @@ -272,52 +357,60 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A // ============================================================ // GPU COOPERATIVE PATH // ============================================================ - auto ancestors = ControlFlowNode::parent_chain(loop_); + // Each thread owns a fixed slot along per-thread buffer dims and + // strides through the varying-flat range with the other threads + // sharing that slot (i.e. threads in cooperative dims only). - // Collect cooperative GPU dimensions - struct CoopDim { - symbolic::Symbol indvar; - symbolic::Integer block_size; - gpu::GPUDimension dimension; - }; - std::vector coop_dims; - - for (auto* node : ancestors) { - if (auto* ancestor_map = dynamic_cast(node)) { - if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) { - continue; - } - bool appears_in_bases = false; - for (auto& base : tile_info_.bases) { - if (symbolic::uses(base, ancestor_map->indvar())) { - appears_in_bases = true; - break; - } - } - if (!appears_in_bases) { - coop_dims.push_back( - {ancestor_map->indvar(), - gpu::gpu_block_size(ancestor_map->schedule_type()), - gpu::gpu_dimension(ancestor_map->schedule_type())} - ); - } - } - } - - // Compute total cooperative thread count + // Total cooperative-thread count (= 1 if no cooperative dims) symbolic::Expression total_coop_threads = symbolic::integer(1); for (auto& cd : coop_dims) { total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size); } - // Flatten cooperative thread index + // Flat within-block index over cooperative dims only (= 0 if none). + // Row-major: X is least-significant when present. symbolic::Expression coop_flat = symbolic::integer(0); - symbolic::Expression coop_stride = symbolic::integer(1); - for (int i = coop_dims.size() - 1; i >= 0; i--) { - coop_flat = symbolic::add(coop_flat, symbolic::mul(coop_dims[i].indvar, coop_stride)); - coop_stride = symbolic::mul(coop_stride, coop_dims[i].block_size); + { + symbolic::Expression stride = symbolic::integer(1); + for (auto it = coop_dims.rbegin(); it != coop_dims.rend(); ++it) { + coop_flat = symbolic::add(coop_flat, symbolic::mul(it->thread_idx, stride)); + stride = symbolic::mul(stride, it->block_size); + } } + // Varying-flat size = product of tile dim extents (excluding extent==1). + // This is the address range each thread cooperatively walks within its + // per-thread slot. + symbolic::Expression varying_flat_size = symbolic::integer(1); + for (auto& s : varying_dim_sizes) { + varying_flat_size = symbolic::mul(varying_flat_size, s); + } + + // Helper to decompose a flat varying index into per-varying-dim indices + // (row-major), and to build the buffer dest subset (per_thread ++ varying). + auto decompose = [&](const symbolic::Symbol& idx_var) { + std::vector result; + symbolic::Expression remainder = idx_var; + for (size_t i = 0; i < varying_dim_sizes.size(); i++) { + if (i + 1 < varying_dim_sizes.size()) { + symbolic::Expression divisor = symbolic::integer(1); + for (size_t j = i + 1; j < varying_dim_sizes.size(); j++) { + divisor = symbolic::mul(divisor, varying_dim_sizes[j]); + } + result.push_back(symbolic::div(remainder, divisor)); + remainder = symbolic::mod(remainder, divisor); + } else { + result.push_back(remainder); + } + } + return result; + }; + auto buf_subset_for = [&](const std::vector& varying_decomp) -> data_flow::Subset { + std::vector dest_indices = per_thread_indices; + for (auto& v : varying_decomp) dest_indices.push_back(v); + return {linearize_exprs(dest_indices)}; + }; + // INIT: barrier → cooperative copy-in → barrier (if has_read) if (tile_info_.has_read) { // Barrier before init @@ -334,7 +427,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A *parent, loop_, idx_var, - symbolic::Lt(idx_var, total_size), + symbolic::Lt(idx_var, varying_flat_size), coop_flat, symbolic::add(idx_var, total_coop_threads), structured_control_flow::ScheduleType_Sequential::create(), @@ -347,25 +440,11 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A auto& init_dst = builder.add_access(init_block, local_name_); auto& init_tasklet = builder.add_tasklet(init_block, data_flow::TaskletCode::assign, "_out", {"_in"}); - // Decompose idx_var into per-dim indices over varying dims only - std::vector init_indices; - symbolic::Expression remainder = idx_var; - for (size_t i = 0; i < dim_sizes.size(); i++) { - if (i < dim_sizes.size() - 1) { - symbolic::Expression divisor = symbolic::integer(1); - for (size_t j = i + 1; j < dim_sizes.size(); j++) { - divisor = symbolic::mul(divisor, dim_sizes[j]); - } - init_indices.push_back(symbolic::div(remainder, divisor)); - remainder = symbolic::mod(remainder, divisor); - } else { - init_indices.push_back(remainder); - } - } - - auto init_src_subset = build_original_subset(init_indices); + auto init_decomp = decompose(idx_var); + auto init_src_subset = build_original_subset(init_decomp); + auto init_dst_subset = buf_subset_for(init_decomp); builder.add_computational_memlet(init_block, init_src, init_tasklet, "_in", init_src_subset, pointer_type); - builder.add_computational_memlet(init_block, init_tasklet, "_out", init_dst, {idx_var}, buffer_type); + builder.add_computational_memlet(init_block, init_tasklet, "_out", init_dst, init_dst_subset, buffer_type); // Barrier after init auto& barrier_block2 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info()); @@ -388,7 +467,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A *parent, loop_, idx_var, - symbolic::Lt(idx_var, total_size), + symbolic::Lt(idx_var, varying_flat_size), coop_flat, symbolic::add(idx_var, total_coop_threads), structured_control_flow::ScheduleType_Sequential::create(), @@ -401,24 +480,10 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A auto& wb_dst = builder.add_access(wb_block, this->container_); auto& wb_tasklet = builder.add_tasklet(wb_block, data_flow::TaskletCode::assign, "_out", {"_in"}); - // Decompose idx_var into per-dim indices over varying dims only - std::vector wb_indices; - symbolic::Expression remainder = idx_var; - for (size_t i = 0; i < dim_sizes.size(); i++) { - if (i < dim_sizes.size() - 1) { - symbolic::Expression divisor = symbolic::integer(1); - for (size_t j = i + 1; j < dim_sizes.size(); j++) { - divisor = symbolic::mul(divisor, dim_sizes[j]); - } - wb_indices.push_back(symbolic::div(remainder, divisor)); - remainder = symbolic::mod(remainder, divisor); - } else { - wb_indices.push_back(remainder); - } - } - - auto wb_dst_subset = build_original_subset(wb_indices); - builder.add_computational_memlet(wb_block, wb_src, wb_tasklet, "_in", {idx_var}, buffer_type); + auto wb_decomp = decompose(idx_var); + auto wb_src_subset = buf_subset_for(wb_decomp); + auto wb_dst_subset = build_original_subset(wb_decomp); + builder.add_computational_memlet(wb_block, wb_src, wb_tasklet, "_in", wb_src_subset, buffer_type); builder.add_computational_memlet(wb_block, wb_tasklet, "_out", wb_dst, wb_dst_subset, pointer_type); // Barrier after writeback @@ -443,7 +508,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A init_indvars.push_back(indvar); auto init = symbolic::integer(0); - auto condition = symbolic::Lt(indvar, dim_sizes[i]); + auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]); auto update = symbolic::add(indvar, symbolic::integer(1)); auto& init_loop = builder.add_map( @@ -488,7 +553,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A wb_indvars.push_back(indvar); auto init = symbolic::integer(0); - auto condition = symbolic::Lt(indvar, dim_sizes[i]); + auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]); auto update = symbolic::add(indvar, symbolic::integer(1)); auto& wb_loop = builder.add_map( @@ -556,7 +621,8 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A } auto* acc = mla.access(memlet); if (acc && acc->subset.size() == tile_info_.dimensions.size()) { - std::vector local_indices; + // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]] + std::vector local_indices = per_thread_indices; for (size_t d = 0; d < tile_info_.dimensions.size(); d++) { if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) { local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d))); @@ -580,7 +646,8 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A } auto* acc = mla.access(memlet); if (acc && acc->subset.size() == tile_info_.dimensions.size()) { - std::vector local_indices; + // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]] + std::vector local_indices = per_thread_indices; for (size_t d = 0; d < tile_info_.dimensions.size(); d++) { if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) { local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d))); diff --git a/opt/tests/transformations/in_local_storage_test.cpp b/opt/tests/transformations/in_local_storage_test.cpp index f0eecd5b..a46e0dd6 100644 --- a/opt/tests/transformations/in_local_storage_test.cpp +++ b/opt/tests/transformations/in_local_storage_test.cpp @@ -2289,8 +2289,9 @@ TEST(InLocalStorageTest, GPU_Cooperative_SymbolicBounds) { EXPECT_EQ(buf_type.storage_type(), types::StorageType::NV_Shared()); auto& arr_type = static_cast(buf_type); - // Extent M resolved to 8 from GPU Y-dim block_size - EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(8))); + // Per-thread X dim contributes BX=32 slots; varying dim (extent M→8) contributes 8. + // Total = 32 * 8 = 256. + EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(256))); // Verify structure: [barrier, copy_loop, barrier, main_loop] auto& map_y_body = map_y.root(); diff --git a/opt/tests/transformations/out_local_storage_test.cpp b/opt/tests/transformations/out_local_storage_test.cpp index 9a5a2c2f..44d00513 100644 --- a/opt/tests/transformations/out_local_storage_test.cpp +++ b/opt/tests/transformations/out_local_storage_test.cpp @@ -2476,7 +2476,9 @@ TEST(OutLocalStorageTest, GPU_Cooperative_FlatPointer) { auto& buf_type = builder_opt.subject().type("__daisy_out_local_storage_C0"); EXPECT_EQ(buf_type.storage_type(), types::StorageType::NV_Shared()); auto& arr_type = static_cast(buf_type); - EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(8))); + // Per-thread Y dim (j in C base) contributes BY=8 slots; varying dim (M→8) contributes 8. + // Total = 8 * 8 = 64. + EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(64))); // Verify structure: write-only → [main_loop, barrier, writeback_loop, barrier] auto& map_y_body = map_y.root(); @@ -2595,7 +2597,9 @@ TEST(OutLocalStorageTest, GPU_Cooperative_ReadWrite) { EXPECT_EQ(buf_type.type_id(), types::TypeID::Array); auto& arr_type = static_cast(buf_type); - EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(32))); + // Per-thread Y dim (j in C base) contributes BY=8 slots; varying dim (N→32) contributes 32. + // Total = 8 * 32 = 256. + EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(256))); // Verify structure: has_read → [barrier, init_copy, barrier, main_loop, barrier, writeback, barrier] auto& map_y_body = map_y.root(); @@ -2880,6 +2884,8 @@ TEST(OutLocalStorageTest, GPU_Cooperative_SymbolicBounds) { EXPECT_TRUE(builder_opt.subject().exists("__daisy_out_local_storage_C0")); auto& buf_type = builder_opt.subject().type("__daisy_out_local_storage_C0"); auto& arr_type = static_cast(buf_type); - EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(8))); + // Per-thread X dim (i in C base) contributes BX=32 slots; varying dim (M→8) contributes 8. + // Total = 32 * 8 = 256. + EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(256))); EXPECT_EQ(buf_type.storage_type(), types::StorageType::NV_Shared()); } From 154d1c60991d116ba319d4dc29ba0a903282cd2e Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sun, 14 Jun 2026 14:47:16 +0200 Subject: [PATCH 2/4] moves optimizations tests --- opt/tests/CMakeLists.txt | 4 ++-- .../{transformations => }/optimizations/blocking_test.cpp | 0 .../optimizations/diamond_tiling_test.cpp | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename opt/tests/{transformations => }/optimizations/blocking_test.cpp (100%) rename opt/tests/{transformations => }/optimizations/diamond_tiling_test.cpp (100%) diff --git a/opt/tests/CMakeLists.txt b/opt/tests/CMakeLists.txt index 907a2775..4f5e2ad2 100644 --- a/opt/tests/CMakeLists.txt +++ b/opt/tests/CMakeLists.txt @@ -31,6 +31,8 @@ set(TEST_FILES transformations/offloading/rocblas_data_transfer_extraction_test.cpp transformations/offloading/cuda_stdlib_data_transfer_extraction_test.cpp transformations/offloading/rocm_stdlib_data_transfer_extraction_test.cpp + optimizations/diamond_tiling_test.cpp + optimizations/blocking_test.cpp passes/offloading/cuda_library_node_transfer_extraction_pass_test.cpp passes/offloading/cuda_library_node_expand_tests.cpp passes/offloading/rocm_library_node_transfer_extraction_pass_test.cpp @@ -64,8 +66,6 @@ set(TEST_FILES transformations/in_local_storage_test.cpp transformations/recorder_test.cpp transformations/vectorize_transform_test.cpp - transformations/optimizations/diamond_tiling_test.cpp - transformations/optimizations/blocking_test.cpp transformations/transformation_serialization_test.cpp test.cpp ) diff --git a/opt/tests/transformations/optimizations/blocking_test.cpp b/opt/tests/optimizations/blocking_test.cpp similarity index 100% rename from opt/tests/transformations/optimizations/blocking_test.cpp rename to opt/tests/optimizations/blocking_test.cpp diff --git a/opt/tests/transformations/optimizations/diamond_tiling_test.cpp b/opt/tests/optimizations/diamond_tiling_test.cpp similarity index 100% rename from opt/tests/transformations/optimizations/diamond_tiling_test.cpp rename to opt/tests/optimizations/diamond_tiling_test.cpp From 4aa1204ca2443f7c9469745805a1d5ba16e401a0 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sun, 14 Jun 2026 15:31:22 +0200 Subject: [PATCH 3/4] deprecates compound GPU transformations in favor of more searchable simple transformations --- .../sdfg/passes/offloading/gpu_tiling_pass.h | 16 +- .../transformations/offloading/gpu_tiling.h | 22 +- .../offloading/kernel_local_storage.h | 16 +- opt/tests/CMakeLists.txt | 1 + opt/tests/optimizations/gpu_kernels_test.cpp | 603 ++++++++++++++++++ 5 files changed, 654 insertions(+), 4 deletions(-) create mode 100644 opt/tests/optimizations/gpu_kernels_test.cpp diff --git a/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h b/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h index aa8e374a..cc56e586 100644 --- a/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h +++ b/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h @@ -8,14 +8,26 @@ namespace sdfg { namespace passes { /** - * @brief Phased GPU tiling pass. + * @brief [DEPRECATED] Phased GPU tiling pass. + * + * Drives the legacy `transformations::GPUTiling` + + * `transformations::KernelLocalStorage` pair. Both are deprecated. New code + * should compose `LoopTiling`, `CUDAParallelizeNestedMap` / `cuda::CUDATransform`, + * `InLocalStorage` / `OutLocalStorage` (with `NV_Shared`), and + * `passes::SyncConditionPropagation` directly — see + * `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example. + * + * Retained for the existing CUDA and ROCm schedulers that have not yet been + * migrated. * * Given a set of outer maps, finds all descendant structured loops and applies * GPU tiling in two phases: * 1. can_be_applied phase: collects all loops where tiling is applicable * 2. apply phase: applies tiling to all collected loops */ -class GPUTilingPass : public Pass { +class [[deprecated( + "Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp." +)]] GPUTilingPass : public Pass { private: const std::vector& maps_; size_t tile_size_; diff --git a/opt/include/sdfg/transformations/offloading/gpu_tiling.h b/opt/include/sdfg/transformations/offloading/gpu_tiling.h index 0a8089ea..daabec2b 100644 --- a/opt/include/sdfg/transformations/offloading/gpu_tiling.h +++ b/opt/include/sdfg/transformations/offloading/gpu_tiling.h @@ -6,7 +6,27 @@ namespace sdfg { namespace transformations { -class GPUTiling : public Transformation { +/** + * @brief [DEPRECATED] Monolithic GPU tiling transformation. + * + * Prefer the composable pipeline instead: + * 1. transformations::LoopTiling — strip-mine the target loop + * 2. transformations::CUDAParallelizeNestedMap / cuda::CUDATransform + * — assign GPU schedules + * 3. transformations::InLocalStorage (NV_Shared) — stage read tiles + * transformations::OutLocalStorage (NV_Shared) — stage write tiles + * 4. passes::SyncConditionPropagation — guard out-of-bounds threads + * + * See `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked + * example (GEMM). KernelLocalStorage and GPUTilingPass are deprecated for + * the same reason. + * + * The legacy transformation is retained for autotuning search spaces and + * existing schedulers that have not yet been migrated. + */ +class [[deprecated( + "Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp." +)]] GPUTiling : public Transformation { structured_control_flow::StructuredLoop& loop_; size_t size_; bool applied_ = false; diff --git a/opt/include/sdfg/transformations/offloading/kernel_local_storage.h b/opt/include/sdfg/transformations/offloading/kernel_local_storage.h index a009f3a7..8faa8db9 100644 --- a/opt/include/sdfg/transformations/offloading/kernel_local_storage.h +++ b/opt/include/sdfg/transformations/offloading/kernel_local_storage.h @@ -9,7 +9,21 @@ namespace sdfg { namespace transformations { -class KernelLocalStorage : public Transformation { +/** + * @brief [DEPRECATED] Monolithic shared-memory staging transformation. + * + * Prefer `transformations::InLocalStorage` (for read tiles) and + * `transformations::OutLocalStorage` (for write tiles) with the + * `types::StorageType::NV_Shared()` storage. Those transformations are + * composable with `LoopTiling`, `CUDATransform` / `CUDAParallelizeNestedMap`, + * and `passes::SyncConditionPropagation`. See + * `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example. + * + * The legacy transformation is retained for autotuning search spaces and + * existing schedulers that have not yet been migrated. + */ +class [[deprecated("Use InLocalStorage / OutLocalStorage with NV_Shared. See gpu_kernels_test.cpp." +)]] KernelLocalStorage : public Transformation { private: structured_control_flow::StructuredLoop& loop_; symbolic::Expression offset_; diff --git a/opt/tests/CMakeLists.txt b/opt/tests/CMakeLists.txt index 4f5e2ad2..35c7f0da 100644 --- a/opt/tests/CMakeLists.txt +++ b/opt/tests/CMakeLists.txt @@ -33,6 +33,7 @@ set(TEST_FILES transformations/offloading/rocm_stdlib_data_transfer_extraction_test.cpp optimizations/diamond_tiling_test.cpp optimizations/blocking_test.cpp + optimizations/gpu_kernels_test.cpp passes/offloading/cuda_library_node_transfer_extraction_pass_test.cpp passes/offloading/cuda_library_node_expand_tests.cpp passes/offloading/rocm_library_node_transfer_extraction_pass_test.cpp diff --git a/opt/tests/optimizations/gpu_kernels_test.cpp b/opt/tests/optimizations/gpu_kernels_test.cpp new file mode 100644 index 00000000..7f081aef --- /dev/null +++ b/opt/tests/optimizations/gpu_kernels_test.cpp @@ -0,0 +1,603 @@ +// ============================================================================= +// GPU Kernel Tests +// ============================================================================= +// +// The pipeline demonstrated here is the canonical way to bring a sequential, +// flat-pointer matmul kernel onto the GPU: +// +// sequential Map(i) Map(j) For(k) // host-side scalar kernel +// | +// | (1) LoopInterchange(i, j) // coalesce X-dim along j +// v +// Map(j) Map(i) For(k) +// | +// | (2) cuda::CUDATransform(map_j, BX=32) // offload to CUDA, X-dim +// | (3) CUDAParallelizeNestedMap(map_i, BY) // nested Y-dim +// v +// Map_X(j, BX=32) Map_Y(i, BY=8) For(k) // kernel skeleton +// | +// | (4) LoopTiling(for_k, TK=8) // strip-mine k +// v +// Map_X Map_Y For(kk, step=8) For(k_in) +// | +// | (5) InLocalStorage(for_k_inner, A, NV_Shared) // stage A tile in SMEM +// | (6) InLocalStorage(for_k_inner, B, NV_Shared) // stage B tile in SMEM +// | (7) OutLocalStorage(for_kk, C) // promote C to register +// v +// Map_X Map_Y { C_reg = C[..], +// for(kk) { +// barriers + coop copy_A, +// barriers + coop copy_B, +// for(k_in) [fma reads A_local, B_local; accumulates C_reg] +// }, +// C[..] = C_reg } +// +// ============================================================================= + +#include + +#include +#include +#include + +#include "sdfg/analysis/analysis.h" +#include "sdfg/analysis/users.h" +#include "sdfg/builder/structured_sdfg_builder.h" +#include "sdfg/data_flow/access_node.h" +#include "sdfg/data_flow/library_node.h" +#include "sdfg/data_flow/library_nodes/barrier_local_node.h" +#include "sdfg/data_flow/tasklet.h" +#include "sdfg/passes/offloading/sync_condition_propagation.h" +#include "sdfg/passes/structured_control_flow/dead_cfg_elimination.h" +#include "sdfg/passes/structured_control_flow/sequence_fusion.h" +#include "sdfg/structured_control_flow/block.h" +#include "sdfg/structured_control_flow/for.h" +#include "sdfg/structured_control_flow/if_else.h" +#include "sdfg/structured_control_flow/map.h" +#include "sdfg/structured_control_flow/sequence.h" +#include "sdfg/structured_control_flow/structured_loop.h" +#include "sdfg/structured_control_flow/while.h" +#include "sdfg/symbolic/symbolic.h" +#include "sdfg/targets/cuda/cuda.h" +#include "sdfg/targets/gpu/gpu_schedule_type.h" +#include "sdfg/transformations/in_local_storage.h" +#include "sdfg/transformations/loop_interchange.h" +#include "sdfg/transformations/loop_tiling.h" +#include "sdfg/transformations/offloading/cuda_parallelize_nested_map.h" +#include "sdfg/transformations/offloading/cuda_transform.h" +#include "sdfg/transformations/out_local_storage.h" +#include "sdfg/types/array.h" +#include "sdfg/types/pointer.h" +#include "sdfg/types/scalar.h" + +using namespace sdfg; + +namespace { + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +void cleanup(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& am) { + passes::SequenceFusion sequence_fusion; + passes::DeadCFGElimination dead_cfg; + bool applies; + do { + applies = false; + applies |= dead_cfg.run(builder, am); + applies |= sequence_fusion.run(builder, am); + } while (applies); +} + +// Find an access node for any container whose name matches `predicate`, +// reachable from `node` (recursively). Used to look up access nodes after +// CUDATransform renames pointer arguments (e.g. A -> __daisy_cuda__A). +data_flow::AccessNode* find_access_by_suffix(structured_control_flow::ControlFlowNode& node, const std::string& suffix) { + // Walk the node recursively. We only need a single match - first wins. + std::function walk = + [&](structured_control_flow::ControlFlowNode& n) -> data_flow::AccessNode* { + if (auto* block = dynamic_cast(&n)) { + for (auto& dn : block->dataflow().nodes()) { + if (auto* an = dynamic_cast(&dn)) { + const auto& data = an->data(); + if (data.size() >= suffix.size() && + data.compare(data.size() - suffix.size(), suffix.size(), suffix) == 0) { + return an; + } + } + } + } else if (auto* seq = dynamic_cast(&n)) { + for (size_t i = 0; i < seq->size(); ++i) { + if (auto* hit = walk(seq->at(i).first)) { + return hit; + } + } + } else if (auto* ifelse = dynamic_cast(&n)) { + for (size_t i = 0; i < ifelse->size(); ++i) { + if (auto* hit = walk(ifelse->at(i).first)) { + return hit; + } + } + } else if (auto* loop_node = dynamic_cast(&n)) { + if (auto* hit = walk(loop_node->root())) { + return hit; + } + } + return nullptr; + }; + // For a StructuredLoop, walk its body; for everything else, walk the node + // itself (Sequence/Block/IfElse search recursively). + if (auto* loop = dynamic_cast(&node)) { + return walk(loop->root()); + } + return walk(node); +} + +// Find the unique container whose name starts with `prefix`. +std::string find_container_by_prefix(const sdfg::Function& fn, const std::string& prefix) { + for (auto& name : fn.containers()) { + if (name.size() >= prefix.size() && name.compare(0, prefix.size(), prefix) == 0) { + return name; + } + } + return {}; +} + +// Find the unique container name ending in `suffix` (e.g. "_A0" for the SMEM +// buffer that ILS created for the A pointer argument). +std::string find_container_by_suffix(const sdfg::Function& fn, const std::string& suffix) { + for (auto& name : fn.containers()) { + if (name.size() >= suffix.size() && name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0) { + return name; + } + } + return {}; +} + +// Find the first For loop under `seq`. +structured_control_flow::For* find_first_for(structured_control_flow::Sequence& seq) { + for (size_t i = 0; i < seq.size(); ++i) { + if (auto* f = dynamic_cast(&seq.at(i).first)) { + return f; + } + } + return nullptr; +} + +// Find the first Map under `seq`. +structured_control_flow::Map* find_first_map(structured_control_flow::Sequence& seq) { + for (size_t i = 0; i < seq.size(); ++i) { + if (auto* m = dynamic_cast(&seq.at(i).first)) { + return m; + } + } + return nullptr; +} + +// --------------------------------------------------------------------------- +// Matmul fixture: flat-pointer, linearized accesses, fixed bounds +// --------------------------------------------------------------------------- +// +// Builds: +// for (i = 0; i < M; ++i) +// for (j = 0; j < N; ++j) +// for (k = 0; k < K; ++k) +// C[i*N+j] = A[i*K+k] * B[k*N+j] + C[i*N+j] // fma +// +// Dimensions are integer constants so the argument-size analysis used by +// CUDATransform can succeed without `allow_dynamic_sizes=true`. + +struct MatmulFixture { + static constexpr long M = 128; + static constexpr long N = 128; + static constexpr long K = 64; + + std::unique_ptr builder; + structured_control_flow::Map* for_i = nullptr; + structured_control_flow::Map* for_j = nullptr; + structured_control_flow::For* for_k = nullptr; + + void build() { + builder = std::make_unique("gpu_kernel_gemm", FunctionType_CPU); + + types::Scalar idx_desc(types::PrimitiveType::Int64); + types::Scalar elem_desc(types::PrimitiveType::Double); + types::Pointer ptr_desc(elem_desc); + + builder->add_container("i", idx_desc); + builder->add_container("j", idx_desc); + builder->add_container("k", idx_desc); + builder->add_container("A", ptr_desc, /*is_argument=*/true); + builder->add_container("B", ptr_desc, /*is_argument=*/true); + builder->add_container("C", ptr_desc, /*is_argument=*/true); + + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + auto k = symbolic::symbol("k"); + auto M_e = symbolic::integer(M); + auto N_e = symbolic::integer(N); + auto K_e = symbolic::integer(K); + + auto& root = builder->subject().root(); + + for_i = &builder->add_map( + root, + i, + symbolic::Lt(i, M_e), + symbolic::integer(0), + symbolic::add(i, symbolic::integer(1)), + structured_control_flow::ScheduleType_Sequential::create() + ); + for_j = &builder->add_map( + for_i->root(), + j, + symbolic::Lt(j, N_e), + symbolic::integer(0), + symbolic::add(j, symbolic::integer(1)), + structured_control_flow::ScheduleType_Sequential::create() + ); + for_k = &builder->add_for( + for_j->root(), k, symbolic::Lt(k, K_e), symbolic::integer(0), symbolic::add(k, symbolic::integer(1)) + ); + + // fma: C[i*N+j] = A[i*K+k] * B[k*N+j] + C[i*N+j] + auto& block = builder->add_block(for_k->root()); + auto& a_in = builder->add_access(block, "A"); + auto& b_in = builder->add_access(block, "B"); + auto& c_in = builder->add_access(block, "C"); + auto& c_out = builder->add_access(block, "C"); + + auto& tasklet = builder->add_tasklet(block, data_flow::TaskletCode::fp_fma, "_out", {"_in1", "_in2", "_in3"}); + + builder + ->add_computational_memlet(block, a_in, tasklet, "_in1", {symbolic::add(symbolic::mul(i, K_e), k)}, ptr_desc); + builder + ->add_computational_memlet(block, b_in, tasklet, "_in2", {symbolic::add(symbolic::mul(k, N_e), j)}, ptr_desc); + builder + ->add_computational_memlet(block, c_in, tasklet, "_in3", {symbolic::add(symbolic::mul(i, N_e), j)}, ptr_desc); + builder + ->add_computational_memlet(block, tasklet, "_out", c_out, {symbolic::add(symbolic::mul(i, N_e), j)}, ptr_desc); + } +}; + +} // namespace + +// ============================================================================= +// Test +// ============================================================================= + +TEST(GPUKernelTest, GEMM_CudaTilingILS) { + constexpr int BX = 32; // X-dim block size (warp width, coalesced along j) + constexpr int BY = 8; // Y-dim block size + constexpr int TK = 8; // K-tile size + + MatmulFixture fix; + fix.build(); + auto& builder = *fix.builder; + analysis::AnalysisManager am(builder.subject()); + + // ------------------------------------------------------------------------- + // (1) Loop interchange: i <-> j so j becomes outermost. + // This makes the j-dim the X-dim (coalesced along the fast axis of + // row-major C/B) once we offload. + // ------------------------------------------------------------------------- + { + transformations::LoopInterchange interchange(*fix.for_i, *fix.for_j); + ASSERT_TRUE(interchange.can_be_applied(builder, am)) << "LoopInterchange(i, j) must apply"; + interchange.apply(builder, am); + am.invalidate_all(); + } + + // After interchange the new outer loop body is the (former) i loop, and + // its body still contains the unchanged k loop. Re-find references. + auto* for_j_outer = find_first_map(builder.subject().root()); + ASSERT_NE(for_j_outer, nullptr); + EXPECT_EQ(for_j_outer->indvar()->get_name(), "j"); + + auto* for_i_inner = find_first_map(for_j_outer->root()); + ASSERT_NE(for_i_inner, nullptr); + EXPECT_EQ(for_i_inner->indvar()->get_name(), "i"); + + auto* for_k_inner = find_first_for(for_i_inner->root()); + ASSERT_NE(for_k_inner, nullptr); + EXPECT_EQ(for_k_inner->indvar()->get_name(), "k"); + + // ------------------------------------------------------------------------- + // (2) CUDATransform on Map(j): X-dim CUDA kernel, BX=32. + // Also inserts H2D/D2H blocks around the kernel and renames pointer + // arguments A, B, C -> __daisy_cuda__{A,B,C}. + // ------------------------------------------------------------------------- + { + cuda::CUDATransform offload(*for_j_outer, /*block_size=*/BX); + ASSERT_TRUE(offload.can_be_applied(builder, am)) << "CUDATransform on Map(j) must apply"; + offload.apply(builder, am); + am.invalidate_all(); + } + + // for_j_outer is unchanged in identity but the schedule type was updated. + EXPECT_EQ(for_j_outer->schedule_type().value(), cuda::ScheduleType_CUDA::value()); + EXPECT_EQ(cuda::ScheduleType_CUDA::dimension(for_j_outer->schedule_type()), cuda::CUDADimension::X); + EXPECT_TRUE(symbolic::eq(cuda::ScheduleType_CUDA::block_size(for_j_outer->schedule_type()), symbolic::integer(BX))); + + // ------------------------------------------------------------------------- + // (3) Promote Map(i) to the Y dimension (BY=8). + // ------------------------------------------------------------------------- + { + transformations::CUDAParallelizeNestedMap parallelize(*for_i_inner, /*block_size=*/BY); + ASSERT_TRUE(parallelize.can_be_applied(builder, am)) << "CUDAParallelizeNestedMap on Map(i) must apply"; + parallelize.apply(builder, am); + am.invalidate_all(); + } + + EXPECT_EQ(for_i_inner->schedule_type().value(), cuda::ScheduleType_CUDA::value()); + EXPECT_EQ(cuda::ScheduleType_CUDA::dimension(for_i_inner->schedule_type()), cuda::CUDADimension::Y); + EXPECT_TRUE(symbolic::eq(cuda::ScheduleType_CUDA::block_size(for_i_inner->schedule_type()), symbolic::integer(BY))); + + // ------------------------------------------------------------------------- + // (4) Strip-mine the k loop: For(k) -> For(k_tile, step=TK) For(k). + // ------------------------------------------------------------------------- + auto* for_k_in_kernel = find_first_for(for_i_inner->root()); + ASSERT_NE(for_k_in_kernel, nullptr); + EXPECT_EQ(for_k_in_kernel->indvar()->get_name(), "k"); + + structured_control_flow::StructuredLoop* for_kk = nullptr; + { + transformations::LoopTiling tile(*for_k_in_kernel, /*tile_size=*/TK); + ASSERT_TRUE(tile.can_be_applied(builder, am)) << "LoopTiling on for_k must apply"; + tile.apply(builder, am); + for_kk = tile.outer_loop(); + ASSERT_NE(for_kk, nullptr); + am.invalidate_all(); + } + + // for_kk should now be the only direct loop child of for_i_inner->root(). + // LoopTiling uses find_new_name for the outer indvar, so the actual name + // includes a disambiguating suffix ("k_tile0"). + { + const std::string indvar = for_kk->indvar()->get_name(); + EXPECT_EQ(indvar.compare(0, std::strlen("k_tile"), "k_tile"), 0) << "Unexpected k_tile indvar: " << indvar; + } + + // ------------------------------------------------------------------------- + // (5) Stage A tile in SMEM. ILS is applied to the *inner* k-loop (post + // tiling), so the cooperative load lands BETWEEN for_kk and the inner + // for_k. Each for_kk iteration loads exactly TK elements per i thread. + // After CUDATransform, A is renamed - look it up by suffix. + // ------------------------------------------------------------------------- + auto* for_k_inner_tiled = find_first_for(for_kk->root()); + ASSERT_NE(for_k_inner_tiled, nullptr); + EXPECT_EQ(for_k_inner_tiled->indvar()->get_name(), "k"); + + { + auto* a_access = find_access_by_suffix(*for_k_inner_tiled, "_A"); + ASSERT_NE(a_access, nullptr) << "Renamed A access not found under for_k_inner_tiled"; + + transformations::InLocalStorage ils_a(*for_k_inner_tiled, *a_access, types::StorageType::NV_Shared()); + ASSERT_TRUE(ils_a.can_be_applied(builder, am)) << "ILS on A must apply"; + ils_a.apply(builder, am); + am.invalidate_all(); + } + + // ------------------------------------------------------------------------- + // (6) Stage B tile in SMEM. + // ------------------------------------------------------------------------- + { + auto* b_access = find_access_by_suffix(*for_k_inner_tiled, "_B"); + ASSERT_NE(b_access, nullptr) << "Renamed B access not found under for_k_inner_tiled"; + + transformations::InLocalStorage ils_b(*for_k_inner_tiled, *b_access, types::StorageType::NV_Shared()); + ASSERT_TRUE(ils_b.can_be_applied(builder, am)) << "ILS on B must apply"; + ils_b.apply(builder, am); + am.invalidate_all(); + } + + // ------------------------------------------------------------------------- + // (7) Promote C to a per-thread register accumulator. + // + // OLS applied at the for_kk scope hoists the C load OUT of for_kk and + // the writeback AFTER for_kk. Each thread sees C[i*N + j] (a single + // scalar slot per thread), so the tile collapses to a 1-element local + // and the entire k-traversal accumulates in a register. + // + // This cuts GMEM traffic on C from 2*M*N*(K/TK) accesses to just 2*M*N + // (one load + one store per output element), and is the prerequisite + // for any further inner-loop optimization (unrolling, vectorization, + // tensor-core MMA), since the inner k loop is now a pure FMA chain + // with no DRAM round-trip on the accumulator. + // + // Default storage = CPU_Stack: on GPU codegen this lowers to a + // per-thread local (register or stack slot, at the compiler's + // discretion). + // ------------------------------------------------------------------------- + { + auto* c_access = find_access_by_suffix(*for_kk, "_C"); + ASSERT_NE(c_access, nullptr) << "Renamed C access not found under for_kk"; + + transformations::OutLocalStorage ols_c(*for_kk, *c_access); + ASSERT_TRUE(ols_c.can_be_applied(builder, am)) << "OLS on C must apply"; + ols_c.apply(builder, am); + am.invalidate_all(); + } + + // ------------------------------------------------------------------------- + // (8) Propagate the grid-condition through the kernel so that out-of-bounds + // threads still hit every barrier but skip non-barrier work. For + // perfectly divisible bounds (M%BY == 0, N%BX == 0) this is a no-op + // w.r.t. semantics, but it sets the `nested_sync` schedule property + // and is part of the canonical pipeline. + // ------------------------------------------------------------------------- + { + passes::SyncConditionPropagation sync_prop; + sync_prop.run_pass(builder, am); + am.invalidate_all(); + } + + cleanup(builder, am); + + // ------------------------------------------------------------------------- + // Verification: SMEM buffers exist with correct sizes. + // A_local: per-thread on Y (BY) x varying TK = 8 * 8 = 64 elements + // B_local: per-thread on X (BX) x varying TK = 32 * 8 = 256 elements + // + // ILS names its buffer `__daisy_in_local_storage_` (where + // is the renamed device arg, e.g. `__daisy_cuda_0_A`). The + // cooperative copy indvar uses a similar suffix (`__daisy_ils_coop_...`), + // so we look up by the buffer prefix to disambiguate. + // ------------------------------------------------------------------------- + auto a_local = find_container_by_prefix(builder.subject(), "__daisy_in_local_storage_"); + ASSERT_FALSE(a_local.empty()) << "First SMEM container not created"; + std::string b_local; + for (auto& name : builder.subject().containers()) { + if (name == a_local) continue; + if (name.compare(0, std::strlen("__daisy_in_local_storage_"), "__daisy_in_local_storage_") == 0) { + b_local = name; + break; + } + } + ASSERT_FALSE(b_local.empty()) << "Second SMEM container not created"; + + // Pin A vs B by the container suffix (which carries the original arg name). + if (a_local.find("_B") != std::string::npos) { + std::swap(a_local, b_local); + } + EXPECT_NE(a_local.find("_A"), std::string::npos) << "A buffer name: " << a_local; + EXPECT_NE(b_local.find("_B"), std::string::npos) << "B buffer name: " << b_local; + + const auto& a_type = builder.subject().type(a_local); + const auto& b_type = builder.subject().type(b_local); + + EXPECT_EQ(a_type.storage_type(), types::StorageType::NV_Shared()); + EXPECT_EQ(b_type.storage_type(), types::StorageType::NV_Shared()); + + const auto* a_arr = dynamic_cast(&a_type); + const auto* b_arr = dynamic_cast(&b_type); + + ASSERT_NE(a_arr, nullptr) << "A_local is not an Array"; + ASSERT_NE(b_arr, nullptr) << "B_local is not an Array"; + + EXPECT_TRUE(symbolic::eq(a_arr->num_elements(), symbolic::integer(BY * TK))) + << "A_local size mismatch: got " << a_arr->num_elements()->__str__() << ", expected " << (BY * TK); + EXPECT_TRUE(symbolic::eq(b_arr->num_elements(), symbolic::integer(BX * TK))) + << "B_local size mismatch: got " << b_arr->num_elements()->__str__() << ", expected " << (BX * TK); + + // ------------------------------------------------------------------------- + // Verification: C register accumulator exists as a 1-element per-thread + // local (CPU_Stack on GPU = register/local slot). OLS names its buffer + // `__daisy_out_local_storage_`. + // ------------------------------------------------------------------------- + auto c_local = find_container_by_prefix(builder.subject(), "__daisy_out_local_storage_"); + ASSERT_FALSE(c_local.empty()) << "C register-accumulator container not created"; + EXPECT_NE(c_local.find("_C"), std::string::npos) << "OLS buffer name: " << c_local; + + const auto& c_type = builder.subject().type(c_local); + EXPECT_EQ(c_type.storage_type(), types::StorageType::CPU_Stack()) + << "C accumulator should be a per-thread local (CPU_Stack), got " << c_type.storage_type().value(); + + const auto* c_arr = dynamic_cast(&c_type); + ASSERT_NE(c_arr, nullptr) << "C_local is not an Array"; + EXPECT_TRUE(symbolic::eq(c_arr->num_elements(), symbolic::integer(1))) + << "C_local should be a single-element register accumulator, got " << c_arr->num_elements()->__str__(); + + // ------------------------------------------------------------------------- + // Loop structure: map_j(X, BX) > map_i(Y, BY) > for_kk > [prologue + // blocks/maps..., for_k_inner > fma_block]. ILS was applied to the inner + // for_k, so the cooperative copy + barriers live INSIDE for_kk's body, + // ahead of the inner k loop. map_i_body should still hold just for_kk as + // its only meaningful child. + // ------------------------------------------------------------------------- + auto& map_i_body = for_i_inner->root(); + + // Find the (now tiled) outer kk loop under map_i_body. After ILS+cleanup, + // it should be the only For directly under map_i_body. + structured_control_flow::For* kk_loop = nullptr; + for (size_t i = 0; i < map_i_body.size(); ++i) { + if (auto* f = dynamic_cast(&map_i_body.at(i).first)) { + const std::string indvar = f->indvar()->get_name(); + if (indvar.compare(0, std::strlen("k_tile"), "k_tile") == 0) { + kk_loop = f; + break; + } + } + } + ASSERT_NE(kk_loop, nullptr) << "k_tile loop not found under map_i_body"; + + auto& kk_body = kk_loop->root(); + EXPECT_GE(kk_body.size(), 4u) << "for_kk body should hold prologue + inner k loop"; + + // SyncConditionPropagation wraps each non-barrier child of a GPU Map body + // in an `if (map.condition()) { ... }` guard. The pass runs once per GPU + // ancestor (Map(j) X, then Map(i) Y), so each non-barrier child of for_kk + // ends up wrapped in TWO nested single-case IfElse nodes. Unwrap to a + // fixpoint. + auto unwrap_if_else = [](structured_control_flow::ControlFlowNode* node + ) -> structured_control_flow::ControlFlowNode* { + while (auto* ie = dynamic_cast(node)) { + if (ie->size() != 1) return nullptr; + auto& inner = ie->at(0).first; + if (inner.size() != 1) return nullptr; + node = &inner.at(0).first; + } + return node; + }; + + // Last child of kk_body must be (a wrapper around) the inner k loop. + auto* last_raw = &kk_body.at(kk_body.size() - 1).first; + auto* last_unwrapped = unwrap_if_else(last_raw); + ASSERT_NE(last_unwrapped, nullptr) << "last kk_body child does not unwrap to a single inner node"; + + auto* last_child = dynamic_cast(last_unwrapped); + ASSERT_NE(last_child, nullptr) << "last unwrapped child is not a For"; + EXPECT_EQ(last_child->indvar()->get_name(), "k"); + + // The fma body references the SMEM buffers (no longer the renamed device + // pointers for A/B) and the C register accumulator (no longer the C device + // pointer). + auto* a_in_body = find_access_by_suffix(*last_child, a_local); + auto* b_in_body = find_access_by_suffix(*last_child, b_local); + EXPECT_NE(a_in_body, nullptr) << "fma body does not read A from SMEM"; + EXPECT_NE(b_in_body, nullptr) << "fma body does not read B from SMEM"; + + // OLS rewrote all C accesses inside for_kk to point at C_local. The + // device pointer "_C" (i.e. `__daisy_cuda_0_C`) should no longer appear + // anywhere inside the inner k loop. C_local must appear instead. + auto* c_dev_in_body = find_access_by_suffix(*last_child, "_C"); + EXPECT_EQ(c_dev_in_body, nullptr) << "C device pointer still present inside inner k loop after OLS"; + + auto* c_reg_in_body = find_access_by_suffix(*last_child, c_local); + EXPECT_NE(c_reg_in_body, nullptr) << "fma body does not reference C register accumulator"; + + // The C device pointer must still appear OUTSIDE for_kk — once for the + // initial load (C_local = C[..]) and once for the writeback (C[..] = + // C_local). Both live as siblings of for_kk under map_i_body, wrapped in + // IfElse by SyncConditionPropagation. + auto* c_dev_outside = find_access_by_suffix(map_i_body, "_C"); + EXPECT_NE(c_dev_outside, nullptr) << "C device pointer load/store vanished from kernel"; + + // Exactly two cooperative copy Maps must exist in kk_body, possibly wrapped + // in IfElse by SyncConditionPropagation. + int copy_map_count = 0; + for (size_t i = 0; i < kk_body.size(); ++i) { + auto* unwrapped = unwrap_if_else(&kk_body.at(i).first); + if (unwrapped && dynamic_cast(unwrapped)) { + ++copy_map_count; + } + } + EXPECT_EQ(copy_map_count, 2) << "Expected exactly two cooperative copy Maps inside for_kk"; + + // Barrier blocks: at least one per staged buffer pair (before & after copy) + // -> 4 total. Barriers are NOT wrapped (they must execute for all threads). + int barrier_block_count = 0; + for (size_t i = 0; i < kk_body.size(); ++i) { + auto* blk = dynamic_cast(&kk_body.at(i).first); + if (!blk) continue; + for (auto& n : blk->dataflow().nodes()) { + if (auto* lib = dynamic_cast(&n)) { + if (lib->code() == data_flow::LibraryNodeType_BarrierLocal) { + ++barrier_block_count; + break; + } + } + } + } + EXPECT_GE(barrier_block_count, 2) << "Expected at least 2 barrier blocks (one per copy boundary pair)"; +} From 4be7da0a488588defb5429e224cdd3bbe93a289b Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Sun, 14 Jun 2026 17:55:54 +0200 Subject: [PATCH 4/4] allows GPU schedules on strided maps --- opt/include/sdfg/targets/gpu/gpu_map_utils.h | 24 +++++++ opt/src/targets/cuda/cuda_map_dispatcher.cpp | 69 ++++++++++++++++--- opt/src/targets/gpu/gpu_map_utils.cpp | 41 ++++++++--- opt/src/targets/rocm/rocm_map_dispatcher.cpp | 57 ++++++++++++--- .../cuda_parallelize_nested_map.cpp | 14 ++-- .../offloading/offload_transform.cpp | 12 ++-- .../rocm_parallelize_nested_map.cpp | 14 ++-- 7 files changed, 175 insertions(+), 56 deletions(-) diff --git a/opt/include/sdfg/targets/gpu/gpu_map_utils.h b/opt/include/sdfg/targets/gpu/gpu_map_utils.h index 7c62c6d2..85fc0685 100644 --- a/opt/include/sdfg/targets/gpu/gpu_map_utils.h +++ b/opt/include/sdfg/targets/gpu/gpu_map_utils.h @@ -1,5 +1,7 @@ #pragma once +#include + #include "sdfg/analysis/analysis.h" #include "sdfg/structured_control_flow/map.h" #include "sdfg/symbolic/symbolic.h" @@ -75,6 +77,23 @@ symbolic::SymbolSet get_gpu_indvars( structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension ); +/** + * @brief Get all GPU Map nodes in a given dimension (in tree traversal order). + * + * Unlike get_gpu_indvars, this preserves access to each Map's init / stride + * so the codegen can emit `indvar = init + thread_flat_id * stride` for + * arbitrary affine grid loops. + * + * @tparam ScheduleT Schedule type class with value() and dimension() static methods + * @param node The current map node + * @param analysis_manager Analysis manager for loop analysis + * @param dimension GPU dimension (X, Y, or Z) + * @return Vector of Map pointers in the given GPU dimension + */ +template +std::vector +get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension); + // Extern template declarations to prevent implicit instantiation extern template symbolic::Expression find_nested_gpu_blocksize< cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension); @@ -96,5 +115,10 @@ extern template symbolic::SymbolSet get_gpu_indvars< extern template symbolic::SymbolSet get_gpu_indvars< rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension); +extern template std::vector get_gpu_maps< + cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension); +extern template std::vector get_gpu_maps< + rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension); + } // namespace gpu } // namespace sdfg diff --git a/opt/src/targets/cuda/cuda_map_dispatcher.cpp b/opt/src/targets/cuda/cuda_map_dispatcher.cpp index 61ad78d1..02f507cd 100644 --- a/opt/src/targets/cuda/cuda_map_dispatcher.cpp +++ b/opt/src/targets/cuda/cuda_map_dispatcher.cpp @@ -207,8 +207,27 @@ void CUDAMapDispatcher::dispatch_kernel_body( } // Boundary Conditions if (!ScheduleType_CUDA::nested_sync(node_.schedule_type())) { - library_stream << "if (" << indvar->get_name() << " < " << cuda_language_extension.expression(num_iterations) - << ") {" << std::endl; + // Guard on the flat thread id rather than the per-Map indvar so that + // Maps with non-unit stride or non-zero init still get a correct OOB + // check (the per-Map indvar = init + flat_id * stride and is + // only well-defined when flat_id < num_iterations). + std::string flat_id; + switch (ScheduleType_CUDA::dimension(node_.schedule_type())) { + case CUDADimension::X: + flat_id = "__daisy_cuda_indvar_x"; + break; + case CUDADimension::Y: + flat_id = "__daisy_cuda_indvar_y"; + break; + case CUDADimension::Z: + flat_id = "__daisy_cuda_indvar_z"; + break; + default: + flat_id = indvar->get_name(); + break; + } + library_stream << "if (" << flat_id << " < " << cuda_language_extension.expression(num_iterations) << ") {" + << std::endl; library_stream.setIndent(library_stream.indent() + 4); } @@ -317,18 +336,46 @@ void CUDAMapDispatcher::dispatch_kernel_preamble( library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";" << std::endl; - // Declare all other indvars in the kernel - for (auto& var : x_vars) { - library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl; - } + // Declare each per-Map indvar as a strided affine of the flat thread id: + // = + * + // + // This lets the dispatcher consume Maps with arbitrary init / stride + // (e.g. block-tiled outer loops produced by LoopTiling). The bound check + // in dispatch_kernel_body() guards on the flat id against num_iterations, + // so out-of-grid threads are skipped before any body access. + auto x_maps = gpu::get_gpu_maps(node_, analysis_manager, CUDADimension::X); + auto y_maps = gpu::get_gpu_maps(node_, analysis_manager, CUDADimension::Y); + auto z_maps = gpu::get_gpu_maps(node_, analysis_manager, CUDADimension::Z); + + auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) { + symbolic::Expression value = symbolic::symbol(flat_id_var); + auto stride = map->stride(); + if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) { + value = symbolic::mul(value, stride); + } + auto init = map->init(); + if (!symbolic::eq(init, symbolic::zero())) { + value = symbolic::add(init, value); + } + library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value) + << ";" << std::endl; + }; - for (auto& var : y_vars) { - library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl; + for (auto* map : x_maps) { + emit_indvar(map, indvar_x); } - - for (auto& var : z_vars) { - library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl; + for (auto* map : y_maps) { + emit_indvar(map, indvar_y); + } + for (auto* map : z_maps) { + emit_indvar(map, indvar_z); } + // x_vars/y_vars/z_vars params kept for signature compatibility (used by + // callers to filter scope_variables); their iteration here would be + // redundant with the per-Map loops above. + (void) x_vars; + (void) y_vars; + (void) z_vars; } codegen::InstrumentationInfo CUDAMapDispatcher::instrumentation_info() const { diff --git a/opt/src/targets/gpu/gpu_map_utils.cpp b/opt/src/targets/gpu/gpu_map_utils.cpp index 02b2afc9..f3a87879 100644 --- a/opt/src/targets/gpu/gpu_map_utils.cpp +++ b/opt/src/targets/gpu/gpu_map_utils.cpp @@ -88,16 +88,10 @@ symbolic::Expression find_nested_gpu_iterations( continue; } - auto init = map->init(); - if (!symbolic::eq(init, symbolic::zero())) { - throw InvalidSDFGException("Init is not zero"); - } - - auto stride = map->stride(); - if (!symbolic::eq(stride, symbolic::one())) { - throw InvalidSDFGException("Stride is not one"); - } - + // Note: arbitrary `init` and `stride` are permitted here; the + // dispatcher emits `indvar = init + thread_flat_id * stride` so + // the body sees the natural strided value. `num_iterations()` + // already accounts for both. auto num_iterations = map->num_iterations(); if (num_iterations.is_null()) { throw InvalidSDFGException("Cannot determine number of iterations for nested map in GPU kernel"); @@ -144,6 +138,25 @@ symbolic::SymbolSet get_gpu_indvars( return indvars; } +template +std::vector +get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension) { + auto& loop_analysis = analysis_manager.get(); + auto loops = loop_analysis.descendants(&node); + loops.insert(&node); + std::vector maps; + for (const auto& loop : loops) { + if (auto map = dynamic_cast(loop)) { + if (map->schedule_type().value() == ScheduleT::value()) { + if (ScheduleT::dimension(map->schedule_type()) == dimension) { + maps.push_back(map); + } + } + } + } + return maps; +} + // Explicit template instantiations for CUDA template symbolic::Expression find_nested_gpu_blocksize( structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension @@ -160,6 +173,10 @@ template symbolic::SymbolSet get_gpu_indvars( structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension ); +template std::vector get_gpu_maps( + structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension +); + // Explicit template instantiations for ROCM template symbolic::Expression find_nested_gpu_blocksize( structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension @@ -176,5 +193,9 @@ template symbolic::SymbolSet get_gpu_indvars( structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension ); +template std::vector get_gpu_maps( + structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension +); + } // namespace gpu } // namespace sdfg diff --git a/opt/src/targets/rocm/rocm_map_dispatcher.cpp b/opt/src/targets/rocm/rocm_map_dispatcher.cpp index 0c1b4151..08634ac7 100644 --- a/opt/src/targets/rocm/rocm_map_dispatcher.cpp +++ b/opt/src/targets/rocm/rocm_map_dispatcher.cpp @@ -207,8 +207,23 @@ void ROCMMapDispatcher::dispatch_kernel_body( } // Boundary Conditions if (!ScheduleType_ROCM::nested_sync(node_.schedule_type())) { - library_stream << "if (" << indvar->get_name() << " < " << rocm_language_extension.expression(num_iterations) - << ") {" << std::endl; + std::string flat_id; + switch (ScheduleType_ROCM::dimension(node_.schedule_type())) { + case ROCMDimension::X: + flat_id = "__daisy_hip_indvar_x"; + break; + case ROCMDimension::Y: + flat_id = "__daisy_hip_indvar_y"; + break; + case ROCMDimension::Z: + flat_id = "__daisy_hip_indvar_z"; + break; + default: + flat_id = indvar->get_name(); + break; + } + library_stream << "if (" << flat_id << " < " << rocm_language_extension.expression(num_iterations) << ") {" + << std::endl; library_stream.setIndent(library_stream.indent() + 4); } @@ -316,18 +331,38 @@ void ROCMMapDispatcher::dispatch_kernel_preamble( library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";" << std::endl; - // Declare all other indvars in the kernel - for (auto& var : x_vars) { - library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl; - } + // Declare each per-Map indvar as a strided affine of the flat thread id: + // = + * + auto x_maps = gpu::get_gpu_maps(node_, analysis_manager, ROCMDimension::X); + auto y_maps = gpu::get_gpu_maps(node_, analysis_manager, ROCMDimension::Y); + auto z_maps = gpu::get_gpu_maps(node_, analysis_manager, ROCMDimension::Z); + + auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) { + symbolic::Expression value = symbolic::symbol(flat_id_var); + auto stride = map->stride(); + if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) { + value = symbolic::mul(value, stride); + } + auto init = map->init(); + if (!symbolic::eq(init, symbolic::zero())) { + value = symbolic::add(init, value); + } + library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value) + << ";" << std::endl; + }; - for (auto& var : y_vars) { - library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl; + for (auto* map : x_maps) { + emit_indvar(map, indvar_x); } - - for (auto& var : z_vars) { - library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl; + for (auto* map : y_maps) { + emit_indvar(map, indvar_y); + } + for (auto* map : z_maps) { + emit_indvar(map, indvar_z); } + (void) x_vars; + (void) y_vars; + (void) z_vars; } codegen::InstrumentationInfo ROCMMapDispatcher::instrumentation_info() const { diff --git a/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp b/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp index 54f1ae6c..24586897 100644 --- a/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp +++ b/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp @@ -53,16 +53,10 @@ bool CUDAParallelizeNestedMap:: return false; } - // Condition: Check if current loop starts from 0 - if (!symbolic::eq(loop_.init(), symbolic::zero())) { - return false; - } - - // Condition: Loop has a stride of 1 - auto stride = loop_.stride(); - if (!symbolic::eq(stride, symbolic::one())) { - return false; - } + // Note: arbitrary `init` and `stride` are permitted. The CUDA dispatcher + // emits ` = init + thread_flat_id * stride`, so the body sees + // the natural strided value; `num_iterations()` accounts for both when + // computing the grid geometry. // Condition: Resulting CUDA grid dimension must not exceed hardware limits. // Y and Z grid dimensions are limited to 65535. diff --git a/opt/src/transformations/offloading/offload_transform.cpp b/opt/src/transformations/offloading/offload_transform.cpp index bc47d294..6130aafd 100644 --- a/opt/src/transformations/offloading/offload_transform.cpp +++ b/opt/src/transformations/offloading/offload_transform.cpp @@ -75,10 +75,14 @@ bool OffloadTransform::can_be_applied(builder::StructuredSDFGBuilder& builder, a } } - // Criterion: Map must start at 0 - if (!symbolic::eq(this->map_.init(), symbolic::zero())) { - if (report_) report_->transform_impossible(this, "non zero start"); - DEBUG_PRINTLN("Cannot apply transform: map does not start at zero"); + // Note: arbitrary `init` and `stride` are permitted on the kernel-boundary + // Map. The CUDA/ROCm dispatchers emit + // ` = init + thread_flat_id * stride`, + // and `num_iterations()` already accounts for both when computing the grid + // geometry. + if (map_.num_iterations().is_null()) { + if (report_) report_->transform_impossible(this, "cannot determine num iterations"); + DEBUG_PRINTLN("Cannot apply transform: cannot determine number of iterations for map"); return false; } diff --git a/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp b/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp index 1b25864b..0bfd55b9 100644 --- a/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp +++ b/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp @@ -53,16 +53,10 @@ bool ROCMParallelizeNestedMap:: return false; } - // Condition: Check if current loop starts from 0 - if (!symbolic::eq(loop_.init(), symbolic::zero())) { - return false; - } - - // Condition: Loop has a stride of 1 - auto stride = loop_.stride(); - if (!symbolic::eq(stride, symbolic::one())) { - return false; - } + // Note: arbitrary `init` and `stride` are permitted. The ROCm dispatcher + // emits ` = init + thread_flat_id * stride`, so the body sees + // the natural strided value; `num_iterations()` accounts for both when + // computing the grid geometry. // Condition: Resulting ROCm grid dimension must not exceed hardware limits. // Y and Z grid dimensions are limited to 65535.