From c8747af8c1e90c534afae4a08ba5cb71feae1602 Mon Sep 17 00:00:00 2001
From: Lukas Truemper <lukas.truemper@outlook.de>
Date: Sun, 14 Jun 2026 14:32:59 +0200
Subject: [PATCH 1/4] correctly identify cooperative dimensions in ILS/OLS

---
 opt/src/transformations/in_local_storage.cpp  | 222 ++++++++++------
 opt/src/transformations/out_local_storage.cpp | 245 +++++++++++-------
 .../transformations/in_local_storage_test.cpp |   5 +-
 .../out_local_storage_test.cpp                |  12 +-
 4 files changed, 312 insertions(+), 172 deletions(-)
diff --git a/opt/src/transformations/in_local_storage.cpp b/opt/src/transformations/in_local_storage.cpp
index 8411a344..7110d2cf 100644
--- a/opt/src/transformations/in_local_storage.cpp
+++ b/opt/src/transformations/in_local_storage.cpp
@@ -1,5 +1,6 @@
 #include "sdfg/transformations/in_local_storage.h"
 
+#include <algorithm>
 #include <cstddef>
 #include <functional>
 #include <string>
@@ -218,30 +219,114 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An
     // Create local buffer name
     local_name_ = builder.find_new_name("__daisy_in_local_storage_" + this->container_);
 
-    // Collect varying dimensions (extent > 1) and compute buffer layout
+    // Collect varying dimensions (extent > 1) and their sizes
     std::vector<size_t> varying_dims;
-    std::vector<symbolic::Expression> dim_sizes;
+    std::vector<symbolic::Expression> varying_dim_sizes;
     for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
         auto& dim_size = tile_info_.dimensions.at(d);
         if (!symbolic::eq(dim_size, symbolic::integer(1))) {
             varying_dims.push_back(d);
-            dim_sizes.push_back(dim_size);
+            varying_dim_sizes.push_back(dim_size);
         }
     }
 
-    // Compute total buffer size
-    symbolic::Expression total_size = symbolic::integer(1);
-    for (auto& ds : dim_sizes) {
-        total_size = symbolic::mul(total_size, ds);
+    // GPU classification: each ancestor GPU Map is either
+    //  - per-thread (its Map indvar appears in tile.bases — each thread sees a
+    //    distinct slice along that dim, so the shared buffer gets its own
+    //    per-thread slot indexed by the within-block thread_idx), or
+    //  - cooperative (Map indvar not in bases — all threads along that dim
+    //    cooperatively load the same shared tile, strided by thread_idx).
+    struct GpuDim {
+        gpu::GPUDimension dim;
+        symbolic::Symbol map_indvar; // global thread index (== thread_idx + blockIdx * blockDim)
+        symbolic::Symbol thread_idx; // within-block thread index (NV_Symbol)
+        symbolic::Integer block_size;
+        bool is_per_thread;
+    };
+    std::vector<GpuDim> per_thread_dims; // populated only on GPU path
+    std::vector<GpuDim> coop_dims; // populated only on GPU path
+    bool is_rocm = false;
+
+    if (storage_type_.is_nv_shared()) {
+        auto ancestors = ControlFlowNode::parent_chain(loop_);
+        for (auto* node : ancestors) {
+            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
+            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
+            if (m->schedule_type().value() == "ROCM") {
+                is_rocm = true;
+                break;
+            }
+        }
+        const std::string prefix = is_rocm ? "__daisy_hip_thread_idx_" : "__daisy_cuda_thread_idx_";
+        auto suffix = [](gpu::GPUDimension d) -> std::string {
+            switch (d) {
+                case gpu::GPUDimension::X:
+                    return "x";
+                case gpu::GPUDimension::Y:
+                    return "y";
+                case gpu::GPUDimension::Z:
+                    return "z";
+            }
+            return "?";
+        };
+        for (auto* node : ancestors) {
+            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
+            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
+            GpuDim gd;
+            gd.dim = gpu::gpu_dimension(m->schedule_type());
+            gd.map_indvar = m->indvar();
+            gd.thread_idx = symbolic::symbol(prefix + suffix(gd.dim));
+            gd.block_size = gpu::gpu_block_size(m->schedule_type());
+            gd.is_per_thread = false;
+            for (auto& base : tile_info_.bases) {
+                if (symbolic::uses(base, m->indvar())) {
+                    gd.is_per_thread = true;
+                    break;
+                }
+            }
+            (gd.is_per_thread ? per_thread_dims : coop_dims).push_back(gd);
+        }
+        auto by_dim = [](const GpuDim& a, const GpuDim& b) {
+            return static_cast<int>(a.dim) < static_cast<int>(b.dim);
+        };
+        std::sort(per_thread_dims.begin(), per_thread_dims.end(), by_dim);
+        std::sort(coop_dims.begin(), coop_dims.end(), by_dim);
+
+        // Ensure within-block thread_idx containers exist. Codegen recognises
+        // NV_Symbol-typed scalars and substitutes them with threadIdx.{x,y,z}
+        // (CUDA) or the ROCm equivalent at emission time.
+        auto ensure_idx = [&](const symbolic::Symbol& sym) {
+            if (!sdfg.exists(sym->get_name())) {
+                types::Scalar idx_type(types::PrimitiveType::Int32);
+                idx_type.storage_type(types::StorageType::NV_Symbol());
+                builder.add_container(sym->get_name(), idx_type);
+            }
+        };
+        for (auto& gd : per_thread_dims) ensure_idx(gd.thread_idx);
+        for (auto& gd : coop_dims) ensure_idx(gd.thread_idx);
     }
 
-    // Helper: build linearized local index from per-dimension symbolic expressions
+    // Buffer dim sizes: [per-thread block sizes (X, Y, Z canonical order)] ++
+    //                   [varying tile dim sizes (original access-dim order)]
+    std::vector<symbolic::Expression> buf_dim_sizes;
+    for (auto& gd : per_thread_dims) buf_dim_sizes.push_back(gd.block_size);
+    for (auto& s : varying_dim_sizes) buf_dim_sizes.push_back(s);
+
+    // Total buffer size (number of scalar slots)
+    symbolic::Expression total_size = symbolic::integer(1);
+    for (auto& s : buf_dim_sizes) total_size = symbolic::mul(total_size, s);
+
+    // Per-thread index prefix (each thread's fixed buffer coords)
+    std::vector<symbolic::Expression> per_thread_indices;
+    for (auto& gd : per_thread_dims) per_thread_indices.push_back(gd.thread_idx);
+
+    // Row-major linearization over buf_dim_sizes (leftmost dim = outermost stride)
     auto linearize_exprs = [&](const std::vector<symbolic::Expression>& indices) -> symbolic::Expression {
         symbolic::Expression linear_idx = symbolic::integer(0);
         symbolic::Expression stride = symbolic::integer(1);
-        for (int i = indices.size() - 1; i >= 0; i--) {
+        for (int i = static_cast<int>(indices.size()) - 1; i >= 0; i--) {
             linear_idx = symbolic::add(linear_idx, symbolic::mul(indices[i], stride));
-            stride = symbolic::mul(stride, dim_sizes[i]);
+            stride = symbolic::mul(stride, buf_dim_sizes[i]);
         }
         return linear_idx;
     };
@@ -278,114 +363,93 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An
         // ============================================================
         // GPU COOPERATIVE PATH
         // ============================================================
-        auto ancestors = ControlFlowNode::parent_chain(loop_);
+        // Each thread owns a fixed slot along per-thread buffer dims and
+        // strides through the varying-flat range with the other threads
+        // sharing that slot (i.e. threads in cooperative dims only).
 
-        // Collect cooperative GPU dimensions (indvar not in tile bases)
-        struct CoopDim {
-            symbolic::Symbol indvar;
-            symbolic::Integer block_size;
-            gpu::GPUDimension dimension;
-        };
-        std::vector<CoopDim> coop_dims;
+        // Total cooperative-thread count (= 1 if no cooperative dims)
+        symbolic::Expression total_coop_threads = symbolic::integer(1);
+        for (auto& cd : coop_dims) {
+            total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size);
+        }
 
-        for (auto* node : ancestors) {
-            if (auto* ancestor_map = dynamic_cast<structured_control_flow::Map*>(node)) {
-                if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) {
-                    continue;
-                }
-                bool appears_in_bases = false;
-                for (auto& base : tile_info_.bases) {
-                    if (symbolic::uses(base, ancestor_map->indvar())) {
-                        appears_in_bases = true;
-                        break;
-                    }
-                }
-                if (!appears_in_bases) {
-                    coop_dims.push_back(
-                        {ancestor_map->indvar(),
-                         gpu::gpu_block_size(ancestor_map->schedule_type()),
-                         gpu::gpu_dimension(ancestor_map->schedule_type())}
-                    );
-                }
+        // Flat within-block index over cooperative dims only (= 0 if none).
+        // Row-major: X is least-significant when present.
+        symbolic::Expression coop_flat = symbolic::integer(0);
+        {
+            symbolic::Expression stride = symbolic::integer(1);
+            for (auto it = coop_dims.rbegin(); it != coop_dims.rend(); ++it) {
+                coop_flat = symbolic::add(coop_flat, symbolic::mul(it->thread_idx, stride));
+                stride = symbolic::mul(stride, it->block_size);
             }
         }
 
-        // Compute total cooperative thread count
-        symbolic::Expression total_coop_threads = symbolic::integer(1);
-        for (auto& cd : coop_dims) {
-            total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size);
+        // Varying-flat size = product of tile dim extents (excluding extent==1).
+        // This is the address range each thread cooperatively walks within its
+        // per-thread slot.
+        symbolic::Expression varying_flat_size = symbolic::integer(1);
+        for (auto& s : varying_dim_sizes) {
+            varying_flat_size = symbolic::mul(varying_flat_size, s);
         }
 
         // Create the local buffer with NV_Shared storage
         types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size);
         builder.add_container(local_name_, buffer_type);
 
-        // Emit: barrier → guarded cooperative copy → barrier → loop
+        // Emit: barrier → cooperative copy loop → barrier → main loop
         // 1. Barrier before copy
         auto& barrier_block1 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info());
         builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block1, {});
 
-        // 2. Cooperative copy with if_else guard
-        // Flatten cooperative thread index: coop_flat = sum(indvar[i] * product(block_size[j] for j>i))
-        symbolic::Expression coop_flat = symbolic::integer(0);
-        symbolic::Expression coop_stride = symbolic::integer(1);
-        for (int i = coop_dims.size() - 1; i >= 0; i--) {
-            coop_flat = symbolic::add(coop_flat, symbolic::mul(coop_dims[i].indvar, coop_stride));
-            coop_stride = symbolic::mul(coop_stride, coop_dims[i].block_size);
-        }
-
-        // Each thread loads elements strided by total_coop_threads
-        // Thread t loads elements: t, t + total_threads, t + 2*total_threads, ...
-        // We emit a loop: for (idx = coop_flat; idx < total_size; idx += total_coop_threads)
+        // 2. Cooperative copy: for (idx = coop_flat; idx < varying_flat_size; idx += total_coop_threads)
         auto idx_name = builder.find_new_name("__daisy_ils_coop_" + this->container_);
         types::Scalar idx_type(types::PrimitiveType::UInt64);
         builder.add_container(idx_name, idx_type);
         auto idx_var = symbolic::symbol(idx_name);
 
-        auto copy_init = coop_flat;
-        auto copy_condition = symbolic::Lt(idx_var, total_size);
-        auto copy_update = symbolic::add(idx_var, total_coop_threads);
-
         auto& copy_loop = builder.add_map_before(
             *parent,
             loop_,
             idx_var,
-            copy_condition,
-            copy_init,
-            copy_update,
+            symbolic::Lt(idx_var, varying_flat_size),
+            coop_flat,
+            symbolic::add(idx_var, total_coop_threads),
             structured_control_flow::ScheduleType_Sequential::create(),
             {},
             loop_.debug_info()
         );
 
-        // Decompose flat idx back into per-dimension indices for source subset
-        // idx maps to varying_dims in row-major order
         auto& copy_scope = copy_loop.root();
         auto& copy_block = builder.add_block(copy_scope);
         auto& copy_src = builder.add_access(copy_block, this->container_);
         auto& copy_dst = builder.add_access(copy_block, local_name_);
         auto& copy_tasklet = builder.add_tasklet(copy_block, data_flow::TaskletCode::assign, "_out", {"_in"});
 
-        // Decompose idx_var into per-dim indices
-        std::vector<symbolic::Expression> copy_indices;
+        // Decompose idx_var into per-varying-dim indices (row-major).
+        // For a single varying dim this is just idx_var.
+        std::vector<symbolic::Expression> varying_decomp;
         symbolic::Expression remainder = idx_var;
-        for (size_t i = 0; i < dim_sizes.size(); i++) {
-            if (i < dim_sizes.size() - 1) {
-                // integer division: idx / (product of remaining dims)
+        for (size_t i = 0; i < varying_dim_sizes.size(); i++) {
+            if (i + 1 < varying_dim_sizes.size()) {
                 symbolic::Expression divisor = symbolic::integer(1);
-                for (size_t j = i + 1; j < dim_sizes.size(); j++) {
-                    divisor = symbolic::mul(divisor, dim_sizes[j]);
+                for (size_t j = i + 1; j < varying_dim_sizes.size(); j++) {
+                    divisor = symbolic::mul(divisor, varying_dim_sizes[j]);
                 }
-                auto quotient = symbolic::div(remainder, divisor);
-                copy_indices.push_back(quotient);
+                varying_decomp.push_back(symbolic::div(remainder, divisor));
                 remainder = symbolic::mod(remainder, divisor);
             } else {
-                copy_indices.push_back(remainder);
+                varying_decomp.push_back(remainder);
             }
         }
 
-        auto copy_src_subset = build_original_subset(copy_indices);
-        data_flow::Subset copy_dst_subset = {idx_var};
+        // Source = original container at (bases — which already use the global
+        // Map indvars — plus the varying decomposition along each varying dim).
+        auto copy_src_subset = build_original_subset(varying_decomp);
+
+        // Destination = buffer at (per_thread_indices ++ varying_decomp) linearized.
+        std::vector<symbolic::Expression> dest_indices = per_thread_indices;
+        for (auto& v : varying_decomp) dest_indices.push_back(v);
+        data_flow::Subset copy_dst_subset = {linearize_exprs(dest_indices)};
 
         builder.add_computational_memlet(copy_block, copy_src, copy_tasklet, "_in", copy_src_subset, pointer_type);
         builder.add_computational_memlet(copy_block, copy_tasklet, "_out", copy_dst, copy_dst_subset, buffer_type);
@@ -413,7 +477,7 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An
             copy_indvars.push_back(indvar);
 
             auto init = symbolic::integer(0);
-            auto condition = symbolic::Lt(indvar, dim_sizes[i]);
+            auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]);
             auto update = symbolic::add(indvar, symbolic::integer(1));
 
             auto& copy_loop = builder.add_map(
@@ -481,7 +545,8 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An
                     }
                     auto* acc = mla.access(memlet);
                     if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
-                        std::vector<symbolic::Expression> local_indices;
+                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
+                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
                         for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
                             if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
                                 local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
@@ -498,7 +563,8 @@ void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::An
                     }
                     auto* acc = mla.access(memlet);
                     if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
-                        std::vector<symbolic::Expression> local_indices;
+                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
+                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
                         for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
                             if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
                                 local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
diff --git a/opt/src/transformations/out_local_storage.cpp b/opt/src/transformations/out_local_storage.cpp
index ff16ab11..4e6c598f 100644
--- a/opt/src/transformations/out_local_storage.cpp
+++ b/opt/src/transformations/out_local_storage.cpp
@@ -1,5 +1,6 @@
 #include "sdfg/transformations/out_local_storage.h"
 
+#include <algorithm>
 #include <cstddef>
 #include <functional>
 #include <string>
@@ -206,39 +207,123 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
     local_name_ = builder.find_new_name("__daisy_out_local_storage_" + this->container_);
 
 
-    // Collect varying dimensions (extent > 1) and compute buffer layout.
+    // Collect varying dimensions (extent > 1) and their sizes.
     // Extent-1 dimensions are degenerate (no loop is needed) and must be
     // skipped when sizing the buffer, when creating copy indvars, and when
     // linearizing into the local buffer.  The bookkeeping must match what
     // `build_original_subset` expects (it indexes copy_indices by varying
     // dimension only).
     std::vector<size_t> varying_dims;
-    std::vector<symbolic::Expression> dim_sizes;
+    std::vector<symbolic::Expression> varying_dim_sizes;
     for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
         auto& dim_size = tile_info_.dimensions.at(d);
         if (!symbolic::eq(dim_size, symbolic::integer(1))) {
             varying_dims.push_back(d);
-            dim_sizes.push_back(dim_size);
+            varying_dim_sizes.push_back(dim_size);
         }
     }
 
-    // Compute total buffer size
-    symbolic::Expression total_size = symbolic::integer(1);
-    for (auto& ds : dim_sizes) {
-        total_size = symbolic::mul(total_size, ds);
+    // GPU classification: each ancestor GPU Map is either
+    //  - per-thread (its Map indvar appears in tile.bases — each thread sees a
+    //    distinct slice along that dim, so the shared buffer gets its own
+    //    per-thread slot indexed by the within-block thread_idx), or
+    //  - cooperative (Map indvar not in bases — all threads along that dim
+    //    cooperatively load/store the same shared tile, strided by thread_idx).
+    struct GpuDim {
+        gpu::GPUDimension dim;
+        symbolic::Symbol map_indvar; // global thread index (== thread_idx + blockIdx * blockDim)
+        symbolic::Symbol thread_idx; // within-block thread index (NV_Symbol)
+        symbolic::Integer block_size;
+        bool is_per_thread;
+    };
+    std::vector<GpuDim> per_thread_dims; // populated only on GPU path
+    std::vector<GpuDim> coop_dims; // populated only on GPU path
+    bool is_rocm = false;
+
+    if (storage_type_.is_nv_shared()) {
+        auto ancestors = ControlFlowNode::parent_chain(loop_);
+        for (auto* node : ancestors) {
+            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
+            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
+            if (m->schedule_type().value() == "ROCM") {
+                is_rocm = true;
+                break;
+            }
+        }
+        const std::string prefix = is_rocm ? "__daisy_hip_thread_idx_" : "__daisy_cuda_thread_idx_";
+        auto suffix = [](gpu::GPUDimension d) -> std::string {
+            switch (d) {
+                case gpu::GPUDimension::X:
+                    return "x";
+                case gpu::GPUDimension::Y:
+                    return "y";
+                case gpu::GPUDimension::Z:
+                    return "z";
+            }
+            return "?";
+        };
+        for (auto* node : ancestors) {
+            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
+            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
+            GpuDim gd;
+            gd.dim = gpu::gpu_dimension(m->schedule_type());
+            gd.map_indvar = m->indvar();
+            gd.thread_idx = symbolic::symbol(prefix + suffix(gd.dim));
+            gd.block_size = gpu::gpu_block_size(m->schedule_type());
+            gd.is_per_thread = false;
+            for (auto& base : tile_info_.bases) {
+                if (symbolic::uses(base, m->indvar())) {
+                    gd.is_per_thread = true;
+                    break;
+                }
+            }
+            (gd.is_per_thread ? per_thread_dims : coop_dims).push_back(gd);
+        }
+        auto by_dim = [](const GpuDim& a, const GpuDim& b) {
+            return static_cast<int>(a.dim) < static_cast<int>(b.dim);
+        };
+        std::sort(per_thread_dims.begin(), per_thread_dims.end(), by_dim);
+        std::sort(coop_dims.begin(), coop_dims.end(), by_dim);
+
+        // Ensure within-block thread_idx containers exist. Codegen recognises
+        // NV_Symbol-typed scalars and substitutes them with threadIdx.{x,y,z}
+        // (CUDA) or the ROCm equivalent at emission time.
+        auto ensure_idx = [&](const symbolic::Symbol& sym) {
+            if (!sdfg.exists(sym->get_name())) {
+                types::Scalar idx_type(types::PrimitiveType::Int32);
+                idx_type.storage_type(types::StorageType::NV_Symbol());
+                builder.add_container(sym->get_name(), idx_type);
+            }
+        };
+        for (auto& gd : per_thread_dims) ensure_idx(gd.thread_idx);
+        for (auto& gd : coop_dims) ensure_idx(gd.thread_idx);
     }
 
+    // Buffer dim sizes: [per-thread block sizes (X, Y, Z canonical order)] ++
+    //                   [varying tile dim sizes (original access-dim order)]
+    std::vector<symbolic::Expression> buf_dim_sizes;
+    for (auto& gd : per_thread_dims) buf_dim_sizes.push_back(gd.block_size);
+    for (auto& s : varying_dim_sizes) buf_dim_sizes.push_back(s);
+
+    // Total buffer size (number of scalar slots)
+    symbolic::Expression total_size = symbolic::integer(1);
+    for (auto& s : buf_dim_sizes) total_size = symbolic::mul(total_size, s);
+
+    // Per-thread index prefix (each thread's fixed buffer coords)
+    std::vector<symbolic::Expression> per_thread_indices;
+    for (auto& gd : per_thread_dims) per_thread_indices.push_back(gd.thread_idx);
+
     // Create the local buffer with specified storage type
     types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size);
     builder.add_container(local_name_, buffer_type);
 
-    // Helper: build linearized local index from per-dimension expressions
+    // Row-major linearization over buf_dim_sizes (leftmost dim = outermost stride)
     auto linearize_exprs = [&](const std::vector<symbolic::Expression>& indices) -> symbolic::Expression {
         symbolic::Expression linear_idx = symbolic::integer(0);
         symbolic::Expression stride = symbolic::integer(1);
-        for (int i = indices.size() - 1; i >= 0; i--) {
+        for (int i = static_cast<int>(indices.size()) - 1; i >= 0; i--) {
             linear_idx = symbolic::add(linear_idx, symbolic::mul(indices[i], stride));
-            stride = symbolic::mul(stride, dim_sizes[i]);
+            stride = symbolic::mul(stride, buf_dim_sizes[i]);
         }
         return linear_idx;
     };
@@ -272,52 +357,60 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
         // ============================================================
         // GPU COOPERATIVE PATH
         // ============================================================
-        auto ancestors = ControlFlowNode::parent_chain(loop_);
+        // Each thread owns a fixed slot along per-thread buffer dims and
+        // strides through the varying-flat range with the other threads
+        // sharing that slot (i.e. threads in cooperative dims only).
 
-        // Collect cooperative GPU dimensions
-        struct CoopDim {
-            symbolic::Symbol indvar;
-            symbolic::Integer block_size;
-            gpu::GPUDimension dimension;
-        };
-        std::vector<CoopDim> coop_dims;
-
-        for (auto* node : ancestors) {
-            if (auto* ancestor_map = dynamic_cast<structured_control_flow::Map*>(node)) {
-                if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) {
-                    continue;
-                }
-                bool appears_in_bases = false;
-                for (auto& base : tile_info_.bases) {
-                    if (symbolic::uses(base, ancestor_map->indvar())) {
-                        appears_in_bases = true;
-                        break;
-                    }
-                }
-                if (!appears_in_bases) {
-                    coop_dims.push_back(
-                        {ancestor_map->indvar(),
-                         gpu::gpu_block_size(ancestor_map->schedule_type()),
-                         gpu::gpu_dimension(ancestor_map->schedule_type())}
-                    );
-                }
-            }
-        }
-
-        // Compute total cooperative thread count
+        // Total cooperative-thread count (= 1 if no cooperative dims)
         symbolic::Expression total_coop_threads = symbolic::integer(1);
         for (auto& cd : coop_dims) {
             total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size);
         }
 
-        // Flatten cooperative thread index
+        // Flat within-block index over cooperative dims only (= 0 if none).
+        // Row-major: X is least-significant when present.
         symbolic::Expression coop_flat = symbolic::integer(0);
-        symbolic::Expression coop_stride = symbolic::integer(1);
-        for (int i = coop_dims.size() - 1; i >= 0; i--) {
-            coop_flat = symbolic::add(coop_flat, symbolic::mul(coop_dims[i].indvar, coop_stride));
-            coop_stride = symbolic::mul(coop_stride, coop_dims[i].block_size);
+        {
+            symbolic::Expression stride = symbolic::integer(1);
+            for (auto it = coop_dims.rbegin(); it != coop_dims.rend(); ++it) {
+                coop_flat = symbolic::add(coop_flat, symbolic::mul(it->thread_idx, stride));
+                stride = symbolic::mul(stride, it->block_size);
+            }
         }
 
+        // Varying-flat size = product of tile dim extents (excluding extent==1).
+        // This is the address range each thread cooperatively walks within its
+        // per-thread slot.
+        symbolic::Expression varying_flat_size = symbolic::integer(1);
+        for (auto& s : varying_dim_sizes) {
+            varying_flat_size = symbolic::mul(varying_flat_size, s);
+        }
+
+        // Helper to decompose a flat varying index into per-varying-dim indices
+        // (row-major), and to build the buffer dest subset (per_thread ++ varying).
+        auto decompose = [&](const symbolic::Symbol& idx_var) {
+            std::vector<symbolic::Expression> result;
+            symbolic::Expression remainder = idx_var;
+            for (size_t i = 0; i < varying_dim_sizes.size(); i++) {
+                if (i + 1 < varying_dim_sizes.size()) {
+                    symbolic::Expression divisor = symbolic::integer(1);
+                    for (size_t j = i + 1; j < varying_dim_sizes.size(); j++) {
+                        divisor = symbolic::mul(divisor, varying_dim_sizes[j]);
+                    }
+                    result.push_back(symbolic::div(remainder, divisor));
+                    remainder = symbolic::mod(remainder, divisor);
+                } else {
+                    result.push_back(remainder);
+                }
+            }
+            return result;
+        };
+        auto buf_subset_for = [&](const std::vector<symbolic::Expression>& varying_decomp) -> data_flow::Subset {
+            std::vector<symbolic::Expression> dest_indices = per_thread_indices;
+            for (auto& v : varying_decomp) dest_indices.push_back(v);
+            return {linearize_exprs(dest_indices)};
+        };
+
         // INIT: barrier → cooperative copy-in → barrier (if has_read)
         if (tile_info_.has_read) {
             // Barrier before init
@@ -334,7 +427,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
                 *parent,
                 loop_,
                 idx_var,
-                symbolic::Lt(idx_var, total_size),
+                symbolic::Lt(idx_var, varying_flat_size),
                 coop_flat,
                 symbolic::add(idx_var, total_coop_threads),
                 structured_control_flow::ScheduleType_Sequential::create(),
@@ -347,25 +440,11 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
             auto& init_dst = builder.add_access(init_block, local_name_);
             auto& init_tasklet = builder.add_tasklet(init_block, data_flow::TaskletCode::assign, "_out", {"_in"});
 
-            // Decompose idx_var into per-dim indices over varying dims only
-            std::vector<symbolic::Expression> init_indices;
-            symbolic::Expression remainder = idx_var;
-            for (size_t i = 0; i < dim_sizes.size(); i++) {
-                if (i < dim_sizes.size() - 1) {
-                    symbolic::Expression divisor = symbolic::integer(1);
-                    for (size_t j = i + 1; j < dim_sizes.size(); j++) {
-                        divisor = symbolic::mul(divisor, dim_sizes[j]);
-                    }
-                    init_indices.push_back(symbolic::div(remainder, divisor));
-                    remainder = symbolic::mod(remainder, divisor);
-                } else {
-                    init_indices.push_back(remainder);
-                }
-            }
-
-            auto init_src_subset = build_original_subset(init_indices);
+            auto init_decomp = decompose(idx_var);
+            auto init_src_subset = build_original_subset(init_decomp);
+            auto init_dst_subset = buf_subset_for(init_decomp);
             builder.add_computational_memlet(init_block, init_src, init_tasklet, "_in", init_src_subset, pointer_type);
-            builder.add_computational_memlet(init_block, init_tasklet, "_out", init_dst, {idx_var}, buffer_type);
+            builder.add_computational_memlet(init_block, init_tasklet, "_out", init_dst, init_dst_subset, buffer_type);
 
             // Barrier after init
             auto& barrier_block2 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info());
@@ -388,7 +467,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
                 *parent,
                 loop_,
                 idx_var,
-                symbolic::Lt(idx_var, total_size),
+                symbolic::Lt(idx_var, varying_flat_size),
                 coop_flat,
                 symbolic::add(idx_var, total_coop_threads),
                 structured_control_flow::ScheduleType_Sequential::create(),
@@ -401,24 +480,10 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
             auto& wb_dst = builder.add_access(wb_block, this->container_);
             auto& wb_tasklet = builder.add_tasklet(wb_block, data_flow::TaskletCode::assign, "_out", {"_in"});
 
-            // Decompose idx_var into per-dim indices over varying dims only
-            std::vector<symbolic::Expression> wb_indices;
-            symbolic::Expression remainder = idx_var;
-            for (size_t i = 0; i < dim_sizes.size(); i++) {
-                if (i < dim_sizes.size() - 1) {
-                    symbolic::Expression divisor = symbolic::integer(1);
-                    for (size_t j = i + 1; j < dim_sizes.size(); j++) {
-                        divisor = symbolic::mul(divisor, dim_sizes[j]);
-                    }
-                    wb_indices.push_back(symbolic::div(remainder, divisor));
-                    remainder = symbolic::mod(remainder, divisor);
-                } else {
-                    wb_indices.push_back(remainder);
-                }
-            }
-
-            auto wb_dst_subset = build_original_subset(wb_indices);
-            builder.add_computational_memlet(wb_block, wb_src, wb_tasklet, "_in", {idx_var}, buffer_type);
+            auto wb_decomp = decompose(idx_var);
+            auto wb_src_subset = buf_subset_for(wb_decomp);
+            auto wb_dst_subset = build_original_subset(wb_decomp);
+            builder.add_computational_memlet(wb_block, wb_src, wb_tasklet, "_in", wb_src_subset, buffer_type);
             builder.add_computational_memlet(wb_block, wb_tasklet, "_out", wb_dst, wb_dst_subset, pointer_type);
 
             // Barrier after writeback
@@ -443,7 +508,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
                 init_indvars.push_back(indvar);
 
                 auto init = symbolic::integer(0);
-                auto condition = symbolic::Lt(indvar, dim_sizes[i]);
+                auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]);
                 auto update = symbolic::add(indvar, symbolic::integer(1));
 
                 auto& init_loop = builder.add_map(
@@ -488,7 +553,7 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
                 wb_indvars.push_back(indvar);
 
                 auto init = symbolic::integer(0);
-                auto condition = symbolic::Lt(indvar, dim_sizes[i]);
+                auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]);
                 auto update = symbolic::add(indvar, symbolic::integer(1));
 
                 auto& wb_loop = builder.add_map(
@@ -556,7 +621,8 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
                     }
                     auto* acc = mla.access(memlet);
                     if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
-                        std::vector<symbolic::Expression> local_indices;
+                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
+                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
                         for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
                             if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
                                 local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
@@ -580,7 +646,8 @@ void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::A
                     }
                     auto* acc = mla.access(memlet);
                     if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
-                        std::vector<symbolic::Expression> local_indices;
+                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
+                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
                         for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
                             if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
                                 local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
diff --git a/opt/tests/transformations/in_local_storage_test.cpp b/opt/tests/transformations/in_local_storage_test.cpp
index f0eecd5b..a46e0dd6 100644
--- a/opt/tests/transformations/in_local_storage_test.cpp
+++ b/opt/tests/transformations/in_local_storage_test.cpp
@@ -2289,8 +2289,9 @@ TEST(InLocalStorageTest, GPU_Cooperative_SymbolicBounds) {
     EXPECT_EQ(buf_type.storage_type(), types::StorageType::NV_Shared());
 
     auto& arr_type = static_cast<const types::Array&>(buf_type);
-    // Extent M resolved to 8 from GPU Y-dim block_size
-    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(8)));
+    // Per-thread X dim contributes BX=32 slots; varying dim (extent M→8) contributes 8.
+    // Total = 32 * 8 = 256.
+    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(256)));
 
     // Verify structure: [barrier, copy_loop, barrier, main_loop]
     auto& map_y_body = map_y.root();
diff --git a/opt/tests/transformations/out_local_storage_test.cpp b/opt/tests/transformations/out_local_storage_test.cpp
index 9a5a2c2f..44d00513 100644
--- a/opt/tests/transformations/out_local_storage_test.cpp
+++ b/opt/tests/transformations/out_local_storage_test.cpp
@@ -2476,7 +2476,9 @@ TEST(OutLocalStorageTest, GPU_Cooperative_FlatPointer) {
     auto& buf_type = builder_opt.subject().type("__daisy_out_local_storage_C0");
     EXPECT_EQ(buf_type.storage_type(), types::StorageType::NV_Shared());
     auto& arr_type = static_cast<const types::Array&>(buf_type);
-    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(8)));
+    // Per-thread Y dim (j in C base) contributes BY=8 slots; varying dim (M→8) contributes 8.
+    // Total = 8 * 8 = 64.
+    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(64)));
 
     // Verify structure: write-only → [main_loop, barrier, writeback_loop, barrier]
     auto& map_y_body = map_y.root();
@@ -2595,7 +2597,9 @@ TEST(OutLocalStorageTest, GPU_Cooperative_ReadWrite) {
     EXPECT_EQ(buf_type.type_id(), types::TypeID::Array);
 
     auto& arr_type = static_cast<const types::Array&>(buf_type);
-    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(32)));
+    // Per-thread Y dim (j in C base) contributes BY=8 slots; varying dim (N→32) contributes 32.
+    // Total = 8 * 32 = 256.
+    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(256)));
 
     // Verify structure: has_read → [barrier, init_copy, barrier, main_loop, barrier, writeback, barrier]
     auto& map_y_body = map_y.root();
@@ -2880,6 +2884,8 @@ TEST(OutLocalStorageTest, GPU_Cooperative_SymbolicBounds) {
     EXPECT_TRUE(builder_opt.subject().exists("__daisy_out_local_storage_C0"));
     auto& buf_type = builder_opt.subject().type("__daisy_out_local_storage_C0");
     auto& arr_type = static_cast<const types::Array&>(buf_type);
-    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(8)));
+    // Per-thread X dim (i in C base) contributes BX=32 slots; varying dim (M→8) contributes 8.
+    // Total = 32 * 8 = 256.
+    EXPECT_TRUE(symbolic::eq(arr_type.num_elements(), symbolic::integer(256)));
     EXPECT_EQ(buf_type.storage_type(), types::StorageType::NV_Shared());
 }

From 154d1c60991d116ba319d4dc29ba0a903282cd2e Mon Sep 17 00:00:00 2001
From: Lukas Truemper <lukas.truemper@outlook.de>
Date: Sun, 14 Jun 2026 14:47:16 +0200
Subject: [PATCH 2/4] moves optimizations tests

---
 opt/tests/CMakeLists.txt                                      | 4 ++--
 .../{transformations => }/optimizations/blocking_test.cpp     | 0
 .../optimizations/diamond_tiling_test.cpp                     | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename opt/tests/{transformations => }/optimizations/blocking_test.cpp (100%)
 rename opt/tests/{transformations => }/optimizations/diamond_tiling_test.cpp (100%)

diff --git a/opt/tests/CMakeLists.txt b/opt/tests/CMakeLists.txt
index 907a2775..4f5e2ad2 100644
--- a/opt/tests/CMakeLists.txt
+++ b/opt/tests/CMakeLists.txt
@@ -31,6 +31,8 @@ set(TEST_FILES
     transformations/offloading/rocblas_data_transfer_extraction_test.cpp
     transformations/offloading/cuda_stdlib_data_transfer_extraction_test.cpp
     transformations/offloading/rocm_stdlib_data_transfer_extraction_test.cpp
+    optimizations/diamond_tiling_test.cpp
+    optimizations/blocking_test.cpp
     passes/offloading/cuda_library_node_transfer_extraction_pass_test.cpp
     passes/offloading/cuda_library_node_expand_tests.cpp
     passes/offloading/rocm_library_node_transfer_extraction_pass_test.cpp
@@ -64,8 +66,6 @@ set(TEST_FILES
     transformations/in_local_storage_test.cpp
     transformations/recorder_test.cpp
     transformations/vectorize_transform_test.cpp
-    transformations/optimizations/diamond_tiling_test.cpp
-    transformations/optimizations/blocking_test.cpp
     transformations/transformation_serialization_test.cpp
     test.cpp
 )
diff --git a/opt/tests/transformations/optimizations/blocking_test.cpp b/opt/tests/optimizations/blocking_test.cpp
similarity index 100%
rename from opt/tests/transformations/optimizations/blocking_test.cpp
rename to opt/tests/optimizations/blocking_test.cpp
diff --git a/opt/tests/transformations/optimizations/diamond_tiling_test.cpp b/opt/tests/optimizations/diamond_tiling_test.cpp
similarity index 100%
rename from opt/tests/transformations/optimizations/diamond_tiling_test.cpp
rename to opt/tests/optimizations/diamond_tiling_test.cpp

From 4aa1204ca2443f7c9469745805a1d5ba16e401a0 Mon Sep 17 00:00:00 2001
From: Lukas Truemper <lukas.truemper@outlook.de>
Date: Sun, 14 Jun 2026 15:31:22 +0200
Subject: [PATCH 3/4] deprecates compound GPU transformations in favor of more
 searchable simple transformations

---
 .../sdfg/passes/offloading/gpu_tiling_pass.h  |  16 +-
 .../transformations/offloading/gpu_tiling.h   |  22 +-
 .../offloading/kernel_local_storage.h         |  16 +-
 opt/tests/CMakeLists.txt                      |   1 +
 opt/tests/optimizations/gpu_kernels_test.cpp  | 603 ++++++++++++++++++
 5 files changed, 654 insertions(+), 4 deletions(-)
 create mode 100644 opt/tests/optimizations/gpu_kernels_test.cpp

diff --git a/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h b/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h
index aa8e374a..cc56e586 100644
--- a/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h
+++ b/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h
@@ -8,14 +8,26 @@ namespace sdfg {
 namespace passes {
 
 /**
- * @brief Phased GPU tiling pass.
+ * @brief [DEPRECATED] Phased GPU tiling pass.
+ *
+ * Drives the legacy `transformations::GPUTiling` +
+ * `transformations::KernelLocalStorage` pair. Both are deprecated. New code
+ * should compose `LoopTiling`, `CUDAParallelizeNestedMap` / `cuda::CUDATransform`,
+ * `InLocalStorage` / `OutLocalStorage` (with `NV_Shared`), and
+ * `passes::SyncConditionPropagation` directly — see
+ * `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example.
+ *
+ * Retained for the existing CUDA and ROCm schedulers that have not yet been
+ * migrated.
  *
  * Given a set of outer maps, finds all descendant structured loops and applies
  * GPU tiling in two phases:
  * 1. can_be_applied phase: collects all loops where tiling is applicable
  * 2. apply phase: applies tiling to all collected loops
  */
-class GPUTilingPass : public Pass {
+class [[deprecated(
+    "Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp."
+)]] GPUTilingPass : public Pass {
 private:
     const std::vector<structured_control_flow::Map*>& maps_;
     size_t tile_size_;
diff --git a/opt/include/sdfg/transformations/offloading/gpu_tiling.h b/opt/include/sdfg/transformations/offloading/gpu_tiling.h
index 0a8089ea..daabec2b 100644
--- a/opt/include/sdfg/transformations/offloading/gpu_tiling.h
+++ b/opt/include/sdfg/transformations/offloading/gpu_tiling.h
@@ -6,7 +6,27 @@
 namespace sdfg {
 namespace transformations {
 
-class GPUTiling : public Transformation {
+/**
+ * @brief [DEPRECATED] Monolithic GPU tiling transformation.
+ *
+ * Prefer the composable pipeline instead:
+ *   1. transformations::LoopTiling             — strip-mine the target loop
+ *   2. transformations::CUDAParallelizeNestedMap / cuda::CUDATransform
+ *                                              — assign GPU schedules
+ *   3. transformations::InLocalStorage  (NV_Shared) — stage read tiles
+ *      transformations::OutLocalStorage (NV_Shared) — stage write tiles
+ *   4. passes::SyncConditionPropagation         — guard out-of-bounds threads
+ *
+ * See `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked
+ * example (GEMM). KernelLocalStorage and GPUTilingPass are deprecated for
+ * the same reason.
+ *
+ * The legacy transformation is retained for autotuning search spaces and
+ * existing schedulers that have not yet been migrated.
+ */
+class [[deprecated(
+    "Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp."
+)]] GPUTiling : public Transformation {
     structured_control_flow::StructuredLoop& loop_;
     size_t size_;
     bool applied_ = false;
diff --git a/opt/include/sdfg/transformations/offloading/kernel_local_storage.h b/opt/include/sdfg/transformations/offloading/kernel_local_storage.h
index a009f3a7..8faa8db9 100644
--- a/opt/include/sdfg/transformations/offloading/kernel_local_storage.h
+++ b/opt/include/sdfg/transformations/offloading/kernel_local_storage.h
@@ -9,7 +9,21 @@
 namespace sdfg {
 namespace transformations {
 
-class KernelLocalStorage : public Transformation {
+/**
+ * @brief [DEPRECATED] Monolithic shared-memory staging transformation.
+ *
+ * Prefer `transformations::InLocalStorage` (for read tiles) and
+ * `transformations::OutLocalStorage` (for write tiles) with the
+ * `types::StorageType::NV_Shared()` storage. Those transformations are
+ * composable with `LoopTiling`, `CUDATransform` / `CUDAParallelizeNestedMap`,
+ * and `passes::SyncConditionPropagation`. See
+ * `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example.
+ *
+ * The legacy transformation is retained for autotuning search spaces and
+ * existing schedulers that have not yet been migrated.
+ */
+class [[deprecated("Use InLocalStorage / OutLocalStorage with NV_Shared. See gpu_kernels_test.cpp."
+)]] KernelLocalStorage : public Transformation {
 private:
     structured_control_flow::StructuredLoop& loop_;
     symbolic::Expression offset_;
diff --git a/opt/tests/CMakeLists.txt b/opt/tests/CMakeLists.txt
index 4f5e2ad2..35c7f0da 100644
--- a/opt/tests/CMakeLists.txt
+++ b/opt/tests/CMakeLists.txt
@@ -33,6 +33,7 @@ set(TEST_FILES
     transformations/offloading/rocm_stdlib_data_transfer_extraction_test.cpp
     optimizations/diamond_tiling_test.cpp
     optimizations/blocking_test.cpp
+    optimizations/gpu_kernels_test.cpp
     passes/offloading/cuda_library_node_transfer_extraction_pass_test.cpp
     passes/offloading/cuda_library_node_expand_tests.cpp
     passes/offloading/rocm_library_node_transfer_extraction_pass_test.cpp
diff --git a/opt/tests/optimizations/gpu_kernels_test.cpp b/opt/tests/optimizations/gpu_kernels_test.cpp
new file mode 100644
index 00000000..7f081aef
--- /dev/null
+++ b/opt/tests/optimizations/gpu_kernels_test.cpp
@@ -0,0 +1,603 @@
+// =============================================================================
+// GPU Kernel Tests
+// =============================================================================
+//
+// The pipeline demonstrated here is the canonical way to bring a sequential,
+// flat-pointer matmul kernel onto the GPU:
+//
+//   sequential Map(i) Map(j) For(k)               // host-side scalar kernel
+//     |
+//     |  (1) LoopInterchange(i, j)                // coalesce X-dim along j
+//     v
+//   Map(j) Map(i) For(k)
+//     |
+//     |  (2) cuda::CUDATransform(map_j, BX=32)    // offload to CUDA, X-dim
+//     |  (3) CUDAParallelizeNestedMap(map_i, BY)  // nested Y-dim
+//     v
+//   Map_X(j, BX=32) Map_Y(i, BY=8) For(k)        // kernel skeleton
+//     |
+//     |  (4) LoopTiling(for_k, TK=8)              // strip-mine k
+//     v
+//   Map_X Map_Y For(kk, step=8) For(k_in)
+//     |
+//     |  (5) InLocalStorage(for_k_inner, A, NV_Shared) // stage A tile in SMEM
+//     |  (6) InLocalStorage(for_k_inner, B, NV_Shared) // stage B tile in SMEM
+//     |  (7) OutLocalStorage(for_kk, C)                 // promote C to register
+//     v
+//   Map_X Map_Y { C_reg = C[..],
+//                 for(kk) {
+//                   barriers + coop copy_A,
+//                   barriers + coop copy_B,
+//                   for(k_in) [fma reads A_local, B_local; accumulates C_reg]
+//                 },
+//                 C[..] = C_reg }
+//
+// =============================================================================
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "sdfg/analysis/analysis.h"
+#include "sdfg/analysis/users.h"
+#include "sdfg/builder/structured_sdfg_builder.h"
+#include "sdfg/data_flow/access_node.h"
+#include "sdfg/data_flow/library_node.h"
+#include "sdfg/data_flow/library_nodes/barrier_local_node.h"
+#include "sdfg/data_flow/tasklet.h"
+#include "sdfg/passes/offloading/sync_condition_propagation.h"
+#include "sdfg/passes/structured_control_flow/dead_cfg_elimination.h"
+#include "sdfg/passes/structured_control_flow/sequence_fusion.h"
+#include "sdfg/structured_control_flow/block.h"
+#include "sdfg/structured_control_flow/for.h"
+#include "sdfg/structured_control_flow/if_else.h"
+#include "sdfg/structured_control_flow/map.h"
+#include "sdfg/structured_control_flow/sequence.h"
+#include "sdfg/structured_control_flow/structured_loop.h"
+#include "sdfg/structured_control_flow/while.h"
+#include "sdfg/symbolic/symbolic.h"
+#include "sdfg/targets/cuda/cuda.h"
+#include "sdfg/targets/gpu/gpu_schedule_type.h"
+#include "sdfg/transformations/in_local_storage.h"
+#include "sdfg/transformations/loop_interchange.h"
+#include "sdfg/transformations/loop_tiling.h"
+#include "sdfg/transformations/offloading/cuda_parallelize_nested_map.h"
+#include "sdfg/transformations/offloading/cuda_transform.h"
+#include "sdfg/transformations/out_local_storage.h"
+#include "sdfg/types/array.h"
+#include "sdfg/types/pointer.h"
+#include "sdfg/types/scalar.h"
+
+using namespace sdfg;
+
+namespace {
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+void cleanup(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& am) {
+    passes::SequenceFusion sequence_fusion;
+    passes::DeadCFGElimination dead_cfg;
+    bool applies;
+    do {
+        applies = false;
+        applies |= dead_cfg.run(builder, am);
+        applies |= sequence_fusion.run(builder, am);
+    } while (applies);
+}
+
+// Find an access node for any container whose name matches `predicate`,
+// reachable from `node` (recursively). Used to look up access nodes after
+// CUDATransform renames pointer arguments (e.g. A -> __daisy_cuda_<id>_A).
+data_flow::AccessNode* find_access_by_suffix(structured_control_flow::ControlFlowNode& node, const std::string& suffix) {
+    // Walk the node recursively. We only need a single match - first wins.
+    std::function<data_flow::AccessNode*(structured_control_flow::ControlFlowNode&)> walk =
+        [&](structured_control_flow::ControlFlowNode& n) -> data_flow::AccessNode* {
+        if (auto* block = dynamic_cast<structured_control_flow::Block*>(&n)) {
+            for (auto& dn : block->dataflow().nodes()) {
+                if (auto* an = dynamic_cast<data_flow::AccessNode*>(&dn)) {
+                    const auto& data = an->data();
+                    if (data.size() >= suffix.size() &&
+                        data.compare(data.size() - suffix.size(), suffix.size(), suffix) == 0) {
+                        return an;
+                    }
+                }
+            }
+        } else if (auto* seq = dynamic_cast<structured_control_flow::Sequence*>(&n)) {
+            for (size_t i = 0; i < seq->size(); ++i) {
+                if (auto* hit = walk(seq->at(i).first)) {
+                    return hit;
+                }
+            }
+        } else if (auto* ifelse = dynamic_cast<structured_control_flow::IfElse*>(&n)) {
+            for (size_t i = 0; i < ifelse->size(); ++i) {
+                if (auto* hit = walk(ifelse->at(i).first)) {
+                    return hit;
+                }
+            }
+        } else if (auto* loop_node = dynamic_cast<structured_control_flow::StructuredLoop*>(&n)) {
+            if (auto* hit = walk(loop_node->root())) {
+                return hit;
+            }
+        }
+        return nullptr;
+    };
+    // For a StructuredLoop, walk its body; for everything else, walk the node
+    // itself (Sequence/Block/IfElse search recursively).
+    if (auto* loop = dynamic_cast<structured_control_flow::StructuredLoop*>(&node)) {
+        return walk(loop->root());
+    }
+    return walk(node);
+}
+
+// Find the unique container whose name starts with `prefix`.
+std::string find_container_by_prefix(const sdfg::Function& fn, const std::string& prefix) {
+    for (auto& name : fn.containers()) {
+        if (name.size() >= prefix.size() && name.compare(0, prefix.size(), prefix) == 0) {
+            return name;
+        }
+    }
+    return {};
+}
+
+// Find the unique container name ending in `suffix` (e.g. "_A0" for the SMEM
+// buffer that ILS created for the A pointer argument).
+std::string find_container_by_suffix(const sdfg::Function& fn, const std::string& suffix) {
+    for (auto& name : fn.containers()) {
+        if (name.size() >= suffix.size() && name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0) {
+            return name;
+        }
+    }
+    return {};
+}
+
+// Find the first For loop under `seq`.
+structured_control_flow::For* find_first_for(structured_control_flow::Sequence& seq) {
+    for (size_t i = 0; i < seq.size(); ++i) {
+        if (auto* f = dynamic_cast<structured_control_flow::For*>(&seq.at(i).first)) {
+            return f;
+        }
+    }
+    return nullptr;
+}
+
+// Find the first Map under `seq`.
+structured_control_flow::Map* find_first_map(structured_control_flow::Sequence& seq) {
+    for (size_t i = 0; i < seq.size(); ++i) {
+        if (auto* m = dynamic_cast<structured_control_flow::Map*>(&seq.at(i).first)) {
+            return m;
+        }
+    }
+    return nullptr;
+}
+
+// ---------------------------------------------------------------------------
+// Matmul fixture: flat-pointer, linearized accesses, fixed bounds
+// ---------------------------------------------------------------------------
+//
+// Builds:
+//   for (i = 0; i < M; ++i)
+//     for (j = 0; j < N; ++j)
+//       for (k = 0; k < K; ++k)
+//         C[i*N+j] = A[i*K+k] * B[k*N+j] + C[i*N+j]   // fma
+//
+// Dimensions are integer constants so the argument-size analysis used by
+// CUDATransform can succeed without `allow_dynamic_sizes=true`.
+
+struct MatmulFixture {
+    static constexpr long M = 128;
+    static constexpr long N = 128;
+    static constexpr long K = 64;
+
+    std::unique_ptr<builder::StructuredSDFGBuilder> builder;
+    structured_control_flow::Map* for_i = nullptr;
+    structured_control_flow::Map* for_j = nullptr;
+    structured_control_flow::For* for_k = nullptr;
+
+    void build() {
+        builder = std::make_unique<builder::StructuredSDFGBuilder>("gpu_kernel_gemm", FunctionType_CPU);
+
+        types::Scalar idx_desc(types::PrimitiveType::Int64);
+        types::Scalar elem_desc(types::PrimitiveType::Double);
+        types::Pointer ptr_desc(elem_desc);
+
+        builder->add_container("i", idx_desc);
+        builder->add_container("j", idx_desc);
+        builder->add_container("k", idx_desc);
+        builder->add_container("A", ptr_desc, /*is_argument=*/true);
+        builder->add_container("B", ptr_desc, /*is_argument=*/true);
+        builder->add_container("C", ptr_desc, /*is_argument=*/true);
+
+        auto i = symbolic::symbol("i");
+        auto j = symbolic::symbol("j");
+        auto k = symbolic::symbol("k");
+        auto M_e = symbolic::integer(M);
+        auto N_e = symbolic::integer(N);
+        auto K_e = symbolic::integer(K);
+
+        auto& root = builder->subject().root();
+
+        for_i = &builder->add_map(
+            root,
+            i,
+            symbolic::Lt(i, M_e),
+            symbolic::integer(0),
+            symbolic::add(i, symbolic::integer(1)),
+            structured_control_flow::ScheduleType_Sequential::create()
+        );
+        for_j = &builder->add_map(
+            for_i->root(),
+            j,
+            symbolic::Lt(j, N_e),
+            symbolic::integer(0),
+            symbolic::add(j, symbolic::integer(1)),
+            structured_control_flow::ScheduleType_Sequential::create()
+        );
+        for_k = &builder->add_for(
+            for_j->root(), k, symbolic::Lt(k, K_e), symbolic::integer(0), symbolic::add(k, symbolic::integer(1))
+        );
+
+        // fma: C[i*N+j] = A[i*K+k] * B[k*N+j] + C[i*N+j]
+        auto& block = builder->add_block(for_k->root());
+        auto& a_in = builder->add_access(block, "A");
+        auto& b_in = builder->add_access(block, "B");
+        auto& c_in = builder->add_access(block, "C");
+        auto& c_out = builder->add_access(block, "C");
+
+        auto& tasklet = builder->add_tasklet(block, data_flow::TaskletCode::fp_fma, "_out", {"_in1", "_in2", "_in3"});
+
+        builder
+            ->add_computational_memlet(block, a_in, tasklet, "_in1", {symbolic::add(symbolic::mul(i, K_e), k)}, ptr_desc);
+        builder
+            ->add_computational_memlet(block, b_in, tasklet, "_in2", {symbolic::add(symbolic::mul(k, N_e), j)}, ptr_desc);
+        builder
+            ->add_computational_memlet(block, c_in, tasklet, "_in3", {symbolic::add(symbolic::mul(i, N_e), j)}, ptr_desc);
+        builder
+            ->add_computational_memlet(block, tasklet, "_out", c_out, {symbolic::add(symbolic::mul(i, N_e), j)}, ptr_desc);
+    }
+};
+
+} // namespace
+
+// =============================================================================
+// Test
+// =============================================================================
+
+TEST(GPUKernelTest, GEMM_CudaTilingILS) {
+    constexpr int BX = 32; // X-dim block size (warp width, coalesced along j)
+    constexpr int BY = 8; // Y-dim block size
+    constexpr int TK = 8; // K-tile size
+
+    MatmulFixture fix;
+    fix.build();
+    auto& builder = *fix.builder;
+    analysis::AnalysisManager am(builder.subject());
+
+    // -------------------------------------------------------------------------
+    // (1) Loop interchange: i <-> j so j becomes outermost.
+    //     This makes the j-dim the X-dim (coalesced along the fast axis of
+    //     row-major C/B) once we offload.
+    // -------------------------------------------------------------------------
+    {
+        transformations::LoopInterchange interchange(*fix.for_i, *fix.for_j);
+        ASSERT_TRUE(interchange.can_be_applied(builder, am)) << "LoopInterchange(i, j) must apply";
+        interchange.apply(builder, am);
+        am.invalidate_all();
+    }
+
+    // After interchange the new outer loop body is the (former) i loop, and
+    // its body still contains the unchanged k loop. Re-find references.
+    auto* for_j_outer = find_first_map(builder.subject().root());
+    ASSERT_NE(for_j_outer, nullptr);
+    EXPECT_EQ(for_j_outer->indvar()->get_name(), "j");
+
+    auto* for_i_inner = find_first_map(for_j_outer->root());
+    ASSERT_NE(for_i_inner, nullptr);
+    EXPECT_EQ(for_i_inner->indvar()->get_name(), "i");
+
+    auto* for_k_inner = find_first_for(for_i_inner->root());
+    ASSERT_NE(for_k_inner, nullptr);
+    EXPECT_EQ(for_k_inner->indvar()->get_name(), "k");
+
+    // -------------------------------------------------------------------------
+    // (2) CUDATransform on Map(j): X-dim CUDA kernel, BX=32.
+    //     Also inserts H2D/D2H blocks around the kernel and renames pointer
+    //     arguments A, B, C -> __daisy_cuda_<id>_{A,B,C}.
+    // -------------------------------------------------------------------------
+    {
+        cuda::CUDATransform offload(*for_j_outer, /*block_size=*/BX);
+        ASSERT_TRUE(offload.can_be_applied(builder, am)) << "CUDATransform on Map(j) must apply";
+        offload.apply(builder, am);
+        am.invalidate_all();
+    }
+
+    // for_j_outer is unchanged in identity but the schedule type was updated.
+    EXPECT_EQ(for_j_outer->schedule_type().value(), cuda::ScheduleType_CUDA::value());
+    EXPECT_EQ(cuda::ScheduleType_CUDA::dimension(for_j_outer->schedule_type()), cuda::CUDADimension::X);
+    EXPECT_TRUE(symbolic::eq(cuda::ScheduleType_CUDA::block_size(for_j_outer->schedule_type()), symbolic::integer(BX)));
+
+    // -------------------------------------------------------------------------
+    // (3) Promote Map(i) to the Y dimension (BY=8).
+    // -------------------------------------------------------------------------
+    {
+        transformations::CUDAParallelizeNestedMap parallelize(*for_i_inner, /*block_size=*/BY);
+        ASSERT_TRUE(parallelize.can_be_applied(builder, am)) << "CUDAParallelizeNestedMap on Map(i) must apply";
+        parallelize.apply(builder, am);
+        am.invalidate_all();
+    }
+
+    EXPECT_EQ(for_i_inner->schedule_type().value(), cuda::ScheduleType_CUDA::value());
+    EXPECT_EQ(cuda::ScheduleType_CUDA::dimension(for_i_inner->schedule_type()), cuda::CUDADimension::Y);
+    EXPECT_TRUE(symbolic::eq(cuda::ScheduleType_CUDA::block_size(for_i_inner->schedule_type()), symbolic::integer(BY)));
+
+    // -------------------------------------------------------------------------
+    // (4) Strip-mine the k loop: For(k) -> For(k_tile, step=TK) For(k).
+    // -------------------------------------------------------------------------
+    auto* for_k_in_kernel = find_first_for(for_i_inner->root());
+    ASSERT_NE(for_k_in_kernel, nullptr);
+    EXPECT_EQ(for_k_in_kernel->indvar()->get_name(), "k");
+
+    structured_control_flow::StructuredLoop* for_kk = nullptr;
+    {
+        transformations::LoopTiling tile(*for_k_in_kernel, /*tile_size=*/TK);
+        ASSERT_TRUE(tile.can_be_applied(builder, am)) << "LoopTiling on for_k must apply";
+        tile.apply(builder, am);
+        for_kk = tile.outer_loop();
+        ASSERT_NE(for_kk, nullptr);
+        am.invalidate_all();
+    }
+
+    // for_kk should now be the only direct loop child of for_i_inner->root().
+    // LoopTiling uses find_new_name for the outer indvar, so the actual name
+    // includes a disambiguating suffix ("k_tile0").
+    {
+        const std::string indvar = for_kk->indvar()->get_name();
+        EXPECT_EQ(indvar.compare(0, std::strlen("k_tile"), "k_tile"), 0) << "Unexpected k_tile indvar: " << indvar;
+    }
+
+    // -------------------------------------------------------------------------
+    // (5) Stage A tile in SMEM. ILS is applied to the *inner* k-loop (post
+    //     tiling), so the cooperative load lands BETWEEN for_kk and the inner
+    //     for_k. Each for_kk iteration loads exactly TK elements per i thread.
+    //     After CUDATransform, A is renamed - look it up by suffix.
+    // -------------------------------------------------------------------------
+    auto* for_k_inner_tiled = find_first_for(for_kk->root());
+    ASSERT_NE(for_k_inner_tiled, nullptr);
+    EXPECT_EQ(for_k_inner_tiled->indvar()->get_name(), "k");
+
+    {
+        auto* a_access = find_access_by_suffix(*for_k_inner_tiled, "_A");
+        ASSERT_NE(a_access, nullptr) << "Renamed A access not found under for_k_inner_tiled";
+
+        transformations::InLocalStorage ils_a(*for_k_inner_tiled, *a_access, types::StorageType::NV_Shared());
+        ASSERT_TRUE(ils_a.can_be_applied(builder, am)) << "ILS on A must apply";
+        ils_a.apply(builder, am);
+        am.invalidate_all();
+    }
+
+    // -------------------------------------------------------------------------
+    // (6) Stage B tile in SMEM.
+    // -------------------------------------------------------------------------
+    {
+        auto* b_access = find_access_by_suffix(*for_k_inner_tiled, "_B");
+        ASSERT_NE(b_access, nullptr) << "Renamed B access not found under for_k_inner_tiled";
+
+        transformations::InLocalStorage ils_b(*for_k_inner_tiled, *b_access, types::StorageType::NV_Shared());
+        ASSERT_TRUE(ils_b.can_be_applied(builder, am)) << "ILS on B must apply";
+        ils_b.apply(builder, am);
+        am.invalidate_all();
+    }
+
+    // -------------------------------------------------------------------------
+    // (7) Promote C to a per-thread register accumulator.
+    //
+    //     OLS applied at the for_kk scope hoists the C load OUT of for_kk and
+    //     the writeback AFTER for_kk. Each thread sees C[i*N + j] (a single
+    //     scalar slot per thread), so the tile collapses to a 1-element local
+    //     and the entire k-traversal accumulates in a register.
+    //
+    //     This cuts GMEM traffic on C from 2*M*N*(K/TK) accesses to just 2*M*N
+    //     (one load + one store per output element), and is the prerequisite
+    //     for any further inner-loop optimization (unrolling, vectorization,
+    //     tensor-core MMA), since the inner k loop is now a pure FMA chain
+    //     with no DRAM round-trip on the accumulator.
+    //
+    //     Default storage = CPU_Stack: on GPU codegen this lowers to a
+    //     per-thread local (register or stack slot, at the compiler's
+    //     discretion).
+    // -------------------------------------------------------------------------
+    {
+        auto* c_access = find_access_by_suffix(*for_kk, "_C");
+        ASSERT_NE(c_access, nullptr) << "Renamed C access not found under for_kk";
+
+        transformations::OutLocalStorage ols_c(*for_kk, *c_access);
+        ASSERT_TRUE(ols_c.can_be_applied(builder, am)) << "OLS on C must apply";
+        ols_c.apply(builder, am);
+        am.invalidate_all();
+    }
+
+    // -------------------------------------------------------------------------
+    // (8) Propagate the grid-condition through the kernel so that out-of-bounds
+    //     threads still hit every barrier but skip non-barrier work. For
+    //     perfectly divisible bounds (M%BY == 0, N%BX == 0) this is a no-op
+    //     w.r.t. semantics, but it sets the `nested_sync` schedule property
+    //     and is part of the canonical pipeline.
+    // -------------------------------------------------------------------------
+    {
+        passes::SyncConditionPropagation sync_prop;
+        sync_prop.run_pass(builder, am);
+        am.invalidate_all();
+    }
+
+    cleanup(builder, am);
+
+    // -------------------------------------------------------------------------
+    // Verification: SMEM buffers exist with correct sizes.
+    //   A_local: per-thread on Y (BY) x varying TK = 8 * 8 = 64 elements
+    //   B_local: per-thread on X (BX) x varying TK = 32 * 8 = 256 elements
+    //
+    // ILS names its buffer `__daisy_in_local_storage_<container>` (where
+    // <container> is the renamed device arg, e.g. `__daisy_cuda_0_A`). The
+    // cooperative copy indvar uses a similar suffix (`__daisy_ils_coop_...`),
+    // so we look up by the buffer prefix to disambiguate.
+    // -------------------------------------------------------------------------
+    auto a_local = find_container_by_prefix(builder.subject(), "__daisy_in_local_storage_");
+    ASSERT_FALSE(a_local.empty()) << "First SMEM container not created";
+    std::string b_local;
+    for (auto& name : builder.subject().containers()) {
+        if (name == a_local) continue;
+        if (name.compare(0, std::strlen("__daisy_in_local_storage_"), "__daisy_in_local_storage_") == 0) {
+            b_local = name;
+            break;
+        }
+    }
+    ASSERT_FALSE(b_local.empty()) << "Second SMEM container not created";
+
+    // Pin A vs B by the container suffix (which carries the original arg name).
+    if (a_local.find("_B") != std::string::npos) {
+        std::swap(a_local, b_local);
+    }
+    EXPECT_NE(a_local.find("_A"), std::string::npos) << "A buffer name: " << a_local;
+    EXPECT_NE(b_local.find("_B"), std::string::npos) << "B buffer name: " << b_local;
+
+    const auto& a_type = builder.subject().type(a_local);
+    const auto& b_type = builder.subject().type(b_local);
+
+    EXPECT_EQ(a_type.storage_type(), types::StorageType::NV_Shared());
+    EXPECT_EQ(b_type.storage_type(), types::StorageType::NV_Shared());
+
+    const auto* a_arr = dynamic_cast<const types::Array*>(&a_type);
+    const auto* b_arr = dynamic_cast<const types::Array*>(&b_type);
+
+    ASSERT_NE(a_arr, nullptr) << "A_local is not an Array";
+    ASSERT_NE(b_arr, nullptr) << "B_local is not an Array";
+
+    EXPECT_TRUE(symbolic::eq(a_arr->num_elements(), symbolic::integer(BY * TK)))
+        << "A_local size mismatch: got " << a_arr->num_elements()->__str__() << ", expected " << (BY * TK);
+    EXPECT_TRUE(symbolic::eq(b_arr->num_elements(), symbolic::integer(BX * TK)))
+        << "B_local size mismatch: got " << b_arr->num_elements()->__str__() << ", expected " << (BX * TK);
+
+    // -------------------------------------------------------------------------
+    // Verification: C register accumulator exists as a 1-element per-thread
+    // local (CPU_Stack on GPU = register/local slot). OLS names its buffer
+    // `__daisy_out_local_storage_<container>`.
+    // -------------------------------------------------------------------------
+    auto c_local = find_container_by_prefix(builder.subject(), "__daisy_out_local_storage_");
+    ASSERT_FALSE(c_local.empty()) << "C register-accumulator container not created";
+    EXPECT_NE(c_local.find("_C"), std::string::npos) << "OLS buffer name: " << c_local;
+
+    const auto& c_type = builder.subject().type(c_local);
+    EXPECT_EQ(c_type.storage_type(), types::StorageType::CPU_Stack())
+        << "C accumulator should be a per-thread local (CPU_Stack), got " << c_type.storage_type().value();
+
+    const auto* c_arr = dynamic_cast<const types::Array*>(&c_type);
+    ASSERT_NE(c_arr, nullptr) << "C_local is not an Array";
+    EXPECT_TRUE(symbolic::eq(c_arr->num_elements(), symbolic::integer(1)))
+        << "C_local should be a single-element register accumulator, got " << c_arr->num_elements()->__str__();
+
+    // -------------------------------------------------------------------------
+    // Loop structure: map_j(X, BX) > map_i(Y, BY) > for_kk > [prologue
+    // blocks/maps..., for_k_inner > fma_block]. ILS was applied to the inner
+    // for_k, so the cooperative copy + barriers live INSIDE for_kk's body,
+    // ahead of the inner k loop. map_i_body should still hold just for_kk as
+    // its only meaningful child.
+    // -------------------------------------------------------------------------
+    auto& map_i_body = for_i_inner->root();
+
+    // Find the (now tiled) outer kk loop under map_i_body. After ILS+cleanup,
+    // it should be the only For directly under map_i_body.
+    structured_control_flow::For* kk_loop = nullptr;
+    for (size_t i = 0; i < map_i_body.size(); ++i) {
+        if (auto* f = dynamic_cast<structured_control_flow::For*>(&map_i_body.at(i).first)) {
+            const std::string indvar = f->indvar()->get_name();
+            if (indvar.compare(0, std::strlen("k_tile"), "k_tile") == 0) {
+                kk_loop = f;
+                break;
+            }
+        }
+    }
+    ASSERT_NE(kk_loop, nullptr) << "k_tile loop not found under map_i_body";
+
+    auto& kk_body = kk_loop->root();
+    EXPECT_GE(kk_body.size(), 4u) << "for_kk body should hold prologue + inner k loop";
+
+    // SyncConditionPropagation wraps each non-barrier child of a GPU Map body
+    // in an `if (map.condition()) { ... }` guard. The pass runs once per GPU
+    // ancestor (Map(j) X, then Map(i) Y), so each non-barrier child of for_kk
+    // ends up wrapped in TWO nested single-case IfElse nodes. Unwrap to a
+    // fixpoint.
+    auto unwrap_if_else = [](structured_control_flow::ControlFlowNode* node
+                          ) -> structured_control_flow::ControlFlowNode* {
+        while (auto* ie = dynamic_cast<structured_control_flow::IfElse*>(node)) {
+            if (ie->size() != 1) return nullptr;
+            auto& inner = ie->at(0).first;
+            if (inner.size() != 1) return nullptr;
+            node = &inner.at(0).first;
+        }
+        return node;
+    };
+
+    // Last child of kk_body must be (a wrapper around) the inner k loop.
+    auto* last_raw = &kk_body.at(kk_body.size() - 1).first;
+    auto* last_unwrapped = unwrap_if_else(last_raw);
+    ASSERT_NE(last_unwrapped, nullptr) << "last kk_body child does not unwrap to a single inner node";
+
+    auto* last_child = dynamic_cast<structured_control_flow::For*>(last_unwrapped);
+    ASSERT_NE(last_child, nullptr) << "last unwrapped child is not a For";
+    EXPECT_EQ(last_child->indvar()->get_name(), "k");
+
+    // The fma body references the SMEM buffers (no longer the renamed device
+    // pointers for A/B) and the C register accumulator (no longer the C device
+    // pointer).
+    auto* a_in_body = find_access_by_suffix(*last_child, a_local);
+    auto* b_in_body = find_access_by_suffix(*last_child, b_local);
+    EXPECT_NE(a_in_body, nullptr) << "fma body does not read A from SMEM";
+    EXPECT_NE(b_in_body, nullptr) << "fma body does not read B from SMEM";
+
+    // OLS rewrote all C accesses inside for_kk to point at C_local. The
+    // device pointer "_C" (i.e. `__daisy_cuda_0_C`) should no longer appear
+    // anywhere inside the inner k loop. C_local must appear instead.
+    auto* c_dev_in_body = find_access_by_suffix(*last_child, "_C");
+    EXPECT_EQ(c_dev_in_body, nullptr) << "C device pointer still present inside inner k loop after OLS";
+
+    auto* c_reg_in_body = find_access_by_suffix(*last_child, c_local);
+    EXPECT_NE(c_reg_in_body, nullptr) << "fma body does not reference C register accumulator";
+
+    // The C device pointer must still appear OUTSIDE for_kk — once for the
+    // initial load (C_local = C[..]) and once for the writeback (C[..] =
+    // C_local). Both live as siblings of for_kk under map_i_body, wrapped in
+    // IfElse by SyncConditionPropagation.
+    auto* c_dev_outside = find_access_by_suffix(map_i_body, "_C");
+    EXPECT_NE(c_dev_outside, nullptr) << "C device pointer load/store vanished from kernel";
+
+    // Exactly two cooperative copy Maps must exist in kk_body, possibly wrapped
+    // in IfElse by SyncConditionPropagation.
+    int copy_map_count = 0;
+    for (size_t i = 0; i < kk_body.size(); ++i) {
+        auto* unwrapped = unwrap_if_else(&kk_body.at(i).first);
+        if (unwrapped && dynamic_cast<structured_control_flow::Map*>(unwrapped)) {
+            ++copy_map_count;
+        }
+    }
+    EXPECT_EQ(copy_map_count, 2) << "Expected exactly two cooperative copy Maps inside for_kk";
+
+    // Barrier blocks: at least one per staged buffer pair (before & after copy)
+    // -> 4 total. Barriers are NOT wrapped (they must execute for all threads).
+    int barrier_block_count = 0;
+    for (size_t i = 0; i < kk_body.size(); ++i) {
+        auto* blk = dynamic_cast<structured_control_flow::Block*>(&kk_body.at(i).first);
+        if (!blk) continue;
+        for (auto& n : blk->dataflow().nodes()) {
+            if (auto* lib = dynamic_cast<data_flow::LibraryNode*>(&n)) {
+                if (lib->code() == data_flow::LibraryNodeType_BarrierLocal) {
+                    ++barrier_block_count;
+                    break;
+                }
+            }
+        }
+    }
+    EXPECT_GE(barrier_block_count, 2) << "Expected at least 2 barrier blocks (one per copy boundary pair)";
+}

From 4be7da0a488588defb5429e224cdd3bbe93a289b Mon Sep 17 00:00:00 2001
From: Lukas Truemper <lukas.truemper@outlook.de>
Date: Sun, 14 Jun 2026 17:55:54 +0200
Subject: [PATCH 4/4] allows GPU schedules on strided maps

---
 opt/include/sdfg/targets/gpu/gpu_map_utils.h  | 24 +++++++
 opt/src/targets/cuda/cuda_map_dispatcher.cpp  | 69 ++++++++++++++++---
 opt/src/targets/gpu/gpu_map_utils.cpp         | 41 ++++++++---
 opt/src/targets/rocm/rocm_map_dispatcher.cpp  | 57 ++++++++++++---
 .../cuda_parallelize_nested_map.cpp           | 14 ++--
 .../offloading/offload_transform.cpp          | 12 ++--
 .../rocm_parallelize_nested_map.cpp           | 14 ++--
 7 files changed, 175 insertions(+), 56 deletions(-)

diff --git a/opt/include/sdfg/targets/gpu/gpu_map_utils.h b/opt/include/sdfg/targets/gpu/gpu_map_utils.h
index 7c62c6d2..85fc0685 100644
--- a/opt/include/sdfg/targets/gpu/gpu_map_utils.h
+++ b/opt/include/sdfg/targets/gpu/gpu_map_utils.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <vector>
+
 #include "sdfg/analysis/analysis.h"
 #include "sdfg/structured_control_flow/map.h"
 #include "sdfg/symbolic/symbolic.h"
@@ -75,6 +77,23 @@ symbolic::SymbolSet get_gpu_indvars(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
 );
 
+/**
+ * @brief Get all GPU Map nodes in a given dimension (in tree traversal order).
+ *
+ * Unlike get_gpu_indvars, this preserves access to each Map's init / stride
+ * so the codegen can emit `indvar = init + thread_flat_id * stride` for
+ * arbitrary affine grid loops.
+ *
+ * @tparam ScheduleT Schedule type class with value() and dimension() static methods
+ * @param node The current map node
+ * @param analysis_manager Analysis manager for loop analysis
+ * @param dimension GPU dimension (X, Y, or Z)
+ * @return Vector of Map pointers in the given GPU dimension
+ */
+template<typename ScheduleT>
+std::vector<structured_control_flow::Map*>
+get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension);
+
 // Extern template declarations to prevent implicit instantiation
 extern template symbolic::Expression find_nested_gpu_blocksize<
     cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
@@ -96,5 +115,10 @@ extern template symbolic::SymbolSet get_gpu_indvars<
 extern template symbolic::SymbolSet get_gpu_indvars<
     rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
 
+extern template std::vector<structured_control_flow::Map*> get_gpu_maps<
+    cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
+extern template std::vector<structured_control_flow::Map*> get_gpu_maps<
+    rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
+
 } // namespace gpu
 } // namespace sdfg
diff --git a/opt/src/targets/cuda/cuda_map_dispatcher.cpp b/opt/src/targets/cuda/cuda_map_dispatcher.cpp
index 61ad78d1..02f507cd 100644
--- a/opt/src/targets/cuda/cuda_map_dispatcher.cpp
+++ b/opt/src/targets/cuda/cuda_map_dispatcher.cpp
@@ -207,8 +207,27 @@ void CUDAMapDispatcher::dispatch_kernel_body(
     }
     // Boundary Conditions
     if (!ScheduleType_CUDA::nested_sync(node_.schedule_type())) {
-        library_stream << "if (" << indvar->get_name() << " < " << cuda_language_extension.expression(num_iterations)
-                       << ") {" << std::endl;
+        // Guard on the flat thread id rather than the per-Map indvar so that
+        // Maps with non-unit stride or non-zero init still get a correct OOB
+        // check (the per-Map indvar = init + flat_id * stride and is
+        // only well-defined when flat_id < num_iterations).
+        std::string flat_id;
+        switch (ScheduleType_CUDA::dimension(node_.schedule_type())) {
+            case CUDADimension::X:
+                flat_id = "__daisy_cuda_indvar_x";
+                break;
+            case CUDADimension::Y:
+                flat_id = "__daisy_cuda_indvar_y";
+                break;
+            case CUDADimension::Z:
+                flat_id = "__daisy_cuda_indvar_z";
+                break;
+            default:
+                flat_id = indvar->get_name();
+                break;
+        }
+        library_stream << "if (" << flat_id << " < " << cuda_language_extension.expression(num_iterations) << ") {"
+                       << std::endl;
         library_stream.setIndent(library_stream.indent() + 4);
     }
 
@@ -317,18 +336,46 @@ void CUDAMapDispatcher::dispatch_kernel_preamble(
     library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";"
                    << std::endl;
 
-    // Declare all other indvars in the kernel
-    for (auto& var : x_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl;
-    }
+    // Declare each per-Map indvar as a strided affine of the flat thread id:
+    //   <map.indvar> = <map.init> + <thread_flat_id> * <map.stride>
+    //
+    // This lets the dispatcher consume Maps with arbitrary init / stride
+    // (e.g. block-tiled outer loops produced by LoopTiling). The bound check
+    // in dispatch_kernel_body() guards on the flat id against num_iterations,
+    // so out-of-grid threads are skipped before any body access.
+    auto x_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::X);
+    auto y_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::Y);
+    auto z_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::Z);
+
+    auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) {
+        symbolic::Expression value = symbolic::symbol(flat_id_var);
+        auto stride = map->stride();
+        if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) {
+            value = symbolic::mul(value, stride);
+        }
+        auto init = map->init();
+        if (!symbolic::eq(init, symbolic::zero())) {
+            value = symbolic::add(init, value);
+        }
+        library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value)
+                       << ";" << std::endl;
+    };
 
-    for (auto& var : y_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl;
+    for (auto* map : x_maps) {
+        emit_indvar(map, indvar_x);
     }
-
-    for (auto& var : z_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl;
+    for (auto* map : y_maps) {
+        emit_indvar(map, indvar_y);
+    }
+    for (auto* map : z_maps) {
+        emit_indvar(map, indvar_z);
     }
+    // x_vars/y_vars/z_vars params kept for signature compatibility (used by
+    // callers to filter scope_variables); their iteration here would be
+    // redundant with the per-Map loops above.
+    (void) x_vars;
+    (void) y_vars;
+    (void) z_vars;
 }
 
 codegen::InstrumentationInfo CUDAMapDispatcher::instrumentation_info() const {
diff --git a/opt/src/targets/gpu/gpu_map_utils.cpp b/opt/src/targets/gpu/gpu_map_utils.cpp
index 02b2afc9..f3a87879 100644
--- a/opt/src/targets/gpu/gpu_map_utils.cpp
+++ b/opt/src/targets/gpu/gpu_map_utils.cpp
@@ -88,16 +88,10 @@ symbolic::Expression find_nested_gpu_iterations(
                 continue;
             }
 
-            auto init = map->init();
-            if (!symbolic::eq(init, symbolic::zero())) {
-                throw InvalidSDFGException("Init is not zero");
-            }
-
-            auto stride = map->stride();
-            if (!symbolic::eq(stride, symbolic::one())) {
-                throw InvalidSDFGException("Stride is not one");
-            }
-
+            // Note: arbitrary `init` and `stride` are permitted here; the
+            // dispatcher emits `indvar = init + thread_flat_id * stride` so
+            // the body sees the natural strided value. `num_iterations()`
+            // already accounts for both.
             auto num_iterations = map->num_iterations();
             if (num_iterations.is_null()) {
                 throw InvalidSDFGException("Cannot determine number of iterations for nested map in GPU kernel");
@@ -144,6 +138,25 @@ symbolic::SymbolSet get_gpu_indvars(
     return indvars;
 }
 
+template<typename ScheduleT>
+std::vector<structured_control_flow::Map*>
+get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension) {
+    auto& loop_analysis = analysis_manager.get<analysis::LoopAnalysis>();
+    auto loops = loop_analysis.descendants(&node);
+    loops.insert(&node);
+    std::vector<structured_control_flow::Map*> maps;
+    for (const auto& loop : loops) {
+        if (auto map = dynamic_cast<structured_control_flow::Map*>(loop)) {
+            if (map->schedule_type().value() == ScheduleT::value()) {
+                if (ScheduleT::dimension(map->schedule_type()) == dimension) {
+                    maps.push_back(map);
+                }
+            }
+        }
+    }
+    return maps;
+}
+
 // Explicit template instantiations for CUDA
 template symbolic::Expression find_nested_gpu_blocksize<cuda::ScheduleType_CUDA>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
@@ -160,6 +173,10 @@ template symbolic::SymbolSet get_gpu_indvars<cuda::ScheduleType_CUDA>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
 );
 
+template std::vector<structured_control_flow::Map*> get_gpu_maps<cuda::ScheduleType_CUDA>(
+    structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
+);
+
 // Explicit template instantiations for ROCM
 template symbolic::Expression find_nested_gpu_blocksize<rocm::ScheduleType_ROCM>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
@@ -176,5 +193,9 @@ template symbolic::SymbolSet get_gpu_indvars<rocm::ScheduleType_ROCM>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
 );
 
+template std::vector<structured_control_flow::Map*> get_gpu_maps<rocm::ScheduleType_ROCM>(
+    structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
+);
+
 } // namespace gpu
 } // namespace sdfg
diff --git a/opt/src/targets/rocm/rocm_map_dispatcher.cpp b/opt/src/targets/rocm/rocm_map_dispatcher.cpp
index 0c1b4151..08634ac7 100644
--- a/opt/src/targets/rocm/rocm_map_dispatcher.cpp
+++ b/opt/src/targets/rocm/rocm_map_dispatcher.cpp
@@ -207,8 +207,23 @@ void ROCMMapDispatcher::dispatch_kernel_body(
     }
     // Boundary Conditions
     if (!ScheduleType_ROCM::nested_sync(node_.schedule_type())) {
-        library_stream << "if (" << indvar->get_name() << " < " << rocm_language_extension.expression(num_iterations)
-                       << ") {" << std::endl;
+        std::string flat_id;
+        switch (ScheduleType_ROCM::dimension(node_.schedule_type())) {
+            case ROCMDimension::X:
+                flat_id = "__daisy_hip_indvar_x";
+                break;
+            case ROCMDimension::Y:
+                flat_id = "__daisy_hip_indvar_y";
+                break;
+            case ROCMDimension::Z:
+                flat_id = "__daisy_hip_indvar_z";
+                break;
+            default:
+                flat_id = indvar->get_name();
+                break;
+        }
+        library_stream << "if (" << flat_id << " < " << rocm_language_extension.expression(num_iterations) << ") {"
+                       << std::endl;
         library_stream.setIndent(library_stream.indent() + 4);
     }
 
@@ -316,18 +331,38 @@ void ROCMMapDispatcher::dispatch_kernel_preamble(
     library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";"
                    << std::endl;
 
-    // Declare all other indvars in the kernel
-    for (auto& var : x_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl;
-    }
+    // Declare each per-Map indvar as a strided affine of the flat thread id:
+    //   <map.indvar> = <map.init> + <thread_flat_id> * <map.stride>
+    auto x_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::X);
+    auto y_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::Y);
+    auto z_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::Z);
+
+    auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) {
+        symbolic::Expression value = symbolic::symbol(flat_id_var);
+        auto stride = map->stride();
+        if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) {
+            value = symbolic::mul(value, stride);
+        }
+        auto init = map->init();
+        if (!symbolic::eq(init, symbolic::zero())) {
+            value = symbolic::add(init, value);
+        }
+        library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value)
+                       << ";" << std::endl;
+    };
 
-    for (auto& var : y_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl;
+    for (auto* map : x_maps) {
+        emit_indvar(map, indvar_x);
     }
-
-    for (auto& var : z_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl;
+    for (auto* map : y_maps) {
+        emit_indvar(map, indvar_y);
+    }
+    for (auto* map : z_maps) {
+        emit_indvar(map, indvar_z);
     }
+    (void) x_vars;
+    (void) y_vars;
+    (void) z_vars;
 }
 
 codegen::InstrumentationInfo ROCMMapDispatcher::instrumentation_info() const {
diff --git a/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp b/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp
index 54f1ae6c..24586897 100644
--- a/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp
+++ b/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp
@@ -53,16 +53,10 @@ bool CUDAParallelizeNestedMap::
         return false;
     }
 
-    // Condition: Check if current loop starts from 0
-    if (!symbolic::eq(loop_.init(), symbolic::zero())) {
-        return false;
-    }
-
-    // Condition: Loop has a stride of 1
-    auto stride = loop_.stride();
-    if (!symbolic::eq(stride, symbolic::one())) {
-        return false;
-    }
+    // Note: arbitrary `init` and `stride` are permitted. The CUDA dispatcher
+    // emits `<map.indvar> = init + thread_flat_id * stride`, so the body sees
+    // the natural strided value; `num_iterations()` accounts for both when
+    // computing the grid geometry.
 
     // Condition: Resulting CUDA grid dimension must not exceed hardware limits.
     // Y and Z grid dimensions are limited to 65535.
diff --git a/opt/src/transformations/offloading/offload_transform.cpp b/opt/src/transformations/offloading/offload_transform.cpp
index bc47d294..6130aafd 100644
--- a/opt/src/transformations/offloading/offload_transform.cpp
+++ b/opt/src/transformations/offloading/offload_transform.cpp
@@ -75,10 +75,14 @@ bool OffloadTransform::can_be_applied(builder::StructuredSDFGBuilder& builder, a
         }
     }
 
-    // Criterion: Map must start at 0
-    if (!symbolic::eq(this->map_.init(), symbolic::zero())) {
-        if (report_) report_->transform_impossible(this, "non zero start");
-        DEBUG_PRINTLN("Cannot apply transform: map does not start at zero");
+    // Note: arbitrary `init` and `stride` are permitted on the kernel-boundary
+    // Map. The CUDA/ROCm dispatchers emit
+    //   `<map.indvar> = init + thread_flat_id * stride`,
+    // and `num_iterations()` already accounts for both when computing the grid
+    // geometry.
+    if (map_.num_iterations().is_null()) {
+        if (report_) report_->transform_impossible(this, "cannot determine num iterations");
+        DEBUG_PRINTLN("Cannot apply transform: cannot determine number of iterations for map");
         return false;
     }
 
diff --git a/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp b/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp
index 1b25864b..0bfd55b9 100644
--- a/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp
+++ b/opt/src/transformations/offloading/rocm_parallelize_nested_map.cpp
@@ -53,16 +53,10 @@ bool ROCMParallelizeNestedMap::
         return false;
     }
 
-    // Condition: Check if current loop starts from 0
-    if (!symbolic::eq(loop_.init(), symbolic::zero())) {
-        return false;
-    }
-
-    // Condition: Loop has a stride of 1
-    auto stride = loop_.stride();
-    if (!symbolic::eq(stride, symbolic::one())) {
-        return false;
-    }
+    // Note: arbitrary `init` and `stride` are permitted. The ROCm dispatcher
+    // emits `<map.indvar> = init + thread_flat_id * stride`, so the body sees
+    // the natural strided value; `num_iterations()` accounts for both when
+    // computing the grid geometry.
 
     // Condition: Resulting ROCm grid dimension must not exceed hardware limits.
     // Y and Z grid dimensions are limited to 65535.