daisytuner · lukastruemper · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h b/opt/include/sdfg/passes/offloading/gpu_tiling_pass.h
@@ -8,14 +8,26 @@ namespace sdfg {
 namespace passes {
 
 /**
- * @brief Phased GPU tiling pass.
+ * @brief [DEPRECATED] Phased GPU tiling pass.
+ *
+ * Drives the legacy `transformations::GPUTiling` +
+ * `transformations::KernelLocalStorage` pair. Both are deprecated. New code
+ * should compose `LoopTiling`, `CUDAParallelizeNestedMap` / `cuda::CUDATransform`,
+ * `InLocalStorage` / `OutLocalStorage` (with `NV_Shared`), and
+ * `passes::SyncConditionPropagation` directly — see
+ * `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example.
+ *
+ * Retained for the existing CUDA and ROCm schedulers that have not yet been
+ * migrated.
  *
  * Given a set of outer maps, finds all descendant structured loops and applies
  * GPU tiling in two phases:
  * 1. can_be_applied phase: collects all loops where tiling is applicable
  * 2. apply phase: applies tiling to all collected loops
  */
-class GPUTilingPass : public Pass {
+class [[deprecated(
+    "Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp."
+)]] GPUTilingPass : public Pass {
 private:
     const std::vector<structured_control_flow::Map*>& maps_;
     size_t tile_size_;

diff --git a/opt/include/sdfg/targets/gpu/gpu_map_utils.h b/opt/include/sdfg/targets/gpu/gpu_map_utils.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <vector>
+
 #include "sdfg/analysis/analysis.h"
 #include "sdfg/structured_control_flow/map.h"
 #include "sdfg/symbolic/symbolic.h"
@@ -75,6 +77,23 @@ symbolic::SymbolSet get_gpu_indvars(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
 );
 
+/**
+ * @brief Collect `node` and its descendant Maps that are scheduled in a given GPU dimension.
+ *
+ * Unlike get_gpu_indvars, this returns the Maps themselves, so codegen can read
+ * each Map's init / stride and emit `indvar = init + thread_flat_id * stride`.
+ * NOTE(review): result order is the iteration order of LoopAnalysis::descendants(); confirm it is tree order.
+ *
+ * @tparam ScheduleT Schedule type class with value() and dimension() static methods
+ * @param node Root map node; it is included in the result when it matches
+ * @param analysis_manager Analysis manager used to obtain the LoopAnalysis
+ * @param dimension GPU dimension (X, Y, or Z)
+ * @return Maps whose schedule type value equals ScheduleT::value() and whose dimension equals `dimension`
+ */
+template<typename ScheduleT>
+std::vector<structured_control_flow::Map*>
+get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension);
+
 // Extern template declarations to prevent implicit instantiation
 extern template symbolic::Expression find_nested_gpu_blocksize<
     cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
@@ -96,5 +115,10 @@ extern template symbolic::SymbolSet get_gpu_indvars<
 extern template symbolic::SymbolSet get_gpu_indvars<
     rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
 
+extern template std::vector<structured_control_flow::Map*> get_gpu_maps<
+    cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
+extern template std::vector<structured_control_flow::Map*> get_gpu_maps<
+    rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
+
 } // namespace gpu
 } // namespace sdfg
diff --git a/opt/include/sdfg/transformations/offloading/gpu_tiling.h b/opt/include/sdfg/transformations/offloading/gpu_tiling.h
@@ -6,7 +6,27 @@
 namespace sdfg {
 namespace transformations {
 
-class GPUTiling : public Transformation {
+/**
+ * @brief [DEPRECATED] Monolithic GPU tiling transformation.
+ *
+ * Prefer the composable pipeline instead:
+ *   1. transformations::LoopTiling             — strip-mine the target loop
+ *   2. transformations::CUDAParallelizeNestedMap / cuda::CUDATransform
+ *                                              — assign GPU schedules
+ *   3. transformations::InLocalStorage  (NV_Shared) — stage read tiles
+ *      transformations::OutLocalStorage (NV_Shared) — stage write tiles
+ *   4. passes::SyncConditionPropagation         — guard out-of-bounds threads
+ *
+ * See `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked
+ * example (GEMM). KernelLocalStorage and GPUTilingPass are deprecated for
+ * the same reason.
+ *
+ * The legacy transformation is retained for autotuning search spaces and
+ * existing schedulers that have not yet been migrated.
+ */
+class [[deprecated(
+    "Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp."
+)]] GPUTiling : public Transformation {
     structured_control_flow::StructuredLoop& loop_;
     size_t size_;
     bool applied_ = false;

diff --git a/opt/include/sdfg/transformations/offloading/kernel_local_storage.h b/opt/include/sdfg/transformations/offloading/kernel_local_storage.h
@@ -9,7 +9,21 @@
 namespace sdfg {
 namespace transformations {
 
-class KernelLocalStorage : public Transformation {
+/**
+ * @brief [DEPRECATED] Monolithic shared-memory staging transformation.
+ *
+ * Prefer `transformations::InLocalStorage` (for read tiles) and
+ * `transformations::OutLocalStorage` (for write tiles) with the
+ * `types::StorageType::NV_Shared()` storage. Those transformations are
+ * composable with `LoopTiling`, `CUDATransform` / `CUDAParallelizeNestedMap`,
+ * and `passes::SyncConditionPropagation`. See
+ * `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example.
+ *
+ * The legacy transformation is retained for autotuning search spaces and
+ * existing schedulers that have not yet been migrated.
+ */
+class [[deprecated("Use InLocalStorage / OutLocalStorage with NV_Shared. See gpu_kernels_test.cpp."
+)]] KernelLocalStorage : public Transformation {
 private:
     structured_control_flow::StructuredLoop& loop_;
     symbolic::Expression offset_;

diff --git a/opt/src/targets/cuda/cuda_map_dispatcher.cpp b/opt/src/targets/cuda/cuda_map_dispatcher.cpp
@@ -207,8 +207,27 @@ void CUDAMapDispatcher::dispatch_kernel_body(
     }
     // Boundary Conditions
     if (!ScheduleType_CUDA::nested_sync(node_.schedule_type())) {
-        library_stream << "if (" << indvar->get_name() << " < " << cuda_language_extension.expression(num_iterations)
-                       << ") {" << std::endl;
+        // Guard on the flat thread id rather than the per-Map indvar so that
+        // Maps with non-unit stride or non-zero init still get a correct OOB
+        // check (the per-Map indvar = init + flat_id * stride and is
+        // only well-defined when flat_id < num_iterations).
+        std::string flat_id;
+        switch (ScheduleType_CUDA::dimension(node_.schedule_type())) {
+            case CUDADimension::X:
+                flat_id = "__daisy_cuda_indvar_x";
+                break;
+            case CUDADimension::Y:
+                flat_id = "__daisy_cuda_indvar_y";
+                break;
+            case CUDADimension::Z:
+                flat_id = "__daisy_cuda_indvar_z";
+                break;
+            default:
+                flat_id = indvar->get_name();
+                break;
+        }
+        library_stream << "if (" << flat_id << " < " << cuda_language_extension.expression(num_iterations) << ") {"
+                       << std::endl;
         library_stream.setIndent(library_stream.indent() + 4);
     }
 
@@ -317,18 +336,46 @@ void CUDAMapDispatcher::dispatch_kernel_preamble(
     library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";"
                    << std::endl;
 
-    // Declare all other indvars in the kernel
-    for (auto& var : x_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl;
-    }
+    // Declare each per-Map indvar as a strided affine of the flat thread id:
+    //   <map.indvar> = <map.init> + <thread_flat_id> * <map.stride>
+    //
+    // This lets the dispatcher consume Maps with arbitrary init / stride
+    // (e.g. block-tiled outer loops produced by LoopTiling). The bound check
+    // in dispatch_kernel_body() guards on the flat id against num_iterations,
+    // so out-of-grid threads are skipped before any body access.
+    auto x_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::X);
+    auto y_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::Y);
+    auto z_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::Z);
+
+    auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) {
+        symbolic::Expression value = symbolic::symbol(flat_id_var);
+        auto stride = map->stride();
+        if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) {
+            value = symbolic::mul(value, stride);
+        }
+        auto init = map->init();
+        if (!symbolic::eq(init, symbolic::zero())) {
+            value = symbolic::add(init, value);
+        }
+        library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value)
+                       << ";" << std::endl;
+    };
 
-    for (auto& var : y_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl;
+    for (auto* map : x_maps) {
+        emit_indvar(map, indvar_x);
     }
-
-    for (auto& var : z_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl;
+    for (auto* map : y_maps) {
+        emit_indvar(map, indvar_y);
+    }
+    for (auto* map : z_maps) {
+        emit_indvar(map, indvar_z);
     }
+    // x_vars/y_vars/z_vars params kept for signature compatibility (used by
+    // callers to filter scope_variables); their iteration here would be
+    // redundant with the per-Map loops above.
+    (void) x_vars;
+    (void) y_vars;
+    (void) z_vars;
 }
 
 codegen::InstrumentationInfo CUDAMapDispatcher::instrumentation_info() const {

diff --git a/opt/src/targets/gpu/gpu_map_utils.cpp b/opt/src/targets/gpu/gpu_map_utils.cpp
@@ -88,16 +88,10 @@ symbolic::Expression find_nested_gpu_iterations(
                 continue;
             }
 
-            auto init = map->init();
-            if (!symbolic::eq(init, symbolic::zero())) {
-                throw InvalidSDFGException("Init is not zero");
-            }
-
-            auto stride = map->stride();
-            if (!symbolic::eq(stride, symbolic::one())) {
-                throw InvalidSDFGException("Stride is not one");
-            }
-
+            // Note: arbitrary `init` and `stride` are permitted here; the
+            // dispatcher emits `indvar = init + thread_flat_id * stride` so
+            // the body sees the natural strided value. `num_iterations()`
+            // already accounts for both.
             auto num_iterations = map->num_iterations();
             if (num_iterations.is_null()) {
                 throw InvalidSDFGException("Cannot determine number of iterations for nested map in GPU kernel");
@@ -144,6 +138,25 @@ symbolic::SymbolSet get_gpu_indvars(
     return indvars;
 }
 
+template<typename ScheduleT>
+std::vector<structured_control_flow::Map*>
+get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension) {
+    // Candidates: every loop LoopAnalysis reports as a descendant of `node`, plus `node` itself.
+    auto candidates = analysis_manager.get<analysis::LoopAnalysis>().descendants(&node);
+    candidates.insert(&node);
+
+    std::vector<structured_control_flow::Map*> selected;
+    for (const auto& candidate : candidates) {
+        // Keep only Maps carrying the ScheduleT schedule in the requested dimension.
+        auto* as_map = dynamic_cast<structured_control_flow::Map*>(candidate);
+        if (as_map != nullptr && as_map->schedule_type().value() == ScheduleT::value() &&
+            ScheduleT::dimension(as_map->schedule_type()) == dimension) {
+            selected.push_back(as_map);
+        }
+    }
+    return selected;
+}
+
 // Explicit template instantiations for CUDA
 template symbolic::Expression find_nested_gpu_blocksize<cuda::ScheduleType_CUDA>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
@@ -160,6 +173,10 @@ template symbolic::SymbolSet get_gpu_indvars<cuda::ScheduleType_CUDA>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
 );
 
+template std::vector<structured_control_flow::Map*> get_gpu_maps<cuda::ScheduleType_CUDA>(
+    structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
+);
+
 // Explicit template instantiations for ROCM
 template symbolic::Expression find_nested_gpu_blocksize<rocm::ScheduleType_ROCM>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
@@ -176,5 +193,9 @@ template symbolic::SymbolSet get_gpu_indvars<rocm::ScheduleType_ROCM>(
     structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
 );
 
+template std::vector<structured_control_flow::Map*> get_gpu_maps<rocm::ScheduleType_ROCM>(
+    structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
+);
+
 } // namespace gpu
 } // namespace sdfg
diff --git a/opt/src/targets/rocm/rocm_map_dispatcher.cpp b/opt/src/targets/rocm/rocm_map_dispatcher.cpp
@@ -207,8 +207,23 @@ void ROCMMapDispatcher::dispatch_kernel_body(
     }
     // Boundary Conditions
     if (!ScheduleType_ROCM::nested_sync(node_.schedule_type())) {
-        library_stream << "if (" << indvar->get_name() << " < " << rocm_language_extension.expression(num_iterations)
-                       << ") {" << std::endl;
+        std::string flat_id;
+        switch (ScheduleType_ROCM::dimension(node_.schedule_type())) {
+            case ROCMDimension::X:
+                flat_id = "__daisy_hip_indvar_x";
+                break;
+            case ROCMDimension::Y:
+                flat_id = "__daisy_hip_indvar_y";
+                break;
+            case ROCMDimension::Z:
+                flat_id = "__daisy_hip_indvar_z";
+                break;
+            default:
+                flat_id = indvar->get_name();
+                break;
+        }
+        library_stream << "if (" << flat_id << " < " << rocm_language_extension.expression(num_iterations) << ") {"
+                       << std::endl;
         library_stream.setIndent(library_stream.indent() + 4);
     }
 
@@ -316,18 +331,38 @@ void ROCMMapDispatcher::dispatch_kernel_preamble(
     library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";"
                    << std::endl;
 
-    // Declare all other indvars in the kernel
-    for (auto& var : x_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl;
-    }
+    // Declare each per-Map indvar as a strided affine of the flat thread id:
+    //   <map.indvar> = <map.init> + <thread_flat_id> * <map.stride>
+    auto x_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::X);
+    auto y_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::Y);
+    auto z_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::Z);
+
+    auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) {
+        symbolic::Expression value = symbolic::symbol(flat_id_var);
+        auto stride = map->stride();
+        if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) {
+            value = symbolic::mul(value, stride);
+        }
+        auto init = map->init();
+        if (!symbolic::eq(init, symbolic::zero())) {
+            value = symbolic::add(init, value);
+        }
+        library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value)
+                       << ";" << std::endl;
+    };
 
-    for (auto& var : y_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl;
+    for (auto* map : x_maps) {
+        emit_indvar(map, indvar_x);
     }
-
-    for (auto& var : z_vars) {
-        library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl;
+    for (auto* map : y_maps) {
+        emit_indvar(map, indvar_y);
+    }
+    for (auto* map : z_maps) {
+        emit_indvar(map, indvar_z);
     }
+    (void) x_vars;
+    (void) y_vars;
+    (void) z_vars;
 }
 
 codegen::InstrumentationInfo ROCMMapDispatcher::instrumentation_info() const {