Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions opt/include/sdfg/passes/offloading/gpu_tiling_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,26 @@ namespace sdfg {
namespace passes {

/**
* @brief Phased GPU tiling pass.
* @brief [DEPRECATED] Phased GPU tiling pass.
*
* Drives the legacy `transformations::GPUTiling` +
* `transformations::KernelLocalStorage` pair. Both are deprecated. New code
* should compose `LoopTiling`, `CUDAParallelizeNestedMap` / `cuda::CUDATransform`,
* `InLocalStorage` / `OutLocalStorage` (with `NV_Shared`), and
* `passes::SyncConditionPropagation` directly — see
* `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example.
*
* Retained for the existing CUDA and ROCm schedulers that have not yet been
* migrated.
*
* Given a set of outer maps, finds all descendant structured loops and applies
* GPU tiling in two phases:
* 1. can_be_applied phase: collects all loops where tiling is applicable
* 2. apply phase: applies tiling to all collected loops
*/
class GPUTilingPass : public Pass {
class [[deprecated(
"Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp."
)]] GPUTilingPass : public Pass {
private:
const std::vector<structured_control_flow::Map*>& maps_;
size_t tile_size_;
Expand Down
24 changes: 24 additions & 0 deletions opt/include/sdfg/targets/gpu/gpu_map_utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#pragma once

#include <vector>

#include "sdfg/analysis/analysis.h"
#include "sdfg/structured_control_flow/map.h"
#include "sdfg/symbolic/symbolic.h"
Expand Down Expand Up @@ -75,6 +77,23 @@ symbolic::SymbolSet get_gpu_indvars(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
);

/**
* @brief Get all GPU Map nodes in a given dimension (in tree traversal order).
*
* Unlike get_gpu_indvars, this preserves access to each Map's init / stride
* so the codegen can emit `indvar = init + thread_flat_id * stride` for
* arbitrary affine grid loops.
*
* @tparam ScheduleT Schedule type class with value() and dimension() static methods
* @param node The current map node
* @param analysis_manager Analysis manager for loop analysis
* @param dimension GPU dimension (X, Y, or Z)
* @return Vector of Map pointers in the given GPU dimension
*/
template<typename ScheduleT>
std::vector<structured_control_flow::Map*>
get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension);

// Extern template declarations to prevent implicit instantiation
extern template symbolic::Expression find_nested_gpu_blocksize<
cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
Expand All @@ -96,5 +115,10 @@ extern template symbolic::SymbolSet get_gpu_indvars<
extern template symbolic::SymbolSet get_gpu_indvars<
rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);

extern template std::vector<structured_control_flow::Map*> get_gpu_maps<
cuda::ScheduleType_CUDA>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);
extern template std::vector<structured_control_flow::Map*> get_gpu_maps<
rocm::ScheduleType_ROCM>(structured_control_flow::Map&, analysis::AnalysisManager&, GPUDimension);

} // namespace gpu
} // namespace sdfg
22 changes: 21 additions & 1 deletion opt/include/sdfg/transformations/offloading/gpu_tiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,27 @@
namespace sdfg {
namespace transformations {

class GPUTiling : public Transformation {
/**
* @brief [DEPRECATED] Monolithic GPU tiling transformation.
*
* Prefer the composable pipeline instead:
* 1. transformations::LoopTiling — strip-mine the target loop
* 2. transformations::CUDAParallelizeNestedMap / cuda::CUDATransform
* — assign GPU schedules
* 3. transformations::InLocalStorage (NV_Shared) — stage read tiles
* transformations::OutLocalStorage (NV_Shared) — stage write tiles
* 4. passes::SyncConditionPropagation — guard out-of-bounds threads
*
* See `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked
* example (GEMM). KernelLocalStorage and GPUTilingPass are deprecated for
* the same reason.
*
* The legacy transformation is retained for autotuning search spaces and
* existing schedulers that have not yet been migrated.
*/
class [[deprecated(
"Use LoopTiling + CUDA/ROCm parallelize + In/OutLocalStorage + SyncConditionPropagation. See gpu_kernels_test.cpp."
)]] GPUTiling : public Transformation {
structured_control_flow::StructuredLoop& loop_;
size_t size_;
bool applied_ = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,21 @@
namespace sdfg {
namespace transformations {

class KernelLocalStorage : public Transformation {
/**
* @brief [DEPRECATED] Monolithic shared-memory staging transformation.
*
* Prefer `transformations::InLocalStorage` (for read tiles) and
* `transformations::OutLocalStorage` (for write tiles) with the
* `types::StorageType::NV_Shared()` storage. Those transformations are
* composable with `LoopTiling`, `CUDATransform` / `CUDAParallelizeNestedMap`,
* and `passes::SyncConditionPropagation`. See
* `docc/opt/tests/optimizations/gpu_kernels_test.cpp` for a worked example.
*
* The legacy transformation is retained for autotuning search spaces and
* existing schedulers that have not yet been migrated.
*/
class [[deprecated("Use InLocalStorage / OutLocalStorage with NV_Shared. See gpu_kernels_test.cpp."
)]] KernelLocalStorage : public Transformation {
private:
structured_control_flow::StructuredLoop& loop_;
symbolic::Expression offset_;
Expand Down
69 changes: 58 additions & 11 deletions opt/src/targets/cuda/cuda_map_dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,27 @@ void CUDAMapDispatcher::dispatch_kernel_body(
}
// Boundary Conditions
if (!ScheduleType_CUDA::nested_sync(node_.schedule_type())) {
library_stream << "if (" << indvar->get_name() << " < " << cuda_language_extension.expression(num_iterations)
<< ") {" << std::endl;
// Guard on the flat thread id rather than the per-Map indvar so that
// Maps with non-unit stride or non-zero init still get a correct OOB
// check (the per-Map indvar = init + flat_id * stride and is
// only well-defined when flat_id < num_iterations).
std::string flat_id;
switch (ScheduleType_CUDA::dimension(node_.schedule_type())) {
case CUDADimension::X:
flat_id = "__daisy_cuda_indvar_x";
break;
case CUDADimension::Y:
flat_id = "__daisy_cuda_indvar_y";
break;
case CUDADimension::Z:
flat_id = "__daisy_cuda_indvar_z";
break;
default:
flat_id = indvar->get_name();
break;
}
library_stream << "if (" << flat_id << " < " << cuda_language_extension.expression(num_iterations) << ") {"
<< std::endl;
library_stream.setIndent(library_stream.indent() + 4);
}

Expand Down Expand Up @@ -317,18 +336,46 @@ void CUDAMapDispatcher::dispatch_kernel_preamble(
library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";"
<< std::endl;

// Declare all other indvars in the kernel
for (auto& var : x_vars) {
library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl;
}
// Declare each per-Map indvar as a strided affine of the flat thread id:
// <map.indvar> = <map.init> + <thread_flat_id> * <map.stride>
//
// This lets the dispatcher consume Maps with arbitrary init / stride
// (e.g. block-tiled outer loops produced by LoopTiling). The bound check
// in dispatch_kernel_body() guards on the flat id against num_iterations,
// so out-of-grid threads are skipped before any body access.
auto x_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::X);
auto y_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::Y);
auto z_maps = gpu::get_gpu_maps<ScheduleType_CUDA>(node_, analysis_manager, CUDADimension::Z);

// Writes one kernel-local declaration of the form `int <map indvar> = <expr>;`
// to library_stream. <expr> starts as the symbol named by flat_id_var, is
// multiplied by the map's stride, and then has the map's init added.
auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) {
symbolic::Expression value = symbolic::symbol(flat_id_var);
auto stride = map->stride();
// The multiplication is skipped when the stride is null or equal to one, so
// the unit-stride case emits the bare flat id.
// NOTE(review): a null stride therefore produces the same text as stride == 1;
// confirm that such maps are rejected before this point.
if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) {
value = symbolic::mul(value, stride);
}
auto init = map->init();
// The addition is skipped when init equals zero.
// NOTE(review): unlike stride, init is passed to symbolic::eq without a null
// check; presumably init() never returns a null expression. Confirm.
if (!symbolic::eq(init, symbolic::zero())) {
value = symbolic::add(init, value);
}
library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value)
<< ";" << std::endl;
};

for (auto& var : y_vars) {
library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl;
for (auto* map : x_maps) {
emit_indvar(map, indvar_x);
}

for (auto& var : z_vars) {
library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl;
for (auto* map : y_maps) {
emit_indvar(map, indvar_y);
}
for (auto* map : z_maps) {
emit_indvar(map, indvar_z);
}
// x_vars/y_vars/z_vars params kept for signature compatibility (used by
// callers to filter scope_variables); their iteration here would be
// redundant with the per-Map loops above.
(void) x_vars;
(void) y_vars;
(void) z_vars;
}

codegen::InstrumentationInfo CUDAMapDispatcher::instrumentation_info() const {
Expand Down
41 changes: 31 additions & 10 deletions opt/src/targets/gpu/gpu_map_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,10 @@ symbolic::Expression find_nested_gpu_iterations(
continue;
}

auto init = map->init();
if (!symbolic::eq(init, symbolic::zero())) {
throw InvalidSDFGException("Init is not zero");
}

auto stride = map->stride();
if (!symbolic::eq(stride, symbolic::one())) {
throw InvalidSDFGException("Stride is not one");
}

// Note: arbitrary `init` and `stride` are permitted here; the
// dispatcher emits `indvar = init + thread_flat_id * stride` so
// the body sees the natural strided value. `num_iterations()`
// already accounts for both.
auto num_iterations = map->num_iterations();
if (num_iterations.is_null()) {
throw InvalidSDFGException("Cannot determine number of iterations for nested map in GPU kernel");
Expand Down Expand Up @@ -144,6 +138,25 @@ symbolic::SymbolSet get_gpu_indvars(
return indvars;
}

/**
 * @brief Collect every Map at or below `node` that is scheduled on one GPU dimension.
 *
 * The candidate set is the result of `LoopAnalysis::descendants(&node)` plus
 * `node` itself. A candidate is kept when it is a `Map`, its schedule type
 * value equals `ScheduleT::value()`, and `ScheduleT::dimension()` of its
 * schedule type equals `dimension`.
 *
 * NOTE(review): the result order is the iteration order of the container
 * returned by `descendants()`; confirm that this matches the "tree traversal
 * order" stated on the declaration in gpu_map_utils.h.
 *
 * @tparam ScheduleT Schedule type class providing static value() and dimension()
 * @param node Root map of the search; it is itself a candidate
 * @param analysis_manager Source of the LoopAnalysis used to enumerate descendants
 * @param dimension GPU dimension to filter on
 * @return Pointers to the matching maps
 */
template<typename ScheduleT>
std::vector<structured_control_flow::Map*>
get_gpu_maps(structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension) {
    auto& loop_analysis = analysis_manager.get<analysis::LoopAnalysis>();

    // Descendant loops of `node`, with `node` added as a candidate as well.
    auto candidates = loop_analysis.descendants(&node);
    candidates.insert(&node);

    std::vector<structured_control_flow::Map*> selected;
    for (const auto& candidate : candidates) {
        auto* as_map = dynamic_cast<structured_control_flow::Map*>(candidate);
        if (!as_map) {
            continue;  // not a Map
        }

        const bool same_schedule = as_map->schedule_type().value() == ScheduleT::value();
        if (!same_schedule) {
            continue;  // scheduled for a different target
        }

        // Only query the dimension once the schedule type is known to match.
        const bool same_dimension = ScheduleT::dimension(as_map->schedule_type()) == dimension;
        if (!same_dimension) {
            continue;
        }

        selected.push_back(as_map);
    }
    return selected;
}

// Explicit template instantiations for CUDA
template symbolic::Expression find_nested_gpu_blocksize<cuda::ScheduleType_CUDA>(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
Expand All @@ -160,6 +173,10 @@ template symbolic::SymbolSet get_gpu_indvars<cuda::ScheduleType_CUDA>(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
);

template std::vector<structured_control_flow::Map*> get_gpu_maps<cuda::ScheduleType_CUDA>(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
);

// Explicit template instantiations for ROCM
template symbolic::Expression find_nested_gpu_blocksize<rocm::ScheduleType_ROCM>(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
Expand All @@ -176,5 +193,9 @@ template symbolic::SymbolSet get_gpu_indvars<rocm::ScheduleType_ROCM>(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
);

template std::vector<structured_control_flow::Map*> get_gpu_maps<rocm::ScheduleType_ROCM>(
structured_control_flow::Map& node, analysis::AnalysisManager& analysis_manager, GPUDimension dimension
);

} // namespace gpu
} // namespace sdfg
57 changes: 46 additions & 11 deletions opt/src/targets/rocm/rocm_map_dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,23 @@ void ROCMMapDispatcher::dispatch_kernel_body(
}
// Boundary Conditions
if (!ScheduleType_ROCM::nested_sync(node_.schedule_type())) {
library_stream << "if (" << indvar->get_name() << " < " << rocm_language_extension.expression(num_iterations)
<< ") {" << std::endl;
std::string flat_id;
switch (ScheduleType_ROCM::dimension(node_.schedule_type())) {
case ROCMDimension::X:
flat_id = "__daisy_hip_indvar_x";
break;
case ROCMDimension::Y:
flat_id = "__daisy_hip_indvar_y";
break;
case ROCMDimension::Z:
flat_id = "__daisy_hip_indvar_z";
break;
default:
flat_id = indvar->get_name();
break;
}
library_stream << "if (" << flat_id << " < " << rocm_language_extension.expression(num_iterations) << ") {"
<< std::endl;
library_stream.setIndent(library_stream.indent() + 4);
}

Expand Down Expand Up @@ -316,18 +331,38 @@ void ROCMMapDispatcher::dispatch_kernel_preamble(
library_stream << "int " << indvar_z << " = " << this->language_extension_.expression(gpu_indvar_z) << ";"
<< std::endl;

// Declare all other indvars in the kernel
for (auto& var : x_vars) {
library_stream << "int " << var->get_name() << " = " << indvar_x << ";" << std::endl;
}
// Declare each per-Map indvar as a strided affine of the flat thread id:
// <map.indvar> = <map.init> + <thread_flat_id> * <map.stride>
auto x_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::X);
auto y_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::Y);
auto z_maps = gpu::get_gpu_maps<ScheduleType_ROCM>(node_, analysis_manager, ROCMDimension::Z);

// Writes one kernel-local declaration of the form `int <map indvar> = <expr>;`
// to library_stream. <expr> starts as the symbol named by flat_id_var, is
// multiplied by the map's stride, and then has the map's init added.
auto emit_indvar = [&](structured_control_flow::Map* map, const std::string& flat_id_var) {
symbolic::Expression value = symbolic::symbol(flat_id_var);
auto stride = map->stride();
// The multiplication is skipped when the stride is null or equal to one, so
// the unit-stride case emits the bare flat id.
// NOTE(review): a null stride therefore produces the same text as stride == 1;
// confirm that such maps are rejected before this point.
if (!stride.is_null() && !symbolic::eq(stride, symbolic::one())) {
value = symbolic::mul(value, stride);
}
auto init = map->init();
// The addition is skipped when init equals zero.
// NOTE(review): unlike stride, init is passed to symbolic::eq without a null
// check; presumably init() never returns a null expression. Confirm.
if (!symbolic::eq(init, symbolic::zero())) {
value = symbolic::add(init, value);
}
library_stream << "int " << map->indvar()->get_name() << " = " << this->language_extension_.expression(value)
<< ";" << std::endl;
};

for (auto& var : y_vars) {
library_stream << "int " << var->get_name() << " = " << indvar_y << ";" << std::endl;
for (auto* map : x_maps) {
emit_indvar(map, indvar_x);
}

for (auto& var : z_vars) {
library_stream << "int " << var->get_name() << " = " << indvar_z << ";" << std::endl;
for (auto* map : y_maps) {
emit_indvar(map, indvar_y);
}
for (auto* map : z_maps) {
emit_indvar(map, indvar_z);
}
(void) x_vars;
(void) y_vars;
(void) z_vars;
}

codegen::InstrumentationInfo ROCMMapDispatcher::instrumentation_info() const {
Expand Down
Loading
Loading