From 86696f87a23135f88568aca365b047d9ba8796a3 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Sat, 30 May 2026 18:01:15 +0200 Subject: [PATCH 01/20] Add prototype of layerwise test --- .../model_zoo/segformer_layerwise_test.py | 309 ++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py diff --git a/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py b/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py new file mode 100644 index 000000000..4b37332be --- /dev/null +++ b/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py @@ -0,0 +1,309 @@ +"""Layerwise test for SegFormer-b0. + +Tests each encoder stage and the decode head individually with the docc backend, +checking the output of each against a pure-PyTorch reference. + +Structure of SegFormer-b0: + Encoder: + Stage 0: OverlapPatchEmbedding (stride=4) + 2x TransformerBlock + LayerNorm -> (B, 32, H/4, W/4) + Stage 1: OverlapPatchEmbedding (stride=2) + 2x TransformerBlock + LayerNorm -> (B, 64, H/8, W/8) + Stage 2: OverlapPatchEmbedding (stride=2) + 2x TransformerBlock + LayerNorm -> (B,160, H/16, W/16) + Stage 3: OverlapPatchEmbedding (stride=2) + 2x TransformerBlock + LayerNorm -> (B,256, H/32, W/32) + Decode head: + 4x Linear projection + upsample to stage-0 resolution + concat + fuse Conv+BN + classifier Conv +""" + +import time + +import pytest +import torch +import torch.nn as nn +from transformers import SegformerForSemanticSegmentation + +import docc.torch + +MODEL_NAME = "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" +INPUT_SHAPE = (1, 3, 512, 512) +RTOL = 1e-2 +ATOL = 1e-4 + + +# --------------------------------------------------------------------------- +# Wrappers +# --------------------------------------------------------------------------- + +class EncoderStageWrapper(nn.Module): + """One encoder stage (SegformerStage): patch embedding + transformer blocks + layer norm. + + In newer HuggingFace versions the stage is a self-contained SegformerStage module + whose forward accepts and returns a spatial feature map (B, C, H, W). + """ + + def __init__(self, stage): + super().__init__() + self.stage = stage + + def forward(self, x): + return self.stage(x) + + +class DecodeHeadWrapper(nn.Module): + """Decode head: takes 4 stage feature maps, returns logits (B, num_classes, H/4, W/4). + + Accepts stage outputs as individual positional arguments (not a tuple) so that + torch.compile / docc can trace through without dynamic container unpacking. + """ + + def __init__(self, decode_head): + super().__init__() + self.decode_head = decode_head + + def forward(self, s0, s1, s2, s3): + return self.decode_head((s0, s1, s2, s3)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _print_diff(result: torch.Tensor, reference: torch.Tensor, label: str) -> bool: + diff = (result - reference).abs() + rel = diff / reference.abs().clamp(min=1e-8) + n_total = diff.numel() + n_fail = (~torch.isclose(result, reference, rtol=RTOL, atol=ATOL)).sum().item() + print( + f" {label}: " + f"abs max={diff.max().item():.6f} mean={diff.mean().item():.6f} | " + f"rel max={rel.max().item():.6f} mean={rel.mean().item():.6f} | " + f"failing {n_fail}/{n_total} ({100 * n_fail / n_total:.2f}%)" + ) + return n_fail == 0 + + +def _compile(module: nn.Module) -> nn.Module: + return torch.compile( + module, + backend="docc", + options={"target": "sequential", "category": "server"}, + dynamic=False, # keep height/width as concrete ints, not SymInts + ) + + +# --------------------------------------------------------------------------- +# Shared fixture: load model + compute reference outputs for all stages once +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="module") +def segformer_refs(): + """Load the pretrained model and run the reference forward pass stage by stage.""" + model = SegformerForSemanticSegmentation.from_pretrained(MODEL_NAME).eval() + stages = model.segformer.stages + + example_input = torch.randn(*INPUT_SHAPE) + + stage_inputs = [] # input to each encoder stage + stage_outputs = [] # output of each encoder stage (2-D spatial feature map) + + x = example_input + with torch.no_grad(): + for stage in stages: + stage_inputs.append(x.clone()) + x = stage(x) + stage_outputs.append(x.clone()) + + # Reference logits from the full model (using reference stage outputs) + ref_logits = model.decode_head(tuple(stage_outputs)) + + return { + "model": model, + "example_input": example_input, + "stage_inputs": stage_inputs, + "stage_outputs": stage_outputs, + "ref_logits": ref_logits, + } + + +# --------------------------------------------------------------------------- +# Encoder stage tests +# --------------------------------------------------------------------------- + +def _test_encoder_stage(segformer_refs, stage_idx: int): + refs = segformer_refs + stage = refs["model"].segformer.stages[stage_idx] + + wrapper = EncoderStageWrapper(stage) + + compiled = _compile(wrapper) + stage_input = refs["stage_inputs"][stage_idx] + + t0 = time.perf_counter() + with torch.no_grad(): + result = compiled(stage_input) + t1 = time.perf_counter() + print(f"\nEncoderStage{stage_idx} inference: {(t1 - t0) * 1000:.2f} ms") + + reference = refs["stage_outputs"][stage_idx] + ok = _print_diff(result, reference, f"EncoderStage{stage_idx}") + assert ok, f"EncoderStage{stage_idx} output mismatch (see diff above)" + + +def test_encoder_stage_0(segformer_refs): + _test_encoder_stage(segformer_refs, 0) + + +def test_encoder_stage_1(segformer_refs): + _test_encoder_stage(segformer_refs, 1) + + +def test_encoder_stage_2(segformer_refs): + _test_encoder_stage(segformer_refs, 2) + + +def test_encoder_stage_3(segformer_refs): + _test_encoder_stage(segformer_refs, 3) + + +# --------------------------------------------------------------------------- +# Individual transformer block tests (finer granularity within a stage) +# --------------------------------------------------------------------------- + +class SingleBlockWrapper(nn.Module): + """A single SegformerLayer (attention + FFN) with fixed height/width.""" + + def __init__(self, block, height: int, width: int): + super().__init__() + self.block = block + self.height = height + self.width = width + + def forward(self, hidden_states): + return self.block(hidden_states, self.height, self.width)[0] + + +def _test_transformer_block(segformer_refs, stage_idx: int, block_idx: int): + """Test one transformer block inside an encoder stage. + + Uses the actual intermediate hidden states at that block's input by running + the patch embedding (and preceding blocks) in reference mode. + """ + refs = segformer_refs + stage = refs["model"].segformer.stages[stage_idx] + # SegformerStage stores its transformer blocks as 'layers' in newer HF versions + blocks = getattr(stage, "layers", None) or getattr(stage, "blocks", None) + if blocks is None: + pytest.skip(f"Cannot find transformer blocks in SegformerStage (stage {stage_idx})") + + stage_input = refs["stage_inputs"][stage_idx] + + with torch.no_grad(): + hidden_states, height, width = stage.patch_embeddings(stage_input) + for j in range(block_idx): + hidden_states = blocks[j](hidden_states, height, width)[0] + block_input = hidden_states.clone() + block_ref_output = blocks[block_idx](block_input, height, width)[0] + + wrapper = SingleBlockWrapper(blocks[block_idx], height, width) + compiled = _compile(wrapper) + + with torch.no_grad(): + result = compiled(block_input) + + label = f"Stage{stage_idx}/Block{block_idx}" + ok = _print_diff(result, block_ref_output, label) + assert ok, f"{label} output mismatch" + + +def test_stage0_block0(segformer_refs): + _test_transformer_block(segformer_refs, 0, 0) + + +def test_stage0_block1(segformer_refs): + _test_transformer_block(segformer_refs, 0, 1) + + +def test_stage1_block0(segformer_refs): + _test_transformer_block(segformer_refs, 1, 0) + + +def test_stage1_block1(segformer_refs): + _test_transformer_block(segformer_refs, 1, 1) + + +def test_stage2_block0(segformer_refs): + _test_transformer_block(segformer_refs, 2, 0) + + +def test_stage2_block1(segformer_refs): + _test_transformer_block(segformer_refs, 2, 1) + + +def test_stage3_block0(segformer_refs): + _test_transformer_block(segformer_refs, 3, 0) + + +def test_stage3_block1(segformer_refs): + _test_transformer_block(segformer_refs, 3, 1) + + +# --------------------------------------------------------------------------- +# Decode head test +# --------------------------------------------------------------------------- + +def test_decode_head(segformer_refs): + """Test the decode head in isolation using the reference stage outputs as input.""" + refs = segformer_refs + decode_head = refs["model"].decode_head + s0, s1, s2, s3 = refs["stage_outputs"] + + wrapper = DecodeHeadWrapper(decode_head) + compiled = _compile(wrapper) + + t0 = time.perf_counter() + with torch.no_grad(): + result = compiled(s0, s1, s2, s3) + t1 = time.perf_counter() + print(f"\nDecodeHead inference: {(t1 - t0) * 1000:.2f} ms") + + ok = _print_diff(result, refs["ref_logits"], "DecodeHead") + assert ok, "DecodeHead output mismatch" + + +# --------------------------------------------------------------------------- +# End-to-end composed test: use compiled stages in sequence +# --------------------------------------------------------------------------- + +def test_end_to_end_composed(segformer_refs): + """Run all 4 compiled encoder stages + compiled decode head in sequence. + + This is the same as test_backend in segformer_test.py but with the model + manually decomposed so that the first failing stage is immediately visible. + """ + refs = segformer_refs + stages = refs["model"].segformer.stages + + compiled_stages = [ + _compile(EncoderStageWrapper(stage)) + for stage in stages + ] + compiled_head = _compile(DecodeHeadWrapper(refs["model"].decode_head)) + + x = refs["example_input"] + stage_outputs = [] + with torch.no_grad(): + for i, stage in enumerate(compiled_stages): + t0 = time.perf_counter() + x = stage(x) + t1 = time.perf_counter() + print(f"\nComposed Stage{i}: {(t1 - t0) * 1000:.2f} ms, shape={tuple(x.shape)}") + + ok = _print_diff(x, refs["stage_outputs"][i], f"ComposedStage{i}") + assert ok, f"Composed encoder stage {i} output mismatch" + stage_outputs.append(x) + + t0 = time.perf_counter() + logits = compiled_head(*stage_outputs) + t1 = time.perf_counter() + print(f"Composed DecodeHead: {(t1 - t0) * 1000:.2f} ms") + + ok = _print_diff(logits, refs["ref_logits"], "ComposedLogits") + assert ok, "End-to-end composed output mismatch" From f2e061cf60ad3657fbd7a0c1eeba61d94ea766ee Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Sat, 30 May 2026 18:01:49 +0200 Subject: [PATCH 02/20] Update segformer test --- .../torch/model_zoo/segformer_test.py | 8 ++++++- mlir/docc/torch/torch_program.py | 21 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index a70c186d9..e900fce62 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -30,13 +30,19 @@ def test_backend(): program = torch.compile(model, backend="docc", options={"target": "none", "category": "server"}) end = time.perf_counter() print(f"compilation time: {(end - start) * 1000:.2f} ms") + + start = time.perf_counter() + ref_program = torch.compile(model, backend="docc", options={"target": "none", "category": "server"}) + end = time.perf_counter() + print(f"ref compilation time: {(end - start) * 1000:.2f} ms") + with torch.no_grad(): start = time.perf_counter() res = program(pixel_values=example_input) end = time.perf_counter() print(f"inference time: {(end - start) * 1000:.2f} ms") start = time.perf_counter() - res_ref = model_ref(pixel_values=example_input) + res_ref = ref_program(pixel_values=example_input) end = time.perf_counter() print(f"reference inference time: {(end - start) * 1000:.2f} ms") for k in range(res.logits.shape[0]): diff --git a/mlir/docc/torch/torch_program.py b/mlir/docc/torch/torch_program.py index 76f63694c..929818964 100644 --- a/mlir/docc/torch/torch_program.py +++ b/mlir/docc/torch/torch_program.py @@ -519,6 +519,19 @@ def _docc_dynamo_compiler(gm, example_inputs, backend_options): """Dynamic Compiler based on TorchProgram (inference only).""" import torch + # Resolve SymInt/SymFloat values that dynamo passes as graph inputs when a + # model (e.g. SegFormer) unpacks tensor shapes and forwards them as explicit + # integer arguments to submodules. torch.export.export cannot handle + # torch.SymInt; converting to concrete Python ints/floats is safe here + # because these values are always backed by a concrete shape at this point. + def _resolve(x): + if isinstance(x, torch.SymInt): + return int(x) + if isinstance(x, torch.SymFloat): + return float(x) + return x + example_inputs = [_resolve(inp) for inp in example_inputs] + if len(example_inputs) == 1: example_input = example_inputs[0] else: @@ -548,6 +561,14 @@ def _docc_aot_compiler(gm, example_inputs): import torch + def _resolve(x): + if isinstance(x, torch.SymInt): + return int(x) + if isinstance(x, torch.SymFloat): + return float(x) + return x + example_inputs = [_resolve(inp) for inp in example_inputs] + if len(example_inputs) == 1: example_input = example_inputs[0] else: From a47dd7ec44d389d654a8cba5b0dc58648a572189 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Sat, 30 May 2026 18:03:21 +0200 Subject: [PATCH 03/20] Partially invalidate analysis manager --- opt/src/transformations/map_fusion.cpp | 22 +++++++++++++++++- sdfg/include/sdfg/analysis/analysis.h | 23 +++++++++++++++++++ .../sdfg/analysis/assumptions_analysis.h | 8 +++++++ sdfg/src/analysis/assumptions_analysis.cpp | 13 +++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/opt/src/transformations/map_fusion.cpp b/opt/src/transformations/map_fusion.cpp index 3ce4fe605..60a928d01 100644 --- a/opt/src/transformations/map_fusion.cpp +++ b/opt/src/transformations/map_fusion.cpp @@ -1027,7 +1027,27 @@ void MapFusion::apply(builder::StructuredSDFGBuilder& builder, analysis::Analysi } } - analysis_manager.invalidate_all(); + if (direction_ == FusionDirection::ProducerIntoConsumer) { + // The loop structure is unchanged after ProducerIntoConsumer: only new Block + // nodes are inserted into consumer_body_. Patch them into AssumptionsAnalysis + // so it stays valid, then preserve it (and LoopAnalysis) across the invalidation. + if (analysis_manager.has()) { + size_t n = fusion_candidates_.size(); + if (n < consumer_body_->size()) { + auto& aa = analysis_manager.get(); + // Original consumer blocks were shifted to index n..size-1; use + // the first of them as the scope reference for the new blocks. + auto& sibling = consumer_body_->at(n).first; + for (size_t i = 0; i < n; ++i) { + aa.register_node(consumer_body_->at(i).first, sibling); + } + } + } + analysis_manager.invalidate_preserving(); + } else { + // ConsumerIntoProducer removes the consumer loop node entirely — full invalidation. + analysis_manager.invalidate_all(); + } applied_ = true; } diff --git a/sdfg/include/sdfg/analysis/analysis.h b/sdfg/include/sdfg/analysis/analysis.h index c1143317d..923c78a34 100644 --- a/sdfg/include/sdfg/analysis/analysis.h +++ b/sdfg/include/sdfg/analysis/analysis.h @@ -73,6 +73,11 @@ class AnalysisManager { return *static_cast(cache_[type].get()); } + template + bool has() const { + return cache_.find(std::type_index(typeid(T))) != cache_.end(); + } + template void invalidate() { std::type_index type = std::type_index(typeid(T)); @@ -81,6 +86,24 @@ class AnalysisManager { } } + // Invalidate all cached analyses except the listed types. + // Analyses not present in the cache are unaffected. + template + void invalidate_preserving() { + std::unordered_map> kept; + auto try_keep = [&](std::type_index type) { + auto it = cache_.find(type); + if (it != cache_.end()) { + kept.emplace(type, std::move(it->second)); + } + }; + (try_keep(std::type_index(typeid(Ts))), ...); + cache_.clear(); + for (auto& [type, analysis] : kept) { + cache_.emplace(type, std::move(analysis)); + } + } + void invalidate_all(); }; diff --git a/sdfg/include/sdfg/analysis/assumptions_analysis.h b/sdfg/include/sdfg/analysis/assumptions_analysis.h index b2ee52159..7c15600ae 100644 --- a/sdfg/include/sdfg/analysis/assumptions_analysis.h +++ b/sdfg/include/sdfg/analysis/assumptions_analysis.h @@ -66,6 +66,14 @@ class AssumptionsAnalysis : public Analysis { const symbolic::Assumptions& get(structured_control_flow::ControlFlowNode& node, bool include_trivial_bounds = false); + // Register a newly created node so it inherits the same scope assumptions as + // sibling_node. Call this after inserting nodes into a sequence to keep the + // cached analysis valid without a full re-run. + void register_node( + structured_control_flow::ControlFlowNode& new_node, + structured_control_flow::ControlFlowNode& sibling_node + ); + const symbolic::SymbolSet& parameters(); bool is_parameter(const symbolic::Symbol& container); diff --git a/sdfg/src/analysis/assumptions_analysis.cpp b/sdfg/src/analysis/assumptions_analysis.cpp index 8ac97f256..4968f7283 100644 --- a/sdfg/src/analysis/assumptions_analysis.cpp +++ b/sdfg/src/analysis/assumptions_analysis.cpp @@ -337,6 +337,19 @@ const symbolic::Assumptions& AssumptionsAnalysis:: } } +void AssumptionsAnalysis::register_node( + structured_control_flow::ControlFlowNode& new_node, structured_control_flow::ControlFlowNode& sibling_node +) { + auto it = ref_assumptions_.find(&sibling_node); + if (it != ref_assumptions_.end()) { + ref_assumptions_[&new_node] = it->second; + } + auto it2 = ref_assumptions_with_trivial_.find(&sibling_node); + if (it2 != ref_assumptions_with_trivial_.end()) { + ref_assumptions_with_trivial_[&new_node] = it2->second; + } +} + const symbolic::SymbolSet& AssumptionsAnalysis::parameters() { return this->parameters_; } bool AssumptionsAnalysis::is_parameter(const symbolic::Symbol& container) { From 1f45ba73749764c735c17a4d805e3a81be1f5df9 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Sun, 7 Jun 2026 22:58:19 +0200 Subject: [PATCH 04/20] Add benchmarks --- .daisy/mlir_torch_models.yml | 66 ----- .daisy/mlir_torch_segformer.yml | 40 +++ .daisy/mlir_torch_segformer_b2.yml | 40 +++ .daisy/mlir_torch_segformer_b2_torch.yml | 40 +++ .daisy/mlir_torch_segformer_torch.yml | 40 +++ .daisy/python_npbench.yml | 267 ------------------ .github/workflows/llvm_tests_san.yml | 82 ------ .github/workflows/release.yml | 200 ------------- .github/workflows/sanitizer_tests_asan.yml | 86 ------ .github/workflows/sanitizer_tests_lsan.yml | 48 ---- .github/workflows/sanitizer_tests_ubsan.yml | 48 ---- .github/workflows/unit_tests_macos.yml | 95 ------- .github/workflows/unit_tests_release.yml | 113 -------- .../model_zoo/segformer_layerwise_test.py | 141 +++++++++ .../torch/model_zoo/segformer_test.py | 146 ++++++++-- 15 files changed, 421 insertions(+), 1031 deletions(-) delete mode 100644 .daisy/mlir_torch_models.yml create mode 100644 .daisy/mlir_torch_segformer.yml create mode 100644 .daisy/mlir_torch_segformer_b2.yml create mode 100644 .daisy/mlir_torch_segformer_b2_torch.yml create mode 100644 .daisy/mlir_torch_segformer_torch.yml delete mode 100644 .daisy/python_npbench.yml delete mode 100644 .github/workflows/llvm_tests_san.yml delete mode 100644 .github/workflows/release.yml delete mode 100644 .github/workflows/sanitizer_tests_asan.yml delete mode 100644 .github/workflows/sanitizer_tests_lsan.yml delete mode 100644 .github/workflows/sanitizer_tests_ubsan.yml delete mode 100644 .github/workflows/unit_tests_macos.yml delete mode 100644 .github/workflows/unit_tests_release.yml diff --git a/.daisy/mlir_torch_models.yml b/.daisy/mlir_torch_models.yml deleted file mode 100644 index e1ced8018..000000000 --- a/.daisy/mlir_torch_models.yml +++ /dev/null @@ -1,66 +0,0 @@ -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -parameters: - container: daisytuner/docc-build-env-llvm19-base:latest-amd64 - timeout: 120 - partitions: - - chamomile - -steps: - build: | - python3.11 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy - - pip install --no-build-isolation -e python/ - pip install --no-build-isolation -e mlir/ - - pip install -r mlir/requirements.txt - - # Warm start - - venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --torch - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=cuda - - run: - - # model resnet18 - - resnet18_torch: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --torch - energy: true - resnet18_docc_none: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - resnet18_docc_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - resnet18_docc_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - resnet18_docc_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_segformer.yml b/.daisy/mlir_torch_segformer.yml new file mode 100644 index 000000000..dcdcf0757 --- /dev/null +++ b/.daisy/mlir_torch_segformer.yml @@ -0,0 +1,40 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 240 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Warm start (DOCC benchmark, CUDA target) + DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu + + run: + + # model segformer b0 (DOCC CUDA target) + + segformer_b0_docc_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu + energy: true + env: + DOCC_CI: "" + DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_segformer_b2.yml b/.daisy/mlir_torch_segformer_b2.yml new file mode 100644 index 000000000..afdb15fac --- /dev/null +++ b/.daisy/mlir_torch_segformer_b2.yml @@ -0,0 +1,40 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 480 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Warm start (DOCC benchmark, CUDA target) + DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu + + run: + + # model segformer b2 (DOCC CUDA target) + + segformer_b2_docc_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu + energy: true + env: + DOCC_CI: "" + DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_segformer_b2_torch.yml b/.daisy/mlir_torch_segformer_b2_torch.yml new file mode 100644 index 000000000..e63215168 --- /dev/null +++ b/.daisy/mlir_torch_segformer_b2_torch.yml @@ -0,0 +1,40 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 480 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Override CPU torch with CUDA wheels for torch GPU benchmarks + pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 + + # Warm start (Torch benchmark on CUDA) + venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda + + run: + + # model segformer b2 (Torch CUDA) + + segformer_b2_torch_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda + energy: true diff --git a/.daisy/mlir_torch_segformer_torch.yml b/.daisy/mlir_torch_segformer_torch.yml new file mode 100644 index 000000000..5e14f0c53 --- /dev/null +++ b/.daisy/mlir_torch_segformer_torch.yml @@ -0,0 +1,40 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 240 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Override CPU torch with CUDA wheels for torch GPU benchmarks + pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 + + # Warm start (Torch benchmark on CUDA) + venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda + + run: + + # model segformer b0 (Torch CUDA) + + segformer_b0_torch_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda + energy: true diff --git a/.daisy/python_npbench.yml b/.daisy/python_npbench.yml deleted file mode 100644 index fbcc56dfb..000000000 --- a/.daisy/python_npbench.yml +++ /dev/null @@ -1,267 +0,0 @@ -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -parameters: - container: daisytuner/docc-build-env-llvm19-base:latest-amd64 - timeout: 120 - partitions: - - zinnia - -steps: - build: | - apt-get install -y python3-venv python3-pip - - python3 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy - - pip install --no-build-isolation -v -e python/ - - run: - adi_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - adi_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - adi_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - adi_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - atax_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - atax_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - atax_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - atax_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - gemm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - gemm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - gemm_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - gemm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - gesummv_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - gesummv_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - gesummv_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - gesummv_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - gemver_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - gemver_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - gemver_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - gemver_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - k2mm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - k2mm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - k2mm_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - k2mm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - k3mm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - k3mm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - k3mm_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - k3mm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - mvt_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - mvt_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - mvt_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - mvt_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - symm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - symm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - symm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - # symm_cuda: - # command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=cuda - # energy: true - # env: - # DOCC_CI: regions - syr2k_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - syr2k_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - syr2k_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - syr2k_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - syrk_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - syrk_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - syrk_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - syrk_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - trmm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - trmm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - trmm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - # trmm_cuda: - # command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=cuda - # energy: true - # env: - # DOCC_CI: regions diff --git a/.github/workflows/llvm_tests_san.yml b/.github/workflows/llvm_tests_san.yml deleted file mode 100644 index f260f9903..000000000 --- a/.github/workflows/llvm_tests_san.yml +++ /dev/null @@ -1,82 +0,0 @@ -name: LLVM - Unit and Integration Sanitized Tests - -on: - push: - branches: - - main - schedule: - - cron: "0 4 * * *" - -jobs: - llvm-tests-linux-san: - runs-on: - group: dahlia - labels: Linux - container: - image: daisytuner/docc-build-env-llvm19-ubuntu-24.04:latest-amd64 - strategy: - fail-fast: false - matrix: - san: ["address", "leak", "undefined"] - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Mark GitHub Actions workdir as safe - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - - name: Build - run: | - mkdir build - cd build - cmake -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=${{ matrix.san }} \ - -DLLVM_BUILD_FRONTEND=ON \ - -DLLVM_BUILD_TESTS=ON \ - -DSDFG_BUILD_TESTS=OFF \ - -DINSTALL_GTEST=OFF \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - cpack -G DEB - apt-get install -y ./docc-llvm*.deb - - - name: Unit Tests - run: | - cd build - ./llvm/tests/docc_llvm_pass_test - - - name: Set up Python - if: matrix.san == 'leak' - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Setup virtual environment - if: matrix.san == 'leak' - run: | - python -m venv .venv - echo "$PWD/.venv/bin" >> $GITHUB_PATH - - - name: Install dependencies - if: matrix.san == 'leak' - run: | - python -m pip install --upgrade pip - pip install pytest==7.1.3 pytest-parallel lit - - - name: Integration Tests - # The docc C/C++ compiler currently only works with leak sanitizer - if: matrix.san == 'leak' - run: | - export LLVM_SYMBOLIZER_PATH=$(which llvm-symbolizer-19) - - cd llvm/integration - pytest -v llvm_test_suite.py \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 9228b9a89..000000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,200 +0,0 @@ -name: Release - -on: - push: - tags: - - "v*.*.*" - -jobs: - # Stage 1: Build docc-compiler (no dependencies) - wheels-compiler: - name: Compiler (${{ matrix.os }}, ${{ matrix.python }}) - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: [build-amd64-big, build-arm64-big, macos-14] - python: ["cp311", "cp312", "cp313", "cp314"] - include: - - os: build-amd64-big - cibw_archs: x86_64 - - os: build-arm64-big - cibw_archs: aarch64 - - os: macos-14 - cibw_archs: arm64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - uses: pypa/cibuildwheel@v3.3.1 - with: - package-dir: python/ - output-dir: wheelhouse - env: - CIBW_ARCHS: ${{ matrix.cibw_archs }} - CIBW_BUILD: "${{ matrix.python }}-*" - - - uses: actions/upload-artifact@v4 - with: - name: wheels-docc-compiler-${{ matrix.os }}-${{ matrix.python }} - path: wheelhouse/*.whl - - # Stage 2: Build docc-ai (depends on docc-compiler) - wheels-ai: - name: AI (${{ matrix.os }}, ${{ matrix.python }}) - needs: [wheels-compiler] - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: [build-amd64-big, build-arm64-big, macos-14] - python: ["cp311", "cp312"] - include: - - os: build-amd64-big - cibw_archs: x86_64 - - os: build-arm64-big - cibw_archs: aarch64 - - os: macos-14 - cibw_archs: arm64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - # Pin docc-compiler version to match release - # - name: Pin docc-compiler version - # run: | - # VERSION=$(cat VERSION) - # sed -i.bak "s/\"docc-compiler\"/\"docc-compiler==$VERSION\"/" mlir/pyproject.toml && rm mlir/pyproject.toml.bak - - # Download compiler wheels into the package directory so they're available in container - # - uses: actions/download-artifact@v4 - # with: - # pattern: wheels-docc-compiler-${{ matrix.os }}-* - # path: mlir/compiler-wheels - # merge-multiple: true - - - uses: pypa/cibuildwheel@v3.3.1 - with: - package-dir: mlir/ - output-dir: wheelhouse - env: - CIBW_ARCHS: ${{ matrix.cibw_archs }} - CIBW_BUILD: "${{ matrix.python }}-*" - # Install docc-compiler before building docc-ai - # CIBW_BEFORE_BUILD: "pip install --no-index --find-links {project}/compiler-wheels docc-compiler" - # Make compiler wheels available for dependency resolution during test - # CIBW_ENVIRONMENT: "PIP_FIND_LINKS={project}/compiler-wheels" - - - uses: actions/upload-artifact@v4 - with: - name: wheels-docc-ai-${{ matrix.os }}-${{ matrix.python }} - path: wheelhouse/*.whl - - wheels-publish: - needs: [wheels-compiler, wheels-ai] - runs-on: build-amd64-big - permissions: - id-token: write - - steps: - - uses: actions/download-artifact@v4 - with: - pattern: wheels-* - path: dist - merge-multiple: true - - - uses: pypa/gh-action-pypi-publish@v1.10.0 - - packages-llvm: - strategy: - matrix: - include: - - platform: ubuntu-24.04 - package-format: deb - cpack-generator: DEB - upload-dist-id: ubuntu - upload-dist-version: 24.04 - runner: build-amd64-big - architecture: x64 - image: daisytuner/docc-build-env-llvm19-ubuntu-24.04:latest - - platform: ubuntu-24.04 - package-format: deb - cpack-generator: DEB - upload-dist-id: ubuntu - upload-dist-version: 24.04 - runner: build-arm64-big - architecture: arm64 - image: daisytuner/docc-build-env-llvm19:latest-arm64 - - platform: rhel-10 - package-format: rpm - cpack-generator: RPM - upload-dist-id: rhel - upload-dist-version: 10 - upload-dist-platform-id: platform:el10 - runner: build-amd64-big - architecture: x64 - image: daisytuner/docc-build-env-llvm19-rhel-10:latest - - platform: debian-13 - package-format: deb - cpack-generator: DEB - upload-dist-id: debian - upload-dist-version: 13 - runner: build-amd64-big - architecture: x64 - image: daisytuner/docc-build-env-llvm19-debian-13:latest - - runs-on: ${{ matrix.runner }} - container: - image: ${{ matrix.image }} - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Define Version - id: define_version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT - - - name: Build package - run: | - mkdir -p build - cd build - cmake -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Release \ - -DINSTALL_GTEST=OFF \ - -DBUILD_TESTS:BOOL=OFF \ - -DSDFGLIB_AUTO_INSTALL_MODE=ON \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - -DRELEASE_PACKAGE=ON \ - -DPACKAGE_WITH_TOOL_DEPS=ON \ - .. - ninja -j$(nproc) - cpack -G ${{ matrix.cpack-generator }} - - - name: Upload docc package as Artifact - uses: actions/upload-artifact@v4 - with: - name: docc-${{ matrix.platform }}-${{ matrix.architecture }} - path: "build/*.${{ matrix.package-format }}" - - - name: Upload docc package to Firebase - uses: daisytuner/upload-distribution-action@main - with: - file: "build/*.${{ matrix.package-format }}" - version: ${{ steps.define_version.outputs.VERSION }} - architecture: ${{ matrix.architecture }} - dist-id: ${{ matrix.upload-dist-id }} - dist-version: ${{ matrix.upload-dist-version }} - dist-platform-id: ${{ matrix.upload-dist-platform-id }} - token: ${{ secrets.DOCC_RELEASE_TOKEN }} - url: /v1/system/docc-distributions/upload diff --git a/.github/workflows/sanitizer_tests_asan.yml b/.github/workflows/sanitizer_tests_asan.yml deleted file mode 100644 index b8b6afc26..000000000 --- a/.github/workflows/sanitizer_tests_asan.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: Sanitizer Tests (Address) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -jobs: - sanitizer-linux-asan: - runs-on: - group: dahlia - labels: openmp - container: - image: daisytuner/docc-build-env-llvm19-base:latest-amd64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=address \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - - sanitizer-macos-asan: - runs-on: - group: dahlia - labels: macOS - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install dependencies - run: | - brew install ninja cmake - brew install gmp isl nlohmann-json boost - brew install libomp - - - name: Build - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=address \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja - - - name: Unit Tests - run: | - cd build/ - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./tutorial/printf_target/tests/printf_target_test diff --git a/.github/workflows/sanitizer_tests_lsan.yml b/.github/workflows/sanitizer_tests_lsan.yml deleted file mode 100644 index dae1b8819..000000000 --- a/.github/workflows/sanitizer_tests_lsan.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Sanitizer Tests (Leak) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -jobs: - sanitizer-linux-lsan: - runs-on: - group: dahlia - labels: openmp - container: - image: daisytuner/docc-build-env-llvm19-base:latest-amd64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=leak \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test diff --git a/.github/workflows/sanitizer_tests_ubsan.yml b/.github/workflows/sanitizer_tests_ubsan.yml deleted file mode 100644 index d88d1ed54..000000000 --- a/.github/workflows/sanitizer_tests_ubsan.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Sanitizer Tests (Undefined Behavior) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -jobs: - sanitizer-linux-ubsan: - runs-on: - group: dahlia - labels: openmp - container: - image: daisytuner/docc-build-env-llvm19-base:latest-amd64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=undefined \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test diff --git a/.github/workflows/unit_tests_macos.yml b/.github/workflows/unit_tests_macos.yml deleted file mode 100644 index 08fdb6aa1..000000000 --- a/.github/workflows/unit_tests_macos.yml +++ /dev/null @@ -1,95 +0,0 @@ -name: Unit Tests (macOS) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - schedule: - - cron: "0 4 * * *" - -jobs: - primary-tests-macos: - runs-on: - group: dahlia - labels: macOS - - env: - python_version: "3.14" - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install dependencies - run: | - brew install ninja cmake - brew install gmp isl nlohmann-json boost - brew install libomp - brew install uv - - - name: Set up Python ${{ env.python_version }} - run: | - uv python install ${{ env.python_version }} - uv venv --python ${{ env.python_version }} .venv - echo "$PWD/.venv/bin" >> $GITHUB_PATH - echo "PYTHONPATH=$PWD/python" >> $GITHUB_ENV - - - name: Install Python dependencies - run: | - uv pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - uv pip install numpy scipy ml_dtypes - - - name: Build - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Debug \ - -DPYTHON_BUILD_FRONTEND=ON \ - -Dpybind11_DIR=$GITHUB_WORKSPACE/.venv/lib/python${{ env.python_version }}/site-packages/pybind11/share/cmake/pybind11 \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja - - - name: Unit Tests - run: | - cd build/ - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./tutorial/printf_target/tests/printf_target_test - - - name: Test Arg-Capture-IO - run: | - cd build - ./arg-capture-io/tests/capture_io_test - - - name: Python Unit Tests - env: - DOCC_ACCESS_TOKEN: ${{ secrets.DOCC_CI_TOKEN }} - run: | - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - pytest -v python/tests - - - name: Python Integration Tests - run: | - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - pytest -v python/benchmarks/ - - # - name: Test RTL - # run: | - # export CPATH=/usr/local/include:$CPATH - # export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH - # export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - # export PATH=/usr/local/bin:$PATH - - # pip install pytest==7.1.3 --break-system-packages - # pip install pytest-parallel --break-system-packages - - # cd rtl/tests - # pytest -v -s rtl_tests.py diff --git a/.github/workflows/unit_tests_release.yml b/.github/workflows/unit_tests_release.yml deleted file mode 100644 index bafd81499..000000000 --- a/.github/workflows/unit_tests_release.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: Unit Tests - Release - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - schedule: - - cron: "0 4 * * *" - -jobs: - release-linux: - runs-on: - group: dahlia - labels: RTX5060 - container: - image: daisytuner/docc-build-env-llvm19-base:latest-amd64 - options: >- - --cap-add=PERFMON - --gpus=all - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Mark GitHub Actions workdir as safe - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_INSTALL_PREFIX=/usr/local \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_BUILD_FRONTEND=ON \ - -DLLVM_BUILD_TESTS=ON \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - ninja install - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./llvm/tests/docc_llvm_pass_test - - - name: Test RTL - run: | - export CPATH=/usr/local/include:$CPATH - export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - export PATH=/usr/local/bin:$PATH - - pip install pytest==7.1.3 --break-system-packages - pip install pytest-parallel --break-system-packages - - cd rtl/tests - pytest -v -s rtl_tests.py - - release-macos: - runs-on: - group: dahlia - labels: macOS - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install dependencies - run: | - brew install ninja cmake - brew install gmp isl nlohmann-json boost - brew install libomp - - - name: Build - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja - - - name: Unit Tests - run: | - cd build/ - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./tutorial/printf_target/tests/printf_target_test - - - name: Test Arg-Capture-IO - run: | - cd build - ./arg-capture-io/tests/capture_io_test diff --git a/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py b/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py index 4b37332be..50e1d6491 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py @@ -13,6 +13,7 @@ 4x Linear projection + upsample to stage-0 resolution + concat + fuse Conv+BN + classifier Conv """ +import argparse import time import pytest @@ -23,6 +24,14 @@ import docc.torch MODEL_NAME = "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" +SEGFORMER_MODELS = { + "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", + "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", + "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", + "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", + "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", + "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", +} INPUT_SHAPE = (1, 3, 512, 512) RTOL = 1e-2 ATOL = 1e-4 @@ -89,6 +98,93 @@ def _compile(module: nn.Module) -> nn.Module: ) +def _compile_for_backend(module: nn.Module, backend: str, target: str) -> nn.Module: + if backend == "docc": + return torch.compile( + module, + backend="docc", + options={"target": target, "category": "server"}, + dynamic=False, + ) + return torch.compile(module, dynamic=False) + + +def _benchmark_module(label: str, module: nn.Module, inputs, backend: str, target: str, n_runs: int) -> None: + compiled = _compile_for_backend(module, backend, target) + with torch.no_grad(): + compiled(*inputs) + + times_ms = [] + for _ in range(n_runs): + start = time.perf_counter() + compiled(*inputs) + end = time.perf_counter() + times_ms.append((end - start) * 1000.0) + + mean_ms = sum(times_ms) / len(times_ms) + print(f"{label}: mean={mean_ms:.2f} ms over {n_runs} runs") + + +def benchmark_layerwise(model_name: str, backend: str = "torch", target: str = "cuda", device: str = "cuda", n_runs: int = 10) -> None: + if device == "cuda" and not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available") + + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval().to(device) + stage_modules = model.segformer.stages + decode_head = model.decode_head + + x = torch.randn(*INPUT_SHAPE, device=device) + + print(f"Layerwise benchmark model={model_name} backend={backend} target={target} device={device}") + with torch.no_grad(): + stage_inputs = [] + stage_outputs = [] + + for stage in stage_modules: + stage_inputs.append(x) + x = stage(x) + stage_outputs.append(x) + + for i, stage in enumerate(stage_modules): + wrapper = EncoderStageWrapper(stage) + _benchmark_module( + f"EncoderStage{i}", + wrapper, + (stage_inputs[i],), + backend, + target, + n_runs, + ) + + blocks = getattr(stage, "layers", None) or getattr(stage, "blocks", None) + if blocks is None: + continue + + hidden_states, height, width = stage.patch_embeddings(stage_inputs[i]) + for j, block in enumerate(blocks): + block_input = hidden_states + block_wrapper = SingleBlockWrapper(block, height, width) + _benchmark_module( + f"Stage{i}/Block{j}", + block_wrapper, + (block_input,), + backend, + target, + n_runs, + ) + hidden_states = block(hidden_states, height, width)[0] + + decode_wrapper = DecodeHeadWrapper(decode_head) + _benchmark_module( + "DecodeHead", + decode_wrapper, + tuple(stage_outputs), + backend, + target, + n_runs, + ) + + # --------------------------------------------------------------------------- # Shared fixture: load model + compute reference outputs for all stages once # --------------------------------------------------------------------------- @@ -307,3 +403,48 @@ def test_end_to_end_composed(segformer_refs): ok = _print_diff(logits, refs["ref_logits"], "ComposedLogits") assert ok, "End-to-end composed output mismatch" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="SegFormer layerwise benchmarks/tests helper") + parser.add_argument( + "--action", + type=str, + choices=["benchmark_layerwise"], + default="benchmark_layerwise", + ) + parser.add_argument( + "--version", + type=str, + choices=list(SEGFORMER_MODELS.keys()), + default="b0", + ) + parser.add_argument( + "--backend", + type=str, + choices=["torch", "docc"], + default="torch", + ) + parser.add_argument( + "--target", + type=str, + default="cuda", + help="DOCC target when backend=docc", + ) + parser.add_argument( + "--device", + type=str, + choices=["cpu", "cuda"], + default="cuda", + ) + parser.add_argument("--n_runs", type=int, default=10) + args = parser.parse_args() + + model_name = SEGFORMER_MODELS[args.version] + benchmark_layerwise( + model_name=model_name, + backend=args.backend, + target=args.target, + device=args.device, + n_runs=args.n_runs, + ) diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index e900fce62..712074d4a 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -15,13 +16,36 @@ os.environ["DOCC_DEBUG"] = "dump" +SEGFORMER_MODELS = { + "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", + "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", + "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", + "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", + "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", + "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", +} + + +def resolve_model_name(version, model): + if model: + return model + return SEGFORMER_MODELS[version] + + +def get_test_model_name(): + version = os.getenv("SEGFORMER_VERSION", "b0") + if version not in SEGFORMER_MODELS: + raise ValueError( + f"Unsupported SEGFORMER_VERSION '{version}'. " + f"Expected one of: {', '.join(SEGFORMER_MODELS.keys())}" + ) + return resolve_model_name(version, None) + + def test_backend(): - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() - model_ref = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() + model_name = get_test_model_name() + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + model_ref = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() model_ref.load_state_dict(model.state_dict()) example_input = torch.randn(1, 3, 512, 512) @@ -60,12 +84,9 @@ def test_backend(): @pytest.mark.skip("Skip") def test_compile(): - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() - model_ref = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() + model_name = get_test_model_name() + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + model_ref = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() model_ref.load_state_dict(model.state_dict()) example_input = torch.randn(1, 3, 512, 512) @@ -83,9 +104,7 @@ def test_compile(): assert torch.allclose(res, res_ref.logits, rtol=1e-4) def find_used_dialects(): - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() + model = SegformerForSemanticSegmentation.from_pretrained(get_test_model_name()).eval() example_input = torch.randn(1, 3, 512, 512) @@ -107,20 +126,32 @@ def find_used_dialects(): # print(mlir_str) -def benchmark_segformer(model_name): +def benchmark_segformer(model_name, backend="torch", target="none", device="cpu"): model = SegformerForSemanticSegmentation.from_pretrained( model_name ).eval() - example_input = torch.randn(1, 3, 1024, 1024) + if device == "cuda" and not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available") + + if device == "cuda": + model = model.to("cuda") - program = torch.compile(model) + example_input = torch.randn(1, 3, 1024, 1024, device=device) + + compile_kwargs = {} + if backend == "docc": + compile_kwargs = { + "backend": "docc", + "options": {"target": target, "category": "server"}, + } + + program = torch.compile(model, **compile_kwargs) with torch.no_grad(): # Warmup res = program(pixel_values=example_input) import time - import math from scipy import stats as scipy_stats times = [] @@ -153,11 +184,74 @@ def benchmark_segformer(model_name): print(f"Average inference time: {mean:.2f} ms (n={n})") print(f"95% CI: [{mean - half_width:.2f}, {mean + half_width:.2f}] ms (±{half_width:.2f} ms)") + +def setup_segformer_benchmark(model_name): + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + example_input = torch.randn(1, 3, 512, 512) + return model, example_input + if __name__ == "__main__": - # find_used_dialects() - find_used_dialects() - #benchmark_segformer("nvidia/segformer-b1-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b2-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b3-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b4-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b5-finetuned-cityscapes-1024-1024") \ No newline at end of file + parser = argparse.ArgumentParser(description="segformer benchmark") + parser.add_argument( + "--model", + type=str, + default=None, + help="Optional Hugging Face model id to override --version", + ) + parser.add_argument( + "--version", + type=str, + choices=list(SEGFORMER_MODELS.keys()), + default="b0", + help="SegFormer variant used when --model is not provided", + ) + parser.add_argument( + "--action", + type=str, + choices=["dialects", "benchmark", "benchmark_segformer"], + default="benchmark", + help="Run dialect dump or harness benchmark", + ) + parser.add_argument( + "--backend", + type=str, + choices=["torch", "docc"], + default="torch", + help="Backend for --action benchmark_segformer", + ) + parser.add_argument( + "--target", + type=str, + default="none", + help="DOCC target for --action benchmark_segformer (e.g. none, openmp, cuda)", + ) + parser.add_argument( + "--device", + type=str, + choices=["cpu", "cuda"], + default="cpu", + help="Tensor/model device for --action benchmark_segformer", + ) + args, remaining = parser.parse_known_args() + model_name = resolve_model_name(args.version, args.model) + + import sys + + if args.action == "dialects": + find_used_dialects() + elif args.action == "benchmark_segformer": + benchmark_segformer( + model_name, + backend=args.backend, + target=args.target, + device=args.device, + ) + else: + sys.argv = [sys.argv[0]] + remaining + from functools import partial + from benchmarks.harness import run_benchmark + + run_benchmark( + partial(setup_segformer_benchmark, model_name), + f"segformer {model_name}", + ) \ No newline at end of file From 02d84ef3ddb6cddc5c9f9218c1cf809ace302aa9 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Mon, 8 Jun 2026 10:48:50 +0200 Subject: [PATCH 05/20] Remove unnecessary tests --- .daisy/python_npbench.yml | 0 .../model_zoo/segformer_layerwise_test.py | 450 ------------------ 2 files changed, 450 deletions(-) delete mode 100644 .daisy/python_npbench.yml delete mode 100644 mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py diff --git a/.daisy/python_npbench.yml b/.daisy/python_npbench.yml deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py b/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py deleted file mode 100644 index 50e1d6491..000000000 --- a/mlir/benchmarks/torch/model_zoo/segformer_layerwise_test.py +++ /dev/null @@ -1,450 +0,0 @@ -"""Layerwise test for SegFormer-b0. - -Tests each encoder stage and the decode head individually with the docc backend, -checking the output of each against a pure-PyTorch reference. - -Structure of SegFormer-b0: - Encoder: - Stage 0: OverlapPatchEmbedding (stride=4) + 2x TransformerBlock + LayerNorm -> (B, 32, H/4, W/4) - Stage 1: OverlapPatchEmbedding (stride=2) + 2x TransformerBlock + LayerNorm -> (B, 64, H/8, W/8) - Stage 2: OverlapPatchEmbedding (stride=2) + 2x TransformerBlock + LayerNorm -> (B,160, H/16, W/16) - Stage 3: OverlapPatchEmbedding (stride=2) + 2x TransformerBlock + LayerNorm -> (B,256, H/32, W/32) - Decode head: - 4x Linear projection + upsample to stage-0 resolution + concat + fuse Conv+BN + classifier Conv -""" - -import argparse -import time - -import pytest -import torch -import torch.nn as nn -from transformers import SegformerForSemanticSegmentation - -import docc.torch - -MODEL_NAME = "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" -SEGFORMER_MODELS = { - "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", - "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", - "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", - "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", - "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", - "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", -} -INPUT_SHAPE = (1, 3, 512, 512) -RTOL = 1e-2 -ATOL = 1e-4 - - -# --------------------------------------------------------------------------- -# Wrappers -# --------------------------------------------------------------------------- - -class EncoderStageWrapper(nn.Module): - """One encoder stage (SegformerStage): patch embedding + transformer blocks + layer norm. - - In newer HuggingFace versions the stage is a self-contained SegformerStage module - whose forward accepts and returns a spatial feature map (B, C, H, W). - """ - - def __init__(self, stage): - super().__init__() - self.stage = stage - - def forward(self, x): - return self.stage(x) - - -class DecodeHeadWrapper(nn.Module): - """Decode head: takes 4 stage feature maps, returns logits (B, num_classes, H/4, W/4). - - Accepts stage outputs as individual positional arguments (not a tuple) so that - torch.compile / docc can trace through without dynamic container unpacking. - """ - - def __init__(self, decode_head): - super().__init__() - self.decode_head = decode_head - - def forward(self, s0, s1, s2, s3): - return self.decode_head((s0, s1, s2, s3)) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _print_diff(result: torch.Tensor, reference: torch.Tensor, label: str) -> bool: - diff = (result - reference).abs() - rel = diff / reference.abs().clamp(min=1e-8) - n_total = diff.numel() - n_fail = (~torch.isclose(result, reference, rtol=RTOL, atol=ATOL)).sum().item() - print( - f" {label}: " - f"abs max={diff.max().item():.6f} mean={diff.mean().item():.6f} | " - f"rel max={rel.max().item():.6f} mean={rel.mean().item():.6f} | " - f"failing {n_fail}/{n_total} ({100 * n_fail / n_total:.2f}%)" - ) - return n_fail == 0 - - -def _compile(module: nn.Module) -> nn.Module: - return torch.compile( - module, - backend="docc", - options={"target": "sequential", "category": "server"}, - dynamic=False, # keep height/width as concrete ints, not SymInts - ) - - -def _compile_for_backend(module: nn.Module, backend: str, target: str) -> nn.Module: - if backend == "docc": - return torch.compile( - module, - backend="docc", - options={"target": target, "category": "server"}, - dynamic=False, - ) - return torch.compile(module, dynamic=False) - - -def _benchmark_module(label: str, module: nn.Module, inputs, backend: str, target: str, n_runs: int) -> None: - compiled = _compile_for_backend(module, backend, target) - with torch.no_grad(): - compiled(*inputs) - - times_ms = [] - for _ in range(n_runs): - start = time.perf_counter() - compiled(*inputs) - end = time.perf_counter() - times_ms.append((end - start) * 1000.0) - - mean_ms = sum(times_ms) / len(times_ms) - print(f"{label}: mean={mean_ms:.2f} ms over {n_runs} runs") - - -def benchmark_layerwise(model_name: str, backend: str = "torch", target: str = "cuda", device: str = "cuda", n_runs: int = 10) -> None: - if device == "cuda" and not torch.cuda.is_available(): - raise RuntimeError("CUDA requested but not available") - - model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval().to(device) - stage_modules = model.segformer.stages - decode_head = model.decode_head - - x = torch.randn(*INPUT_SHAPE, device=device) - - print(f"Layerwise benchmark model={model_name} backend={backend} target={target} device={device}") - with torch.no_grad(): - stage_inputs = [] - stage_outputs = [] - - for stage in stage_modules: - stage_inputs.append(x) - x = stage(x) - stage_outputs.append(x) - - for i, stage in enumerate(stage_modules): - wrapper = EncoderStageWrapper(stage) - _benchmark_module( - f"EncoderStage{i}", - wrapper, - (stage_inputs[i],), - backend, - target, - n_runs, - ) - - blocks = getattr(stage, "layers", None) or getattr(stage, "blocks", None) - if blocks is None: - continue - - hidden_states, height, width = stage.patch_embeddings(stage_inputs[i]) - for j, block in enumerate(blocks): - block_input = hidden_states - block_wrapper = SingleBlockWrapper(block, height, width) - _benchmark_module( - f"Stage{i}/Block{j}", - block_wrapper, - (block_input,), - backend, - target, - n_runs, - ) - hidden_states = block(hidden_states, height, width)[0] - - decode_wrapper = DecodeHeadWrapper(decode_head) - _benchmark_module( - "DecodeHead", - decode_wrapper, - tuple(stage_outputs), - backend, - target, - n_runs, - ) - - -# --------------------------------------------------------------------------- -# Shared fixture: load model + compute reference outputs for all stages once -# --------------------------------------------------------------------------- - -@pytest.fixture(scope="module") -def segformer_refs(): - """Load the pretrained model and run the reference forward pass stage by stage.""" - model = SegformerForSemanticSegmentation.from_pretrained(MODEL_NAME).eval() - stages = model.segformer.stages - - example_input = torch.randn(*INPUT_SHAPE) - - stage_inputs = [] # input to each encoder stage - stage_outputs = [] # output of each encoder stage (2-D spatial feature map) - - x = example_input - with torch.no_grad(): - for stage in stages: - stage_inputs.append(x.clone()) - x = stage(x) - stage_outputs.append(x.clone()) - - # Reference logits from the full model (using reference stage outputs) - ref_logits = model.decode_head(tuple(stage_outputs)) - - return { - "model": model, - "example_input": example_input, - "stage_inputs": stage_inputs, - "stage_outputs": stage_outputs, - "ref_logits": ref_logits, - } - - -# --------------------------------------------------------------------------- -# Encoder stage tests -# --------------------------------------------------------------------------- - -def _test_encoder_stage(segformer_refs, stage_idx: int): - refs = segformer_refs - stage = refs["model"].segformer.stages[stage_idx] - - wrapper = EncoderStageWrapper(stage) - - compiled = _compile(wrapper) - stage_input = refs["stage_inputs"][stage_idx] - - t0 = time.perf_counter() - with torch.no_grad(): - result = compiled(stage_input) - t1 = time.perf_counter() - print(f"\nEncoderStage{stage_idx} inference: {(t1 - t0) * 1000:.2f} ms") - - reference = refs["stage_outputs"][stage_idx] - ok = _print_diff(result, reference, f"EncoderStage{stage_idx}") - assert ok, f"EncoderStage{stage_idx} output mismatch (see diff above)" - - -def test_encoder_stage_0(segformer_refs): - _test_encoder_stage(segformer_refs, 0) - - -def test_encoder_stage_1(segformer_refs): - _test_encoder_stage(segformer_refs, 1) - - -def test_encoder_stage_2(segformer_refs): - _test_encoder_stage(segformer_refs, 2) - - -def test_encoder_stage_3(segformer_refs): - _test_encoder_stage(segformer_refs, 3) - - -# --------------------------------------------------------------------------- -# Individual transformer block tests (finer granularity within a stage) -# --------------------------------------------------------------------------- - -class SingleBlockWrapper(nn.Module): - """A single SegformerLayer (attention + FFN) with fixed height/width.""" - - def __init__(self, block, height: int, width: int): - super().__init__() - self.block = block - self.height = height - self.width = width - - def forward(self, hidden_states): - return self.block(hidden_states, self.height, self.width)[0] - - -def _test_transformer_block(segformer_refs, stage_idx: int, block_idx: int): - """Test one transformer block inside an encoder stage. - - Uses the actual intermediate hidden states at that block's input by running - the patch embedding (and preceding blocks) in reference mode. - """ - refs = segformer_refs - stage = refs["model"].segformer.stages[stage_idx] - # SegformerStage stores its transformer blocks as 'layers' in newer HF versions - blocks = getattr(stage, "layers", None) or getattr(stage, "blocks", None) - if blocks is None: - pytest.skip(f"Cannot find transformer blocks in SegformerStage (stage {stage_idx})") - - stage_input = refs["stage_inputs"][stage_idx] - - with torch.no_grad(): - hidden_states, height, width = stage.patch_embeddings(stage_input) - for j in range(block_idx): - hidden_states = blocks[j](hidden_states, height, width)[0] - block_input = hidden_states.clone() - block_ref_output = blocks[block_idx](block_input, height, width)[0] - - wrapper = SingleBlockWrapper(blocks[block_idx], height, width) - compiled = _compile(wrapper) - - with torch.no_grad(): - result = compiled(block_input) - - label = f"Stage{stage_idx}/Block{block_idx}" - ok = _print_diff(result, block_ref_output, label) - assert ok, f"{label} output mismatch" - - -def test_stage0_block0(segformer_refs): - _test_transformer_block(segformer_refs, 0, 0) - - -def test_stage0_block1(segformer_refs): - _test_transformer_block(segformer_refs, 0, 1) - - -def test_stage1_block0(segformer_refs): - _test_transformer_block(segformer_refs, 1, 0) - - -def test_stage1_block1(segformer_refs): - _test_transformer_block(segformer_refs, 1, 1) - - -def test_stage2_block0(segformer_refs): - _test_transformer_block(segformer_refs, 2, 0) - - -def test_stage2_block1(segformer_refs): - _test_transformer_block(segformer_refs, 2, 1) - - -def test_stage3_block0(segformer_refs): - _test_transformer_block(segformer_refs, 3, 0) - - -def test_stage3_block1(segformer_refs): - _test_transformer_block(segformer_refs, 3, 1) - - -# --------------------------------------------------------------------------- -# Decode head test -# --------------------------------------------------------------------------- - -def test_decode_head(segformer_refs): - """Test the decode head in isolation using the reference stage outputs as input.""" - refs = segformer_refs - decode_head = refs["model"].decode_head - s0, s1, s2, s3 = refs["stage_outputs"] - - wrapper = DecodeHeadWrapper(decode_head) - compiled = _compile(wrapper) - - t0 = time.perf_counter() - with torch.no_grad(): - result = compiled(s0, s1, s2, s3) - t1 = time.perf_counter() - print(f"\nDecodeHead inference: {(t1 - t0) * 1000:.2f} ms") - - ok = _print_diff(result, refs["ref_logits"], "DecodeHead") - assert ok, "DecodeHead output mismatch" - - -# --------------------------------------------------------------------------- -# End-to-end composed test: use compiled stages in sequence -# --------------------------------------------------------------------------- - -def test_end_to_end_composed(segformer_refs): - """Run all 4 compiled encoder stages + compiled decode head in sequence. - - This is the same as test_backend in segformer_test.py but with the model - manually decomposed so that the first failing stage is immediately visible. - """ - refs = segformer_refs - stages = refs["model"].segformer.stages - - compiled_stages = [ - _compile(EncoderStageWrapper(stage)) - for stage in stages - ] - compiled_head = _compile(DecodeHeadWrapper(refs["model"].decode_head)) - - x = refs["example_input"] - stage_outputs = [] - with torch.no_grad(): - for i, stage in enumerate(compiled_stages): - t0 = time.perf_counter() - x = stage(x) - t1 = time.perf_counter() - print(f"\nComposed Stage{i}: {(t1 - t0) * 1000:.2f} ms, shape={tuple(x.shape)}") - - ok = _print_diff(x, refs["stage_outputs"][i], f"ComposedStage{i}") - assert ok, f"Composed encoder stage {i} output mismatch" - stage_outputs.append(x) - - t0 = time.perf_counter() - logits = compiled_head(*stage_outputs) - t1 = time.perf_counter() - print(f"Composed DecodeHead: {(t1 - t0) * 1000:.2f} ms") - - ok = _print_diff(logits, refs["ref_logits"], "ComposedLogits") - assert ok, "End-to-end composed output mismatch" - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="SegFormer layerwise benchmarks/tests helper") - parser.add_argument( - "--action", - type=str, - choices=["benchmark_layerwise"], - default="benchmark_layerwise", - ) - parser.add_argument( - "--version", - type=str, - choices=list(SEGFORMER_MODELS.keys()), - default="b0", - ) - parser.add_argument( - "--backend", - type=str, - choices=["torch", "docc"], - default="torch", - ) - parser.add_argument( - "--target", - type=str, - default="cuda", - help="DOCC target when backend=docc", - ) - parser.add_argument( - "--device", - type=str, - choices=["cpu", "cuda"], - default="cuda", - ) - parser.add_argument("--n_runs", type=int, default=10) - args = parser.parse_args() - - model_name = SEGFORMER_MODELS[args.version] - benchmark_layerwise( - model_name=model_name, - backend=args.backend, - target=args.target, - device=args.device, - n_runs=args.n_runs, - ) From 28ea4e54c59e325cb4811aaeadb729a2a74b5139 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Mon, 8 Jun 2026 15:03:23 +0200 Subject: [PATCH 06/20] Preserve arguments analysis in loop scheduler --- .github/workflows/llvm_tests_san.yml | 0 .github/workflows/release.yml | 0 opt/src/passes/scheduler/loop_scheduling_pass.cpp | 2 +- opt/src/transformations/map_fusion.cpp | 2 +- sdfg/include/sdfg/analysis/analysis.h | 4 ++-- 5 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 .github/workflows/llvm_tests_san.yml delete mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/llvm_tests_san.yml b/.github/workflows/llvm_tests_san.yml deleted file mode 100644 index e69de29bb..000000000 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index e69de29bb..000000000 diff --git a/opt/src/passes/scheduler/loop_scheduling_pass.cpp b/opt/src/passes/scheduler/loop_scheduling_pass.cpp index a6e280b8c..4b6b9fff9 100644 --- a/opt/src/passes/scheduler/loop_scheduling_pass.cpp +++ b/opt/src/passes/scheduler/loop_scheduling_pass.cpp @@ -130,7 +130,7 @@ bool LoopSchedulingPass::run_pass_target( for (auto* loop : schedulable_loops) { scheduler->apply_schedule(builder, analysis_manager, *loop, offload_unknown_sizes_); } - analysis_manager.invalidate_all(); + analysis_manager.preserve(); // ===== Phase 4: Post-schedule ===== scheduler->post_schedule(builder, analysis_manager, schedulable_loops); diff --git a/opt/src/transformations/map_fusion.cpp b/opt/src/transformations/map_fusion.cpp index 60a928d01..64c6d2d7b 100644 --- a/opt/src/transformations/map_fusion.cpp +++ b/opt/src/transformations/map_fusion.cpp @@ -1043,7 +1043,7 @@ void MapFusion::apply(builder::StructuredSDFGBuilder& builder, analysis::Analysi } } } - analysis_manager.invalidate_preserving(); + analysis_manager.preserve(); } else { // ConsumerIntoProducer removes the consumer loop node entirely — full invalidation. analysis_manager.invalidate_all(); diff --git a/sdfg/include/sdfg/analysis/analysis.h b/sdfg/include/sdfg/analysis/analysis.h index 923c78a34..14e7d9eb3 100644 --- a/sdfg/include/sdfg/analysis/analysis.h +++ b/sdfg/include/sdfg/analysis/analysis.h @@ -86,10 +86,10 @@ class AnalysisManager { } } - // Invalidate all cached analyses except the listed types. + // Preserve only the listed analyses and invalidate all others. // Analyses not present in the cache are unaffected. template - void invalidate_preserving() { + void preserve() { std::unordered_map> kept; auto try_keep = [&](std::type_index type) { auto it = cache_.find(type); From 2fce817032603215c6a887bc1a5bf85f490a1b66 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Mon, 8 Jun 2026 15:03:46 +0200 Subject: [PATCH 07/20] Change segformer test to cuda --- mlir/benchmarks/torch/model_zoo/segformer_test.py | 8 ++++---- sdfg/include/sdfg/analysis/assumptions_analysis.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index 712074d4a..3601feccf 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -41,7 +41,7 @@ def get_test_model_name(): ) return resolve_model_name(version, None) - +@pytest.mark.skipif(not os.environ.get("SLOW_TESTS", ""), reason="slow test") def test_backend(): model_name = get_test_model_name() model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() @@ -51,12 +51,12 @@ def test_backend(): example_input = torch.randn(1, 3, 512, 512) start = time.perf_counter() - program = torch.compile(model, backend="docc", options={"target": "none", "category": "server"}) + program = torch.compile(model, backend="docc", options={"target": "cuda", "category": "server"}) end = time.perf_counter() print(f"compilation time: {(end - start) * 1000:.2f} ms") start = time.perf_counter() - ref_program = torch.compile(model, backend="docc", options={"target": "none", "category": "server"}) + ref_program = torch.compile(model, backend="docc", options={"target": "cuda", "category": "server"}) end = time.perf_counter() print(f"ref compilation time: {(end - start) * 1000:.2f} ms") @@ -254,4 +254,4 @@ def setup_segformer_benchmark(model_name): run_benchmark( partial(setup_segformer_benchmark, model_name), f"segformer {model_name}", - ) \ No newline at end of file + ) diff --git a/sdfg/include/sdfg/analysis/assumptions_analysis.h b/sdfg/include/sdfg/analysis/assumptions_analysis.h index 7c15600ae..22bdd6a31 100644 --- a/sdfg/include/sdfg/analysis/assumptions_analysis.h +++ b/sdfg/include/sdfg/analysis/assumptions_analysis.h @@ -70,8 +70,7 @@ class AssumptionsAnalysis : public Analysis { // sibling_node. Call this after inserting nodes into a sequence to keep the // cached analysis valid without a full re-run. void register_node( - structured_control_flow::ControlFlowNode& new_node, - structured_control_flow::ControlFlowNode& sibling_node + structured_control_flow::ControlFlowNode& new_node, structured_control_flow::ControlFlowNode& sibling_node ); const symbolic::SymbolSet& parameters(); From 1d115b19ff28c6deec1105e424130a998fd13fd6 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Mon, 8 Jun 2026 15:59:30 +0200 Subject: [PATCH 08/20] replaces MemAccessRangeAnalysis with MemoryLayoutAnalysis --- opt/src/transformations/in_local_storage.cpp | 14 + .../offloading/offload_transform.cpp | 5 - opt/src/transformations/out_local_storage.cpp | 6 + .../npbench/polybench/test_fdtd_2d.py | 2 +- sdfg/CMakeLists.txt | 1 - .../sdfg/analysis/mem_access_range_analysis.h | 83 ---- .../mem_access_range_analysis_internal.h | 54 -- .../sdfg/analysis/memory_layout_analysis.h | 59 ++- sdfg/include/sdfg/codegen/code_generator.h | 1 - sdfg/src/analysis/arguments_analysis.cpp | 49 +- .../analysis/mem_access_range_analysis.cpp | 254 ---------- sdfg/src/analysis/memory_layout_analysis.cpp | 268 +++++++--- sdfg/tests/CMakeLists.txt | 1 - .../analysis/arguments_analysis_test.cpp | 25 +- .../mem_access_range_analysis_test.cpp | 470 ------------------ .../analysis/memory_layout_analysis_test.cpp | 311 ++++++++++++ .../src/tenstorrent/tenstorrent_transform.cpp | 3 - 17 files changed, 592 insertions(+), 1014 deletions(-) delete mode 100644 sdfg/include/sdfg/analysis/mem_access_range_analysis.h delete mode 100644 sdfg/include/sdfg/analysis/mem_access_range_analysis_internal.h delete mode 100644 sdfg/src/analysis/mem_access_range_analysis.cpp delete mode 100644 sdfg/tests/analysis/mem_access_range_analysis_test.cpp diff --git a/opt/src/transformations/in_local_storage.cpp b/opt/src/transformations/in_local_storage.cpp index 0007cb12c..30379d66f 100644 --- a/opt/src/transformations/in_local_storage.cpp +++ b/opt/src/transformations/in_local_storage.cpp @@ -83,6 +83,15 @@ bool InLocalStorage::can_be_applied(builder::StructuredSDFGBuilder& builder, ana auto extents = candidate->tile.extents_approx(); if (extents.empty()) continue; + // Reject candidates with any unbounded-dependent extent (returned as null). + bool has_null = false; + for (auto& ext : extents) { + if (ext.is_null()) { + has_null = true; + break; + } + } + if (has_null) continue; if (storage_type_.is_nv_shared()) { // GPU path: accept first valid group (substitution happens later) @@ -118,6 +127,11 @@ bool InLocalStorage::can_be_applied(builder::StructuredSDFGBuilder& builder, ana if (extents.empty()) { return false; } + // Defensive: candidate filtering above already rejects unbounded-dependent extents, + // but guard here too since downstream code dereferences these expressions. + for (auto& ext : extents) { + if (ext.is_null()) return false; + } // Store tile info (before substitution, bases/strides stay symbolic) tile_info_.dimensions = extents; diff --git a/opt/src/transformations/offloading/offload_transform.cpp b/opt/src/transformations/offloading/offload_transform.cpp index 6e92080f4..ad4076c33 100644 --- a/opt/src/transformations/offloading/offload_transform.cpp +++ b/opt/src/transformations/offloading/offload_transform.cpp @@ -3,7 +3,6 @@ #include #include -#include "sdfg/analysis/mem_access_range_analysis.h" #include "sdfg/analysis/scope_analysis.h" #include "sdfg/analysis/type_analysis.h" #include "sdfg/data_flow/access_node.h" @@ -93,9 +92,6 @@ bool OffloadTransform::can_be_applied(builder::StructuredSDFGBuilder& builder, a } } - // Criterion: arg ranges must be known - auto& mem_access_ranges = analysis_manager.get(); - if (!arguments_analysis.argument_size_known(analysis_manager, this->map_, allow_dynamic_sizes_)) { if (report_) report_->transform_impossible(this, "args not understood"); DEBUG_PRINTLN("Cannot apply transform: argument sizes not known"); @@ -127,7 +123,6 @@ void OffloadTransform::apply(builder::StructuredSDFGBuilder& builder, analysis:: auto& locals = arguments_analysis.locals(analysis_manager, this->map_); // Infer subsets for arguments - auto& mem_access_ranges = analysis_manager.get(); auto& argument_sizes = arguments_analysis.argument_sizes(analysis_manager, this->map_, allow_dynamic_sizes_); auto& scope_analysis = analysis_manager.get(); diff --git a/opt/src/transformations/out_local_storage.cpp b/opt/src/transformations/out_local_storage.cpp index 2167ff124..45bae58e4 100644 --- a/opt/src/transformations/out_local_storage.cpp +++ b/opt/src/transformations/out_local_storage.cpp @@ -111,6 +111,12 @@ bool OutLocalStorage::can_be_applied(builder::StructuredSDFGBuilder& builder, an if (extents.empty()) { return false; } + // Reject if any extent depends on an unbounded leading dimension (returned as null + // by extents_approx). Downstream code (substitution, stride computation) would + // dereference these. + for (auto& ext : extents) { + if (ext.is_null()) return false; + } // Store tile info (before substitution, bases/strides stay symbolic) tile_info_.dimensions = extents; diff --git a/python/benchmarks/npbench/polybench/test_fdtd_2d.py b/python/benchmarks/npbench/polybench/test_fdtd_2d.py index 1d22a9642..dac08e4c0 100644 --- a/python/benchmarks/npbench/polybench/test_fdtd_2d.py +++ b/python/benchmarks/npbench/polybench/test_fdtd_2d.py @@ -51,7 +51,7 @@ def test_fdtd_2d(target): ) elif target == "cuda": verifier = SDFGVerification( - verification={"CUDA": 13, "MAP": 13, "CUDAOffloading": 22, "FOR": 14} + verification={"CUDA": 13, "MAP": 13, "CUDAOffloading": 20, "FOR": 14} ) else: # rocm verifier = SDFGVerification( diff --git a/sdfg/CMakeLists.txt b/sdfg/CMakeLists.txt index dffb287e9..cd07a3515 100644 --- a/sdfg/CMakeLists.txt +++ b/sdfg/CMakeLists.txt @@ -34,7 +34,6 @@ set(SOURCE_FILES src/analysis/dominance_analysis.cpp src/analysis/loop_analysis.cpp src/analysis/loop_carried_dependency_analysis.cpp - src/analysis/mem_access_range_analysis.cpp src/analysis/memory_layout_analysis.cpp src/analysis/reference_analysis.cpp src/analysis/scope_analysis.cpp diff --git a/sdfg/include/sdfg/analysis/mem_access_range_analysis.h b/sdfg/include/sdfg/analysis/mem_access_range_analysis.h deleted file mode 100644 index b02bb24f5..000000000 --- a/sdfg/include/sdfg/analysis/mem_access_range_analysis.h +++ /dev/null @@ -1,83 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "sdfg/analysis/analysis.h" -#include "sdfg/structured_control_flow/control_flow_node.h" -#include "sdfg/structured_control_flow/sequence.h" -#include "sdfg/structured_sdfg.h" -#include "sdfg/symbolic/symbolic.h" - -namespace sdfg { -namespace analysis { - -class MemAccessRange { - friend class MemAccessRangesBuilder; - -private: - const std::string name_; - bool saw_read_; - bool saw_write_; - bool undefined_; - std::vector> dims_; - -public: - MemAccessRange( - const std::string& name, - bool saw_read, - bool saw_write, - bool undefined, - const std::vector>&& dims - ); - - MemAccessRange(const MemAccessRange& other) - : name_(other.name_), saw_read_(other.saw_read_), saw_write_(other.saw_write_), undefined_(other.undefined_), - dims_(other.dims_) {} - - MemAccessRange(MemAccessRange&& other) noexcept - : name_(std::move(other.name_)), saw_read_(other.saw_read_), saw_write_(other.saw_write_), - undefined_(other.undefined_), dims_(std::move(other.dims_)) {} - - const std::string& get_name() const; - - bool saw_read() const; - bool saw_write() const; - bool is_undefined() const; - - const std::vector>& dims() const; -}; - -class MemAccessRanges : public Analysis { - friend class AnalysisManager; - -private: - // Graph representation - graph::Graph graph_; - - std::unordered_map> - ranges_; - - analysis::AnalysisManager* analysis_manager_; - - void run(structured_control_flow::ControlFlowNode& node, std::unordered_set target_container); - -protected: - void run(analysis::AnalysisManager& analysis_manager) override; - -public: - MemAccessRanges(StructuredSDFG& sdfg); - - std::string name() const override { return "MemAccessRanges"; } - - const MemAccessRange* get(const std::string& varName) const; - - const MemAccessRange* - get(const std::string& varName, - structured_control_flow::ControlFlowNode& node, - std::unordered_set target_container); -}; - -} // namespace analysis -} // namespace sdfg diff --git a/sdfg/include/sdfg/analysis/mem_access_range_analysis_internal.h b/sdfg/include/sdfg/analysis/mem_access_range_analysis_internal.h deleted file mode 100644 index 46d5031d7..000000000 --- a/sdfg/include/sdfg/analysis/mem_access_range_analysis_internal.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include - -#include "sdfg/analysis/assumptions_analysis.h" -#include "sdfg/analysis/mem_access_range_analysis.h" -#include "sdfg/analysis/users.h" -#include "sdfg/structured_control_flow/sequence.h" -#include "sdfg/structured_sdfg.h" -#include "sdfg/symbolic/symbolic.h" - -namespace sdfg { -namespace analysis { - -struct WorkItem { - const std::string* var_name; - bool saw_read = false; - bool saw_write = false; - bool undefined = false; - WorkItem* will_complete; - std::vector, bool, std::vector, bool>> dims; - - WorkItem(const std::string* var_name) : var_name(var_name), will_complete(nullptr) {} -}; - -class MemAccessRangesBuilder { - friend class MemAccessRanges; - -private: - std::deque worklist_; - std::unordered_map ranges_; - - StructuredSDFG& sdfg_; - structured_control_flow::ControlFlowNode& node_; - - Users& users_analysis_; - AssumptionsAnalysis& assumptions_analysis_; - - void process_workItem(WorkItem* item); - - void process_direct_users(WorkItem* item, bool is_write, std::vector accesses); - - MemAccessRangesBuilder( - StructuredSDFG& sdfg, - structured_control_flow::ControlFlowNode& node, - Users& users_analysis, - AssumptionsAnalysis& assumptions_analysis - ) - : sdfg_(sdfg), node_(node), users_analysis_(users_analysis), assumptions_analysis_(assumptions_analysis) {} -}; - -} // namespace analysis -} // namespace sdfg diff --git a/sdfg/include/sdfg/analysis/memory_layout_analysis.h b/sdfg/include/sdfg/analysis/memory_layout_analysis.h index 38e45f2f5..f1a185275 100644 --- a/sdfg/include/sdfg/analysis/memory_layout_analysis.h +++ b/sdfg/include/sdfg/analysis/memory_layout_analysis.h @@ -1,10 +1,9 @@ /** - * @file memlet_delinearization_analysis.h - * @brief Analysis for delinearizing memlet subsets + * @file memory_layout_analysis.h + * @brief Analysis for inferring memory layouts of memlets * - * This analysis attempts to delinearize memlet subsets by recovering - * multi-dimensional structure from linearized expressions using the - * symbolic delinearize function with block-level assumptions. + * This analysis attempts to infer the memory layout of memlets using + * symbolic assumptions to interpret linearized subset expressions. */ #pragma once @@ -17,10 +16,9 @@ #include #include "sdfg/analysis/analysis.h" -#include "sdfg/data_flow/library_nodes/math/tensor/tensor_layout.h" #include "sdfg/data_flow/memlet.h" #include "sdfg/structured_control_flow/block.h" -#include "sdfg/structured_control_flow/structured_loop.h" +#include "sdfg/structured_control_flow/control_flow_node.h" #include "sdfg/symbolic/symbolic.h" namespace sdfg { @@ -42,13 +40,22 @@ struct MemoryTile { MemoryLayout layout; // Inferred tile layout at this loop level bool first_dim_bounded; // True if first dimension is bounded (Tensor/Array), false for unbounded pointers - /// Per-dimension bounding box extents: max[d] - min[d] + 1 + /// Per-dimension bounding box extents: max[d] - min[d] + 1. + /// Returns `SymEngine::null` in slot `d` if that extent would depend on an + /// unbounded leading-dimension sentinel. Callers MUST check each entry for null + /// before using it. symbolic::MultiExpression extents() const; - /// Per-dimension extents with min/max resolved to upper bounds via overapproximation + /// Per-dimension extents with min/max resolved to upper bounds via overapproximation. + /// Returns `SymEngine::null` in slot `d` if that extent would depend on an + /// unbounded leading-dimension sentinel. Callers MUST check each entry for null. symbolic::MultiExpression extents_approx() const; - /// First and last linear element addresses: offset + sum(stride[d] * idx[d]) + /// First and last linear element addresses: offset + sum(stride[d] * idx[d]). + /// Returns `{SymEngine::null, SymEngine::null}` if either endpoint would depend + /// on an unbounded leading-dimension sentinel (e.g. a layout whose strides + /// reference an unknown shape entry). Callers MUST check `.first.is_null()` / + /// `.second.is_null()` before using the result. std::pair contiguous_range() const; }; @@ -72,23 +79,23 @@ struct MemoryTileGroup { class MemoryLayoutAnalysis : public Analysis { private: std::unordered_map accesses_; - std::map, MemoryTile> tiles_; - std::map, std::vector> + std::map, MemoryTile> tiles_; + std::map, std::vector> tile_groups_; void traverse(structured_control_flow::ControlFlowNode& node, analysis::AnalysisManager& analysis_manager); void process_block(structured_control_flow::Block& block, analysis::AnalysisManager& analysis_manager); - void merge_loop_layouts( - structured_control_flow::StructuredLoop& loop, + void merge_scope_layouts( + structured_control_flow::ControlFlowNode& scope, const std::vector& memlets_before, - const std::set>& tiles_before, + const std::set>& tiles_before, analysis::AnalysisManager& analysis_manager ); void compute_tile_groups( - structured_control_flow::StructuredLoop& loop, + structured_control_flow::ControlFlowNode& scope, const std::string& container, const std::vector& memlets, const MemoryLayout& reference_layout, @@ -113,30 +120,30 @@ class MemoryLayoutAnalysis : public Analysis { const MemoryAccess* access(const data_flow::Memlet& memlet) const; /** - * @brief Get the inferred memory layout for a container at a specific loop level - * @param loop The loop to query + * @brief Get the inferred memory layout for a container at a specific scope + * @param scope The structured control-flow scope to query (Sequence, IfElse, While, StructuredLoop) * @param container The container name - * @return A pointer to the memory layout at that loop level, nullptr if not available + * @return A pointer to the memory tile at that scope, nullptr if not available */ - const MemoryTile* tile(const structured_control_flow::StructuredLoop& loop, const std::string& container) const; + const MemoryTile* tile(const structured_control_flow::ControlFlowNode& scope, const std::string& container) const; /** - * @brief Get tile groups for a container at a specific loop level - * @param loop The loop to query + * @brief Get tile groups for a container at a specific scope + * @param scope The structured control-flow scope to query * @param container The container name * @return A pointer to the vector of tile groups, nullptr if not available */ const std::vector* - tile_groups(const structured_control_flow::StructuredLoop& loop, const std::string& container) const; + tile_groups(const structured_control_flow::ControlFlowNode& scope, const std::string& container) const; /** - * @brief Get the tile group containing a specific memlet at a loop level - * @param loop The loop to query + * @brief Get the tile group containing a specific memlet at a scope + * @param scope The structured control-flow scope to query * @param memlet The memlet to find * @return A pointer to the tile group containing the memlet, nullptr if not found */ const MemoryTileGroup* - tile_group_for(const structured_control_flow::StructuredLoop& loop, const data_flow::Memlet& memlet) const; + tile_group_for(const structured_control_flow::ControlFlowNode& scope, const data_flow::Memlet& memlet) const; }; } // namespace analysis diff --git a/sdfg/include/sdfg/codegen/code_generator.h b/sdfg/include/sdfg/codegen/code_generator.h index e64bd9283..96e739f5b 100644 --- a/sdfg/include/sdfg/codegen/code_generator.h +++ b/sdfg/include/sdfg/codegen/code_generator.h @@ -5,7 +5,6 @@ #include #include "code_snippet_factory.h" -#include "sdfg/analysis/mem_access_range_analysis.h" #include "sdfg/codegen/instrumentation/arg_capture_plan.h" #include "sdfg/codegen/instrumentation/instrumentation_plan.h" #include "sdfg/codegen/utils.h" diff --git a/sdfg/src/analysis/arguments_analysis.cpp b/sdfg/src/analysis/arguments_analysis.cpp index cf714221a..cb349b984 100644 --- a/sdfg/src/analysis/arguments_analysis.cpp +++ b/sdfg/src/analysis/arguments_analysis.cpp @@ -1,5 +1,5 @@ #include "sdfg/analysis/arguments_analysis.h" -#include "sdfg/analysis/mem_access_range_analysis.h" +#include "sdfg/analysis/memory_layout_analysis.h" #include "sdfg/analysis/type_analysis.h" #include "sdfg/analysis/users.h" #include "sdfg/codegen/utils.h" @@ -94,19 +94,15 @@ void ArgumentsAnalysis::collect_arg_sizes( bool allow_dynamic_sizes_, bool do_not_throw ) { - std::unordered_set internal_vars; argument_sizes_.insert({&node, {}}); argument_element_sizes_.insert({&node, {}}); - auto& mem_access_ranges = analysis_manager.get(); + auto& memory_layout_analysis = analysis_manager.get(); auto& users = analysis_manager.get(); auto arguments = this->arguments(analysis_manager, node); auto locals = this->locals(analysis_manager, node); - internal_vars.insert(locals.begin(), locals.end()); - std::ranges::for_each(arguments, [&internal_vars](const auto& pair) { internal_vars.insert(pair.first); }); - analysis::TypeAnalysis type_analysis(sdfg_, &node, analysis_manager); for (auto& [argument, meta] : arguments) { @@ -135,37 +131,34 @@ void ArgumentsAnalysis::collect_arg_sizes( continue; } - auto range = mem_access_ranges.get(argument, node, internal_vars); - if (range == nullptr) { + auto tile = memory_layout_analysis.tile(node, argument); + if (tile == nullptr) { if (do_not_throw) { known_sizes_.insert({&node, false}); return; } else { - throw std::runtime_error("Range not found for " + argument); + throw std::runtime_error("Tile not found for " + argument); } } - - auto base_type = type_analysis.get_outer_type(argument); - auto elem_size = types::get_contiguous_element_size(*base_type, true); - if (range->is_undefined()) { - if (!allow_dynamic_sizes_) { - if (do_not_throw) { - known_sizes_.insert({&node, false}); - return; - } else { - throw std::runtime_error("Argument " + argument + " has undefined range"); - } + auto range = tile->contiguous_range(); + // contiguous_range returns {null, null} when the tile's extent would depend on + // an unbounded leading dimension; treat that as "size unknown" rather than + // dereferencing a null expression. + if (range.first.is_null() || range.second.is_null()) { + if (do_not_throw) { + known_sizes_.insert({&node, false}); + return; + } else { + throw std::runtime_error("Tile size unknown (unbounded dimension) for " + argument); } - DEBUG_PRINTLN("Argument " << argument << " has undefined range, using malloc_usable_size"); - argument_sizes_.at(&node).insert({argument, symbolic::malloc_usable_size(symbolic::symbol(argument))}); - argument_element_sizes_.at(&node).insert({argument, elem_size}); - continue; } + symbolic::Expression size = range.second; + size = symbolic::add(size, symbolic::one()); // Inclusive range, so add 1 + std::cout << "Contiguous range for " << argument << ": " << range.first->__str__() << " to " + << range.second->__str__() << std::endl; - symbolic::Expression size = symbolic::one(); - if (!range->dims().empty()) { - size = symbolic::add(range->dims().at(0).second, symbolic::one()); - } + auto base_type = type_analysis.get_outer_type(argument); + auto elem_size = types::get_contiguous_element_size(*base_type, true); bool is_nested_type = true; auto peeled_type = types::peel_to_next_element(*base_type); diff --git a/sdfg/src/analysis/mem_access_range_analysis.cpp b/sdfg/src/analysis/mem_access_range_analysis.cpp deleted file mode 100644 index c2e0340b1..000000000 --- a/sdfg/src/analysis/mem_access_range_analysis.cpp +++ /dev/null @@ -1,254 +0,0 @@ - -#include "sdfg/analysis/mem_access_range_analysis.h" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "sdfg/analysis/analysis.h" -#include "sdfg/analysis/assumptions_analysis.h" -#include "sdfg/analysis/mem_access_range_analysis_internal.h" -#include "sdfg/analysis/users.h" -#include "sdfg/helpers/helpers.h" -#include "sdfg/symbolic/extreme_values.h" -#include "sdfg/symbolic/symbolic.h" - -namespace sdfg { -namespace analysis { - -MemAccessRanges::MemAccessRanges(StructuredSDFG& sdfg) : Analysis(sdfg), graph_() {} - -void MemAccessRanges:: - run(structured_control_flow::ControlFlowNode& node, std::unordered_set target_containers) { - auto& users = analysis_manager_->get(); - auto& assumptions_analysis = analysis_manager_->get(); - - auto builder = MemAccessRangesBuilder(sdfg_, node, users, assumptions_analysis); - - auto& worklist = builder.worklist_; - - // Initialize worklist with containers - for (const auto& container : target_containers) { - worklist.push_back(new WorkItem{&container}); - } - - // Iterate over all variables and their users - while (!worklist.empty()) { - auto* workItem = worklist.front(); - builder.process_workItem(workItem); - worklist.pop_front(); - delete workItem; - } - - this->ranges_.insert_or_assign(&node, std::move(builder.ranges_)); -} - -void MemAccessRanges::run(analysis::AnalysisManager& analysis_manager) { - this->analysis_manager_ = &analysis_manager; - std::unordered_set containers; - - // Collect argument names - for (auto& arg : sdfg_.arguments()) { - if (sdfg_.type(arg).type_id() != types::TypeID::Scalar) { - containers.insert(arg); - } - } - - // Collect external names - for (auto& ext : sdfg_.externals()) { - if (sdfg_.type(ext).type_id() != types::TypeID::Scalar) { - containers.insert(ext); - } - } - - this->run(sdfg_.root(), containers); -} - -const MemAccessRange* MemAccessRanges::get(const std::string& varName) const { - auto ranges = this->ranges_.find(&sdfg_.root()); - if (ranges == this->ranges_.end()) { - return nullptr; - } - auto res = ranges->second.find(varName); - if (res != ranges->second.end()) { - return &res->second; - } else { - return nullptr; - } -} - -const MemAccessRange* MemAccessRanges:: - get(const std::string& varName, - structured_control_flow::ControlFlowNode& node, - std::unordered_set target_nodes) { - auto ranges = this->ranges_.find(&node); - this->run(node, target_nodes); - ranges = this->ranges_.find(&node); - if (ranges == this->ranges_.end()) { - return nullptr; - } - auto res = ranges->second.find(varName); - if (res != ranges->second.end()) { - return &res->second; - } else { - return nullptr; - } -} - -MemAccessRange::MemAccessRange( - const std::string& name, - bool saw_read, - bool saw_write, - bool undefined, - const std::vector>&& dims -) - : name_(name), saw_read_(saw_read), saw_write_(saw_write), undefined_(undefined), dims_(dims) {} - -const std::string& MemAccessRange::get_name() const { return name_; } - -bool MemAccessRange::saw_read() const { return saw_read_; } -bool MemAccessRange::saw_write() const { return saw_write_; } -bool MemAccessRange::is_undefined() const { return undefined_; } - -const std::vector>& MemAccessRange::dims() const { return dims_; } - -void MemAccessRangesBuilder::process_workItem(WorkItem* item) { - analysis::UsersView users_(users_analysis_, node_); - - const auto* varName = item->var_name; - - const auto& reads = users_.reads(*varName); - process_direct_users(item, false, reads); - - const auto& writes = users_.writes(*varName); - process_direct_users(item, true, writes); - - const auto& views = users_.views(*varName); - if (!views.empty()) { - DEBUG_PRINTLN("Found views for " << *varName << " => not rangeable!"); - item->undefined = true; - } - - const auto& moves = users_.moves(*varName); - if (!moves.empty()) { - DEBUG_PRINTLN("Found moves for " << *varName << " => not rangeable!"); - item->undefined = true; - } - - if (!item->dims.empty()) { - std::vector> finalDims; - finalDims.reserve(item->dims.size()); - - for (auto& dim : item->dims) { - auto& lowerExprs = std::get<0>(dim); - bool isLowerUndefined = std::get<1>(dim); - symbolic::Expression lb = (!lowerExprs.empty() && !isLowerUndefined) - ? SymEngine::min(lowerExprs) - : SymEngine::RCP(); - auto& upperExprs = std::get<2>(dim); - bool isUpperUndefined = std::get<3>(dim); - symbolic::Expression ub = (!upperExprs.empty() && !isUpperUndefined) - ? SymEngine::max(upperExprs) - : SymEngine::RCP(); - - if (lb.is_null() || ub.is_null()) { - item->undefined = true; - } - if (!lb.is_null() && SymEngine::is_a(*lb)) { - lb = SymEngine::null; - item->undefined = true; - } - if (!ub.is_null() && SymEngine::is_a(*ub)) { - ub = SymEngine::null; - item->undefined = true; - } - - finalDims.emplace_back(std::move(lb), std::move(ub)); - } - - this->ranges_.emplace( - std::piecewise_construct, - std::forward_as_tuple(*varName), - std::forward_as_tuple(*varName, item->saw_read, item->saw_write, item->undefined, std::move(finalDims)) - ); - } -} - -void MemAccessRangesBuilder::process_direct_users(WorkItem* item, bool is_write, std::vector accesses) { - for (auto& access : accesses) { - // The actual range analysis replaces symbols used in subsets - // by their lower/upper bounds according to the assumptions analysis. - // For this, we take the immediate scope to get the richest assumptions. - const auto& user_scope = analysis::Users::scope(access); - auto assums = assumptions_analysis_.get(*user_scope, false); - - // The final expression must be an expression w.r.t parameters, - // i.e., constant symbols w.r.t the actual node. - // Note we can compute this more efficiently once, but - // we want to move this to the assumptions analysis anyway - analysis::UsersView users_view(users_analysis_, node_); - symbolic::SymbolSet params; - for (auto& user : users_view.uses()) { - if (user->container() == symbolic::__nullptr__()->get_name()) { - continue; - } - auto& type = sdfg_.type(user->container()); - if (type.type_id() != types::TypeID::Scalar) { - continue; - } - auto& scalar_type = static_cast(type); - if (!types::is_integer(scalar_type.primitive_type())) { - continue; - } - if (users_view.writes(user->container()).size() > 0) { - continue; - } - params.insert(symbolic::symbol(user->container())); - } - - item->saw_read |= !is_write; - item->saw_write |= is_write; - - auto subsets = access->subsets(); - for (const auto& subset : subsets) { - auto subsetDims = subset.size(); - item->dims.reserve(subsetDims); - for (size_t i = item->dims.size(); i < subsetDims; ++i) { - item->dims.emplace_back(std::make_tuple< - std::vector, - bool, - std::vector, - bool>({}, false, {}, false)); - } - int dimIdx = 0; - for (auto& dim : subset) { - auto lb = symbolic::minimum(dim, params, assums, true); - auto ub = symbolic::maximum(dim, params, assums, true); - - if (lb.is_null() || symbolic::has(lb)) { - std::get<1>(item->dims[dimIdx]) = true; - } else { - std::get<0>(item->dims[dimIdx]).push_back(lb); - } - if (ub.is_null() || symbolic::has(ub)) { - std::get<3>(item->dims[dimIdx]) = true; - } else { - std::get<2>(item->dims[dimIdx]).push_back(ub); - } - - ++dimIdx; - } - } - } -} - -} // namespace analysis -} // namespace sdfg diff --git a/sdfg/src/analysis/memory_layout_analysis.cpp b/sdfg/src/analysis/memory_layout_analysis.cpp index 4c13af88a..681aca067 100644 --- a/sdfg/src/analysis/memory_layout_analysis.cpp +++ b/sdfg/src/analysis/memory_layout_analysis.cpp @@ -20,25 +20,53 @@ namespace sdfg { namespace analysis { namespace { -// Collect StructuredLoop nodes that are direct children of the given node, -// stopping at loop boundaries (does not recurse into nested loops). -void collect_direct_child_loops( - structured_control_flow::ControlFlowNode& node, std::set& result -) { - if (auto* loop = dynamic_cast(&node)) { - result.insert(loop); - return; + +// Sentinel symbol stored in shape[0] of a MemoryLayout when the leading dimension's +// extent is unknown (raw pointer accesses). The symbol never escapes the analysis: +// any expression that mentions it must be reported to the caller as `SymEngine::null` +// from the public size accessors (see `MemoryTile::extents()` etc.). +constexpr const char* kUnboundedName = "__unbounded__"; + +bool is_unbounded_dim(const symbolic::Expression& e) { + if (e.is_null()) return false; + if (!SymEngine::is_a(*e)) return false; + return SymEngine::down_cast(*e).get_name() == kUnboundedName; +} + +bool depends_on_unbounded(const symbolic::Expression& e) { + if (e.is_null()) return false; + for (const auto& a : symbolic::atoms(e)) { + if (is_unbounded_dim(a)) return true; } - if (auto* seq = dynamic_cast(&node)) { + return false; +} + +bool layout_has_unbounded_first_dim(const MemoryLayout& layout) { + const auto& shape = layout.shape(); + return !shape.empty() && is_unbounded_dim(shape[0]); +} + +// Collect immediate child scopes (Sequence/IfElse/While/StructuredLoop) of a given +// scope that carry their own MemoryTile entries. Blocks are excluded because their +// per-memlet info is held in `accesses_`, not in `tiles_`/`tile_groups_`. +void collect_direct_child_scopes( + structured_control_flow::ControlFlowNode& scope, std::set& result +) { + if (auto* loop = dynamic_cast(&scope)) { + result.insert(&loop->root()); + } else if (auto* w = dynamic_cast(&scope)) { + result.insert(&w->root()); + } else if (auto* seq = dynamic_cast(&scope)) { for (size_t i = 0; i < seq->size(); i++) { - collect_direct_child_loops(seq->at(i).first, result); + auto& child = seq->at(i).first; + if (!dynamic_cast(&child)) { + result.insert(&child); + } } - } else if (auto* ife = dynamic_cast(&node)) { + } else if (auto* ife = dynamic_cast(&scope)) { for (size_t i = 0; i < ife->size(); i++) { - collect_direct_child_loops(ife->at(i).first, result); + result.insert(&ife->at(i).first); } - } else if (auto* w = dynamic_cast(&node)) { - collect_direct_child_loops(w->root(), result); } } } // namespace @@ -54,6 +82,17 @@ void MemoryLayoutAnalysis::run(analysis::AnalysisManager& analysis_manager) { void MemoryLayoutAnalysis:: traverse(structured_control_flow::ControlFlowNode& node, analysis::AnalysisManager& analysis_manager) { + // Snapshot current memlets and tile keys before recursing into the scope's children + std::vector memlets_before; + memlets_before.reserve(accesses_.size()); + for (const auto& entry : accesses_) { + memlets_before.push_back(entry.first); + } + std::set> tiles_before; + for (const auto& entry : tiles_) { + tiles_before.insert(entry.first); + } + if (auto block = dynamic_cast(&node)) { process_block(*block, analysis_manager); } else if (auto sequence = dynamic_cast(&node)) { @@ -67,25 +106,14 @@ void MemoryLayoutAnalysis:: } else if (auto while_stmt = dynamic_cast(&node)) { traverse(while_stmt->root(), analysis_manager); } else if (auto loop = dynamic_cast(&node)) { - // Snapshot current memlets before traversing loop body - std::vector memlets_before; - memlets_before.reserve(accesses_.size()); - for (const auto& entry : accesses_) { - memlets_before.push_back(entry.first); - } - - // Snapshot tile keys before traversal - std::set> tiles_before; - for (const auto& entry : tiles_) { - tiles_before.insert(entry.first); - } - traverse(loop->root(), analysis_manager); - - // Merge layouts for containers accessed within this loop - merge_loop_layouts(*loop, memlets_before, tiles_before, analysis_manager); + } else { + // Break, Continue, Return nodes don't contain blocks + return; } - // Break, Continue, Return nodes don't contain blocks + + // Merge tiles for containers accessed within this scope + merge_scope_layouts(node, memlets_before, tiles_before, analysis_manager); } void MemoryLayoutAnalysis:: @@ -147,6 +175,38 @@ void MemoryLayoutAnalysis:: // For pointers, we attempt to delinearize the access pattern to infer the layout based // on assumptions from loop bounds auto* pointer_type = dynamic_cast(&memlet.base_type()); + + // Typed pointer to a (possibly multi-dim) fixed array of scalar, + // e.g. `float (*A)[M]`. The pointer adds one unbounded leading + // dimension; remaining dimensions come from the array shape. The + // subset is expected to be one index per dimension — no + // delinearization needed. + if (pointer_type->pointee_type().type_id() == types::TypeID::Array) { + auto* array_type = dynamic_cast(&pointer_type->pointee_type()); + symbolic::MultiExpression array_shape = {array_type->num_elements()}; + while (array_type->element_type().type_id() == types::TypeID::Array) { + array_type = dynamic_cast(&array_type->element_type()); + array_shape.push_back(array_type->num_elements()); + } + if (array_type->element_type().type_id() != types::TypeID::Scalar) { + continue; // Skip non-scalar leaf + } + if (subset.size() != array_shape.size() + 1) { + continue; // Require one index per dimension (leading pointer + array dims) + } + + symbolic::MultiExpression shape; + shape.push_back(symbolic::symbol("__unbounded__")); + for (const auto& dim : array_shape) { + shape.push_back(dim); + } + + MemoryLayout layout(shape); + MemoryAccess layout_info{container_name, subset, layout, false}; + this->accesses_.emplace(&memlet, layout_info); + continue; + } + if (pointer_type->pointee_type().type_id() != types::TypeID::Scalar) { continue; // Skip non-scalar pointers } @@ -191,10 +251,10 @@ const MemoryAccess* MemoryLayoutAnalysis::access(const data_flow::Memlet& memlet return &layout_it->second; } -void MemoryLayoutAnalysis::merge_loop_layouts( - structured_control_flow::StructuredLoop& loop, +void MemoryLayoutAnalysis::merge_scope_layouts( + structured_control_flow::ControlFlowNode& scope, const std::vector& memlets_before, - const std::set>& tiles_before, + const std::set>& tiles_before, analysis::AnalysisManager& analysis_manager ) { // Convert memlets_before to a set for O(1) lookup @@ -216,36 +276,66 @@ void MemoryLayoutAnalysis::merge_loop_layouts( }); } + auto* loop = dynamic_cast(&scope); + auto& assumptions_analysis = analysis_manager.get(); - // Use trivial bounds (type-derived, e.g. unsigned >= 0) so symbolic min/max - // over per-dimension index expressions can use parameter sign information. - auto& assumptions = assumptions_analysis.get(loop.root(), /*include_trivial_bounds=*/true); - // Start with SDFG-level parameters (read-only arguments like N, M) - // then add any additional constant symbols from loop assumptions + // For loops, query at the loop body so the induction variable's bounds are visible. + auto& assumption_node = loop ? static_cast(loop->root()) : scope; + // Trivial-bounds view: includes type-derived defaults (e.g. Int32 ∈ [INT_MIN, INT_MAX]). + // Used as the assumption set passed to symbolic::minimum/maximum so that the + // resolver has sign information for parameters. + auto& assumptions = assumptions_analysis.get(assumption_node, /*include_trivial_bounds=*/true); + // Narrowing-only view: excludes type-derived defaults. A symbol that only + // appears here (or in neither) has at most its type's intrinsic range — any + // min/max resolution would collapse to INT_MIN/INT_MAX-style numerics, which + // is not a sound tile bound. We use this to decide whether to emit a tile. + auto& narrowing_assumptions = assumptions_analysis.get(assumption_node, /*include_trivial_bounds=*/false); + // Parameters of a scope can only be constant symbols (invariant within the + // scope). SDFG-level read-only arguments are constant by construction; for + // each scope-local entry, the constant() flag tells us whether the symbol + // can be treated opaquely by the min/max resolver. symbolic::SymbolSet parameters = assumptions_analysis.parameters(); for (auto& entry : assumptions) { - if (symbolic::eq(entry.first, loop.indvar())) { - continue; // Skip induction variable itself + if (loop && symbolic::eq(entry.first, loop->indvar())) { + continue; // The induction variable is not a parameter of its own loop scope } - if (entry.second.constant()) { parameters.insert(entry.first); } } - // Find direct child loops of this loop (not grandchildren) - std::set direct_child_loops; - collect_direct_child_loops(loop.root(), direct_child_loops); + // Soundness check: every free (non-parameter) symbol in an index expression + // must have a narrowing assumption at this scope. Otherwise symbolic::minimum/ + // maximum would fall back to the symbol's type-default range and produce + // bogus tile bounds (e.g. INT_MAX) that the rest of the pipeline would + // silently consume as truth. + auto has_narrowing = [&](const symbolic::Symbol& sym) -> bool { + auto it = narrowing_assumptions.find(sym); + if (it == narrowing_assumptions.end()) return false; + return !it->second.lower_bounds().empty() || !it->second.upper_bounds().empty(); + }; + auto bounds_are_sound = [&](const symbolic::Expression& expr) -> bool { + for (const auto& sym : symbolic::atoms(expr)) { + if (parameters.contains(sym)) continue; + if (loop && symbolic::eq(sym, loop->indvar())) continue; + if (!has_narrowing(sym)) return false; + } + return true; + }; + + // Find direct child scopes that may carry tiles for this scope + std::set direct_child_scopes; + collect_direct_child_scopes(scope, direct_child_scopes); for (auto& [container, memlets] : all_container_groups) { if (memlets.empty()) continue; - // Find inner tiles from direct child loops only + // Find inner tiles from direct child scopes only std::vector inner_tiles; for (auto& [key, tile] : tiles_) { if (tiles_before.count(key) > 0) continue; if (key.second != container) continue; - if (direct_child_loops.count(key.first) == 0) continue; + if (direct_child_scopes.count(key.first) == 0) continue; inner_tiles.push_back(&tile); } @@ -271,14 +361,14 @@ void MemoryLayoutAnalysis::merge_loop_layouts( } } - // Propagate tile groups from child loops upward using the same + // Propagate tile groups from child scopes upward using the same // base-partitioning logic: group inner groups by their min_subset - // base at this loop level, then merge each partition. + // base at this scope level, then merge each partition. std::vector inner_groups; for (auto& [key, groups] : tile_groups_) { if (tiles_before.count({key.first, key.second}) > 0) continue; if (key.second != container) continue; - if (direct_child_loops.count(key.first) == 0) continue; + if (direct_child_scopes.count(key.first) == 0) continue; for (const auto& g : groups) { inner_groups.push_back(&g); } @@ -376,12 +466,14 @@ void MemoryLayoutAnalysis::merge_loop_layouts( grp_memlets.insert(grp_memlets.end(), c->memlets.begin(), c->memlets.end()); } - MemoryTile grp_tile{container, grp_min, grp_max, reference_layout, true}; + MemoryTile grp_tile{ + container, grp_min, grp_max, reference_layout, !layout_has_unbounded_first_dim(reference_layout) + }; result_groups.push_back({grp_tile, std::move(grp_memlets)}); } if (!result_groups.empty()) { - tile_groups_.insert({{&loop, container}, std::move(result_groups)}); + tile_groups_.insert({{&scope, container}, std::move(result_groups)}); } } } else { @@ -425,7 +517,7 @@ void MemoryLayoutAnalysis::merge_loop_layouts( if (!consistent) continue; // Compute tile groups for raw memlets - compute_tile_groups(loop, container, memlets, reference_layout, ndims, parameters, assumptions); + compute_tile_groups(scope, container, memlets, reference_layout, ndims, parameters, assumptions); } if (ndims == 0) continue; @@ -441,6 +533,10 @@ void MemoryLayoutAnalysis::merge_loop_layouts( // Compute dim_min from min_indices for (const auto& idx : min_indices[d]) { + if (!bounds_are_sound(idx)) { + all_bounded = false; + break; + } auto lb = symbolic::minimum(idx, parameters, assumptions, true); if (lb.is_null()) { lb = symbolic::minimum(idx, parameters, assumptions, false); @@ -459,6 +555,10 @@ void MemoryLayoutAnalysis::merge_loop_layouts( // Compute dim_max from max_indices for (const auto& idx : max_indices[d]) { + if (!bounds_are_sound(idx)) { + all_bounded = false; + break; + } auto ub = symbolic::maximum(idx, parameters, assumptions, true); if (ub.is_null()) { ub = symbolic::maximum(idx, parameters, assumptions, false); @@ -481,15 +581,18 @@ void MemoryLayoutAnalysis::merge_loop_layouts( if (!all_bounded) continue; - // Store this loop's tile with the original memory layout - MemoryTile merged_tile{container, min_subset, max_subset, reference_layout, true}; - tiles_.insert({{&loop, container}, merged_tile}); + // Store this scope's tile with the original memory layout. `first_dim_bounded` + // mirrors the underlying layout: false whenever shape[0] is the unbounded sentinel. + MemoryTile merged_tile{ + container, min_subset, max_subset, reference_layout, !layout_has_unbounded_first_dim(reference_layout) + }; + tiles_.insert({{&scope, container}, merged_tile}); } } const MemoryTile* MemoryLayoutAnalysis:: - tile(const structured_control_flow::StructuredLoop& loop, const std::string& container) const { - auto key = std::make_pair(&loop, container); + tile(const structured_control_flow::ControlFlowNode& scope, const std::string& container) const { + auto key = std::make_pair(&scope, container); auto it = tiles_.find(key); if (it == tiles_.end()) { return nullptr; @@ -498,7 +601,7 @@ const MemoryTile* MemoryLayoutAnalysis:: } void MemoryLayoutAnalysis::compute_tile_groups( - structured_control_flow::StructuredLoop& loop, + structured_control_flow::ControlFlowNode& scope, const std::string& container, const std::vector& memlets, const MemoryLayout& reference_layout, @@ -648,18 +751,20 @@ void MemoryLayoutAnalysis::compute_tile_groups( if (!all_bounded) continue; - MemoryTile tile{container, min_subset, max_subset, reference_layout, true}; + MemoryTile tile{ + container, min_subset, max_subset, reference_layout, !layout_has_unbounded_first_dim(reference_layout) + }; result_groups.push_back({tile, group.group_memlets}); } if (!result_groups.empty()) { - tile_groups_.insert({{&loop, container}, std::move(result_groups)}); + tile_groups_.insert({{&scope, container}, std::move(result_groups)}); } } const std::vector* MemoryLayoutAnalysis:: - tile_groups(const structured_control_flow::StructuredLoop& loop, const std::string& container) const { - auto key = std::make_pair(&loop, container); + tile_groups(const structured_control_flow::ControlFlowNode& scope, const std::string& container) const { + auto key = std::make_pair(&scope, container); auto it = tile_groups_.find(key); if (it == tile_groups_.end()) { return nullptr; @@ -668,7 +773,7 @@ const std::vector* MemoryLayoutAnalysis:: } const MemoryTileGroup* MemoryLayoutAnalysis:: - tile_group_for(const structured_control_flow::StructuredLoop& loop, const data_flow::Memlet& memlet) const { + tile_group_for(const structured_control_flow::ControlFlowNode& scope, const data_flow::Memlet& memlet) const { // Find which container this memlet accesses auto acc_it = accesses_.find(&memlet); if (acc_it == accesses_.end()) { @@ -676,7 +781,7 @@ const MemoryTileGroup* MemoryLayoutAnalysis:: } auto& container = acc_it->second.container; - auto key = std::make_pair(&loop, container); + auto key = std::make_pair(&scope, container); auto groups_it = tile_groups_.find(key); if (groups_it == tile_groups_.end()) { return nullptr; @@ -695,9 +800,17 @@ const MemoryTileGroup* MemoryLayoutAnalysis:: symbolic::MultiExpression MemoryTile::extents() const { symbolic::MultiExpression result; for (size_t d = 0; d < min_subset.size(); ++d) { - result.push_back(symbolic::simplify( - symbolic::expand(symbolic::add(symbolic::sub(max_subset[d], min_subset[d]), symbolic::one())) - )); + auto ext = + symbolic::simplify(symbolic::expand(symbolic::add(symbolic::sub(max_subset[d], min_subset[d]), symbolic::one()) + )); + // Defensive: subset values are always proven-bounded, so this should never trigger + // for row-major layouts. Guards future custom layouts whose subsets could pick up + // the unbounded sentinel. + if (depends_on_unbounded(ext)) { + result.push_back(SymEngine::null); + } else { + result.push_back(ext); + } } return result; } @@ -705,9 +818,14 @@ symbolic::MultiExpression MemoryTile::extents() const { symbolic::MultiExpression MemoryTile::extents_approx() const { symbolic::MultiExpression result; for (size_t d = 0; d < min_subset.size(); ++d) { - result.push_back(symbolic::simplify(symbolic::expand( + auto ext = symbolic::simplify(symbolic::expand( symbolic::overapproximate(symbolic::add(symbolic::sub(max_subset[d], min_subset[d]), symbolic::one())) - ))); + )); + if (depends_on_unbounded(ext)) { + result.push_back(SymEngine::null); + } else { + result.push_back(ext); + } } return result; } @@ -720,7 +838,15 @@ std::pair MemoryTile::contiguous_ran first = symbolic::add(first, symbolic::mul(strides[d], min_subset[d])); last = symbolic::add(last, symbolic::mul(strides[d], max_subset[d])); } - return {symbolic::simplify(symbolic::expand(first)), symbolic::simplify(symbolic::expand(last))}; + first = symbolic::simplify(symbolic::expand(first)); + last = symbolic::simplify(symbolic::expand(last)); + // If either endpoint references the unbounded sentinel, the linear range is undefined + // (e.g. a non-row-major layout whose stride references shape[0]). Report as unknown + // rather than leaking the sentinel symbol to callers. + if (depends_on_unbounded(first) || depends_on_unbounded(last)) { + return {SymEngine::null, SymEngine::null}; + } + return {first, last}; } } // namespace analysis diff --git a/sdfg/tests/CMakeLists.txt b/sdfg/tests/CMakeLists.txt index e4ae6d22b..7554de357 100644 --- a/sdfg/tests/CMakeLists.txt +++ b/sdfg/tests/CMakeLists.txt @@ -18,7 +18,6 @@ set(TEST_FILES analysis/loop_analysis_test.cpp analysis/loop_analysis_info_test.cpp analysis/loop_carried_dependency_analysis_test.cpp - analysis/mem_access_range_analysis_test.cpp analysis/memory_layout_analysis_test.cpp analysis/type_analysis_test.cpp analysis/users_test.cpp diff --git a/sdfg/tests/analysis/arguments_analysis_test.cpp b/sdfg/tests/analysis/arguments_analysis_test.cpp index 6a55b0ab1..6c96f23ac 100644 --- a/sdfg/tests/analysis/arguments_analysis_test.cpp +++ b/sdfg/tests/analysis/arguments_analysis_test.cpp @@ -126,14 +126,11 @@ TEST(ArgumentsAnalysisTest, Block_Arguments_Arrays) { EXPECT_TRUE(locals.contains("t1")); EXPECT_TRUE(locals.contains("i")); - EXPECT_TRUE(analysis.argument_size_known(analysis_manager, block, false)); - auto arg_sizes = analysis.argument_sizes(analysis_manager, block, false); - EXPECT_EQ(arg_sizes.size(), 1); - EXPECT_TRUE(arg_sizes.contains("arg1")); - EXPECT_TRUE(symbolic:: - eq(arg_sizes.at("arg1"), - symbolic::mul(symbolic::integer(4), symbolic::add(symbolic::symbol("i"), symbolic::integer(1)))) - ); + // The index `i` is a local Int32 with no narrowing assumption at the Block + // scope, so the memory-layout analysis soundly refuses to bound the access + // (its only bounds would be the type-default INT_MIN..INT_MAX). The argument + // size is therefore unknown. + EXPECT_FALSE(analysis.argument_size_known(analysis_manager, block, false)); } TEST(ArgumentsAnalysisTest, Block_Arguments_Pointers) { @@ -175,14 +172,10 @@ TEST(ArgumentsAnalysisTest, Block_Arguments_Pointers) { EXPECT_TRUE(locals.contains("t1")); EXPECT_TRUE(locals.contains("i")); - EXPECT_TRUE(analysis.argument_size_known(analysis_manager, block, false)); - auto arg_sizes = analysis.argument_sizes(analysis_manager, block, false); - EXPECT_EQ(arg_sizes.size(), 1); - EXPECT_TRUE(arg_sizes.contains("arg1")); - EXPECT_TRUE(symbolic:: - eq(arg_sizes.at("arg1"), - symbolic::mul(symbolic::integer(4), symbolic::add(symbolic::symbol("i"), symbolic::integer(1)))) - ); + // Same as Block_Arguments_Arrays: `i` is a free local Int32 with only + // type-default bounds, so the tile is correctly not produced and the + // argument size is unknown. + EXPECT_FALSE(analysis.argument_size_known(analysis_manager, block, false)); } TEST(ArgumentsAnalysisTest, Sequence_Blocks) { diff --git a/sdfg/tests/analysis/mem_access_range_analysis_test.cpp b/sdfg/tests/analysis/mem_access_range_analysis_test.cpp deleted file mode 100644 index e1eb131c0..000000000 --- a/sdfg/tests/analysis/mem_access_range_analysis_test.cpp +++ /dev/null @@ -1,470 +0,0 @@ -#include "sdfg/analysis/mem_access_range_analysis.h" - -#include -#include -#include - - -#include "sdfg/builder/structured_sdfg_builder.h" -#include "sdfg/data_flow/tasklet.h" -#include "sdfg/structured_sdfg.h" -#include "sdfg/symbolic/symbolic.h" -#include "sdfg/types/pointer.h" -#include "sdfg/types/type.h" -#include "sdfg/visualizer/dot_visualizer.h" - -using namespace sdfg; - -#ifndef DEBUG_WRITE_SDFG_VIZ -#define DEBUG_WRITE_SDFG_VIZ true -#endif - -#define DEBUG_DOT_SDFG(sdfg) \ - if constexpr (DEBUG_WRITE_SDFG_VIZ) { \ - writeSdfgDot(sdfg); \ - } - -static void writeSdfgDot(const StructuredSDFG& sdfg) { - visualizer::DotVisualizer viz(sdfg); - viz.visualize(); - - std::string filename = sdfg.name() + ".dot"; - - std::ofstream dotOutput(filename, std::ofstream::out); - - dotOutput << viz.getStream().str(); - dotOutput.close(); - std::cout << "Wrote graph to : " << filename << std::endl; -} - -TEST(MemAccessRangeAnalysisTest, AccessNode_Write_Element_1D) { - builder::StructuredSDFGBuilder builder("sdfg", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Pointer ptr_desc(base_desc); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("i", base_desc, true); - - auto sym = symbolic::symbol("i"); - - auto& root = builder.subject().root(); - - auto& block = builder.add_block(root); - - auto& writeAccess = builder.add_access(block, "A"); - auto& zero_node = builder.add_constant(block, "0", base_desc); - auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(block, zero_node, tasklet, "_in", {}); - builder.add_computational_memlet(block, tasklet, "_out", writeAccess, {sym}, ptr_desc); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - - // Check result - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_FALSE(range_a->saw_read()); - EXPECT_TRUE(range_a->saw_write()); - EXPECT_FALSE(range_a->is_undefined()); - - auto& dims = range_a->dims(); - EXPECT_EQ(dims.size(), 1); - EXPECT_TRUE(symbolic::eq(dims[0].first, sym)); - EXPECT_TRUE(symbolic::eq(dims[0].second, sym)); -} - -TEST(MemAccessRangeAnalysisTest, AccessNode_Read_Element_1D) { - builder::StructuredSDFGBuilder builder("sdfg", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Pointer ptr_desc(base_desc); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("B", opaque_desc, true); - builder.add_container("i", base_desc, true); - - auto sym = symbolic::symbol("i"); - - auto& root = builder.subject().root(); - - auto& block = builder.add_block(root); - - auto& node_A = builder.add_access(block, "A"); - auto& node_B = builder.add_access(block, "B"); - auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(block, node_A, tasklet, "_in", {sym}, ptr_desc); - builder.add_computational_memlet(block, tasklet, "_out", node_B, {sym}, ptr_desc); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - - // Check result - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_TRUE(range_a->saw_read()); - EXPECT_FALSE(range_a->saw_write()); - EXPECT_FALSE(range_a->is_undefined()); - - auto& dims = range_a->dims(); - EXPECT_EQ(dims.size(), 1); - EXPECT_TRUE(symbolic::eq(dims[0].first, sym)); - EXPECT_TRUE(symbolic::eq(dims[0].second, sym)); -} - -TEST(MemAccessRangeAnalysisTest, AccessNode_Write_Range_1D) { - builder::StructuredSDFGBuilder builder("sdfg", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Pointer ptr_desc(base_desc); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("N", base_desc, true); - builder.add_container("i", base_desc); - - auto sym = symbolic::symbol("i"); - - auto& root = builder.subject().root(); - auto& scope = builder.add_map( - root, - sym, - symbolic::Lt(sym, symbolic::symbol("N")), - symbolic::integer(0), - symbolic::add(sym, symbolic::integer(1)), - structured_control_flow::ScheduleType_Sequential::create() - ); - - auto& block = builder.add_block(scope.root()); - - auto& node_A = builder.add_access(block, "A"); - auto& zero_node = builder.add_constant(block, "0", base_desc); - auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(block, zero_node, tasklet, "_in", {}); - builder.add_computational_memlet(block, tasklet, "_out", node_A, {sym}, ptr_desc); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - // Check result - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_FALSE(range_a->saw_read()); - EXPECT_TRUE(range_a->saw_write()); - EXPECT_FALSE(range_a->is_undefined()); - - auto& dims = range_a->dims(); - EXPECT_EQ(dims.size(), 1); - EXPECT_TRUE(symbolic::eq(dims[0].first, symbolic::zero())); - EXPECT_TRUE(symbolic::eq(dims[0].second, symbolic::sub(symbolic::symbol("N"), symbolic::one()))); -} - -TEST(MemAccessRangeAnalysisTest, AccessNode_Write_Range_Shift_1D) { - builder::StructuredSDFGBuilder builder("sdfg", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Pointer ptr_desc(base_desc); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("N", base_desc, true); - builder.add_container("i", base_desc); - - auto sym = symbolic::symbol("i"); - - auto& root = builder.subject().root(); - auto& scope = builder.add_map( - root, - sym, - symbolic::Lt(sym, symbolic::symbol("N")), - symbolic::integer(10), - symbolic::add(sym, symbolic::integer(1)), - structured_control_flow::ScheduleType_Sequential::create() - ); - - auto& block = builder.add_block(scope.root()); - - auto& node_A = builder.add_access(block, "A"); - auto& zero_node = builder.add_constant(block, "0", base_desc); - auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(block, zero_node, tasklet, "_in", {}); - builder.add_computational_memlet(block, tasklet, "_out", node_A, {sym}, ptr_desc); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - // Check result - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_FALSE(range_a->saw_read()); - EXPECT_TRUE(range_a->saw_write()); - EXPECT_FALSE(range_a->is_undefined()); - - auto& dims = range_a->dims(); - EXPECT_EQ(dims.size(), 1); - EXPECT_TRUE(symbolic::eq(dims[0].first, symbolic::integer(10))); - EXPECT_TRUE(symbolic::eq(dims[0].second, symbolic::sub(symbolic::symbol("N"), symbolic::one()))); -} - -TEST(MemAccessRangeAnalysisTest, AccessNode_Read_Range_1D) { - builder::StructuredSDFGBuilder builder("sdfg", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Pointer ptr_desc(base_desc); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("B", opaque_desc, true); - builder.add_container("N", base_desc, true); - builder.add_container("i", base_desc); - - auto sym = symbolic::symbol("i"); - - auto& root = builder.subject().root(); - auto& scope = builder.add_map( - root, - sym, - symbolic::Lt(sym, symbolic::symbol("N")), - symbolic::integer(0), - symbolic::add(sym, symbolic::integer(1)), - structured_control_flow::ScheduleType_Sequential::create() - ); - - auto& block = builder.add_block(scope.root()); - - auto& node_A = builder.add_access(block, "A"); - auto& node_B = builder.add_access(block, "B"); - auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(block, node_A, tasklet, "_in", {sym}, ptr_desc); - builder.add_computational_memlet(block, tasklet, "_out", node_B, {sym}, ptr_desc); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - // Check result - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_TRUE(range_a->saw_read()); - EXPECT_FALSE(range_a->saw_write()); - EXPECT_FALSE(range_a->is_undefined()); - - auto& dims = range_a->dims(); - EXPECT_EQ(dims.size(), 1); - EXPECT_TRUE(symbolic::eq(dims[0].first, symbolic::zero())); - EXPECT_TRUE(symbolic::eq(dims[0].second, symbolic::sub(symbolic::symbol("N"), symbolic::one()))); -} - -TEST(MemAccessRangeAnalysisTest, AccessNode_Write_Range_2D) { - builder::StructuredSDFGBuilder builder("sdfg_simple_2d", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Array array1dType(base_desc, symbolic::symbol("M")); - types::Pointer array2dType(array1dType); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("arg_init", base_desc, true); - builder.add_container("i", base_desc); - builder.add_container("j", base_desc); - auto sym_i = symbolic::symbol("i"); - auto sym_j = symbolic::symbol("j"); - - auto& root = builder.subject().root(); - auto& outer_for = builder.add_for( - root, sym_i, symbolic::Lt(sym_i, symbolic::integer(23)), symbolic::zero(), symbolic::add(symbolic::one(), sym_i) - ); - auto& inner_for = builder.add_for( - outer_for.root(), - sym_j, - symbolic::Lt(sym_j, symbolic::integer(16)), - symbolic::zero(), - symbolic::add(symbolic::one(), sym_j) - ); - - auto& block = builder.add_block(inner_for.root()); - auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); - auto& readAccess = builder.add_access(block, "arg_init"); - auto& readArg = builder.add_computational_memlet(block, readAccess, tasklet, "_in", {}); - auto& writeAccess = builder.add_access(block, "A"); - auto& writeArg = builder.add_computational_memlet(block, tasklet, "_out", writeAccess, {sym_i, sym_j}, array2dType); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - - // Check result - auto* range_arg_init = ranges.get("arg_init"); - EXPECT_EQ(range_arg_init, nullptr); - - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_FALSE(range_a->saw_read()); - EXPECT_TRUE(range_a->saw_write()); - EXPECT_FALSE(range_a->is_undefined()); - auto& dims = range_a->dims(); - EXPECT_EQ(dims.size(), 2); - EXPECT_TRUE(symbolic::eq(dims[0].first, symbolic::zero())); - EXPECT_TRUE(symbolic::eq(dims[0].second, symbolic::integer(22))); - EXPECT_TRUE(symbolic::eq(dims[1].first, symbolic::zero())); - EXPECT_TRUE(symbolic::eq(dims[1].second, symbolic::integer(15))); -} - -TEST(MemAccessRangeAnalysisTest, Incomplete_2D_Line_Sum) { - builder::StructuredSDFGBuilder builder("sdfg_incomplete_2d", FunctionType_CPU); - - types::Scalar base_desc(types::PrimitiveType::Int32); - types::Pointer base_ptr_desc(base_desc); - - types::Array array1dType(base_desc, symbolic::symbol("M")); - types::Pointer array2dType(array1dType); - - types::Pointer opaque_desc; - builder.add_container("A", opaque_desc, true); - builder.add_container("B", opaque_desc, true); - builder.add_container("result", opaque_desc, true); - - builder.add_container("init_i", base_desc); - builder.add_container("i", base_desc); - builder.add_container("j", base_desc); - builder.add_container("sum", base_desc); - auto sym_i = symbolic::symbol("i"); - auto sym_init_i = symbolic::symbol("init_i"); - auto sym_j = symbolic::symbol("j"); - - auto& root = builder.subject().root(); - auto& init_block = builder.add_block(root); - auto& zero_node = builder.add_constant(init_block, "0", base_desc); - auto& initTasklet = builder.add_tasklet(init_block, data_flow::TaskletCode::assign, "_out", {"_in"}); - auto& sumInitAccess = builder.add_access(init_block, "sum"); - builder.add_computational_memlet(init_block, zero_node, initTasklet, "_in", {}); - builder.add_computational_memlet(init_block, initTasklet, "_out", sumInitAccess, {}); - auto& b_access = builder.add_access(init_block, "B"); - auto& init_i_tasklet = builder.add_tasklet(init_block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(init_block, b_access, init_i_tasklet, "_in", {symbolic::integer(0)}, base_ptr_desc); - auto& init_i_access = builder.add_access(init_block, "init_i"); - builder.add_computational_memlet(init_block, init_i_tasklet, "_out", init_i_access, {}); - - - auto& outer_for = builder.add_for( - root, - sym_i, - symbolic::Eq(symbolic::__false__(), symbolic::Eq(sym_i, symbolic::integer(23))), - sym_init_i, - symbolic::add(symbolic::one(), sym_i) - ); - auto& inner_for = builder.add_for( - outer_for.root(), - sym_j, - symbolic::Lt(sym_j, symbolic::integer(16)), - symbolic::zero(), - symbolic::add(symbolic::one(), sym_j) - ); - - auto& inner_block = builder.add_block(inner_for.root()); - auto& tasklet = builder.add_tasklet(inner_block, data_flow::TaskletCode::int_add, "_out", {"_in0", "_in1"}); - auto& prevSumAccess = builder.add_access(inner_block, "sum"); - auto& readPrevSum = builder.add_computational_memlet(inner_block, prevSumAccess, tasklet, "_in0", {}); - auto& readAAccess = builder.add_access(inner_block, "A"); - auto& readArray = - builder.add_computational_memlet(inner_block, readAAccess, tasklet, "_in1", {sym_i, sym_j}, array2dType); - auto& writeAccess = builder.add_access(inner_block, "sum"); - builder.add_computational_memlet(inner_block, tasklet, "_out", writeAccess, {}); - - auto& result_block = builder.add_block(root); - auto& sumAccess = builder.add_access(result_block, "sum"); - auto& result_tasklet = builder.add_tasklet(result_block, data_flow::TaskletCode::assign, "_out", {"_in"}); - builder.add_computational_memlet(result_block, sumAccess, result_tasklet, "_in", {}); - auto& resultAccess = builder.add_access(result_block, "result"); - builder.add_computational_memlet( - result_block, result_tasklet, "_out", resultAccess, {symbolic::integer(0)}, base_ptr_desc - ); - - auto sdfg = builder.move(); - - DEBUG_DOT_SDFG(*sdfg); - - // Run analysis - builder::StructuredSDFGBuilder builder_opt(sdfg); - analysis::AnalysisManager analysis_manager(builder_opt.subject()); - auto& ranges = analysis_manager.get(); - - - // Check result - auto* range_arg_init = ranges.get("arg_init"); - EXPECT_EQ(range_arg_init, nullptr); - - auto* range_sum = ranges.get("sum"); - EXPECT_EQ(range_sum, nullptr); - - // Write-pointer to scalar! - auto* range_result = ranges.get("result"); - EXPECT_NE(range_result, nullptr); - EXPECT_EQ(range_result->get_name(), "result"); - EXPECT_FALSE(range_result->saw_read()); - EXPECT_TRUE(range_result->saw_write()); - EXPECT_FALSE(range_result->is_undefined()); - auto& dims_res = range_result->dims(); - EXPECT_EQ(dims_res.size(), 1); - EXPECT_TRUE(symbolic::eq(dims_res[0].first, symbolic::zero())); - EXPECT_TRUE(symbolic::eq(dims_res[0].second, symbolic::zero())); - - auto* range_a = ranges.get("A"); - EXPECT_NE(range_a, nullptr); - EXPECT_EQ(range_a->get_name(), "A"); - EXPECT_TRUE(range_a->saw_read()); - EXPECT_FALSE(range_a->saw_write()); - EXPECT_TRUE(range_a->is_undefined()); - auto& dims_a = range_a->dims(); - EXPECT_EQ(dims_a.size(), 2); - EXPECT_TRUE(dims_a[0].first.is_null()); - EXPECT_TRUE(dims_a[0].second.is_null()); - - EXPECT_TRUE(symbolic::eq(dims_a[1].first, symbolic::zero())); - EXPECT_TRUE(symbolic::eq(dims_a[1].second, symbolic::integer(15))); -} diff --git a/sdfg/tests/analysis/memory_layout_analysis_test.cpp b/sdfg/tests/analysis/memory_layout_analysis_test.cpp index 5c6b9461c..daadd120c 100644 --- a/sdfg/tests/analysis/memory_layout_analysis_test.cpp +++ b/sdfg/tests/analysis/memory_layout_analysis_test.cpp @@ -2206,3 +2206,314 @@ TEST(MemoryLayoutAnalysisTest, LU_BlockedFactorization_Diagnostic) { check_2d(a_S7t_in, "S7 trailing sub-in", i, symbolic::add(i, j20)); check_2d(a_S7t_out, "S7 trailing sub-out", i, symbolic::add(i, j20)); } + +// ===================================================================== +// Scope-generic API tests: tiles should also be queryable at non-loop +// control-flow scopes (root Sequence, IfElse, While). +// ===================================================================== + +TEST(MemoryLayoutAnalysisTest, ScopeAPI_RootSequence_SingleNestedLoop) { + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("N", index_type, true); + builder.add_container("M", index_type, true); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("A", pointer_type, true); + + auto N = symbolic::symbol("N"); + auto M = symbolic::symbol("M"); + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + + auto& outer_loop = + builder.add_for(root, i, symbolic::Lt(i, N), symbolic::integer(0), symbolic::add(i, symbolic::one())); + auto& inner_loop = + builder + .add_for(outer_loop.root(), j, symbolic::Lt(j, M), symbolic::integer(0), symbolic::add(j, symbolic::one())); + + auto& block = builder.add_block(inner_loop.root()); + auto& access_in = builder.add_access(block, "A"); + auto& access_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto linearized = symbolic::add(symbolic::mul(i, M), j); + builder.add_computational_memlet(block, access_in, tasklet, "_in", {linearized}); + builder.add_computational_memlet(block, tasklet, "_out", access_out, {linearized}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + // Outer-loop tile is the established reference. + auto* tile_outer = analysis.tile(outer_loop, "A"); + ASSERT_NE(tile_outer, nullptr); + + // Root sequence tile should exist (scope-generic API) and match the outer-loop tile, + // because the outer loop is the only direct child carrying A accesses. + auto* tile_root = analysis.tile(root, "A"); + ASSERT_NE(tile_root, nullptr); + + ASSERT_EQ(tile_root->min_subset.size(), tile_outer->min_subset.size()); + for (size_t d = 0; d < tile_outer->min_subset.size(); ++d) { + EXPECT_TRUE(symbolic::eq(tile_root->min_subset.at(d), tile_outer->min_subset.at(d))); + EXPECT_TRUE(symbolic::eq(tile_root->max_subset.at(d), tile_outer->max_subset.at(d))); + } +} + +TEST(MemoryLayoutAnalysisTest, ScopeAPI_RootSequence_TwoSiblingLoops) { + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("N", index_type, true); + builder.add_container("M", index_type, true); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("i2", index_type); + builder.add_container("j2", index_type); + builder.add_container("A", pointer_type, true); + + auto N = symbolic::symbol("N"); + auto M = symbolic::symbol("M"); + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + auto i2 = symbolic::symbol("i2"); + auto j2 = symbolic::symbol("j2"); + + // First nest: writes A[i*M + j] for i in [0, N), j in [0, M) + { + auto& loop_i = + builder.add_for(root, i, symbolic::Lt(i, N), symbolic::integer(0), symbolic::add(i, symbolic::one())); + auto& loop_j = + builder + .add_for(loop_i.root(), j, symbolic::Lt(j, M), symbolic::integer(0), symbolic::add(j, symbolic::one())); + auto& block = builder.add_block(loop_j.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic::add(symbolic::mul(i, M), j); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + } + // Second nest: writes A[i2*M + j2] for i2 in [0, N), j2 in [0, M) (independent indvars) + { + auto& loop_i = + builder.add_for(root, i2, symbolic::Lt(i2, N), symbolic::integer(0), symbolic::add(i2, symbolic::one())); + auto& loop_j = + builder + .add_for(loop_i.root(), j2, symbolic::Lt(j2, M), symbolic::integer(0), symbolic::add(j2, symbolic::one())); + auto& block = builder.add_block(loop_j.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic::add(symbolic::mul(i2, M), j2); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + } + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + // Root sequence tile should exist and union both child loop tiles. Each loop + // covers [0..N-1, 0..M-1], so the union is identical. + auto* tile_root = analysis.tile(root, "A"); + ASSERT_NE(tile_root, nullptr); + + ASSERT_EQ(tile_root->min_subset.size(), 2u); + EXPECT_TRUE(symbolic::eq(tile_root->min_subset.at(0), symbolic::zero())); + EXPECT_TRUE(symbolic::eq(tile_root->min_subset.at(1), symbolic::zero())); + EXPECT_TRUE(symbolic::eq(tile_root->max_subset.at(0), symbolic::sub(N, symbolic::one()))); + EXPECT_TRUE(symbolic::eq(tile_root->max_subset.at(1), symbolic::sub(M, symbolic::one()))); +} + +TEST(MemoryLayoutAnalysisTest, ScopeAPI_IfElse_BothBranches) { + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("N", index_type, true); + builder.add_container("M", index_type, true); + builder.add_container("cond", index_type, true); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("A", pointer_type, true); + + auto N = symbolic::symbol("N"); + auto M = symbolic::symbol("M"); + auto cond = symbolic::symbol("cond"); + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + + auto& if_else = builder.add_if_else(root); + auto& branch_true = builder.add_case(if_else, symbolic::Eq(cond, symbolic::zero())); + auto& branch_false = builder.add_case(if_else, symbolic::Ne(cond, symbolic::zero())); + + auto build_nest = [&](structured_control_flow::Sequence& parent) { + auto& loop_i = + builder.add_for(parent, i, symbolic::Lt(i, N), symbolic::integer(0), symbolic::add(i, symbolic::one())); + auto& loop_j = + builder + .add_for(loop_i.root(), j, symbolic::Lt(j, M), symbolic::integer(0), symbolic::add(j, symbolic::one())); + auto& block = builder.add_block(loop_j.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic::add(symbolic::mul(i, M), j); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + return &loop_i; + }; + + build_nest(branch_true); + build_nest(branch_false); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + // Each branch sequence has its own tile. + auto* tile_branch_true = analysis.tile(branch_true, "A"); + ASSERT_NE(tile_branch_true, nullptr); + auto* tile_branch_false = analysis.tile(branch_false, "A"); + ASSERT_NE(tile_branch_false, nullptr); + + // The IfElse scope tile unions both branches; bounds match either branch (identical here). + auto* tile_ife = analysis.tile(if_else, "A"); + ASSERT_NE(tile_ife, nullptr); + + ASSERT_EQ(tile_ife->min_subset.size(), 2u); + EXPECT_TRUE(symbolic::eq(tile_ife->min_subset.at(0), symbolic::zero())); + EXPECT_TRUE(symbolic::eq(tile_ife->min_subset.at(1), symbolic::zero())); + EXPECT_TRUE(symbolic::eq(tile_ife->max_subset.at(0), symbolic::sub(N, symbolic::one()))); + EXPECT_TRUE(symbolic::eq(tile_ife->max_subset.at(1), symbolic::sub(M, symbolic::one()))); + + // Root sequence picks up the IfElse contribution. + auto* tile_root = analysis.tile(root, "A"); + ASSERT_NE(tile_root, nullptr); + ASSERT_EQ(tile_root->min_subset.size(), 2u); + EXPECT_TRUE(symbolic::eq(tile_root->min_subset.at(0), tile_ife->min_subset.at(0))); + EXPECT_TRUE(symbolic::eq(tile_root->max_subset.at(0), tile_ife->max_subset.at(0))); +} + +TEST(MemoryLayoutAnalysisTest, ScopeAPI_While_PassThrough) { + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("N", index_type, true); + builder.add_container("M", index_type, true); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("A", pointer_type, true); + + auto N = symbolic::symbol("N"); + auto M = symbolic::symbol("M"); + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + + auto& while_loop = builder.add_while(root); + + auto& loop_i = + builder + .add_for(while_loop.root(), i, symbolic::Lt(i, N), symbolic::integer(0), symbolic::add(i, symbolic::one())); + auto& loop_j = + builder.add_for(loop_i.root(), j, symbolic::Lt(j, M), symbolic::integer(0), symbolic::add(j, symbolic::one())); + + auto& block = builder.add_block(loop_j.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic::add(symbolic::mul(i, M), j); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_body = analysis.tile(while_loop.root(), "A"); + ASSERT_NE(tile_body, nullptr); + + // While scope tile should equal its body sequence tile. + auto* tile_while = analysis.tile(while_loop, "A"); + ASSERT_NE(tile_while, nullptr); + + ASSERT_EQ(tile_while->min_subset.size(), tile_body->min_subset.size()); + for (size_t d = 0; d < tile_body->min_subset.size(); ++d) { + EXPECT_TRUE(symbolic::eq(tile_while->min_subset.at(d), tile_body->min_subset.at(d))); + EXPECT_TRUE(symbolic::eq(tile_while->max_subset.at(d), tile_body->max_subset.at(d))); + } +} + +TEST(MemoryLayoutAnalysisTest, ScopeAPI_TileGroups_NonLoopScope) { + // Stencil-like pattern with constant-offset bases should produce a merged + // tile group not only at the loop level but also at the enclosing scope. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("N", index_type, true); + builder.add_container("M", index_type, true); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("A", pointer_type, true); + + auto N = symbolic::symbol("N"); + auto M = symbolic::symbol("M"); + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + + auto& loop_i = builder.add_for( + root, i, symbolic::Lt(i, symbolic::sub(N, symbolic::one())), symbolic::one(), symbolic::add(i, symbolic::one()) + ); + auto& loop_j = builder.add_for( + loop_i.root(), + j, + symbolic::Lt(j, symbolic::sub(M, symbolic::one())), + symbolic::one(), + symbolic::add(j, symbolic::one()) + ); + + // Two reads of A with constant-offset bases: A[i*M + j] and A[i*M + (j+1)] + auto& block = builder.add_block(loop_j.root()); + auto& a_c = builder.add_access(block, "A"); + auto& a_r = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::fp_add, "_out", {"_inc", "_inr"}); + auto idx_c = symbolic::add(symbolic::mul(i, M), j); + auto idx_r = symbolic::add(symbolic::mul(i, M), symbolic::add(j, symbolic::one())); + builder.add_computational_memlet(block, a_c, tasklet, "_inc", {idx_c}); + builder.add_computational_memlet(block, a_r, tasklet, "_inr", {idx_r}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx_c}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + // Loop-level groups: stencil bases merge into one group at the j-loop level. + auto* groups_j = analysis.tile_groups(loop_j, "A"); + ASSERT_NE(groups_j, nullptr); + + // The root sequence should also expose tile groups for A (propagated upward). + auto* groups_root = analysis.tile_groups(root, "A"); + ASSERT_NE(groups_root, nullptr); + EXPECT_FALSE(groups_root->empty()); +} diff --git a/targets/tenstorrent/src/tenstorrent/tenstorrent_transform.cpp b/targets/tenstorrent/src/tenstorrent/tenstorrent_transform.cpp index 9a6ef5705..c14333363 100644 --- a/targets/tenstorrent/src/tenstorrent/tenstorrent_transform.cpp +++ b/targets/tenstorrent/src/tenstorrent/tenstorrent_transform.cpp @@ -4,7 +4,6 @@ #include "sdfg/analysis/assumptions_analysis.h" #include "sdfg/analysis/loop_analysis.h" -#include "sdfg/analysis/mem_access_range_analysis.h" #include "sdfg/analysis/type_analysis.h" #include "sdfg/analysis/users.h" @@ -126,8 +125,6 @@ std::unique_ptr TenstorrentTransform:: } } - auto& mem_access_ranges = analysis_manager.get(); - if (!arguments_analysis.argument_size_known(analysis_manager, this->map_, allow_dynamic_sizes_)) { if (report_) report_->transform_impossible(this, "transfer args not sized"); return {}; From 84b6ee854e4467ea559f7f12a9a2278a86c3944e Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Mon, 8 Jun 2026 23:06:27 +0200 Subject: [PATCH 09/20] addresses regression in offload transform --- opt/tests/CMakeLists.txt | 1 + .../offloading/cuda_transform_im2col_test.cpp | 252 +++++++++ sdfg/src/analysis/memory_layout_analysis.cpp | 16 +- sdfg/src/symbolic/delinearization.cpp | 34 ++ .../analysis/memory_layout_analysis_test.cpp | 534 ++++++++++++++++++ 5 files changed, 836 insertions(+), 1 deletion(-) create mode 100644 opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp diff --git a/opt/tests/CMakeLists.txt b/opt/tests/CMakeLists.txt index cf8266c99..47b807435 100644 --- a/opt/tests/CMakeLists.txt +++ b/opt/tests/CMakeLists.txt @@ -23,6 +23,7 @@ set(TEST_FILES passes/offloading/code_motion/block_sorting_test.cpp passes/offloading/data_transfer_minimization_pass_test.cpp transformations/offloading/cuda_parallelize_nested_map_test.cpp + transformations/offloading/cuda_transform_im2col_test.cpp transformations/offloading/gpu_tiling_test.cpp transformations/offloading/kernel_local_storage_test.cpp transformations/offloading/cublas_data_transfer_extraction_test.cpp diff --git a/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp b/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp new file mode 100644 index 000000000..d58215a82 --- /dev/null +++ b/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp @@ -0,0 +1,252 @@ +// Regression tests for CUDATransform / OffloadTransform on the im2col pattern +// produced by ResNet's first stride-2 7x7 conv lowering. +// +// The map writes a `_patches` buffer from an input image `_1` and previously +// (commit prior to the regression observed in resnet `__docc_GraphModule.cpp`) +// was offloaded to a single CUDA kernel. It is now left as a host-side double +// loop with an additional H2D copy of the produced `_patches`, doubling the +// end-to-end runtime. +// +// Two tests: +// * `CollapsedTwoDimMap` - exact shape produced by the optimizer: two maps +// over collapsed indvars with mod/div arithmetic in the memlet subsets. +// * `ExplicitSixDimMap` - the logically equivalent un-collapsed form (six +// nested maps with simple affine subscripts). Useful to disentangle +// whether the regression is in subset analysis under collapsed indvars or +// in the offload-transform criteria themselves. + +#include + +#include "sdfg/analysis/analysis.h" +#include "sdfg/builder/structured_sdfg_builder.h" +#include "sdfg/data_flow/tasklet.h" +#include "sdfg/function.h" +#include "sdfg/structured_control_flow/block.h" +#include "sdfg/structured_control_flow/if_else.h" +#include "sdfg/structured_control_flow/map.h" +#include "sdfg/symbolic/symbolic.h" +#include "sdfg/targets/cuda/cuda.h" +#include "sdfg/transformations/offloading/cuda_transform.h" +#include "sdfg/types/pointer.h" +#include "sdfg/types/scalar.h" + +namespace sdfg::cuda { + +namespace { + +// Constants mirroring the failing resnet kernel. +constexpr int kN = 32; +constexpr int kCin = 3; +constexpr int kHin = 224; +constexpr int kHout = 112; +constexpr int kKh = 7; + +constexpr int kCollapsedOuter = kN * kHout * kHout; // 401408 +constexpr int kCollapsedInner = kCin * kKh * kKh; // 147 + +constexpr int kStrideNCin = kHin * kHin; // 50176 +constexpr int kStrideN = kCin * kHin * kHin; // 150528 + +constexpr int kStridePatchN = kHout * kHout * kCin * kKh * kKh; // 1843968 +constexpr int kStridePatchHout = kHout * kCin * kKh * kKh; // 16464 +constexpr int kStridePatchWout = kCin * kKh * kKh; // 147 +constexpr int kStridePatchC = kKh * kKh; // 49 + +// _patches0 size in elements: N * Hout * Wout * Cin * Kh * Kw +constexpr long long kPatchesElems = static_cast(kN) * kHout * kHout * kCin * kKh * kKh; +// _1 size in elements: N * Cin * Hin * Win +constexpr long long kImageElems = static_cast(kN) * kCin * kHin * kHin; + +symbolic::Expression i(long long v) { return symbolic::integer(v); } +symbolic::Symbol s(const std::string& n) { return symbolic::symbol(n); } + +} // namespace + +TEST(CudaTransformIm2colTest, CollapsedTwoDimMap) { + builder::StructuredSDFGBuilder builder("im2col_collapsed", FunctionType_CPU); + auto& root = builder.subject().root(); + + types::Scalar f32(types::PrimitiveType::Float); + types::Pointer f32ptr(f32); + types::Scalar i64(types::PrimitiveType::Int64); + + builder.add_container("_n0_collapsed0", i64); + builder.add_container("_c0_collapsed0", i64); + builder.add_container("_1", f32ptr, /*is_argument=*/true); + builder.add_container("_patches0", f32ptr, /*is_argument=*/true); + + ScheduleType seq = ScheduleType_Sequential::create(); + + auto& outer_map = builder.add_map( + root, + s("_n0_collapsed0"), + symbolic::Lt(s("_n0_collapsed0"), i(kCollapsedOuter)), + i(0), + symbolic::add(s("_n0_collapsed0"), i(1)), + seq + ); + auto& inner_map = builder.add_map( + outer_map.root(), + s("_c0_collapsed0"), + symbolic::Lt(s("_c0_collapsed0"), i(kCollapsedInner)), + i(0), + symbolic::add(s("_c0_collapsed0"), i(1)), + seq + ); + + // Helpers + auto kh_mod = symbolic::mod(symbolic::div(s("_c0_collapsed0"), i(kKh)), i(kKh)); + auto kw_mod = symbolic::mod(s("_c0_collapsed0"), i(kKh)); + auto hout_mod = symbolic::mod(symbolic::div(s("_n0_collapsed0"), i(kHout)), i(kHout)); + auto wout_mod = symbolic::mod(s("_n0_collapsed0"), i(kHout)); + auto c_div = symbolic::div(s("_c0_collapsed0"), i(kStridePatchC)); // c0 / 49 + auto n_div = symbolic::div(s("_n0_collapsed0"), i(kHout * kHout)); // n0 / 12544 + + // h_in = -3 + ((c0/7)%7) + 2*((n0/112)%112) + auto h_in = symbolic::add(i(-(kKh / 2)), symbolic::add(kh_mod, symbolic::mul(i(2), hout_mod))); + // w_in = -3 + (c0%7) + 2*(n0%112) + auto w_in = symbolic::add(i(-(kKh / 2)), symbolic::add(kw_mod, symbolic::mul(i(2), wout_mod))); + + auto cond_in_bounds = symbolic:: + And(symbolic::And(symbolic::Ge(w_in, i(0)), symbolic::Ge(h_in, i(0))), + symbolic::And(symbolic::Lt(w_in, i(kHin)), symbolic::Lt(h_in, i(kHin)))); + auto cond_out_of_bounds = symbolic:: + Or(symbolic::Or(symbolic::Ge(w_in, i(kHin)), symbolic::Ge(h_in, i(kHin))), + symbolic::Or(symbolic::Lt(w_in, i(0)), symbolic::Lt(h_in, i(0)))); + + auto& ifelse = builder.add_if_else(inner_map.root()); + auto& case_in = builder.add_case(ifelse, cond_in_bounds); + auto& case_out = builder.add_case(ifelse, cond_out_of_bounds); + + // out_idx = 49*(c0/49) + 1843968*(n0/12544) + (c0%7) + 147*(n0%112) + // + 7*((c0/7)%7) + 16464*((n0/112)%112) + auto out_idx = symbolic:: + add(symbolic:: + add(symbolic::add(symbolic::mul(i(kStridePatchC), c_div), symbolic::mul(i(kStridePatchN), n_div)), + symbolic::add(kw_mod, symbolic::mul(i(kStridePatchWout), wout_mod))), + symbolic::add(symbolic::mul(i(kKh), kh_mod), symbolic::mul(i(kStridePatchHout), hout_mod))); + + // in_idx = -3 + 224*(-3 + ((c0/7)%7) + 2*((n0/112)%112)) + // + 50176*(c0/49) + 150528*(n0/12544) + (c0%7) + 2*(n0%112) + auto in_idx = symbolic::add( + i(-(kKh / 2)), + symbolic:: + add(symbolic::add(symbolic::mul(i(kHin), h_in), symbolic::mul(i(kStrideNCin), c_div)), + symbolic::add(symbolic::mul(i(kStrideN), n_div), symbolic::add(kw_mod, symbolic::mul(i(2), wout_mod)))) + ); + + // In-bounds branch: _patches0[out_idx] = _1[in_idx] + { + auto& block = builder.add_block(case_in); + auto& read = builder.add_access(block, "_1"); + auto& write = builder.add_access(block, "_patches0"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "out_", {"in_"}); + builder.add_computational_memlet(block, read, tasklet, "in_", {in_idx}); + builder.add_computational_memlet(block, tasklet, "out_", write, {out_idx}); + } + // Out-of-bounds branch: _patches0[out_idx] = 0 + { + auto& block = builder.add_block(case_out); + auto& write = builder.add_access(block, "_patches0"); + auto& constant = builder.add_constant(block, "0.0f", f32); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "out_", {"in_"}); + builder.add_computational_memlet(block, constant, tasklet, "in_", {}, f32); + builder.add_computational_memlet(block, tasklet, "out_", write, {out_idx}); + } + + analysis::AnalysisManager analysis_manager(builder.subject()); + CUDATransform transform(outer_map, /*block_size=*/32); + + // Regression: this expects `true`; the failing main branch returns `false` + // and the offload pipeline keeps the map on the host. + EXPECT_TRUE(transform.can_be_applied(builder, analysis_manager)) + << "OffloadTransform regressed on collapsed im2col map: the outer map " + "is no longer recognised as offloadable."; +} + +TEST(CudaTransformIm2colTest, ExplicitSixDimMap) { + builder::StructuredSDFGBuilder builder("im2col_explicit", FunctionType_CPU); + auto& root = builder.subject().root(); + + types::Scalar f32(types::PrimitiveType::Float); + types::Pointer f32ptr(f32); + types::Scalar i64(types::PrimitiveType::Int64); + + builder.add_container("n", i64); + builder.add_container("hout", i64); + builder.add_container("wout", i64); + builder.add_container("c", i64); + builder.add_container("kh", i64); + builder.add_container("kw", i64); + builder.add_container("_1", f32ptr, /*is_argument=*/true); + builder.add_container("_patches0", f32ptr, /*is_argument=*/true); + + ScheduleType seq = ScheduleType_Sequential::create(); + + auto add_simple_map = [&](structured_control_flow::Sequence& parent, const std::string& name, long long bound + ) -> structured_control_flow::Map& { + return builder + .add_map(parent, s(name), symbolic::Lt(s(name), i(bound)), i(0), symbolic::add(s(name), i(1)), seq); + }; + + auto& m_n = add_simple_map(root, "n", kN); + auto& m_hout = add_simple_map(m_n.root(), "hout", kHout); + auto& m_wout = add_simple_map(m_hout.root(), "wout", kHout); + auto& m_c = add_simple_map(m_wout.root(), "c", kCin); + auto& m_kh = add_simple_map(m_c.root(), "kh", kKh); + auto& m_kw = add_simple_map(m_kh.root(), "kw", kKh); + + // h_in = 2*hout + kh - 3, w_in = 2*wout + kw - 3 + auto h_in = symbolic::sub(symbolic::add(symbolic::mul(i(2), s("hout")), s("kh")), i(kKh / 2)); + auto w_in = symbolic::sub(symbolic::add(symbolic::mul(i(2), s("wout")), s("kw")), i(kKh / 2)); + + auto cond_in_bounds = symbolic:: + And(symbolic::And(symbolic::Ge(w_in, i(0)), symbolic::Ge(h_in, i(0))), + symbolic::And(symbolic::Lt(w_in, i(kHin)), symbolic::Lt(h_in, i(kHin)))); + auto cond_out_of_bounds = symbolic:: + Or(symbolic::Or(symbolic::Ge(w_in, i(kHin)), symbolic::Ge(h_in, i(kHin))), + symbolic::Or(symbolic::Lt(w_in, i(0)), symbolic::Lt(h_in, i(0)))); + + auto& ifelse = builder.add_if_else(m_kw.root()); + auto& case_in = builder.add_case(ifelse, cond_in_bounds); + auto& case_out = builder.add_case(ifelse, cond_out_of_bounds); + + auto out_idx = symbolic::add( + symbolic:: + add(symbolic::add(symbolic::mul(i(kStridePatchN), s("n")), symbolic::mul(i(kStridePatchHout), s("hout"))), + symbolic::add(symbolic::mul(i(kStridePatchWout), s("wout")), symbolic::mul(i(kStridePatchC), s("c")))), + symbolic::add(symbolic::mul(i(kKh), s("kh")), s("kw")) + ); + auto in_idx = symbolic:: + add(symbolic:: + add(symbolic::add(symbolic::mul(i(kStrideN), s("n")), symbolic::mul(i(kStrideNCin), s("c"))), + symbolic::mul(i(kHin), h_in)), + w_in); + + { + auto& block = builder.add_block(case_in); + auto& read = builder.add_access(block, "_1"); + auto& write = builder.add_access(block, "_patches0"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "out_", {"in_"}); + builder.add_computational_memlet(block, read, tasklet, "in_", {in_idx}); + builder.add_computational_memlet(block, tasklet, "out_", write, {out_idx}); + } + { + auto& block = builder.add_block(case_out); + auto& write = builder.add_access(block, "_patches0"); + auto& constant = builder.add_constant(block, "0.0f", f32); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "out_", {"in_"}); + builder.add_computational_memlet(block, constant, tasklet, "in_", {}, f32); + builder.add_computational_memlet(block, tasklet, "out_", write, {out_idx}); + } + + analysis::AnalysisManager analysis_manager(builder.subject()); + CUDATransform transform(m_n, /*block_size=*/32); + + EXPECT_TRUE(transform.can_be_applied(builder, analysis_manager)) + << "OffloadTransform unexpectedly rejects the explicit (un-collapsed) " + "im2col map. If only the collapsed variant fails, the regression " + "lies in subset analysis under mod/div indvars."; +} + +} // namespace sdfg::cuda diff --git a/sdfg/src/analysis/memory_layout_analysis.cpp b/sdfg/src/analysis/memory_layout_analysis.cpp index 3f7bee668..6f97d9e25 100644 --- a/sdfg/src/analysis/memory_layout_analysis.cpp +++ b/sdfg/src/analysis/memory_layout_analysis.cpp @@ -218,7 +218,21 @@ void MemoryLayoutAnalysis:: auto result = symbolic::delinearize(linearized_expr, assumptions); if (!result.success) { - continue; // Delinearization failed, skip + // Fallback: register the access as a 1D contiguous range over the + // raw linearized address. We lose multi-dim layout info, but the + // scope-level merge can still bound the access via BoundAnalysis, + // which is enough for downstream consumers like ArgumentsAnalysis + // to compute argument sizes. This recovers patterns where the + // delinearizer rejects the access (e.g. halo offsets producing + // negative constants inside a stride product, or non-strictly- + // dominating strides) but the overall address range is still + // soundly bounded by the enclosing loop assumptions. + symbolic::MultiExpression shape; + shape.push_back(symbolic::symbol("__unbounded__")); + MemoryLayout layout(shape); + MemoryAccess layout_info{container_name, {linearized_expr}, layout, false}; + this->accesses_.emplace(&memlet, layout_info); + continue; } // Delinearization returns N indices but only N-1 dimensions (from stride division) diff --git a/sdfg/src/symbolic/delinearization.cpp b/sdfg/src/symbolic/delinearization.cpp index b6f6c76db..bacd91771 100644 --- a/sdfg/src/symbolic/delinearization.cpp +++ b/sdfg/src/symbolic/delinearization.cpp @@ -158,6 +158,40 @@ bool decompose_by_stride( } } + // If the indvar-side index is an Add with constant (no-indvar) subterms, + // peel those subterms out and fold `stride * constant_part` into the + // global constant_offset. This keeps the per-group index expression + // non-negative when individual sub-additions are non-negative even + // though the unexpanded original (e.g. `224*(-3 + (i%7) + 2*j)`) has + // a negative constant inside the stride product. Without this step, + // delinearize's `is_nonneg(best_index, ...)` gate rejects valid + // accesses like im2col with halo offsets. + if (SymEngine::is_a(*index)) { + sym::Expression nonconstant = sym::zero(); + sym::Expression constant_part = sym::zero(); + for (const auto& sub : index->get_args()) { + bool sub_has_indvar = false; + for (auto& s : sym::atoms(sub)) { + if (params.count(s) == 0) { + sub_has_indvar = true; + break; + } + } + if (sub_has_indvar) { + nonconstant = sym::add(nonconstant, sub); + } else { + constant_part = sym::add(constant_part, sub); + } + } + if (!sym::eq(constant_part, sym::zero())) { + constant_offset = sym::add(constant_offset, sym::mul(stride, constant_part)); + if (sym::eq(nonconstant, sym::zero())) { + continue; + } + index = nonconstant; + } + } + add_to_group(stride, index); } return true; diff --git a/sdfg/tests/analysis/memory_layout_analysis_test.cpp b/sdfg/tests/analysis/memory_layout_analysis_test.cpp index daadd120c..09aba5ce5 100644 --- a/sdfg/tests/analysis/memory_layout_analysis_test.cpp +++ b/sdfg/tests/analysis/memory_layout_analysis_test.cpp @@ -2517,3 +2517,537 @@ TEST(MemoryLayoutAnalysisTest, ScopeAPI_TileGroups_NonLoopScope) { ASSERT_NE(groups_root, nullptr); EXPECT_FALSE(groups_root->empty()); } + +// ----------------------------------------------------------------------------- +// Regression tests targeting the ResNet im2col offload regression. +// +// The failing kernel is a 2D collapsed map writing `_patches0` from `_1`. The +// subscripts involve `i % C` and `i / C` of plain (constant-bound) indvars, +// and the dataflow lives inside an `IfElse` with branch-disjoint reads. The +// tests below isolate the smallest MLA patterns that should still produce a +// tile bound (`tile(map, container)` and `contiguous_range()`); each one will +// expose a separate gap if MLA regresses again. +// ----------------------------------------------------------------------------- + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_ModSubscript) { + // for i in [0, 1024): A[i % 16] -- pointer access with a single mod. + // Outer tile for A should be bounded by [0, 15]. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("A", pointer_type, true); + + auto i = symbolic::symbol("i"); + auto& map = builder.add_map( + root, + i, + symbolic::Lt(i, symbolic::integer(1024)), + symbolic::zero(), + symbolic::add(i, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto& block = builder.add_block(map.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic::mod(i, symbolic::integer(16)); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_map = analysis.tile(map, "A"); + ASSERT_NE(tile_map, nullptr) << "MLA could not bound A[i % 16] over the map scope."; + auto range = tile_map->contiguous_range(); + EXPECT_FALSE(range.first.is_null()); + EXPECT_FALSE(range.second.is_null()); +} + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_DivSubscript) { + // for i in [0, 1024): A[i / 16] -- single floor-div subscript. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("A", pointer_type, true); + + auto i = symbolic::symbol("i"); + auto& map = builder.add_map( + root, + i, + symbolic::Lt(i, symbolic::integer(1024)), + symbolic::zero(), + symbolic::add(i, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto& block = builder.add_block(map.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic::div(i, symbolic::integer(16)); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_map = analysis.tile(map, "A"); + ASSERT_NE(tile_map, nullptr) << "MLA could not bound A[i / 16] over the map scope."; + auto range = tile_map->contiguous_range(); + EXPECT_FALSE(range.first.is_null()); + EXPECT_FALSE(range.second.is_null()); +} + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_MixedModDivStrided) { + // for i in [0, 401408): for j in [0, 147): + // A[150528*(i/12544) + 50176*(j/49) + 224*((i/112)%112) + (i%112)] + // -- mod/div linear combination, the form produced by collapsing four + // outer loops to one in resnet's im2col. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("A", pointer_type, true); + + auto i = symbolic::symbol("i"); + auto j = symbolic::symbol("j"); + auto N = symbolic::integer(401408); + auto M = symbolic::integer(147); + + auto& outer = builder.add_map( + root, + i, + symbolic::Lt(i, N), + symbolic::zero(), + symbolic::add(i, symbolic::one()), + ScheduleType_Sequential::create() + ); + auto& inner = builder.add_map( + outer.root(), + j, + symbolic::Lt(j, M), + symbolic::zero(), + symbolic::add(j, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto& block = builder.add_block(inner.root()); + auto& a_in = builder.add_access(block, "A"); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + auto idx = symbolic:: + add(symbolic:: + add(symbolic::mul(symbolic::integer(150528), symbolic::div(i, symbolic::integer(12544))), + symbolic::mul(symbolic::integer(50176), symbolic::div(j, symbolic::integer(49)))), + symbolic:: + add(symbolic:: + mul(symbolic::integer(224), + symbolic::mod(symbolic::div(i, symbolic::integer(112)), symbolic::integer(112))), + symbolic::mod(i, symbolic::integer(112)))); + builder.add_computational_memlet(block, a_in, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {idx}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_outer = analysis.tile(outer, "A"); + ASSERT_NE(tile_outer, nullptr) << "MLA could not bound the collapsed im2col-style mod/div access at the outer map."; + auto range = tile_outer->contiguous_range(); + EXPECT_FALSE(range.first.is_null()); + EXPECT_FALSE(range.second.is_null()); +} + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_AccessInsideIfElse) { + // for i in [0, 1024): if (i < 16) A[i] = 0; + // The map body contains an IfElse rather than a Block; the pointer access + // lives inside one branch only. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("A", pointer_type, true); + + auto i = symbolic::symbol("i"); + auto& map = builder.add_map( + root, + i, + symbolic::Lt(i, symbolic::integer(1024)), + symbolic::zero(), + symbolic::add(i, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto& ife = builder.add_if_else(map.root()); + auto& taken = builder.add_case(ife, symbolic::Lt(i, symbolic::integer(16))); + auto& not_taken = builder.add_case(ife, symbolic::Ge(i, symbolic::integer(16))); + auto& block = builder.add_block(taken); + auto& constant = builder.add_constant(block, "0.0f", scalar_type); + auto& a_out = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, constant, tasklet, "_in", {}, scalar_type); + builder.add_computational_memlet(block, tasklet, "_out", a_out, {i}); + // Suppress unused-variable warning for the empty else case. + (void) not_taken; + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_map = analysis.tile(map, "A"); + ASSERT_NE(tile_map, nullptr) << "MLA returns nullptr when the only access to A lives in one IfElse branch."; + auto range = tile_map->contiguous_range(); + EXPECT_FALSE(range.first.is_null()); + EXPECT_FALSE(range.second.is_null()); +} + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_TwoArgsIfElseBranchAsymmetric) { + // Map body is an IfElse with two cases, both writing to `_patches`: + // if (i < 16): _patches[i] = _1[i]; (reads _1) + // if (i >= 16): _patches[i] = 0.0f; (does NOT read _1) + // This mirrors the resnet asymmetry: one container is accessed in both + // branches, another in only one. MLA must still bound both at the map. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("_1", pointer_type, true); + builder.add_container("_patches", pointer_type, true); + + auto i = symbolic::symbol("i"); + auto& map = builder.add_map( + root, + i, + symbolic::Lt(i, symbolic::integer(1024)), + symbolic::zero(), + symbolic::add(i, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto& ife = builder.add_if_else(map.root()); + auto& taken = builder.add_case(ife, symbolic::Lt(i, symbolic::integer(16))); + auto& not_taken = builder.add_case(ife, symbolic::Ge(i, symbolic::integer(16))); + + { + auto& block = builder.add_block(taken); + auto& in_node = builder.add_access(block, "_1"); + auto& out_node = builder.add_access(block, "_patches"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, in_node, tasklet, "_in", {i}); + builder.add_computational_memlet(block, tasklet, "_out", out_node, {i}); + } + { + auto& block = builder.add_block(not_taken); + auto& constant = builder.add_constant(block, "0.0f", scalar_type); + auto& out_node = builder.add_access(block, "_patches"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, constant, tasklet, "_in", {}, scalar_type); + builder.add_computational_memlet(block, tasklet, "_out", out_node, {i}); + } + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_in = analysis.tile(map, "_1"); + ASSERT_NE(tile_in, nullptr) << "MLA could not bound _1 (one-branch read) at the map scope."; + auto range_in = tile_in->contiguous_range(); + EXPECT_FALSE(range_in.first.is_null()); + EXPECT_FALSE(range_in.second.is_null()); + + auto* tile_out = analysis.tile(map, "_patches"); + ASSERT_NE(tile_out, nullptr) << "MLA could not bound _patches (both-branch write) at the map scope."; + auto range_out = tile_out->contiguous_range(); + EXPECT_FALSE(range_out.first.is_null()); + EXPECT_FALSE(range_out.second.is_null()); +} + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_NegativeOffsetSubscript) { + // Map body has an IfElse guarding a negative-offset read: + // if (i >= 3 && i < 224 + 3): _patches[i-3] = _1[i-3]; + // A common simplification result of im2col padding logic. The subscript + // (i - 3) reaches -2..-1 at the loop's lower bound when the guard is + // ignored. MLA's BoundAnalysis must respect the loop range and yield a + // sound [-3, last] tile, not give up because of the negative offset. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("_1", pointer_type, true); + builder.add_container("_patches", pointer_type, true); + + auto i = symbolic::symbol("i"); + auto& map = builder.add_map( + root, + i, + symbolic::Lt(i, symbolic::integer(230)), + symbolic::zero(), + symbolic::add(i, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto& ife = builder.add_if_else(map.root()); + auto& taken = + builder + .add_case(ife, symbolic::And(symbolic::Ge(i, symbolic::integer(3)), symbolic::Lt(i, symbolic::integer(227)))); + auto& not_taken = + builder + .add_case(ife, symbolic::Or(symbolic::Lt(i, symbolic::integer(3)), symbolic::Ge(i, symbolic::integer(227)))); + + auto idx = symbolic::sub(i, symbolic::integer(3)); + { + auto& block = builder.add_block(taken); + auto& in_node = builder.add_access(block, "_1"); + auto& out_node = builder.add_access(block, "_patches"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, in_node, tasklet, "_in", {idx}); + builder.add_computational_memlet(block, tasklet, "_out", out_node, {idx}); + } + { + auto& block = builder.add_block(not_taken); + auto& constant = builder.add_constant(block, "0.0f", scalar_type); + auto& out_node = builder.add_access(block, "_patches"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, constant, tasklet, "_in", {}, scalar_type); + builder.add_computational_memlet(block, tasklet, "_out", out_node, {idx}); + } + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_in = analysis.tile(map, "_1"); + ASSERT_NE(tile_in, nullptr) << "MLA returned nullptr for _1 with negative-offset (i-3) subscript inside IfElse."; + auto range_in = tile_in->contiguous_range(); + EXPECT_FALSE(range_in.first.is_null()); + EXPECT_FALSE(range_in.second.is_null()); + + auto* tile_out = analysis.tile(map, "_patches"); + ASSERT_NE(tile_out, nullptr) + << "MLA returned nullptr for _patches with negative-offset (i-3) subscript inside IfElse."; + auto range_out = tile_out->contiguous_range(); + EXPECT_FALSE(range_out.first.is_null()); + EXPECT_FALSE(range_out.second.is_null()); +} + +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_ResNetFullPattern) { + // Faithful reproduction of the resnet kernel that stopped offloading: + // for n0 in [0, 401408): + // for c0 in [0, 147): + // if (in-bounds for _1 read): _patches0[Pn0c0] = _1[In0c0] + // elif (out-of-bounds): _patches0[Pn0c0] = 0 + // where In0c0 / Pn0c0 are the mod/div linear combos from the failing + // kernel. Both _1 and _patches0 must be bounded at the outer map. + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("_n0_collapsed0", index_type); + builder.add_container("_c0_collapsed0", index_type); + builder.add_container("_1", pointer_type, true); + builder.add_container("_patches0", pointer_type, true); + + auto n0 = symbolic::symbol("_n0_collapsed0"); + auto c0 = symbolic::symbol("_c0_collapsed0"); + auto i7 = symbolic::integer(7); + auto i112 = symbolic::integer(112); + auto i49 = symbolic::integer(49); + auto i12544 = symbolic::integer(12544); + auto i224 = symbolic::integer(224); + auto i147 = symbolic::integer(147); + auto i16464 = symbolic::integer(16464); + auto i50176 = symbolic::integer(50176); + auto i150528 = symbolic::integer(150528); + auto i1843968 = symbolic::integer(1843968); + auto i_neg3 = symbolic::integer(-3); + auto i2 = symbolic::integer(2); + + auto& outer = builder.add_map( + root, + n0, + symbolic::Lt(n0, symbolic::integer(401408)), + symbolic::zero(), + symbolic::add(n0, symbolic::one()), + ScheduleType_Sequential::create() + ); + auto& inner = builder.add_map( + outer.root(), + c0, + symbolic::Lt(c0, i147), + symbolic::zero(), + symbolic::add(c0, symbolic::one()), + ScheduleType_Sequential::create() + ); + + auto kh_mod = symbolic::mod(symbolic::div(c0, i7), i7); + auto kw_mod = symbolic::mod(c0, i7); + auto hout_mod = symbolic::mod(symbolic::div(n0, i112), i112); + auto wout_mod = symbolic::mod(n0, i112); + auto c_div = symbolic::div(c0, i49); + auto n_div = symbolic::div(n0, i12544); + + // h_in = -3 + ((c0/7)%7) + 2*((n0/112)%112) + auto h_in = symbolic::add(i_neg3, symbolic::add(kh_mod, symbolic::mul(i2, hout_mod))); + // w_in = -3 + (c0%7) + 2*(n0%112) + auto w_in = symbolic::add(i_neg3, symbolic::add(kw_mod, symbolic::mul(i2, wout_mod))); + + auto cond_in = symbolic:: + And(symbolic::And(symbolic::Ge(w_in, symbolic::zero()), symbolic::Ge(h_in, symbolic::zero())), + symbolic::And(symbolic::Lt(w_in, i224), symbolic::Lt(h_in, i224))); + auto cond_out = symbolic:: + Or(symbolic::Or(symbolic::Ge(w_in, i224), symbolic::Ge(h_in, i224)), + symbolic::Or(symbolic::Lt(w_in, symbolic::zero()), symbolic::Lt(h_in, symbolic::zero()))); + + auto& ife = builder.add_if_else(inner.root()); + auto& case_in = builder.add_case(ife, cond_in); + auto& case_out = builder.add_case(ife, cond_out); + + // patches index: 49*(c0/49) + 1843968*(n0/12544) + (c0%7) + 147*(n0%112) + // + 7*((c0/7)%7) + 16464*((n0/112)%112) + auto out_idx = symbolic:: + add(symbolic:: + add(symbolic::add(symbolic::mul(i49, c_div), symbolic::mul(i1843968, n_div)), + symbolic::add(kw_mod, symbolic::mul(i147, wout_mod))), + symbolic::add(symbolic::mul(i7, kh_mod), symbolic::mul(i16464, hout_mod))); + // _1 index: -3 + 224*h_in + 50176*c_div + 150528*n_div + (c0%7) + 2*(n0%112) + auto in_idx = symbolic:: + add(i_neg3, + symbolic:: + add(symbolic::add(symbolic::mul(i224, h_in), symbolic::mul(i50176, c_div)), + symbolic::add(symbolic::mul(i150528, n_div), symbolic::add(kw_mod, symbolic::mul(i2, wout_mod))))); + + { + auto& block = builder.add_block(case_in); + auto& in_node = builder.add_access(block, "_1"); + auto& out_node = builder.add_access(block, "_patches0"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, in_node, tasklet, "_in", {in_idx}); + builder.add_computational_memlet(block, tasklet, "_out", out_node, {out_idx}); + } + { + auto& block = builder.add_block(case_out); + auto& constant = builder.add_constant(block, "0.0f", scalar_type); + auto& out_node = builder.add_access(block, "_patches0"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, constant, tasklet, "_in", {}, scalar_type); + builder.add_computational_memlet(block, tasklet, "_out", out_node, {out_idx}); + } + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile_1 = analysis.tile(outer, "_1"); + ASSERT_NE(tile_1, nullptr) << "MLA could not bound _1 at the outer collapsed map (resnet im2col pattern)."; + auto r1 = tile_1->contiguous_range(); + EXPECT_FALSE(r1.first.is_null()); + EXPECT_FALSE(r1.second.is_null()); + + auto* tile_p = analysis.tile(outer, "_patches0"); + ASSERT_NE(tile_p, nullptr) << "MLA could not bound _patches0 at the outer collapsed map (resnet im2col pattern)."; + auto rp = tile_p->contiguous_range(); + EXPECT_FALSE(rp.first.is_null()); + EXPECT_FALSE(rp.second.is_null()); +} + +// MINIMAL reproduction of the delinearization bug found while debugging +// Regression_Im2col_ResNetFullPattern. The kernel feeds `A[224*(-3 + (i%7) + +// 2*j) + (i%7)]` over `(i,j) in [0,7) x [0,112)` to MLA. Without expansion of +// the parameter*indvar product, `decompose_by_stride` produces a group whose +// index is `-3 + (i%7) + 2*j`, which can take the value -3. The non-negativity +// gate in `delinearize` then rejects the access and no tile is built. +TEST(MemoryLayoutAnalysisTest, Regression_Im2col_NegativeConstInsideStrideProduct) { + builder::StructuredSDFGBuilder builder("sdfg_test", FunctionType_CPU); + + auto& sdfg = builder.subject(); + auto& root = sdfg.root(); + + types::Scalar index_type(types::PrimitiveType::Int64); + types::Scalar scalar_type(types::PrimitiveType::Float); + types::Pointer pointer_type(scalar_type); + builder.add_container("i", index_type); + builder.add_container("j", index_type); + builder.add_container("A", pointer_type, true); + + auto i_sym = symbolic::symbol("i"); + auto j_sym = symbolic::symbol("j"); + + auto& outer = builder.add_map( + root, + i_sym, + symbolic::Lt(i_sym, symbolic::integer(7)), + symbolic::zero(), + symbolic::add(i_sym, symbolic::one()), + ScheduleType_Sequential::create() + ); + auto& inner = builder.add_map( + outer.root(), + j_sym, + symbolic::Lt(j_sym, symbolic::integer(112)), + symbolic::zero(), + symbolic::add(j_sym, symbolic::one()), + ScheduleType_Sequential::create() + ); + + // idx = 224 * (-3 + (i%7) + 2*j) + (i%7) + auto idx = symbolic::add( + symbolic::mul( + symbolic::integer(224), + symbolic::add( + symbolic::integer(-3), + symbolic::add(symbolic::mod(i_sym, symbolic::integer(7)), symbolic::mul(symbolic::integer(2), j_sym)) + ) + ), + symbolic::mod(i_sym, symbolic::integer(7)) + ); + + auto& block = builder.add_block(inner.root()); + auto& a_node = builder.add_access(block, "A"); + auto& tasklet = builder.add_tasklet(block, data_flow::TaskletCode::assign, "_out", {"_in"}); + builder.add_computational_memlet(block, a_node, tasklet, "_in", {idx}); + auto& sink = builder.add_access(block, "A"); + builder.add_computational_memlet(block, tasklet, "_out", sink, {idx}); + + analysis::AnalysisManager analysis_manager(sdfg); + auto& analysis = analysis_manager.get(); + + auto* tile = analysis.tile(outer, "A"); + ASSERT_NE(tile, nullptr) << "MLA cannot bound A: stride * (negative_const + indvar...) breaks delinearization."; + auto r = tile->contiguous_range(); + EXPECT_FALSE(r.first.is_null()); + EXPECT_FALSE(r.second.is_null()); +} From 1bca01c97a793ccddfdf75ab627e01643a81bcc6 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Tue, 9 Jun 2026 10:27:34 +0200 Subject: [PATCH 10/20] Enable profiling from script --- .../torch/model_zoo/segformer_test.py | 59 +++++++++++++++++-- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index 3601feccf..e40fcc3cf 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -190,6 +190,29 @@ def setup_segformer_benchmark(model_name): example_input = torch.randn(1, 3, 512, 512) return model, example_input + +def profile_segformer( + model_name, + backend="torch", + target="none", + device="cpu", + n_runs=10, + image_size=512, + trace_prefix="segformer_trace", +): + from segformer_profile import setup_segformer, run_torch_profile, run_docc_profile + + model, model_input = setup_segformer(model_name, device, image_size) + if backend == "torch": + run_torch_profile(model, model_input, n_runs, trace_prefix) + elif backend == "docc": + run_docc_profile(model, model_input, n_runs, target) + elif backend == "both": + run_torch_profile(model, model_input, n_runs, trace_prefix) + run_docc_profile(model, model_input, n_runs, target) + else: + raise ValueError(f"Unsupported backend '{backend}' for profiling") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="segformer benchmark") parser.add_argument( @@ -208,16 +231,16 @@ def setup_segformer_benchmark(model_name): parser.add_argument( "--action", type=str, - choices=["dialects", "benchmark", "benchmark_segformer"], + choices=["dialects", "benchmark", "benchmark_segformer", "profile"], default="benchmark", help="Run dialect dump or harness benchmark", ) parser.add_argument( "--backend", type=str, - choices=["torch", "docc"], + choices=["torch", "docc", "both"], default="torch", - help="Backend for --action benchmark_segformer", + help="Backend for --action benchmark_segformer/profile", ) parser.add_argument( "--target", @@ -230,7 +253,25 @@ def setup_segformer_benchmark(model_name): type=str, choices=["cpu", "cuda"], default="cpu", - help="Tensor/model device for --action benchmark_segformer", + help="Tensor/model device for --action benchmark_segformer/profile", + ) + parser.add_argument( + "--n_runs", + type=int, + default=10, + help="Number of runs for --action profile", + ) + parser.add_argument( + "--image_size", + type=int, + default=512, + help="Input image size for --action profile", + ) + parser.add_argument( + "--trace_prefix", + type=str, + default="segformer_trace", + help="Trace file prefix for --action profile torch runs", ) args, remaining = parser.parse_known_args() model_name = resolve_model_name(args.version, args.model) @@ -246,6 +287,16 @@ def setup_segformer_benchmark(model_name): target=args.target, device=args.device, ) + elif args.action == "profile": + profile_segformer( + model_name, + backend=args.backend, + target=args.target, + device=args.device, + n_runs=args.n_runs, + image_size=args.image_size, + trace_prefix=args.trace_prefix, + ) else: sys.argv = [sys.argv[0]] + remaining from functools import partial From a52fddf33fd3711459f841172141f6032d038555 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Tue, 9 Jun 2026 11:15:38 +0200 Subject: [PATCH 11/20] removes peeling trick --- sdfg/src/symbolic/delinearization.cpp | 34 --------------------------- 1 file changed, 34 deletions(-) diff --git a/sdfg/src/symbolic/delinearization.cpp b/sdfg/src/symbolic/delinearization.cpp index bacd91771..b6f6c76db 100644 --- a/sdfg/src/symbolic/delinearization.cpp +++ b/sdfg/src/symbolic/delinearization.cpp @@ -158,40 +158,6 @@ bool decompose_by_stride( } } - // If the indvar-side index is an Add with constant (no-indvar) subterms, - // peel those subterms out and fold `stride * constant_part` into the - // global constant_offset. This keeps the per-group index expression - // non-negative when individual sub-additions are non-negative even - // though the unexpanded original (e.g. `224*(-3 + (i%7) + 2*j)`) has - // a negative constant inside the stride product. Without this step, - // delinearize's `is_nonneg(best_index, ...)` gate rejects valid - // accesses like im2col with halo offsets. - if (SymEngine::is_a(*index)) { - sym::Expression nonconstant = sym::zero(); - sym::Expression constant_part = sym::zero(); - for (const auto& sub : index->get_args()) { - bool sub_has_indvar = false; - for (auto& s : sym::atoms(sub)) { - if (params.count(s) == 0) { - sub_has_indvar = true; - break; - } - } - if (sub_has_indvar) { - nonconstant = sym::add(nonconstant, sub); - } else { - constant_part = sym::add(constant_part, sub); - } - } - if (!sym::eq(constant_part, sym::zero())) { - constant_offset = sym::add(constant_offset, sym::mul(stride, constant_part)); - if (sym::eq(nonconstant, sym::zero())) { - continue; - } - index = nonconstant; - } - } - add_to_group(stride, index); } return true; From 27e8c0b19a192951b4935b9585ab25e08398f03b Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Tue, 9 Jun 2026 17:26:33 +0200 Subject: [PATCH 12/20] set llvm test timeout to 6min --- llvm/integration/llvm_test_suite.py | 1055 +++++++++++++++++++++------ 1 file changed, 849 insertions(+), 206 deletions(-) diff --git a/llvm/integration/llvm_test_suite.py b/llvm/integration/llvm_test_suite.py index b0b42d697..541f2bf26 100644 --- a/llvm/integration/llvm_test_suite.py +++ b/llvm/integration/llvm_test_suite.py @@ -5,19 +5,25 @@ from pathlib import Path + # This method clones / fetches the llvm-test-suite repository @pytest.fixture(scope="session") def setup(): # The commit sha on which the llvm-test-suite is fixed COMMIT = "f711e105d94c4819d3bc8f399f06f22d4df49421" - # Check the repository dir repo_dir = Path(__file__).parent / "llvm-test-suite" if repo_dir.exists(): # The repository already exists, check that its a folder - assert repo_dir.is_dir(), "The repository path already exists but is not a directory: " + str(repo_dir) - assert (repo_dir / ".git").is_dir(), "The repository dir already exists but is not a git repository: " + str(repo_dir) + assert ( + repo_dir.is_dir() + ), "The repository path already exists but is not a directory: " + str(repo_dir) + assert ( + repo_dir / ".git" + ).is_dir(), "The repository dir already exists but is not a git repository: " + str( + repo_dir + ) # Fetch all fetch_process = subprocess.Popen( ["git", "fetch", "-q", "--all"], @@ -27,18 +33,28 @@ def setup(): cwd=repo_dir, ) stdout, stderr = fetch_process.communicate() - assert fetch_process.returncode == 0, "Could not fetch the llvm-test-suite repository" + assert ( + fetch_process.returncode == 0 + ), "Could not fetch the llvm-test-suite repository" else: # The repository does not exist # We need to clone it clone_process = subprocess.Popen( - ["git", "clone", "-q", "https://github.com/llvm/llvm-test-suite.git", str(repo_dir)], + [ + "git", + "clone", + "-q", + "https://github.com/llvm/llvm-test-suite.git", + str(repo_dir), + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, ) stdout, stderr = clone_process.communicate() - assert clone_process.returncode == 0, "Could not clone the llvm-test-suite repository" + assert ( + clone_process.returncode == 0 + ), "Could not clone the llvm-test-suite repository" # Now, we have to checkout the specified branch / commit checkout_process = subprocess.Popen( @@ -49,12 +65,16 @@ def setup(): cwd=repo_dir, ) stdout, stderr = checkout_process.communicate() - assert checkout_process.returncode == 0, "Could not checkout the llvm-test-suite repository to commit: " + COMMIT + assert checkout_process.returncode == 0, ( + "Could not checkout the llvm-test-suite repository to commit: " + COMMIT + ) # Check the build dir build_dir = repo_dir / "build" if build_dir.exists(): - assert build_dir.is_dir(), "The build path already exists but is not a directory: " + str(build_dir) + assert ( + build_dir.is_dir() + ), "The build path already exists but is not a directory: " + str(build_dir) else: # Create the buil dir os.mkdir(str(build_dir)) @@ -67,7 +87,8 @@ def setup(): "-DCMAKE_CXX_COMPILER=docc-cpp", "-DTEST_SUITE_BENCHMARKING_ONLY=ON", "-DTEST_SUITE_COLLECT_CODE_SIZE=OFF", - "-C", "../cmake/caches/O2.cmake", + "-C", + "../cmake/caches/O2.cmake", "-DTEST_SUITE_SUBDIRS=SingleSource;MultiSource", str(repo_dir), ], @@ -98,23 +119,28 @@ def setup(): yield repo_dir, build_dir + # Each test is listed in the parameters # Options for compiles: # YES = The test compiles -# TIMEOUT = The compilation timeouts (5 min) +# TIMEOUT = The compilation timeouts (6 min) # OUT_OF_MEMORY = The compiler's memory usage crashes the system # SEGFAULT = The compiler segfaults # Options for executes: # PASS = The test execution passes -# TIMEOUT = The test execution timeouts (5 min) +# TIMEOUT = The test execution timeouts (6 min) # FAIL = The test execution fails because the result is wrong or the application crashes # FLAKY = The test execution sometimes passes, sometimes fails @pytest.mark.parametrize( "path, name, compiles, executes", [ pytest.param("MultiSource/Applications/aha", "aha", "YES", "PASS"), - pytest.param("MultiSource/Applications/ALAC/decode", "alacconvert-decode", "SEGFAULT", ""), - pytest.param("MultiSource/Applications/ALAC/encode", "alacconvert-encode", "SEGFAULT", ""), + pytest.param( + "MultiSource/Applications/ALAC/decode", "alacconvert-decode", "SEGFAULT", "" + ), + pytest.param( + "MultiSource/Applications/ALAC/encode", "alacconvert-encode", "SEGFAULT", "" + ), pytest.param("MultiSource/Applications/ClamAV", "clamscan", "SEGFAULT", ""), pytest.param("MultiSource/Applications/d", "make_dparser", "TIMEOUT", ""), pytest.param("MultiSource/Applications/hbd", "hbd", "YES", "PASS"), @@ -135,43 +161,110 @@ def setup(): pytest.param("MultiSource/Applications/sqlite3", "sqlite3", "SEGFAULT", ""), pytest.param("MultiSource/Applications/viterbi", "viterbi", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/7zip", "7zip-benchmark", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/ASC_Sequoia/AMGmk", "AMGmk", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/ASC_Sequoia/CrystalMk", "CrystalMk", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/ASC_Sequoia/IRSmk", "IRSmk", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/ASCI_Purple/SMG2000", "smg2000", "TIMEOUT", ""), + pytest.param( + "MultiSource/Benchmarks/ASC_Sequoia/AMGmk", "AMGmk", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/ASC_Sequoia/CrystalMk", "CrystalMk", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/ASC_Sequoia/IRSmk", "IRSmk", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/ASCI_Purple/SMG2000", "smg2000", "TIMEOUT", "" + ), pytest.param("MultiSource/Benchmarks/BitBench/drop3", "drop3", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/BitBench/five11", "five11", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/BitBench/uudecode", "uudecode", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/BitBench/uuencode", "uuencode", "YES", "FAIL"), + pytest.param( + "MultiSource/Benchmarks/BitBench/uudecode", "uudecode", "YES", "FAIL" + ), + pytest.param( + "MultiSource/Benchmarks/BitBench/uuencode", "uuencode", "YES", "FAIL" + ), pytest.param("MultiSource/Benchmarks/Bullet", "bullet", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD", "CoMD", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR", "miniAMR", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG", "miniGMG", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/Pathfinder", "PathFinder", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/RSBench", "rsbench", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC", "SimpleMOC", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench", "XSBench", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR", "CLAMR", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C++/HACCKernels", "HACCKernels", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG", "HPCCG", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE", "miniFE", "YES", "FLAKY"), - pytest.param("MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT", "PENNANT", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Fhourstones", "fhourstones", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/Fhourstones-3.1", "fhourstones3.1", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/FreeBench/analyzer", "analyzer", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/FreeBench/distray", "distray", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/FreeBench/fourinarow", "fourinarow", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD", "CoMD", "SEGFAULT", "" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR", "miniAMR", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG", "miniGMG", "YES", "FAIL" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/Pathfinder", + "PathFinder", + "TIMEOUT", + "", + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/RSBench", "rsbench", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC", + "SimpleMOC", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench", "XSBench", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR", "CLAMR", "TIMEOUT", "" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C++/HACCKernels", + "HACCKernels", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG", "HPCCG", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE", "miniFE", "YES", "FLAKY" + ), + pytest.param( + "MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT", "PENNANT", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Fhourstones", "fhourstones", "YES", "FAIL" + ), + pytest.param( + "MultiSource/Benchmarks/Fhourstones-3.1", "fhourstones3.1", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/FreeBench/analyzer", "analyzer", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/FreeBench/distray", "distray", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/FreeBench/fourinarow", "fourinarow", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/FreeBench/mason", "mason", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/FreeBench/neural", "neural", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/FreeBench/pcompress2", "pcompress2", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/FreeBench/pifft", "pifft", "YES", "TIMEOUT"), + pytest.param( + "MultiSource/Benchmarks/FreeBench/neural", "neural", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/FreeBench/pcompress2", "pcompress2", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/FreeBench/pifft", "pifft", "YES", "TIMEOUT" + ), pytest.param("MultiSource/Benchmarks/llubenchmark", "llu", "YES", "FAIL"), pytest.param("MultiSource/Benchmarks/mafft", "pairlocalalign", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/MallocBench/cfrac", "cfrac", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/MallocBench/espresso", "espresso", "YES", "FAIL"), + pytest.param( + "MultiSource/Benchmarks/MallocBench/cfrac", "cfrac", "TIMEOUT", "" + ), + pytest.param( + "MultiSource/Benchmarks/MallocBench/espresso", "espresso", "YES", "FAIL" + ), pytest.param("MultiSource/Benchmarks/MallocBench/gs", "gs", "SEGFAULT", ""), pytest.param("MultiSource/Benchmarks/McCat/01-qbsort", "qbsort", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/McCat/03-testtrie", "testtrie", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/McCat/03-testtrie", "testtrie", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/McCat/04-bisect", "bisect", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/McCat/05-eks", "eks", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/McCat/08-main", "main", "SEGFAULT", ""), @@ -179,25 +272,105 @@ def setup(): pytest.param("MultiSource/Benchmarks/McCat/12-IOtest", "iotest", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/McCat/17-bintr", "bintr", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/McCat/18-imp", "imp", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/mediabench/adpcm/rawcaudio", "rawcaudio", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/mediabench/adpcm/rawdaudio", "rawdaudio", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/mediabench/g721/g721encode", "encode", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/mediabench/gsm/toast", "toast", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a", "cjpeg", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec", "mpeg2decode", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/MiBench/automotive-basicmath", "automotive-basicmath", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/automotive-bitcount", "automotive-bitcount", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/automotive-susan", "automotive-susan", "OUT_OF_MEMORY", ""), - pytest.param("MultiSource/Benchmarks/MiBench/consumer-jpeg", "consumer-jpeg", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/MiBench/consumer-lame", "consumer-lame", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/MiBench/consumer-typeset", "consumer-typeset", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/MiBench/network-dijkstra", "network-dijkstra", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/network-patricia", "network-patricia", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/security-rijndael", "security-rijndael", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/security-sha", "security-sha", "YES", "FAIL"), - pytest.param("MultiSource/Benchmarks/MiBench/telecomm-CRC32", "telecomm-CRC32", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/telecomm-FFT", "telecomm-fft", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/MiBench/telecomm-gsm", "telecomm-gsm", "SEGFAULT", ""), + pytest.param( + "MultiSource/Benchmarks/mediabench/adpcm/rawcaudio", + "rawcaudio", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/mediabench/adpcm/rawdaudio", + "rawdaudio", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/mediabench/g721/g721encode", "encode", "YES", "FAIL" + ), + pytest.param( + "MultiSource/Benchmarks/mediabench/gsm/toast", "toast", "SEGFAULT", "" + ), + pytest.param( + "MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a", "cjpeg", "SEGFAULT", "" + ), + pytest.param( + "MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec", + "mpeg2decode", + "YES", + "FAIL", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/automotive-basicmath", + "automotive-basicmath", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/automotive-bitcount", + "automotive-bitcount", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/automotive-susan", + "automotive-susan", + "OUT_OF_MEMORY", + "", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/consumer-jpeg", + "consumer-jpeg", + "SEGFAULT", + "", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/consumer-lame", + "consumer-lame", + "SEGFAULT", + "", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/consumer-typeset", + "consumer-typeset", + "TIMEOUT", + "", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/network-dijkstra", + "network-dijkstra", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/network-patricia", + "network-patricia", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/security-rijndael", + "security-rijndael", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/security-sha", "security-sha", "YES", "FAIL" + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/telecomm-CRC32", + "telecomm-CRC32", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/telecomm-FFT", "telecomm-fft", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/MiBench/telecomm-gsm", + "telecomm-gsm", + "SEGFAULT", + "", + ), pytest.param("MultiSource/Benchmarks/nbench", "nbench", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/NPB-serial/is", "is", "YES", "FLAKY"), pytest.param("MultiSource/Benchmarks/Olden/bh", "bh", "YES", "PASS"), @@ -205,94 +378,322 @@ def setup(): pytest.param("MultiSource/Benchmarks/Olden/em3d", "em3d", "YES", "TIMEOUT"), pytest.param("MultiSource/Benchmarks/Olden/health", "health", "SEGFAULT", ""), pytest.param("MultiSource/Benchmarks/Olden/mst", "mst", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Olden/perimeter", "perimeter", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/Olden/perimeter", "perimeter", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/Olden/power", "power", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/Olden/treeadd", "treeadd", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/Olden/tsp", "tsp", "YES", "FAIL"), pytest.param("MultiSource/Benchmarks/Olden/voronoi", "voronoi", "YES", "FAIL"), pytest.param("MultiSource/Benchmarks/PAQ8p", "paq8p", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/Prolangs-C/agrep", "agrep", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/Prolangs-C/bison", "mybison", "YES", "FAIL"), + pytest.param( + "MultiSource/Benchmarks/Prolangs-C/bison", "mybison", "YES", "FAIL" + ), pytest.param("MultiSource/Benchmarks/Prolangs-C/gnugo", "gnugo", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Prolangs-C++/city", "city", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/Prolangs-C++/employ", "employ", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/Prolangs-C++/city", "city", "SEGFAULT", "" + ), + pytest.param( + "MultiSource/Benchmarks/Prolangs-C++/employ", "employ", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/Prolangs-C++/life", "life", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Prolangs-C++/ocean", "ocean", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Prolangs-C++/primes", "primes", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Prolangs-C++/simul", "simul", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Ptrdist/anagram", "anagram", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/Prolangs-C++/ocean", "ocean", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Prolangs-C++/primes", "primes", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Prolangs-C++/simul", "simul", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Ptrdist/anagram", "anagram", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/Ptrdist/bc", "bc", "YES", "FLAKY"), pytest.param("MultiSource/Benchmarks/Ptrdist/ft", "ft", "YES", "TIMEOUT"), pytest.param("MultiSource/Benchmarks/Ptrdist/ks", "ks", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/Ptrdist/yacr2", "yacr2", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Rodinia/backprop", "backprop", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Rodinia/hotspot", "hotspot", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Rodinia/pathfinder", "pathfinder", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/Rodinia/backprop", "backprop", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Rodinia/hotspot", "hotspot", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Rodinia/pathfinder", "pathfinder", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/Rodinia/srad", "srad", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/SciMark2-C", "scimark2", "YES", "FAIL"), pytest.param("MultiSource/Benchmarks/sim", "sim", "SEGFAULT", ""), pytest.param("MultiSource/Benchmarks/tramp3d-v4", "tramp3d-v4", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/Trimaran/enc-3des", "enc-3des", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Trimaran/enc-md5", "enc-md5", "TIMEOUT", ""), - pytest.param("MultiSource/Benchmarks/Trimaran/enc-pc1", "enc-pc1", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Trimaran/enc-rc4", "enc-rc4", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Trimaran/netbench-crc", "netbench-crc", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/Trimaran/netbench-url", "netbench-url", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/ControlFlow-dbl", "ControlFlow-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/ControlFlow-flt", "ControlFlow-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/ControlLoops-dbl", "ControlLoops-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/ControlLoops-flt", "ControlLoops-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/CrossingThresholds-dbl", "CrossingThresholds-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/CrossingThresholds-flt", "CrossingThresholds-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Equivalencing-dbl", "Equivalencing-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Equivalencing-flt", "Equivalencing-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Expansion-dbl", "Expansion-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Expansion-flt", "Expansion-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/GlobalDataFlow-dbl", "GlobalDataFlow-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/GlobalDataFlow-flt", "GlobalDataFlow-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/IndirectAddressing-dbl", "IndirectAddressing-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/IndirectAddressing-flt", "IndirectAddressing-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/InductionVariable-dbl", "InductionVariable-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/InductionVariable-flt", "InductionVariable-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/LinearDependence-dbl", "LinearDependence-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/LinearDependence-flt", "LinearDependence-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/LoopRerolling-dbl", "LoopRerolling-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/LoopRerolling-flt", "LoopRerolling-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/LoopRestructuring-dbl", "LoopRestructuring-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/LoopRestructuring-flt", "LoopRestructuring-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/NodeSplitting-dbl", "NodeSplitting-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/NodeSplitting-flt", "NodeSplitting-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Packing-dbl", "Packing-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Packing-flt", "Packing-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Recurrences-dbl", "Recurrences-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Recurrences-flt", "Recurrences-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Reductions-dbl", "Reductions-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Reductions-flt", "Reductions-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Searching-dbl", "Searching-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Searching-flt", "Searching-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/StatementReordering-dbl", "StatementReordering-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/StatementReordering-flt", "StatementReordering-flt", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Symbolics-dbl", "Symbolics-dbl", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/TSVC/Symbolics-flt", "Symbolics-flt", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/Trimaran/enc-3des", "enc-3des", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Trimaran/enc-md5", "enc-md5", "TIMEOUT", "" + ), + pytest.param( + "MultiSource/Benchmarks/Trimaran/enc-pc1", "enc-pc1", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Trimaran/enc-rc4", "enc-rc4", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/Trimaran/netbench-crc", + "netbench-crc", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/Trimaran/netbench-url", + "netbench-url", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/ControlFlow-dbl", + "ControlFlow-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/ControlFlow-flt", + "ControlFlow-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/ControlLoops-dbl", + "ControlLoops-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/ControlLoops-flt", + "ControlLoops-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/CrossingThresholds-dbl", + "CrossingThresholds-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/CrossingThresholds-flt", + "CrossingThresholds-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Equivalencing-dbl", + "Equivalencing-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Equivalencing-flt", + "Equivalencing-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Expansion-dbl", "Expansion-dbl", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Expansion-flt", "Expansion-flt", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/GlobalDataFlow-dbl", + "GlobalDataFlow-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/GlobalDataFlow-flt", + "GlobalDataFlow-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/IndirectAddressing-dbl", + "IndirectAddressing-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/IndirectAddressing-flt", + "IndirectAddressing-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/InductionVariable-dbl", + "InductionVariable-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/InductionVariable-flt", + "InductionVariable-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/LinearDependence-dbl", + "LinearDependence-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/LinearDependence-flt", + "LinearDependence-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/LoopRerolling-dbl", + "LoopRerolling-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/LoopRerolling-flt", + "LoopRerolling-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/LoopRestructuring-dbl", + "LoopRestructuring-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/LoopRestructuring-flt", + "LoopRestructuring-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/NodeSplitting-dbl", + "NodeSplitting-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/NodeSplitting-flt", + "NodeSplitting-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Packing-dbl", "Packing-dbl", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Packing-flt", "Packing-flt", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Recurrences-dbl", + "Recurrences-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Recurrences-flt", + "Recurrences-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Reductions-dbl", + "Reductions-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Reductions-flt", + "Reductions-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Searching-dbl", "Searching-dbl", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Searching-flt", "Searching-flt", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/StatementReordering-dbl", + "StatementReordering-dbl", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/StatementReordering-flt", + "StatementReordering-flt", + "YES", + "PASS", + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Symbolics-dbl", "Symbolics-dbl", "YES", "PASS" + ), + pytest.param( + "MultiSource/Benchmarks/TSVC/Symbolics-flt", "Symbolics-flt", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/VersaBench/8b10b", "8b10b", "YES", "PASS"), - pytest.param("MultiSource/Benchmarks/VersaBench/beamformer", "beamformer", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/VersaBench/beamformer", "beamformer", "YES", "PASS" + ), pytest.param("MultiSource/Benchmarks/VersaBench/bmm", "bmm", "YES", "PASS"), pytest.param("MultiSource/Benchmarks/VersaBench/dbms", "dbms", "SEGFAULT", ""), - pytest.param("MultiSource/Benchmarks/VersaBench/ecbdes", "ecbdes", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Adobe-C++", "functionobjects", "YES", "PASS"), + pytest.param( + "MultiSource/Benchmarks/VersaBench/ecbdes", "ecbdes", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Adobe-C++", "functionobjects", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/Adobe-C++", "loop_unroll", "YES", "FAIL"), - pytest.param("SingleSource/Benchmarks/Adobe-C++", "simple_types_constant_folding", "YES", "FAIL"), - pytest.param("SingleSource/Benchmarks/Adobe-C++", "simple_types_loop_invariant", "YES", "FAIL"), - pytest.param("SingleSource/Benchmarks/Adobe-C++", "stepanov_abstraction", "YES", "FAIL"), - pytest.param("SingleSource/Benchmarks/Adobe-C++", "stepanov_vector", "YES", "TIMEOUT"), - pytest.param("SingleSource/Benchmarks/BenchmarkGame/Large", "fasta", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/BenchmarkGame", "fannkuch", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/Adobe-C++", + "simple_types_constant_folding", + "YES", + "FAIL", + ), + pytest.param( + "SingleSource/Benchmarks/Adobe-C++", + "simple_types_loop_invariant", + "YES", + "FAIL", + ), + pytest.param( + "SingleSource/Benchmarks/Adobe-C++", "stepanov_abstraction", "YES", "FAIL" + ), + pytest.param( + "SingleSource/Benchmarks/Adobe-C++", "stepanov_vector", "YES", "TIMEOUT" + ), + pytest.param( + "SingleSource/Benchmarks/BenchmarkGame/Large", "fasta", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/BenchmarkGame", "fannkuch", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/BenchmarkGame", "n-body", "YES", "FAIL"), - pytest.param("SingleSource/Benchmarks/BenchmarkGame", "nsieve-bits", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/BenchmarkGame", "partialsums", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/BenchmarkGame", "nsieve-bits", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/BenchmarkGame", "partialsums", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/BenchmarkGame", "puzzle", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/BenchmarkGame", "recursive", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/BenchmarkGame", "spectral-norm", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/BenchmarkGame", "recursive", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/BenchmarkGame", "spectral-norm", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/CoyoteBench", "almabench", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/CoyoteBench", "fftbench", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/CoyoteBench", "huffbench", "YES", "PASS"), @@ -302,7 +703,9 @@ def setup(): pytest.param("SingleSource/Benchmarks/Linpack", "linpack-pc", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/McGill", "chomp", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/McGill", "misr", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/McGill", "queens", "TIMEOUT", ""), # Compilation sometimes flaky + pytest.param( + "SingleSource/Benchmarks/McGill", "queens", "TIMEOUT", "" + ), # Compilation sometimes flaky pytest.param("SingleSource/Benchmarks/Misc", "dt", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc", "evalloop", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc", "fbench", "YES", "PASS"), @@ -327,76 +730,298 @@ def setup(): pytest.param("SingleSource/Benchmarks/Misc", "pi", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc", "ReedSolomon", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc", "revertBits", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Misc", "richards_benchmark", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/Misc", "richards_benchmark", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/Misc", "salsa20", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc", "whetstone", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc-C++/Large", "ray", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Misc-C++/Large", "sphereflake", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/Misc-C++/Large", "sphereflake", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/Misc-C++", "bigfib", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc-C++", "mandel-text", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Misc-C++", "oopack_v1p8", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Misc-C++", "stepanov_container", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Misc-C++", "stepanov_v1p2", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/Misc-C++", "stepanov_container", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Misc-C++", "stepanov_v1p2", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/Misc-C++-EH", "spirit", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/datamining/correlation", "correlation", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/datamining/covariance", "covariance", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/blas/gemver", "gemver", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/blas/gesummv", "gesummv", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/blas/symm", "symm", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/blas/syr2k", "syr2k", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/blas/syrk", "syrk", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/blas/trmm", "trmm", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/kernels/atax", "atax", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/kernels/bicg", "bicg", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/kernels/doitgen", "doitgen", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/kernels/mvt", "mvt", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/solvers/cholesky", "cholesky", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/solvers/durbin", "durbin", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/solvers/gramschmidt", "gramschmidt", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/solvers/lu", "lu", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/solvers/ludcmp", "ludcmp", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/linear-algebra/solvers/trisolv", "trisolv", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/medley/deriche", "deriche", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/medley/floyd-warshall", "floyd-warshall", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/medley/nussinov", "nussinov", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/stencils/adi", "adi", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/stencils/fdtd-2d", "fdtd-2d", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/stencils/heat-3d", "heat-3d", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/stencils/jacobi-1d", "jacobi-1d", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/stencils/jacobi-2d", "jacobi-2d", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Polybench/stencils/seidel-2d", "seidel-2d", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-ackermann", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-ary3", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-fib2", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-hash", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-heapsort", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-lists", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-matrix", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-methcall", "SEGFAULT", ""), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-nestedloop", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-objinst", "SEGFAULT", ""), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-random", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-sieve", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout", "Shootout-strcat", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++/EH", "Shootout-C++-except", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ackermann", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ary", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ary2", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ary3", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-fibo", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-hash", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-hash2", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-heapsort", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-lists", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-lists1", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-matrix", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-methcall", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-moments", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-nestedloop", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-objinst", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-random", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-sieve", "YES", "PASS"), - pytest.param("SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-strcat", "YES", "PASS"), + pytest.param( + "SingleSource/Benchmarks/Polybench/datamining/correlation", + "correlation", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/datamining/covariance", + "covariance", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/blas/gemver", + "gemver", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/blas/gesummv", + "gesummv", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/blas/symm", + "symm", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/blas/syr2k", + "syr2k", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/blas/syrk", + "syrk", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/blas/trmm", + "trmm", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/kernels/atax", + "atax", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/kernels/bicg", + "bicg", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/kernels/doitgen", + "doitgen", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/kernels/mvt", + "mvt", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/solvers/cholesky", + "cholesky", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/solvers/durbin", + "durbin", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/solvers/gramschmidt", + "gramschmidt", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/solvers/lu", + "lu", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/solvers/ludcmp", + "ludcmp", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/linear-algebra/solvers/trisolv", + "trisolv", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/medley/deriche", "deriche", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/medley/floyd-warshall", + "floyd-warshall", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/medley/nussinov", + "nussinov", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/stencils/adi", "adi", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/stencils/fdtd-2d", + "fdtd-2d", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/stencils/heat-3d", + "heat-3d", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/stencils/jacobi-1d", + "jacobi-1d", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/stencils/jacobi-2d", + "jacobi-2d", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Polybench/stencils/seidel-2d", + "seidel-2d", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-ackermann", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-ary3", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-fib2", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-hash", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-heapsort", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-lists", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-matrix", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-methcall", "SEGFAULT", "" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-nestedloop", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-objinst", "SEGFAULT", "" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-random", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-sieve", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout", "Shootout-strcat", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++/EH", + "Shootout-C++-except", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", + "Shootout-C++-ackermann", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ary", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ary2", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-ary3", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-fibo", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-hash", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-hash2", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", + "Shootout-C++-heapsort", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-lists", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-lists1", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-matrix", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", + "Shootout-C++-methcall", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", + "Shootout-C++-moments", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", + "Shootout-C++-nestedloop", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", + "Shootout-C++-objinst", + "YES", + "PASS", + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-random", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-sieve", "YES", "PASS" + ), + pytest.param( + "SingleSource/Benchmarks/Shootout-C++", "Shootout-C++-strcat", "YES", "PASS" + ), pytest.param("SingleSource/Benchmarks/SmallPT", "smallpt", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Stanford", "Bubblesort", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Stanford", "FloatMM", "YES", "PASS"), @@ -408,7 +1033,7 @@ def setup(): pytest.param("SingleSource/Benchmarks/Stanford", "RealMM", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Stanford", "Towers", "YES", "PASS"), pytest.param("SingleSource/Benchmarks/Stanford", "Treesort", "YES", "PASS"), - ] + ], ) def test(setup, path, name, compiles, executes): repo_dir, build_dir = setup @@ -419,12 +1044,22 @@ def test(setup, path, name, compiles, executes): assert test_file.is_file(), "Test file does not exist: " + str(test_file) # Determine if all test should be tried to execute - all_tests = ("ALL" in os.environ) + all_tests = "ALL" in os.environ # Check that compiles and executes have valid values - assert compiles in ["YES", "TIMEOUT", "OUT_OF_MEMORY", "SEGFAULT"], "compiles option must be YES, TIMEOUT, OUT_OF_MEMORY, or SEGFAULT" + assert compiles in [ + "YES", + "TIMEOUT", + "OUT_OF_MEMORY", + "SEGFAULT", + ], "compiles option must be YES, TIMEOUT, OUT_OF_MEMORY, or SEGFAULT" if compiles == "YES": - assert executes in ["PASS", "TIMEOUT", "FAIL", "FLAKY"], "executes option must be PASS, TIMEOUT, FAIL, or FLAKY" + assert executes in [ + "PASS", + "TIMEOUT", + "FAIL", + "FLAKY", + ], "executes option must be PASS, TIMEOUT, FAIL, or FLAKY" # Skip if compiles == "OUT_OF_MEMORY": @@ -452,20 +1087,24 @@ def test(setup, path, name, compiles, executes): ) try: timeout = False - stdout, stderr = make_process.communicate(timeout=300) - except subprocess.TimeoutExpired: # must catch this otherwise subprocess is not killed + stdout, stderr = make_process.communicate(timeout=360) + except ( + subprocess.TimeoutExpired + ): # must catch this otherwise subprocess is not killed timeout = True if timeout: os.killpg(make_process.pid, signal.SIGTERM) if compiles == "TIMEOUT": - return # Expected this + return # Expected this pytest.fail("Compilation timed out but expected compiles = " + compiles) if make_process.returncode != 0: if compiles == "SEGFAULT": - return # Expected this + return # Expected this print("STDOUT:\n", stdout) print("STDERR:\n", stderr) - assert make_process.returncode == 0, "Compilation failed but expected compiles = " + compiles + assert make_process.returncode == 0, ( + "Compilation failed but expected compiles = " + compiles + ) if all_tests and compiles != "YES": print("Compilation succeeded but expected compiles = " + compiles) @@ -480,19 +1119,23 @@ def test(setup, path, name, compiles, executes): ) try: timeout = False - stdout, stderr = lit_process.communicate(timeout=300) - except subprocess.TimeoutExpired: # must catch this otherwise subprocess is not killed + stdout, stderr = lit_process.communicate(timeout=360) + except ( + subprocess.TimeoutExpired + ): # must catch this otherwise subprocess is not killed timeout = True if timeout: os.killpg(lit_process.pid, signal.SIGTERM) if executes == "TIMEOUT": - return # Expected this + return # Expected this pytest.fail("Execution timed out but expected executes = " + executes) if lit_process.returncode != 0: if executes == "FAIL" or executes == "FLAKY": - return # Expected this + return # Expected this print("STDOUT:\n", stdout) print("STDERR:\n", stderr) - assert lit_process.returncode == 0, "Execution failed but expected executes = " + executes + assert lit_process.returncode == 0, ( + "Execution failed but expected executes = " + executes + ) if all_tests and executes != "PASS": - print("Execution passed but expected executes = " + executes) \ No newline at end of file + print("Execution passed but expected executes = " + executes) From 5c0b9f03c07b7c6df0f122f89027d3ecd55cbd2e Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Tue, 9 Jun 2026 17:47:40 +0200 Subject: [PATCH 13/20] Add profiling and transfer tuning --- mlir/benchmarks/harness.py | 16 +- .../torch/model_zoo/segformer_profile.py | 238 ++++++++++++++++++ .../torch/model_zoo/segformer_test.py | 35 ++- 3 files changed, 281 insertions(+), 8 deletions(-) create mode 100644 mlir/benchmarks/torch/model_zoo/segformer_profile.py diff --git a/mlir/benchmarks/harness.py b/mlir/benchmarks/harness.py index 15a5d9768..7423e1b24 100644 --- a/mlir/benchmarks/harness.py +++ b/mlir/benchmarks/harness.py @@ -8,6 +8,7 @@ def run_benchmark(setup_func, name): parser.add_argument("--docc", action="store_true") parser.add_argument("--torch", action="store_true") parser.add_argument("--target", type=str, default="none") + parser.add_argument("--remote_tuning", action="store_true") parser.add_argument("--n_runs", type=int, default=10) args = parser.parse_args() @@ -26,7 +27,18 @@ def run_benchmark(setup_func, name): for _ in range(args.n_runs): start = time.time() with torch.no_grad(): - program = torch.compile(model, backend="docc", options={"target": args.target, "category": "server"}) + program = torch.compile( + model, + backend="docc", + options={ + "target": args.target, + "category": "server", + "remote_tuning": args.remote_tuning, + }, + ) program(model_input) end = time.time() - print(f"{name} docc execution time: {end - start:.6f} seconds") + print( + f"{name} docc execution time: {end - start:.6f} seconds " + f"(remote_tuning={args.remote_tuning})" + ) diff --git a/mlir/benchmarks/torch/model_zoo/segformer_profile.py b/mlir/benchmarks/torch/model_zoo/segformer_profile.py new file mode 100644 index 000000000..89849874e --- /dev/null +++ b/mlir/benchmarks/torch/model_zoo/segformer_profile.py @@ -0,0 +1,238 @@ +import argparse +import time + +import torch +from torch.profiler import ProfilerActivity, profile +from transformers import SegformerForSemanticSegmentation + +import docc.torch + + +SEGFORMER_MODELS = { + "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", + "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", + "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", + "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", + "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", + "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", +} + + +def resolve_model_name(version: str, model: str | None) -> str: + if model: + return model + return SEGFORMER_MODELS[version] + + +def _assert_cuda_arch_supported() -> None: + capability = torch.cuda.get_device_capability() + current_arch = f"sm_{capability[0]}{capability[1]}" + supported_arches = set(torch.cuda.get_arch_list()) + if current_arch not in supported_arches: + supported_str = " ".join(sorted(supported_arches)) + raise RuntimeError( + "The active PyTorch CUDA build does not support this GPU architecture " + f"({current_arch}). Supported architectures: {supported_str}. " + "Install a compatible CUDA wheel (for RTX 50xx typically cu128+), " + "or run with --device cpu." + ) + + +def setup_segformer( + model_name: str, + model_device: str, + image_size: int, + input_device: str | None = None, +) -> tuple[torch.nn.Module, torch.Tensor]: + if input_device is None: + input_device = model_device + + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + if model_device == "cuda": + if not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available") + _assert_cuda_arch_supported() + model = model.to("cuda") + + if input_device == "cuda" and not torch.cuda.is_available(): + raise RuntimeError("CUDA input requested but not available") + + model_input = torch.randn(1, 3, image_size, image_size, device=input_device) + return model, model_input + + +def _model_device(model: torch.nn.Module) -> torch.device: + try: + return next(model.parameters()).device + except StopIteration: + return torch.device("cpu") + + +def _materialize_output(res: object) -> None: + if isinstance(res, dict): + _ = {k: v.cpu() if torch.is_tensor(v) else v for k, v in res.items()} + elif hasattr(res, "logits") and torch.is_tensor(res.logits): + _ = res.logits.cpu() + + +def _run_once(program: torch.nn.Module, model_input: torch.Tensor, model_dev: torch.device) -> None: + current_input = model_input + if current_input.device != model_dev: + current_input = current_input.to(model_dev, non_blocking=True) + + res = program(pixel_values=current_input) + _materialize_output(res) + if model_dev.type == "cuda": + torch.cuda.synchronize(model_dev) + + +def run_torch_profile(model: torch.nn.Module, model_input: torch.Tensor, n_runs: int, trace_prefix: str) -> None: + model_dev = _model_device(model) + with torch.no_grad(): + compile_start = time.perf_counter() + program = torch.compile(model) + _run_once(program, model_input, model_dev) + compile_end = time.perf_counter() + print(f"Torch compile+first-run: {(compile_end - compile_start):.6f} s") + + _run_once(program, model_input, model_dev) + activities = [ProfilerActivity.CPU] + if model_dev.type == "cuda": + activities.append(ProfilerActivity.CUDA) + + for i in range(n_runs): + start = time.perf_counter() + with profile(activities=activities, record_shapes=True) as prof: + _run_once(program, model_input, model_dev) + end = time.perf_counter() + + trace_path = f"{trace_prefix}_torch_{i}.json" + prof.export_chrome_trace(trace_path) + print(f"Torch runtime run {i}: {(end - start):.6f} s, trace={trace_path}") + + +def run_docc_profile( + model: torch.nn.Module, + model_input: torch.Tensor, + n_runs: int, + target: str, + remote_tuning: bool, + trace_prefix: str, +) -> None: + model_dev = _model_device(model) + with torch.no_grad(): + compile_start = time.perf_counter() + program = torch.compile( + model, + backend="docc", + options={"target": target, "category": "server", "remote_tuning": remote_tuning}, + ) + _run_once(program, model_input, model_dev) + compile_end = time.perf_counter() + print( + f"DOCC compile+first-run ({target}, remote_tuning={remote_tuning}): " + f"{(compile_end - compile_start):.6f} s" + ) + + _run_once(program, model_input, model_dev) + activities = [ProfilerActivity.CPU] + if model_dev.type == "cuda": + activities.append(ProfilerActivity.CUDA) + + for i in range(n_runs): + start = time.perf_counter() + with profile(activities=activities, record_shapes=True) as prof: + _run_once(program, model_input, model_dev) + end = time.perf_counter() + + trace_path = f"{trace_prefix}_docc_{target}_{i}.json" + prof.export_chrome_trace(trace_path) + print(f"DOCC runtime run {i}: {(end - start):.6f} s, trace={trace_path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Profile SegFormer with Torch and/or DOCC backend") + parser.add_argument("--docc", action="store_true", help="Run DOCC backend") + parser.add_argument("--torch", action="store_true", dest="run_torch", help="Run Torch backend") + parser.add_argument( + "--version", + type=str, + choices=list(SEGFORMER_MODELS.keys()), + default="b0", + help="SegFormer variant to use when --model is not provided", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="Optional Hugging Face model id to override --version", + ) + parser.add_argument("--target", type=str, default="none", help="DOCC target") + parser.add_argument( + "--remote_tuning", + action="store_true", + help="Enable DOCC remote tuning during compilation", + ) + parser.add_argument("--n_runs", type=int, default=10, help="Number of runs per backend") + parser.add_argument( + "--device", + type=str, + choices=["cpu", "cuda"], + default="cpu", + help="Device for model and input tensor", + ) + parser.add_argument( + "--input_device", + type=str, + choices=["cpu", "cuda"], + default=None, + help="Device where input tensor is created (defaults to --device)", + ) + parser.add_argument("--image_size", type=int, default=512, help="Input image size") + parser.add_argument( + "--trace_prefix", + type=str, + default="segformer_trace", + help="Prefix for exported Torch profiler traces", + ) + args = parser.parse_args() + + if not args.docc and not args.run_torch: + parser.error("Specify at least one backend: --torch and/or --docc") + + return args + + +def main() -> None: + args = parse_args() + model_name = resolve_model_name(args.version, args.model) + input_device = args.input_device if args.input_device is not None else args.device + model, model_input = setup_segformer( + model_name, + args.device, + args.image_size, + input_device=input_device, + ) + + print(f"Model: {model_name}") + print(f"Device: {args.device}") + print(f"Input device: {input_device}") + print(f"Remote tuning: {args.remote_tuning}") + print(f"Runs: {args.n_runs}") + + if args.run_torch: + run_torch_profile(model, model_input, args.n_runs, args.trace_prefix) + + if args.docc: + run_docc_profile( + model, + model_input, + args.n_runs, + args.target, + args.remote_tuning, + args.trace_prefix, + ) + + +if __name__ == "__main__": + main() diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index e40fcc3cf..2dd9c76e5 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -33,7 +33,7 @@ def resolve_model_name(version, model): def get_test_model_name(): - version = os.getenv("SEGFORMER_VERSION", "b0") + version = os.getenv("SEGFORMER_VERSION", "b2") if version not in SEGFORMER_MODELS: raise ValueError( f"Unsupported SEGFORMER_VERSION '{version}'. " @@ -126,7 +126,7 @@ def find_used_dialects(): # print(mlir_str) -def benchmark_segformer(model_name, backend="torch", target="none", device="cpu"): +def benchmark_segformer(model_name, backend="torch", target="none", device="cpu", remote_tuning=False): model = SegformerForSemanticSegmentation.from_pretrained( model_name ).eval() @@ -143,7 +143,7 @@ def benchmark_segformer(model_name, backend="torch", target="none", device="cpu" if backend == "docc": compile_kwargs = { "backend": "docc", - "options": {"target": target, "category": "server"}, + "options": {"target": target, "category": "server", "remote_tuning": remote_tuning}, } program = torch.compile(model, **compile_kwargs) @@ -181,6 +181,7 @@ def benchmark_segformer(model_name, backend="torch", target="none", device="cpu" sem = scipy_stats.sem(times) half_width = scipy_stats.t.ppf(0.975, df=n - 1) * sem print(f"Benchmarking {model_name}:") + print(f"Remote tuning: {remote_tuning}") print(f"Average inference time: {mean:.2f} ms (n={n})") print(f"95% CI: [{mean - half_width:.2f}, {mean + half_width:.2f}] ms (±{half_width:.2f} ms)") @@ -196,20 +197,27 @@ def profile_segformer( backend="torch", target="none", device="cpu", + input_device=None, + remote_tuning=False, n_runs=10, image_size=512, trace_prefix="segformer_trace", ): from segformer_profile import setup_segformer, run_torch_profile, run_docc_profile - model, model_input = setup_segformer(model_name, device, image_size) + model, model_input = setup_segformer( + model_name, + device, + image_size, + input_device=input_device, + ) if backend == "torch": run_torch_profile(model, model_input, n_runs, trace_prefix) elif backend == "docc": - run_docc_profile(model, model_input, n_runs, target) + run_docc_profile(model, model_input, n_runs, target, remote_tuning, trace_prefix) elif backend == "both": run_torch_profile(model, model_input, n_runs, trace_prefix) - run_docc_profile(model, model_input, n_runs, target) + run_docc_profile(model, model_input, n_runs, target, remote_tuning, trace_prefix) else: raise ValueError(f"Unsupported backend '{backend}' for profiling") @@ -248,6 +256,11 @@ def profile_segformer( default="none", help="DOCC target for --action benchmark_segformer (e.g. none, openmp, cuda)", ) + parser.add_argument( + "--remote_tuning", + action="store_true", + help="Enable DOCC remote tuning during benchmark/profile compilation", + ) parser.add_argument( "--device", type=str, @@ -255,6 +268,13 @@ def profile_segformer( default="cpu", help="Tensor/model device for --action benchmark_segformer/profile", ) + parser.add_argument( + "--input_device", + type=str, + choices=["cpu", "cuda"], + default=None, + help="Input tensor device for --action profile (defaults to --device)", + ) parser.add_argument( "--n_runs", type=int, @@ -286,6 +306,7 @@ def profile_segformer( backend=args.backend, target=args.target, device=args.device, + remote_tuning=args.remote_tuning, ) elif args.action == "profile": profile_segformer( @@ -293,6 +314,8 @@ def profile_segformer( backend=args.backend, target=args.target, device=args.device, + input_device=args.input_device, + remote_tuning=args.remote_tuning, n_runs=args.n_runs, image_size=args.image_size, trace_prefix=args.trace_prefix, From ecc67158725e99b676013372947c653f0899aba5 Mon Sep 17 00:00:00 2001 From: Lukas Truemper Date: Tue, 9 Jun 2026 18:34:01 +0200 Subject: [PATCH 14/20] reverts performance improvements in non-critical passes --- llvm/integration/llvm_test_suite.py | 8 +++--- .../sdfg/passes/symbolic/type_minimization.h | 3 +-- .../src/analysis/data_dependency_analysis.cpp | 26 ++++--------------- .../loop_carried_dependency_analysis.cpp | 8 +++--- .../src/passes/symbolic/type_minimization.cpp | 22 ++++------------ 5 files changed, 18 insertions(+), 49 deletions(-) diff --git a/llvm/integration/llvm_test_suite.py b/llvm/integration/llvm_test_suite.py index 541f2bf26..98c3f14ad 100644 --- a/llvm/integration/llvm_test_suite.py +++ b/llvm/integration/llvm_test_suite.py @@ -123,12 +123,12 @@ def setup(): # Each test is listed in the parameters # Options for compiles: # YES = The test compiles -# TIMEOUT = The compilation timeouts (6 min) +# TIMEOUT = The compilation timeouts (5 min) # OUT_OF_MEMORY = The compiler's memory usage crashes the system # SEGFAULT = The compiler segfaults # Options for executes: # PASS = The test execution passes -# TIMEOUT = The test execution timeouts (6 min) +# TIMEOUT = The test execution timeouts (5 min) # FAIL = The test execution fails because the result is wrong or the application crashes # FLAKY = The test execution sometimes passes, sometimes fails @pytest.mark.parametrize( @@ -1087,7 +1087,7 @@ def test(setup, path, name, compiles, executes): ) try: timeout = False - stdout, stderr = make_process.communicate(timeout=360) + stdout, stderr = make_process.communicate(timeout=300) except ( subprocess.TimeoutExpired ): # must catch this otherwise subprocess is not killed @@ -1119,7 +1119,7 @@ def test(setup, path, name, compiles, executes): ) try: timeout = False - stdout, stderr = lit_process.communicate(timeout=360) + stdout, stderr = lit_process.communicate(timeout=300) except ( subprocess.TimeoutExpired ): # must catch this otherwise subprocess is not killed diff --git a/sdfg/include/sdfg/passes/symbolic/type_minimization.h b/sdfg/include/sdfg/passes/symbolic/type_minimization.h index 09ae42998..6f3db0951 100644 --- a/sdfg/include/sdfg/passes/symbolic/type_minimization.h +++ b/sdfg/include/sdfg/passes/symbolic/type_minimization.h @@ -8,7 +8,6 @@ #include "sdfg/element.h" #include "sdfg/passes/pass.h" #include "sdfg/structured_control_flow/block.h" -#include "sdfg/symbolic/extreme_values.h" #include "sdfg/visitor/structured_sdfg_visitor.h" namespace sdfg { @@ -16,7 +15,7 @@ namespace passes { class TypeMinimization : public visitor::NonStoppingStructuredSDFGVisitor { private: - bool is_safe_trunc(symbolic::Expression expr, symbolic::BoundAnalysis& ba_tight, symbolic::BoundAnalysis& ba_loose); + bool is_safe_trunc(symbolic::Expression expr, const symbolic::Assumptions& assumptions); public: TypeMinimization(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager); diff --git a/sdfg/src/analysis/data_dependency_analysis.cpp b/sdfg/src/analysis/data_dependency_analysis.cpp index c1dbff53a..42e095540 100644 --- a/sdfg/src/analysis/data_dependency_analysis.cpp +++ b/sdfg/src/analysis/data_dependency_analysis.cpp @@ -728,16 +728,11 @@ bool DataDependencyAnalysis:: auto current_scope = Users::scope(¤t); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - // One AssumptionsBounds per side, shared across the whole subset-pair scan. - // The original used `previous_assumptions, previous_assumptions` (both - // sides of `is_subset`), so we only need one bounds object here. - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - // Check if previous subset is subset of any current subset for (auto& previous_subset : previous_subsets) { bool found = false; for (auto& current_subset : current_subsets) { - if (symbolic::is_subset(previous_subset, current_subset, previous_bounds, previous_bounds)) { + if (symbolic::is_subset(previous_subset, current_subset, previous_assumptions, previous_assumptions)) { found = true; break; } @@ -797,7 +792,6 @@ bool DataDependencyAnalysis::fully_covered( auto& assumptions_analysis = analysis_manager.get(); auto& current_assumptions = assumptions_analysis.get(*Users::scope(¤t), true); - symbolic::AssumptionsBounds current_bounds(current_assumptions); // Each read subset must be contained in some single open writer's subset. for (auto& read_subset : current_subsets) { @@ -807,9 +801,8 @@ bool DataDependencyAnalysis::fully_covered( if (w->container() != current.container()) continue; if (this->is_undefined_user(*w)) continue; auto& w_assumptions = assumptions_analysis.get(*Users::scope(w), true); - symbolic::AssumptionsBounds w_bounds(w_assumptions); for (auto& w_subset : w->subsets()) { - if (symbolic::is_subset(read_subset, w_subset, current_bounds, w_bounds)) { + if (symbolic::is_subset(read_subset, w_subset, current_assumptions, w_assumptions)) { covered = true; break; } @@ -851,14 +844,11 @@ bool DataDependencyAnalysis::intersects(User& previous, User& current, analysis: auto current_scope = Users::scope(¤t); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - // Check if any current subset intersects with any previous subset bool found = false; for (auto& current_subset : current_subsets) { for (auto& previous_subset : previous_subsets) { - if (!symbolic::is_disjoint(current_subset, previous_subset, current_bounds, previous_bounds)) { + if (!symbolic::is_disjoint(current_subset, previous_subset, current_assumptions, previous_assumptions)) { found = true; break; } @@ -909,16 +899,13 @@ bool DataDependencyAnalysis:: auto& previous_assumptions = assumptions_analysis.get(*previous_scope, true); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - auto& previous_memlets = previous.subsets(); auto& current_memlets = current.subsets(); for (auto& subset_ : previous_memlets) { bool overwritten = false; for (auto& subset : current_memlets) { - if (symbolic::is_subset(subset_, subset, previous_bounds, current_bounds)) { + if (symbolic::is_subset(subset_, subset, previous_assumptions, current_assumptions)) { overwritten = true; break; } @@ -957,16 +944,13 @@ bool DataDependencyAnalysis::depends(analysis::AnalysisManager& analysis_manager auto& previous_assumptions = assumptions_analysis.get(*previous_scope, true); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - auto& previous_memlets = previous.subsets(); auto& current_memlets = current.subsets(); bool intersect_any = false; for (auto& current_subset : current_memlets) { for (auto& previous_subset : previous_memlets) { - if (!symbolic::is_disjoint(current_subset, previous_subset, current_bounds, previous_bounds)) { + if (!symbolic::is_disjoint(current_subset, previous_subset, current_assumptions, previous_assumptions)) { intersect_any = true; break; } diff --git a/sdfg/src/analysis/loop_carried_dependency_analysis.cpp b/sdfg/src/analysis/loop_carried_dependency_analysis.cpp index 5177a1bcd..787381769 100644 --- a/sdfg/src/analysis/loop_carried_dependency_analysis.cpp +++ b/sdfg/src/analysis/loop_carried_dependency_analysis.cpp @@ -153,17 +153,15 @@ symbolic::maps::DependenceDeltas pair_deltas( } // Collect deltas across all subset pairs and union them. - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - isl_ctx* union_ctx = nullptr; isl_set* accumulated = nullptr; std::vector result_dimensions; for (auto& previous_subset : previous_subsets) { for (auto& current_subset : current_subsets) { - auto deltas = symbolic::maps:: - dependence_deltas(previous_subset, current_subset, loop.indvar(), previous_bounds, current_bounds); + auto deltas = symbolic::maps::dependence_deltas( + previous_subset, current_subset, loop.indvar(), previous_assumptions, current_assumptions + ); if (deltas.empty) { continue; } diff --git a/sdfg/src/passes/symbolic/type_minimization.cpp b/sdfg/src/passes/symbolic/type_minimization.cpp index db13ceade..63a31f879 100644 --- a/sdfg/src/passes/symbolic/type_minimization.cpp +++ b/sdfg/src/passes/symbolic/type_minimization.cpp @@ -13,13 +13,12 @@ namespace passes { TypeMinimization::TypeMinimization(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) : visitor::NonStoppingStructuredSDFGVisitor(builder, analysis_manager) {}; -bool TypeMinimization:: - is_safe_trunc(symbolic::Expression expr, symbolic::BoundAnalysis& ba_tight, symbolic::BoundAnalysis& ba_loose) { +bool TypeMinimization::is_safe_trunc(symbolic::Expression expr, const symbolic::Assumptions& assumptions) { size_t output_bitwidth = 32; int64_t output_min_value_signed = 0; int64_t output_max_value_signed = (1ULL << (output_bitwidth - 1)) - 1; - auto mini = ba_tight.lower_bound(expr); + auto mini = symbolic::minimum(expr, {}, assumptions, true); if (mini.is_null()) { return false; } @@ -28,7 +27,7 @@ bool TypeMinimization:: return false; } - auto maxi = ba_loose.upper_bound(expr); + auto maxi = symbolic::maximum(expr, {}, assumptions, false); if (maxi.is_null()) { return false; } @@ -46,13 +45,6 @@ bool TypeMinimization::accept(structured_control_flow::Block& block) { auto& assumptions_analysis = this->analysis_manager_.get(); auto& block_assumptions = assumptions_analysis.get(block, true); - // One BoundAnalysis pair for the whole block: every is_safe_trunc call here - // shares the same empty parameter set and the same assumptions, so the - // internal cache amortizes across all truncs in the block. - static const symbolic::SymbolSet no_params; - symbolic::BoundAnalysis ba_tight(no_params, block_assumptions, true); - symbolic::BoundAnalysis ba_loose(no_params, block_assumptions, false); - symbolic::ExpressionMap replacements; for (auto& edge : dfg.edges()) { auto& subset = edge.subset(); @@ -67,7 +59,7 @@ bool TypeMinimization::accept(structured_control_flow::Block& block) { continue; } auto arg = trunc_func->get_args()[0]; - if (!this->is_safe_trunc(arg, ba_tight, ba_loose)) { + if (!this->is_safe_trunc(arg, block_assumptions)) { continue; } @@ -101,10 +93,6 @@ bool TypeMinimization::accept(structured_control_flow::For& loop) { auto& assumptions_analysis = this->analysis_manager_.get(); auto& block_assumptions = assumptions_analysis.get(loop, true); - static const symbolic::SymbolSet no_params; - symbolic::BoundAnalysis ba_tight(no_params, block_assumptions, true); - symbolic::BoundAnalysis ba_loose(no_params, block_assumptions, false); - symbolic::ExpressionMap replacements; auto truncs = symbolic::find(loop.condition()); for (auto& trunc : truncs) { @@ -116,7 +104,7 @@ bool TypeMinimization::accept(structured_control_flow::For& loop) { continue; } auto arg = trunc_func->get_args()[0]; - if (!this->is_safe_trunc(arg, ba_tight, ba_loose)) { + if (!this->is_safe_trunc(arg, block_assumptions)) { continue; } From 2ce363194b82eb0de592245d5bd3f0be3617d7ea Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Wed, 10 Jun 2026 13:15:35 +0200 Subject: [PATCH 15/20] Add instrumented sequential segformer to workflow --- .daisy/mlir_torch_segformer.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.daisy/mlir_torch_segformer.yml b/.daisy/mlir_torch_segformer.yml index dcdcf0757..3ba79d245 100644 --- a/.daisy/mlir_torch_segformer.yml +++ b/.daisy/mlir_torch_segformer.yml @@ -27,10 +27,15 @@ steps: # Warm start (DOCC benchmark, CUDA target) DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu + DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu run: - # model segformer b0 (DOCC CUDA target) + segformer_b0_docc_sequential: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + energy: true + env: + DOCC_REUSE_BINARIES: 1 segformer_b0_docc_cuda: command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu From 4229f2051ad0aac25bd86dff10afe5abf5bf4cfe Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Wed, 10 Jun 2026 14:44:16 +0200 Subject: [PATCH 16/20] Reduce instrumenttaion overhead --- .daisy/mlir_torch_segformer.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.daisy/mlir_torch_segformer.yml b/.daisy/mlir_torch_segformer.yml index 3ba79d245..2b299caed 100644 --- a/.daisy/mlir_torch_segformer.yml +++ b/.daisy/mlir_torch_segformer.yml @@ -26,8 +26,8 @@ steps: pip install -r mlir/requirements.txt # Warm start (DOCC benchmark, CUDA target) - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu - DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + DOCC_CI="" venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu + DOCC_CI=1 __DAISY_CAPTURE_STRATEGY_DEFAULT=once venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu run: @@ -36,6 +36,7 @@ steps: energy: true env: DOCC_REUSE_BINARIES: 1 + __DAISY_CAPTURE_STRATEGY_DEFAULT: once segformer_b0_docc_cuda: command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu From ba03faea132409d37a8dee4dc7f0d8cca48dc2e1 Mon Sep 17 00:00:00 2001 From: Atrisan Date: Wed, 10 Jun 2026 17:35:48 +0200 Subject: [PATCH 17/20] Add softmax test --- mlir/benchmarks/torch/layers/softmax.py | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 mlir/benchmarks/torch/layers/softmax.py diff --git a/mlir/benchmarks/torch/layers/softmax.py b/mlir/benchmarks/torch/layers/softmax.py new file mode 100644 index 000000000..3711fc58f --- /dev/null +++ b/mlir/benchmarks/torch/layers/softmax.py @@ -0,0 +1,69 @@ +import torch +import torch.nn as nn + +from benchmarks.harness import run_benchmark + + +class SoftmaxNet(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.softmax = nn.Softmax(dim=dim) + + def forward(self, x: torch.Tensor): + return self.softmax(x) + + +class LogSoftmaxNet(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.log_softmax = nn.LogSoftmax(dim=dim) + + def forward(self, x: torch.Tensor): + return self.log_softmax(x) + + +# batch=64, classes=1000 — classifier output +def setup_softmax_classifier(): + model = SoftmaxNet(dim=1) + x = torch.randn(64, 1000) + return model, x + + +# batch=64, seq_len=512, features=768 — transformer-style attention scores +def setup_softmax_attention(): + model = SoftmaxNet(dim=-1) + x = torch.randn(64, 512, 768) + return model, x + + +# batch=64, classes=1000 — log-softmax for NLLLoss +def setup_log_softmax(): + model = LogSoftmaxNet(dim=1) + x = torch.randn(64, 1000) + return model, x + + +BENCHMARKS = { + "softmax_classifier": setup_softmax_classifier, + "softmax_attention": setup_softmax_attention, + "log_softmax": setup_log_softmax, +} + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Softmax layer benchmarks") + parser.add_argument( + "--variant", + type=str, + choices=list(BENCHMARKS.keys()), + default="softmax_classifier", + help="Softmax variant to benchmark", + ) + args, remaining = parser.parse_known_args() + + import sys + + sys.argv = [sys.argv[0]] + remaining + + run_benchmark(BENCHMARKS[args.variant], args.variant) From ec9307a23246545c0059a8bf83b658ac1dca5ab5 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Thu, 11 Jun 2026 21:25:54 +0200 Subject: [PATCH 18/20] Distribute benchmarks --- .daisy/mlir_torch_layers.yml | 244 ------------------ .daisy/mlir_torch_models.yml | 0 .daisy/mlir_torch_segformer.yml | 9 +- .daisy/mlir_torch_segformer_b2.yml | 12 + .daisy/mlir_torch_segformer_b2_torch.yml | 40 --- ...ml => mlir_torch_segformer_sequential.yml} | 15 +- 6 files changed, 26 insertions(+), 294 deletions(-) delete mode 100644 .daisy/mlir_torch_layers.yml delete mode 100644 .daisy/mlir_torch_models.yml delete mode 100644 .daisy/mlir_torch_segformer_b2_torch.yml rename .daisy/{mlir_torch_segformer_torch.yml => mlir_torch_segformer_sequential.yml} (61%) diff --git a/.daisy/mlir_torch_layers.yml b/.daisy/mlir_torch_layers.yml deleted file mode 100644 index 7716791e9..000000000 --- a/.daisy/mlir_torch_layers.yml +++ /dev/null @@ -1,244 +0,0 @@ -on: - push: - branches: - - main - schedule: - - cron: '0 0 * * *' - -parameters: - container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - timeout: 150 - partitions: - - chamomile - -steps: - build: | - python3.11 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy - - pip install --no-build-isolation -e python/ - pip install --no-build-isolation -e mlir/ - - pip install -r mlir/requirements.txt - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=cuda - - run: - - # layer batchnorm - - batchnorm_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --torch - energy: true - measurements: 3 - batchnorm_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - batchnorm_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - batchnorm_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - batchnorm_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer conv2d - - conv2d_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --torch - energy: true - measurements: 3 - # conv2d_run_none: - # command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=none - # energy: true - # env: - # DOCC_CI: regions - # DOCC_REUSE_BINARIES: 1 - # conv2d_run_sequential: - # command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=sequential - # energy: true - # env: - # DOCC_CI: regions - # DOCC_REUSE_BINARIES: 1 - conv2d_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - conv2d_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer linear - - linear_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --torch - energy: true - measurements: 3 - linear_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - linear_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - linear_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - linear_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer matmul - - matmul_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --torch - energy: true - measurements: 3 - matmul_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - matmul_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - matmul_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - matmul_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer pooling - - pooling_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --torch - energy: true - measurements: 3 - pooling_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - pooling_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - pooling_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - pooling_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer relu - - relu_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --torch - energy: true - measurements: 3 - relu_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - relu_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - relu_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - relu_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_models.yml b/.daisy/mlir_torch_models.yml deleted file mode 100644 index e69de29bb..000000000 diff --git a/.daisy/mlir_torch_segformer.yml b/.daisy/mlir_torch_segformer.yml index 2b299caed..5403c62d5 100644 --- a/.daisy/mlir_torch_segformer.yml +++ b/.daisy/mlir_torch_segformer.yml @@ -25,9 +25,14 @@ steps: pip install -r mlir/requirements.txt + # Override CPU torch with CUDA wheels for torch GPU benchmarks + pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 + + # Warm start (Torch benchmark on CUDA) + venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda + # Warm start (DOCC benchmark, CUDA target) - DOCC_CI="" venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu - DOCC_CI=1 __DAISY_CAPTURE_STRATEGY_DEFAULT=once venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu run: diff --git a/.daisy/mlir_torch_segformer_b2.yml b/.daisy/mlir_torch_segformer_b2.yml index afdb15fac..cc8333e5b 100644 --- a/.daisy/mlir_torch_segformer_b2.yml +++ b/.daisy/mlir_torch_segformer_b2.yml @@ -25,11 +25,23 @@ steps: pip install -r mlir/requirements.txt + # Override CPU torch with CUDA wheels for torch GPU benchmarks + pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 + + # Warm start (Torch benchmark on CUDA) + venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda + # Warm start (DOCC benchmark, CUDA target) DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu run: + # model segformer b2 (Torch CUDA) + + segformer_b2_torch_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda + energy: true + # model segformer b2 (DOCC CUDA target) segformer_b2_docc_cuda: diff --git a/.daisy/mlir_torch_segformer_b2_torch.yml b/.daisy/mlir_torch_segformer_b2_torch.yml deleted file mode 100644 index e63215168..000000000 --- a/.daisy/mlir_torch_segformer_b2_torch.yml +++ /dev/null @@ -1,40 +0,0 @@ -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -parameters: - container: daisytuner/docc-build-env-llvm19-base:latest-amd64 - timeout: 480 - partitions: - - chamomile - -steps: - build: | - python3.11 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy transformers - - pip install --no-build-isolation -e python/ - pip install --no-build-isolation -e mlir/ - - pip install -r mlir/requirements.txt - - # Override CPU torch with CUDA wheels for torch GPU benchmarks - pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 - - # Warm start (Torch benchmark on CUDA) - venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda - - run: - - # model segformer b2 (Torch CUDA) - - segformer_b2_torch_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda - energy: true diff --git a/.daisy/mlir_torch_segformer_torch.yml b/.daisy/mlir_torch_segformer_sequential.yml similarity index 61% rename from .daisy/mlir_torch_segformer_torch.yml rename to .daisy/mlir_torch_segformer_sequential.yml index 5e14f0c53..582f46c40 100644 --- a/.daisy/mlir_torch_segformer_torch.yml +++ b/.daisy/mlir_torch_segformer_sequential.yml @@ -25,16 +25,15 @@ steps: pip install -r mlir/requirements.txt - # Override CPU torch with CUDA wheels for torch GPU benchmarks - pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 - - # Warm start (Torch benchmark on CUDA) - venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda + # Warm start (DOCC benchmark, sequential target) + DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu run: - # model segformer b0 (Torch CUDA) + # model segformer b0 (DOCC sequential target) - segformer_b0_torch_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda + segformer_b0_docc_sequential: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu energy: true + env: + DOCC_REUSE_BINARIES: 1 From b248ea4ed751cd58670aadc1532fd0d5576d7941 Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Fri, 12 Jun 2026 23:40:34 +0200 Subject: [PATCH 19/20] Increase benchmark time --- .daisy/mlir_torch_segformer_sequential.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.daisy/mlir_torch_segformer_sequential.yml b/.daisy/mlir_torch_segformer_sequential.yml index 582f46c40..60d49f58a 100644 --- a/.daisy/mlir_torch_segformer_sequential.yml +++ b/.daisy/mlir_torch_segformer_sequential.yml @@ -7,7 +7,7 @@ on: parameters: container: daisytuner/docc-build-env-llvm19-base:latest-amd64 - timeout: 240 + timeout: 480 partitions: - chamomile From f912c515211949b6c0a715ea0ace156e031940bf Mon Sep 17 00:00:00 2001 From: Nora Hagmeyer Date: Sat, 13 Jun 2026 17:40:05 +0200 Subject: [PATCH 20/20] Get regions uploaded --- .daisy/mlir_torch_segformer_sequential.yml | 5 +++-- mlir/benchmarks/torch/model_zoo/segformer_test.py | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.daisy/mlir_torch_segformer_sequential.yml b/.daisy/mlir_torch_segformer_sequential.yml index 60d49f58a..36569b277 100644 --- a/.daisy/mlir_torch_segformer_sequential.yml +++ b/.daisy/mlir_torch_segformer_sequential.yml @@ -7,7 +7,7 @@ on: parameters: container: daisytuner/docc-build-env-llvm19-base:latest-amd64 - timeout: 480 + timeout: 720 partitions: - chamomile @@ -26,7 +26,7 @@ steps: pip install -r mlir/requirements.txt # Warm start (DOCC benchmark, sequential target) - DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + __DAISY_CAPTURE_STRATEGY_DEFAULT=once DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu run: @@ -37,3 +37,4 @@ steps: energy: true env: DOCC_REUSE_BINARIES: 1 + __DAISY_CAPTURE_STRATEGY_DEFAULT: once diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index 2dd9c76e5..b8c75e1ff 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -11,9 +11,9 @@ import docc.torch import os -os.environ["DOCC_STATISTICS"] = "1" -os.environ["DOCC_PROFILE_COMPILE"] = "1" -os.environ["DOCC_DEBUG"] = "dump" +#os.environ["DOCC_STATISTICS"] = "1" +#os.environ["DOCC_PROFILE_COMPILE"] = "1" +#os.environ["DOCC_DEBUG"] = "dump" SEGFORMER_MODELS = { @@ -155,8 +155,8 @@ def benchmark_segformer(model_name, backend="torch", target="none", device="cpu" from scipy import stats as scipy_stats times = [] - min_samples = 5 - max_samples = 500 + min_samples = 1 + max_samples = 5 target_rel_ci = 0.01 # stop when 95% CI half-width < 1% of mean while len(times) < max_samples: