diff --git a/.daisy/mlir_torch_layers.yml b/.daisy/mlir_torch_layers.yml
deleted file mode 100644
index 7716791e9..000000000
--- a/.daisy/mlir_torch_layers.yml
+++ /dev/null
@@ -1,244 +0,0 @@
-on:
-  push:
-    branches:
-      - main
-  schedule:
-    - cron: '0 0 * * *'
-
-parameters:
-  container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-  timeout: 150
-  partitions:
-    - chamomile
-
-steps:
-  build: |
-    python3.11 -m venv venv
-    . venv/bin/activate
-
-    python -m pip install --upgrade pip
-    pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
-    pip install numpy scipy
-
-    pip install --no-build-isolation -e python/
-    pip install --no-build-isolation -e mlir/
-
-    pip install -r mlir/requirements.txt
-
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=cuda
-
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=cuda
-
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=cuda
-
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=cuda
-
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=cuda
-
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=cuda
-
-  run:
-
-    # layer batchnorm
-
-    batchnorm_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --torch
-      energy: true
-      measurements: 3
-    batchnorm_run_none:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=none
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    batchnorm_run_sequential:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=sequential
-      energy: true
-      env:
-        DOCC_CI: true
-        DOCC_REUSE_BINARIES: 1
-    batchnorm_run_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    batchnorm_run_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-
-    # layer conv2d
-
-    conv2d_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --torch
-      energy: true
-      measurements: 3
-    # conv2d_run_none:
-    #   command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=none
-    #   energy: true
-    #   env:
-    #     DOCC_CI: regions
-    #     DOCC_REUSE_BINARIES: 1
-    # conv2d_run_sequential:
-    #   command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=sequential
-    #   energy: true
-    #   env:
-    #     DOCC_CI: regions
-    #     DOCC_REUSE_BINARIES: 1
-    conv2d_run_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    conv2d_run_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-
-    # layer linear
-
-    linear_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --torch
-      energy: true
-      measurements: 3
-    linear_run_none:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=none
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    linear_run_sequential:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=sequential
-      energy: true
-      env:
-        DOCC_CI: true
-        DOCC_REUSE_BINARIES: 1
-    linear_run_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    linear_run_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-
-    # layer matmul
-
-    matmul_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --torch
-      energy: true
-      measurements: 3
-    matmul_run_none:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=none
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    matmul_run_sequential:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=sequential
-      energy: true
-      env:
-        DOCC_CI: true
-        DOCC_REUSE_BINARIES: 1
-    matmul_run_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    matmul_run_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-
-    # layer pooling
-
-    pooling_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --torch
-      energy: true
-      measurements: 3
-    pooling_run_none:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=none
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    pooling_run_sequential:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=sequential
-      energy: true
-      env:
-        DOCC_CI: true
-        DOCC_REUSE_BINARIES: 1
-    pooling_run_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    pooling_run_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-
-    # layer relu
-
-    relu_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --torch
-      energy: true
-      measurements: 3
-    relu_run_none:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=none
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    relu_run_sequential:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=sequential
-      energy: true
-      env:
-        DOCC_CI: true
-        DOCC_REUSE_BINARIES: 1
-    relu_run_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    relu_run_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
diff --git a/.daisy/mlir_torch_models.yml b/.daisy/mlir_torch_models.yml
deleted file mode 100644
index 8b917b6a0..000000000
--- a/.daisy/mlir_torch_models.yml
+++ /dev/null
@@ -1,66 +0,0 @@
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-
-parameters:
-  container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-  timeout: 120
-  partitions:
-    - chamomile
-
-steps:
-  build: |
-    python3.11 -m venv venv
-    . venv/bin/activate
-
-    python -m pip install --upgrade pip
-    pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
-    pip install numpy scipy
-
-    pip install --no-build-isolation -e python/
-    pip install --no-build-isolation -e mlir/
-
-    pip install -r mlir/requirements.txt
-
-    # Warm start
-
-    venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --torch
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=none
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=sequential
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=openmp
-    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=cuda
-
-  run:
-
-    # model resnet18
-
-    resnet18_torch:
-      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --torch
-      energy: true
-    resnet18_docc_none:
-      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=none
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    resnet18_docc_sequential:
-      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=sequential
-      energy: true
-      env:
-        DOCC_CI: true
-        DOCC_REUSE_BINARIES: 1
-    resnet18_docc_openmp:
-      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
-    resnet18_docc_cuda:
-      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-        DOCC_REUSE_BINARIES: 1
diff --git a/.daisy/mlir_torch_segformer.yml b/.daisy/mlir_torch_segformer.yml
new file mode 100644
index 000000000..5403c62d5
--- /dev/null
+++ b/.daisy/mlir_torch_segformer.yml
@@ -0,0 +1,51 @@
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, reopened, synchronize, ready_for_review]
+
+parameters:
+  container: daisytuner/docc-build-env-llvm19-base:latest-amd64
+  timeout: 240
+  partitions:
+    - chamomile
+
+steps:
+  build: |
+    python3.11 -m venv venv
+    . venv/bin/activate
+
+    python -m pip install --upgrade pip
+    pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
+    pip install numpy scipy transformers
+
+    pip install --no-build-isolation -e python/
+    pip install --no-build-isolation -e mlir/
+
+    pip install -r mlir/requirements.txt
+
+    # Override CPU torch with CUDA wheels for torch GPU benchmarks
+    pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126
+
+    # Warm start (Torch benchmark on CUDA)
+    venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda
+
+    # Warm start (DOCC benchmark, CUDA target)
+    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu
+
+  run:
+
+    segformer_b0_docc_sequential:
+      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu
+      energy: true
+      env:
+        DOCC_REUSE_BINARIES: 1
+        __DAISY_CAPTURE_STRATEGY_DEFAULT: once
+
+    segformer_b0_docc_cuda:
+      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu
+      energy: true
+      env:
+        DOCC_CI: ""
+        DOCC_REUSE_BINARIES: 1
diff --git a/.daisy/mlir_torch_segformer_b2.yml b/.daisy/mlir_torch_segformer_b2.yml
new file mode 100644
index 000000000..cc8333e5b
--- /dev/null
+++ b/.daisy/mlir_torch_segformer_b2.yml
@@ -0,0 +1,52 @@
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, reopened, synchronize, ready_for_review]
+
+parameters:
+  container: daisytuner/docc-build-env-llvm19-base:latest-amd64
+  timeout: 480
+  partitions:
+    - chamomile
+
+steps:
+  build: |
+    python3.11 -m venv venv
+    . venv/bin/activate
+
+    python -m pip install --upgrade pip
+    pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
+    pip install numpy scipy transformers
+
+    pip install --no-build-isolation -e python/
+    pip install --no-build-isolation -e mlir/
+
+    pip install -r mlir/requirements.txt
+
+    # Override CPU torch with CUDA wheels for torch GPU benchmarks
+    pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126
+
+    # Warm start (Torch benchmark on CUDA)
+    venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda
+
+    # Warm start (DOCC benchmark, CUDA target)
+    DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu
+
+  run:
+
+    # model segformer b2 (Torch CUDA)
+
+    segformer_b2_torch_cuda:
+      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda
+      energy: true
+
+    # model segformer b2 (DOCC CUDA target)
+
+    segformer_b2_docc_cuda:
+      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu
+      energy: true
+      env:
+        DOCC_CI: ""
+        DOCC_REUSE_BINARIES: 1
diff --git a/.daisy/mlir_torch_segformer_sequential.yml b/.daisy/mlir_torch_segformer_sequential.yml
new file mode 100644
index 000000000..36569b277
--- /dev/null
+++ b/.daisy/mlir_torch_segformer_sequential.yml
@@ -0,0 +1,40 @@
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, reopened, synchronize, ready_for_review]
+
+parameters:
+  container: daisytuner/docc-build-env-llvm19-base:latest-amd64
+  timeout: 720
+  partitions:
+    - chamomile
+
+steps:
+  build: |
+    python3.11 -m venv venv
+    . venv/bin/activate
+
+    python -m pip install --upgrade pip
+    pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
+    pip install numpy scipy transformers
+
+    pip install --no-build-isolation -e python/
+    pip install --no-build-isolation -e mlir/
+
+    pip install -r mlir/requirements.txt
+
+    # Warm start (DOCC benchmark, sequential target)
+    __DAISY_CAPTURE_STRATEGY_DEFAULT=once DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu
+
+  run:
+
+    # model segformer b0 (DOCC sequential target)
+
+    segformer_b0_docc_sequential:
+      command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu
+      energy: true
+      env:
+        DOCC_REUSE_BINARIES: 1
+        __DAISY_CAPTURE_STRATEGY_DEFAULT: once
diff --git a/.daisy/python_npbench.yml b/.daisy/python_npbench.yml
deleted file mode 100644
index d31fc020b..000000000
--- a/.daisy/python_npbench.yml
+++ /dev/null
@@ -1,267 +0,0 @@
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-
-parameters:
-  container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-  timeout: 120
-  partitions:
-    - zinnia
-
-steps:
-  build: |
-    apt-get install -y python3-venv python3-pip
-
-    python3 -m venv venv
-    . venv/bin/activate
-
-    python -m pip install --upgrade pip
-    pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
-    pip install numpy scipy
-
-    pip install --no-build-isolation -v -e python/
-
-  run:
-    adi_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    adi_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    adi_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    adi_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    atax_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    atax_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    atax_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    atax_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemm_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemm_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemm_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemm_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    gesummv_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    gesummv_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    gesummv_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    gesummv_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemver_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemver_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemver_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    gemver_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    k2mm_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    k2mm_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    k2mm_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    k2mm_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    k3mm_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    k3mm_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    k3mm_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    k3mm_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    mvt_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    mvt_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    mvt_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    mvt_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    symm_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    symm_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    symm_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    # symm_cuda:
-    #   command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=cuda
-    #   energy: true
-      # env:
-      #   DOCC_CI: regions
-    syr2k_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    syr2k_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    syr2k_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    syr2k_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    syrk_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    syrk_omp:
-      command:   venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    syrk_cuda:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=cuda
-      energy: true
-      env:
-        DOCC_CI: regions
-    syrk_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    trmm_numpy:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --numpy --size=M
-      energy: true
-      env:
-        DOCC_CI: regions
-    trmm_omp:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=openmp
-      energy: true
-      env:
-        DOCC_CI: regions
-    trmm_seq_tuning:
-      command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=sequential --remote-tuning
-      energy: true
-      env:
-        DOCC_CI: regions
-    # trmm_cuda:
-    #   command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=cuda
-    #   energy: true
-      # env:
-      #   DOCC_CI: regions
diff --git a/.github/workflows/llvm_tests_san.yml b/.github/workflows/llvm_tests_san.yml
deleted file mode 100644
index e4328f190..000000000
--- a/.github/workflows/llvm_tests_san.yml
+++ /dev/null
@@ -1,83 +0,0 @@
-name: LLVM - Unit and Integration Sanitized Tests
-
-on:
-  push:
-    branches:
-      - main
-      - llvm-test-suite
-  schedule:
-    - cron: "0 4 * * *"
-
-jobs:
-  llvm-tests-linux-san:
-    runs-on:
-      group: dahlia
-      labels: Linux
-    container:
-      image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-    strategy:
-      fail-fast: false
-      matrix:
-        san: ["address", "leak", "undefined"]
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Mark GitHub Actions workdir as safe
-        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
-      - name: Build
-        run: |
-          mkdir build
-          cd build
-          cmake -G Ninja \
-            -DCMAKE_C_COMPILER=clang-19 \
-            -DCMAKE_CXX_COMPILER=clang++-19 \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DSDFG_ENABLE_SANITIZER=ON \
-            -DSDFG_SANITIZER=${{ matrix.san }} \
-            -DLLVM_BUILD_FRONTEND=ON \
-            -DLLVM_BUILD_TESTS=ON \
-            -DSDFG_BUILD_TESTS=OFF \
-            -DINSTALL_GTEST=OFF \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \
-            ..
-          ninja -j$(nproc)
-          cpack -G DEB
-          apt-get install -y ./docc-llvm*.deb
-
-      - name: Unit Tests
-        run: |
-          cd build
-          ./llvm/tests/docc_llvm_pass_test
-
-      - name: Set up Python
-        if: matrix.san == 'leak'
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-
-      - name: Setup virtual environment
-        if: matrix.san == 'leak'
-        run: |
-          python -m venv .venv
-          echo "$PWD/.venv/bin" >> $GITHUB_PATH
-
-      - name: Install dependencies
-        if: matrix.san == 'leak'
-        run: |
-          python -m pip install --upgrade pip
-          pip install pytest==7.1.3 pytest-parallel lit
-
-      - name: Integration Tests
-        # The docc C/C++ compiler currently only works with leak sanitizer
-        if: matrix.san == 'leak'
-        run: |
-          export LLVM_SYMBOLIZER_PATH=$(which llvm-symbolizer-19)
-
-          cd llvm/integration
-          pytest -v llvm_test_suite.py
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
deleted file mode 100644
index 7a74ae854..000000000
--- a/.github/workflows/release.yml
+++ /dev/null
@@ -1,200 +0,0 @@
-name: Release
-
-on:
-  push:
-    tags:
-      - "v*.*.*"
-
-jobs:
-  # Stage 1: Build docc-compiler (no dependencies)
-  wheels-compiler:
-    name: Compiler (${{ matrix.os }}, ${{ matrix.python }})
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [build-amd64-big, build-arm64-big, macos-14]
-        python: ["cp311", "cp312", "cp313", "cp314"]
-        include:
-          - os: build-amd64-big
-            cibw_archs: x86_64
-          - os: build-arm64-big
-            cibw_archs: aarch64
-          - os: macos-14
-            cibw_archs: arm64
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - uses: pypa/cibuildwheel@v3.3.1
-        with:
-          package-dir: python/
-          output-dir: wheelhouse
-        env:
-          CIBW_ARCHS: ${{ matrix.cibw_archs }}
-          CIBW_BUILD: "${{ matrix.python }}-*"
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: wheels-docc-compiler-${{ matrix.os }}-${{ matrix.python }}
-          path: wheelhouse/*.whl
-
-  # Stage 2: Build docc-ai (depends on docc-compiler)
-  wheels-ai:
-    name: AI (${{ matrix.os }}, ${{ matrix.python }})
-    needs: [wheels-compiler]
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [build-amd64-big, build-arm64-big, macos-14]
-        python: ["cp311", "cp312"]
-        include:
-          - os: build-amd64-big
-            cibw_archs: x86_64
-          - os: build-arm64-big
-            cibw_archs: aarch64
-          - os: macos-14
-            cibw_archs: arm64
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      # Pin docc-compiler version to match release
-      # - name: Pin docc-compiler version
-      #   run: |
-      #     VERSION=$(cat VERSION)
-      #     sed -i.bak "s/\"docc-compiler\"/\"docc-compiler==$VERSION\"/" mlir/pyproject.toml && rm mlir/pyproject.toml.bak
-
-      # Download compiler wheels into the package directory so they're available in container
-      # - uses: actions/download-artifact@v4
-      #   with:
-      #     pattern: wheels-docc-compiler-${{ matrix.os }}-*
-      #     path: mlir/compiler-wheels
-      #     merge-multiple: true
-
-      - uses: pypa/cibuildwheel@v3.3.1
-        with:
-          package-dir: mlir/
-          output-dir: wheelhouse
-        env:
-          CIBW_ARCHS: ${{ matrix.cibw_archs }}
-          CIBW_BUILD: "${{ matrix.python }}-*"
-          # Install docc-compiler before building docc-ai
-          # CIBW_BEFORE_BUILD: "pip install --no-index --find-links {project}/compiler-wheels docc-compiler"
-          # Make compiler wheels available for dependency resolution during test
-          # CIBW_ENVIRONMENT: "PIP_FIND_LINKS={project}/compiler-wheels"
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: wheels-docc-ai-${{ matrix.os }}-${{ matrix.python }}
-          path: wheelhouse/*.whl
-
-  wheels-publish:
-    needs: [wheels-compiler, wheels-ai]
-    runs-on: build-amd64-big
-    permissions:
-      id-token: write
-
-    steps:
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: wheels-*
-          path: dist
-          merge-multiple: true
-
-      - uses: pypa/gh-action-pypi-publish@v1.10.0
-
-  packages-llvm:
-    strategy:
-      matrix:
-        include:
-          - platform: ubuntu-24.04
-            package-format: deb
-            cpack-generator: DEB
-            upload-dist-id: ubuntu
-            upload-dist-version: 24.04
-            runner: build-amd64-big
-            architecture: x64
-            image: daisytuner/docc-build-env-llvm19-ubuntu-24.04:latest-amd64
-          - platform: ubuntu-24.04
-            package-format: deb
-            cpack-generator: DEB
-            upload-dist-id: ubuntu
-            upload-dist-version: 24.04
-            runner: build-arm64-big
-            architecture: arm64
-            image: daisytuner/docc-build-env-llvm19-ubuntu-24.04:latest-arm64
-          - platform: rhel-10
-            package-format: rpm
-            cpack-generator: RPM
-            upload-dist-id: rhel
-            upload-dist-version: 10
-            upload-dist-platform-id: platform:el10
-            runner: build-amd64-big
-            architecture: x64
-            image: daisytuner/docc-build-env-llvm19-rhel-10:latest-amd64
-          - platform: debian-13
-            package-format: deb
-            cpack-generator: DEB
-            upload-dist-id: debian
-            upload-dist-version: 13
-            runner: build-amd64-big
-            architecture: x64
-            image: daisytuner/docc-build-env-llvm19-debian-13:latest-amd64
-
-    runs-on: ${{ matrix.runner }}
-    container:
-      image: ${{ matrix.image }}
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Define Version
-        id: define_version
-        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
-
-      - name: Build package
-        run: |
-          mkdir -p build
-          cd build
-          cmake -G Ninja \
-            -DCMAKE_C_COMPILER=clang-19 \
-            -DCMAKE_CXX_COMPILER=clang++-19 \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DINSTALL_GTEST=OFF \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DSDFGLIB_AUTO_INSTALL_MODE=ON \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \
-            -DRELEASE_PACKAGE=ON \
-            -DPACKAGE_WITH_TOOL_DEPS=ON \
-            ..
-          ninja -j$(nproc)
-          cpack -G ${{ matrix.cpack-generator }}
-
-      - name: Upload docc package as Artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: docc-${{ matrix.platform }}-${{ matrix.architecture }}
-          path: "build/*.${{ matrix.package-format }}"
-
-      - name: Upload docc package to Firebase
-        uses: daisytuner/upload-distribution-action@main
-        with:
-          file: "build/*.${{ matrix.package-format }}"
-          version: ${{ steps.define_version.outputs.VERSION }}
-          architecture: ${{ matrix.architecture }}
-          dist-id: ${{ matrix.upload-dist-id }}
-          dist-version: ${{ matrix.upload-dist-version }}
-          dist-platform-id: ${{ matrix.upload-dist-platform-id }}
-          token: ${{ secrets.DOCC_RELEASE_TOKEN }}
-          url: /v1/system/docc-distributions/upload
diff --git a/.github/workflows/sanitizer_tests_asan.yml b/.github/workflows/sanitizer_tests_asan.yml
deleted file mode 100644
index 5ebca361d..000000000
--- a/.github/workflows/sanitizer_tests_asan.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-name: Sanitizer Tests (Address)
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-
-jobs:
-  sanitizer-linux-asan:
-    runs-on:
-      group: dahlia
-      labels: openmp
-    container:
-      image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Formatting
-        shell: bash
-        run: |
-          shopt -s globstar
-          clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp
-          clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp
-
-      - name: Build and test
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_C_COMPILER=clang-19 \
-            -DCMAKE_CXX_COMPILER=clang++-19 \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DSDFG_ENABLE_SANITIZER=ON \
-            -DSDFG_SANITIZER=address \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF  \
-            ..
-          ninja -j$(nproc)
-
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
-
-  sanitizer-macos-asan:
-    runs-on:
-      group: dahlia
-      labels: macOS
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Install dependencies
-        run: |
-          brew install ninja cmake
-          brew install gmp isl nlohmann-json boost
-          brew install libomp
-
-      - name: Build
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DSDFG_ENABLE_SANITIZER=ON \
-            -DSDFG_SANITIZER=address \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF  \
-            ..
-          ninja
-
-      - name: Unit Tests
-        run: |
-          cd build/
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
-          ./tutorial/printf_target/tests/printf_target_test
diff --git a/.github/workflows/sanitizer_tests_lsan.yml b/.github/workflows/sanitizer_tests_lsan.yml
deleted file mode 100644
index 69d5c5a36..000000000
--- a/.github/workflows/sanitizer_tests_lsan.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: Sanitizer Tests (Leak)
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-
-jobs:
-  sanitizer-linux-lsan:
-    runs-on:
-      group: dahlia
-      labels: openmp
-    container:
-      image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Formatting
-        shell: bash
-        run: |
-          shopt -s globstar
-          clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp
-          clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp
-
-      - name: Build and test
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_C_COMPILER=clang-19 \
-            -DCMAKE_CXX_COMPILER=clang++-19 \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DSDFG_ENABLE_SANITIZER=ON \
-            -DSDFG_SANITIZER=leak \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF  \
-            ..
-          ninja -j$(nproc)
-
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
diff --git a/.github/workflows/sanitizer_tests_ubsan.yml b/.github/workflows/sanitizer_tests_ubsan.yml
deleted file mode 100644
index f8d95b67e..000000000
--- a/.github/workflows/sanitizer_tests_ubsan.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: Sanitizer Tests (Undefined Behavior)
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-
-jobs:
-  sanitizer-linux-ubsan:
-    runs-on:
-      group: dahlia
-      labels: openmp
-    container:
-      image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Formatting
-        shell: bash
-        run: |
-          shopt -s globstar
-          clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp
-          clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp
-
-      - name: Build and test
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_C_COMPILER=clang-19 \
-            -DCMAKE_CXX_COMPILER=clang++-19 \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DSDFG_ENABLE_SANITIZER=ON \
-            -DSDFG_SANITIZER=undefined \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF  \
-            ..
-          ninja -j$(nproc)
-
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
diff --git a/.github/workflows/unit_tests_macos.yml b/.github/workflows/unit_tests_macos.yml
deleted file mode 100644
index fe8db86fd..000000000
--- a/.github/workflows/unit_tests_macos.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-name: Unit Tests (macOS)
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-  schedule:
-    - cron: "0 4 * * *"
-
-jobs:
-  primary-tests-macos:
-    runs-on:
-      group: dahlia
-      labels: macOS
-
-    env:
-      python_version: "3.14"
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Install dependencies
-        run: |
-          brew install ninja cmake
-          brew install gmp isl nlohmann-json boost
-          brew install libomp
-          brew install uv
-
-      - name: Set up Python ${{ env.python_version }}
-        run: |
-          uv python install ${{ env.python_version }}
-          uv venv --python ${{ env.python_version }} .venv
-          echo "$PWD/.venv/bin" >> $GITHUB_PATH
-          echo "PYTHONPATH=$PWD/python" >> $GITHUB_ENV
-
-      - name: Install Python dependencies
-        run: |
-          uv pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core
-          uv pip install numpy scipy ml_dtypes
-
-      - name: Build
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DPYTHON_BUILD_FRONTEND=ON \
-            -Dpybind11_DIR=$GITHUB_WORKSPACE/.venv/lib/python${{ env.python_version }}/site-packages/pybind11/share/cmake/pybind11 \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \
-            ..
-          ninja
-
-      - name: Unit Tests
-        run: |
-          cd build/
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
-          ./tutorial/printf_target/tests/printf_target_test
-
-      - name: Test Arg-Capture-IO
-        run: |
-          cd build
-          ./arg-capture-io/tests/capture_io_test
-
-      - name: Python Unit Tests
-        env:
-          DOCC_ACCESS_TOKEN: ${{ secrets.DOCC_CI_TOKEN }}
-        run: |
-          export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-          pytest -v python/tests -m "unmarked"
-
-      - name: Python Integration Tests
-        run: |
-          export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-          pytest -v python/benchmarks/
-
-      # - name: Test RTL
-      #   run: |
-      #     export CPATH=/usr/local/include:$CPATH
-      #     export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH
-      #     export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-      #     export PATH=/usr/local/bin:$PATH
-
-      #     pip install pytest==7.1.3 --break-system-packages
-      #     pip install pytest-parallel --break-system-packages
-
-      #     cd rtl/tests
-      #     pytest -v -s rtl_tests.py
diff --git a/.github/workflows/unit_tests_release.yml b/.github/workflows/unit_tests_release.yml
deleted file mode 100644
index e881251bf..000000000
--- a/.github/workflows/unit_tests_release.yml
+++ /dev/null
@@ -1,113 +0,0 @@
-name: Unit Tests - Release
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
-  schedule:
-    - cron: "0 4 * * *"
-
-jobs:
-  release-linux:
-    runs-on:
-      group: dahlia
-      labels: RTX5060
-    container:
-      image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64
-      options: >-
-        --cap-add=PERFMON
-        --gpus=all
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Mark GitHub Actions workdir as safe
-        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
-      - name: Formatting
-        shell: bash
-        run: |
-          shopt -s globstar
-          clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp
-          clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp
-
-      - name: Build and test
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_C_COMPILER=clang-19 \
-            -DCMAKE_CXX_COMPILER=clang++-19 \
-            -DCMAKE_INSTALL_PREFIX=/usr/local \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DLLVM_BUILD_FRONTEND=ON \
-            -DLLVM_BUILD_TESTS=ON \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF  \
-            ..
-          ninja -j$(nproc)
-          ninja install
-
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
-          ./llvm/tests/docc_llvm_pass_test
-
-      - name: Test RTL
-        run: |
-          export CPATH=/usr/local/include:$CPATH
-          export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH
-          export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-          export PATH=/usr/local/bin:$PATH
-
-          pip install pytest==7.1.3 --break-system-packages
-          pip install pytest-parallel --break-system-packages
-
-          cd rtl/tests
-          pytest -v -s rtl_tests.py
-
-  release-macos:
-    runs-on:
-      group: dahlia
-      labels: macOS
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Install dependencies
-        run: |
-          brew install ninja cmake
-          brew install gmp isl nlohmann-json boost
-          brew install libomp
-
-      - name: Build
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DBUILD_TESTS:BOOL=OFF \
-            -DBUILD_BENCHMARKS:BOOL=OFF \
-            -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF  \
-            ..
-          ninja
-
-      - name: Unit Tests
-        run: |
-          cd build/
-          ./sdfg/tests/sdfglib_test
-          ./opt/tests/sdfgopt_test
-          ./tutorial/printf_target/tests/printf_target_test
-
-      - name: Test Arg-Capture-IO
-        run: |
-          cd build
-          ./arg-capture-io/tests/capture_io_test
diff --git a/mlir/benchmarks/harness.py b/mlir/benchmarks/harness.py
index 08f3a1783..2852a85be 100644
--- a/mlir/benchmarks/harness.py
+++ b/mlir/benchmarks/harness.py
@@ -3,11 +3,13 @@
 import time
 import docc.torch
 
+
 def run_benchmark(setup_func, name):
     parser = argparse.ArgumentParser()
     parser.add_argument("--docc", action="store_true")
     parser.add_argument("--torch", action="store_true")
     parser.add_argument("--target", type=str, default="none")
+    parser.add_argument("--remote_tuning", action="store_true")
     parser.add_argument("--n_runs", type=int, default=10)
     args = parser.parse_args()
 
@@ -24,15 +26,26 @@ def run_benchmark(setup_func, name):
                     program(model_input)
             end = time.time()
             print(f"{name} torch execution time: {end - start:.6f} seconds")
-    
+
     if args.docc:
         for _ in range(args.n_runs):
             start = time.time()
             with torch.no_grad():
-                program = torch.compile(model, backend="docc", options={"target": args.target, "category": "server"})
+                program = torch.compile(
+                    model,
+                    backend="docc",
+                    options={
+                        "target": args.target,
+                        "category": "server",
+                        "remote_tuning": args.remote_tuning,
+                    },
+                )
                 if type(model_input) == tuple:
                     program(*model_input)
                 else:
                     program(model_input)
             end = time.time()
-            print(f"{name} docc execution time: {end - start:.6f} seconds")
+            print(
+                f"{name} docc execution time: {end - start:.6f} seconds "
+                f"(remote_tuning={args.remote_tuning})"
+            )
diff --git a/mlir/benchmarks/torch/layers/softmax.py b/mlir/benchmarks/torch/layers/softmax.py
new file mode 100644
index 000000000..3711fc58f
--- /dev/null
+++ b/mlir/benchmarks/torch/layers/softmax.py
@@ -0,0 +1,69 @@
+import torch
+import torch.nn as nn
+
+from benchmarks.harness import run_benchmark
+
+
+class SoftmaxNet(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.softmax = nn.Softmax(dim=dim)
+
+    def forward(self, x: torch.Tensor):
+        return self.softmax(x)
+
+
+class LogSoftmaxNet(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.log_softmax = nn.LogSoftmax(dim=dim)
+
+    def forward(self, x: torch.Tensor):
+        return self.log_softmax(x)
+
+
+# batch=64, classes=1000 — classifier output
+def setup_softmax_classifier():
+    model = SoftmaxNet(dim=1)
+    x = torch.randn(64, 1000)
+    return model, x
+
+
+# batch=64, seq_len=512, features=768 — transformer-style attention scores
+def setup_softmax_attention():
+    model = SoftmaxNet(dim=-1)
+    x = torch.randn(64, 512, 768)
+    return model, x
+
+
+# batch=64, classes=1000 — log-softmax for NLLLoss
+def setup_log_softmax():
+    model = LogSoftmaxNet(dim=1)
+    x = torch.randn(64, 1000)
+    return model, x
+
+
+BENCHMARKS = {
+    "softmax_classifier": setup_softmax_classifier,
+    "softmax_attention": setup_softmax_attention,
+    "log_softmax": setup_log_softmax,
+}
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Softmax layer benchmarks")
+    parser.add_argument(
+        "--variant",
+        type=str,
+        choices=list(BENCHMARKS.keys()),
+        default="softmax_classifier",
+        help="Softmax variant to benchmark",
+    )
+    args, remaining = parser.parse_known_args()
+
+    import sys
+
+    sys.argv = [sys.argv[0]] + remaining
+
+    run_benchmark(BENCHMARKS[args.variant], args.variant)
diff --git a/mlir/benchmarks/torch/model_zoo/segformer_profile.py b/mlir/benchmarks/torch/model_zoo/segformer_profile.py
new file mode 100644
index 000000000..89849874e
--- /dev/null
+++ b/mlir/benchmarks/torch/model_zoo/segformer_profile.py
@@ -0,0 +1,238 @@
+import argparse
+import time
+
+import torch
+from torch.profiler import ProfilerActivity, profile
+from transformers import SegformerForSemanticSegmentation
+
+import docc.torch
+
+
+SEGFORMER_MODELS = {
+    "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024",
+    "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024",
+    "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024",
+    "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024",
+    "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024",
+    "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024",
+}
+
+
+def resolve_model_name(version: str, model: str | None) -> str:
+    if model:
+        return model
+    return SEGFORMER_MODELS[version]
+
+
+def _assert_cuda_arch_supported() -> None:
+    capability = torch.cuda.get_device_capability()
+    current_arch = f"sm_{capability[0]}{capability[1]}"
+    supported_arches = set(torch.cuda.get_arch_list())
+    if current_arch not in supported_arches:
+        supported_str = " ".join(sorted(supported_arches))
+        raise RuntimeError(
+            "The active PyTorch CUDA build does not support this GPU architecture "
+            f"({current_arch}). Supported architectures: {supported_str}. "
+            "Install a compatible CUDA wheel (for RTX 50xx typically cu128+), "
+            "or run with --device cpu."
+        )
+
+
+def setup_segformer(
+    model_name: str,
+    model_device: str,
+    image_size: int,
+    input_device: str | None = None,
+) -> tuple[torch.nn.Module, torch.Tensor]:
+    if input_device is None:
+        input_device = model_device
+
+    model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval()
+    if model_device == "cuda":
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA requested but not available")
+        _assert_cuda_arch_supported()
+        model = model.to("cuda")
+
+    if input_device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA input requested but not available")
+
+    model_input = torch.randn(1, 3, image_size, image_size, device=input_device)
+    return model, model_input
+
+
+def _model_device(model: torch.nn.Module) -> torch.device:
+    try:
+        return next(model.parameters()).device
+    except StopIteration:
+        return torch.device("cpu")
+
+
+def _materialize_output(res: object) -> None:
+    if isinstance(res, dict):
+        _ = {k: v.cpu() if torch.is_tensor(v) else v for k, v in res.items()}
+    elif hasattr(res, "logits") and torch.is_tensor(res.logits):
+        _ = res.logits.cpu()
+
+
+def _run_once(program: torch.nn.Module, model_input: torch.Tensor, model_dev: torch.device) -> None:
+    current_input = model_input
+    if current_input.device != model_dev:
+        current_input = current_input.to(model_dev, non_blocking=True)
+
+    res = program(pixel_values=current_input)
+    _materialize_output(res)
+    if model_dev.type == "cuda":
+        torch.cuda.synchronize(model_dev)
+
+
+def run_torch_profile(model: torch.nn.Module, model_input: torch.Tensor, n_runs: int, trace_prefix: str) -> None:
+    model_dev = _model_device(model)
+    with torch.no_grad():
+        compile_start = time.perf_counter()
+        program = torch.compile(model)
+        _run_once(program, model_input, model_dev)
+        compile_end = time.perf_counter()
+        print(f"Torch compile+first-run: {(compile_end - compile_start):.6f} s")
+
+        _run_once(program, model_input, model_dev)
+        activities = [ProfilerActivity.CPU]
+        if model_dev.type == "cuda":
+            activities.append(ProfilerActivity.CUDA)
+
+        for i in range(n_runs):
+            start = time.perf_counter()
+            with profile(activities=activities, record_shapes=True) as prof:
+                _run_once(program, model_input, model_dev)
+            end = time.perf_counter()
+
+            trace_path = f"{trace_prefix}_torch_{i}.json"
+            prof.export_chrome_trace(trace_path)
+            print(f"Torch runtime run {i}: {(end - start):.6f} s, trace={trace_path}")
+
+
+def run_docc_profile(
+    model: torch.nn.Module,
+    model_input: torch.Tensor,
+    n_runs: int,
+    target: str,
+    remote_tuning: bool,
+    trace_prefix: str,
+) -> None:
+    model_dev = _model_device(model)
+    with torch.no_grad():
+        compile_start = time.perf_counter()
+        program = torch.compile(
+            model,
+            backend="docc",
+            options={"target": target, "category": "server", "remote_tuning": remote_tuning},
+        )
+        _run_once(program, model_input, model_dev)
+        compile_end = time.perf_counter()
+        print(
+            f"DOCC compile+first-run ({target}, remote_tuning={remote_tuning}): "
+            f"{(compile_end - compile_start):.6f} s"
+        )
+
+        _run_once(program, model_input, model_dev)
+        activities = [ProfilerActivity.CPU]
+        if model_dev.type == "cuda":
+            activities.append(ProfilerActivity.CUDA)
+
+        for i in range(n_runs):
+            start = time.perf_counter()
+            with profile(activities=activities, record_shapes=True) as prof:
+                _run_once(program, model_input, model_dev)
+            end = time.perf_counter()
+
+            trace_path = f"{trace_prefix}_docc_{target}_{i}.json"
+            prof.export_chrome_trace(trace_path)
+            print(f"DOCC runtime run {i}: {(end - start):.6f} s, trace={trace_path}")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Profile SegFormer with Torch and/or DOCC backend")
+    parser.add_argument("--docc", action="store_true", help="Run DOCC backend")
+    parser.add_argument("--torch", action="store_true", dest="run_torch", help="Run Torch backend")
+    parser.add_argument(
+        "--version",
+        type=str,
+        choices=list(SEGFORMER_MODELS.keys()),
+        default="b0",
+        help="SegFormer variant to use when --model is not provided",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=None,
+        help="Optional Hugging Face model id to override --version",
+    )
+    parser.add_argument("--target", type=str, default="none", help="DOCC target")
+    parser.add_argument(
+        "--remote_tuning",
+        action="store_true",
+        help="Enable DOCC remote tuning during compilation",
+    )
+    parser.add_argument("--n_runs", type=int, default=10, help="Number of runs per backend")
+    parser.add_argument(
+        "--device",
+        type=str,
+        choices=["cpu", "cuda"],
+        default="cpu",
+        help="Device for model and input tensor",
+    )
+    parser.add_argument(
+        "--input_device",
+        type=str,
+        choices=["cpu", "cuda"],
+        default=None,
+        help="Device where input tensor is created (defaults to --device)",
+    )
+    parser.add_argument("--image_size", type=int, default=512, help="Input image size")
+    parser.add_argument(
+        "--trace_prefix",
+        type=str,
+        default="segformer_trace",
+        help="Prefix for exported Torch profiler traces",
+    )
+    args = parser.parse_args()
+
+    if not args.docc and not args.run_torch:
+        parser.error("Specify at least one backend: --torch and/or --docc")
+
+    return args
+
+
+def main() -> None:
+    args = parse_args()
+    model_name = resolve_model_name(args.version, args.model)
+    input_device = args.input_device if args.input_device is not None else args.device
+    model, model_input = setup_segformer(
+        model_name,
+        args.device,
+        args.image_size,
+        input_device=input_device,
+    )
+
+    print(f"Model: {model_name}")
+    print(f"Device: {args.device}")
+    print(f"Input device: {input_device}")
+    print(f"Remote tuning: {args.remote_tuning}")
+    print(f"Runs: {args.n_runs}")
+
+    if args.run_torch:
+        run_torch_profile(model, model_input, args.n_runs, args.trace_prefix)
+
+    if args.docc:
+        run_docc_profile(
+            model,
+            model_input,
+            args.n_runs,
+            args.target,
+            args.remote_tuning,
+            args.trace_prefix,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py
index a70c186d9..b8c75e1ff 100644
--- a/mlir/benchmarks/torch/model_zoo/segformer_test.py
+++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py
@@ -1,3 +1,4 @@
+import argparse
 import time
 
 import torch
@@ -10,33 +11,62 @@
 import docc.torch
 
 import os
-os.environ["DOCC_STATISTICS"] = "1"
-os.environ["DOCC_PROFILE_COMPILE"] = "1"
-os.environ["DOCC_DEBUG"] = "dump"
+#os.environ["DOCC_STATISTICS"] = "1"
+#os.environ["DOCC_PROFILE_COMPILE"] = "1"
+#os.environ["DOCC_DEBUG"] = "dump"
 
 
+SEGFORMER_MODELS = {
+    "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024",
+    "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024",
+    "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024",
+    "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024",
+    "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024",
+    "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024",
+}
+
+
+def resolve_model_name(version, model):
+    if model:
+        return model
+    return SEGFORMER_MODELS[version]
+
+
+def get_test_model_name():
+    version = os.getenv("SEGFORMER_VERSION", "b2")
+    if version not in SEGFORMER_MODELS:
+        raise ValueError(
+            f"Unsupported SEGFORMER_VERSION '{version}'. "
+            f"Expected one of: {', '.join(SEGFORMER_MODELS.keys())}"
+        )
+    return resolve_model_name(version, None)
+
+@pytest.mark.skipif(not os.environ.get("SLOW_TESTS", ""), reason="slow test")
 def test_backend():
-    model = SegformerForSemanticSegmentation.from_pretrained(
-        "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"
-    ).eval()
-    model_ref = SegformerForSemanticSegmentation.from_pretrained(
-        "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"
-    ).eval()
+    model_name = get_test_model_name()
+    model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval()
+    model_ref = SegformerForSemanticSegmentation.from_pretrained(model_name).eval()
     model_ref.load_state_dict(model.state_dict())
 
     example_input = torch.randn(1, 3, 512, 512)
 
     start = time.perf_counter()
-    program = torch.compile(model, backend="docc", options={"target": "none", "category": "server"})
+    program = torch.compile(model, backend="docc", options={"target": "cuda", "category": "server"})
     end = time.perf_counter()
     print(f"compilation time: {(end - start) * 1000:.2f} ms")
+
+    start = time.perf_counter()
+    ref_program = torch.compile(model, backend="docc", options={"target": "cuda", "category": "server"})
+    end = time.perf_counter()
+    print(f"ref compilation time: {(end - start) * 1000:.2f} ms")
+
     with torch.no_grad():
         start = time.perf_counter()
         res = program(pixel_values=example_input)
         end = time.perf_counter()
         print(f"inference time: {(end - start) * 1000:.2f} ms")
         start = time.perf_counter()
-        res_ref = model_ref(pixel_values=example_input)
+        res_ref = ref_program(pixel_values=example_input)
         end = time.perf_counter()
         print(f"reference inference time: {(end - start) * 1000:.2f} ms")
         for k in range(res.logits.shape[0]):
@@ -54,12 +84,9 @@ def test_backend():
 
 @pytest.mark.skip("Skip")
 def test_compile():
-    model = SegformerForSemanticSegmentation.from_pretrained(
-        "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"
-    ).eval()
-    model_ref = SegformerForSemanticSegmentation.from_pretrained(
-        "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"
-    ).eval()
+    model_name = get_test_model_name()
+    model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval()
+    model_ref = SegformerForSemanticSegmentation.from_pretrained(model_name).eval()
     model_ref.load_state_dict(model.state_dict())
 
     example_input = torch.randn(1, 3, 512, 512)
@@ -77,9 +104,7 @@ def test_compile():
     assert torch.allclose(res, res_ref.logits, rtol=1e-4)
 
 def find_used_dialects():
-    model = SegformerForSemanticSegmentation.from_pretrained(
-        "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"
-    ).eval()
+    model = SegformerForSemanticSegmentation.from_pretrained(get_test_model_name()).eval()
 
     example_input = torch.randn(1, 3, 512, 512)
 
@@ -101,25 +126,37 @@ def find_used_dialects():
 
     # print(mlir_str)
 
-def benchmark_segformer(model_name):
+def benchmark_segformer(model_name, backend="torch", target="none", device="cpu", remote_tuning=False):
     model = SegformerForSemanticSegmentation.from_pretrained(
         model_name
     ).eval()
 
-    example_input = torch.randn(1, 3, 1024, 1024)
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA requested but not available")
+
+    if device == "cuda":
+        model = model.to("cuda")
+
+    example_input = torch.randn(1, 3, 1024, 1024, device=device)
+
+    compile_kwargs = {}
+    if backend == "docc":
+        compile_kwargs = {
+            "backend": "docc",
+            "options": {"target": target, "category": "server", "remote_tuning": remote_tuning},
+        }
 
-    program = torch.compile(model)
+    program = torch.compile(model, **compile_kwargs)
     with torch.no_grad():
         # Warmup
         res = program(pixel_values=example_input)
 
         import time
-        import math
         from scipy import stats as scipy_stats
 
         times = []
-        min_samples = 5
-        max_samples = 500
+        min_samples = 1
+        max_samples = 5
         target_rel_ci = 0.01  # stop when 95% CI half-width < 1% of mean
 
         while len(times) < max_samples:
@@ -144,14 +181,151 @@ def benchmark_segformer(model_name):
     sem = scipy_stats.sem(times)
     half_width = scipy_stats.t.ppf(0.975, df=n - 1) * sem
     print(f"Benchmarking {model_name}:")
+    print(f"Remote tuning: {remote_tuning}")
     print(f"Average inference time: {mean:.2f} ms (n={n})")
     print(f"95% CI: [{mean - half_width:.2f}, {mean + half_width:.2f}] ms  (±{half_width:.2f} ms)")
 
+
+def setup_segformer_benchmark(model_name):
+    model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval()
+    example_input = torch.randn(1, 3, 512, 512)
+    return model, example_input
+
+
+def profile_segformer(
+    model_name,
+    backend="torch",
+    target="none",
+    device="cpu",
+    input_device=None,
+    remote_tuning=False,
+    n_runs=10,
+    image_size=512,
+    trace_prefix="segformer_trace",
+):
+    from segformer_profile import setup_segformer, run_torch_profile, run_docc_profile
+
+    model, model_input = setup_segformer(
+        model_name,
+        device,
+        image_size,
+        input_device=input_device,
+    )
+    if backend == "torch":
+        run_torch_profile(model, model_input, n_runs, trace_prefix)
+    elif backend == "docc":
+        run_docc_profile(model, model_input, n_runs, target, remote_tuning, trace_prefix)
+    elif backend == "both":
+        run_torch_profile(model, model_input, n_runs, trace_prefix)
+        run_docc_profile(model, model_input, n_runs, target, remote_tuning, trace_prefix)
+    else:
+        raise ValueError(f"Unsupported backend '{backend}' for profiling")
+
 if __name__ == "__main__":
-    # find_used_dialects()
-    find_used_dialects()
-    #benchmark_segformer("nvidia/segformer-b1-finetuned-cityscapes-1024-1024")
-    #benchmark_segformer("nvidia/segformer-b2-finetuned-cityscapes-1024-1024")
-    #benchmark_segformer("nvidia/segformer-b3-finetuned-cityscapes-1024-1024")
-    #benchmark_segformer("nvidia/segformer-b4-finetuned-cityscapes-1024-1024")
-    #benchmark_segformer("nvidia/segformer-b5-finetuned-cityscapes-1024-1024")
\ No newline at end of file
+    parser = argparse.ArgumentParser(description="segformer benchmark")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=None,
+        help="Optional Hugging Face model id to override --version",
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        choices=list(SEGFORMER_MODELS.keys()),
+        default="b0",
+        help="SegFormer variant used when --model is not provided",
+    )
+    parser.add_argument(
+        "--action",
+        type=str,
+        choices=["dialects", "benchmark", "benchmark_segformer", "profile"],
+        default="benchmark",
+        help="Run dialect dump or harness benchmark",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=["torch", "docc", "both"],
+        default="torch",
+        help="Backend for --action benchmark_segformer/profile",
+    )
+    parser.add_argument(
+        "--target",
+        type=str,
+        default="none",
+        help="DOCC target for --action benchmark_segformer (e.g. none, openmp, cuda)",
+    )
+    parser.add_argument(
+        "--remote_tuning",
+        action="store_true",
+        help="Enable DOCC remote tuning during benchmark/profile compilation",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        choices=["cpu", "cuda"],
+        default="cpu",
+        help="Tensor/model device for --action benchmark_segformer/profile",
+    )
+    parser.add_argument(
+        "--input_device",
+        type=str,
+        choices=["cpu", "cuda"],
+        default=None,
+        help="Input tensor device for --action profile (defaults to --device)",
+    )
+    parser.add_argument(
+        "--n_runs",
+        type=int,
+        default=10,
+        help="Number of runs for --action profile",
+    )
+    parser.add_argument(
+        "--image_size",
+        type=int,
+        default=512,
+        help="Input image size for --action profile",
+    )
+    parser.add_argument(
+        "--trace_prefix",
+        type=str,
+        default="segformer_trace",
+        help="Trace file prefix for --action profile torch runs",
+    )
+    args, remaining = parser.parse_known_args()
+    model_name = resolve_model_name(args.version, args.model)
+
+    import sys
+
+    if args.action == "dialects":
+        find_used_dialects()
+    elif args.action == "benchmark_segformer":
+        benchmark_segformer(
+            model_name,
+            backend=args.backend,
+            target=args.target,
+            device=args.device,
+            remote_tuning=args.remote_tuning,
+        )
+    elif args.action == "profile":
+        profile_segformer(
+            model_name,
+            backend=args.backend,
+            target=args.target,
+            device=args.device,
+            input_device=args.input_device,
+            remote_tuning=args.remote_tuning,
+            n_runs=args.n_runs,
+            image_size=args.image_size,
+            trace_prefix=args.trace_prefix,
+        )
+    else:
+        sys.argv = [sys.argv[0]] + remaining
+        from functools import partial
+        from benchmarks.harness import run_benchmark
+
+        run_benchmark(
+            partial(setup_segformer_benchmark, model_name),
+            f"segformer {model_name}",
+        )
diff --git a/mlir/docc/torch/torch_program.py b/mlir/docc/torch/torch_program.py
index ecd1aa16c..17c0fb526 100644
--- a/mlir/docc/torch/torch_program.py
+++ b/mlir/docc/torch/torch_program.py
@@ -530,6 +530,19 @@ def _docc_dynamo_compiler(gm, example_inputs, backend_options):
     """Dynamic Compiler based on TorchProgram (inference only)."""
     import torch
 
+    # Resolve SymInt/SymFloat values that dynamo passes as graph inputs when a
+    # model (e.g. SegFormer) unpacks tensor shapes and forwards them as explicit
+    # integer arguments to submodules.  torch.export.export cannot handle
+    # torch.SymInt; converting to concrete Python ints/floats is safe here
+    # because these values are always backed by a concrete shape at this point.
+    def _resolve(x):
+        if isinstance(x, torch.SymInt):
+            return int(x)
+        if isinstance(x, torch.SymFloat):
+            return float(x)
+        return x
+    example_inputs = [_resolve(inp) for inp in example_inputs]
+
     if len(example_inputs) == 1:
         example_input = example_inputs[0]
     else:
@@ -560,6 +573,14 @@ def _docc_aot_compiler(gm, example_inputs):
 
         import torch
 
+        def _resolve(x):
+            if isinstance(x, torch.SymInt):
+                return int(x)
+            if isinstance(x, torch.SymFloat):
+                return float(x)
+            return x
+        example_inputs = [_resolve(inp) for inp in example_inputs]
+
         if len(example_inputs) == 1:
             example_input = example_inputs[0]
         else:
diff --git a/opt/src/transformations/map_fusion.cpp b/opt/src/transformations/map_fusion.cpp
index 8cdb30f7d..f2be61250 100644
--- a/opt/src/transformations/map_fusion.cpp
+++ b/opt/src/transformations/map_fusion.cpp
@@ -1196,7 +1196,27 @@ void MapFusion::apply(builder::StructuredSDFGBuilder& builder, analysis::Analysi
         }
     }
 
-    analysis_manager.invalidate_all();
+    if (direction_ == FusionDirection::ProducerIntoConsumer) {
+        // The loop structure is unchanged after ProducerIntoConsumer: only new Block
+        // nodes are inserted into consumer_body_. Patch them into AssumptionsAnalysis
+        // so it stays valid, then preserve it (and LoopAnalysis) across the invalidation.
+        if (analysis_manager.has<analysis::AssumptionsAnalysis>()) {
+            size_t n = fusion_candidates_.size();
+            if (n < consumer_body_->size()) {
+                auto& aa = analysis_manager.get<analysis::AssumptionsAnalysis>();
+                // Original consumer blocks were shifted to index n..size-1; use
+                // the first of them as the scope reference for the new blocks.
+                auto& sibling = consumer_body_->at(n).first;
+                for (size_t i = 0; i < n; ++i) {
+                    aa.register_node(consumer_body_->at(i).first, sibling);
+                }
+            }
+        }
+        analysis_manager.preserve<analysis::AssumptionsAnalysis, analysis::LoopAnalysis>();
+    } else {
+        // ConsumerIntoProducer removes the consumer loop node entirely — full invalidation.
+        analysis_manager.invalidate_all();
+    }
     applied_ = true;
 }
 
diff --git a/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp b/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp
index 842859697..03044339d 100644
--- a/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp
+++ b/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp
@@ -157,10 +157,11 @@ TEST(CudaTransformIm2colTest, CollapsedTwoDimMap) {
     analysis::AnalysisManager analysis_manager(builder.subject());
     CUDATransform transform(outer_map, /*block_size=*/32);
 
-    // The outer map of the collapsed im2col pattern must be recognised as
-    // offloadable to a single CUDA kernel.
+    // Regression: this expects `true`; the failing main branch returns `false`
+    // and the offload pipeline keeps the map on the host.
     EXPECT_TRUE(transform.can_be_applied(builder, analysis_manager))
-        << "OffloadTransform should accept the collapsed im2col map.";
+        << "OffloadTransform regressed on collapsed im2col map: the outer map "
+           "is no longer recognised as offloadable.";
 }
 
 TEST(CudaTransformIm2colTest, ExplicitSixDimMap) {
diff --git a/sdfg/include/sdfg/analysis/analysis.h b/sdfg/include/sdfg/analysis/analysis.h
index 2ccb2efc2..14e7d9eb3 100644
--- a/sdfg/include/sdfg/analysis/analysis.h
+++ b/sdfg/include/sdfg/analysis/analysis.h
@@ -73,6 +73,11 @@ class AnalysisManager {
         return *static_cast<T*>(cache_[type].get());
     }
 
+    template<class T>
+    bool has() const {
+        return cache_.find(std::type_index(typeid(T))) != cache_.end();
+    }
+
     template<class T>
     void invalidate() {
         std::type_index type = std::type_index(typeid(T));
diff --git a/sdfg/include/sdfg/analysis/assumptions_analysis.h b/sdfg/include/sdfg/analysis/assumptions_analysis.h
index e21777adb..dd49baf33 100644
--- a/sdfg/include/sdfg/analysis/assumptions_analysis.h
+++ b/sdfg/include/sdfg/analysis/assumptions_analysis.h
@@ -84,6 +84,13 @@ class AssumptionsAnalysis : public Analysis {
 
     const symbolic::Assumptions& get(structured_control_flow::ControlFlowNode& node, bool include_trivial_bounds = false);
 
+    // Register a newly created node so it inherits the same scope assumptions as
+    // sibling_node. Call this after inserting nodes into a sequence to keep the
+    // cached analysis valid without a full re-run.
+    void register_node(
+        structured_control_flow::ControlFlowNode& new_node, structured_control_flow::ControlFlowNode& sibling_node
+    );
+
     const symbolic::SymbolSet& parameters();
 
     bool is_parameter(const symbolic::Symbol& container);
diff --git a/sdfg/include/sdfg/passes/symbolic/type_minimization.h b/sdfg/include/sdfg/passes/symbolic/type_minimization.h
index 09ae42998..6f3db0951 100644
--- a/sdfg/include/sdfg/passes/symbolic/type_minimization.h
+++ b/sdfg/include/sdfg/passes/symbolic/type_minimization.h
@@ -8,7 +8,6 @@
 #include "sdfg/element.h"
 #include "sdfg/passes/pass.h"
 #include "sdfg/structured_control_flow/block.h"
-#include "sdfg/symbolic/extreme_values.h"
 #include "sdfg/visitor/structured_sdfg_visitor.h"
 
 namespace sdfg {
@@ -16,7 +15,7 @@ namespace passes {
 
 class TypeMinimization : public visitor::NonStoppingStructuredSDFGVisitor {
 private:
-    bool is_safe_trunc(symbolic::Expression expr, symbolic::BoundAnalysis& ba_tight, symbolic::BoundAnalysis& ba_loose);
+    bool is_safe_trunc(symbolic::Expression expr, const symbolic::Assumptions& assumptions);
 
 public:
     TypeMinimization(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager);
diff --git a/sdfg/src/analysis/assumptions_analysis.cpp b/sdfg/src/analysis/assumptions_analysis.cpp
index 67a3661ad..5ba56a9e7 100644
--- a/sdfg/src/analysis/assumptions_analysis.cpp
+++ b/sdfg/src/analysis/assumptions_analysis.cpp
@@ -597,6 +597,19 @@ const symbolic::Assumptions& AssumptionsAnalysis::
     }
 }
 
+void AssumptionsAnalysis::register_node(
+    structured_control_flow::ControlFlowNode& new_node, structured_control_flow::ControlFlowNode& sibling_node
+) {
+    auto it = ref_assumptions_.find(&sibling_node);
+    if (it != ref_assumptions_.end()) {
+        ref_assumptions_[&new_node] = it->second;
+    }
+    auto it2 = ref_assumptions_with_trivial_.find(&sibling_node);
+    if (it2 != ref_assumptions_with_trivial_.end()) {
+        ref_assumptions_with_trivial_[&new_node] = it2->second;
+    }
+}
+
 const symbolic::SymbolSet& AssumptionsAnalysis::parameters() { return this->parameters_; }
 
 bool AssumptionsAnalysis::is_parameter(const symbolic::Symbol& container) {
diff --git a/sdfg/src/analysis/data_dependency_analysis.cpp b/sdfg/src/analysis/data_dependency_analysis.cpp
index 61840fb1a..d79d1b695 100644
--- a/sdfg/src/analysis/data_dependency_analysis.cpp
+++ b/sdfg/src/analysis/data_dependency_analysis.cpp
@@ -745,16 +745,11 @@ bool DataDependencyAnalysis::
     auto current_scope = Users::scope(&current);
     auto& current_assumptions = assumptions_analysis.get(*current_scope, true);
 
-    // One AssumptionsBounds per side, shared across the whole subset-pair scan.
-    // The original used `previous_assumptions, previous_assumptions` (both
-    // sides of `is_subset`), so we only need one bounds object here.
-    symbolic::AssumptionsBounds previous_bounds(previous_assumptions);
-
     // Check if previous subset is subset of any current subset
     for (auto& previous_subset : previous_subsets) {
         bool found = false;
         for (auto& current_subset : current_subsets) {
-            if (symbolic::is_subset(previous_subset, current_subset, previous_bounds, previous_bounds)) {
+            if (symbolic::is_subset(previous_subset, current_subset, previous_assumptions, previous_assumptions)) {
                 found = true;
                 break;
             }
@@ -814,7 +809,6 @@ bool DataDependencyAnalysis::fully_covered(
 
     auto& assumptions_analysis = this->ensure_detailed_assumptions(analysis_manager);
     auto& current_assumptions = assumptions_analysis.get(*Users::scope(&current), true);
-    symbolic::AssumptionsBounds current_bounds(current_assumptions);
 
     // Each read subset must be contained in some single open writer's subset.
     for (auto& read_subset : current_subsets) {
@@ -824,9 +818,8 @@ bool DataDependencyAnalysis::fully_covered(
             if (w->container() != current.container()) continue;
             if (this->is_undefined_user(*w)) continue;
             auto& w_assumptions = assumptions_analysis.get(*Users::scope(w), true);
-            symbolic::AssumptionsBounds w_bounds(w_assumptions);
             for (auto& w_subset : w->subsets()) {
-                if (symbolic::is_subset(read_subset, w_subset, current_bounds, w_bounds)) {
+                if (symbolic::is_subset(read_subset, w_subset, current_assumptions, w_assumptions)) {
                     covered = true;
                     break;
                 }
@@ -868,14 +861,11 @@ bool DataDependencyAnalysis::intersects(User& previous, User& current, analysis:
     auto current_scope = Users::scope(&current);
     auto& current_assumptions = assumptions_analysis.get(*current_scope, true);
 
-    symbolic::AssumptionsBounds previous_bounds(previous_assumptions);
-    symbolic::AssumptionsBounds current_bounds(current_assumptions);
-
     // Check if any current subset intersects with any previous subset
     bool found = false;
     for (auto& current_subset : current_subsets) {
         for (auto& previous_subset : previous_subsets) {
-            if (!symbolic::is_disjoint(current_subset, previous_subset, current_bounds, previous_bounds)) {
+            if (!symbolic::is_disjoint(current_subset, previous_subset, current_assumptions, previous_assumptions)) {
                 found = true;
                 break;
             }
@@ -926,16 +916,13 @@ bool DataDependencyAnalysis::
     auto& previous_assumptions = assumptions_analysis.get(*previous_scope, true);
     auto& current_assumptions = assumptions_analysis.get(*current_scope, true);
 
-    symbolic::AssumptionsBounds previous_bounds(previous_assumptions);
-    symbolic::AssumptionsBounds current_bounds(current_assumptions);
-
     auto& previous_memlets = previous.subsets();
     auto& current_memlets = current.subsets();
 
     for (auto& subset_ : previous_memlets) {
         bool overwritten = false;
         for (auto& subset : current_memlets) {
-            if (symbolic::is_subset(subset_, subset, previous_bounds, current_bounds)) {
+            if (symbolic::is_subset(subset_, subset, previous_assumptions, current_assumptions)) {
                 overwritten = true;
                 break;
             }
@@ -974,16 +961,13 @@ bool DataDependencyAnalysis::depends(analysis::AnalysisManager& analysis_manager
     auto& previous_assumptions = assumptions_analysis.get(*previous_scope, true);
     auto& current_assumptions = assumptions_analysis.get(*current_scope, true);
 
-    symbolic::AssumptionsBounds previous_bounds(previous_assumptions);
-    symbolic::AssumptionsBounds current_bounds(current_assumptions);
-
     auto& previous_memlets = previous.subsets();
     auto& current_memlets = current.subsets();
 
     bool intersect_any = false;
     for (auto& current_subset : current_memlets) {
         for (auto& previous_subset : previous_memlets) {
-            if (!symbolic::is_disjoint(current_subset, previous_subset, current_bounds, previous_bounds)) {
+            if (!symbolic::is_disjoint(current_subset, previous_subset, current_assumptions, previous_assumptions)) {
                 intersect_any = true;
                 break;
             }
diff --git a/sdfg/src/analysis/loop_carried_dependency_analysis.cpp b/sdfg/src/analysis/loop_carried_dependency_analysis.cpp
index a334ebd8b..20ab265da 100644
--- a/sdfg/src/analysis/loop_carried_dependency_analysis.cpp
+++ b/sdfg/src/analysis/loop_carried_dependency_analysis.cpp
@@ -152,17 +152,15 @@ symbolic::maps::DependenceDeltas pair_deltas(
     }
 
     // Collect deltas across all subset pairs and union them.
-    symbolic::AssumptionsBounds previous_bounds(previous_assumptions);
-    symbolic::AssumptionsBounds current_bounds(current_assumptions);
-
     isl_ctx* union_ctx = nullptr;
     isl_set* accumulated = nullptr;
     std::vector<std::string> result_dimensions;
 
     for (auto& previous_subset : previous_subsets) {
         for (auto& current_subset : current_subsets) {
-            auto deltas = symbolic::maps::
-                dependence_deltas(previous_subset, current_subset, loop.indvar(), previous_bounds, current_bounds);
+            auto deltas = symbolic::maps::dependence_deltas(
+                previous_subset, current_subset, loop.indvar(), previous_assumptions, current_assumptions
+            );
             if (deltas.empty) {
                 continue;
             }
diff --git a/sdfg/src/analysis/memory_layout_analysis.cpp b/sdfg/src/analysis/memory_layout_analysis.cpp
index 062284ef6..0a057230d 100644
--- a/sdfg/src/analysis/memory_layout_analysis.cpp
+++ b/sdfg/src/analysis/memory_layout_analysis.cpp
@@ -228,7 +228,21 @@ void MemoryLayoutAnalysis::
 
                 auto result = symbolic::delinearize(linearized_expr, assumptions);
                 if (!result.success) {
-                    continue; // Delinearization failed, skip
+                    // Fallback: register the access as a 1D contiguous range over the
+                    // raw linearized address. We lose multi-dim layout info, but the
+                    // scope-level merge can still bound the access via BoundAnalysis,
+                    // which is enough for downstream consumers like ArgumentsAnalysis
+                    // to compute argument sizes. This recovers patterns where the
+                    // delinearizer rejects the access (e.g. halo offsets producing
+                    // negative constants inside a stride product, or non-strictly-
+                    // dominating strides) but the overall address range is still
+                    // soundly bounded by the enclosing loop assumptions.
+                    symbolic::MultiExpression shape;
+                    shape.push_back(symbolic::symbol("__unbounded__"));
+                    MemoryLayout layout(shape);
+                    MemoryAccess layout_info{container_name, {linearized_expr}, layout, false};
+                    this->accesses_.emplace(&memlet, layout_info);
+                    continue;
                 }
 
                 // Delinearization returns N indices but only N-1 dimensions (from stride division)
diff --git a/sdfg/src/passes/symbolic/type_minimization.cpp b/sdfg/src/passes/symbolic/type_minimization.cpp
index db13ceade..63a31f879 100644
--- a/sdfg/src/passes/symbolic/type_minimization.cpp
+++ b/sdfg/src/passes/symbolic/type_minimization.cpp
@@ -13,13 +13,12 @@ namespace passes {
 TypeMinimization::TypeMinimization(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager)
     : visitor::NonStoppingStructuredSDFGVisitor(builder, analysis_manager) {};
 
-bool TypeMinimization::
-    is_safe_trunc(symbolic::Expression expr, symbolic::BoundAnalysis& ba_tight, symbolic::BoundAnalysis& ba_loose) {
+bool TypeMinimization::is_safe_trunc(symbolic::Expression expr, const symbolic::Assumptions& assumptions) {
     size_t output_bitwidth = 32;
     int64_t output_min_value_signed = 0;
     int64_t output_max_value_signed = (1ULL << (output_bitwidth - 1)) - 1;
 
-    auto mini = ba_tight.lower_bound(expr);
+    auto mini = symbolic::minimum(expr, {}, assumptions, true);
     if (mini.is_null()) {
         return false;
     }
@@ -28,7 +27,7 @@ bool TypeMinimization::
         return false;
     }
 
-    auto maxi = ba_loose.upper_bound(expr);
+    auto maxi = symbolic::maximum(expr, {}, assumptions, false);
     if (maxi.is_null()) {
         return false;
     }
@@ -46,13 +45,6 @@ bool TypeMinimization::accept(structured_control_flow::Block& block) {
     auto& assumptions_analysis = this->analysis_manager_.get<analysis::AssumptionsAnalysis>();
     auto& block_assumptions = assumptions_analysis.get(block, true);
 
-    // One BoundAnalysis pair for the whole block: every is_safe_trunc call here
-    // shares the same empty parameter set and the same assumptions, so the
-    // internal cache amortizes across all truncs in the block.
-    static const symbolic::SymbolSet no_params;
-    symbolic::BoundAnalysis ba_tight(no_params, block_assumptions, true);
-    symbolic::BoundAnalysis ba_loose(no_params, block_assumptions, false);
-
     symbolic::ExpressionMap replacements;
     for (auto& edge : dfg.edges()) {
         auto& subset = edge.subset();
@@ -67,7 +59,7 @@ bool TypeMinimization::accept(structured_control_flow::Block& block) {
                     continue;
                 }
                 auto arg = trunc_func->get_args()[0];
-                if (!this->is_safe_trunc(arg, ba_tight, ba_loose)) {
+                if (!this->is_safe_trunc(arg, block_assumptions)) {
                     continue;
                 }
 
@@ -101,10 +93,6 @@ bool TypeMinimization::accept(structured_control_flow::For& loop) {
     auto& assumptions_analysis = this->analysis_manager_.get<analysis::AssumptionsAnalysis>();
     auto& block_assumptions = assumptions_analysis.get(loop, true);
 
-    static const symbolic::SymbolSet no_params;
-    symbolic::BoundAnalysis ba_tight(no_params, block_assumptions, true);
-    symbolic::BoundAnalysis ba_loose(no_params, block_assumptions, false);
-
     symbolic::ExpressionMap replacements;
     auto truncs = symbolic::find<SymEngine::FunctionSymbol>(loop.condition());
     for (auto& trunc : truncs) {
@@ -116,7 +104,7 @@ bool TypeMinimization::accept(structured_control_flow::For& loop) {
             continue;
         }
         auto arg = trunc_func->get_args()[0];
-        if (!this->is_safe_trunc(arg, ba_tight, ba_loose)) {
+        if (!this->is_safe_trunc(arg, block_assumptions)) {
             continue;
         }