diff --git a/.daisy/mlir_torch_layers.yml b/.daisy/mlir_torch_layers.yml deleted file mode 100644 index 7716791e9..000000000 --- a/.daisy/mlir_torch_layers.yml +++ /dev/null @@ -1,244 +0,0 @@ -on: - push: - branches: - - main - schedule: - - cron: '0 0 * * *' - -parameters: - container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - timeout: 150 - partitions: - - chamomile - -steps: - build: | - python3.11 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy - - pip install --no-build-isolation -e python/ - pip install --no-build-isolation -e mlir/ - - pip install -r mlir/requirements.txt - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --n_runs=1 --docc --target=cuda - - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --n_runs=1 --docc --target=cuda - - run: - - # layer batchnorm - - batchnorm_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --torch - energy: true - measurements: 3 - batchnorm_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - batchnorm_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - batchnorm_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - batchnorm_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/batchnorm.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer conv2d - - conv2d_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --torch - energy: true - measurements: 3 - # conv2d_run_none: - # command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=none - # energy: true - # env: - # DOCC_CI: regions - # DOCC_REUSE_BINARIES: 1 - # conv2d_run_sequential: - # command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=sequential - # energy: true - # env: - # DOCC_CI: regions - # DOCC_REUSE_BINARIES: 1 - conv2d_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - conv2d_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/conv2d.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer linear - - linear_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --torch - energy: true - measurements: 3 - linear_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - linear_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - linear_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - linear_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/linear.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer matmul - - matmul_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --torch - energy: true - measurements: 3 - matmul_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - matmul_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - matmul_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - matmul_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/matmul.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer pooling - - pooling_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --torch - energy: true - measurements: 3 - pooling_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - pooling_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - pooling_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - pooling_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/pooling.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - - # layer relu - - relu_torch: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --torch - energy: true - measurements: 3 - relu_run_none: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - relu_run_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - relu_run_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - relu_run_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/layers/relu.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_models.yml b/.daisy/mlir_torch_models.yml deleted file mode 100644 index 8b917b6a0..000000000 --- a/.daisy/mlir_torch_models.yml +++ /dev/null @@ -1,66 +0,0 @@ -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -parameters: - container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - timeout: 120 - partitions: - - chamomile - -steps: - build: | - python3.11 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy - - pip install --no-build-isolation -e python/ - pip install --no-build-isolation -e mlir/ - - pip install -r mlir/requirements.txt - - # Warm start - - venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --torch - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=none - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=sequential - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=openmp - DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --n_runs=1 --docc --target=cuda - - run: - - # model resnet18 - - resnet18_torch: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --torch - energy: true - resnet18_docc_none: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=none - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - resnet18_docc_sequential: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=sequential - energy: true - env: - DOCC_CI: true - DOCC_REUSE_BINARIES: 1 - resnet18_docc_openmp: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=openmp - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 - resnet18_docc_cuda: - command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/resnet18.py --docc --target=cuda - energy: true - env: - DOCC_CI: regions - DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_segformer.yml b/.daisy/mlir_torch_segformer.yml new file mode 100644 index 000000000..5403c62d5 --- /dev/null +++ b/.daisy/mlir_torch_segformer.yml @@ -0,0 +1,51 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 240 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Override CPU torch with CUDA wheels for torch GPU benchmarks + pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 + + # Warm start (Torch benchmark on CUDA) + venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=torch --device=cuda + + # Warm start (DOCC benchmark, CUDA target) + DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu + + run: + + segformer_b0_docc_sequential: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + energy: true + env: + DOCC_REUSE_BINARIES: 1 + __DAISY_CAPTURE_STRATEGY_DEFAULT: once + + segformer_b0_docc_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=cuda --device=cpu + energy: true + env: + DOCC_CI: "" + DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_segformer_b2.yml b/.daisy/mlir_torch_segformer_b2.yml new file mode 100644 index 000000000..cc8333e5b --- /dev/null +++ b/.daisy/mlir_torch_segformer_b2.yml @@ -0,0 +1,52 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 480 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Override CPU torch with CUDA wheels for torch GPU benchmarks + pip install --upgrade --extra-index-url https://download.pytorch.org/whl/cu126 torch==2.10.0+cu126 torchvision==0.25.0+cu126 + + # Warm start (Torch benchmark on CUDA) + venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda + + # Warm start (DOCC benchmark, CUDA target) + DOCC_CI=regions venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu + + run: + + # model segformer b2 (Torch CUDA) + + segformer_b2_torch_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=torch --device=cuda + energy: true + + # model segformer b2 (DOCC CUDA target) + + segformer_b2_docc_cuda: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b2 --backend=docc --target=cuda --device=cpu + energy: true + env: + DOCC_CI: "" + DOCC_REUSE_BINARIES: 1 diff --git a/.daisy/mlir_torch_segformer_sequential.yml b/.daisy/mlir_torch_segformer_sequential.yml new file mode 100644 index 000000000..36569b277 --- /dev/null +++ b/.daisy/mlir_torch_segformer_sequential.yml @@ -0,0 +1,40 @@ +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +parameters: + container: daisytuner/docc-build-env-llvm19-base:latest-amd64 + timeout: 720 + partitions: + - chamomile + +steps: + build: | + python3.11 -m venv venv + . venv/bin/activate + + python -m pip install --upgrade pip + pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core + pip install numpy scipy transformers + + pip install --no-build-isolation -e python/ + pip install --no-build-isolation -e mlir/ + + pip install -r mlir/requirements.txt + + # Warm start (DOCC benchmark, sequential target) + __DAISY_CAPTURE_STRATEGY_DEFAULT=once DOCC_CI=1 venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + + run: + + # model segformer b0 (DOCC sequential target) + + segformer_b0_docc_sequential: + command: venv/bin/python3 mlir/benchmarks/torch/model_zoo/segformer_test.py --action=benchmark_segformer --version=b0 --backend=docc --target=sequential --device=cpu + energy: true + env: + DOCC_REUSE_BINARIES: 1 + __DAISY_CAPTURE_STRATEGY_DEFAULT: once diff --git a/.daisy/python_npbench.yml b/.daisy/python_npbench.yml deleted file mode 100644 index d31fc020b..000000000 --- a/.daisy/python_npbench.yml +++ /dev/null @@ -1,267 +0,0 @@ -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -parameters: - container: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - timeout: 120 - partitions: - - zinnia - -steps: - build: | - apt-get install -y python3-venv python3-pip - - python3 -m venv venv - . venv/bin/activate - - python -m pip install --upgrade pip - pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - pip install numpy scipy - - pip install --no-build-isolation -v -e python/ - - run: - adi_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - adi_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - adi_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - adi_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_adi.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - atax_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - atax_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - atax_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - atax_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_atax.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - gemm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - gemm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - gemm_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - gemm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - gesummv_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - gesummv_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - gesummv_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - gesummv_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gesummv.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - gemver_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - gemver_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - gemver_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - gemver_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_gemver.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - k2mm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - k2mm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - k2mm_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - k2mm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k2mm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - k3mm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - k3mm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - k3mm_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - k3mm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_k3mm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - mvt_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - mvt_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - mvt_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - mvt_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_mvt.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - symm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - symm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - symm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - # symm_cuda: - # command: venv/bin/python3 python/benchmarks/npbench/polybench/test_symm.py --docc --size=M --target=cuda - # energy: true - # env: - # DOCC_CI: regions - syr2k_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - syr2k_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - syr2k_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - syr2k_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syr2k.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - syrk_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - syrk_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - syrk_cuda: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=cuda - energy: true - env: - DOCC_CI: regions - syrk_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_syrk.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - trmm_numpy: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --numpy --size=M - energy: true - env: - DOCC_CI: regions - trmm_omp: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=openmp - energy: true - env: - DOCC_CI: regions - trmm_seq_tuning: - command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=sequential --remote-tuning - energy: true - env: - DOCC_CI: regions - # trmm_cuda: - # command: venv/bin/python3 python/benchmarks/npbench/polybench/test_trmm.py --docc --size=M --target=cuda - # energy: true - # env: - # DOCC_CI: regions diff --git a/.github/workflows/llvm_tests_san.yml b/.github/workflows/llvm_tests_san.yml deleted file mode 100644 index e4328f190..000000000 --- a/.github/workflows/llvm_tests_san.yml +++ /dev/null @@ -1,83 +0,0 @@ -name: LLVM - Unit and Integration Sanitized Tests - -on: - push: - branches: - - main - - llvm-test-suite - schedule: - - cron: "0 4 * * *" - -jobs: - llvm-tests-linux-san: - runs-on: - group: dahlia - labels: Linux - container: - image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - strategy: - fail-fast: false - matrix: - san: ["address", "leak", "undefined"] - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Mark GitHub Actions workdir as safe - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - - name: Build - run: | - mkdir build - cd build - cmake -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=${{ matrix.san }} \ - -DLLVM_BUILD_FRONTEND=ON \ - -DLLVM_BUILD_TESTS=ON \ - -DSDFG_BUILD_TESTS=OFF \ - -DINSTALL_GTEST=OFF \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - cpack -G DEB - apt-get install -y ./docc-llvm*.deb - - - name: Unit Tests - run: | - cd build - ./llvm/tests/docc_llvm_pass_test - - - name: Set up Python - if: matrix.san == 'leak' - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Setup virtual environment - if: matrix.san == 'leak' - run: | - python -m venv .venv - echo "$PWD/.venv/bin" >> $GITHUB_PATH - - - name: Install dependencies - if: matrix.san == 'leak' - run: | - python -m pip install --upgrade pip - pip install pytest==7.1.3 pytest-parallel lit - - - name: Integration Tests - # The docc C/C++ compiler currently only works with leak sanitizer - if: matrix.san == 'leak' - run: | - export LLVM_SYMBOLIZER_PATH=$(which llvm-symbolizer-19) - - cd llvm/integration - pytest -v llvm_test_suite.py diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 7a74ae854..000000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,200 +0,0 @@ -name: Release - -on: - push: - tags: - - "v*.*.*" - -jobs: - # Stage 1: Build docc-compiler (no dependencies) - wheels-compiler: - name: Compiler (${{ matrix.os }}, ${{ matrix.python }}) - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: [build-amd64-big, build-arm64-big, macos-14] - python: ["cp311", "cp312", "cp313", "cp314"] - include: - - os: build-amd64-big - cibw_archs: x86_64 - - os: build-arm64-big - cibw_archs: aarch64 - - os: macos-14 - cibw_archs: arm64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - uses: pypa/cibuildwheel@v3.3.1 - with: - package-dir: python/ - output-dir: wheelhouse - env: - CIBW_ARCHS: ${{ matrix.cibw_archs }} - CIBW_BUILD: "${{ matrix.python }}-*" - - - uses: actions/upload-artifact@v4 - with: - name: wheels-docc-compiler-${{ matrix.os }}-${{ matrix.python }} - path: wheelhouse/*.whl - - # Stage 2: Build docc-ai (depends on docc-compiler) - wheels-ai: - name: AI (${{ matrix.os }}, ${{ matrix.python }}) - needs: [wheels-compiler] - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: [build-amd64-big, build-arm64-big, macos-14] - python: ["cp311", "cp312"] - include: - - os: build-amd64-big - cibw_archs: x86_64 - - os: build-arm64-big - cibw_archs: aarch64 - - os: macos-14 - cibw_archs: arm64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - # Pin docc-compiler version to match release - # - name: Pin docc-compiler version - # run: | - # VERSION=$(cat VERSION) - # sed -i.bak "s/\"docc-compiler\"/\"docc-compiler==$VERSION\"/" mlir/pyproject.toml && rm mlir/pyproject.toml.bak - - # Download compiler wheels into the package directory so they're available in container - # - uses: actions/download-artifact@v4 - # with: - # pattern: wheels-docc-compiler-${{ matrix.os }}-* - # path: mlir/compiler-wheels - # merge-multiple: true - - - uses: pypa/cibuildwheel@v3.3.1 - with: - package-dir: mlir/ - output-dir: wheelhouse - env: - CIBW_ARCHS: ${{ matrix.cibw_archs }} - CIBW_BUILD: "${{ matrix.python }}-*" - # Install docc-compiler before building docc-ai - # CIBW_BEFORE_BUILD: "pip install --no-index --find-links {project}/compiler-wheels docc-compiler" - # Make compiler wheels available for dependency resolution during test - # CIBW_ENVIRONMENT: "PIP_FIND_LINKS={project}/compiler-wheels" - - - uses: actions/upload-artifact@v4 - with: - name: wheels-docc-ai-${{ matrix.os }}-${{ matrix.python }} - path: wheelhouse/*.whl - - wheels-publish: - needs: [wheels-compiler, wheels-ai] - runs-on: build-amd64-big - permissions: - id-token: write - - steps: - - uses: actions/download-artifact@v4 - with: - pattern: wheels-* - path: dist - merge-multiple: true - - - uses: pypa/gh-action-pypi-publish@v1.10.0 - - packages-llvm: - strategy: - matrix: - include: - - platform: ubuntu-24.04 - package-format: deb - cpack-generator: DEB - upload-dist-id: ubuntu - upload-dist-version: 24.04 - runner: build-amd64-big - architecture: x64 - image: daisytuner/docc-build-env-llvm19-ubuntu-24.04:latest-amd64 - - platform: ubuntu-24.04 - package-format: deb - cpack-generator: DEB - upload-dist-id: ubuntu - upload-dist-version: 24.04 - runner: build-arm64-big - architecture: arm64 - image: daisytuner/docc-build-env-llvm19-ubuntu-24.04:latest-arm64 - - platform: rhel-10 - package-format: rpm - cpack-generator: RPM - upload-dist-id: rhel - upload-dist-version: 10 - upload-dist-platform-id: platform:el10 - runner: build-amd64-big - architecture: x64 - image: daisytuner/docc-build-env-llvm19-rhel-10:latest-amd64 - - platform: debian-13 - package-format: deb - cpack-generator: DEB - upload-dist-id: debian - upload-dist-version: 13 - runner: build-amd64-big - architecture: x64 - image: daisytuner/docc-build-env-llvm19-debian-13:latest-amd64 - - runs-on: ${{ matrix.runner }} - container: - image: ${{ matrix.image }} - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Define Version - id: define_version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT - - - name: Build package - run: | - mkdir -p build - cd build - cmake -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Release \ - -DINSTALL_GTEST=OFF \ - -DBUILD_TESTS:BOOL=OFF \ - -DSDFGLIB_AUTO_INSTALL_MODE=ON \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - -DRELEASE_PACKAGE=ON \ - -DPACKAGE_WITH_TOOL_DEPS=ON \ - .. - ninja -j$(nproc) - cpack -G ${{ matrix.cpack-generator }} - - - name: Upload docc package as Artifact - uses: actions/upload-artifact@v4 - with: - name: docc-${{ matrix.platform }}-${{ matrix.architecture }} - path: "build/*.${{ matrix.package-format }}" - - - name: Upload docc package to Firebase - uses: daisytuner/upload-distribution-action@main - with: - file: "build/*.${{ matrix.package-format }}" - version: ${{ steps.define_version.outputs.VERSION }} - architecture: ${{ matrix.architecture }} - dist-id: ${{ matrix.upload-dist-id }} - dist-version: ${{ matrix.upload-dist-version }} - dist-platform-id: ${{ matrix.upload-dist-platform-id }} - token: ${{ secrets.DOCC_RELEASE_TOKEN }} - url: /v1/system/docc-distributions/upload diff --git a/.github/workflows/sanitizer_tests_asan.yml b/.github/workflows/sanitizer_tests_asan.yml deleted file mode 100644 index 5ebca361d..000000000 --- a/.github/workflows/sanitizer_tests_asan.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: Sanitizer Tests (Address) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -jobs: - sanitizer-linux-asan: - runs-on: - group: dahlia - labels: openmp - container: - image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=address \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - - sanitizer-macos-asan: - runs-on: - group: dahlia - labels: macOS - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install dependencies - run: | - brew install ninja cmake - brew install gmp isl nlohmann-json boost - brew install libomp - - - name: Build - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=address \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja - - - name: Unit Tests - run: | - cd build/ - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./tutorial/printf_target/tests/printf_target_test diff --git a/.github/workflows/sanitizer_tests_lsan.yml b/.github/workflows/sanitizer_tests_lsan.yml deleted file mode 100644 index 69d5c5a36..000000000 --- a/.github/workflows/sanitizer_tests_lsan.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Sanitizer Tests (Leak) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -jobs: - sanitizer-linux-lsan: - runs-on: - group: dahlia - labels: openmp - container: - image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=leak \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test diff --git a/.github/workflows/sanitizer_tests_ubsan.yml b/.github/workflows/sanitizer_tests_ubsan.yml deleted file mode 100644 index f8d95b67e..000000000 --- a/.github/workflows/sanitizer_tests_ubsan.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Sanitizer Tests (Undefined Behavior) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - -jobs: - sanitizer-linux-ubsan: - runs-on: - group: dahlia - labels: openmp - container: - image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSDFG_ENABLE_SANITIZER=ON \ - -DSDFG_SANITIZER=undefined \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test diff --git a/.github/workflows/unit_tests_macos.yml b/.github/workflows/unit_tests_macos.yml deleted file mode 100644 index fe8db86fd..000000000 --- a/.github/workflows/unit_tests_macos.yml +++ /dev/null @@ -1,95 +0,0 @@ -name: Unit Tests (macOS) - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - schedule: - - cron: "0 4 * * *" - -jobs: - primary-tests-macos: - runs-on: - group: dahlia - labels: macOS - - env: - python_version: "3.14" - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install dependencies - run: | - brew install ninja cmake - brew install gmp isl nlohmann-json boost - brew install libomp - brew install uv - - - name: Set up Python ${{ env.python_version }} - run: | - uv python install ${{ env.python_version }} - uv venv --python ${{ env.python_version }} .venv - echo "$PWD/.venv/bin" >> $GITHUB_PATH - echo "PYTHONPATH=$PWD/python" >> $GITHUB_ENV - - - name: Install Python dependencies - run: | - uv pip install pybind11 pytest coverage black==25.9.0 build scikit-build-core - uv pip install numpy scipy ml_dtypes - - - name: Build - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Debug \ - -DPYTHON_BUILD_FRONTEND=ON \ - -Dpybind11_DIR=$GITHUB_WORKSPACE/.venv/lib/python${{ env.python_version }}/site-packages/pybind11/share/cmake/pybind11 \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja - - - name: Unit Tests - run: | - cd build/ - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./tutorial/printf_target/tests/printf_target_test - - - name: Test Arg-Capture-IO - run: | - cd build - ./arg-capture-io/tests/capture_io_test - - - name: Python Unit Tests - env: - DOCC_ACCESS_TOKEN: ${{ secrets.DOCC_CI_TOKEN }} - run: | - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - pytest -v python/tests -m "unmarked" - - - name: Python Integration Tests - run: | - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - pytest -v python/benchmarks/ - - # - name: Test RTL - # run: | - # export CPATH=/usr/local/include:$CPATH - # export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH - # export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - # export PATH=/usr/local/bin:$PATH - - # pip install pytest==7.1.3 --break-system-packages - # pip install pytest-parallel --break-system-packages - - # cd rtl/tests - # pytest -v -s rtl_tests.py diff --git a/.github/workflows/unit_tests_release.yml b/.github/workflows/unit_tests_release.yml deleted file mode 100644 index e881251bf..000000000 --- a/.github/workflows/unit_tests_release.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: Unit Tests - Release - -on: - push: - branches: - - main - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - schedule: - - cron: "0 4 * * *" - -jobs: - release-linux: - runs-on: - group: dahlia - labels: RTX5060 - container: - image: daisytuner/docc-run-env-llvm19-ubuntu-24.04:latest-amd64 - options: >- - --cap-add=PERFMON - --gpus=all - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Mark GitHub Actions workdir as safe - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - - name: Formatting - shell: bash - run: | - shopt -s globstar - clang-format-19 -style=file --dry-run --Werror sdfg/include/**/*.h sdfg/src/**/*.cpp sdfg/tests/**/*.cpp - clang-format-19 -style=file --dry-run --Werror opt/include/**/*.h opt/src/**/*.cpp opt/tests/**/*.cpp - - - name: Build and test - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_C_COMPILER=clang-19 \ - -DCMAKE_CXX_COMPILER=clang++-19 \ - -DCMAKE_INSTALL_PREFIX=/usr/local \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_BUILD_FRONTEND=ON \ - -DLLVM_BUILD_TESTS=ON \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja -j$(nproc) - ninja install - - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./llvm/tests/docc_llvm_pass_test - - - name: Test RTL - run: | - export CPATH=/usr/local/include:$CPATH - export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - export PATH=/usr/local/bin:$PATH - - pip install pytest==7.1.3 --break-system-packages - pip install pytest-parallel --break-system-packages - - cd rtl/tests - pytest -v -s rtl_tests.py - - release-macos: - runs-on: - group: dahlia - labels: macOS - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install dependencies - run: | - brew install ninja cmake - brew install gmp isl nlohmann-json boost - brew install libomp - - - name: Build - run: | - mkdir build - cd build - cmake \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DBUILD_TESTS:BOOL=OFF \ - -DBUILD_BENCHMARKS:BOOL=OFF \ - -DBUILD_BENCHMARKS_GOOGLE:BOOL=OFF \ - .. - ninja - - - name: Unit Tests - run: | - cd build/ - ./sdfg/tests/sdfglib_test - ./opt/tests/sdfgopt_test - ./tutorial/printf_target/tests/printf_target_test - - - name: Test Arg-Capture-IO - run: | - cd build - ./arg-capture-io/tests/capture_io_test diff --git a/mlir/benchmarks/harness.py b/mlir/benchmarks/harness.py index 08f3a1783..2852a85be 100644 --- a/mlir/benchmarks/harness.py +++ b/mlir/benchmarks/harness.py @@ -3,11 +3,13 @@ import time import docc.torch + def run_benchmark(setup_func, name): parser = argparse.ArgumentParser() parser.add_argument("--docc", action="store_true") parser.add_argument("--torch", action="store_true") parser.add_argument("--target", type=str, default="none") + parser.add_argument("--remote_tuning", action="store_true") parser.add_argument("--n_runs", type=int, default=10) args = parser.parse_args() @@ -24,15 +26,26 @@ def run_benchmark(setup_func, name): program(model_input) end = time.time() print(f"{name} torch execution time: {end - start:.6f} seconds") - + if args.docc: for _ in range(args.n_runs): start = time.time() with torch.no_grad(): - program = torch.compile(model, backend="docc", options={"target": args.target, "category": "server"}) + program = torch.compile( + model, + backend="docc", + options={ + "target": args.target, + "category": "server", + "remote_tuning": args.remote_tuning, + }, + ) if type(model_input) == tuple: program(*model_input) else: program(model_input) end = time.time() - print(f"{name} docc execution time: {end - start:.6f} seconds") + print( + f"{name} docc execution time: {end - start:.6f} seconds " + f"(remote_tuning={args.remote_tuning})" + ) diff --git a/mlir/benchmarks/torch/layers/softmax.py b/mlir/benchmarks/torch/layers/softmax.py new file mode 100644 index 000000000..3711fc58f --- /dev/null +++ b/mlir/benchmarks/torch/layers/softmax.py @@ -0,0 +1,69 @@ +import torch +import torch.nn as nn + +from benchmarks.harness import run_benchmark + + +class SoftmaxNet(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.softmax = nn.Softmax(dim=dim) + + def forward(self, x: torch.Tensor): + return self.softmax(x) + + +class LogSoftmaxNet(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.log_softmax = nn.LogSoftmax(dim=dim) + + def forward(self, x: torch.Tensor): + return self.log_softmax(x) + + +# batch=64, classes=1000 — classifier output +def setup_softmax_classifier(): + model = SoftmaxNet(dim=1) + x = torch.randn(64, 1000) + return model, x + + +# batch=64, seq_len=512, features=768 — transformer-style attention scores +def setup_softmax_attention(): + model = SoftmaxNet(dim=-1) + x = torch.randn(64, 512, 768) + return model, x + + +# batch=64, classes=1000 — log-softmax for NLLLoss +def setup_log_softmax(): + model = LogSoftmaxNet(dim=1) + x = torch.randn(64, 1000) + return model, x + + +BENCHMARKS = { + "softmax_classifier": setup_softmax_classifier, + "softmax_attention": setup_softmax_attention, + "log_softmax": setup_log_softmax, +} + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Softmax layer benchmarks") + parser.add_argument( + "--variant", + type=str, + choices=list(BENCHMARKS.keys()), + default="softmax_classifier", + help="Softmax variant to benchmark", + ) + args, remaining = parser.parse_known_args() + + import sys + + sys.argv = [sys.argv[0]] + remaining + + run_benchmark(BENCHMARKS[args.variant], args.variant) diff --git a/mlir/benchmarks/torch/model_zoo/segformer_profile.py b/mlir/benchmarks/torch/model_zoo/segformer_profile.py new file mode 100644 index 000000000..89849874e --- /dev/null +++ b/mlir/benchmarks/torch/model_zoo/segformer_profile.py @@ -0,0 +1,238 @@ +import argparse +import time + +import torch +from torch.profiler import ProfilerActivity, profile +from transformers import SegformerForSemanticSegmentation + +import docc.torch + + +SEGFORMER_MODELS = { + "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", + "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", + "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", + "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", + "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", + "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", +} + + +def resolve_model_name(version: str, model: str | None) -> str: + if model: + return model + return SEGFORMER_MODELS[version] + + +def _assert_cuda_arch_supported() -> None: + capability = torch.cuda.get_device_capability() + current_arch = f"sm_{capability[0]}{capability[1]}" + supported_arches = set(torch.cuda.get_arch_list()) + if current_arch not in supported_arches: + supported_str = " ".join(sorted(supported_arches)) + raise RuntimeError( + "The active PyTorch CUDA build does not support this GPU architecture " + f"({current_arch}). Supported architectures: {supported_str}. " + "Install a compatible CUDA wheel (for RTX 50xx typically cu128+), " + "or run with --device cpu." + ) + + +def setup_segformer( + model_name: str, + model_device: str, + image_size: int, + input_device: str | None = None, +) -> tuple[torch.nn.Module, torch.Tensor]: + if input_device is None: + input_device = model_device + + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + if model_device == "cuda": + if not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available") + _assert_cuda_arch_supported() + model = model.to("cuda") + + if input_device == "cuda" and not torch.cuda.is_available(): + raise RuntimeError("CUDA input requested but not available") + + model_input = torch.randn(1, 3, image_size, image_size, device=input_device) + return model, model_input + + +def _model_device(model: torch.nn.Module) -> torch.device: + try: + return next(model.parameters()).device + except StopIteration: + return torch.device("cpu") + + +def _materialize_output(res: object) -> None: + if isinstance(res, dict): + _ = {k: v.cpu() if torch.is_tensor(v) else v for k, v in res.items()} + elif hasattr(res, "logits") and torch.is_tensor(res.logits): + _ = res.logits.cpu() + + +def _run_once(program: torch.nn.Module, model_input: torch.Tensor, model_dev: torch.device) -> None: + current_input = model_input + if current_input.device != model_dev: + current_input = current_input.to(model_dev, non_blocking=True) + + res = program(pixel_values=current_input) + _materialize_output(res) + if model_dev.type == "cuda": + torch.cuda.synchronize(model_dev) + + +def run_torch_profile(model: torch.nn.Module, model_input: torch.Tensor, n_runs: int, trace_prefix: str) -> None: + model_dev = _model_device(model) + with torch.no_grad(): + compile_start = time.perf_counter() + program = torch.compile(model) + _run_once(program, model_input, model_dev) + compile_end = time.perf_counter() + print(f"Torch compile+first-run: {(compile_end - compile_start):.6f} s") + + _run_once(program, model_input, model_dev) + activities = [ProfilerActivity.CPU] + if model_dev.type == "cuda": + activities.append(ProfilerActivity.CUDA) + + for i in range(n_runs): + start = time.perf_counter() + with profile(activities=activities, record_shapes=True) as prof: + _run_once(program, model_input, model_dev) + end = time.perf_counter() + + trace_path = f"{trace_prefix}_torch_{i}.json" + prof.export_chrome_trace(trace_path) + print(f"Torch runtime run {i}: {(end - start):.6f} s, trace={trace_path}") + + +def run_docc_profile( + model: torch.nn.Module, + model_input: torch.Tensor, + n_runs: int, + target: str, + remote_tuning: bool, + trace_prefix: str, +) -> None: + model_dev = _model_device(model) + with torch.no_grad(): + compile_start = time.perf_counter() + program = torch.compile( + model, + backend="docc", + options={"target": target, "category": "server", "remote_tuning": remote_tuning}, + ) + _run_once(program, model_input, model_dev) + compile_end = time.perf_counter() + print( + f"DOCC compile+first-run ({target}, remote_tuning={remote_tuning}): " + f"{(compile_end - compile_start):.6f} s" + ) + + _run_once(program, model_input, model_dev) + activities = [ProfilerActivity.CPU] + if model_dev.type == "cuda": + activities.append(ProfilerActivity.CUDA) + + for i in range(n_runs): + start = time.perf_counter() + with profile(activities=activities, record_shapes=True) as prof: + _run_once(program, model_input, model_dev) + end = time.perf_counter() + + trace_path = f"{trace_prefix}_docc_{target}_{i}.json" + prof.export_chrome_trace(trace_path) + print(f"DOCC runtime run {i}: {(end - start):.6f} s, trace={trace_path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Profile SegFormer with Torch and/or DOCC backend") + parser.add_argument("--docc", action="store_true", help="Run DOCC backend") + parser.add_argument("--torch", action="store_true", dest="run_torch", help="Run Torch backend") + parser.add_argument( + "--version", + type=str, + choices=list(SEGFORMER_MODELS.keys()), + default="b0", + help="SegFormer variant to use when --model is not provided", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="Optional Hugging Face model id to override --version", + ) + parser.add_argument("--target", type=str, default="none", help="DOCC target") + parser.add_argument( + "--remote_tuning", + action="store_true", + help="Enable DOCC remote tuning during compilation", + ) + parser.add_argument("--n_runs", type=int, default=10, help="Number of runs per backend") + parser.add_argument( + "--device", + type=str, + choices=["cpu", "cuda"], + default="cpu", + help="Device for model and input tensor", + ) + parser.add_argument( + "--input_device", + type=str, + choices=["cpu", "cuda"], + default=None, + help="Device where input tensor is created (defaults to --device)", + ) + parser.add_argument("--image_size", type=int, default=512, help="Input image size") + parser.add_argument( + "--trace_prefix", + type=str, + default="segformer_trace", + help="Prefix for exported Torch profiler traces", + ) + args = parser.parse_args() + + if not args.docc and not args.run_torch: + parser.error("Specify at least one backend: --torch and/or --docc") + + return args + + +def main() -> None: + args = parse_args() + model_name = resolve_model_name(args.version, args.model) + input_device = args.input_device if args.input_device is not None else args.device + model, model_input = setup_segformer( + model_name, + args.device, + args.image_size, + input_device=input_device, + ) + + print(f"Model: {model_name}") + print(f"Device: {args.device}") + print(f"Input device: {input_device}") + print(f"Remote tuning: {args.remote_tuning}") + print(f"Runs: {args.n_runs}") + + if args.run_torch: + run_torch_profile(model, model_input, args.n_runs, args.trace_prefix) + + if args.docc: + run_docc_profile( + model, + model_input, + args.n_runs, + args.target, + args.remote_tuning, + args.trace_prefix, + ) + + +if __name__ == "__main__": + main() diff --git a/mlir/benchmarks/torch/model_zoo/segformer_test.py b/mlir/benchmarks/torch/model_zoo/segformer_test.py index a70c186d9..b8c75e1ff 100644 --- a/mlir/benchmarks/torch/model_zoo/segformer_test.py +++ b/mlir/benchmarks/torch/model_zoo/segformer_test.py @@ -1,3 +1,4 @@ +import argparse import time import torch @@ -10,33 +11,62 @@ import docc.torch import os -os.environ["DOCC_STATISTICS"] = "1" -os.environ["DOCC_PROFILE_COMPILE"] = "1" -os.environ["DOCC_DEBUG"] = "dump" +#os.environ["DOCC_STATISTICS"] = "1" +#os.environ["DOCC_PROFILE_COMPILE"] = "1" +#os.environ["DOCC_DEBUG"] = "dump" +SEGFORMER_MODELS = { + "b0": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", + "b1": "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", + "b2": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", + "b3": "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", + "b4": "nvidia/segformer-b4-finetuned-cityscapes-1024-1024", + "b5": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", +} + + +def resolve_model_name(version, model): + if model: + return model + return SEGFORMER_MODELS[version] + + +def get_test_model_name(): + version = os.getenv("SEGFORMER_VERSION", "b2") + if version not in SEGFORMER_MODELS: + raise ValueError( + f"Unsupported SEGFORMER_VERSION '{version}'. " + f"Expected one of: {', '.join(SEGFORMER_MODELS.keys())}" + ) + return resolve_model_name(version, None) + +@pytest.mark.skipif(not os.environ.get("SLOW_TESTS", ""), reason="slow test") def test_backend(): - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() - model_ref = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() + model_name = get_test_model_name() + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + model_ref = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() model_ref.load_state_dict(model.state_dict()) example_input = torch.randn(1, 3, 512, 512) start = time.perf_counter() - program = torch.compile(model, backend="docc", options={"target": "none", "category": "server"}) + program = torch.compile(model, backend="docc", options={"target": "cuda", "category": "server"}) end = time.perf_counter() print(f"compilation time: {(end - start) * 1000:.2f} ms") + + start = time.perf_counter() + ref_program = torch.compile(model, backend="docc", options={"target": "cuda", "category": "server"}) + end = time.perf_counter() + print(f"ref compilation time: {(end - start) * 1000:.2f} ms") + with torch.no_grad(): start = time.perf_counter() res = program(pixel_values=example_input) end = time.perf_counter() print(f"inference time: {(end - start) * 1000:.2f} ms") start = time.perf_counter() - res_ref = model_ref(pixel_values=example_input) + res_ref = ref_program(pixel_values=example_input) end = time.perf_counter() print(f"reference inference time: {(end - start) * 1000:.2f} ms") for k in range(res.logits.shape[0]): @@ -54,12 +84,9 @@ def test_backend(): @pytest.mark.skip("Skip") def test_compile(): - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() - model_ref = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() + model_name = get_test_model_name() + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + model_ref = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() model_ref.load_state_dict(model.state_dict()) example_input = torch.randn(1, 3, 512, 512) @@ -77,9 +104,7 @@ def test_compile(): assert torch.allclose(res, res_ref.logits, rtol=1e-4) def find_used_dialects(): - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-cityscapes-1024-1024" - ).eval() + model = SegformerForSemanticSegmentation.from_pretrained(get_test_model_name()).eval() example_input = torch.randn(1, 3, 512, 512) @@ -101,25 +126,37 @@ def find_used_dialects(): # print(mlir_str) -def benchmark_segformer(model_name): +def benchmark_segformer(model_name, backend="torch", target="none", device="cpu", remote_tuning=False): model = SegformerForSemanticSegmentation.from_pretrained( model_name ).eval() - example_input = torch.randn(1, 3, 1024, 1024) + if device == "cuda" and not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available") + + if device == "cuda": + model = model.to("cuda") + + example_input = torch.randn(1, 3, 1024, 1024, device=device) + + compile_kwargs = {} + if backend == "docc": + compile_kwargs = { + "backend": "docc", + "options": {"target": target, "category": "server", "remote_tuning": remote_tuning}, + } - program = torch.compile(model) + program = torch.compile(model, **compile_kwargs) with torch.no_grad(): # Warmup res = program(pixel_values=example_input) import time - import math from scipy import stats as scipy_stats times = [] - min_samples = 5 - max_samples = 500 + min_samples = 1 + max_samples = 5 target_rel_ci = 0.01 # stop when 95% CI half-width < 1% of mean while len(times) < max_samples: @@ -144,14 +181,151 @@ def benchmark_segformer(model_name): sem = scipy_stats.sem(times) half_width = scipy_stats.t.ppf(0.975, df=n - 1) * sem print(f"Benchmarking {model_name}:") + print(f"Remote tuning: {remote_tuning}") print(f"Average inference time: {mean:.2f} ms (n={n})") print(f"95% CI: [{mean - half_width:.2f}, {mean + half_width:.2f}] ms (±{half_width:.2f} ms)") + +def setup_segformer_benchmark(model_name): + model = SegformerForSemanticSegmentation.from_pretrained(model_name).eval() + example_input = torch.randn(1, 3, 512, 512) + return model, example_input + + +def profile_segformer( + model_name, + backend="torch", + target="none", + device="cpu", + input_device=None, + remote_tuning=False, + n_runs=10, + image_size=512, + trace_prefix="segformer_trace", +): + from segformer_profile import setup_segformer, run_torch_profile, run_docc_profile + + model, model_input = setup_segformer( + model_name, + device, + image_size, + input_device=input_device, + ) + if backend == "torch": + run_torch_profile(model, model_input, n_runs, trace_prefix) + elif backend == "docc": + run_docc_profile(model, model_input, n_runs, target, remote_tuning, trace_prefix) + elif backend == "both": + run_torch_profile(model, model_input, n_runs, trace_prefix) + run_docc_profile(model, model_input, n_runs, target, remote_tuning, trace_prefix) + else: + raise ValueError(f"Unsupported backend '{backend}' for profiling") + if __name__ == "__main__": - # find_used_dialects() - find_used_dialects() - #benchmark_segformer("nvidia/segformer-b1-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b2-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b3-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b4-finetuned-cityscapes-1024-1024") - #benchmark_segformer("nvidia/segformer-b5-finetuned-cityscapes-1024-1024") \ No newline at end of file + parser = argparse.ArgumentParser(description="segformer benchmark") + parser.add_argument( + "--model", + type=str, + default=None, + help="Optional Hugging Face model id to override --version", + ) + parser.add_argument( + "--version", + type=str, + choices=list(SEGFORMER_MODELS.keys()), + default="b0", + help="SegFormer variant used when --model is not provided", + ) + parser.add_argument( + "--action", + type=str, + choices=["dialects", "benchmark", "benchmark_segformer", "profile"], + default="benchmark", + help="Run dialect dump or harness benchmark", + ) + parser.add_argument( + "--backend", + type=str, + choices=["torch", "docc", "both"], + default="torch", + help="Backend for --action benchmark_segformer/profile", + ) + parser.add_argument( + "--target", + type=str, + default="none", + help="DOCC target for --action benchmark_segformer (e.g. none, openmp, cuda)", + ) + parser.add_argument( + "--remote_tuning", + action="store_true", + help="Enable DOCC remote tuning during benchmark/profile compilation", + ) + parser.add_argument( + "--device", + type=str, + choices=["cpu", "cuda"], + default="cpu", + help="Tensor/model device for --action benchmark_segformer/profile", + ) + parser.add_argument( + "--input_device", + type=str, + choices=["cpu", "cuda"], + default=None, + help="Input tensor device for --action profile (defaults to --device)", + ) + parser.add_argument( + "--n_runs", + type=int, + default=10, + help="Number of runs for --action profile", + ) + parser.add_argument( + "--image_size", + type=int, + default=512, + help="Input image size for --action profile", + ) + parser.add_argument( + "--trace_prefix", + type=str, + default="segformer_trace", + help="Trace file prefix for --action profile torch runs", + ) + args, remaining = parser.parse_known_args() + model_name = resolve_model_name(args.version, args.model) + + import sys + + if args.action == "dialects": + find_used_dialects() + elif args.action == "benchmark_segformer": + benchmark_segformer( + model_name, + backend=args.backend, + target=args.target, + device=args.device, + remote_tuning=args.remote_tuning, + ) + elif args.action == "profile": + profile_segformer( + model_name, + backend=args.backend, + target=args.target, + device=args.device, + input_device=args.input_device, + remote_tuning=args.remote_tuning, + n_runs=args.n_runs, + image_size=args.image_size, + trace_prefix=args.trace_prefix, + ) + else: + sys.argv = [sys.argv[0]] + remaining + from functools import partial + from benchmarks.harness import run_benchmark + + run_benchmark( + partial(setup_segformer_benchmark, model_name), + f"segformer {model_name}", + ) diff --git a/mlir/docc/torch/torch_program.py b/mlir/docc/torch/torch_program.py index ecd1aa16c..17c0fb526 100644 --- a/mlir/docc/torch/torch_program.py +++ b/mlir/docc/torch/torch_program.py @@ -530,6 +530,19 @@ def _docc_dynamo_compiler(gm, example_inputs, backend_options): """Dynamic Compiler based on TorchProgram (inference only).""" import torch + # Resolve SymInt/SymFloat values that dynamo passes as graph inputs when a + # model (e.g. SegFormer) unpacks tensor shapes and forwards them as explicit + # integer arguments to submodules. torch.export.export cannot handle + # torch.SymInt; converting to concrete Python ints/floats is safe here + # because these values are always backed by a concrete shape at this point. + def _resolve(x): + if isinstance(x, torch.SymInt): + return int(x) + if isinstance(x, torch.SymFloat): + return float(x) + return x + example_inputs = [_resolve(inp) for inp in example_inputs] + if len(example_inputs) == 1: example_input = example_inputs[0] else: @@ -560,6 +573,14 @@ def _docc_aot_compiler(gm, example_inputs): import torch + def _resolve(x): + if isinstance(x, torch.SymInt): + return int(x) + if isinstance(x, torch.SymFloat): + return float(x) + return x + example_inputs = [_resolve(inp) for inp in example_inputs] + if len(example_inputs) == 1: example_input = example_inputs[0] else: diff --git a/opt/src/transformations/map_fusion.cpp b/opt/src/transformations/map_fusion.cpp index 8cdb30f7d..f2be61250 100644 --- a/opt/src/transformations/map_fusion.cpp +++ b/opt/src/transformations/map_fusion.cpp @@ -1196,7 +1196,27 @@ void MapFusion::apply(builder::StructuredSDFGBuilder& builder, analysis::Analysi } } - analysis_manager.invalidate_all(); + if (direction_ == FusionDirection::ProducerIntoConsumer) { + // The loop structure is unchanged after ProducerIntoConsumer: only new Block + // nodes are inserted into consumer_body_. Patch them into AssumptionsAnalysis + // so it stays valid, then preserve it (and LoopAnalysis) across the invalidation. + if (analysis_manager.has()) { + size_t n = fusion_candidates_.size(); + if (n < consumer_body_->size()) { + auto& aa = analysis_manager.get(); + // Original consumer blocks were shifted to index n..size-1; use + // the first of them as the scope reference for the new blocks. + auto& sibling = consumer_body_->at(n).first; + for (size_t i = 0; i < n; ++i) { + aa.register_node(consumer_body_->at(i).first, sibling); + } + } + } + analysis_manager.preserve(); + } else { + // ConsumerIntoProducer removes the consumer loop node entirely — full invalidation. + analysis_manager.invalidate_all(); + } applied_ = true; } diff --git a/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp b/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp index 842859697..03044339d 100644 --- a/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp +++ b/opt/tests/transformations/offloading/cuda_transform_im2col_test.cpp @@ -157,10 +157,11 @@ TEST(CudaTransformIm2colTest, CollapsedTwoDimMap) { analysis::AnalysisManager analysis_manager(builder.subject()); CUDATransform transform(outer_map, /*block_size=*/32); - // The outer map of the collapsed im2col pattern must be recognised as - // offloadable to a single CUDA kernel. + // Regression: this expects `true`; the failing main branch returns `false` + // and the offload pipeline keeps the map on the host. EXPECT_TRUE(transform.can_be_applied(builder, analysis_manager)) - << "OffloadTransform should accept the collapsed im2col map."; + << "OffloadTransform regressed on collapsed im2col map: the outer map " + "is no longer recognised as offloadable."; } TEST(CudaTransformIm2colTest, ExplicitSixDimMap) { diff --git a/sdfg/include/sdfg/analysis/analysis.h b/sdfg/include/sdfg/analysis/analysis.h index 2ccb2efc2..14e7d9eb3 100644 --- a/sdfg/include/sdfg/analysis/analysis.h +++ b/sdfg/include/sdfg/analysis/analysis.h @@ -73,6 +73,11 @@ class AnalysisManager { return *static_cast(cache_[type].get()); } + template + bool has() const { + return cache_.find(std::type_index(typeid(T))) != cache_.end(); + } + template void invalidate() { std::type_index type = std::type_index(typeid(T)); diff --git a/sdfg/include/sdfg/analysis/assumptions_analysis.h b/sdfg/include/sdfg/analysis/assumptions_analysis.h index e21777adb..dd49baf33 100644 --- a/sdfg/include/sdfg/analysis/assumptions_analysis.h +++ b/sdfg/include/sdfg/analysis/assumptions_analysis.h @@ -84,6 +84,13 @@ class AssumptionsAnalysis : public Analysis { const symbolic::Assumptions& get(structured_control_flow::ControlFlowNode& node, bool include_trivial_bounds = false); + // Register a newly created node so it inherits the same scope assumptions as + // sibling_node. Call this after inserting nodes into a sequence to keep the + // cached analysis valid without a full re-run. + void register_node( + structured_control_flow::ControlFlowNode& new_node, structured_control_flow::ControlFlowNode& sibling_node + ); + const symbolic::SymbolSet& parameters(); bool is_parameter(const symbolic::Symbol& container); diff --git a/sdfg/include/sdfg/passes/symbolic/type_minimization.h b/sdfg/include/sdfg/passes/symbolic/type_minimization.h index 09ae42998..6f3db0951 100644 --- a/sdfg/include/sdfg/passes/symbolic/type_minimization.h +++ b/sdfg/include/sdfg/passes/symbolic/type_minimization.h @@ -8,7 +8,6 @@ #include "sdfg/element.h" #include "sdfg/passes/pass.h" #include "sdfg/structured_control_flow/block.h" -#include "sdfg/symbolic/extreme_values.h" #include "sdfg/visitor/structured_sdfg_visitor.h" namespace sdfg { @@ -16,7 +15,7 @@ namespace passes { class TypeMinimization : public visitor::NonStoppingStructuredSDFGVisitor { private: - bool is_safe_trunc(symbolic::Expression expr, symbolic::BoundAnalysis& ba_tight, symbolic::BoundAnalysis& ba_loose); + bool is_safe_trunc(symbolic::Expression expr, const symbolic::Assumptions& assumptions); public: TypeMinimization(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager); diff --git a/sdfg/src/analysis/assumptions_analysis.cpp b/sdfg/src/analysis/assumptions_analysis.cpp index 67a3661ad..5ba56a9e7 100644 --- a/sdfg/src/analysis/assumptions_analysis.cpp +++ b/sdfg/src/analysis/assumptions_analysis.cpp @@ -597,6 +597,19 @@ const symbolic::Assumptions& AssumptionsAnalysis:: } } +void AssumptionsAnalysis::register_node( + structured_control_flow::ControlFlowNode& new_node, structured_control_flow::ControlFlowNode& sibling_node +) { + auto it = ref_assumptions_.find(&sibling_node); + if (it != ref_assumptions_.end()) { + ref_assumptions_[&new_node] = it->second; + } + auto it2 = ref_assumptions_with_trivial_.find(&sibling_node); + if (it2 != ref_assumptions_with_trivial_.end()) { + ref_assumptions_with_trivial_[&new_node] = it2->second; + } +} + const symbolic::SymbolSet& AssumptionsAnalysis::parameters() { return this->parameters_; } bool AssumptionsAnalysis::is_parameter(const symbolic::Symbol& container) { diff --git a/sdfg/src/analysis/data_dependency_analysis.cpp b/sdfg/src/analysis/data_dependency_analysis.cpp index 61840fb1a..d79d1b695 100644 --- a/sdfg/src/analysis/data_dependency_analysis.cpp +++ b/sdfg/src/analysis/data_dependency_analysis.cpp @@ -745,16 +745,11 @@ bool DataDependencyAnalysis:: auto current_scope = Users::scope(¤t); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - // One AssumptionsBounds per side, shared across the whole subset-pair scan. - // The original used `previous_assumptions, previous_assumptions` (both - // sides of `is_subset`), so we only need one bounds object here. - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - // Check if previous subset is subset of any current subset for (auto& previous_subset : previous_subsets) { bool found = false; for (auto& current_subset : current_subsets) { - if (symbolic::is_subset(previous_subset, current_subset, previous_bounds, previous_bounds)) { + if (symbolic::is_subset(previous_subset, current_subset, previous_assumptions, previous_assumptions)) { found = true; break; } @@ -814,7 +809,6 @@ bool DataDependencyAnalysis::fully_covered( auto& assumptions_analysis = this->ensure_detailed_assumptions(analysis_manager); auto& current_assumptions = assumptions_analysis.get(*Users::scope(¤t), true); - symbolic::AssumptionsBounds current_bounds(current_assumptions); // Each read subset must be contained in some single open writer's subset. for (auto& read_subset : current_subsets) { @@ -824,9 +818,8 @@ bool DataDependencyAnalysis::fully_covered( if (w->container() != current.container()) continue; if (this->is_undefined_user(*w)) continue; auto& w_assumptions = assumptions_analysis.get(*Users::scope(w), true); - symbolic::AssumptionsBounds w_bounds(w_assumptions); for (auto& w_subset : w->subsets()) { - if (symbolic::is_subset(read_subset, w_subset, current_bounds, w_bounds)) { + if (symbolic::is_subset(read_subset, w_subset, current_assumptions, w_assumptions)) { covered = true; break; } @@ -868,14 +861,11 @@ bool DataDependencyAnalysis::intersects(User& previous, User& current, analysis: auto current_scope = Users::scope(¤t); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - // Check if any current subset intersects with any previous subset bool found = false; for (auto& current_subset : current_subsets) { for (auto& previous_subset : previous_subsets) { - if (!symbolic::is_disjoint(current_subset, previous_subset, current_bounds, previous_bounds)) { + if (!symbolic::is_disjoint(current_subset, previous_subset, current_assumptions, previous_assumptions)) { found = true; break; } @@ -926,16 +916,13 @@ bool DataDependencyAnalysis:: auto& previous_assumptions = assumptions_analysis.get(*previous_scope, true); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - auto& previous_memlets = previous.subsets(); auto& current_memlets = current.subsets(); for (auto& subset_ : previous_memlets) { bool overwritten = false; for (auto& subset : current_memlets) { - if (symbolic::is_subset(subset_, subset, previous_bounds, current_bounds)) { + if (symbolic::is_subset(subset_, subset, previous_assumptions, current_assumptions)) { overwritten = true; break; } @@ -974,16 +961,13 @@ bool DataDependencyAnalysis::depends(analysis::AnalysisManager& analysis_manager auto& previous_assumptions = assumptions_analysis.get(*previous_scope, true); auto& current_assumptions = assumptions_analysis.get(*current_scope, true); - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - auto& previous_memlets = previous.subsets(); auto& current_memlets = current.subsets(); bool intersect_any = false; for (auto& current_subset : current_memlets) { for (auto& previous_subset : previous_memlets) { - if (!symbolic::is_disjoint(current_subset, previous_subset, current_bounds, previous_bounds)) { + if (!symbolic::is_disjoint(current_subset, previous_subset, current_assumptions, previous_assumptions)) { intersect_any = true; break; } diff --git a/sdfg/src/analysis/loop_carried_dependency_analysis.cpp b/sdfg/src/analysis/loop_carried_dependency_analysis.cpp index a334ebd8b..20ab265da 100644 --- a/sdfg/src/analysis/loop_carried_dependency_analysis.cpp +++ b/sdfg/src/analysis/loop_carried_dependency_analysis.cpp @@ -152,17 +152,15 @@ symbolic::maps::DependenceDeltas pair_deltas( } // Collect deltas across all subset pairs and union them. - symbolic::AssumptionsBounds previous_bounds(previous_assumptions); - symbolic::AssumptionsBounds current_bounds(current_assumptions); - isl_ctx* union_ctx = nullptr; isl_set* accumulated = nullptr; std::vector result_dimensions; for (auto& previous_subset : previous_subsets) { for (auto& current_subset : current_subsets) { - auto deltas = symbolic::maps:: - dependence_deltas(previous_subset, current_subset, loop.indvar(), previous_bounds, current_bounds); + auto deltas = symbolic::maps::dependence_deltas( + previous_subset, current_subset, loop.indvar(), previous_assumptions, current_assumptions + ); if (deltas.empty) { continue; } diff --git a/sdfg/src/analysis/memory_layout_analysis.cpp b/sdfg/src/analysis/memory_layout_analysis.cpp index 062284ef6..0a057230d 100644 --- a/sdfg/src/analysis/memory_layout_analysis.cpp +++ b/sdfg/src/analysis/memory_layout_analysis.cpp @@ -228,7 +228,21 @@ void MemoryLayoutAnalysis:: auto result = symbolic::delinearize(linearized_expr, assumptions); if (!result.success) { - continue; // Delinearization failed, skip + // Fallback: register the access as a 1D contiguous range over the + // raw linearized address. We lose multi-dim layout info, but the + // scope-level merge can still bound the access via BoundAnalysis, + // which is enough for downstream consumers like ArgumentsAnalysis + // to compute argument sizes. This recovers patterns where the + // delinearizer rejects the access (e.g. halo offsets producing + // negative constants inside a stride product, or non-strictly- + // dominating strides) but the overall address range is still + // soundly bounded by the enclosing loop assumptions. + symbolic::MultiExpression shape; + shape.push_back(symbolic::symbol("__unbounded__")); + MemoryLayout layout(shape); + MemoryAccess layout_info{container_name, {linearized_expr}, layout, false}; + this->accesses_.emplace(&memlet, layout_info); + continue; } // Delinearization returns N indices but only N-1 dimensions (from stride division) diff --git a/sdfg/src/passes/symbolic/type_minimization.cpp b/sdfg/src/passes/symbolic/type_minimization.cpp index db13ceade..63a31f879 100644 --- a/sdfg/src/passes/symbolic/type_minimization.cpp +++ b/sdfg/src/passes/symbolic/type_minimization.cpp @@ -13,13 +13,12 @@ namespace passes { TypeMinimization::TypeMinimization(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) : visitor::NonStoppingStructuredSDFGVisitor(builder, analysis_manager) {}; -bool TypeMinimization:: - is_safe_trunc(symbolic::Expression expr, symbolic::BoundAnalysis& ba_tight, symbolic::BoundAnalysis& ba_loose) { +bool TypeMinimization::is_safe_trunc(symbolic::Expression expr, const symbolic::Assumptions& assumptions) { size_t output_bitwidth = 32; int64_t output_min_value_signed = 0; int64_t output_max_value_signed = (1ULL << (output_bitwidth - 1)) - 1; - auto mini = ba_tight.lower_bound(expr); + auto mini = symbolic::minimum(expr, {}, assumptions, true); if (mini.is_null()) { return false; } @@ -28,7 +27,7 @@ bool TypeMinimization:: return false; } - auto maxi = ba_loose.upper_bound(expr); + auto maxi = symbolic::maximum(expr, {}, assumptions, false); if (maxi.is_null()) { return false; } @@ -46,13 +45,6 @@ bool TypeMinimization::accept(structured_control_flow::Block& block) { auto& assumptions_analysis = this->analysis_manager_.get(); auto& block_assumptions = assumptions_analysis.get(block, true); - // One BoundAnalysis pair for the whole block: every is_safe_trunc call here - // shares the same empty parameter set and the same assumptions, so the - // internal cache amortizes across all truncs in the block. - static const symbolic::SymbolSet no_params; - symbolic::BoundAnalysis ba_tight(no_params, block_assumptions, true); - symbolic::BoundAnalysis ba_loose(no_params, block_assumptions, false); - symbolic::ExpressionMap replacements; for (auto& edge : dfg.edges()) { auto& subset = edge.subset(); @@ -67,7 +59,7 @@ bool TypeMinimization::accept(structured_control_flow::Block& block) { continue; } auto arg = trunc_func->get_args()[0]; - if (!this->is_safe_trunc(arg, ba_tight, ba_loose)) { + if (!this->is_safe_trunc(arg, block_assumptions)) { continue; } @@ -101,10 +93,6 @@ bool TypeMinimization::accept(structured_control_flow::For& loop) { auto& assumptions_analysis = this->analysis_manager_.get(); auto& block_assumptions = assumptions_analysis.get(loop, true); - static const symbolic::SymbolSet no_params; - symbolic::BoundAnalysis ba_tight(no_params, block_assumptions, true); - symbolic::BoundAnalysis ba_loose(no_params, block_assumptions, false); - symbolic::ExpressionMap replacements; auto truncs = symbolic::find(loop.condition()); for (auto& trunc : truncs) { @@ -116,7 +104,7 @@ bool TypeMinimization::accept(structured_control_flow::For& loop) { continue; } auto arg = trunc_func->get_args()[0]; - if (!this->is_safe_trunc(arg, ba_tight, ba_loose)) { + if (!this->is_safe_trunc(arg, block_assumptions)) { continue; }