From 8ba6e0d690b12f1ad952902f56e2db1ab552a60a Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 20 May 2026 13:09:40 +0200
Subject: [PATCH 1/7] Add initial bench machinery

---
 .buildkite/pipeline.yml         |  26 ++++
 .github/workflows/Benchmark.yml |  49 ++++++++
 README.md                       |   3 +
 perf/.gitignore                 |   1 +
 perf/Project.toml               |   5 +
 perf/amdgpu.jl                  |  11 ++
 perf/array.jl                   | 155 +++++++++++++++++++++++
 perf/byval.jl                   |  72 +++++++++++
 perf/kernel.jl                  |  27 ++++
 perf/latency.jl                 |  39 ++++++
 perf/runbenchmarks.jl           |  59 +++++++++
 perf/volumerhs.jl               | 214 ++++++++++++++++++++++++++++++++
 12 files changed, 661 insertions(+)
 create mode 100644 .github/workflows/Benchmark.yml
 create mode 100644 perf/.gitignore
 create mode 100644 perf/Project.toml
 create mode 100644 perf/amdgpu.jl
 create mode 100644 perf/array.jl
 create mode 100644 perf/byval.jl
 create mode 100644 perf/kernel.jl
 create mode 100644 perf/latency.jl
 create mode 100644 perf/runbenchmarks.jl
 create mode 100644 perf/volumerhs.jl

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index dfedd031f..332d333f7 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -188,6 +188,32 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 10
 
+  - label: ":racehorse: Benchmarks"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia --project=perf -e '
+        println("--- :julia: Instantiating benchmark project")
+        using Pkg
+        Pkg.develop(PackageSpec(path="."))
+        Pkg.instantiate()
+
+        println("+++ :julia: Running benchmarks")
+        include("perf/runbenchmarks.jl")'
+    artifact_paths:
+      - "perf/benchmarkresults.json"
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip benchmarks\]/
+    timeout_in_minutes: 120
+    env:
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+
 env:
   JULIA_AMDGPU_LOGGING_ENABLED: true
   SECRET_CODECOV_TOKEN: "lVqFGgrywYmQrILXBcP8i6TosP+q/W2oTDVLIdkWFWscd/a61oSVb8Tycq3qvngsrdmKU9EevdQo+1x+w7cu4IuTq63ahQc0RFgi4Q29hC52OgN2wFql984Cqq3T96P3jyV0ZljaRT+a+9AY0oWmmCph55amvvQ4DOMq3tfGDbp7gdueQvJmSYQGVT3/9Sjn4/esYppcKBGltQqQX2E7WrHLpnqRmsmjcSeZ/S/+PgPRb4ZnpBecAUP2d/MlPgKfP0ZUGbDlcbGu+ZDZNksxKIYuAlNrWPhpNAro7hACfEk4T5RRpNiwmJyXJZ8LUD8zNYIUKSmHjUtmqhNXgujWXA==;U2FsdGVkX1/v/P2Y7KZsvC55Au6eET37uDE6M5I6J275maix+SMD0EoJQ19cFp/lae+G8V7dvpPGfrh4hj2nOg=="
diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
new file mode 100644
index 000000000..87b12a96e
--- /dev/null
+++ b/.github/workflows/Benchmark.yml
@@ -0,0 +1,49 @@
+name: Benchmarks
+permissions:
+  statuses: read        # find Buildkite URL from PR status
+  contents: write       # update benchmark contents in gh-pages branch
+  pull-requests: write  # comment on PR with benchmark results
+  deployments: write    # deploy GitHub pages website
+
+on:
+  pull_request_target:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download Buildkite Artifacts
+        id: download
+        uses: EnricoMi/download-buildkite-artifact-action@v1
+        with:
+          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
+          ignore_build_states: blocked,canceled,skipped,not_run
+          ignore_job_states: timed_out,failed
+          output_path: artifacts
+
+      - name: Locate Benchmarks Artifact
+        id: locate  # exposes outputs.path: first matching file, or empty when none was found
+        if: ${{ steps.download.outputs.download-state == 'success' }}
+        run: echo "path=$(find artifacts -type f -name benchmarkresults.json 2>/dev/null | head -n 1)" >> "$GITHUB_OUTPUT"
+
+      - name: Upload Benchmark Results
+        if: ${{ steps.locate.outputs.path != '' }}
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: AMDGPU.jl Benchmarks
+          tool: "julia"
+          output-file-path: ${{ steps.locate.outputs.path }}
+          benchmark-data-dir-path: "bench"
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          comment-always: ${{ github.event_name == 'pull_request_target' }}
+          summary-always: true
+          alert-threshold: "125%"
+          fail-on-alert: false
+          auto-push: ${{ github.event_name != 'pull_request_target' }}
diff --git a/README.md b/README.md
index fb88c3081..a3d9bacf4 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,9 @@
 [downloads-img]: https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Fmonthly_downloads%2FAMDGPU&query=total_requests&suffix=%2Fmonth&label=Downloads
 [downloads-url]: https://juliapkgstats.com/pkg/AMDGPU
 
+[benchmark-img]: https://img.shields.io/badge/benchmarks-Chart-yellowgreen
+[benchmark-url]: https://amdgpu.juliagpu.org/bench/
+
 ## Quick start
 
 AMDGPU.jl can be installed with the Julia package manager.
diff --git a/perf/.gitignore b/perf/.gitignore
new file mode 100644
index 000000000..82d7c41fd
--- /dev/null
+++ b/perf/.gitignore
@@ -0,0 +1 @@
+benchmarkresults.json
diff --git a/perf/Project.toml b/perf/Project.toml
new file mode 100644
index 000000000..ddd8a2048
--- /dev/null
+++ b/perf/Project.toml
@@ -0,0 +1,5 @@
+[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/perf/amdgpu.jl b/perf/amdgpu.jl
new file mode 100644
index 000000000..8c88c62e5
--- /dev/null
+++ b/perf/amdgpu.jl
@@ -0,0 +1,11 @@
+group = addgroup!(SUITE, "amdgpu")
+
+let group = addgroup!(group, "synchronization")
+    let group = addgroup!(group, "stream")
+        group["blocking"]    = @benchmarkable AMDGPU.synchronize(; blocking=true)
+        group["nonblocking"] = @benchmarkable AMDGPU.synchronize(; blocking=false)
+    end
+    let group = addgroup!(group, "context")
+        group["device"] = @benchmarkable AMDGPU.device_synchronize()
+    end
+end
diff --git a/perf/array.jl b/perf/array.jl
new file mode 100644
index 000000000..b26b53749
--- /dev/null
+++ b/perf/array.jl
@@ -0,0 +1,155 @@
+const m = 512
+const n = 1000
+const m_long = 3
+const n_long = 1_000_000
+
+group = addgroup!(SUITE, "array")
+
+# generate some arrays
+cpu_mat = rand(rng, Float32, m, n)
+gpu_mat = ROCArray{Float32}(cpu_mat)
+gpu_mat_long = ROCArray{Float32}(rand(rng, Float32, m_long, n_long))
+gpu_vec = reshape(gpu_mat, length(gpu_mat))
+gpu_vec_long = reshape(gpu_mat_long, length(gpu_mat_long))
+gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
+gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
+gpu_mat_ints = ROCArray(rand(rng, -10:10, m, n))
+gpu_mat_long_ints = ROCArray(rand(rng, -10:10, m_long, n_long))
+gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
+gpu_mat_bools = ROCArray(rand(rng, Bool, m, n))
+gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
+
+group["construct"] = @benchmarkable ROCArray{Int}(undef, 1) evals=1
+
+group["copy"] = @async_benchmarkable copy($gpu_mat)
+
+gpu_mat2 = copy(gpu_mat)
+let group = addgroup!(group, "copyto!")
+    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
+    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
+    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
+end
+
+let group = addgroup!(group, "iteration")
+    group["scalar"] = @benchmarkable AMDGPU.allowscalar() do
+        [$gpu_vec[i] for i in 1:10]
+    end
+
+    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+
+    let group = addgroup!(group, "findall")
+        group["bool"] = @benchmarkable findall($gpu_vec_bools)
+        group["int"]  = @benchmarkable findall(isodd, $gpu_vec_ints)
+    end
+
+    let group = addgroup!(group, "findfirst")
+        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+        group["int"]  = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+    end
+
+    let group = addgroup!(group, "findmin") # findmax
+        group["1d"] = @async_benchmarkable findmin($gpu_vec)
+        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+    end
+end
+
+let group = addgroup!(group, "reverse")
+    group["1d"]        = @async_benchmarkable reverse($gpu_vec)
+    group["1dL"]       = @async_benchmarkable reverse($gpu_vec_long)
+    group["2d"]        = @async_benchmarkable reverse($gpu_mat; dims=1)
+    group["2dL"]       = @async_benchmarkable reverse($gpu_mat_long; dims=1)
+    group["1d_inplace"]  = @async_benchmarkable reverse!($gpu_vec)
+    group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long)
+    group["2d_inplace"]  = @async_benchmarkable reverse!($gpu_mat; dims=1)
+    group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)
+end
+
+group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
+
+# no need to benchmark the in-place version separately: the out-of-place call does the same work, plus an allocation
+let group = addgroup!(group, "accumulate")
+    let group = addgroup!(group, "Float32")
+        group["1d"]     = @async_benchmarkable accumulate(+, $gpu_vec)
+        group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
+        group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)
+        group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
+        group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)
+    end
+    let group = addgroup!(group, "Int64")
+        group["1d"]     = @async_benchmarkable accumulate(+, $gpu_vec_ints)
+        group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
+        group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)
+        group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
+        group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
+    end
+end
+
+let group = addgroup!(group, "reductions")
+    let group = addgroup!(group, "reduce")
+        let group = addgroup!(group, "Float32")
+            group["1d"]     = @async_benchmarkable reduce(+, $gpu_vec)
+            group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+            group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
+            group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
+            group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)
+        end
+        let group = addgroup!(group, "Int64")
+            group["1d"]     = @async_benchmarkable reduce(+, $gpu_vec_ints)
+            group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
+            group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
+            group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
+            group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)
+        end
+    end
+
+    let group = addgroup!(group, "mapreduce")
+        let group = addgroup!(group, "Float32")
+            group["1d"]     = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
+            group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
+            group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
+            group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
+            group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)
+        end
+        let group = addgroup!(group, "Int64")
+            group["1d"]     = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
+            group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
+            group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+            group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
+            group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
+        end
+    end
+
+    # used by sum, prod, minimum, maximum, all, any, count
+end
+
+let group = addgroup!(group, "random")
+    let group = addgroup!(group, "rand")
+        group["Float32"] = @async_benchmarkable AMDGPU.rand(Float32, $m*$n)
+        group["Int64"]   = @async_benchmarkable AMDGPU.rand(Int64, $m*$n)
+    end
+
+    let group = addgroup!(group, "rand!")
+        group["Float32"] = @async_benchmarkable rand!($gpu_vec)
+        group["Int64"]   = @async_benchmarkable rand!($gpu_vec_ints)
+    end
+
+    let group = addgroup!(group, "randn")
+        group["Float32"] = @async_benchmarkable AMDGPU.randn(Float32, $m*$n)
+    end
+
+    let group = addgroup!(group, "randn!")
+        group["Float32"] = @async_benchmarkable randn!($gpu_vec)
+    end
+end
+
+let group = addgroup!(group, "sorting")
+    group["1d"] = @async_benchmarkable sort($gpu_vec)
+    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+end
+
+let group = addgroup!(group, "permutedims")
+    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
+    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
+    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
+end
diff --git a/perf/byval.jl b/perf/byval.jl
new file mode 100644
index 000000000..863c63203
--- /dev/null
+++ b/perf/byval.jl
@@ -0,0 +1,72 @@
+module ByVal
+
+using AMDGPU, BenchmarkTools, Random
+
+const threads = 256
+
+# simple add matrices kernel
+function kernel_add_mat(n, x1, x2, y)
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    if i <= n
+        @inbounds y[i] = x1[i] + x2[i]
+    end
+    return
+end
+
+@inline get_inputs3(indx_y, a, b, c)                          = (a, b, c)
+@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2)           = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
+@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3)
+
+# add arrays of matrices kernel
+function kernel_add_mat_z_slices(n, vararg...)
+    x1, x2, y = get_inputs3(workgroupIdx().y, vararg...)
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    if i <= n
+        @inbounds y[i] = x1[i] + x2[i]
+    end
+    return
+end
+
+function add_z_slices!(y, x1, x2)
+    m1, n1 = size(x1[1])
+    blocks = (m1 * n1 + threads - 1) ÷ threads
+    @roc groupsize=threads gridsize=(blocks, length(x1)) kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+end
+
+function add!(y, x1, x2)
+    m1, n1 = size(x1)
+    blocks = (m1 * n1 + threads - 1) ÷ threads
+    @roc groupsize=threads gridsize=(blocks, 1) kernel_add_mat(m1 * n1, x1, x2, y)
+end
+
+function main()
+    results = BenchmarkGroup()
+
+    num_z_slices = 3
+    Random.seed!(1)
+
+    m, n = 3072, 1536    # 256 multiplier
+
+    x1 = [ROCArray(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [ROCArray(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    y1 = [similar(x1[1]) for i = 1:num_z_slices]
+
+    # reference down-to-bones add on GPU
+    results["reference"] = @benchmark AMDGPU.@sync add!($y1[1], $x1[1], $x2[1])
+
+    # adding arrays in an array
+    for slices = 1:num_z_slices
+        results["slices=$slices"] = @benchmark AMDGPU.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+    end
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    AMDGPU.unsafe_free!.(x1)
+    AMDGPU.unsafe_free!.(x2)
+    AMDGPU.unsafe_free!.(y1)
+
+    return results
+end
+
+end
+
+ByVal.main()
diff --git a/perf/kernel.jl b/perf/kernel.jl
new file mode 100644
index 000000000..7543fce3f
--- /dev/null
+++ b/perf/kernel.jl
@@ -0,0 +1,27 @@
+group = addgroup!(SUITE, "kernel")
+
+group["launch"] = @benchmarkable @roc identity(nothing)
+
+src = AMDGPU.rand(Float32, 512, 1000)
+dest = similar(src)
+
+function indexing_kernel(dest, src)  # copies src[i] into dest[i], one element per workitem
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # global linear index
+    @inbounds dest[i] = src[i]  # bounds checks elided
+    return
+end
+group["indexing"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $indexing_kernel($dest, $src)
+
+function checked_indexing_kernel(dest, src)  # same copy as indexing_kernel, without @inbounds
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x  # global linear index
+    dest[i] = src[i]  # bounds checks left enabled
+    return
+end
+group["indexing_checked"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $checked_indexing_kernel($dest, $src)
+
+function rand_kernel(dest::AbstractArray{T}) where {T}
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    dest[i] = rand(T)
+    return
+end
+group["rand"] = @async_benchmarkable @roc groupsize=size($dest,1) gridsize=size($dest,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
new file mode 100644
index 000000000..166d5162c
--- /dev/null
+++ b/perf/latency.jl
@@ -0,0 +1,39 @@
+module Latency
+
+using AMDGPU
+using BenchmarkTools
+
+function main()
+    results = BenchmarkGroup()
+
+    base_cmd = Base.julia_cmd()
+    if Base.JLOptions().project != C_NULL
+        base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
+    end
+    # NOTE: we don't use Base.active_project() here because of how CI launches this script,
+    #       starting with --project in the main AMDGPU.jl project.
+
+    # time to precompile the package and its dependencies
+    precompile_cmd =
+        `$base_cmd -e "pkg = Base.identify_package(\"AMDGPU\")
+                       Base.compilecache(pkg)"`
+    results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60
+
+    # time to actually import the package
+    import_cmd =
+        `$base_cmd -e "using AMDGPU"`
+    results["import"] = @benchmark run($import_cmd) evals=1 seconds=30
+
+    # time to actually compile a kernel (time to first kernel)
+    ttfp_cmd =
+        `$base_cmd -e "using AMDGPU
+                       kernel() = return
+                       AMDGPU.code_gcn(devnull, kernel, Tuple{}; kernel=true)"`
+    results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60
+
+    results
+end
+
+end
+
+Latency.main()
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
new file mode 100644
index 000000000..9a1bb5b39
--- /dev/null
+++ b/perf/runbenchmarks.jl
@@ -0,0 +1,59 @@
+# Benchmark suite execution
+using AMDGPU
+
+using BenchmarkTools
+
+using Random, StableRNGs
+rng = StableRNG(123)
+
+# print system information
+AMDGPU.versioninfo()
+
+# convenience macro to create a benchmark that requires synchronizing the GPU
+macro async_benchmarkable(ex...)
+    quote
+        @benchmarkable AMDGPU.@sync $(ex...)
+    end
+end
+
+# before anything else, run latency benchmarks. these spawn subprocesses, so we don't want
+# to do so after regular benchmarks have caused the memory allocator to reserve memory.
+@info "Running latency benchmarks"
+latency_results = include("latency.jl")
+
+SUITE = BenchmarkGroup()
+
+include("amdgpu.jl")
+include("kernel.jl")
+include("array.jl")
+
+@info "Preparing main benchmarks"
+warmup(SUITE; verbose=false)
+tune!(SUITE)
+
+# reclaim memory that might have been used by the tuning process
+GC.gc(true)
+AMDGPU.reclaim()
+
+# benchmark groups that aren't part of the suite
+addgroup!(SUITE, "integration")
+
+@info "Running main benchmarks"
+results = run(SUITE, verbose=true)
+
+# integration tests (that do nasty things, so need to be run last)
+@info "Running integration benchmarks"
+integration_results = BenchmarkGroup()
+integration_results["volumerhs"] = include("volumerhs.jl")
+integration_results["byval"] = include("byval.jl")
+
+results["latency"] = latency_results
+results["integration"] = integration_results
+
+# write out the results; unless a path is given as ARGS[1] they land next to this script
+# (perf/benchmarkresults.json, the path CI uploads), whatever directory julia started in.
+# we report the minimum rather than the median: at the sub-microsecond scale of many of
+# these benchmarks, OS scheduler jitter dominates the median (5-15% trial-to-trial variance),
+# while the minimum is the un-preempted path: stable to <1%, yet real regressions still show.
+result_file = length(ARGS) >= 1 ? ARGS[1] : joinpath(@__DIR__, "benchmarkresults.json")
+BenchmarkTools.save(result_file, minimum(results))
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
new file mode 100644
index 000000000..d81024c73
--- /dev/null
+++ b/perf/volumerhs.jl
@@ -0,0 +1,214 @@
+module VolumeRHS
+
+using BenchmarkTools
+using AMDGPU
+using StableRNGs
+using StaticArrays
+
+function loopinfo(name, expr, nodes...)
+    if expr.head != :for
+        error("Syntax error: pragma $name needs a for loop")
+    end
+    push!(expr.args[2].args, Expr(:loopinfo, nodes...))
+    return expr
+end
+
+macro unroll(expr)
+    expr = loopinfo("@unroll", expr, (Symbol("llvm.loop.unroll.full"),))
+    return esc(expr)
+end
+
+# note the order of the fields below is also assumed in the code.
+const _nstate = 5
+const _ρ, _U, _V, _W, _E = 1:_nstate
+const stateid = (ρ = _ρ, U = _U, V = _V, W = _W, E = _E)
+
+const _nvgeo = 14
+const _ξx, _ηx, _ζx, _ξy, _ηy, _ζy, _ξz, _ηz, _ζz, _MJ, _MJI,
+      _x, _y, _z = 1:_nvgeo
+const vgeoid = (ξx = _ξx, ηx = _ηx, ζx = _ζx,
+                ξy = _ξy, ηy = _ηy, ζy = _ζy,
+                ξz = _ξz, ηz = _ηz, ζz = _ζz,
+                MJ = _MJ, MJI = _MJI,
+                x = _x,   y = _y,   z = _z)
+
+const N = 4
+const nmoist = 0
+const ntrace = 0
+
+Base.@irrational grav  9.81 BigFloat(9.81)
+Base.@irrational gdm1  0.4  BigFloat(0.4)
+
+function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
+    Q    = Base.Experimental.Const(Q)
+    vgeo = Base.Experimental.Const(vgeo)
+    D    = Base.Experimental.Const(D)
+
+    nvar = _nstate + nmoist + ntrace
+    Nq   = N + 1
+
+    s_D = @ROCStaticLocalArray(eltype(D), (Nq, Nq))
+    s_F = @ROCStaticLocalArray(eltype(Q), (Nq, Nq, _nstate))
+    s_G = @ROCStaticLocalArray(eltype(Q), (Nq, Nq, _nstate))
+
+    r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef)
+    r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef)
+    r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef)
+    r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef)
+    r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef)
+
+    e = workgroupIdx().x
+    j = workitemIdx().y
+    i = workitemIdx().x
+
+    @inbounds begin
+        for k in 1:Nq
+            r_rhsρ[k] = zero(eltype(rhs))
+            r_rhsU[k] = zero(eltype(rhs))
+            r_rhsV[k] = zero(eltype(rhs))
+            r_rhsW[k] = zero(eltype(rhs))
+            r_rhsE[k] = zero(eltype(rhs))
+        end
+
+        # fetch D into shared
+        s_D[i, j] = D[i, j]
+        @unroll for k in 1:Nq
+            sync_workgroup()
+
+            # Load values we need into registers
+            MJ = vgeo[i, j, k, _MJ, e]
+            ξx, ξy, ξz = vgeo[i,j,k,_ξx,e], vgeo[i,j,k,_ξy,e], vgeo[i,j,k,_ξz,e]
+            ηx, ηy, ηz = vgeo[i,j,k,_ηx,e], vgeo[i,j,k,_ηy,e], vgeo[i,j,k,_ηz,e]
+            ζx, ζy, ζz = vgeo[i,j,k,_ζx,e], vgeo[i,j,k,_ζy,e], vgeo[i,j,k,_ζz,e]
+            z = vgeo[i,j,k,_z,e]
+
+            U, V, W = Q[i, j, k, _U, e], Q[i, j, k, _V, e], Q[i, j, k, _W, e]
+            ρ, E   = Q[i, j, k, _ρ, e], Q[i, j, k, _E, e]
+
+            ρinv  = @fastmath inv(ρ)
+            ρ2inv = @fastmath inv(2ρ)
+
+            P = gdm1 * (E - (U^2 + V^2 + W^2) * ρ2inv - ρ * gravity * z)
+
+            fluxρ_x = U
+            fluxU_x = ρinv * U * U + P
+            fluxV_x = ρinv * U * V
+            fluxW_x = ρinv * U * W
+            fluxE_x = ρinv * U * (E + P)
+
+            fluxρ_y = V
+            fluxU_y = ρinv * V * U
+            fluxV_y = ρinv * V * V + P
+            fluxW_y = ρinv * V * W
+            fluxE_y = ρinv * V * (E + P)
+
+            fluxρ_z = W
+            fluxU_z = ρinv * W * U
+            fluxV_z = ρinv * W * V
+            fluxW_z = ρinv * W * W + P
+            fluxE_z = ρinv * W * (E + P)
+
+            s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z)
+            s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z)
+            s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z)
+            s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z)
+            s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z)
+
+            s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z)
+            s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z)
+            s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z)
+            s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z)
+            s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z)
+
+            r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z)
+            r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z)
+            r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z)
+            r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z)
+            r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z)
+
+            # one shared access per 10 flops
+            for n = 1:Nq
+                Dkn = s_D[k, n]
+                r_rhsρ[n] += Dkn * r_Hρ
+                r_rhsU[n] += Dkn * r_HU
+                r_rhsV[n] += Dkn * r_HV
+                r_rhsW[n] += Dkn * r_HW
+                r_rhsE[n] += Dkn * r_HE
+            end
+
+            r_rhsW[k] -= MJ * ρ * gravity
+
+            sync_workgroup()
+
+            # loop over ξ-grid lines
+            @unroll for n = 1:Nq
+                Dni = s_D[n, i]
+                Dnj = s_D[n, j]
+
+                r_rhsρ[k] += Dni * s_F[n, j, _ρ]
+                r_rhsρ[k] += Dnj * s_G[i, n, _ρ]
+
+                r_rhsU[k] += Dni * s_F[n, j, _U]
+                r_rhsU[k] += Dnj * s_G[i, n, _U]
+
+                r_rhsV[k] += Dni * s_F[n, j, _V]
+                r_rhsV[k] += Dnj * s_G[i, n, _V]
+
+                r_rhsW[k] += Dni * s_F[n, j, _W]
+                r_rhsW[k] += Dnj * s_G[i, n, _W]
+
+                r_rhsE[k] += Dni * s_F[n, j, _E]
+                r_rhsE[k] += Dnj * s_G[i, n, _E]
+            end
+        end # k
+
+        @unroll for k in 1:Nq
+            MJI = vgeo[i, j, k, _MJI, e]
+            rhs[i, j, k, _U, e] += MJI * r_rhsU[k]
+            rhs[i, j, k, _V, e] += MJI * r_rhsV[k]
+            rhs[i, j, k, _W, e] += MJI * r_rhsW[k]
+            rhs[i, j, k, _ρ, e] += MJI * r_rhsρ[k]
+            rhs[i, j, k, _E, e] += MJI * r_rhsE[k]
+        end
+    end
+    return
+end
+
+function main()
+    DFloat = Float32
+    nelem  = 240_000
+
+    rng = StableRNG(123)
+
+    Nq   = N + 1
+    nvar = _nstate + nmoist + ntrace
+
+    Q    = 1 .+ ROCArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem))
+    Q[:, :, :, _E, :] .+= 20
+
+    vgeo = ROCArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem))
+    vgeo[:, :, :, _MJ,  :] .+= 3
+    vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :]
+
+    D   = ROCArray(rand(rng, DFloat, Nq, Nq))
+    rhs = ROCArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem))
+
+    groupsize = (N+1, N+1)
+
+    results = @benchmark begin
+        AMDGPU.@sync @roc groupsize=$groupsize gridsize=$nelem $volumerhs!(
+            $rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem)
+    end
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    AMDGPU.unsafe_free!(rhs)
+    AMDGPU.unsafe_free!(Q)
+    AMDGPU.unsafe_free!(vgeo)
+    AMDGPU.unsafe_free!(D)
+
+    results
+end
+
+end
+
+VolumeRHS.main()

From 2bb39a8b232f8f176bbf6e909420ea1ee806dbb5 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 20 May 2026 15:23:27 +0200
Subject: [PATCH 2/7] Tweaks

---
 .buildkite/pipeline.yml | 4 ++--
 perf/array.jl           | 6 +++---
 perf/runbenchmarks.jl   | 3 +--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 332d333f7..4420a7796 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -191,7 +191,7 @@ steps:
   - label: ":racehorse: Benchmarks"
     plugins:
       - JuliaCI/julia#v1:
-          version: "1.10"
+          version: "1.12"
     command: |
       julia --project=perf -e '
         println("--- :julia: Instantiating benchmark project")
@@ -206,7 +206,7 @@ steps:
     agents:
       queue: "juliagpu"
       rocm: "*"
-      rocmgpu: "*"
+      rocmgpu: "gfx1100"
     if: build.message !~ /\[skip benchmarks\]/
     timeout_in_minutes: 120
     env:
diff --git a/perf/array.jl b/perf/array.jl
index b26b53749..32c92dde1 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -118,7 +118,6 @@ let group = addgroup!(group, "reductions")
             group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
         end
     end
-
     # used by sum, prod, minimum, maximum, all, any, count
 end
 
@@ -144,8 +143,9 @@ end
 
 let group = addgroup!(group, "sorting")
     group["1d"] = @async_benchmarkable sort($gpu_vec)
-    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+    # There is no GPU-side support for `dims=` or `by=`: both fall back to Base's CPU sort path, which triggers scalar indexing.
+    # group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+    # group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
 end
 
 let group = addgroup!(group, "permutedims")
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index 9a1bb5b39..0fbeba38b 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -28,7 +28,6 @@ include("kernel.jl")
 include("array.jl")
 
 @info "Preparing main benchmarks"
-warmup(SUITE; verbose=false)
 tune!(SUITE)
 
 # reclaim memory that might have been used by the tuning process
@@ -41,7 +40,7 @@ addgroup!(SUITE, "integration")
 @info "Running main benchmarks"
 results = run(SUITE, verbose=true)
 
-# integration tests (that do nasty things, so need to be run last)
+# integration tests (that need to be run last)
 @info "Running integration benchmarks"
 integration_results = BenchmarkGroup()
 integration_results["volumerhs"] = include("volumerhs.jl")

From 7661fa7b7443008a9b18fd7403760b24c0601704 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 20 May 2026 15:41:25 +0200
Subject: [PATCH 3/7] Fixup

---
 perf/volumerhs.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
index d81024c73..8170f8400 100644
--- a/perf/volumerhs.jl
+++ b/perf/volumerhs.jl
@@ -40,9 +40,10 @@ Base.@irrational grav  9.81 BigFloat(9.81)
 Base.@irrational gdm1  0.4  BigFloat(0.4)
 
 function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
-    Q    = Base.Experimental.Const(Q)
-    vgeo = Base.Experimental.Const(vgeo)
-    D    = Base.Experimental.Const(D)
+    # not supported on AMDGPU currently
+    # Q    = Base.Experimental.Const(Q)
+    # vgeo = Base.Experimental.Const(vgeo)
+    # D    = Base.Experimental.Const(D)
 
     nvar = _nstate + nmoist + ntrace
     Nq   = N + 1

From a4575d9d256a4d4b8b7b4de46d33b4267cae68eb Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Fri, 22 May 2026 00:19:15 +0200
Subject: [PATCH 4/7] Fix hangs on AMDGPU

---
 perf/array.jl         |  6 +++---
 perf/runbenchmarks.jl | 24 +++++++++++++++++++++---
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/perf/array.jl b/perf/array.jl
index 32c92dde1..8ab79c7fd 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -35,11 +35,11 @@ let group = addgroup!(group, "iteration")
         [$gpu_vec[i] for i in 1:10]
     end
 
-    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+    group["logical"] = @async_benchmarkable $gpu_vec[$gpu_vec_bools]
 
     let group = addgroup!(group, "findall")
-        group["bool"] = @benchmarkable findall($gpu_vec_bools)
-        group["int"]  = @benchmarkable findall(isodd, $gpu_vec_ints)
+        group["bool"] = @async_benchmarkable findall($gpu_vec_bools)
+        group["int"]  = @async_benchmarkable findall(isodd, $gpu_vec_ints)
     end
 
     let group = addgroup!(group, "findfirst")
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index 0fbeba38b..c50bed50c 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -9,10 +9,14 @@ rng = StableRNG(123)
 # print system information
 AMDGPU.versioninfo()
 
-# convenience macro to create a benchmark that requires synchronizing the GPU
+# convenience macro to create a benchmark that requires synchronizing the GPU.
+# The setup=(GC+sync) runs before each sample (outside the timing window) to
+# drain pending hipFreeAsync calls before the next allocation, preventing
+# HIP memory pool exhaustion on discrete GPUs.
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable AMDGPU.@sync $(ex...)
+        @benchmarkable(AMDGPU.@sync($(ex...)),
+                       setup=(GC.gc(false); AMDGPU.synchronize()))
     end
 end
 
@@ -28,7 +32,21 @@ include("kernel.jl")
 include("array.jl")
 
 @info "Preparing main benchmarks"
-tune!(SUITE)
+# tune!() uses a doubling strategy (maxevals=1,2,4,…) that exhausts the HIP
+# memory pool on discrete GPUs. Instead, warmup for compilation and fix evals=1:
+# one GPU round-trip per sample is the right granularity anyway.
+warmup(SUITE; verbose=false)
+
+function set_evals!(group::BenchmarkGroup, evals::Int=1)
+    # Walk the suite tree: recurse into nested groups and set `params.evals`
+    # to `evals` on every entry that is not itself a BenchmarkGroup.
+    for node in values(group)
+        node isa BenchmarkGroup && (set_evals!(node, evals); continue)
+        # non-group entry: override its evaluation count
+        node.params.evals = evals
+    end
+end
+set_evals!(SUITE)
 
 # reclaim memory that might have been used by the tuning process
 GC.gc(true)

From db0076d7a5f3396f1211baa191d58de52b1feb27 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Fri, 22 May 2026 09:29:17 +0200
Subject: [PATCH 5/7] Fix path

---
 .buildkite/pipeline.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 8e069d055..95efa6fe3 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -202,7 +202,7 @@ steps:
         println("+++ :julia: Running benchmarks")
         include("perf/runbenchmarks.jl")'
     artifact_paths:
-      - "perf/benchmarkresults.json"
+      - "benchmarkresults.json"
     agents:
       queue: "juliagpu"
       rocm: "*"

From c1846e6703d7cb38764dc758586e7d1d2a845894 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Mon, 25 May 2026 21:54:11 +0200
Subject: [PATCH 6/7] Switch benchmark workflow trigger to pull_request

---
 .github/workflows/Benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
index 87b12a96e..57a4cfc4d 100644
--- a/.github/workflows/Benchmark.yml
+++ b/.github/workflows/Benchmark.yml
@@ -6,7 +6,7 @@ permissions:
   deployments: write    # deploy GitHub pages website
 
 on:
-  pull_request_target:
+  pull_request:
     branches:
       - main
   push:
@@ -42,8 +42,8 @@ jobs:
           output-file-path: ${{ steps.locate.outputs.path }}
           benchmark-data-dir-path: "bench"
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          comment-always: ${{ github.event_name == 'pull_request_target' }}
+          comment-always: ${{ github.event_name == 'pull_request' }}
           summary-always: true
           alert-threshold: "125%"
           fail-on-alert: false
-          auto-push: ${{ github.event_name != 'pull_request_target' }}
+          auto-push: ${{ github.event_name != 'pull_request' }}

From 6965cd1f9bea0a7a89bc83cd09fc087cd5611564 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Tue, 26 May 2026 10:42:43 +0200
Subject: [PATCH 7/7] Shorten comment explaining why tune!() is skipped

---
 perf/runbenchmarks.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index c50bed50c..042177583 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -32,9 +32,8 @@ include("kernel.jl")
 include("array.jl")
 
 @info "Preparing main benchmarks"
-# tune!() uses a doubling strategy (maxevals=1,2,4,…) that exhausts the HIP
-# memory pool on discrete GPUs. Instead, warmup for compilation and fix evals=1:
-# one GPU round-trip per sample is the right granularity anyway.
+# tune!() uses a strategy that exhausts the HIP memory pool on discrete GPUs.
+# Instead, warmup for compilation and fix evals=1: one GPU round-trip per sample.
 warmup(SUITE; verbose=false)
 
 function set_evals!(group::BenchmarkGroup, evals::Int=1)