From 8ba6e0d690b12f1ad952902f56e2db1ab552a60a Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Wed, 20 May 2026 13:09:40 +0200 Subject: [PATCH 1/7] Add initial bench machinery --- .buildkite/pipeline.yml | 26 ++++ .github/workflows/Benchmark.yml | 49 ++++++++ README.md | 3 + perf/.gitignore | 1 + perf/Project.toml | 5 + perf/amdgpu.jl | 11 ++ perf/array.jl | 155 +++++++++++++++++++++++ perf/byval.jl | 72 +++++++++++ perf/kernel.jl | 27 ++++ perf/latency.jl | 39 ++++++ perf/runbenchmarks.jl | 59 +++++++++ perf/volumerhs.jl | 214 ++++++++++++++++++++++++++++++++ 12 files changed, 661 insertions(+) create mode 100644 .github/workflows/Benchmark.yml create mode 100644 perf/.gitignore create mode 100644 perf/Project.toml create mode 100644 perf/amdgpu.jl create mode 100644 perf/array.jl create mode 100644 perf/byval.jl create mode 100644 perf/kernel.jl create mode 100644 perf/latency.jl create mode 100644 perf/runbenchmarks.jl create mode 100644 perf/volumerhs.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index dfedd031f..332d333f7 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -188,6 +188,32 @@ steps: if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 10 + - label: ":racehorse: Benchmarks" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia --project=perf -e ' + println("--- :julia: Instantiating benchmark project") + using Pkg + Pkg.develop(PackageSpec(path=".")) + Pkg.instantiate() + + println("+++ :julia: Running benchmarks") + include("perf/runbenchmarks.jl")' + artifact_paths: + - "perf/benchmarkresults.json" + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip benchmarks\]/ + timeout_in_minutes: 120 + env: + JULIA_AMDGPU_CORE_MUST_LOAD: "1" + JULIA_AMDGPU_HIP_MUST_LOAD: "1" + JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" + env: JULIA_AMDGPU_LOGGING_ENABLED: true SECRET_CODECOV_TOKEN: 
"lVqFGgrywYmQrILXBcP8i6TosP+q/W2oTDVLIdkWFWscd/a61oSVb8Tycq3qvngsrdmKU9EevdQo+1x+w7cu4IuTq63ahQc0RFgi4Q29hC52OgN2wFql984Cqq3T96P3jyV0ZljaRT+a+9AY0oWmmCph55amvvQ4DOMq3tfGDbp7gdueQvJmSYQGVT3/9Sjn4/esYppcKBGltQqQX2E7WrHLpnqRmsmjcSeZ/S/+PgPRb4ZnpBecAUP2d/MlPgKfP0ZUGbDlcbGu+ZDZNksxKIYuAlNrWPhpNAro7hACfEk4T5RRpNiwmJyXJZ8LUD8zNYIUKSmHjUtmqhNXgujWXA==;U2FsdGVkX1/v/P2Y7KZsvC55Au6eET37uDE6M5I6J275maix+SMD0EoJQ19cFp/lae+G8V7dvpPGfrh4hj2nOg==" diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml new file mode 100644 index 000000000..87b12a96e --- /dev/null +++ b/.github/workflows/Benchmark.yml @@ -0,0 +1,49 @@ +name: Benchmarks +permissions: + statuses: read # find Buildkite URL from PR status + contents: write # update benchmark contents in gh-pages branch + pull-requests: write # comment on PR with benchmark results + deployments: write # deploy GitHub pages website + +on: + pull_request_target: + branches: + - main + push: + branches: + - main + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download Buildkite Artifacts + id: download + uses: EnricoMi/download-buildkite-artifact-action@v1 + with: + buildkite_token: ${{ secrets.BUILDKITE_TOKEN }} + ignore_build_states: blocked,canceled,skipped,not_run + ignore_job_states: timed_out,failed + output_path: artifacts + + - name: Locate Benchmarks Artifact + id: locate + if: ${{ steps.download.outputs.download-state == 'success' }} + run: echo "path=$(find artifacts -type f -name benchmarkresults.json 2>/dev/null)" >> $GITHUB_OUTPUT + + - name: Upload Benchmark Results + if: ${{ steps.locate.outputs.path != '' }} + uses: benchmark-action/github-action-benchmark@v1 + with: + name: AMDGPU.jl Benchmarks + tool: "julia" + output-file-path: ${{ steps.locate.outputs.path }} + benchmark-data-dir-path: "bench" + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: ${{ github.event_name == 'pull_request_target' }} + summary-always: true + alert-threshold: 
"125%" + fail-on-alert: false + auto-push: ${{ github.event_name != 'pull_request_target' }} diff --git a/README.md b/README.md index fb88c3081..a3d9bacf4 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,9 @@ [downloads-img]: https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Fmonthly_downloads%2FAMDGPU&query=total_requests&suffix=%2Fmonth&label=Downloads [downloads-url]: https://juliapkgstats.com/pkg/AMDGPU +[benchmark-img]: https://img.shields.io/badge/benchmarks-Chart-yellowgreen +[benchmark-url]: https://amdgpu.juliagpu.org/bench/ + ## Quick start AMDGPU.jl can be installed with the Julia package manager. diff --git a/perf/.gitignore b/perf/.gitignore new file mode 100644 index 000000000..82d7c41fd --- /dev/null +++ b/perf/.gitignore @@ -0,0 +1 @@ +benchmarkresults.json diff --git a/perf/Project.toml b/perf/Project.toml new file mode 100644 index 000000000..ddd8a2048 --- /dev/null +++ b/perf/Project.toml @@ -0,0 +1,5 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/amdgpu.jl b/perf/amdgpu.jl new file mode 100644 index 000000000..8c88c62e5 --- /dev/null +++ b/perf/amdgpu.jl @@ -0,0 +1,11 @@ +group = addgroup!(SUITE, "amdgpu") + +let group = addgroup!(group, "synchronization") + let group = addgroup!(group, "stream") + group["blocking"] = @benchmarkable AMDGPU.synchronize(; blocking=true) + group["nonblocking"] = @benchmarkable AMDGPU.synchronize(; blocking=false) + end + let group = addgroup!(group, "context") + group["device"] = @benchmarkable AMDGPU.device_synchronize() + end +end diff --git a/perf/array.jl b/perf/array.jl new file mode 100644 index 000000000..b26b53749 --- /dev/null +++ b/perf/array.jl @@ -0,0 +1,155 @@ +const m = 512 +const n = 1000 +const m_long = 3 +const n_long = 1_000_000 + +group = addgroup!(SUITE, 
"array") + +# generate some arrays +cpu_mat = rand(rng, Float32, m, n) +gpu_mat = ROCArray{Float32}(cpu_mat) +gpu_mat_long = ROCArray{Float32}(rand(rng, Float32, m_long, n_long)) +gpu_vec = reshape(gpu_mat, length(gpu_mat)) +gpu_vec_long = reshape(gpu_mat_long, length(gpu_mat_long)) +gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) +gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) +gpu_mat_ints = ROCArray(rand(rng, -10:10, m, n)) +gpu_mat_long_ints = ROCArray(rand(rng, -10:10, m_long, n_long)) +gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) +gpu_mat_bools = ROCArray(rand(rng, Bool, m, n)) +gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) + +group["construct"] = @benchmarkable ROCArray{Int}(undef, 1) evals=1 + +group["copy"] = @async_benchmarkable copy($gpu_mat) + +gpu_mat2 = copy(gpu_mat) +let group = addgroup!(group, "copyto!") + group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) + group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) + group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) +end + +let group = addgroup!(group, "iteration") + group["scalar"] = @benchmarkable AMDGPU.allowscalar() do + [$gpu_vec[i] for i in 1:10] + end + + group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] + + let group = addgroup!(group, "findall") + group["bool"] = @benchmarkable findall($gpu_vec_bools) + group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) + end + + let group = addgroup!(group, "findfirst") + group["bool"] = @benchmarkable findfirst($gpu_vec_bools) + group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) + end + + let group = addgroup!(group, "findmin") # findmax + group["1d"] = @async_benchmarkable findmin($gpu_vec) + group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) + end +end + +let group = addgroup!(group, "reverse") + group["1d"] = @async_benchmarkable reverse($gpu_vec) + group["1dL"] = @async_benchmarkable reverse($gpu_vec_long) + group["2d"] = 
@async_benchmarkable reverse($gpu_mat; dims=1) + group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims=1) + group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) + group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long) + group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) + group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2) +end + +group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 + +# only the out-of-place version is benchmarked: it performs the same operation as the in-place one, plus an allocation +let group = addgroup!(group, "accumulate") + let group = addgroup!(group, "Float32") + group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) + group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) + group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2) + group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1) + group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2) + end + let group = addgroup!(group, "Int64") + group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints) + group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1) + group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2) + group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1) + group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2) + end +end + +let group = addgroup!(group, "reductions") + let group = addgroup!(group, "reduce") + let group = addgroup!(group, "Float32") + group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) + group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) + group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2) + group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1) + group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2) + end + let group = addgroup!(group, "Int64") + group["1d"] = 
@async_benchmarkable reduce(+, $gpu_vec_ints) + group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1) + group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2) + group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1) + group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2) + end + end + + let group = addgroup!(group, "mapreduce") + let group = addgroup!(group, "Float32") + group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) + group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) + group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2) + group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1) + group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2) + end + let group = addgroup!(group, "Int64") + group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints) + group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) + group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) + group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1) + group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2) + end + end + + # used by sum, prod, minimum, maximum, all, any, count +end + +let group = addgroup!(group, "random") + let group = addgroup!(group, "rand") + group["Float32"] = @async_benchmarkable AMDGPU.rand(Float32, $m*$n) + group["Int64"] = @async_benchmarkable AMDGPU.rand(Int64, $m*$n) + end + + let group = addgroup!(group, "rand!") + group["Float32"] = @async_benchmarkable rand!($gpu_vec) + group["Int64"] = @async_benchmarkable rand!($gpu_vec_ints) + end + + let group = addgroup!(group, "randn") + group["Float32"] = @async_benchmarkable AMDGPU.randn(Float32, $m*$n) + end + + let group = addgroup!(group, "randn!") + group["Float32"] = @async_benchmarkable 
randn!($gpu_vec) + end +end + +let group = addgroup!(group, "sorting") + group["1d"] = @async_benchmarkable sort($gpu_vec) + group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) + group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +end + +let group = addgroup!(group, "permutedims") + group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) + group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) + group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) +end diff --git a/perf/byval.jl b/perf/byval.jl new file mode 100644 index 000000000..863c63203 --- /dev/null +++ b/perf/byval.jl @@ -0,0 +1,72 @@ +module ByVal + +using AMDGPU, BenchmarkTools, Random + +const threads = 256 + +# simple add matrices kernel +function kernel_add_mat(n, x1, x2, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i <= n + @inbounds y[i] = x1[i] + x2[i] + end + return +end + +@inline get_inputs3(indx_y, a, b, c) = (a, b, c) +@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2) +@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3) + +# add arrays of matrices kernel +function kernel_add_mat_z_slices(n, vararg...) + x1, x2, y = get_inputs3(workgroupIdx().y, vararg...) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i <= n + @inbounds y[i] = x1[i] + x2[i] + end + return +end + +function add_z_slices!(y, x1, x2) + m1, n1 = size(x1[1]) + blocks = (m1 * n1 + threads - 1) ÷ threads + @roc groupsize=threads gridsize=(blocks, length(x1)) kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...) 
+end + +function add!(y, x1, x2) + m1, n1 = size(x1) + blocks = (m1 * n1 + threads - 1) ÷ threads + @roc groupsize=threads gridsize=(blocks, 1) kernel_add_mat(m1 * n1, x1, x2, y) +end + +function main() + results = BenchmarkGroup() + + num_z_slices = 3 + Random.seed!(1) + + m, n = 3072, 1536 # 256 multiplier + + x1 = [ROCArray(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] + x2 = [ROCArray(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] + y1 = [similar(x1[1]) for i = 1:num_z_slices] + + # reference down-to-bones add on GPU + results["reference"] = @benchmark AMDGPU.@sync add!($y1[1], $x1[1], $x2[1]) + + # adding arrays in an array + for slices = 1:num_z_slices + results["slices=$slices"] = @benchmark AMDGPU.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) + end + + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them + AMDGPU.unsafe_free!.(x1) + AMDGPU.unsafe_free!.(x2) + AMDGPU.unsafe_free!.(y1) + + return results +end + +end + +ByVal.main() diff --git a/perf/kernel.jl b/perf/kernel.jl new file mode 100644 index 000000000..7543fce3f --- /dev/null +++ b/perf/kernel.jl @@ -0,0 +1,27 @@ +group = addgroup!(SUITE, "kernel") + +group["launch"] = @benchmarkable @roc identity(nothing) + +src = AMDGPU.rand(Float32, 512, 1000) +dest = similar(src) + +function indexing_kernel(dest, src) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds dest[i] = src[i] + return +end +group["indexing"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $indexing_kernel($dest, $src) + +function checked_indexing_kernel(dest, src) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + dest[i] = src[i] + return +end +group["indexing_checked"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $checked_indexing_kernel($dest, $src) + +function rand_kernel(dest::AbstractArray{T}) where {T} + i = (workgroupIdx().x - 1) * 
workgroupDim().x + workitemIdx().x + dest[i] = rand(T) + return +end +group["rand"] = @async_benchmarkable @roc groupsize=size($dest,1) gridsize=size($dest,2) $rand_kernel($dest) diff --git a/perf/latency.jl b/perf/latency.jl new file mode 100644 index 000000000..166d5162c --- /dev/null +++ b/perf/latency.jl @@ -0,0 +1,39 @@ +module Latency + +using AMDGPU +using BenchmarkTools + +function main() + results = BenchmarkGroup() + + base_cmd = Base.julia_cmd() + if Base.JLOptions().project != C_NULL + base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))` + end + # NOTE: we don't use Base.active_project() here because of how CI launches this script, + # starting with --project in the main AMDGPU.jl project. + + # time to precompile the package and its dependencies + precompile_cmd = + `$base_cmd -e "pkg = Base.identify_package(\"AMDGPU\") + Base.compilecache(pkg)"` + results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60 + + # time to actually import the package + import_cmd = + `$base_cmd -e "using AMDGPU"` + results["import"] = @benchmark run($import_cmd) evals=1 seconds=30 + + # time to actually compile a kernel (time to first kernel) + ttfp_cmd = + `$base_cmd -e "using AMDGPU + kernel() = return + AMDGPU.code_gcn(devnull, kernel, Tuple{}; kernel=true)"` + results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60 + + results +end + +end + +Latency.main() diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl new file mode 100644 index 000000000..9a1bb5b39 --- /dev/null +++ b/perf/runbenchmarks.jl @@ -0,0 +1,59 @@ +# Benchmark suite execution +using AMDGPU + +using BenchmarkTools + +using Random, StableRNGs +rng = StableRNG(123) + +# print system information +AMDGPU.versioninfo() + +# convenience macro to create a benchmark that requires synchronizing the GPU +macro async_benchmarkable(ex...) + quote + @benchmarkable AMDGPU.@sync $(ex...) + end +end + +# before anything else, run latency benchmarks. 
these spawn subprocesses, so we don't want +# to do so after regular benchmarks have caused the memory allocator to reserve memory. +@info "Running latency benchmarks" +latency_results = include("latency.jl") + +SUITE = BenchmarkGroup() + +include("amdgpu.jl") +include("kernel.jl") +include("array.jl") + +@info "Preparing main benchmarks" +warmup(SUITE; verbose=false) +tune!(SUITE) + +# reclaim memory that might have been used by the tuning process +GC.gc(true) +AMDGPU.reclaim() + +# benchmark groups that aren't part of the suite +addgroup!(SUITE, "integration") + +@info "Running main benchmarks" +results = run(SUITE, verbose=true) + +# integration tests (that do nasty things, so need to be run last) +@info "Running integration benchmarks" +integration_results = BenchmarkGroup() +integration_results["volumerhs"] = include("volumerhs.jl") +integration_results["byval"] = include("byval.jl") + +results["latency"] = latency_results +results["integration"] = integration_results + +# write out the results +# we report the minimum rather than the median: at the sub-microsecond scale of many +# of these benchmarks, OS scheduler jitter dominates the median and produces 5-15% +# trial-to-trial variance, while the minimum reflects the un-preempted code path +# and is stable to <1% across trials. real regressions still show up in the minimum. +result_file = length(ARGS) >= 1 ? ARGS[1] : "benchmarkresults.json" +BenchmarkTools.save(result_file, minimum(results)) diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl new file mode 100644 index 000000000..d81024c73 --- /dev/null +++ b/perf/volumerhs.jl @@ -0,0 +1,214 @@ +module VolumeRHS + +using BenchmarkTools +using AMDGPU +using StableRNGs +using StaticArrays + +function loopinfo(name, expr, nodes...) 
+ if expr.head != :for + error("Syntax error: pragma $name needs a for loop") + end + push!(expr.args[2].args, Expr(:loopinfo, nodes...)) + return expr +end + +macro unroll(expr) + expr = loopinfo("@unroll", expr, (Symbol("llvm.loop.unroll.full"),)) + return esc(expr) +end + +# note the order of the fields below is also assumed in the code. +const _nstate = 5 +const _ρ, _U, _V, _W, _E = 1:_nstate +const stateid = (ρ = _ρ, U = _U, V = _V, W = _W, E = _E) + +const _nvgeo = 14 +const _ξx, _ηx, _ζx, _ξy, _ηy, _ζy, _ξz, _ηz, _ζz, _MJ, _MJI, + _x, _y, _z = 1:_nvgeo +const vgeoid = (ξx = _ξx, ηx = _ηx, ζx = _ζx, + ξy = _ξy, ηy = _ηy, ζy = _ζy, + ξz = _ξz, ηz = _ηz, ζz = _ζz, + MJ = _MJ, MJI = _MJI, + x = _x, y = _y, z = _z) + +const N = 4 +const nmoist = 0 +const ntrace = 0 + +Base.@irrational grav 9.81 BigFloat(9.81) +Base.@irrational gdm1 0.4 BigFloat(0.4) + +function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) + Q = Base.Experimental.Const(Q) + vgeo = Base.Experimental.Const(vgeo) + D = Base.Experimental.Const(D) + + nvar = _nstate + nmoist + ntrace + Nq = N + 1 + + s_D = @ROCStaticLocalArray(eltype(D), (Nq, Nq)) + s_F = @ROCStaticLocalArray(eltype(Q), (Nq, Nq, _nstate)) + s_G = @ROCStaticLocalArray(eltype(Q), (Nq, Nq, _nstate)) + + r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) + + e = workgroupIdx().x + j = workitemIdx().y + i = workitemIdx().x + + @inbounds begin + for k in 1:Nq + r_rhsρ[k] = zero(eltype(rhs)) + r_rhsU[k] = zero(eltype(rhs)) + r_rhsV[k] = zero(eltype(rhs)) + r_rhsW[k] = zero(eltype(rhs)) + r_rhsE[k] = zero(eltype(rhs)) + end + + # fetch D into shared + s_D[i, j] = D[i, j] + @unroll for k in 1:Nq + sync_workgroup() + + # Load values we need into registers + MJ = vgeo[i, j, k, _MJ, e] + ξx, ξy, ξz = vgeo[i,j,k,_ξx,e], vgeo[i,j,k,_ξy,e], vgeo[i,j,k,_ξz,e] + 
ηx, ηy, ηz = vgeo[i,j,k,_ηx,e], vgeo[i,j,k,_ηy,e], vgeo[i,j,k,_ηz,e] + ζx, ζy, ζz = vgeo[i,j,k,_ζx,e], vgeo[i,j,k,_ζy,e], vgeo[i,j,k,_ζz,e] + z = vgeo[i,j,k,_z,e] + + U, V, W = Q[i, j, k, _U, e], Q[i, j, k, _V, e], Q[i, j, k, _W, e] + ρ, E = Q[i, j, k, _ρ, e], Q[i, j, k, _E, e] + + ρinv = @fastmath inv(ρ) + ρ2inv = @fastmath inv(2ρ) + + P = gdm1 * (E - (U^2 + V^2 + W^2) * ρ2inv - ρ * gravity * z) + + fluxρ_x = U + fluxU_x = ρinv * U * U + P + fluxV_x = ρinv * U * V + fluxW_x = ρinv * U * W + fluxE_x = ρinv * U * (E + P) + + fluxρ_y = V + fluxU_y = ρinv * V * U + fluxV_y = ρinv * V * V + P + fluxW_y = ρinv * V * W + fluxE_y = ρinv * V * (E + P) + + fluxρ_z = W + fluxU_z = ρinv * W * U + fluxV_z = ρinv * W * V + fluxW_z = ρinv * W * W + P + fluxE_z = ρinv * W * (E + P) + + s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z) + s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z) + s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z) + s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z) + s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z) + + s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z) + s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z) + s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z) + s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z) + s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z) + + r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z) + r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z) + r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z) + r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z) + r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z) + + # one shared access per 10 flops + for n = 1:Nq + Dkn = s_D[k, n] + r_rhsρ[n] += Dkn * r_Hρ + r_rhsU[n] += Dkn * r_HU + r_rhsV[n] += Dkn * r_HV + r_rhsW[n] += Dkn * r_HW + r_rhsE[n] += Dkn * r_HE + end + + r_rhsW[k] -= MJ * ρ * gravity 
+ + sync_workgroup() + + # loop over ξ-grid lines + @unroll for n = 1:Nq + Dni = s_D[n, i] + Dnj = s_D[n, j] + + r_rhsρ[k] += Dni * s_F[n, j, _ρ] + r_rhsρ[k] += Dnj * s_G[i, n, _ρ] + + r_rhsU[k] += Dni * s_F[n, j, _U] + r_rhsU[k] += Dnj * s_G[i, n, _U] + + r_rhsV[k] += Dni * s_F[n, j, _V] + r_rhsV[k] += Dnj * s_G[i, n, _V] + + r_rhsW[k] += Dni * s_F[n, j, _W] + r_rhsW[k] += Dnj * s_G[i, n, _W] + + r_rhsE[k] += Dni * s_F[n, j, _E] + r_rhsE[k] += Dnj * s_G[i, n, _E] + end + end # k + + @unroll for k in 1:Nq + MJI = vgeo[i, j, k, _MJI, e] + rhs[i, j, k, _U, e] += MJI * r_rhsU[k] + rhs[i, j, k, _V, e] += MJI * r_rhsV[k] + rhs[i, j, k, _W, e] += MJI * r_rhsW[k] + rhs[i, j, k, _ρ, e] += MJI * r_rhsρ[k] + rhs[i, j, k, _E, e] += MJI * r_rhsE[k] + end + end + return +end + +function main() + DFloat = Float32 + nelem = 240_000 + + rng = StableRNG(123) + + Nq = N + 1 + nvar = _nstate + nmoist + ntrace + + Q = 1 .+ ROCArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) + Q[:, :, :, _E, :] .+= 20 + + vgeo = ROCArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) + vgeo[:, :, :, _MJ, :] .+= 3 + vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] + + D = ROCArray(rand(rng, DFloat, Nq, Nq)) + rhs = ROCArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) + + groupsize = (N+1, N+1) + + results = @benchmark begin + AMDGPU.@sync @roc groupsize=$groupsize gridsize=$nelem $volumerhs!( + $rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem) + end + + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them + AMDGPU.unsafe_free!(rhs) + AMDGPU.unsafe_free!(Q) + AMDGPU.unsafe_free!(vgeo) + AMDGPU.unsafe_free!(D) + + results +end + +end + +VolumeRHS.main() From 2bb39a8b232f8f176bbf6e909420ea1ee806dbb5 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Wed, 20 May 2026 15:23:27 +0200 Subject: [PATCH 2/7] Tweaks --- .buildkite/pipeline.yml | 4 ++-- perf/array.jl | 6 +++--- perf/runbenchmarks.jl | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git 
a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 332d333f7..4420a7796 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -191,7 +191,7 @@ steps: - label: ":racehorse: Benchmarks" plugins: - JuliaCI/julia#v1: - version: "1.10" + version: "1.12" command: | julia --project=perf -e ' println("--- :julia: Instantiating benchmark project") @@ -206,7 +206,7 @@ steps: agents: queue: "juliagpu" rocm: "*" - rocmgpu: "*" + rocmgpu: "gfx1100" if: build.message !~ /\[skip benchmarks\]/ timeout_in_minutes: 120 env: diff --git a/perf/array.jl b/perf/array.jl index b26b53749..32c92dde1 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -118,7 +118,6 @@ let group = addgroup!(group, "reductions") group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2) end end - # used by sum, prod, minimum, maximum, all, any, count end @@ -144,8 +143,9 @@ end let group = addgroup!(group, "sorting") group["1d"] = @async_benchmarkable sort($gpu_vec) - group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) - group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) + # there's no GPU-side support for dims= or by=. 
Both fall back to Base's CPU sort path, which triggers scalar indexing + # group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) + # group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) end let group = addgroup!(group, "permutedims") diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 9a1bb5b39..0fbeba38b 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -28,7 +28,6 @@ include("kernel.jl") include("array.jl") @info "Preparing main benchmarks" -warmup(SUITE; verbose=false) tune!(SUITE) # reclaim memory that might have been used by the tuning process @@ -41,7 +40,7 @@ addgroup!(SUITE, "integration") @info "Running main benchmarks" results = run(SUITE, verbose=true) -# integration tests (that do nasty things, so need to be run last) +# integration tests (that need to be run last) @info "Running integration benchmarks" integration_results = BenchmarkGroup() integration_results["volumerhs"] = include("volumerhs.jl") From 7661fa7b7443008a9b18fd7403760b24c0601704 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Wed, 20 May 2026 15:41:25 +0200 Subject: [PATCH 3/7] Fixup --- perf/volumerhs.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index d81024c73..8170f8400 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -40,9 +40,10 @@ Base.@irrational grav 9.81 BigFloat(9.81) Base.@irrational gdm1 0.4 BigFloat(0.4) function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) - Q = Base.Experimental.Const(Q) - vgeo = Base.Experimental.Const(vgeo) - D = Base.Experimental.Const(D) + # not supported on AMDGPU currently + # Q = Base.Experimental.Const(Q) + # vgeo = Base.Experimental.Const(vgeo) + # D = Base.Experimental.Const(D) nvar = _nstate + nmoist + ntrace Nq = N + 1 From a4575d9d256a4d4b8b7b4de46d33b4267cae68eb Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Fri, 22 May 2026 00:19:15 +0200 Subject: [PATCH 4/7] Fix hangs on AMDGPU --- perf/array.jl | 6 +++--- 
perf/runbenchmarks.jl | 24 +++++++++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index 32c92dde1..8ab79c7fd 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -35,11 +35,11 @@ let group = addgroup!(group, "iteration") [$gpu_vec[i] for i in 1:10] end - group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] + group["logical"] = @async_benchmarkable $gpu_vec[$gpu_vec_bools] let group = addgroup!(group, "findall") - group["bool"] = @benchmarkable findall($gpu_vec_bools) - group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) + group["bool"] = @async_benchmarkable findall($gpu_vec_bools) + group["int"] = @async_benchmarkable findall(isodd, $gpu_vec_ints) end let group = addgroup!(group, "findfirst") diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 0fbeba38b..c50bed50c 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -9,10 +9,14 @@ rng = StableRNG(123) # print system information AMDGPU.versioninfo() -# convenience macro to create a benchmark that requires synchronizing the GPU +# convenience macro to create a benchmark that requires synchronizing the GPU. +# The setup=(GC+sync) runs before each sample (outside the timing window) to +# drain pending hipFreeAsync calls before the next allocation, preventing +# HIP memory pool exhaustion on discrete GPUs. macro async_benchmarkable(ex...) quote - @benchmarkable AMDGPU.@sync $(ex...) + @benchmarkable(AMDGPU.@sync($(ex...)), + setup=(GC.gc(false); AMDGPU.synchronize())) end end @@ -28,7 +32,21 @@ include("kernel.jl") include("array.jl") @info "Preparing main benchmarks" -tune!(SUITE) +# tune!() uses a doubling strategy (maxevals=1,2,4,…) that exhausts the HIP +# memory pool on discrete GPUs. Instead, warmup for compilation and fix evals=1: +# one GPU round-trip per sample is the right granularity anyway. 
+warmup(SUITE; verbose=false) + +function set_evals!(group::BenchmarkGroup, evals::Int=1) + for (_, b) in group + if b isa BenchmarkGroup + set_evals!(b, evals) + else + b.params.evals = evals + end + end +end +set_evals!(SUITE) # reclaim memory that might have been used by the tuning process GC.gc(true) From db0076d7a5f3396f1211baa191d58de52b1feb27 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Fri, 22 May 2026 09:29:17 +0200 Subject: [PATCH 5/7] Fix path --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8e069d055..95efa6fe3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -202,7 +202,7 @@ steps: println("+++ :julia: Running benchmarks") include("perf/runbenchmarks.jl")' artifact_paths: - - "perf/benchmarkresults.json" + - "benchmarkresults.json" agents: queue: "juliagpu" rocm: "*" From c1846e6703d7cb38764dc758586e7d1d2a845894 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Mon, 25 May 2026 21:54:11 +0200 Subject: [PATCH 6/7] Up workflow --- .github/workflows/Benchmark.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml index 87b12a96e..57a4cfc4d 100644 --- a/.github/workflows/Benchmark.yml +++ b/.github/workflows/Benchmark.yml @@ -6,7 +6,7 @@ permissions: deployments: write # deploy GitHub pages website on: - pull_request_target: + pull_request: branches: - main push: @@ -42,8 +42,8 @@ jobs: output-file-path: ${{ steps.locate.outputs.path }} benchmark-data-dir-path: "bench" github-token: ${{ secrets.GITHUB_TOKEN }} - comment-always: ${{ github.event_name == 'pull_request_target' }} + comment-always: ${{ github.event_name == 'pull_request' }} summary-always: true alert-threshold: "125%" fail-on-alert: false - auto-push: ${{ github.event_name != 'pull_request_target' }} + auto-push: ${{ github.event_name != 'pull_request' }} From 
6965cd1f9bea0a7a89bc83cd09fc087cd5611564 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 26 May 2026 10:42:43 +0200 Subject: [PATCH 7/7] Up comment --- perf/runbenchmarks.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index c50bed50c..042177583 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -32,9 +32,8 @@ include("kernel.jl") include("array.jl") @info "Preparing main benchmarks" -# tune!() uses a doubling strategy (maxevals=1,2,4,…) that exhausts the HIP -# memory pool on discrete GPUs. Instead, warmup for compilation and fix evals=1: -# one GPU round-trip per sample is the right granularity anyway. +# tune!() uses a strategy that exhausts the HIP memory pool on discrete GPUs. +# Instead, warmup for compilation and fix evals=1: one GPU round-trip per sample. warmup(SUITE; verbose=false) function set_evals!(group::BenchmarkGroup, evals::Int=1)