JuliaGPU · luraess · May 26, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -188,6 +188,32 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 10
 
+  - label: ":racehorse: Benchmarks"  # benchmark step: runs perf/runbenchmarks.jl inside the perf project
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.12"
+    command: |
+      julia --project=perf -e '
+        println("--- :julia: Instantiating benchmark project")
+        using Pkg
+        Pkg.develop(PackageSpec(path="."))
+        Pkg.instantiate()
+
+        println("+++ :julia: Running benchmarks")
+        include("perf/runbenchmarks.jl")'
+    artifact_paths:
+      - "benchmarkresults.json"  # same file name the GitHub "Benchmarks" workflow searches for
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "gfx1100"  # NOTE(review): presumably pins one GPU model so timings stay comparable - confirm
+    if: build.message !~ /\[skip benchmarks\]/  # step is skipped when the build message contains "[skip benchmarks]"
+    timeout_in_minutes: 120
+    env:  # NOTE(review): presumably makes a missing ROCm/HIP runtime a hard error and uses the system ROCm - confirm
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+
 env:
   JULIA_AMDGPU_LOGGING_ENABLED: true
   SECRET_CODECOV_TOKEN: "lVqFGgrywYmQrILXBcP8i6TosP+q/W2oTDVLIdkWFWscd/a61oSVb8Tycq3qvngsrdmKU9EevdQo+1x+w7cu4IuTq63ahQc0RFgi4Q29hC52OgN2wFql984Cqq3T96P3jyV0ZljaRT+a+9AY0oWmmCph55amvvQ4DOMq3tfGDbp7gdueQvJmSYQGVT3/9Sjn4/esYppcKBGltQqQX2E7WrHLpnqRmsmjcSeZ/S/+PgPRb4ZnpBecAUP2d/MlPgKfP0ZUGbDlcbGu+ZDZNksxKIYuAlNrWPhpNAro7hACfEk4T5RRpNiwmJyXJZ8LUD8zNYIUKSmHjUtmqhNXgujWXA==;U2FsdGVkX1/v/P2Y7KZsvC55Au6eET37uDE6M5I6J275maix+SMD0EoJQ19cFp/lae+G8V7dvpPGfrh4hj2nOg=="

diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
@@ -0,0 +1,49 @@
+name: Benchmarks
+permissions:
+  statuses: read        # find Buildkite URL from PR status
+  contents: write       # update benchmark contents in gh-pages branch
+  pull-requests: write  # comment on PR with benchmark results
+  deployments: write    # deploy GitHub pages website
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download Buildkite Artifacts
+        id: download  # its `download-state` output gates the locate step
+        uses: EnricoMi/download-buildkite-artifact-action@v1
+        with:
+          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
+          ignore_build_states: blocked,canceled,skipped,not_run
+          ignore_job_states: timed_out,failed
+          output_path: artifacts  # directory the locate step searches with `find`
+
+      - name: Locate Benchmarks Artifact  # `-print -quit` keeps the output to one path: a multi-line value is invalid in $GITHUB_OUTPUT
+        id: locate  # output `path`: first benchmarkresults.json under artifacts/, or empty when none exists
+        if: ${{ steps.download.outputs.download-state == 'success' }}
+        run: echo "path=$(find artifacts -type f -name benchmarkresults.json -print -quit 2>/dev/null)" >> "$GITHUB_OUTPUT"
+
+      - name: Upload Benchmark Results
+        if: ${{ steps.locate.outputs.path != '' }}  # skipped when no benchmarkresults.json was located
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: AMDGPU.jl Benchmarks
+          tool: "julia"
+          output-file-path: ${{ steps.locate.outputs.path }}
+          benchmark-data-dir-path: "bench"  # the README benchmark link points at .../bench/
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          comment-always: ${{ github.event_name == 'pull_request' }}  # true only for pull_request events
+          summary-always: true
+          alert-threshold: "125%"
+          fail-on-alert: false  # NOTE(review): presumably regressions are reported without failing the job - confirm
+          auto-push: ${{ github.event_name != 'pull_request' }}  # false for pull requests; true for the push-to-main trigger
diff --git a/README.md b/README.md
@@ -22,6 +22,9 @@
 [downloads-img]: https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Fmonthly_downloads%2FAMDGPU&query=total_requests&suffix=%2Fmonth&label=Downloads
 [downloads-url]: https://juliapkgstats.com/pkg/AMDGPU
 
+[benchmark-img]: https://img.shields.io/badge/benchmarks-Chart-yellowgreen
+[benchmark-url]: https://amdgpu.juliagpu.org/bench/
+
 ## Quick start
 
 AMDGPU.jl can be installed with the Julia package manager.

diff --git a/perf/.gitignore b/perf/.gitignore
@@ -0,0 +1 @@
+benchmarkresults.json
diff --git a/perf/Project.toml b/perf/Project.toml
@@ -0,0 +1,5 @@
+[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/perf/amdgpu.jl b/perf/amdgpu.jl
@@ -0,0 +1,11 @@
+group = addgroup!(SUITE, "amdgpu")
+
+# Benchmarks of the synchronization entry points, keyed "synchronization/<scope>/<variant>".
+let sync = addgroup!(group, "synchronization")
+    stream_sync = addgroup!(sync, "stream")
+    stream_sync["blocking"]    = @benchmarkable AMDGPU.synchronize(; blocking=true)
+    stream_sync["nonblocking"] = @benchmarkable AMDGPU.synchronize(; blocking=false)
+
+    device_sync = addgroup!(sync, "context")
+    device_sync["device"] = @benchmarkable AMDGPU.device_synchronize()
+end
diff --git a/perf/array.jl b/perf/array.jl
@@ -0,0 +1,155 @@
+const m = 512  # rows of the standard matrices
+const n = 1000  # columns of the standard matrices
+const m_long = 3  # rows of the "long" matrices
+const n_long = 1_000_000  # columns of the "long" matrices
+
+group = addgroup!(SUITE, "array")
+
+# fixture arrays shared by the benchmarks in this file; `rng` and `SUITE` must be defined by the including script
+cpu_mat = rand(rng, Float32, m, n)
+gpu_mat = ROCArray{Float32}(cpu_mat)
+gpu_mat_long = ROCArray{Float32}(rand(rng, Float32, m_long, n_long))
+gpu_vec = reshape(gpu_mat, length(gpu_mat))  # flattened reshape of gpu_mat (presumably aliases its memory - confirm)
+gpu_vec_long = reshape(gpu_mat_long, length(gpu_mat_long))
+gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))  # 40 * 25 == n
+gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))  # 10 * 10 * 10 == n
+gpu_mat_ints = ROCArray(rand(rng, -10:10, m, n))  # integers drawn from -10:10
+gpu_mat_long_ints = ROCArray(rand(rng, -10:10, m_long, n_long))
+gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
+gpu_mat_bools = ROCArray(rand(rng, Bool, m, n))
+gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
+
+group["construct"] = @benchmarkable ROCArray{Int}(undef, 1) evals=1  # one-element uninitialized array, one evaluation per sample
+
+group["copy"] = @async_benchmarkable copy($gpu_mat)
+
+gpu_mat2 = copy(gpu_mat)  # destination buffer for the gpu_to_gpu copy below
+let group = addgroup!(group, "copyto!")
+    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)  # overwrites gpu_mat
+    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)  # overwrites cpu_mat
+    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
+end
+
+let group = addgroup!(group, "iteration")
+    group["scalar"] = @benchmarkable AMDGPU.allowscalar() do  # ten scalar reads, inside an allowscalar block
+        [$gpu_vec[i] for i in 1:10]
+    end
+
+    group["logical"] = @async_benchmarkable $gpu_vec[$gpu_vec_bools]  # indexing with a Bool mask
+
+    let group = addgroup!(group, "findall")
+        group["bool"] = @async_benchmarkable findall($gpu_vec_bools)
+        group["int"]  = @async_benchmarkable findall(isodd, $gpu_vec_ints)
+    end
+
+    let group = addgroup!(group, "findfirst")  # registered with plain @benchmarkable, unlike the neighbouring groups
+        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+        group["int"]  = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+    end
+
+    let group = addgroup!(group, "findmin") # findmax is not benchmarked separately (NOTE(review): presumably the same code path - confirm)
+        group["1d"] = @async_benchmarkable findmin($gpu_vec)
+        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+    end
+end
+
+let group = addgroup!(group, "reverse")  # "L" keys use the long fixtures (gpu_vec_long / gpu_mat_long)
+    group["1d"]        = @async_benchmarkable reverse($gpu_vec)
+    group["1dL"]       = @async_benchmarkable reverse($gpu_vec_long)
+    group["2d"]        = @async_benchmarkable reverse($gpu_mat; dims=1)
+    group["2dL"]       = @async_benchmarkable reverse($gpu_mat_long; dims=1)
+    group["1d_inplace"]  = @async_benchmarkable reverse!($gpu_vec)  # the reverse! variants modify the shared fixtures
+    group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long)
+    group["2d_inplace"]  = @async_benchmarkable reverse!($gpu_mat; dims=1)
+    group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)  # NOTE(review): only case with dims=2 ("2dL" uses dims=1) - confirm intended
+end
+
+group["broadcast"] = @async_benchmarkable $gpu_mat2 .= 0f0  # scratch target: zeroing gpu_mat would also zero its reshapes (gpu_vec, gpu_arr_3d, gpu_arr_4d) that other benchmarks read
+
+# only the out-of-place version is benchmarked: it performs the same operation as the in-place one, plus an allocation
+let group = addgroup!(group, "accumulate")  # "L" keys use the long (m_long x n_long) matrices
+    let group = addgroup!(group, "Float32")
+        group["1d"]     = @async_benchmarkable accumulate(+, $gpu_vec)
+        group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
+        group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)
+        group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
+        group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)
+    end
+    let group = addgroup!(group, "Int64")  # integer fixtures come from rand(rng, -10:10, ...), i.e. Int (Int64 on 64-bit Julia)
+        group["1d"]     = @async_benchmarkable accumulate(+, $gpu_vec_ints)
+        group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
+        group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)
+        group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
+        group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
+    end
+end
+
+let group = addgroup!(group, "reductions")  # "L" keys use the long (m_long x n_long) matrices
+    let group = addgroup!(group, "reduce")
+        let group = addgroup!(group, "Float32")
+            group["1d"]     = @async_benchmarkable reduce(+, $gpu_vec)
+            group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+            group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
+            group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
+            group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)
+        end
+        let group = addgroup!(group, "Int64")
+            group["1d"]     = @async_benchmarkable reduce(+, $gpu_vec_ints)
+            group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
+            group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
+            group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
+            group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)
+        end
+    end
+
+    let group = addgroup!(group, "mapreduce")  # same cases as "reduce", with the map x -> x + 1 applied first
+        let group = addgroup!(group, "Float32")
+            group["1d"]     = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
+            group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
+            group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
+            group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
+            group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)
+        end
+        let group = addgroup!(group, "Int64")
+            group["1d"]     = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
+            group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
+            group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+            group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
+            group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
+        end
+    end
+    # per the original note, mapreduce is what sum, prod, minimum, maximum, all, any and count use, so they get no separate benchmarks (NOTE(review): confirm)
+end
+
+let group = addgroup!(group, "random")
+    let group = addgroup!(group, "rand")  # requests m*n elements per call
+        group["Float32"] = @async_benchmarkable AMDGPU.rand(Float32, $m*$n)
+        group["Int64"]   = @async_benchmarkable AMDGPU.rand(Int64, $m*$n)
+    end
+
+    let group = addgroup!(group, "rand!")  # mutating variants: called on the shared fixtures gpu_vec / gpu_vec_ints
+        group["Float32"] = @async_benchmarkable rand!($gpu_vec)
+        group["Int64"]   = @async_benchmarkable rand!($gpu_vec_ints)
+    end
+
+    let group = addgroup!(group, "randn")
+        group["Float32"] = @async_benchmarkable AMDGPU.randn(Float32, $m*$n)
+    end
+
+    let group = addgroup!(group, "randn!")
+        group["Float32"] = @async_benchmarkable randn!($gpu_vec)
+    end
+end
+
+let group = addgroup!(group, "sorting")
+    group["1d"] = @async_benchmarkable sort($gpu_vec)
+    # Disabled: per the original note, sort has no GPU-side support for `dims=` or `by=`; both fall back to Base's CPU sort path, which triggers scalar indexing.
+    # group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+    # group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+end
+
+let group = addgroup!(group, "permutedims")  # one permutation each for the 2-, 3- and 4-dimensional reshapes of gpu_mat
+    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
+    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
+    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
+end
diff --git a/perf/byval.jl b/perf/byval.jl
@@ -0,0 +1,72 @@
+module ByVal
+
+using AMDGPU, BenchmarkTools, Random
+
+const threads = 256  # workgroup size for every launch below
+
+# Device kernel: y[i] = x1[i] + x2[i] for linear indices up to n.
+function kernel_add_mat(n, x1, x2, y)
+    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
+    i > n && return  # work-items past the end of the data do nothing
+    @inbounds y[i] = x1[i] + x2[i]
+    return
+end
+
+# Pick the (x1, x2, y) triple for slice `indx_y` out of the flattened argument list.
+@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
+@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
+@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) =
+    indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3)
+
+# Device kernel: the same addition, on the slice selected by the workgroup's y index.
+function kernel_add_mat_z_slices(n, vararg...)
+    x1, x2, y = get_inputs3(workgroupIdx().y, vararg...)
+    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
+    i > n && return
+    @inbounds y[i] = x1[i] + x2[i]
+    return
+end
+
+# Add up to three matrices per operand in a single launch; grid y enumerates the slices.
+function add_z_slices!(y, x1, x2)
+    rows, cols = size(x1[1])
+    grid = (cld(rows * cols, threads), length(x1))
+    @roc groupsize=threads gridsize=grid kernel_add_mat_z_slices(rows * cols, x1..., x2..., y...)
+end
+
+# Add a single pair of matrices; the baseline the sliced variant is compared against.
+function add!(y, x1, x2)
+    rows, cols = size(x1)
+    grid = (cld(rows * cols, threads), 1)
+    @roc groupsize=threads gridsize=grid kernel_add_mat(rows * cols, x1, x2, y)
+end
+
+# Time the plain add and the sliced add for 1 to 3 slices; returns a BenchmarkGroup.
+function main()
+    results = BenchmarkGroup()
+    num_z_slices = 3
+
+    Random.seed!(1)
+    m, n = 3072, 1536  # both multiples of `threads`
+
+    x1 = [ROCArray(randn(Float32, m, n) .+ 0.5f0) for _ in 1:num_z_slices]
+    x2 = [ROCArray(randn(Float32, m, n) .+ 0.5f0) for _ in 1:num_z_slices]
+    y1 = [similar(x1[1]) for _ in 1:num_z_slices]
+
+    # baseline: one matrix per operand, passed to the kernel directly
+    results["reference"] = @benchmark AMDGPU.@sync add!($y1[1], $x1[1], $x2[1])
+
+    # the same addition with 1, 2 and 3 matrices splatted into a single launch
+    for slices in 1:num_z_slices
+        results["slices=$slices"] = @benchmark AMDGPU.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+    end
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    foreach(AMDGPU.unsafe_free!, Iterators.flatten((x1, x2, y1)))
+
+    return results
+end
+
+end
+
+ByVal.main()
diff --git a/perf/kernel.jl b/perf/kernel.jl
@@ -0,0 +1,27 @@
+group = addgroup!(SUITE, "kernel")
+
+group["launch"] = @benchmarkable @roc identity(nothing)  # launches `identity` on `nothing`; plain @benchmarkable (NOTE(review): presumably times launch overhead only - confirm)
+
+src = AMDGPU.rand(Float32, 512, 1000)  # 512x1000 Float32 input with random contents
+dest = similar(src)  # output buffer matching src
+
+function indexing_kernel(dest, src)  # device kernel: each work-item copies one element; @inbounds skips bounds checks
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    @inbounds dest[i] = src[i]
+    return
+end
+group["indexing"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $indexing_kernel($dest, $src)  # `$src` in the launch config too: no non-const global lookup in the timed code
+
+function checked_indexing_kernel(dest, src)  # device kernel: same copy as indexing_kernel, without @inbounds
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    dest[i] = src[i]
+    return
+end
+group["indexing_checked"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $checked_indexing_kernel($dest, $src)  # `$src` in the launch config too: no non-const global lookup in the timed code
+
+function rand_kernel(dest::AbstractArray{T}) where {T}  # device kernel: each work-item stores one rand(T) value
+    idx = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
+    dest[idx] = rand(T)
+    return
+end
+group["rand"] = @async_benchmarkable @roc groupsize=size($dest,1) gridsize=size($dest,2) $rand_kernel($dest)