Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,32 @@ steps:
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 10

# Benchmark step: develops the checked-out package into the `perf` project,
# instantiates it, runs perf/runbenchmarks.jl, and uploads
# benchmarkresults.json as a build artifact.
- label: ":racehorse: Benchmarks"
plugins:
- JuliaCI/julia#v1:
version: "1.12"
command: |
julia --project=perf -e '
println("--- :julia: Instantiating benchmark project")
using Pkg
Pkg.develop(PackageSpec(path="."))
Pkg.instantiate()

println("+++ :julia: Running benchmarks")
include("perf/runbenchmarks.jl")'
artifact_paths:
- "benchmarkresults.json"
# Agent selection: the juliagpu queue, any ROCm version, gfx1100 GPU.
agents:
queue: "juliagpu"
rocm: "*"
rocmgpu: "gfx1100"
# Skipped when the build message contains "[skip benchmarks]".
if: build.message !~ /\[skip benchmarks\]/
timeout_in_minutes: 120
# NOTE(review): presumably these make AMDGPU.jl fail hard when the system ROCm
# libraries cannot be loaded and disable artifact-provided binaries — confirm
# against the AMDGPU.jl documentation.
env:
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"

env:
JULIA_AMDGPU_LOGGING_ENABLED: true
SECRET_CODECOV_TOKEN: "lVqFGgrywYmQrILXBcP8i6TosP+q/W2oTDVLIdkWFWscd/a61oSVb8Tycq3qvngsrdmKU9EevdQo+1x+w7cu4IuTq63ahQc0RFgi4Q29hC52OgN2wFql984Cqq3T96P3jyV0ZljaRT+a+9AY0oWmmCph55amvvQ4DOMq3tfGDbp7gdueQvJmSYQGVT3/9Sjn4/esYppcKBGltQqQX2E7WrHLpnqRmsmjcSeZ/S/+PgPRb4ZnpBecAUP2d/MlPgKfP0ZUGbDlcbGu+ZDZNksxKIYuAlNrWPhpNAro7hACfEk4T5RRpNiwmJyXJZ8LUD8zNYIUKSmHjUtmqhNXgujWXA==;U2FsdGVkX1/v/P2Y7KZsvC55Au6eET37uDE6M5I6J275maix+SMD0EoJQ19cFp/lae+G8V7dvpPGfrh4hj2nOg=="
Expand Down
49 changes: 49 additions & 0 deletions .github/workflows/Benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Publishes the benchmark results produced by the Buildkite "Benchmarks" step:
# downloads the Buildkite artifacts for this commit, locates
# benchmarkresults.json, and hands it to github-action-benchmark.
name: Benchmarks
permissions:
  statuses: read # find Buildkite URL from PR status
  contents: write # update benchmark contents in gh-pages branch
  pull-requests: write # comment on PR with benchmark results
  deployments: write # deploy GitHub pages website

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Download Buildkite Artifacts
        id: download
        uses: EnricoMi/download-buildkite-artifact-action@v1
        with:
          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
          ignore_build_states: blocked,canceled,skipped,not_run
          ignore_job_states: timed_out,failed
          output_path: artifacts

      # Stop at the first match (-print -quit): each record written to
      # $GITHUB_OUTPUT must be a single `name=value` line, so several matches
      # (one per line) would produce an invalid output file.
      - name: Locate Benchmarks Artifact
        id: locate
        if: ${{ steps.download.outputs.download-state == 'success' }}
        run: echo "path=$(find artifacts -type f -name benchmarkresults.json -print -quit 2>/dev/null)" >> "$GITHUB_OUTPUT"

      # Runs only when an artifact was found. Results are pushed only for
      # non-PR events; PR runs get a comment instead.
      - name: Upload Benchmark Results
        if: ${{ steps.locate.outputs.path != '' }}
        uses: benchmark-action/github-action-benchmark@v1
        with:
          name: AMDGPU.jl Benchmarks
          tool: "julia"
          output-file-path: ${{ steps.locate.outputs.path }}
          benchmark-data-dir-path: "bench"
          github-token: ${{ secrets.GITHUB_TOKEN }}
          comment-always: ${{ github.event_name == 'pull_request' }}
          summary-always: true
          alert-threshold: "125%"
          fail-on-alert: false
          auto-push: ${{ github.event_name != 'pull_request' }}
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
[downloads-img]: https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Fmonthly_downloads%2FAMDGPU&query=total_requests&suffix=%2Fmonth&label=Downloads
[downloads-url]: https://juliapkgstats.com/pkg/AMDGPU

[benchmark-img]: https://img.shields.io/badge/benchmarks-Chart-yellowgreen
[benchmark-url]: https://amdgpu.juliagpu.org/bench/

## Quick start

AMDGPU.jl can be installed with the Julia package manager.
Expand Down
1 change: 1 addition & 0 deletions perf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
benchmarkresults.json
5 changes: 5 additions & 0 deletions perf/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
11 changes: 11 additions & 0 deletions perf/amdgpu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Benchmarks registered under SUITE["amdgpu"].
group = addgroup!(SUITE, "amdgpu")

# Synchronization benchmarks, split into "stream" and "context" sub-groups.
let sync_group = addgroup!(group, "synchronization")
    stream_group = addgroup!(sync_group, "stream")
    stream_group["blocking"] = @benchmarkable AMDGPU.synchronize(; blocking=true)
    stream_group["nonblocking"] = @benchmarkable AMDGPU.synchronize(; blocking=false)

    context_group = addgroup!(sync_group, "context")
    context_group["device"] = @benchmarkable AMDGPU.device_synchronize()
end
155 changes: 155 additions & 0 deletions perf/array.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# Problem sizes: a regular m×n matrix and a "long" m_long×n_long matrix
# (few rows, very many columns).
const m = 512
const n = 1000
const m_long = 3
const n_long = 1_000_000

# All benchmarks in this file are registered under SUITE["array"].
group = addgroup!(SUITE, "array")

# generate some arrays
# `rng` is not defined in this file; it is expected to be provided by the
# script that includes it.
cpu_mat = rand(rng, Float32, m, n)
gpu_mat = ROCArray{Float32}(cpu_mat)
gpu_mat_long = ROCArray{Float32}(rand(rng, Float32, m_long, n_long))
# 1-d, 3-d and 4-d inputs are reshapes of the matrices above
# (40 * 25 == 10 * 10 * 10 == n).
# NOTE(review): presumably these share memory with their parent — confirm.
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_vec_long = reshape(gpu_mat_long, length(gpu_mat_long))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
# Integer inputs drawn from -10:10, in the regular and the long shape.
gpu_mat_ints = ROCArray(rand(rng, -10:10, m, n))
gpu_mat_long_ints = ROCArray(rand(rng, -10:10, m_long, n_long))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
# Boolean inputs (used for logical indexing and find* benchmarks).
gpu_mat_bools = ROCArray(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

# Allocation of a one-element device array, one evaluation per sample.
group["construct"] = @benchmarkable ROCArray{Int}(undef, 1) evals=1

# Out-of-place copy of the regular matrix.
group["copy"] = @async_benchmarkable copy($gpu_mat)

# Second device matrix, used as the destination of the gpu_to_gpu transfer.
gpu_mat2 = copy(gpu_mat)
let copyto_group = addgroup!(group, "copyto!")
    copyto_group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
    copyto_group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
    copyto_group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
end

# Indexing and search benchmarks, registered under "iteration".
let iter_group = addgroup!(group, "iteration")
    # Scalar indexing of the first ten elements, explicitly allowed.
    iter_group["scalar"] = @benchmarkable AMDGPU.allowscalar() do
        [$gpu_vec[i] for i in 1:10]
    end

    # Indexing with a boolean mask.
    iter_group["logical"] = @async_benchmarkable $gpu_vec[$gpu_vec_bools]

    findall_group = addgroup!(iter_group, "findall")
    findall_group["bool"] = @async_benchmarkable findall($gpu_vec_bools)
    findall_group["int"] = @async_benchmarkable findall(isodd, $gpu_vec_ints)

    findfirst_group = addgroup!(iter_group, "findfirst")
    findfirst_group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
    findfirst_group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)

    findmin_group = addgroup!(iter_group, "findmin") # findmax
    findmin_group["1d"] = @async_benchmarkable findmin($gpu_vec)
    findmin_group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
end

# reverse / reverse! on the regular and long ("L") inputs.
let reverse_group = addgroup!(group, "reverse")
    # out-of-place
    reverse_group["1d"] = @async_benchmarkable reverse($gpu_vec)
    reverse_group["1dL"] = @async_benchmarkable reverse($gpu_vec_long)
    reverse_group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
    reverse_group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims=1)

    # in-place
    reverse_group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
    reverse_group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long)
    reverse_group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
    reverse_group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)
end

# In-place broadcast assignment of a constant.
group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

# Only the out-of-place accumulate is benchmarked: per the original note it
# performs the same operation as the in-place version, plus an allocation.
let accumulate_group = addgroup!(group, "accumulate")
    f32_group = addgroup!(accumulate_group, "Float32")
    f32_group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
    f32_group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
    f32_group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)
    f32_group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
    f32_group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)

    i64_group = addgroup!(accumulate_group, "Int64")
    i64_group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints)
    i64_group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
    i64_group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)
    i64_group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
    i64_group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
end

# reduce and mapreduce over Float32 and integer inputs, in full (1d) and
# per-dimension form, on the regular and long ("L") matrices.
let reductions_group = addgroup!(group, "reductions")
    reduce_group = addgroup!(reductions_group, "reduce")

    reduce_f32 = addgroup!(reduce_group, "Float32")
    reduce_f32["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
    reduce_f32["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
    reduce_f32["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
    reduce_f32["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
    reduce_f32["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)

    reduce_i64 = addgroup!(reduce_group, "Int64")
    reduce_i64["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints)
    reduce_i64["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
    reduce_i64["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
    reduce_i64["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
    reduce_i64["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)

    mapreduce_group = addgroup!(reductions_group, "mapreduce")

    mapreduce_f32 = addgroup!(mapreduce_group, "Float32")
    mapreduce_f32["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
    mapreduce_f32["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
    mapreduce_f32["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
    mapreduce_f32["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
    mapreduce_f32["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)

    mapreduce_i64 = addgroup!(mapreduce_group, "Int64")
    mapreduce_i64["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
    mapreduce_i64["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
    mapreduce_i64["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
    mapreduce_i64["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
    mapreduce_i64["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)

    # used by sum, prod, minimum, maximum, all, any, count
end

# Random number generation: allocating (rand / randn) and in-place
# (rand! / randn!) variants on m*n elements.
let random_group = addgroup!(group, "random")
    rand_group = addgroup!(random_group, "rand")
    rand_group["Float32"] = @async_benchmarkable AMDGPU.rand(Float32, $m*$n)
    rand_group["Int64"] = @async_benchmarkable AMDGPU.rand(Int64, $m*$n)

    rand_inplace_group = addgroup!(random_group, "rand!")
    rand_inplace_group["Float32"] = @async_benchmarkable rand!($gpu_vec)
    rand_inplace_group["Int64"] = @async_benchmarkable rand!($gpu_vec_ints)

    randn_group = addgroup!(random_group, "randn")
    randn_group["Float32"] = @async_benchmarkable AMDGPU.randn(Float32, $m*$n)

    randn_inplace_group = addgroup!(random_group, "randn!")
    randn_inplace_group["Float32"] = @async_benchmarkable randn!($gpu_vec)
end

# Sorting: only the plain 1-d sort is benchmarked.
let sorting_group = addgroup!(group, "sorting")
    sorting_group["1d"] = @async_benchmarkable sort($gpu_vec)
    # Disabled: there is no GPU-side support for dims= or by=. Both fall back to
    # Base's CPU sort path, which triggers scalar indexing.
    # sorting_group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
    # sorting_group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
end

# permutedims on 2-d, 3-d and 4-d inputs.
let permutedims_group = addgroup!(group, "permutedims")
    permutedims_group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
    permutedims_group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
    permutedims_group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
end
end
72 changes: 72 additions & 0 deletions perf/byval.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
module ByVal

using AMDGPU, BenchmarkTools, Random

# Workgroup size used by every launch in this module.
const threads = 256

"""
    kernel_add_mat(n, x1, x2, y)

Device kernel: store `x1[i] + x2[i]` into `y[i]` for every linear index
`i <= n`, one element per work-item.
"""
function kernel_add_mat(n, x1, x2, y)
    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    i > n && return
    @inbounds y[i] = x1[i] + x2[i]
    return
end

# Pick the `(x1, x2, y)` triple belonging to slice `indx_y` out of a flat
# argument list laid out as all x1 slices, then all x2 slices, then all y
# slices. One method per supported slice count (1, 2 or 3).
@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
@inline function get_inputs3(indx_y, a1, a2, b1, b2, c1, c2)
    return indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
end
@inline function get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3)
    indx_y == 1 && return (a1, b1, c1)
    indx_y == 2 && return (a2, b2, c2)
    return (a3, b3, c3)
end

"""
    kernel_add_mat_z_slices(n, vararg...)

Device kernel: like `kernel_add_mat`, but the operands are selected from
`vararg` by the workgroup's y index (see `get_inputs3`).
"""
function kernel_add_mat_z_slices(n, vararg...)
    x1, x2, y = get_inputs3(workgroupIdx().y, vararg...)
    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    i > n && return
    @inbounds y[i] = x1[i] + x2[i]
    return
end

"""
    add_z_slices!(y, x1, x2)

Launch `kernel_add_mat_z_slices` over collections of matrices, passing every
matrix as a separate kernel argument. The grid has one y entry per element of
`x1`; the element count is taken from the first matrix of `x1`.
"""
function add_z_slices!(y, x1, x2)
    rows, cols = size(x1[1])
    len = rows * cols
    nblocks = div(len + threads - 1, threads)
    @roc groupsize=threads gridsize=(nblocks, length(x1)) kernel_add_mat_z_slices(len, x1..., x2..., y...)
end

"""
    add!(y, x1, x2)

Launch `kernel_add_mat` to compute `y = x1 + x2` for a single pair of matrices.
"""
function add!(y, x1, x2)
    rows, cols = size(x1)
    len = rows * cols
    nblocks = div(len + threads - 1, threads)
    @roc groupsize=threads gridsize=(nblocks, 1) kernel_add_mat(len, x1, x2, y)
end

"""
    main()

Run the reference single-matrix add and the 1..3-slice variants, and return
the measurements as a `BenchmarkGroup`.
"""
function main()
    results = BenchmarkGroup()

    num_z_slices = 3
    Random.seed!(1)

    m, n = 3072, 1536 # 256 multiplier

    # One random device matrix, offset by 0.5.
    random_slice() = ROCArray(randn(Float32, (m, n)) .+ 0.5f0)

    x1 = [random_slice() for _ in 1:num_z_slices]
    x2 = [random_slice() for _ in 1:num_z_slices]
    y1 = [similar(x1[1]) for _ in 1:num_z_slices]

    # reference down-to-bones add on GPU
    results["reference"] = @benchmark AMDGPU.@sync add!($y1[1], $x1[1], $x2[1])

    # adding arrays in an array
    for slices in 1:num_z_slices
        results["slices=$slices"] = @benchmark AMDGPU.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
    end

    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
    for buffers in (x1, x2, y1), buffer in buffers
        AMDGPU.unsafe_free!(buffer)
    end

    return results
end

end

ByVal.main()
27 changes: 27 additions & 0 deletions perf/kernel.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Kernel-level benchmarks, registered under SUITE["kernel"].
group = addgroup!(SUITE, "kernel")

# Launch of a kernel that does no work.
group["launch"] = @benchmarkable @roc identity(nothing)

# 512×1000 source/destination pair; launches below use one workgroup per
# column (gridsize = size(_, 2)) with one work-item per row (groupsize = size(_, 1)).
src = AMDGPU.rand(Float32, 512, 1000)
dest = similar(src)

"""
    indexing_kernel(dest, src)

Device kernel: copy one element of `src` to `dest` per work-item, with bounds
checks disabled.
"""
function indexing_kernel(dest, src)
    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    @inbounds dest[i] = src[i]
    return
end
# `src` is a non-const global: interpolate it in the launch configuration too,
# so the timed expression does not include an untyped global lookup.
group["indexing"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $indexing_kernel($dest, $src)

"""
    checked_indexing_kernel(dest, src)

Device kernel: same copy as `indexing_kernel`, but without `@inbounds`.
"""
function checked_indexing_kernel(dest, src)
    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    dest[i] = src[i]
    return
end
group["indexing_checked"] = @async_benchmarkable @roc groupsize=size($src,1) gridsize=size($src,2) $checked_indexing_kernel($dest, $src)

"""
    rand_kernel(dest::AbstractArray{T}) where {T}

Device kernel: write one `rand(T)` value into `dest` per work-item.
"""
function rand_kernel(dest::AbstractArray{T}) where {T}
    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    dest[i] = rand(T)
    return
end
group["rand"] = @async_benchmarkable @roc groupsize=size($dest,1) gridsize=size($dest,2) $rand_kernel($dest)
Loading
Loading