|
| 1 | +# Benchmark `ArrayDiff` GPU gradient computation against the hand-written |
| 2 | +# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`, |
| 3 | +# but in `Float64` so it matches `ArrayDiff`'s tape eltype). |
| 4 | +# |
| 5 | +# What this measures |
| 6 | +# ------------------ |
| 7 | +# A 2-layer MLP `loss = sum((W2 * tanh.(W1 * X) - y).^2) / n`, where `W1` is |
| 8 | +# the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1) |
| 9 | +# and `W2`, `X`, `y` are constants. |
| 10 | +# |
| 11 | +# * Hand-CUDA Float64: `reverse_diff(W1, W2, X, y)` — the same hand-written |
| 12 | +# formula as the existing perf script, just promoted to `Float64` so the |
| 13 | +# numbers are directly comparable to ArrayDiff. |
| 14 | +# |
| 15 | +# * ArrayDiff CPU: same model with `Mode{Vector{Float64}}()` — sanity check |
| 16 | +# for what the existing CPU AD path costs at this problem size. |
| 17 | +# |
| 18 | +# * ArrayDiff GPU: same model with `Mode{CuVector{Float64}}()` — the path |
| 19 | +# this benchmark exists to measure. |
| 20 | +# |
| 21 | +# Caveats (read this before reading the numbers) |
| 22 | +# ---------------------------------------------- |
| 23 | +# The current `ArrayDiff` GPU path still does scalar reads/writes on the |
| 24 | +# tape for every `NODE_VARIABLE` (one per `W1[i,j]`) and every `NODE_VALUE` |
| 25 | +# (one per scalar inside the `vcat(row(...), row(...))` expansion of |
| 26 | +# constant matrices `W2`, `X`, `y`). On a `CuArray` each scalar load/store |
| 27 | +# is a separate `cudaMemcpy`, so for `h = 4096` and `d = 13` that's tens of |
| 28 | +# thousands of round-trips per call. We expect the GPU path to be much |
| 29 | +# slower than the hand-written CUDA reference until those leaves are |
| 30 | +# loaded/extracted in bulk — see the `NODE_VARIABLE_BLOCK` / |
| 31 | +# `NODE_VALUE_BLOCK` follow-up. |
| 32 | +# |
| 33 | +# The whole `optimize!` loop is wrapped in `CUDA.@allowscalar` so the |
| 34 | +# scalar paths don't error out under GPUArrays' default scalar policy. |
| 35 | +# |
| 36 | +# Run |
| 37 | +# --- |
| 38 | +# cd ~/.julia/dev/ArrayDiff |
| 39 | +# julia --project=perf -e 'using Pkg; Pkg.instantiate()' |
| 40 | +# julia --project=perf perf/arraydiff_gpu.jl |
| 41 | + |
| 42 | +using Random, LinearAlgebra, Printf |
| 43 | +using BenchmarkTools |
| 44 | +using CUDA |
| 45 | +using JuMP |
| 46 | +using ArrayDiff |
| 47 | +import MathOptInterface as MOI |
| 48 | + |
| 49 | +# ------------------------------------------------------------------------- |
| 50 | +# Hand-written CUDA Float64 reverse_diff — same formulas as |
| 51 | +# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but in `Float64` |
| 52 | +# and *without* the `/ n` scaling. ArrayDiff's `:/` operator currently |
| 53 | +# has a latent bug with non-scalar tape offsets (it reads |
| 54 | +# `forward_storage[node_idx]` instead of `forward_storage[storage_offset[node_idx]+1]`), |
| 55 | +# so we drop the constant scaling factor on both sides — it doesn't change |
| 56 | +# what the gradient pathway exercises. |
| 57 | +# ------------------------------------------------------------------------- |
| 58 | +function forward_pass(W1, W2, X, y) |
| 59 | + y_1 = tanh.(W1 * X) |
| 60 | + J_1 = 1 .- y_1 .^ 2 |
| 61 | + J_2 = 2 .* (W2 * y_1 .- y) |
| 62 | + return y_1, J_1, J_2 |
| 63 | +end |
| 64 | + |
| 65 | +function reverse_diff(W1, W2, X, y) |
| 66 | + _, J_1, J_2 = forward_pass(W1, W2, X, y) |
| 67 | + return (J_1 .* (W2' * J_2)) * X' |
| 68 | +end |
| 69 | + |
| 70 | +# ------------------------------------------------------------------------- |
| 71 | +# ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and |
| 72 | +# `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and |
| 73 | +# returns an evaluator + a closure that computes ∂loss/∂W1 in `g`. |
| 74 | +# ------------------------------------------------------------------------- |
| 75 | +function build_arraydiff( |
| 76 | + W2::Matrix{Float64}, |
| 77 | + X::Matrix{Float64}, |
| 78 | + y::Matrix{Float64}, |
| 79 | + h::Int, |
| 80 | + d::Int, |
| 81 | + n::Int, |
| 82 | + mode::ArrayDiff.Mode, |
| 83 | +) |
| 84 | + model = JuMP.Model() |
| 85 | + @variable(model, W1[1:h, 1:d], container = ArrayDiff.ArrayOfVariables) |
| 86 | + Y = W2 * tanh.(W1 * X) |
| 87 | + diff = Y .- y |
| 88 | + loss = sum(diff .^ 2) # `/ n` dropped — see `forward_pass` comment. |
| 89 | + ad = ArrayDiff.model(mode) |
| 90 | + MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss)) |
| 91 | + evaluator = MOI.Nonlinear.Evaluator( |
| 92 | + ad, |
| 93 | + mode, |
| 94 | + JuMP.index.(JuMP.all_variables(model)), |
| 95 | + ) |
| 96 | + MOI.initialize(evaluator, [:Grad]) |
| 97 | + return evaluator |
| 98 | +end |
| 99 | + |
| 100 | +# CPU path — `Mode()` defaults to `Vector{Float64}` storage, no `@allowscalar` |
| 101 | +# needed. |
| 102 | +function arraydiff_grad_cpu!(g, evaluator, x) |
| 103 | + MOI.eval_objective_gradient(evaluator, g, x) |
| 104 | + return g |
| 105 | +end |
| 106 | + |
| 107 | +# GPU path — `Mode{CuVector{Float64}}()`. The current implementation still |
| 108 | +# uses scalar tape access for `NODE_VARIABLE`/`NODE_VALUE` leaves, so the |
| 109 | +# call needs `@allowscalar` to get past GPUArrays' scalar-indexing policy. |
| 110 | +function arraydiff_grad_gpu!(g, evaluator, x) |
| 111 | + CUDA.@allowscalar MOI.eval_objective_gradient(evaluator, g, x) |
| 112 | + return g |
| 113 | +end |
| 114 | + |
| 115 | +# ------------------------------------------------------------------------- |
| 116 | +# One (h, d, n) sweep. |
| 117 | +# ------------------------------------------------------------------------- |
| 118 | +function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) |
| 119 | + println("\n" * "="^72) |
| 120 | + @printf "h = %d, d = %d, n = %d\n" h d n |
| 121 | + println("="^72) |
| 122 | + |
| 123 | + Random.seed!(0) |
| 124 | + # `W2` and `y` are deliberately given 2 output rows (instead of the |
| 125 | + # 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat` |
| 126 | + # forward unpacks `idx1, idx2 = children_indices` and so requires at |
| 127 | + # least two rows; a single-row matrix would be parsed as a `vcat` |
| 128 | + # with one child and would error. Using 2 rows changes nothing about |
| 129 | + # the gradient pathway being measured. |
| 130 | + W1 = randn(Float64, h, d) |
| 131 | + W2 = randn(Float64, 2, h) |
| 132 | + X = randn(Float64, d, n) |
| 133 | + y = randn(Float64, 2, n) |
| 134 | + |
| 135 | + # Reference: hand-written CUDA in Float64. |
| 136 | + W1g = CuArray(W1) |
| 137 | + W2g = CuArray(W2) |
| 138 | + Xg = CuArray(X) |
| 139 | + yg = CuArray(y) |
| 140 | + grad_ref = Array(reverse_diff(W1g, W2g, Xg, yg)) |
| 141 | + CUDA.synchronize() |
| 142 | + |
| 143 | + # ArrayDiff CPU. |
| 144 | + print("ArrayDiff CPU build (h=$h) ... "); flush(stdout) |
| 145 | + t_cpu_build = @elapsed ev_cpu = |
| 146 | + build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode()) |
| 147 | + @printf "%.2f s\n" t_cpu_build |
| 148 | + x_cpu = vec(W1) |
| 149 | + g_cpu = zeros(Float64, length(x_cpu)) |
| 150 | + arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu) |
| 151 | + grad_cpu = reshape(g_cpu, h, d) |
| 152 | + |
| 153 | + # ArrayDiff GPU. |
| 154 | + print("ArrayDiff GPU build (h=$h) ... "); flush(stdout) |
| 155 | + t_gpu_build = @elapsed ev_gpu = build_arraydiff( |
| 156 | + W2, |
| 157 | + X, |
| 158 | + y, |
| 159 | + h, |
| 160 | + d, |
| 161 | + n, |
| 162 | + ArrayDiff.Mode{CUDA.CuVector{Float64}}(), |
| 163 | + ) |
| 164 | + @printf "%.2f s\n" t_gpu_build |
| 165 | + x_gpu = vec(W1) |
| 166 | + g_gpu = zeros(Float64, length(x_gpu)) |
| 167 | + arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu) |
| 168 | + CUDA.synchronize() |
| 169 | + grad_gpu = reshape(g_gpu, h, d) |
| 170 | + |
| 171 | + # Numerical equivalence. |
| 172 | + for (name, g) in [ |
| 173 | + ("ArrayDiff CPU", grad_cpu), |
| 174 | + ("ArrayDiff GPU", grad_gpu), |
| 175 | + ] |
| 176 | + maxdiff = maximum(abs.(grad_ref .- g)) |
| 177 | + relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64)) |
| 178 | + ok = isapprox(grad_ref, g; rtol = rtol) |
| 179 | + @printf "%-15s vs hand-CUDA: max|Δ| = %.3e (rel %.2e) match=%s\n" name maxdiff relmag ok |
| 180 | + end |
| 181 | + |
| 182 | + println("\n--- benchmark (median of N samples, post-sync) ---") |
| 183 | + bj = @benchmark begin |
| 184 | + reverse_diff($W1g, $W2g, $Xg, $yg) |
| 185 | + CUDA.synchronize() |
| 186 | + end samples = 30 evals = 1 seconds = 10 |
| 187 | + bcpu = @benchmark begin |
| 188 | + arraydiff_grad_cpu!($g_cpu, $ev_cpu, $x_cpu) |
| 189 | + end samples = 30 evals = 1 seconds = 10 |
| 190 | + # `seconds = 60` for the GPU path because, at h = 4096, the |
| 191 | + # scalar-cudaMemcpy path can run into the 10s cap on a single sample at |
| 192 | + # warmup; bump the budget so we get a reliable median. |
| 193 | + bgpu = @benchmark begin |
| 194 | + arraydiff_grad_gpu!($g_gpu, $ev_gpu, $x_gpu) |
| 195 | + CUDA.synchronize() |
| 196 | + end samples = 30 evals = 1 seconds = 60 |
| 197 | + @printf "Hand-CUDA Float64 : median %12.2f µs\n" 1e-3 * median(bj).time |
| 198 | + @printf "ArrayDiff CPU : median %12.2f µs\n" 1e-3 * median(bcpu).time |
| 199 | + @printf "ArrayDiff GPU : median %12.2f µs\n" 1e-3 * median(bgpu).time |
| 200 | + |
| 201 | + return nothing |
| 202 | +end |
| 203 | + |
| 204 | +# ------------------------------------------------------------------------- |
| 205 | +# Main |
| 206 | +# ------------------------------------------------------------------------- |
| 207 | +function main() |
| 208 | + if !CUDA.functional() |
| 209 | + error("CUDA is not functional in this environment.") |
| 210 | + end |
| 211 | + CUDA.math_mode!(CUDA.FAST_MATH) |
| 212 | + println("CUDA.jl device : ", CUDA.name(CUDA.device()), " (math_mode=FAST_MATH)") |
| 213 | + for h in (16, 256, 4096) |
| 214 | + run_one(; h = h) |
| 215 | + GC.gc(true) |
| 216 | + CUDA.reclaim() |
| 217 | + end |
| 218 | +end |
| 219 | + |
| 220 | +main() |
0 commit comments