11# Benchmark `ArrayDiff` GPU gradient computation against the hand-written
2- # CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`,
3- # but in `Float64` so it matches `ArrayDiff`'s tape eltype).
2+ # CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`).
3+ #
4+ # Float32 throughout — this is the precision that maps onto the tensor cores
5+ # / SP pipeline a GPU is actually optimized for, and matches what a typical
6+ # ML workload (PyTorch default) feeds the device.
47#
58# What this measures
69# ------------------
710# A 2-layer MLP `loss = sum((W2 * tanh.(W1 * X) - y).^2) / n`, where `W1` is
811# the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1)
912# and `W2`, `X`, `y` are constants.
1013#
11- # * Hand-CUDA Float64: `reverse_diff(W1, W2, X, y)` — the same hand-written
12- # formula as the existing perf script, just promoted to `Float64` so the
13- # numbers are directly comparable to ArrayDiff.
14+ # * Hand-CUDA Float32 (allocating): `reverse_diff(W1, W2, X, y)` — same
15+ # hand-written formula as `perf/cuda_vs_pytorch.jl`, allocates fresh
16+ # intermediates per call. This is the "naive" hand-tuned baseline.
17+ #
18+ # * Hand-CUDA Float32 (preallocated): `reverse_diff_prealloc!(buf, ...)` —
19+ # same arithmetic but every intermediate lives in a `HandPrealloc` buffer
20+ # that is reused across calls. This is the apples-to-apples comparison
21+ # against ArrayDiff, which preallocates its tape once at
22+ # `MOI.initialize` and never allocates again.
1423#
15- # * ArrayDiff CPU: same model with `Mode{Vector{Float64 }}()` — sanity check
24+ # * ArrayDiff CPU: same model with `Mode{Vector{Float32 }}()` — sanity check
1625# for what the existing CPU AD path costs at this problem size.
1726#
18- # * ArrayDiff GPU: same model with `Mode{CuVector{Float64 }}()` — the path
27+ # * ArrayDiff GPU: same model with `Mode{CuVector{Float32 }}()` — the path
1928# this benchmark exists to measure.
2029#
21- # Caveats (read this before reading the numbers)
22- # ----------------------------------------------
23- # The current `ArrayDiff` GPU path still does scalar reads/writes on the
24- # tape for every `NODE_VARIABLE` (one per `W1[i,j]`) and every `NODE_VALUE`
25- # (one per scalar inside the `vcat(row(...), row(...))` expansion of
26- # constant matrices `W2`, `X`, `y`). On a `CuArray` each scalar load/store
27- # is a separate `cudaMemcpy`, so for `h = 4096` and `d = 13` that's tens of
28- # thousands of round-trips per call. We expect the GPU path to be much
29- # slower than the hand-written CUDA reference until those leaves are
30- # loaded/extracted in bulk — see the `NODE_VARIABLE_BLOCK` /
31- # `NODE_VALUE_BLOCK` follow-up.
32- #
33- # The whole `optimize!` loop is wrapped in `CUDA.@allowscalar` so the
34- # scalar paths don't error out under GPUArrays' default scalar policy.
35- #
3630# Run
3731# ---
3832# cd ~/.julia/dev/ArrayDiff
@@ -47,13 +41,17 @@ using ArrayDiff
4741import MathOptInterface as MOI
4842
4943# -------------------------------------------------------------------------
50- # Hand-written CUDA Float64 reverse_diff — same formulas as
51- # `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but in `Float64`
52- # and *without* the `/ n` scaling. ArrayDiff's `:/` operator currently
53- # has a latent bug with non-scalar tape offsets (it reads
54- # `forward_storage[node_idx]` instead of `forward_storage[storage_offset[node_idx]+1]`),
55- # so we drop the constant scaling factor on both sides — it doesn't change
56- # what the gradient pathway exercises.
44+ # Hand-written CUDA reverse_diff — same formulas as
45+ # `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but *without* the
46+ # `/ n` scaling. ArrayDiff's `:/` operator currently has a latent bug with
47+ # non-scalar tape offsets (it reads `forward_storage[node_idx]` instead of
48+ # `forward_storage[storage_offset[node_idx]+1]`), so we drop the constant
49+ # scaling factor on both sides — it doesn't change what the gradient
50+ # pathway exercises.
51+ #
52+ # These functions are eltype-generic (they're just dotted broadcasts and
53+ # `*`), so the same code runs at `Float32` and `Float64` depending on the
54+ # eltype of the `CuArray` inputs.
5755# -------------------------------------------------------------------------
5856function forward_pass (W1, W2, X, y)
5957 y_1 = tanh .(W1 * X)
@@ -67,24 +65,80 @@ function reverse_diff(W1, W2, X, y)
6765 return (J_1 .* (W2' * J_2)) * X'
6866end
6967
68+ # -------------------------------------------------------------------------
69+ # Hand-written CUDA reverse_diff — preallocated variant.
70+ #
71+ # `reverse_diff` above allocates ~6 fresh `CuArray`s per call (one for
72+ # `tanh.(...)`, one for `1 .- y_1.^2`, one for `2 .* (...)`, plus three for
73+ # the chained `*` / `.*` / `*` in the reverse step). At h = 4096 each is a
74+ # multi-MB matrix, and the allocator overhead dominates the actual GEMM
75+ # compute. To be a fair "hand-tuned" baseline against ArrayDiff (which
76+ # allocates its tape exactly once at `MOI.initialize` and reuses it), we
77+ # pre-allocate all intermediates in `HandPrealloc` and run forward + reverse
78+ # with `mul!` + in-place broadcasts.
79+ #
80+ # Buffer reuse: `y_1` doubles as the `W1 * X` output, then is overwritten
81+ # in place by `tanh`. `J_2` doubles as the `W2 * y_1` output and is then
82+ # overwritten by `2 .* (J_2 - y)`. `W2T_J2` is scaled in place by `J_1`.
83+ # Five device buffers cover everything.
84+ # -------------------------------------------------------------------------
85+ struct HandPrealloc{T<: Real }
86+ y_1:: CuArray{T,2} # h × n (= tanh.(W1 * X))
87+ J_1:: CuArray{T,2} # h × n (= 1 .- y_1.^2)
88+ J_2:: CuArray{T,2} # out × n (= 2 .* (W2*y_1 - y))
89+ W2T_J2:: CuArray{T,2} # h × n (= W2' * J_2, then ⊙= J_1)
90+ grad:: CuArray{T,2} # h × d (= W2T_J2 * X')
91+ end
92+
93+ function HandPrealloc {T} (h:: Int , d:: Int , n:: Int , out_dim:: Int ) where {T}
94+ return HandPrealloc {T} (
95+ CuArray {T} (undef, h, n),
96+ CuArray {T} (undef, h, n),
97+ CuArray {T} (undef, out_dim, n),
98+ CuArray {T} (undef, h, n),
99+ CuArray {T} (undef, h, d),
100+ )
101+ end
102+
103+ function reverse_diff_prealloc! (buf:: HandPrealloc , W1, W2, X, y)
104+ # Forward
105+ LinearAlgebra. mul! (buf. y_1, W1, X) # y_1 = W1 * X
106+ buf. y_1 .= tanh .(buf. y_1) # y_1 = tanh.(y_1)
107+ buf. J_1 .= 1 .- buf. y_1 .^ 2 # J_1 = 1 .- y_1.^2
108+ LinearAlgebra. mul! (buf. J_2, W2, buf. y_1) # J_2 = W2 * y_1
109+ buf. J_2 .= 2 .* (buf. J_2 .- y) # J_2 = 2 .* (W2*y_1 - y)
110+ # Reverse — (J_1 .* (W2' * J_2)) * X'
111+ LinearAlgebra. mul! (buf. W2T_J2, W2' , buf. J_2) # W2T_J2 = W2' * J_2
112+ buf. W2T_J2 .= buf. J_1 .* buf. W2T_J2 # W2T_J2 ⊙= J_1
113+ LinearAlgebra. mul! (buf. grad, buf. W2T_J2, X' ) # grad = W2T_J2 * X'
114+ return buf. grad
115+ end
116+
70117# -------------------------------------------------------------------------
71118# ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and
72119# `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and
73120# returns an evaluator + a closure that computes ∂loss/∂W1 in `g`.
74121# -------------------------------------------------------------------------
75122function build_arraydiff (
76- W2:: Matrix{Float64 } ,
77- X:: Matrix{Float64 } ,
78- y:: Matrix{Float64 } ,
123+ W2:: Matrix{T } ,
124+ X:: Matrix{T } ,
125+ y:: Matrix{T } ,
79126 h:: Int ,
80127 d:: Int ,
81128 n:: Int ,
82129 mode:: ArrayDiff.Mode ,
83- )
130+ ) where {T<: Real }
131+ # JuMP works in Float64 internally; ArrayDiff converts the parsed
132+ # `Expression{Float64}` constants into the `Mode`'s eltype (here Float32)
133+ # when filling its tape. We promote constants to Float64 for the JuMP
134+ # build to keep that conversion explicit / one-way.
135+ W2_64 = Float64 .(W2)
136+ X_64 = Float64 .(X)
137+ y_64 = Float64 .(y)
84138 model = JuMP. Model ()
85139 @variable (model, W1[1 : h, 1 : d], container = ArrayDiff. ArrayOfVariables)
86- Y = W2 * tanh .(W1 * X )
87- diff = Y .- y
140+ Y = W2_64 * tanh .(W1 * X_64 )
141+ diff = Y .- y_64
88142 loss = sum (diff .^ 2 ) # `/ n` dropped — see `forward_pass` comment.
89143 ad = ArrayDiff. model (mode)
90144 MOI. Nonlinear. set_objective (ad, JuMP. moi_function (loss))
@@ -97,16 +151,15 @@ function build_arraydiff(
97151 return evaluator
98152end
99153
100- # CPU path — `Mode()` defaults to `Vector{Float64}` storage, no `@allowscalar`
101- # needed.
102154function arraydiff_grad_cpu! (g, evaluator, x)
103155 MOI. eval_objective_gradient (evaluator, g, x)
104156 return g
105157end
106158
107- # GPU path — `Mode{CuVector{Float64}}()`. The current implementation still
108- # uses scalar tape access for `NODE_VARIABLE`/`NODE_VALUE` leaves, so the
109- # call needs `@allowscalar` to get past GPUArrays' scalar-indexing policy.
159+ # `@allowscalar` covers the residual scalar leaves that the BLOCK rewrite
160+ # can't fold (e.g. a stand-alone `NODE_VALUE` inside a non-block subterm).
161+ # The hot path is bulk; this is a guard so we don't trip GPUArrays' policy
162+ # on the leftovers.
110163function arraydiff_grad_gpu! (g, evaluator, x)
111164 CUDA. @allowscalar MOI. eval_objective_gradient (evaluator, g, x)
112165 return g
@@ -115,39 +168,48 @@ end
115168# -------------------------------------------------------------------------
116169# One (h, d, n) sweep.
117170# -------------------------------------------------------------------------
118- function run_one (; h:: Int , d:: Int = 13 , n:: Int = 178 , rtol:: Float64 = 1e-6 )
171+ function run_one (; h:: Int , d:: Int = 13 , n:: Int = 178 , rtol:: Float32 = 1f-3 )
119172 println (" \n " * " =" ^ 72 )
120- @printf " h = %d, d = %d, n = %d\n " h d n
173+ @printf " h = %d, d = %d, n = %d (Float32) \n " h d n
121174 println (" =" ^ 72 )
122175
176+ T = Float32
123177 Random. seed! (0 )
124178 # `W2` and `y` are deliberately given 2 output rows (instead of the
125179 # 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat`
126180 # forward unpacks `idx1, idx2 = children_indices` and so requires at
127181 # least two rows; a single-row matrix would be parsed as a `vcat`
128182 # with one child and would error. Using 2 rows changes nothing about
129183 # the gradient pathway being measured.
130- W1 = randn (Float64, h, d)
131- W2 = randn (Float64, 2 , h)
132- X = randn (Float64, d, n)
133- y = randn (Float64, 2 , n)
184+ out_dim = 2
185+ W1 = randn (T, h, d)
186+ W2 = randn (T, out_dim, h)
187+ X = randn (T, d, n)
188+ y = randn (T, out_dim, n)
134189
135- # Reference: hand-written CUDA in Float64 .
190+ # Reference: hand-written CUDA (allocating) .
136191 W1g = CuArray (W1)
137192 W2g = CuArray (W2)
138193 Xg = CuArray (X)
139194 yg = CuArray (y)
140195 grad_ref = Array (reverse_diff (W1g, W2g, Xg, yg))
141196 CUDA. synchronize ()
142197
198+ # Hand-CUDA preallocated. Same arithmetic, but every intermediate is
199+ # owned by `hand_buf` and reused across calls.
200+ hand_buf = HandPrealloc {T} (h, d, n, out_dim)
201+ grad_prealloc =
202+ Array (reverse_diff_prealloc! (hand_buf, W1g, W2g, Xg, yg))
203+ CUDA. synchronize ()
204+
143205 # ArrayDiff CPU.
144206 print (" ArrayDiff CPU build (h=$h ) ... " );
145207 flush (stdout )
146- t_cpu_build =
147- @elapsed ev_cpu = build_arraydiff (W2, X, y, h, d, n, ArrayDiff. Mode ())
208+ t_cpu_build = @elapsed ev_cpu =
209+ build_arraydiff (W2, X, y, h, d, n, ArrayDiff. Mode {Vector{T}} ())
148210 @printf " %.2f s\n " t_cpu_build
149211 x_cpu = vec (W1)
150- g_cpu = zeros (Float64 , length (x_cpu))
212+ g_cpu = zeros (T , length (x_cpu))
151213 arraydiff_grad_cpu! (g_cpu, ev_cpu, x_cpu)
152214 grad_cpu = reshape (g_cpu, h, d)
153215
@@ -164,28 +226,37 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
164226 h,
165227 d,
166228 n,
167- ArrayDiff. Mode {CUDA.CuVector{Float64 }} (),
229+ ArrayDiff. Mode {CUDA.CuVector{T }} (),
168230 )
169231 @printf " %.2f s\n " t_gpu_build
170- x_gpu = CUDA. CuVector {Float64 } (vec (W1))
171- g_gpu = CUDA. zeros (Float64 , length (x_gpu))
232+ x_gpu = CUDA. CuVector {T } (vec (W1))
233+ g_gpu = CUDA. zeros (T , length (x_gpu))
172234 arraydiff_grad_gpu! (g_gpu, ev_gpu, x_gpu)
173235 CUDA. synchronize ()
174236 grad_gpu = reshape (Array (g_gpu), h, d)
175237
176- # Numerical equivalence.
177- for (name, g) in [(" ArrayDiff CPU" , grad_cpu), (" ArrayDiff GPU" , grad_gpu)]
238+ # Numerical equivalence (rtol loosened for Float32 reductions).
239+ candidates = [
240+ (" Hand-CUDA prealloc" , grad_prealloc),
241+ (" ArrayDiff CPU" , grad_cpu),
242+ (" ArrayDiff GPU" , grad_gpu),
243+ ]
244+ for (name, g) in candidates
178245 maxdiff = maximum (abs .(grad_ref .- g))
179- relmag = maxdiff / max (maximum (abs .(grad_ref)), eps (Float64 ))
246+ relmag = maxdiff / max (maximum (abs .(grad_ref)), eps (T ))
180247 ok = isapprox (grad_ref, g; rtol = rtol)
181- @printf " %-15s vs hand-CUDA: max|Δ| = %.3e (rel %.2e) match=%s\n " name maxdiff relmag ok
248+ @printf " %-20s vs hand-CUDA alloc : max|Δ| = %.3e (rel %.2e) match=%s\n " name maxdiff relmag ok
182249 end
183250
184251 println (" \n --- benchmark (median of N samples, post-sync) ---" )
185252 bj = @benchmark begin
186253 reverse_diff ($ W1g, $ W2g, $ Xg, $ yg)
187254 CUDA. synchronize ()
188255 end samples = 30 evals = 1 seconds = 10
256+ bjp = @benchmark begin
257+ reverse_diff_prealloc! ($ hand_buf, $ W1g, $ W2g, $ Xg, $ yg)
258+ CUDA. synchronize ()
259+ end samples = 30 evals = 1 seconds = 10
189260 bcpu = @benchmark begin
190261 arraydiff_grad_cpu! ($ g_cpu, $ ev_cpu, $ x_cpu)
191262 end samples = 30 evals = 1 seconds = 10
@@ -196,9 +267,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
196267 arraydiff_grad_gpu! ($ g_gpu, $ ev_gpu, $ x_gpu)
197268 CUDA. synchronize ()
198269 end samples = 30 evals = 1 seconds = 60
199- @printf " Hand-CUDA Float64 : median %12.2f µs\n " 1e-3 * median (bj). time
200- @printf " ArrayDiff CPU : median %12.2f µs\n " 1e-3 * median (bcpu). time
201- @printf " ArrayDiff GPU : median %12.2f µs\n " 1e-3 * median (bgpu). time
270+ @printf " Hand-CUDA alloc : median %12.2f µs\n " 1e-3 * median (bj). time
271+ @printf " Hand-CUDA prealloc : median %12.2f µs\n " 1e-3 * median (bjp). time
272+ @printf " ArrayDiff CPU : median %12.2f µs\n " 1e-3 * median (bcpu). time
273+ @printf " ArrayDiff GPU : median %12.2f µs\n " 1e-3 * median (bgpu). time
202274
203275 return nothing
204276end
0 commit comments