Skip to content

Commit 3d5e005

Browse files
committed
Add benchmark vs hardcoded CUDA
1 parent e3fd42f commit 3d5e005

2 files changed

Lines changed: 226 additions & 0 deletions

File tree

perf/Project.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,18 @@ authors = ["Benoît Legat <benoit.legat@gmail.com>"]
44
version = "0.0.0"
55

66
[deps]
7+
ArrayDiff = "c45fa1ca-6901-44ac-ae5b-5513a4852d50"
78
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
89
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
910
CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
11+
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
1012
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1113
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
14+
MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
1215
Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
1316
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
1417
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
1518
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
19+
20+
[sources]
21+
ArrayDiff = {path = ".."}

perf/arraydiff_gpu.jl

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
# Benchmark `ArrayDiff` GPU gradient computation against the hand-written
2+
# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`,
3+
# but in `Float64` so it matches `ArrayDiff`'s tape eltype).
4+
#
5+
# What this measures
6+
# ------------------
7+
# A 2-layer MLP `loss = sum((W2 * tanh.(W1 * X) - y).^2) / n`, where `W1` is
8+
# the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1)
9+
# and `W2`, `X`, `y` are constants.
10+
#
11+
# * Hand-CUDA Float64: `reverse_diff(W1, W2, X, y)` — the same hand-written
12+
# formula as the existing perf script, just promoted to `Float64` so the
13+
# numbers are directly comparable to ArrayDiff.
14+
#
15+
# * ArrayDiff CPU: same model with `Mode{Vector{Float64}}()` — sanity check
16+
# for what the existing CPU AD path costs at this problem size.
17+
#
18+
# * ArrayDiff GPU: same model with `Mode{CuVector{Float64}}()` — the path
19+
# this benchmark exists to measure.
20+
#
21+
# Caveats (read this before reading the numbers)
22+
# ----------------------------------------------
23+
# The current `ArrayDiff` GPU path still does scalar reads/writes on the
24+
# tape for every `NODE_VARIABLE` (one per `W1[i,j]`) and every `NODE_VALUE`
25+
# (one per scalar inside the `vcat(row(...), row(...))` expansion of
26+
# constant matrices `W2`, `X`, `y`). On a `CuArray` each scalar load/store
27+
# is a separate `cudaMemcpy`, so for `h = 4096` and `d = 13` that's tens of
28+
# thousands of round-trips per call. We expect the GPU path to be much
29+
# slower than the hand-written CUDA reference until those leaves are
30+
# loaded/extracted in bulk — see the `NODE_VARIABLE_BLOCK` /
31+
# `NODE_VALUE_BLOCK` follow-up.
32+
#
33+
# The whole `optimize!` loop is wrapped in `CUDA.@allowscalar` so the
34+
# scalar paths don't error out under GPUArrays' default scalar policy.
35+
#
36+
# Run
37+
# ---
38+
# cd ~/.julia/dev/ArrayDiff
39+
# julia --project=perf -e 'using Pkg; Pkg.instantiate()'
40+
# julia --project=perf perf/arraydiff_gpu.jl
41+
42+
using Random, LinearAlgebra, Printf
43+
using BenchmarkTools
44+
using CUDA
45+
using JuMP
46+
using ArrayDiff
47+
import MathOptInterface as MOI
48+
49+
# -------------------------------------------------------------------------
50+
# Hand-written CUDA Float64 reverse_diff — same formulas as
51+
# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but in `Float64`
52+
# and *without* the `/ n` scaling. ArrayDiff's `:/` operator currently
53+
# has a latent bug with non-scalar tape offsets (it reads
54+
# `forward_storage[node_idx]` instead of `forward_storage[storage_offset[node_idx]+1]`),
55+
# so we drop the constant scaling factor on both sides — it doesn't change
56+
# what the gradient pathway exercises.
57+
# -------------------------------------------------------------------------
58+
function forward_pass(W1, W2, X, y)
59+
y_1 = tanh.(W1 * X)
60+
J_1 = 1 .- y_1 .^ 2
61+
J_2 = 2 .* (W2 * y_1 .- y)
62+
return y_1, J_1, J_2
63+
end
64+
65+
function reverse_diff(W1, W2, X, y)
66+
_, J_1, J_2 = forward_pass(W1, W2, X, y)
67+
return (J_1 .* (W2' * J_2)) * X'
68+
end
69+
70+
# -------------------------------------------------------------------------
71+
# ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and
72+
# `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and
73+
# returns an evaluator + a closure that computes ∂loss/∂W1 in `g`.
74+
# -------------------------------------------------------------------------
75+
function build_arraydiff(
76+
W2::Matrix{Float64},
77+
X::Matrix{Float64},
78+
y::Matrix{Float64},
79+
h::Int,
80+
d::Int,
81+
n::Int,
82+
mode::ArrayDiff.Mode,
83+
)
84+
model = JuMP.Model()
85+
@variable(model, W1[1:h, 1:d], container = ArrayDiff.ArrayOfVariables)
86+
Y = W2 * tanh.(W1 * X)
87+
diff = Y .- y
88+
loss = sum(diff .^ 2) # `/ n` dropped — see `forward_pass` comment.
89+
ad = ArrayDiff.model(mode)
90+
MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss))
91+
evaluator = MOI.Nonlinear.Evaluator(
92+
ad,
93+
mode,
94+
JuMP.index.(JuMP.all_variables(model)),
95+
)
96+
MOI.initialize(evaluator, [:Grad])
97+
return evaluator
98+
end
99+
100+
# CPU path — `Mode()` defaults to `Vector{Float64}` storage, no `@allowscalar`
101+
# needed.
102+
function arraydiff_grad_cpu!(g, evaluator, x)
103+
MOI.eval_objective_gradient(evaluator, g, x)
104+
return g
105+
end
106+
107+
# GPU path — `Mode{CuVector{Float64}}()`. The current implementation still
108+
# uses scalar tape access for `NODE_VARIABLE`/`NODE_VALUE` leaves, so the
109+
# call needs `@allowscalar` to get past GPUArrays' scalar-indexing policy.
110+
function arraydiff_grad_gpu!(g, evaluator, x)
111+
CUDA.@allowscalar MOI.eval_objective_gradient(evaluator, g, x)
112+
return g
113+
end
114+
115+
# -------------------------------------------------------------------------
116+
# One (h, d, n) sweep.
117+
# -------------------------------------------------------------------------
118+
function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
119+
println("\n" * "="^72)
120+
@printf "h = %d, d = %d, n = %d\n" h d n
121+
println("="^72)
122+
123+
Random.seed!(0)
124+
# `W2` and `y` are deliberately given 2 output rows (instead of the
125+
# 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat`
126+
# forward unpacks `idx1, idx2 = children_indices` and so requires at
127+
# least two rows; a single-row matrix would be parsed as a `vcat`
128+
# with one child and would error. Using 2 rows changes nothing about
129+
# the gradient pathway being measured.
130+
W1 = randn(Float64, h, d)
131+
W2 = randn(Float64, 2, h)
132+
X = randn(Float64, d, n)
133+
y = randn(Float64, 2, n)
134+
135+
# Reference: hand-written CUDA in Float64.
136+
W1g = CuArray(W1)
137+
W2g = CuArray(W2)
138+
Xg = CuArray(X)
139+
yg = CuArray(y)
140+
grad_ref = Array(reverse_diff(W1g, W2g, Xg, yg))
141+
CUDA.synchronize()
142+
143+
# ArrayDiff CPU.
144+
print("ArrayDiff CPU build (h=$h) ... "); flush(stdout)
145+
t_cpu_build = @elapsed ev_cpu =
146+
build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode())
147+
@printf "%.2f s\n" t_cpu_build
148+
x_cpu = vec(W1)
149+
g_cpu = zeros(Float64, length(x_cpu))
150+
arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu)
151+
grad_cpu = reshape(g_cpu, h, d)
152+
153+
# ArrayDiff GPU.
154+
print("ArrayDiff GPU build (h=$h) ... "); flush(stdout)
155+
t_gpu_build = @elapsed ev_gpu = build_arraydiff(
156+
W2,
157+
X,
158+
y,
159+
h,
160+
d,
161+
n,
162+
ArrayDiff.Mode{CUDA.CuVector{Float64}}(),
163+
)
164+
@printf "%.2f s\n" t_gpu_build
165+
x_gpu = vec(W1)
166+
g_gpu = zeros(Float64, length(x_gpu))
167+
arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu)
168+
CUDA.synchronize()
169+
grad_gpu = reshape(g_gpu, h, d)
170+
171+
# Numerical equivalence.
172+
for (name, g) in [
173+
("ArrayDiff CPU", grad_cpu),
174+
("ArrayDiff GPU", grad_gpu),
175+
]
176+
maxdiff = maximum(abs.(grad_ref .- g))
177+
relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64))
178+
ok = isapprox(grad_ref, g; rtol = rtol)
179+
@printf "%-15s vs hand-CUDA: max|Δ| = %.3e (rel %.2e) match=%s\n" name maxdiff relmag ok
180+
end
181+
182+
println("\n--- benchmark (median of N samples, post-sync) ---")
183+
bj = @benchmark begin
184+
reverse_diff($W1g, $W2g, $Xg, $yg)
185+
CUDA.synchronize()
186+
end samples = 30 evals = 1 seconds = 10
187+
bcpu = @benchmark begin
188+
arraydiff_grad_cpu!($g_cpu, $ev_cpu, $x_cpu)
189+
end samples = 30 evals = 1 seconds = 10
190+
# `seconds = 60` for the GPU path because, at h = 4096, the
191+
# scalar-cudaMemcpy path can run into the 10s cap on a single sample at
192+
# warmup; bump the budget so we get a reliable median.
193+
bgpu = @benchmark begin
194+
arraydiff_grad_gpu!($g_gpu, $ev_gpu, $x_gpu)
195+
CUDA.synchronize()
196+
end samples = 30 evals = 1 seconds = 60
197+
@printf "Hand-CUDA Float64 : median %12.2f µs\n" 1e-3 * median(bj).time
198+
@printf "ArrayDiff CPU : median %12.2f µs\n" 1e-3 * median(bcpu).time
199+
@printf "ArrayDiff GPU : median %12.2f µs\n" 1e-3 * median(bgpu).time
200+
201+
return nothing
202+
end
203+
204+
# -------------------------------------------------------------------------
205+
# Main
206+
# -------------------------------------------------------------------------
207+
function main()
208+
if !CUDA.functional()
209+
error("CUDA is not functional in this environment.")
210+
end
211+
CUDA.math_mode!(CUDA.FAST_MATH)
212+
println("CUDA.jl device : ", CUDA.name(CUDA.device()), " (math_mode=FAST_MATH)")
213+
for h in (16, 256, 4096)
214+
run_one(; h = h)
215+
GC.gc(true)
216+
CUDA.reclaim()
217+
end
218+
end
219+
220+
main()

0 commit comments

Comments
 (0)