Skip to content

Commit 556487c

Browse files
committed
Separate benchmark into different files
1 parent 5b4d9ab commit 556487c

8 files changed

Lines changed: 421 additions & 1118 deletions

File tree

perf/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.CondaPkg

perf/arraydiff.jl

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
module ArrayDiffNeural
2+
3+
# ArrayDiff reverse-mode AD of the 2-layer MLP loss
4+
#
5+
# L = sum((W2 * tanh(W1 * X) - y) .^ 2)
6+
#
7+
# `W1` is the only `@variable`; `W2`, `X`, `y` are baked in as constant
8+
# matrix blocks. With `gpu=true`, the AD tape, input vector, and gradient
9+
# buffer all live on the device (`Mode{CuVector{T}}`); with `gpu=false`,
10+
# everything stays on the host (`Mode{Vector{T}}`).
11+
#
12+
# `/n` is dropped from the loss to match `hand_cuda.jl` — ArrayDiff's `:/`
13+
# operator currently has a latent bug on non-scalar tape offsets, and a
14+
# constant scaling factor doesn't change the gradient pathway.
15+
16+
using Random
17+
using BenchmarkTools
18+
using CUDA
19+
using JuMP
20+
using ArrayDiff
21+
import MathOptInterface as MOI
22+
23+
# Output dimension `m`. Fixed at 2: a 1-row `W2` would parse as a `:vcat`
24+
# with a single child, which the current ArrayDiff parser unpacks via
25+
# `idx1, idx2 = children_indices` and so requires at least two rows.
26+
const OUT_DIM = 2
27+
28+
function _build(::Type{T}, h::Int, d::Int, n::Int, gpu::Bool) where {T<:Real}
29+
Random.seed!(0)
30+
W1 = randn(T, h, d)
31+
W2 = randn(T, OUT_DIM, h)
32+
X = randn(T, d, n)
33+
y = randn(T, OUT_DIM, n)
34+
35+
# JuMP works in Float64 internally; ArrayDiff converts the parsed
36+
# `Expression{Float64}` constants into the `Mode`'s eltype when filling
37+
# its tape. Promote constants to Float64 explicitly so the conversion
38+
# is one-way and obvious.
39+
W2_64 = Float64.(W2)
40+
X_64 = Float64.(X)
41+
y_64 = Float64.(y)
42+
43+
model = JuMP.Model()
44+
@variable(model, W1v[1:h, 1:d], container = ArrayDiff.ArrayOfVariables)
45+
Y = W2_64 * tanh.(W1v * X_64)
46+
diff = Y .- y_64
47+
loss = sum(diff .^ 2)
48+
49+
mode = gpu ?
50+
ArrayDiff.Mode{CUDA.CuVector{T}}() :
51+
ArrayDiff.Mode{Vector{T}}()
52+
ad = ArrayDiff.model(mode)
53+
MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss))
54+
ev = MOI.Nonlinear.Evaluator(
55+
ad,
56+
mode,
57+
JuMP.index.(JuMP.all_variables(model)),
58+
)
59+
MOI.initialize(ev, [:Grad])
60+
return (W1 = W1, evaluator = ev)
61+
end
62+
63+
"""
64+
neural(T, h, d, n; gpu::Bool = false) -> BenchmarkTools.Trial
65+
66+
Benchmark `∂L/∂W1` of the 2-layer MLP loss using ArrayDiff. Model parsing
67+
and `MOI.initialize` (the expensive per-shape build) happen once before the
68+
trial; the timed block is just `MOI.eval_objective_gradient` plus a
69+
`CUDA.synchronize` when `gpu=true`.
70+
"""
71+
function neural(
72+
::Type{T},
73+
h::Int,
74+
d::Int,
75+
n::Int;
76+
gpu::Bool = false,
77+
) where {T<:Real}
78+
state = _build(T, h, d, n, gpu)
79+
if gpu
80+
x = CUDA.CuVector{T}(vec(state.W1))
81+
g = CUDA.zeros(T, h * d)
82+
else
83+
x = vec(state.W1)
84+
g = zeros(T, h * d)
85+
end
86+
return @benchmark begin
87+
if $gpu
88+
# `@allowscalar` covers residual scalar leaves the BLOCK rewrite
89+
# can't fold; the hot path is bulk.
90+
CUDA.@allowscalar MOI.eval_objective_gradient(
91+
$state.evaluator,
92+
$g,
93+
$x,
94+
)
95+
CUDA.synchronize()
96+
else
97+
MOI.eval_objective_gradient($state.evaluator, $g, $x)
98+
end
99+
end
100+
end
101+
102+
end # module

0 commit comments

Comments
 (0)