|
| 1 | +module ArrayDiffNeural |
| 2 | + |
| 3 | +# ArrayDiff reverse-mode AD of the 2-layer MLP loss |
| 4 | +# |
| 5 | +# L = sum((W2 * tanh(W1 * X) - y) .^ 2) |
| 6 | +# |
| 7 | +# `W1` is the only `@variable`; `W2`, `X`, `y` are baked in as constant |
| 8 | +# matrix blocks. With `gpu=true`, the AD tape, input vector, and gradient |
| 9 | +# buffer all live on the device (`Mode{CuVector{T}}`); with `gpu=false`, |
| 10 | +# everything stays on the host (`Mode{Vector{T}}`). |
| 11 | +# |
| 12 | +# `/n` is dropped from the loss to match `hand_cuda.jl` — ArrayDiff's `:/` |
| 13 | +# operator currently has a latent bug on non-scalar tape offsets, and a |
| 14 | +# constant scaling factor doesn't change the gradient pathway. |
| 15 | + |
| 16 | +using Random |
| 17 | +using BenchmarkTools |
| 18 | +using CUDA |
| 19 | +using JuMP |
| 20 | +using ArrayDiff |
| 21 | +import MathOptInterface as MOI |
| 22 | + |
| 23 | +# Output dimension `m`. Fixed at 2: a 1-row `W2` would parse as a `:vcat` |
| 24 | +# with a single child, which the current ArrayDiff parser unpacks via |
| 25 | +# `idx1, idx2 = children_indices` and so requires at least two rows. |
| 26 | +const OUT_DIM = 2 |
| 27 | + |
| 28 | +function _build(::Type{T}, h::Int, d::Int, n::Int, gpu::Bool) where {T<:Real} |
| 29 | + Random.seed!(0) |
| 30 | + W1 = randn(T, h, d) |
| 31 | + W2 = randn(T, OUT_DIM, h) |
| 32 | + X = randn(T, d, n) |
| 33 | + y = randn(T, OUT_DIM, n) |
| 34 | + |
| 35 | + # JuMP works in Float64 internally; ArrayDiff converts the parsed |
| 36 | + # `Expression{Float64}` constants into the `Mode`'s eltype when filling |
| 37 | + # its tape. Promote constants to Float64 explicitly so the conversion |
| 38 | + # is one-way and obvious. |
| 39 | + W2_64 = Float64.(W2) |
| 40 | + X_64 = Float64.(X) |
| 41 | + y_64 = Float64.(y) |
| 42 | + |
| 43 | + model = JuMP.Model() |
| 44 | + @variable(model, W1v[1:h, 1:d], container = ArrayDiff.ArrayOfVariables) |
| 45 | + Y = W2_64 * tanh.(W1v * X_64) |
| 46 | + diff = Y .- y_64 |
| 47 | + loss = sum(diff .^ 2) |
| 48 | + |
| 49 | + mode = gpu ? |
| 50 | + ArrayDiff.Mode{CUDA.CuVector{T}}() : |
| 51 | + ArrayDiff.Mode{Vector{T}}() |
| 52 | + ad = ArrayDiff.model(mode) |
| 53 | + MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss)) |
| 54 | + ev = MOI.Nonlinear.Evaluator( |
| 55 | + ad, |
| 56 | + mode, |
| 57 | + JuMP.index.(JuMP.all_variables(model)), |
| 58 | + ) |
| 59 | + MOI.initialize(ev, [:Grad]) |
| 60 | + return (W1 = W1, evaluator = ev) |
| 61 | +end |
| 62 | + |
| 63 | +""" |
| 64 | + neural(T, h, d, n; gpu::Bool = false) -> BenchmarkTools.Trial |
| 65 | +
|
| 66 | +Benchmark `∂L/∂W1` of the 2-layer MLP loss using ArrayDiff. Model parsing |
| 67 | +and `MOI.initialize` (the expensive per-shape build) happen once before the |
| 68 | +trial; the timed block is just `MOI.eval_objective_gradient` plus a |
| 69 | +`CUDA.synchronize` when `gpu=true`. |
| 70 | +""" |
| 71 | +function neural( |
| 72 | + ::Type{T}, |
| 73 | + h::Int, |
| 74 | + d::Int, |
| 75 | + n::Int; |
| 76 | + gpu::Bool = false, |
| 77 | +) where {T<:Real} |
| 78 | + state = _build(T, h, d, n, gpu) |
| 79 | + if gpu |
| 80 | + x = CUDA.CuVector{T}(vec(state.W1)) |
| 81 | + g = CUDA.zeros(T, h * d) |
| 82 | + else |
| 83 | + x = vec(state.W1) |
| 84 | + g = zeros(T, h * d) |
| 85 | + end |
| 86 | + return @benchmark begin |
| 87 | + if $gpu |
| 88 | + # `@allowscalar` covers residual scalar leaves the BLOCK rewrite |
| 89 | + # can't fold; the hot path is bulk. |
| 90 | + CUDA.@allowscalar MOI.eval_objective_gradient( |
| 91 | + $state.evaluator, |
| 92 | + $g, |
| 93 | + $x, |
| 94 | + ) |
| 95 | + CUDA.synchronize() |
| 96 | + else |
| 97 | + MOI.eval_objective_gradient($state.evaluator, $g, $x) |
| 98 | + end |
| 99 | + end |
| 100 | +end |
| 101 | + |
| 102 | +end # module |
0 commit comments