Skip to content

Commit b435b0c

Browse files
committed
better test
1 parent 1d25af6 commit b435b0c

1 file changed

Lines changed: 42 additions & 21 deletions

File tree

test/JuMP.jl

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -241,37 +241,58 @@ function _test_neural(
241241
W1_val = [0.3 -0.2; 0.1 0.4]
242242
W2_val = [-0.1 0.5; 0.2 -0.3]
243243
obj, g = _eval(model, loss, [vec(W1_val); vec(W2_val)])
244-
obj_val = 0.8400236334170045
244+
# Reference computed from the same hand-written forward/reverse formulas
245+
# as `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, adapted to
246+
# this test's loss `sum((Y - target).^2)` (no `/ n` scaling, full gradient
247+
# over both `W1` and `W2`). `_eval` evaluates the objective at `xstart`
248+
# and the gradient at `x = [1, ..., 8]`, so we need the references at the
249+
# corresponding inputs.
250+
X_const = [1.0 0.5; 0.3 0.8]
251+
target_const = [0.5 0.2; 0.1 0.7]
252+
obj_val = _ref_objective(W1_val, W2_val, X_const, target_const)
245253
if with_norm
246254
obj_val = sqrt(obj_val)
247255
end
248256
@test obj ≈ obj_val
249-
# Reference gradient of `sum((Y - target)^2)` at x = [1, ..., 8] (which is
250-
# what `_eval` actually feeds into `eval_objective_gradient`, not
251-
# `xstart`). The corresponding sum-of-squares value at that x is
252-
# `obj_val_at_grad_x` below.
253-
grad = [
254-
25.28653965327976
255-
2.6560564947430976
256-
9.751611287356354
257-
1.0170720088353846
258-
44.92184359605744
259-
52.39073971097241
260-
45.96398468947882
261-
53.639895440937835
262-
]
257+
W1_at_grad = reshape([1.0, 2.0, 3.0, 4.0], 2, 2)
258+
W2_at_grad = reshape([5.0, 6.0, 7.0, 8.0], 2, 2)
259+
grad_sumsq = _ref_gradient(W1_at_grad, W2_at_grad, X_const, target_const)
263260
if with_norm
264-
# For `loss = norm(E)`, grad = grad_sumsq / (2 * norm(E)). The gradient
265-
# is taken at x = [1, ..., 8], so scale by `1/(2 * sqrt(sum_sq))` at
266-
# that x, not at xstart.
267-
obj_val_at_grad_x = 626.2848267738252
268-
@test g ≈ grad / (2 * sqrt(obj_val_at_grad_x))
261+
# `d/dx ‖E‖₂ = (1/(2‖E‖₂)) · d/dx ‖E‖₂² = grad_sumsq / (2 sqrt(sumsq))`,
262+
# taken at the gradient evaluation point.
263+
norm_at_grad = sqrt(
264+
_ref_objective(W1_at_grad, W2_at_grad, X_const, target_const),
265+
)
266+
@test g ≈ grad_sumsq ./ (2 * norm_at_grad)
269267
else
270-
@test g ≈ grad
268+
@test g ≈ grad_sumsq
271269
end
272270
return
273271
end
274272

273+
# Hand-written forward + reverse for the 2-layer MLP `loss = sum((W2 *
274+
# tanh.(W1 * X) - target).^2)`. Same shape as `perf/cuda_vs_pytorch.jl`'s
275+
# `forward_pass` / `reverse_diff` but adapted to this test (no `/ n` scaling
276+
# and gradient over both `W1` and `W2`). Returned gradient is flattened with
277+
# the JuMP variable convention `[vec(grad_W1); vec(grad_W2)]`.
278+
# Forward sweep for the reference network `W2 * tanh.(W1 * X)`.
#
# Returns a 3-tuple:
#   1. the hidden activations `tanh.(W1 * X)`;
#   2. the elementwise tanh derivative `1 - tanh^2` at those activations;
#   3. `2 * (W2 * hidden - target)`, i.e. the derivative of the sum-of-squares
#      loss with respect to the network output.
function _ref_forward(W1, W2, X, target)
    pre_activation = W1 * X
    hidden = tanh.(pre_activation)
    residual = W2 * hidden .- target
    dtanh = @. 1 - hidden^2
    dloss = 2 .* residual
    return (hidden, dtanh, dloss)
end
284+
285+
# Reference value of the sum-of-squares loss
# `sum((W2 * tanh.(W1 * X) - target).^2)` for the 2-layer network.
function _ref_objective(W1, W2, X, target)
    prediction = W2 * tanh.(W1 * X)
    residual = prediction .- target
    return sum(residual .^ 2)
end
288+
289+
# Reference gradient of the sum-of-squares loss with respect to both weight
# matrices, obtained by a hand-written reverse sweep over the quantities
# returned by `_ref_forward`. The result is a single vector laid out as
# `[vec(grad_W1); vec(grad_W2)]`.
function _ref_gradient(W1, W2, X, target)
    hidden, dtanh, dloss = _ref_forward(W1, W2, X, target)
    # Pull the output sensitivity back through `W2`, then through tanh.
    delta_hidden = dtanh .* (adjoint(W2) * dloss)
    dW1 = delta_hidden * adjoint(X)
    dW2 = dloss * adjoint(hidden)
    return vcat(vec(dW1), vec(dW2))
end
295+
275296
function test_neural()
276297
bin = [false, true]
277298
@testset "$(with_norm ? "norm" : "sum")" for with_norm in bin

0 commit comments

Comments
 (0)