@@ -241,37 +241,58 @@ function _test_neural(
241241 W1_val = [0.3 - 0.2 ; 0.1 0.4 ]
242242 W2_val = [- 0.1 0.5 ; 0.2 - 0.3 ]
243243 obj, g = _eval (model, loss, [vec (W1_val); vec (W2_val)])
244- obj_val = 0.8400236334170045
244+ # Reference computed from the same hand-written forward/reverse formulas
245+ # as `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, adapted to
246+ # this test's loss `sum((Y - target).^2)` (no `/ n` scaling, full gradient
247+ # over both `W1` and `W2`). `_eval` evaluates the objective at `xstart`
248+ # and the gradient at `x = [1, ..., 8]`, so we need the references at the
249+ # corresponding inputs.
250+ X_const = [1.0 0.5 ; 0.3 0.8 ]
251+ target_const = [0.5 0.2 ; 0.1 0.7 ]
252+ obj_val = _ref_objective (W1_val, W2_val, X_const, target_const)
245253 if with_norm
246254 obj_val = sqrt (obj_val)
247255 end
248256 @test obj ≈ obj_val
249- # Reference gradient of `sum((Y - target)^2)` at x = [1, ..., 8] (which is
250- # what `_eval` actually feeds into `eval_objective_gradient`, not
251- # `xstart`). The corresponding sum-of-squares value at that x is
252- # `obj_val_at_grad_x` below.
253- grad = [
254- 25.28653965327976
255- 2.6560564947430976
256- 9.751611287356354
257- 1.0170720088353846
258- 44.92184359605744
259- 52.39073971097241
260- 45.96398468947882
261- 53.639895440937835
262- ]
257+ W1_at_grad = reshape ([1.0 , 2.0 , 3.0 , 4.0 ], 2 , 2 )
258+ W2_at_grad = reshape ([5.0 , 6.0 , 7.0 , 8.0 ], 2 , 2 )
259+ grad_sumsq = _ref_gradient (W1_at_grad, W2_at_grad, X_const, target_const)
263260 if with_norm
264- # For `loss = norm(E)`, grad = grad_sumsq / (2 * norm(E)). The gradient
265- # is taken at x = [1, ..., 8], so scale by `1/(2 * sqrt(sum_sq))` at
266- # that x, not at xstart.
267- obj_val_at_grad_x = 626.2848267738252
268- @test g ≈ grad / (2 * sqrt (obj_val_at_grad_x))
261+ # `d/dx ‖E‖₂ = (1/(2‖E‖₂)) · d/dx ‖E‖₂² = grad_sumsq / (2 sqrt(sumsq))`,
262+ # taken at the gradient evaluation point.
263+ norm_at_grad = sqrt (
264+ _ref_objective (W1_at_grad, W2_at_grad, X_const, target_const),
265+ )
266+ @test g ≈ grad_sumsq ./ (2 * norm_at_grad)
269267 else
270- @test g ≈ grad
268+ @test g ≈ grad_sumsq
271269 end
272270 return
273271end
274272
273+ # Hand-written forward + reverse for the 2-layer MLP `loss = sum((W2 *
274+ # tanh.(W1 * X) - target).^2)`. Same shape as `perf/cuda_vs_pytorch.jl`'s
275+ # `forward_pass` / `reverse_diff` but adapted to this test (no `/ n` scaling
276+ # and gradient over both `W1` and `W2`). Returned gradient is flattened with
277+ # the JuMP variable convention `[vec(grad_W1); vec(grad_W2)]`.
"""
    _ref_forward(W1, W2, X, target)

Run the forward pass of the two-layer network `W2 * tanh.(W1 * X)` and return
the intermediates needed by the hand-written reverse pass.

# Returns
A tuple `(y_1, J_1, J_2)` where
- `y_1 = tanh.(W1 * X)` is the hidden activation,
- `J_1 = 1 .- y_1 .^ 2` is the element-wise derivative of `tanh` at `W1 * X`,
- `J_2 = 2 .* (W2 * y_1 .- target)` is the derivative of
  `sum((Y - target) .^ 2)` with respect to the output `Y = W2 * y_1`.
"""
function _ref_forward(W1, W2, X, target)
    hidden = tanh.(W1 * X)
    # d/dz tanh(z) = 1 - tanh(z)^2, evaluated element-wise.
    dtanh = @. 1 - hidden^2
    # The matrix product must stay outside `@.` so it is not turned into an
    # element-wise product.
    output = W2 * hidden
    dloss = @. 2 * (output - target)
    return hidden, dtanh, dloss
end
284+
"""
    _ref_objective(W1, W2, X, target)

Return the sum-of-squares loss `sum((W2 * tanh.(W1 * X) .- target) .^ 2)` of
the two-layer network, with no averaging or other scaling applied.
"""
function _ref_objective(W1, W2, X, target)
    hidden = tanh.(W1 * X)
    residual = W2 * hidden .- target
    return sum(residual .^ 2)
end
288+
"""
    _ref_gradient(W1, W2, X, target)

Return the gradient of `sum((W2 * tanh.(W1 * X) .- target) .^ 2)` with respect
to both weight matrices, flattened as `[vec(grad_W1); vec(grad_W2)]`.

The intermediates come from `_ref_forward`; the reverse pass is written out by
hand with the chain rule.
"""
function _ref_gradient(W1, W2, X, target)
    hidden, dtanh, dloss = _ref_forward(W1, W2, X, target)
    # Pull the output sensitivity back through `W2`, then through `tanh`.
    delta_hidden = dtanh .* (W2' * dloss)
    dW1 = delta_hidden * X'
    dW2 = dloss * hidden'
    return vcat(vec(dW1), vec(dW2))
end
295+
275296function test_neural ()
276297 bin = [false , true ]
277298 @testset " $(with_norm ? " norm" : " sum" ) " for with_norm in bin
0 commit comments