better test

blegat · blegat · commit b435b0ce3363 · 2026-05-06T08:06:32.000+02:00
diff --git a/test/JuMP.jl b/test/JuMP.jl
@@ -241,37 +241,58 @@ function _test_neural(
     W1_val = [0.3 -0.2; 0.1 0.4]
     W2_val = [-0.1 0.5; 0.2 -0.3]
     obj, g = _eval(model, loss, [vec(W1_val); vec(W2_val)])
-    obj_val = 0.8400236334170045
+    # Reference computed from the same hand-written forward/reverse formulas
+    # as `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, adapted to
+    # this test's loss `sum((Y - target).^2)` (no `/ n` scaling, full gradient
+    # over both `W1` and `W2`). `_eval` evaluates the objective at `xstart`
+    # and the gradient at `x = [1, ..., 8]`, so we need the references at the
+    # corresponding inputs.
+    X_const = [1.0 0.5; 0.3 0.8]
+    target_const = [0.5 0.2; 0.1 0.7]
+    obj_val = _ref_objective(W1_val, W2_val, X_const, target_const)
     if with_norm
         obj_val = sqrt(obj_val)
     end
     @test obj ≈ obj_val
-    # Reference gradient of `sum((Y - target)^2)` at x = [1, ..., 8] (which is
-    # what `_eval` actually feeds into `eval_objective_gradient`, not
-    # `xstart`). The corresponding sum-of-squares value at that x is
-    # `obj_val_at_grad_x` below.
-    grad = [
-        25.28653965327976
-        2.6560564947430976
-        9.751611287356354
-        1.0170720088353846
-        44.92184359605744
-        52.39073971097241
-        45.96398468947882
-        53.639895440937835
-    ]
+    W1_at_grad = reshape([1.0, 2.0, 3.0, 4.0], 2, 2)
+    W2_at_grad = reshape([5.0, 6.0, 7.0, 8.0], 2, 2)
+    grad_sumsq = _ref_gradient(W1_at_grad, W2_at_grad, X_const, target_const)
     if with_norm
-        # For `loss = norm(E)`, grad = grad_sumsq / (2 * norm(E)). The gradient
-        # is taken at x = [1, ..., 8], so scale by `1/(2 * sqrt(sum_sq))` at
-        # that x, not at xstart.
-        obj_val_at_grad_x = 626.2848267738252
-        @test g ≈ grad / (2 * sqrt(obj_val_at_grad_x))
+        # `d/dx ‖E‖₂ = (1/(2‖E‖₂)) · d/dx ‖E‖₂² = grad_sumsq / (2 sqrt(sumsq))`,
+        # taken at the gradient evaluation point.
+        norm_at_grad = sqrt(
+            _ref_objective(W1_at_grad, W2_at_grad, X_const, target_const),
+        )
+        @test g ≈ grad_sumsq ./ (2 * norm_at_grad)
     else
-        @test g ≈ grad
+        @test g ≈ grad_sumsq
     end
     return
 end
 
+# Hand-written forward + reverse passes for the 2-layer MLP `loss = sum((W2 *
+# tanh.(W1 * X) - target).^2)`. Same structure as `perf/cuda_vs_pytorch.jl`'s
+# `forward_pass` / `reverse_diff` but adapted to this test (no `/ n` scaling
+# and gradient over both `W1` and `W2`). `_ref_gradient` returns the gradient
+# flattened in the JuMP variable order `[vec(grad_W1); vec(grad_W2)]`.
+function _ref_forward(W1, W2, X, target)  # forward pass; returns `(y_1, J_1, J_2)`
+    y_1 = tanh.(W1 * X)  # hidden activations: elementwise tanh of `W1 * X`
+    J_1 = 1 .- y_1 .^ 2  # elementwise tanh derivative: tanh'(z) = 1 - tanh(z)^2
+    J_2 = 2 .* (W2 * y_1 .- target)  # d(loss)/dY for the output `Y = W2 * y_1`
+    return y_1, J_1, J_2
+end
+
+function _ref_objective(W1, W2, X, target)  # scalar reference loss value
+    return sum((W2 * tanh.(W1 * X) .- target) .^ 2)  # sum of squared residuals
+end
+
+function _ref_gradient(W1, W2, X, target)  # reference gradient w.r.t. `W1` and `W2`
+    y_1, J_1, J_2 = _ref_forward(W1, W2, X, target)
+    grad_W1 = (J_1 .* (W2' * J_2)) * X'  # chain rule back through `W2` and tanh
+    grad_W2 = J_2 * y_1'  # d(loss)/dW2: output-side term times hidden activations
+    return [vec(grad_W1); vec(grad_W2)]  # column-major flatten, `W1` entries first
+end
+
 function test_neural()
     bin = [false, true]
     @testset "$(with_norm ? "norm" : "sum")" for with_norm in bin