Lux

blegat · blegat · commit ac1c6bde5cbb · 2026-05-03T11:48:40.000+02:00
diff --git a/perf/Project.toml b/perf/Project.toml
@@ -8,6 +8,8 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/perf/cuda_vs_pytorch.jl b/perf/cuda_vs_pytorch.jl
@@ -21,6 +21,7 @@ using CUDA
 using CUDA: AS
 using BenchmarkTools
 using PythonCall
+using Lux, Zygote
 
 # -------------------------------------------------------------------------
 # Hardcoded CUDA.jl path
@@ -146,18 +147,25 @@ function _gemm_simt!(C::CuArray{Float32,2}, transA::Char, A::CuArray{Float32,2},
     ldb = max(1, stride(B, 2))
     ldc = max(1, stride(C, 2))
     # CUDA.jl puts the cuBLAS handle in CUBLAS_POINTER_MODE_DEVICE, so alpha/beta
-    # MUST be device pointers. Passing host Ref{Float32} causes UVA fault handling
-    # per kernel launch (~100× slowdown but eventually-correct values).
+    # MUST be device pointers (host Ref triggers UVA fault handling — 100× slowdown).
     α = CUDA.CuRef{Float32}(alpha); β = CUDA.CuRef{Float32}(beta)
-    CUDA.CUBLAS.cublasGemmEx(
-        CUDA.CUBLAS.handle(),
-        transA, transB, m, n, k,
-        α, A, Float32, lda,
-        B, Float32, ldb,
-        β, C, Float32, ldc,
-        CUDA.CUBLAS.CUBLAS_COMPUTE_32F,
-        CUDA.CUBLAS.CUBLAS_GEMM_DEFAULT,
-    )
+    h = CUDA.CUBLAS.handle()
+    # Under FAST_MATH the handle's math mode is CUBLAS_TF32_TENSOR_OP_MATH, which
+    # forces TF32 tensor cores even when we ask for CUBLAS_COMPUTE_32F. Flip it to
+    # DEFAULT_MATH for this call so cuBLAS picks a SIMT FP32 kernel.
+    CUDA.CUBLAS.math_mode!(h, CUDA.DEFAULT_MATH)
+    try
+        CUDA.CUBLAS.cublasGemmEx(
+            h, transA, transB, m, n, k,
+            α, A, Float32, lda,
+            B, Float32, ldb,
+            β, C, Float32, ldc,
+            CUDA.CUBLAS.CUBLAS_COMPUTE_32F,
+            CUDA.CUBLAS.CUBLAS_GEMM_DEFAULT,
+        )
+    finally
+        CUDA.CUBLAS.math_mode!(h, CUDA.math_mode())  # restore (FAST_MATH → TF32 tensor op)
+    end
     return C
 end
 
@@ -185,6 +193,38 @@ function reverse_diff_v5(W1, W2, X, y)
     return result
 end
 
+# -------------------------------------------------------------------------
+# Lux + Zygote path
+#
+# Builds an equivalent 2-layer MLP `Y = W2 * tanh(W1 * X)` (no bias) using
+# Lux, plugs in the *same* CuArray weights so the gradient is comparable,
+# and lets Zygote derive the backward pass via source-to-source AD. This goes
+# through the same CUDA.jl + cuBLAS stack as `reverse_diff`, so we expect similar
+# kernels — the interesting part is the AD/dispatch overhead Lux+Zygote add on top.
+# -------------------------------------------------------------------------
+function build_lux(W1g::CuArray{Float32,2}, W2g::CuArray{Float32,2})
+    # Returns (model, ps, st). Widths are read off the weights — W1g is (h, d) and
+    # W2g is (o, h) — so the declared Dense sizes always match the supplied arrays.
+    (h, d), (o, h2) = size(W1g), size(W2g)
+    h2 == h || throw(DimensionMismatch("W2g has $h2 columns but W1g has $h rows"))
+    model = Lux.Chain(
+        Lux.Dense(d => h, tanh; use_bias = false),
+        Lux.Dense(h => o, identity; use_bias = false),
+    )
+    ps = (layer_1 = (weight = W1g,), layer_2 = (weight = W2g,))  # same arrays, no copy
+    st = Lux.initialstates(Random.default_rng(), model)
+    return model, ps, st
+end
+
+function lux_grad(model, ps, st, Xg::CuArray, yg::CuArray)
+    # Differentiate the squared-error loss w.r.t. `ps`; return layer_1's weight gradient.
+    grads = Zygote.gradient(ps) do p
+        prediction, _ = model(Xg, p, st)  # second return value is discarded
+        return sum((prediction .- yg) .^ 2) / size(yg, 2)
+    end
+    return grads[1].layer_1.weight
+end
+
 # -------------------------------------------------------------------------
 # PyTorch path
 # -------------------------------------------------------------------------
@@ -314,6 +354,17 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
     grad_julia_v5 = Array(reverse_diff_v5(W1g, W2g, Xg, yg))
     CUDA.synchronize()
 
+    # Lux + Zygote warmup (first call compiles Zygote's pullback for this shape)
+    print("Lux+Zygote compile warmup for h=$h ... "); flush(stdout)
+    lux_model, lux_ps, lux_st = build_lux(W1g, W2g)
+    t_lux_compile = @elapsed begin
+        lux_grad(lux_model, lux_ps, lux_st, Xg, yg)
+        CUDA.synchronize()
+    end
+    @printf "%.2f s\n" t_lux_compile
+    grad_lux = Array(lux_grad(lux_model, lux_ps, lux_st, Xg, yg))
+    CUDA.synchronize()
+
     # ----- PyTorch -----
     W1t, W2t, Xt, yt = build_torch_tensors(W1, W2, X, y)
     grad_pytorch_eager = torch_to_julia(pytorch_grad_eager(W1t, W2t, Xt, yt))
@@ -333,6 +384,7 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
     # ----- Numerical equivalence -----
     for (name, g) in [("Julia v4 (vec=4)   ", grad_julia_v4),
                       ("Julia v5 (vec=4+SIMT)", grad_julia_v5),
+                      ("Lux + Zygote       ", grad_lux),
                       ("PyTorch eager      ", grad_pytorch_eager),
                       ("PyTorch compiled   ", grad_pytorch_compiled)]
         maxdiff = maximum(abs.(grad_julia .- g))
@@ -357,6 +409,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
         reverse_diff_v5($W1g, $W2g, $Xg, $yg)
         CUDA.synchronize()
     end samples=30 evals=1 seconds=10
+    bjlux = @benchmark begin
+        lux_grad($lux_model, $lux_ps, $lux_st, $Xg, $yg)
+        CUDA.synchronize()
+    end samples=30 evals=1 seconds=10
     be = @benchmark begin
         pytorch_grad_eager($W1t, $W2t, $Xt, $yt)
         $torch.cuda.synchronize()
@@ -368,6 +424,7 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
     @printf "Julia broadcast      : median %8.3f µs\n" 1e-3 * median(bj).time
     @printf "Julia vec=4          : median %8.3f µs\n" 1e-3 * median(bj4).time
     @printf "Julia vec=4 + SIMT   : median %8.3f µs\n" 1e-3 * median(bj5).time
+    @printf "Lux + Zygote         : median %8.3f µs\n" 1e-3 * median(bjlux).time
     @printf "PyTorch eager        : median %8.3f µs\n" 1e-3 * median(be).time
     @printf "PyTorch compiled     : median %8.3f µs\n" 1e-3 * median(bc).time
 
@@ -381,6 +438,9 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
     println("\n--- CUDA trace: Julia vec=4 + SIMT ---")
     summarize_julia_trace(stdout, julia_trace(() -> reverse_diff_v5(W1g, W2g, Xg, yg)))
 
+    println("\n--- CUDA trace: Lux + Zygote ---")
+    summarize_julia_trace(stdout, julia_trace(() -> lux_grad(lux_model, lux_ps, lux_st, Xg, yg)))
+
     println("\n--- CUDA trace: PyTorch eager ---")
     println(pytorch_trace(() -> pytorch_grad_eager(W1t, W2t, Xt, yt)))