SciML · ChrisRackauckas · May 13, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -15,6 +15,7 @@ OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -36,6 +37,7 @@ Optimisers = "0.4"
 OrdinaryDiffEq = "7"
 Plots = "1"
 SafeTestsets = "0.1"
+SciMLBase = "3"
 SciMLSensitivity = "7"
 StaticArrays = "1"
 Statistics = "1"

diff --git a/docs/src/examples/bruss.md b/docs/src/examples/bruss.md
@@ -1,96 +1,84 @@
 # GPU-Acceleration of a Stiff Nonlinear Partial Differential Equation
 
-The following is a demonstration of a GPU-accelerated implicit solve of a stiff
-nonlinear partial differential equation (the Brusselator model):
+The following is a demonstration of a GPU-accelerated implicit solve of a
+stiff nonlinear partial differential equation (the Brusselator model). The
+RHS is written as a pure broadcast over the state arrays — the periodic
+5-point Laplacian uses `circshift`, the reaction terms are element-wise,
+and the spatially-varying source term `brusselator_f` is evaluated against
+precomputed `x`/`y` grids — so no scalar indexing into the GPU array is
+needed. We assert that explicitly with `CUDA.allowscalar(false)`: any
+accidental scalar GPU read or write during the solve will raise.
 
 ```@example bruss
 using OrdinaryDiffEq, CUDA, LinearAlgebra
 
 const N = 32
-const xyd_brusselator = range(0, stop = 1, length = N)
-brusselator_f(x, y, t) = (((x - 0.3)^2 + (y - 0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.0
-limit(a, N) = a == N + 1 ? 1 : a == 0 ? N : a
-kernel_u! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
-    @inline function (du, u, A, B, α, II, I, t)
-        i, j = Tuple(I)
-        x = xyd[I[1]]
-        y = xyd[I[2]]
-        ip1 = limit(i + 1, N)
-        im1 = limit(i - 1, N)
-        jp1 = limit(j + 1, N)
-        jm1 = limit(j - 1, N)
-        du[II[i, j, 1]] = α * (u[II[im1, j, 1]] + u[II[ip1, j, 1]] + u[II[i, jp1, 1]] +
-                           u[II[i, jm1, 1]] - 4u[II[i, j, 1]]) +
-                          B + u[II[i, j, 1]]^2 * u[II[i, j, 2]] - (A + 1) * u[II[i, j, 1]] +
-                          brusselator_f(x, y, t)
-    end
-end
-kernel_v! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
-    @inline function (du, u, A, B, α, II, I, t)
-        i, j = Tuple(I)
-        ip1 = limit(i + 1, N)
-        im1 = limit(i - 1, N)
-        jp1 = limit(j + 1, N)
-        jm1 = limit(j - 1, N)
-        du[II[i, j, 2]] = α * (u[II[im1, j, 2]] + u[II[ip1, j, 2]] + u[II[i, jp1, 2]] +
-                           u[II[i, jm1, 2]] - 4u[II[i, j, 2]]) +
-                          A * u[II[i, j, 1]] - u[II[i, j, 1]]^2 * u[II[i, j, 2]]
-    end
-end
-brusselator_2d = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
-    function (du, u, p, t)
-        @inbounds begin
-            ii1 = N^2
-            ii2 = ii1 + N^2
-            ii3 = ii2 + 2(N^2)
-            A = p[1]
-            B = p[2]
-            α = p[3] / dx^2
-            II = LinearIndices((N, N, 2))
-            kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
-            kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
-            return nothing
-        end
-    end
+const xyd_brusselator = range(0.0f0, stop = 1.0f0, length = N)
+
+# Source term — element-wise pure function so it broadcasts over the
+# (x, y) grids without any scalar indexing.
+brusselator_f(x, y, t) =
+    (((x - 0.3f0)^2 + (y - 0.6f0)^2) <= 0.01f0) * (t >= 1.1f0) * 5.0f0
+
+function brusselator_2d!(du, u, p, t)
+    A, B, α_raw, dx_val, x_grid, y_grid = p
+    α = α_raw / dx_val^2
+
+    u1 = @view u[:, :, 1]
+    u2 = @view u[:, :, 2]
+    du1 = @view du[:, :, 1]
+    du2 = @view du[:, :, 2]
+
+    # Periodic boundary 5-point Laplacian via circshift — entirely
+    # broadcast on the GPU array, no scalar indexing.
+    u1_im1 = circshift(u1, (1, 0));  u1_ip1 = circshift(u1, (-1, 0))
+    u1_jm1 = circshift(u1, (0, 1));  u1_jp1 = circshift(u1, (0, -1))
+    u2_im1 = circshift(u2, (1, 0));  u2_ip1 = circshift(u2, (-1, 0))
+    u2_jm1 = circshift(u2, (0, 1));  u2_jp1 = circshift(u2, (0, -1))
+
+    @. du1 = α * (u1_im1 + u1_ip1 + u1_jm1 + u1_jp1 - 4 * u1) +
+        B + u1^2 * u2 - (A + 1) * u1 + brusselator_f(x_grid, y_grid, t)
+    @. du2 = α * (u2_im1 + u2_ip1 + u2_jm1 + u2_jp1 - 4 * u2) +
+        A * u1 - u1^2 * u2
+
+    return nothing
 end
-p = (3.4, 1.0, 10.0, step(xyd_brusselator))
 
 function init_brusselator_2d(xyd)
     N = length(xyd)
-    u = zeros(N, N, 2)
-    for I in CartesianIndices((N, N))
-        x = xyd[I[1]]
-        y = xyd[I[2]]
-        u[I, 1] = 22 * (y * (1 - y))^(3 / 2)
-        u[I, 2] = 27 * (x * (1 - x))^(3 / 2)
+    u = zeros(Float32, N, N, 2)
+    for i in 1:N, j in 1:N
+        x = xyd[i]; y = xyd[j]
+        u[i, j, 1] = 22 * (y * (1 - y))^(3 / 2)
+        u[i, j, 2] = 27 * (x * (1 - x))^(3 / 2)
     end
     u
 end
-u0 = init_brusselator_2d(xyd_brusselator)
-prob_ode_brusselator_2d = ODEProblem(brusselator_2d, u0, (0.0, 11.5), p)
 
-du = similar(u0)
-brusselator_2d(du, u0, p, 0.0)
-du[34] # 802.9807693762164
-du[1058] # 985.3120721709204
-du[2000] # -403.5817880634729
-du[end] # 1431.1460373522068
-du[521] # -323.1677459142322
+u0_cpu = init_brusselator_2d(xyd_brusselator)
 
-du2 = similar(u0)
-brusselator_2d(du2, u0, p, 1.3)
-du2[34] # 802.9807693762164
-du2[1058] # 985.3120721709204
-du2[2000] # -403.5817880634729
-du2[end] # 1431.1460373522068
-du2[521] # -318.1677459142322
+# Pre-compute x/y grids shaped for broadcasting with `u[:, :, k]`.
+xg = CuArray(reshape(collect(xyd_brusselator), N, 1))
+yg = CuArray(reshape(collect(xyd_brusselator), 1, N))
+u0 = CuArray(u0_cpu)
+p = (3.4f0, 1.0f0, 10.0f0, step(xyd_brusselator), xg, yg)
 
-prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d, CuArray(u0), (0.0f0, 11.5f0), p,
-    tstops = [1.1f0])
-# Note: Solving requires allowscalar(true) during initialization
-# This demonstrates the problem setup for GPU-based PDE solving
-CUDA.allowscalar(true)
-sol = solve(prob_ode_brusselator_2d_cuda, Rosenbrock23(), save_everystep = false)
+# Sanity check the RHS on the GPU under allowscalar(false).
 CUDA.allowscalar(false)
+du = similar(u0)
+brusselator_2d!(du, u0, p, 0.0f0)
+Array(du[1:1, 1:1, 1])[1], Array(du[end:end, end:end, 2])[1]
+
+# The brusselator RHS only depends on `t` through `brusselator_f`, which is a
+# step source activating at t=1.1 (handled explicitly via `tstops`). Away
+# from that discontinuity, df/dt = 0; supply that analytically so the
+# implicit solver doesn't AD a non-AD-friendly time-gradient through the
+# closure.
+brusselator_tgrad(du, u, p, t) = (du .= 0)
+brusselator_f_cuda = ODEFunction(brusselator_2d!, tgrad = brusselator_tgrad)
+prob_ode_brusselator_2d_cuda = ODEProblem(
+    brusselator_f_cuda, u0, (0.0f0, 11.5f0), p, tstops = [1.1f0]
+)
+sol = solve(prob_ode_brusselator_2d_cuda, Rosenbrock23(), save_everystep = false)
 sol.retcode
 ```
diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -126,8 +126,10 @@ cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads();
     dt = 0.01f0,
     adaptive = false)
 
+solutions_vector_cpu = cpu_sol_plot.u
+
 # Extract time points from the first trajectory's solution (assuming all are same)
-t_vals_cpu = cpu_sol_plot[1].t
+t_vals_cpu = solutions_vector_cpu[1].t
 num_times_cpu = length(t_vals_cpu)
 
 # Create a matrix to hold the results: rows=time, columns=trajectories
@@ -136,13 +138,13 @@ ensemble_vals_cpu = fill(NaN32, num_times_cpu, num_trajectories) # Use Float32
 
 # Extract the state value (u[1]) from each trajectory at each time point
 for i in 1:num_trajectories
+    sol = solutions_vector_cpu[i]
     # Check if the trajectory simulation was successful and data looks valid
-    if cpu_sol_plot[i].retcode == ReturnCode.Success &&
-       length(cpu_sol_plot[i].u) == num_times_cpu
+    if sol.retcode == ReturnCode.Success && length(sol.u) == num_times_cpu
         # sol.u is a Vector{SVector{1, Float32}}. We need the element from each SVector.
-        ensemble_vals_cpu[:, i] .= getindex.(cpu_sol_plot[i].u, 1)
+        ensemble_vals_cpu[:, i] .= getindex.(sol.u, 1)
     else
-        @warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(cpu_sol_plot[i].retcode). Length: $(length(cpu_sol_plot[i].u)). Skipping."
+        @warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(sol.retcode). Length: $(length(sol.u)). Skipping."
         # Column remains NaN
     end
 end

diff --git a/docs/src/tutorials/weak_order_conv_sde.md b/docs/src/tutorials/weak_order_conv_sde.md
@@ -6,7 +6,8 @@ With the lower overhead of `EnsembleGPUKernel` API, these calculations can be do
 The example below provides a way to calculate the expectation time-series of a linear SDE:
 
 ```@example kernel_sde
-using DiffEqGPU, OrdinaryDiffEq, StaticArrays, LinearAlgebra, Statistics
+using DiffEqGPU, OrdinaryDiffEq, StochasticDiffEq, StaticArrays, LinearAlgebra, Statistics
+using CUDA
 
 num_trajectories = 10_000
 
@@ -22,12 +23,14 @@ prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234)
 
 monteprob = EnsembleProblem(prob)
 
-sol = solve(monteprob, GPUEM(), EnsembleGPUKernel(0.0), dt = Float32(1 // 2^8),
-    trajectories = num_trajectories, adaptive = false)
+sol = solve(
+    monteprob, GPUEM(), EnsembleGPUKernel(CUDA.CUDABackend(), 0.0),
+    dt = Float32(1 // 2^8), trajectories = num_trajectories, adaptive = false
+)
 
 sol_array = Array(sol)
 
-ts = sol[1].t
+ts = sol.u[1].t
 
 us_calc = reshape(mean(sol_array, dims = 3), size(sol_array, 2))