diff --git a/docs/Project.toml b/docs/Project.toml index 0f9b2a5e..9032e6ed 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -15,6 +15,7 @@ OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -36,6 +37,7 @@ Optimisers = "0.4" OrdinaryDiffEq = "7" Plots = "1" SafeTestsets = "0.1" +SciMLBase = "3" SciMLSensitivity = "7" StaticArrays = "1" Statistics = "1" diff --git a/docs/src/examples/bruss.md b/docs/src/examples/bruss.md index 2e2a1a40..5d0d6647 100644 --- a/docs/src/examples/bruss.md +++ b/docs/src/examples/bruss.md @@ -1,96 +1,84 @@ # GPU-Acceleration of a Stiff Nonlinear Partial Differential Equation -The following is a demonstration of a GPU-accelerated implicit solve of a stiff -nonlinear partial differential equation (the Brusselator model): +The following is a demonstration of a GPU-accelerated implicit solve of a +stiff nonlinear partial differential equation (the Brusselator model). The +RHS is written as a pure broadcast over the state arrays — the periodic +5-point Laplacian uses `circshift`, the reaction terms are element-wise, +and the spatially-varying source term `brusselator_f` is evaluated against +precomputed `x`/`y` grids — so no scalar indexing into the GPU array is +needed. We assert that explicitly with `CUDA.allowscalar(false)`: any +accidental scalar GPU read or write during the solve will raise. ```@example bruss using OrdinaryDiffEq, CUDA, LinearAlgebra const N = 32 -const xyd_brusselator = range(0, stop = 1, length = N) -brusselator_f(x, y, t) = (((x - 0.3)^2 + (y - 0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.0 -limit(a, N) = a == N + 1 ? 1 : a == 0 ? N : a -kernel_u! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator) - @inline function (du, u, A, B, α, II, I, t) - i, j = Tuple(I) - x = xyd[I[1]] - y = xyd[I[2]] - ip1 = limit(i + 1, N) - im1 = limit(i - 1, N) - jp1 = limit(j + 1, N) - jm1 = limit(j - 1, N) - du[II[i, j, 1]] = α * (u[II[im1, j, 1]] + u[II[ip1, j, 1]] + u[II[i, jp1, 1]] + - u[II[i, jm1, 1]] - 4u[II[i, j, 1]]) + - B + u[II[i, j, 1]]^2 * u[II[i, j, 2]] - (A + 1) * u[II[i, j, 1]] + - brusselator_f(x, y, t) - end -end -kernel_v! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator) - @inline function (du, u, A, B, α, II, I, t) - i, j = Tuple(I) - ip1 = limit(i + 1, N) - im1 = limit(i - 1, N) - jp1 = limit(j + 1, N) - jm1 = limit(j - 1, N) - du[II[i, j, 2]] = α * (u[II[im1, j, 2]] + u[II[ip1, j, 2]] + u[II[i, jp1, 2]] + - u[II[i, jm1, 2]] - 4u[II[i, j, 2]]) + - A * u[II[i, j, 1]] - u[II[i, j, 1]]^2 * u[II[i, j, 2]] - end -end -brusselator_2d = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator) - function (du, u, p, t) - @inbounds begin - ii1 = N^2 - ii2 = ii1 + N^2 - ii3 = ii2 + 2(N^2) - A = p[1] - B = p[2] - α = p[3] / dx^2 - II = LinearIndices((N, N, 2)) - kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t) - kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t) - return nothing - end - end +const xyd_brusselator = range(0.0f0, stop = 1.0f0, length = N) + +# Source term — element-wise pure function so it broadcasts over the +# (x, y) grids without any scalar indexing. +brusselator_f(x, y, t) = + (((x - 0.3f0)^2 + (y - 0.6f0)^2) <= 0.01f0) * (t >= 1.1f0) * 5.0f0 + +function brusselator_2d!(du, u, p, t) + A, B, α_raw, dx_val, x_grid, y_grid = p + α = α_raw / dx_val^2 + + u1 = @view u[:, :, 1] + u2 = @view u[:, :, 2] + du1 = @view du[:, :, 1] + du2 = @view du[:, :, 2] + + # Periodic boundary 5-point Laplacian via circshift — entirely + # broadcast on the GPU array, no scalar indexing. + u1_im1 = circshift(u1, (1, 0)); u1_ip1 = circshift(u1, (-1, 0)) + u1_jm1 = circshift(u1, (0, 1)); u1_jp1 = circshift(u1, (0, -1)) + u2_im1 = circshift(u2, (1, 0)); u2_ip1 = circshift(u2, (-1, 0)) + u2_jm1 = circshift(u2, (0, 1)); u2_jp1 = circshift(u2, (0, -1)) + + @. du1 = α * (u1_im1 + u1_ip1 + u1_jm1 + u1_jp1 - 4 * u1) + + B + u1^2 * u2 - (A + 1) * u1 + brusselator_f(x_grid, y_grid, t) + @. du2 = α * (u2_im1 + u2_ip1 + u2_jm1 + u2_jp1 - 4 * u2) + + A * u1 - u1^2 * u2 + + return nothing end -p = (3.4, 1.0, 10.0, step(xyd_brusselator)) function init_brusselator_2d(xyd) N = length(xyd) - u = zeros(N, N, 2) - for I in CartesianIndices((N, N)) - x = xyd[I[1]] - y = xyd[I[2]] - u[I, 1] = 22 * (y * (1 - y))^(3 / 2) - u[I, 2] = 27 * (x * (1 - x))^(3 / 2) + u = zeros(Float32, N, N, 2) + for i in 1:N, j in 1:N + x = xyd[i]; y = xyd[j] + u[i, j, 1] = 22 * (y * (1 - y))^(3 / 2) + u[i, j, 2] = 27 * (x * (1 - x))^(3 / 2) end u end -u0 = init_brusselator_2d(xyd_brusselator) -prob_ode_brusselator_2d = ODEProblem(brusselator_2d, u0, (0.0, 11.5), p) -du = similar(u0) -brusselator_2d(du, u0, p, 0.0) -du[34] # 802.9807693762164 -du[1058] # 985.3120721709204 -du[2000] # -403.5817880634729 -du[end] # 1431.1460373522068 -du[521] # -323.1677459142322 +u0_cpu = init_brusselator_2d(xyd_brusselator) -du2 = similar(u0) -brusselator_2d(du2, u0, p, 1.3) -du2[34] # 802.9807693762164 -du2[1058] # 985.3120721709204 -du2[2000] # -403.5817880634729 -du2[end] # 1431.1460373522068 -du2[521] # -318.1677459142322 +# Pre-compute x/y grids shaped for broadcasting with `u[:, :, k]`. +xg = CuArray(reshape(collect(xyd_brusselator), N, 1)) +yg = CuArray(reshape(collect(xyd_brusselator), 1, N)) +u0 = CuArray(u0_cpu) +p = (3.4f0, 1.0f0, 10.0f0, step(xyd_brusselator), xg, yg) -prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d, CuArray(u0), (0.0f0, 11.5f0), p, - tstops = [1.1f0]) -# Note: Solving requires allowscalar(true) during initialization -# This demonstrates the problem setup for GPU-based PDE solving -CUDA.allowscalar(true) -sol = solve(prob_ode_brusselator_2d_cuda, Rosenbrock23(), save_everystep = false) +# Sanity check the RHS on the GPU under allowscalar(false). CUDA.allowscalar(false) +du = similar(u0) +brusselator_2d!(du, u0, p, 0.0f0) +Array(du[1:1, 1:1, 1])[1], Array(du[end:end, end:end, 2])[1] + +# The brusselator RHS only depends on `t` through `brusselator_f`, which is a +# step source activating at t=1.1 (handled explicitly via `tstops`). Away +# from that discontinuity, df/dt = 0; supply that analytically so the +# implicit solver doesn't AD a non-AD-friendly time-gradient through the +# closure. +brusselator_tgrad(du, u, p, t) = (du .= 0) +brusselator_f_cuda = ODEFunction(brusselator_2d!, tgrad = brusselator_tgrad) +prob_ode_brusselator_2d_cuda = ODEProblem( + brusselator_f_cuda, u0, (0.0f0, 11.5f0), p, tstops = [1.1f0] +) +sol = solve(prob_ode_brusselator_2d_cuda, Rosenbrock23(), save_everystep = false) sol.retcode ``` diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md index 76c51134..5b2bc92f 100644 --- a/docs/src/examples/gpu_ensemble_random_decay.md +++ b/docs/src/examples/gpu_ensemble_random_decay.md @@ -126,8 +126,10 @@ cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads(); dt = 0.01f0, adaptive = false) +solutions_vector_cpu = cpu_sol_plot.u + # Extract time points from the first trajectory's solution (assuming all are same) -t_vals_cpu = cpu_sol_plot[1].t +t_vals_cpu = solutions_vector_cpu[1].t num_times_cpu = length(t_vals_cpu) # Create a matrix to hold the results: rows=time, columns=trajectories @@ -136,13 +138,13 @@ ensemble_vals_cpu = fill(NaN32, num_times_cpu, num_trajectories) # Use Float32 # Extract the state value (u[1]) from each trajectory at each time point for i in 1:num_trajectories + sol = solutions_vector_cpu[i] # Check if the trajectory simulation was successful and data looks valid - if cpu_sol_plot[i].retcode == ReturnCode.Success && - length(cpu_sol_plot[i].u) == num_times_cpu + if sol.retcode == ReturnCode.Success && length(sol.u) == num_times_cpu # sol.u is a Vector{SVector{1, Float32}}. We need the element from each SVector. - ensemble_vals_cpu[:, i] .= getindex.(cpu_sol_plot[i].u, 1) + ensemble_vals_cpu[:, i] .= getindex.(sol.u, 1) else - @warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(cpu_sol_plot[i].retcode). Length: $(length(cpu_sol_plot[i].u)). Skipping." + @warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(sol.retcode). Length: $(length(sol.u)). Skipping." # Column remains NaN end end diff --git a/docs/src/tutorials/weak_order_conv_sde.md b/docs/src/tutorials/weak_order_conv_sde.md index b935567e..f1729d46 100644 --- a/docs/src/tutorials/weak_order_conv_sde.md +++ b/docs/src/tutorials/weak_order_conv_sde.md @@ -6,7 +6,8 @@ With the lower overhead of `EnsembleGPUKernel` API, these calculations can be do The example below provides a way to calculate the expectation time-series of a linear SDE: ```@example kernel_sde -using DiffEqGPU, OrdinaryDiffEq, StaticArrays, LinearAlgebra, Statistics +using DiffEqGPU, OrdinaryDiffEq, StochasticDiffEq, StaticArrays, LinearAlgebra, Statistics +using CUDA num_trajectories = 10_000 @@ -22,12 +23,14 @@ prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234) monteprob = EnsembleProblem(prob) -sol = solve(monteprob, GPUEM(), EnsembleGPUKernel(0.0), dt = Float32(1 // 2^8), - trajectories = num_trajectories, adaptive = false) +sol = solve( + monteprob, GPUEM(), EnsembleGPUKernel(CUDA.CUDABackend(), 0.0), + dt = Float32(1 // 2^8), trajectories = num_trajectories, adaptive = false +) sol_array = Array(sol) -ts = sol[1].t +ts = sol.u[1].t us_calc = reshape(mean(sol_array, dims = 3), size(sol_array, 2))