Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand All @@ -36,6 +37,7 @@ Optimisers = "0.4"
OrdinaryDiffEq = "7"
Plots = "1"
SafeTestsets = "0.1"
SciMLBase = "3"
SciMLSensitivity = "7"
StaticArrays = "1"
Statistics = "1"
Expand Down
140 changes: 64 additions & 76 deletions docs/src/examples/bruss.md
Original file line number Diff line number Diff line change
@@ -1,96 +1,84 @@
# GPU-Acceleration of a Stiff Nonlinear Partial Differential Equation

The following is a demonstration of a GPU-accelerated implicit solve of a stiff
nonlinear partial differential equation (the Brusselator model):
The following is a demonstration of a GPU-accelerated implicit solve of a
stiff nonlinear partial differential equation (the Brusselator model). The
RHS is written as a pure broadcast over the state arrays — the periodic
5-point Laplacian uses `circshift`, the reaction terms are element-wise,
and the spatially-varying source term `brusselator_f` is evaluated against
precomputed `x`/`y` grids — so no scalar indexing into the GPU array is
needed. We assert that explicitly with `CUDA.allowscalar(false)`: any
accidental scalar GPU read or write during the solve will raise.

```@example bruss
using OrdinaryDiffEq, CUDA, LinearAlgebra

const N = 32
const xyd_brusselator = range(0, stop = 1, length = N)
brusselator_f(x, y, t) = (((x - 0.3)^2 + (y - 0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.0
limit(a, N) = a == N + 1 ? 1 : a == 0 ? N : a
kernel_u! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
@inline function (du, u, A, B, α, II, I, t)
i, j = Tuple(I)
x = xyd[I[1]]
y = xyd[I[2]]
ip1 = limit(i + 1, N)
im1 = limit(i - 1, N)
jp1 = limit(j + 1, N)
jm1 = limit(j - 1, N)
du[II[i, j, 1]] = α * (u[II[im1, j, 1]] + u[II[ip1, j, 1]] + u[II[i, jp1, 1]] +
u[II[i, jm1, 1]] - 4u[II[i, j, 1]]) +
B + u[II[i, j, 1]]^2 * u[II[i, j, 2]] - (A + 1) * u[II[i, j, 1]] +
brusselator_f(x, y, t)
end
end
kernel_v! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
@inline function (du, u, A, B, α, II, I, t)
i, j = Tuple(I)
ip1 = limit(i + 1, N)
im1 = limit(i - 1, N)
jp1 = limit(j + 1, N)
jm1 = limit(j - 1, N)
du[II[i, j, 2]] = α * (u[II[im1, j, 2]] + u[II[ip1, j, 2]] + u[II[i, jp1, 2]] +
u[II[i, jm1, 2]] - 4u[II[i, j, 2]]) +
A * u[II[i, j, 1]] - u[II[i, j, 1]]^2 * u[II[i, j, 2]]
end
end
brusselator_2d = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
function (du, u, p, t)
@inbounds begin
ii1 = N^2
ii2 = ii1 + N^2
ii3 = ii2 + 2(N^2)
A = p[1]
B = p[2]
α = p[3] / dx^2
II = LinearIndices((N, N, 2))
kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
return nothing
end
end
const xyd_brusselator = range(0.0f0, stop = 1.0f0, length = N)

# Source term — element-wise pure function so it broadcasts over the
# (x, y) grids without any scalar indexing.
brusselator_f(x, y, t) =
(((x - 0.3f0)^2 + (y - 0.6f0)^2) <= 0.01f0) * (t >= 1.1f0) * 5.0f0

function brusselator_2d!(du, u, p, t)
A, B, α_raw, dx_val, x_grid, y_grid = p
α = α_raw / dx_val^2

u1 = @view u[:, :, 1]
u2 = @view u[:, :, 2]
du1 = @view du[:, :, 1]
du2 = @view du[:, :, 2]

# Periodic boundary 5-point Laplacian via circshift — entirely
# broadcast on the GPU array, no scalar indexing.
u1_im1 = circshift(u1, (1, 0)); u1_ip1 = circshift(u1, (-1, 0))
u1_jm1 = circshift(u1, (0, 1)); u1_jp1 = circshift(u1, (0, -1))
u2_im1 = circshift(u2, (1, 0)); u2_ip1 = circshift(u2, (-1, 0))
u2_jm1 = circshift(u2, (0, 1)); u2_jp1 = circshift(u2, (0, -1))

@. du1 = α * (u1_im1 + u1_ip1 + u1_jm1 + u1_jp1 - 4 * u1) +
B + u1^2 * u2 - (A + 1) * u1 + brusselator_f(x_grid, y_grid, t)
@. du2 = α * (u2_im1 + u2_ip1 + u2_jm1 + u2_jp1 - 4 * u2) +
A * u1 - u1^2 * u2

return nothing
end
p = (3.4, 1.0, 10.0, step(xyd_brusselator))

function init_brusselator_2d(xyd)
N = length(xyd)
u = zeros(N, N, 2)
for I in CartesianIndices((N, N))
x = xyd[I[1]]
y = xyd[I[2]]
u[I, 1] = 22 * (y * (1 - y))^(3 / 2)
u[I, 2] = 27 * (x * (1 - x))^(3 / 2)
u = zeros(Float32, N, N, 2)
for i in 1:N, j in 1:N
x = xyd[i]; y = xyd[j]
u[i, j, 1] = 22 * (y * (1 - y))^(3 / 2)
u[i, j, 2] = 27 * (x * (1 - x))^(3 / 2)
end
u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d, u0, (0.0, 11.5), p)

du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322
u0_cpu = init_brusselator_2d(xyd_brusselator)

du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322
# Pre-compute x/y grids shaped for broadcasting with `u[:, :, k]`.
xg = CuArray(reshape(collect(xyd_brusselator), N, 1))
yg = CuArray(reshape(collect(xyd_brusselator), 1, N))
u0 = CuArray(u0_cpu)
p = (3.4f0, 1.0f0, 10.0f0, step(xyd_brusselator), xg, yg)

prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d, CuArray(u0), (0.0f0, 11.5f0), p,
tstops = [1.1f0])
# Note: Solving requires allowscalar(true) during initialization
# This demonstrates the problem setup for GPU-based PDE solving
CUDA.allowscalar(true)
sol = solve(prob_ode_brusselator_2d_cuda, Rosenbrock23(), save_everystep = false)
# Sanity check the RHS on the GPU under allowscalar(false).
CUDA.allowscalar(false)
du = similar(u0)
brusselator_2d!(du, u0, p, 0.0f0)
Array(du[1:1, 1:1, 1])[1], Array(du[end:end, end:end, 2])[1]

# The brusselator RHS only depends on `t` through `brusselator_f`, which is a
# step source activating at t=1.1 (handled explicitly via `tstops`). Away
# from that discontinuity, df/dt = 0; supply that analytically so the
# implicit solver doesn't AD a non-AD-friendly time-gradient through the
# closure.
brusselator_tgrad(du, u, p, t) = (du .= 0)
brusselator_f_cuda = ODEFunction(brusselator_2d!, tgrad = brusselator_tgrad)
prob_ode_brusselator_2d_cuda = ODEProblem(
brusselator_f_cuda, u0, (0.0f0, 11.5f0), p, tstops = [1.1f0]
)
sol = solve(prob_ode_brusselator_2d_cuda, Rosenbrock23(), save_everystep = false)
sol.retcode
```
12 changes: 7 additions & 5 deletions docs/src/examples/gpu_ensemble_random_decay.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,10 @@ cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads();
dt = 0.01f0,
adaptive = false)

solutions_vector_cpu = cpu_sol_plot.u

# Extract time points from the first trajectory's solution (assuming all are same)
t_vals_cpu = cpu_sol_plot[1].t
t_vals_cpu = solutions_vector_cpu[1].t
num_times_cpu = length(t_vals_cpu)

# Create a matrix to hold the results: rows=time, columns=trajectories
Expand All @@ -136,13 +138,13 @@ ensemble_vals_cpu = fill(NaN32, num_times_cpu, num_trajectories) # Use Float32

# Extract the state value (u[1]) from each trajectory at each time point
for i in 1:num_trajectories
sol = solutions_vector_cpu[i]
# Check if the trajectory simulation was successful and data looks valid
if cpu_sol_plot[i].retcode == ReturnCode.Success &&
length(cpu_sol_plot[i].u) == num_times_cpu
if sol.retcode == ReturnCode.Success && length(sol.u) == num_times_cpu
# sol.u is a Vector{SVector{1, Float32}}. We need the element from each SVector.
ensemble_vals_cpu[:, i] .= getindex.(cpu_sol_plot[i].u, 1)
ensemble_vals_cpu[:, i] .= getindex.(sol.u, 1)
else
@warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(cpu_sol_plot[i].retcode). Length: $(length(cpu_sol_plot[i].u)). Skipping."
@warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(sol.retcode). Length: $(length(sol.u)). Skipping."
# Column remains NaN
end
end
Expand Down
11 changes: 7 additions & 4 deletions docs/src/tutorials/weak_order_conv_sde.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ With the lower overhead of `EnsembleGPUKernel` API, these calculations can be do
The example below provides a way to calculate the expectation time-series of a linear SDE:

```@example kernel_sde
using DiffEqGPU, OrdinaryDiffEq, StaticArrays, LinearAlgebra, Statistics
using DiffEqGPU, OrdinaryDiffEq, StochasticDiffEq, StaticArrays, LinearAlgebra, Statistics
using CUDA

num_trajectories = 10_000

Expand All @@ -22,12 +23,14 @@ prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234)

monteprob = EnsembleProblem(prob)

sol = solve(monteprob, GPUEM(), EnsembleGPUKernel(0.0), dt = Float32(1 // 2^8),
trajectories = num_trajectories, adaptive = false)
sol = solve(
monteprob, GPUEM(), EnsembleGPUKernel(CUDA.CUDABackend(), 0.0),
dt = Float32(1 // 2^8), trajectories = num_trajectories, adaptive = false
)

sol_array = Array(sol)

ts = sol[1].t
ts = sol.u[1].t

us_calc = reshape(mean(sol_array, dims = 3), size(sol_array, 2))

Expand Down
Loading