From a9db6b6b2ee64efe9afa1bf016ca03a82da80c46 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 14 Apr 2026 19:42:04 +0200 Subject: [PATCH 01/19] Replace Tausworthe+LCG RNG with stateless Philox4x32-10 Replace the hybrid Tausworthe+LCG generator with a stateless Philox4x32-10 counter-based RNG using KernelAbstractions, portable across GPU backends. rand! uses batched kernels for common types (Float16/32/64, integers, Bool, Complex) that produce multiple values per Philox call. A generic ElementRNG fallback handles all other types via Julia's Random stdlib dispatch. randn! uses specialized Box-Muller kernels with @fastmath log + sincospi, letting each backend provide optimized intrinsics via @device_override. These can't be made generic like rand! because (1) Random's Ziggurat needs table lookups requiring device overlays, (2) the generic Marsaglia polar fallback has a rejection loop causing warp divergence, and (3) direct Box-Muller produces value pairs that must be batched to stay bandwidth-bound. The old RNG(state::AbstractGPUArray) constructor and seed!(rng, ::Vector) are kept as deprecated methods for backwards compatibility with existing backends. Co-Authored-By: Claude Opus 4.6 (1M context) --- Project.toml | 2 +- lib/JLArrays/src/JLArrays.jl | 6 +- src/GPUArrays.jl | 1 + src/deprecated.jl | 12 ++ src/host/random.jl | 395 ++++++++++++++++++++++++++++------- test/testsuite/random.jl | 78 +++---- 6 files changed, 379 insertions(+), 115 deletions(-) create mode 100644 src/deprecated.jl diff --git a/Project.toml b/Project.toml index 3b7c2e69..ccaaad35 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "GPUArrays" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "11.4.1" +version = "11.5.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 7bba4b95..5903b444 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -542,11 +542,7 @@ using Random const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing) function GPUArrays.default_rng(::Type{<:JLArray}) if GLOBAL_RNG[] === nothing - N = MAXTHREADS - state = JLArray{NTuple{4, UInt32}}(undef, N) - rng = GPUArrays.RNG(state) - Random.seed!(rng) - GLOBAL_RNG[] = rng + GLOBAL_RNG[] = GPUArrays.RNG() end GLOBAL_RNG[] end diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index a35c1ff0..17a4d897 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -38,5 +38,6 @@ include("host/statistics.jl") include("host/sparse.jl") include("host/alloc_cache.jl") +include("deprecated.jl") end # module diff --git a/src/deprecated.jl b/src/deprecated.jl new file mode 100644 index 00000000..a088025a --- /dev/null +++ b/src/deprecated.jl @@ -0,0 +1,12 @@ +# RNG: the old RNG took a GPU state array and Vector{UInt32} seeds. +# The new stateless Philox4x32 RNG doesn't need either. + +function RNG(state::AbstractGPUArray) + Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG() instead", :RNG) + RNG() +end + +function Random.seed!(rng::RNG, seed::Vector{UInt32}) + Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!) + Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt32) : first(seed)) +end diff --git a/src/host/random.jl b/src/host/random.jl index 7759e841..13045cbf 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -1,114 +1,361 @@ # integration with Random stdlib -## device interface -# hybrid Tausworthe and Linear Congruent generator from -# https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch37.html +## Philox4x32-10 counter-based RNG # -# only generates Float32 or Float64 numbers, conversion happens elsewhere +# Stateless: (counter, key) → 4 UInt32 outputs. Each unique counter gives independent +# random values with no shared memory or global state needed. +# +# Reference: Salmon et al., "Parallel Random Numbers: As Easy as 1, 2, 3" (2011) + +const PHILOX_M4x32_0 = 0xD2511F53 +const PHILOX_M4x32_1 = 0xCD9E8D57 +const PHILOX_W32_0 = 0x9E3779B9 +const PHILOX_W32_1 = 0xBB67AE85 + +@inline function philox4x32round(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32}) + mul0 = widemul(PHILOX_M4x32_0, ctr[1]) + mul1 = widemul(PHILOX_M4x32_1, ctr[3]) + hi0 = (mul0 >> 32) % UInt32 + hi1 = (mul1 >> 32) % UInt32 + lo0 = mul0 % UInt32 + lo1 = mul1 % UInt32 + (hi1 ⊻ ctr[2] ⊻ key[1], lo1, hi0 ⊻ ctr[4] ⊻ key[2], lo0) +end + +@inline function philox4x32bumpkey(key::NTuple{2,UInt32}) + (key[1] + PHILOX_W32_0, key[2] + PHILOX_W32_1) +end + +@inline function philox4x32_10(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32}) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key) + ctr +end + + +## Float conversions: unsigned integer → uniform float in (0, 1] + +@inline function u01(::Type{Float32}, u::UInt32) + fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33)) +end + +@inline function u01(::Type{Float64}, u::UInt64) + fma(Float64(u), Float64(2)^(-64), Float64(2)^(-65)) +end + + +## Box-Muller transform using @fastmath log + sincospi +# +# Each backend provides optimized intrinsics via @device_override: +# CUDA: __nv_fast_logf, __nv_sincospif, etc. + +using Base.FastMath -function TausStep(z::Unsigned, S1::Integer, S2::Integer, S3::Integer, M::Unsigned) - b = (((z << S1) ⊻ z) >> S2) - return (((z & M) << S3) ⊻ b) +@inline function boxmuller(::Type{T}, u1::T, u2::T) where T + r = sqrt(T(-2) * FastMath.log_fast(u1)) + s, c = sincospi(2 * u2) + (r * s, r * c) end -LCGStep(z::Unsigned, A::Unsigned, C::Unsigned) = A * z + C -make_rand_num(::Type{Float64}, tmp) = 2.3283064365387e-10 * Float64(tmp) -make_rand_num(::Type{Float32}, tmp) = 2.3283064f-10 * Float32(tmp) -# NOTE: the rng state is often not representable as Float16, so perform this in Float32. -make_rand_num(::Type{Float16}, tmp) = Float16(2.3283064f-10 * Float32(tmp)) +## RNG type -function next_rand(state::NTuple{4, T}) where {T <: Unsigned} - state = ( - TausStep(state[1], Cint(13), Cint(19), Cint(12), T(4294967294)), - TausStep(state[2], Cint(2), Cint(25), Cint(4), T(4294967288)), - TausStep(state[3], Cint(3), Cint(11), Cint(17), T(4294967280)), - LCGStep(state[4], T(1664525), T(1013904223)) - ) - tmp = (state[1] ⊻ state[2] ⊻ state[3] ⊻ state[4]) - return state, tmp +mutable struct RNG <: AbstractRNG + seed::UInt32 + counter::UInt32 end -function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T - stateful_rand = next_rand(randstate[threadid]) - randstate[threadid] = stateful_rand[1] - return make_rand_num(T, stateful_rand[2]) +RNG() = RNG(rand(Random.RandomDevice(), UInt32), UInt32(0)) +RNG(seed::Integer) = RNG(seed % UInt32, UInt32(0)) + +# return an instance of GPUArrays.RNG suitable for the requested array type +default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE + +Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt32); rng.counter = 0; rng) +Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt32; rng.counter = 0; rng) + +function advance_counter!(rng::RNG) + new_counter = Int64(rng.counter) + 1 + overflow, remainder = fldmod(new_counter, typemax(UInt32)) + rng.seed += overflow % UInt32 + rng.counter = remainder % UInt32 +end + + +## Specialized rand! kernels for common types +# +# Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch +# multiple values per work item to use all 4 outputs efficiently. +# Types that consume 1 UInt32 each get 4 values per call; types that need +# 2 UInt32s (UInt64/Float64) get 2 values per call; complex types that need +# 4 UInt32s get 1 value per call. + +# Convert Philox UInt32 outputs to N values of type T +@inline philox_to_vals(::Type{Float16}, a1, a2, a3, a4) = + (Float16(u01(Float32, a1)), Float16(u01(Float32, a2)), + Float16(u01(Float32, a3)), Float16(u01(Float32, a4))) +@inline philox_to_vals(::Type{Float32}, a1, a2, a3, a4) = + (u01(Float32, a1), u01(Float32, a2), u01(Float32, a3), u01(Float32, a4)) +for T in (UInt8, UInt16, UInt32, Int8, Int16, Int32, Bool) + @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) = + (a1 % $T, a2 % $T, a3 % $T, a4 % $T) +end +@inline philox_to_vals(::Type{Float64}, a1, a2, a3, a4) = + (u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)) +for T in (UInt64, Int64) + @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) = + ((UInt64(a1) | UInt64(a2) << 32) % $T, + (UInt64(a3) | UInt64(a4) << 32) % $T) end +@inline philox_to_vals(::Type{Complex{Float32}}, a1, a2, a3, a4) = + (complex(u01(Float32, a1), u01(Float32, a2)), + complex(u01(Float32, a3), u01(Float32, a4))) +@inline philox_to_vals(::Type{Complex{Float64}}, a1, a2, a3, a4) = + (complex(u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)),) + +# Number of output values per Philox4x32 call +vals_per_call(::Type{T}) where T = sizeof(T) <= 4 ? 4 : sizeof(T) <= 8 ? 2 : 1 +vals_per_call(::Type{Complex{T}}) where T = sizeof(T) <= 4 ? 2 : 1 -function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer - result = zero(T) - if sizeof(T) >= 4 - for _ in 1:sizeof(T) >> 2 - randstate[threadid], y = next_rand(randstate[threadid]) - result = reinterpret(T, (|)(promote(result << 32, y)...)) +# Batched kernel: N values per work item from one Philox call +@kernel function rand_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + N = vals_per_call(T) + idx = N * gid + len = length(A) + if idx <= len + vals = philox_to_vals(T, philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0)))...) + for j in 1:N + @inbounds A[idx - N + j] = vals[j] + end + elseif idx - N < len + vals = philox_to_vals(T, philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0)))...) + for j in 1:min(N, len - idx + N) + @inbounds A[idx - N + j] = vals[j] end - else - randstate[threadid], y = next_rand(randstate[threadid]) - x = reinterpret(Int32, y) - result = convert(T, x & typemax(T)) end - result end -# support for complex numbers +# Types with specialized batched kernels +const BatchedRandTypes = Union{ + Float16, Float32, Float64, Complex{Float32}, Complex{Float64}, + Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64} -function gpu_rand(::Type{Complex{T}}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T - re = gpu_rand(T, threadid, randstate) - im = gpu_rand(T, threadid, randstate) - return complex(re, im) +function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes + isempty(A) && return A + N = vals_per_call(T) + rand_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A; + ndrange=cld(length(A), N)) + advance_counter!(rng) + A end -## host interface +## Generic rand! fallback via ElementRNG +# +# For types without specialized kernels (Float16, integers, Bool, UInt128, etc.), +# an immutable ElementRNG wraps a Philox4x32 call as an AbstractRNG. Julia's Random +# stdlib then handles all type conversions automatically via `rand(rng, T)`. +# +# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer to +# it. This avoids mutable struct heap allocation on GPU. -struct RNG <: AbstractRNG - state::AbstractGPUArray{NTuple{4,UInt32},1} +struct ElementRNG <: AbstractRNG + seed::UInt32 + counter::UInt32 + gid::UInt32 + subctr_ptr::Ptr{UInt32} end -# return an instance of GPUArrays.RNG suitable for the requested array type -default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE +@inline Random.rng_native_52(::ElementRNG) = UInt64 -make_seed(rng::RNG) = make_seed(rng, rand(UInt)) -function make_seed(rng::RNG, n::Integer) - rand(MersenneTwister(n), UInt32, sizeof(rng.state)÷sizeof(UInt32)) +@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt64}) + sc = unsafe_load(rng.subctr_ptr) + UInt32(1) + unsafe_store!(rng.subctr_ptr, sc) + a1, a2, _, _ = philox4x32_10( + (rng.gid, sc, rng.counter, UInt32(0)), + (rng.seed, UInt32(0))) + UInt64(a1) | UInt64(a2) << 32 end -Random.seed!(rng::RNG) = Random.seed!(rng, make_seed(rng)) -Random.seed!(rng::RNG, seed::Integer) = Random.seed!(rng, make_seed(rng, seed)) -function Random.seed!(rng::RNG, seed::Vector{UInt32}) - copyto!(rng.state, collect(reinterpret(NTuple{4, UInt32}, seed))) - return +@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt128}) + UInt128(rand(rng, Random.SamplerType{UInt64}())) | + UInt128(rand(rng, Random.SamplerType{UInt64}())) << 64 +end + +@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{T}) where T <: Union{Bool,Base.BitInteger} = + rand(rng, Random.SamplerType{UInt64}()) % T + +# @inline overrides for Random stdlib entry points that aren't inlined by default. +# Without these, the dispatch chain stays opaque and the Ref won't be optimized. +@inline Random.rand(rng::ElementRNG, ::Type{T}) where {T} = + rand(rng, Random.Sampler(typeof(rng), T, Val(1))) +@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{Complex{T}}) where {T<:Real} = + complex(rand(rng, T), rand(rng, T)) + +@kernel function rand_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + if gid <= length(A) + subctr = Ref{UInt32}(0) + rng = ElementRNG(seed, counter, gid % UInt32, + Base.unsafe_convert(Ptr{UInt32}, subctr)) + @inbounds A[gid] = rand(rng, T) + end end function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number isempty(A) && return A - @kernel function rand!(a, randstate) - idx = @index(Global, Linear) - @inbounds a[idx] = gpu_rand(T, ((idx-1)%length(randstate)+1), randstate) + rand_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A)) + advance_counter!(rng) + A +end + + +## randn! kernels +# +# Unlike rand!, randn! uses specialized batched kernels instead of the generic +# ElementRNG approach. This is because: +# +# 1. Random's default `randn` uses the Ziggurat algorithm with table lookups +# (`ki`, `wi`, `fi` arrays) that aren't accessible on GPU without device +# overlays (which are backend-specific, not available in KernelAbstractions). +# +# 2. The generic fallback `randn(rng, T::AbstractFloat)` uses Marsaglia's polar +# Box-Muller variant with a rejection loop (`while true ... 0 < s < 1`), +# which causes severe warp divergence on GPU (~2x slower). +# +# 3. Direct (non-rejection) Box-Muller naturally produces value *pairs* from +# sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) is key +# to staying memory-bandwidth-bound; a 1-per-work-item generic kernel +# discards half the output. +# +# Box-Muller: each Philox call produces 2 normal values from 2 uniform values. +# For <=32-bit floats: 4 values per call (2 Box-Muller pairs from 4 UInt32). +# For >32-bit floats: 2 values per call (1 Box-Muller pair from 2 UInt64). + +# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)), +# so each component has variance 1/2 +@inline function boxmuller_complex(::Type{T}, u1::T, u2::T) where T + r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1))) + s, c = sincospi(2 * u2) + complex(r * s, r * c) +end + +@kernel function randn_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + idx = 4 * gid + len = length(A) + if idx <= len + a1, a2, a3, a4 = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2))) + n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4))) + @inbounds A[idx - 3] = n1 + @inbounds A[idx - 2] = n2 + @inbounds A[idx - 1] = n3 + @inbounds A[idx] = n4 + elseif idx - 3 <= len + a1, a2, a3, a4 = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2))) + n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4))) + vals = (n1, n2, n3, n4) + for j in 1:min(4, len - idx + 4) + @inbounds A[idx - 4 + j] = vals[j] + end + end +end + +@kernel function randn_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + idx = 2 * gid + len = length(A) + if idx <= len + a1, a2, a3, a4 = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + n1, n2 = boxmuller(T, + T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)), + T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))) + @inbounds A[idx - 1] = n1 + @inbounds A[idx] = n2 + elseif idx - 1 <= len + a1, a2, a3, a4 = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + n1, _ = boxmuller(T, + T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)), + T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))) + @inbounds A[len] = n1 + end +end + +@kernel function randn_complex_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T + gid = @index(Global, Linear) + idx = 2 * gid + len = length(A) + if idx <= len + a1, a2, a3, a4 = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + @inbounds A[idx - 1] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2))) + @inbounds A[idx] = boxmuller_complex(T, T(u01(Float32, a3)), T(u01(Float32, a4))) + elseif idx - 1 <= len + a1, a2, _, _ = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + @inbounds A[len] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2))) + end +end + +@kernel function randn_complex_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T + gid = @index(Global, Linear) + if gid <= length(A) + a1, a2, a3, a4 = philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + (seed, UInt32(0))) + U1 = T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)) + U2 = T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)) + @inbounds A[gid] = boxmuller_complex(T, U1, U2) + end +end + +function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: AbstractFloat + isempty(A) && return A + if sizeof(T) <= 4 + randn_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 4)) + else + randn_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2)) end - rand!(get_backend(A))(A, rng.state; ndrange = size(A)) + advance_counter!(rng) A end -function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number +function Random.randn!(rng::RNG, A::AnyGPUArray{Complex{T}}) where T <: AbstractFloat isempty(A) && return A - threads = (length(A) - 1) ÷ 2 + 1 - @kernel function randn!(a, randstates) - i = @index(Global, Linear) - threadidx = @index(Local, Linear) - idx = 2*(i - 1) + 1 - U1 = gpu_rand(T, threadidx, randstates) - U2 = gpu_rand(T, threadidx, randstates) - Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2) - Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2) - @inbounds a[idx] = Z0 - if idx + 1 <= length(a) - @inbounds a[idx + 1] = Z1 - end + if sizeof(T) <= 4 + randn_complex_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2)) + else + randn_complex_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A)) end - kernel = randn!(get_backend(A)) - kernel(A, rng.state; ndrange=threads) + advance_counter!(rng) A end diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl index 5e00682c..b688dbde 100644 --- a/test/testsuite/random.jl +++ b/test/testsuite/random.jl @@ -6,32 +6,25 @@ end cpu_rng = Random.default_rng() - SEEDING_BROKEN = (rng != cpu_rng) && !contains(string(AT), "JLArray") - @testset "rand" begin # uniform - @testset "$d $T" for T in eltypes, d in (10, (10, 10), (1024, 1024)) + @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3)) A = AT{T}(undef, d) B = copy(A) rand!(rng, A) rand!(rng, B) @test Array(A) != Array(B) - A = AT(rand(T, d)) - B = AT(rand(T, d)) - - Random.seed!(rng) - Random.seed!(rng, 1) - rand!(rng, A) - Random.seed!(rng, 1) - rand!(rng, B) - - @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > length(rng.state)) - if rng != cpu_rng rand!(cpu_rng, A) end end + # empty arrays + B = AT{Float32}(undef, 0) + rand!(rng, B) + @test isempty(Array(B)) + + # Bool coverage A = AT{Bool}(undef, 1024) fill!(A, false) rand!(rng, A) @@ -39,43 +32,58 @@ fill!(A, true) rand!(rng, A) @test false in Array(A) - - # AT of length 0 - B = AT{Float32}(undef, 0) - fill!(B, 1f0) - rand!(rng, B) - @test isempty(Array(B)) end @testset "randn" begin # normally-distributed - # XXX: randn calls sqrt, and Base's sqrt(::Complex) performs - # checked type conversions that throw boxed numbers. - @testset "$d $T" for T in filter(isrealfloattype, eltypes), d in (2, (2, 2), (1024, 1024)) + @testset "$T $d" for T in filter(isrealfloattype, eltypes), + d in (2, (2,2), (2,2,2), 3, (3,3)) A = AT{T}(undef, d) B = copy(A) randn!(rng, A) randn!(rng, B) @test Array(A) != Array(B) - A = AT(rand(T, d)) - B = AT(rand(T, d)) - - Random.seed!(rng) - Random.seed!(rng, 1) - randn!(rng, A) - Random.seed!(rng, 1) - randn!(rng, B) - @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > (2 * length(rng.state))) - if rng != cpu_rng randn!(cpu_rng, A) end end - # AT of length 0 + # complex randn + for T in filter(t -> t <: Complex && isrealfloattype(real(t)), eltypes) + A = AT{T}(undef, 8) + randn!(rng, A) + @test !any(isnan, Array(A)) + end + + # empty arrays A = AT{Float32}(undef, 0) - fill!(A, 1f0) randn!(rng, A) @test isempty(Array(A)) + + # Box-Muller should not produce infinities + if Float32 in eltypes + @test isfinite(maximum(randn!(rng, AT{Float32}(undef, 2^20)))) + end + end + + @testset "seeding" begin + if rng != cpu_rng + # seeding should produce reproducible results + for T in (Float32, Float64) + if T in eltypes + Random.seed!(rng, 1) + A = rand!(rng, AT{T}(undef, 100)) + Random.seed!(rng, 1) + B = rand!(rng, AT{T}(undef, 100)) + @test Array(A) == Array(B) + + Random.seed!(rng, 1) + A = randn!(rng, AT{T}(undef, 100)) + Random.seed!(rng, 1) + B = randn!(rng, AT{T}(undef, 100)) + @test Array(A) == Array(B) + end + end + end end end From 1546bc8731dce7759073d01cfe152851457ea134 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:07:31 +0200 Subject: [PATCH 02/19] Simplify advance_counter! to use native UInt32 wrap Previously used fldmod with typemax(UInt32) = 2^32 - 1, which wraps the counter to 1 (not 0) when it saturates. Not a correctness issue today since we only advance by 1, but the logic was confusing. Replace with a plain increment that lets UInt32 overflow naturally. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 13045cbf..d007cc12 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -1,8 +1,7 @@ # integration with Random stdlib - ## Philox4x32-10 counter-based RNG -# + # Stateless: (counter, key) → 4 UInt32 outputs. Each unique counter gives independent # random values with no shared memory or global state needed. # @@ -54,7 +53,7 @@ end ## Box-Muller transform using @fastmath log + sincospi -# + # Each backend provides optimized intrinsics via @device_override: # CUDA: __nv_fast_logf, __nv_sincospif, etc. @@ -84,15 +83,14 @@ Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt32); rng.co Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt32; rng.counter = 0; rng) function advance_counter!(rng::RNG) - new_counter = Int64(rng.counter) + 1 - overflow, remainder = fldmod(new_counter, typemax(UInt32)) - rng.seed += overflow % UInt32 - rng.counter = remainder % UInt32 + rng.counter += one(UInt32) + rng.counter == 0 && (rng.seed += one(UInt32)) + rng end ## Specialized rand! kernels for common types -# + # Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch # multiple values per work item to use all 4 outputs efficiently. # Types that consume 1 UInt32 each get 4 values per call; types that need @@ -167,7 +165,7 @@ end ## Generic rand! fallback via ElementRNG -# + # For types without specialized kernels (Float16, integers, Bool, UInt128, etc.), # an immutable ElementRNG wraps a Philox4x32 call as an AbstractRNG. Julia's Random # stdlib then handles all type conversions automatically via `rand(rng, T)`. @@ -227,7 +225,7 @@ end ## randn! kernels -# + # Unlike rand!, randn! uses specialized batched kernels instead of the generic # ElementRNG approach. This is because: # @@ -359,6 +357,9 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{Complex{T}}) where T <: Abstract A end + +## host API + # allow use of CPU RNGs without scalar iteration Random.rand!(rng::AbstractRNG, A::AnyGPUArray) = copyto!(A, rand(rng, eltype(A), size(A)...)) From 71adaa2ad2e074d7ffd1ddc4a66c123111caac74 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:08:23 +0200 Subject: [PATCH 03/19] Widen RNG seed to UInt64 for full Philox key entropy Previously the seed was UInt32 and the Philox key's upper lane was always zero, so different 64-bit seeds colliding mod 2^32 produced identical streams. Store the seed as UInt64 and split both halves into the key's two lanes via a new philox_key helper. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/deprecated.jl | 2 +- src/host/random.jl | 37 ++++++++++++++++++++----------------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index a088025a..62ef634f 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -8,5 +8,5 @@ end function Random.seed!(rng::RNG, seed::Vector{UInt32}) Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!) - Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt32) : first(seed)) + Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt64) : first(seed)) end diff --git a/src/host/random.jl b/src/host/random.jl index d007cc12..fe054fc4 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -69,25 +69,28 @@ end ## RNG type mutable struct RNG <: AbstractRNG - seed::UInt32 + seed::UInt64 counter::UInt32 end -RNG() = RNG(rand(Random.RandomDevice(), UInt32), UInt32(0)) -RNG(seed::Integer) = RNG(seed % UInt32, UInt32(0)) +RNG() = RNG(rand(Random.RandomDevice(), UInt64), UInt32(0)) +RNG(seed::Integer) = RNG(seed % UInt64, UInt32(0)) # return an instance of GPUArrays.RNG suitable for the requested array type default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE -Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt32); rng.counter = 0; rng) -Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt32; rng.counter = 0; rng) +Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt64); rng.counter = 0; rng) +Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt64; rng.counter = 0; rng) function advance_counter!(rng::RNG) rng.counter += one(UInt32) - rng.counter == 0 && (rng.seed += one(UInt32)) + rng.counter == 0 && (rng.seed += one(UInt64)) rng end +# Split the 64-bit seed into the two 32-bit lanes of the Philox key. +@inline philox_key(seed::UInt64) = (seed % UInt32, (seed >> 32) % UInt32) + ## Specialized rand! kernels for common types @@ -135,14 +138,14 @@ vals_per_call(::Type{Complex{T}}) where T = sizeof(T) <= 4 ? 2 : 1 if idx <= len vals = philox_to_vals(T, philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0)))...) + philox_key(seed))...) for j in 1:N @inbounds A[idx - N + j] = vals[j] end elseif idx - N < len vals = philox_to_vals(T, philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0)))...) + philox_key(seed))...) for j in 1:min(N, len - idx + N) @inbounds A[idx - N + j] = vals[j] end @@ -174,7 +177,7 @@ end # it. This avoids mutable struct heap allocation on GPU. struct ElementRNG <: AbstractRNG - seed::UInt32 + seed::UInt64 counter::UInt32 gid::UInt32 subctr_ptr::Ptr{UInt32} @@ -187,7 +190,7 @@ end unsafe_store!(rng.subctr_ptr, sc) a1, a2, _, _ = philox4x32_10( (rng.gid, sc, rng.counter, UInt32(0)), - (rng.seed, UInt32(0))) + philox_key(rng.seed)) UInt64(a1) | UInt64(a2) << 32 end @@ -261,7 +264,7 @@ end if idx <= len a1, a2, a3, a4 = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2))) n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4))) @inbounds A[idx - 3] = n1 @@ -271,7 +274,7 @@ end elseif idx - 3 <= len a1, a2, a3, a4 = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2))) n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4))) vals = (n1, n2, n3, n4) @@ -288,7 +291,7 @@ end if idx <= len a1, a2, a3, a4 = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) n1, n2 = boxmuller(T, T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)), T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))) @@ -297,7 +300,7 @@ end elseif idx - 1 <= len a1, a2, a3, a4 = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) n1, _ = boxmuller(T, T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)), T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))) @@ -312,13 +315,13 @@ end if idx <= len a1, a2, a3, a4 = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) @inbounds A[idx - 1] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2))) @inbounds A[idx] = boxmuller_complex(T, T(u01(Float32, a3)), T(u01(Float32, a4))) elseif idx - 1 <= len a1, a2, _, _ = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) @inbounds A[len] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2))) end end @@ -328,7 +331,7 @@ end if gid <= length(A) a1, a2, a3, a4 = philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - (seed, UInt32(0))) + philox_key(seed)) U1 = T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)) U2 = T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)) @inbounds A[gid] = boxmuller_complex(T, U1, U2) From 1affc0923bdde7a03dc64408d34074dc136a41bb Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:09:28 +0200 Subject: [PATCH 04/19] Use bit-pattern trick for u01(Float64, UInt64) Construct the Float64 bit-pattern directly instead of calling the expensive Float64(::UInt64) conversion and a subsequent fma. On consumer GPUs with 1:64 FP64 throughput this is a ~40x speedup for rand!(F64) on a 100M-element array (0.9 vs 38 ms on an RTX 5080, now memory-bound). Force the low mantissa bit so the result is strictly in (0, 1), which is required by Box-Muller's log(u). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/host/random.jl b/src/host/random.jl index fe054fc4..dfa53760 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -48,7 +48,11 @@ end end @inline function u01(::Type{Float64}, u::UInt64) - fma(Float64(u), Float64(2)^(-64), Float64(2)^(-65)) + # Bit-pattern construction avoids the expensive Float64(::UInt64) conversion + # and fma on consumer GPUs (where FP64 throughput is as low as 1:64). The + # low bit of the mantissa is forced set so the result is strictly in (0, 1), + # which is required by Box-Muller's log(u). + reinterpret(Float64, ((u >> 12) | 0x1) | 0x3ff0000000000000) - 1.0 end From f34d77687aa8e6d0e5def390d9f7a7f4bf6d1611 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:11:14 +0200 Subject: [PATCH 05/19] Unify randn! kernels via philox_to_normals Replace the four randn_*_kernel!'s with a single randn_batched_kernel! mirroring rand_batched_kernel!, parameterized on the element type via a philox_to_normals helper (analogous to philox_to_vals). Same performance, half the kernel code. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 139 +++++++++++++++------------------------------ 1 file changed, 46 insertions(+), 93 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index dfa53760..0f420246 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -231,10 +231,9 @@ function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number end -## randn! kernels +## randn! kernel -# Unlike rand!, randn! uses specialized batched kernels instead of the generic -# ElementRNG approach. This is because: +# Unlike rand!, randn! can't use the generic ElementRNG approach: # # 1. Random's default `randn` uses the Ziggurat algorithm with table lookups # (`ki`, `wi`, `fi` arrays) that aren't accessible on GPU without device @@ -249,117 +248,71 @@ end # to staying memory-bandwidth-bound; a 1-per-work-item generic kernel # discards half the output. # -# Box-Muller: each Philox call produces 2 normal values from 2 uniform values. -# For <=32-bit floats: 4 values per call (2 Box-Muller pairs from 4 UInt32). -# For >32-bit floats: 2 values per call (1 Box-Muller pair from 2 UInt64). +# Instead, reuse the batched kernel structure from rand!: one kernel parametric +# over the element type, with `philox_to_normals` producing N values per call +# (matching `vals_per_call(T)`). -# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)), -# so each component has variance 1/2 +# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)), so each component +# has variance 1/2. @inline function boxmuller_complex(::Type{T}, u1::T, u2::T) where T r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1))) s, c = sincospi(2 * u2) complex(r * s, r * c) end -@kernel function randn_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T - gid = @index(Global, Linear) - idx = 4 * gid - len = length(A) - if idx <= len - a1, a2, a3, a4 = philox4x32_10( - (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2))) - n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4))) - @inbounds A[idx - 3] = n1 - @inbounds A[idx - 2] = n2 - @inbounds A[idx - 1] = n3 - @inbounds A[idx] = n4 - elseif idx - 3 <= len - a1, a2, a3, a4 = philox4x32_10( - (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2))) - n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4))) - vals = (n1, n2, n3, n4) - for j in 1:min(4, len - idx + 4) - @inbounds A[idx - 4 + j] = vals[j] - end +# Convert Philox UInt32 outputs to N normally-distributed values of type T. +for T in (Float16, Float32) + @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4) + n1, n2 = boxmuller($T, $T(u01(Float32, a1)), $T(u01(Float32, a2))) + n3, n4 = boxmuller($T, $T(u01(Float32, a3)), $T(u01(Float32, a4))) + (n1, n2, n3, n4) end end - -@kernel function randn_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T - gid = @index(Global, Linear) - idx = 2 * gid - len = length(A) - if idx <= len - a1, a2, a3, a4 = philox4x32_10( - (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - n1, n2 = boxmuller(T, - T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)), - T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))) - @inbounds A[idx - 1] = n1 - @inbounds A[idx] = n2 - elseif idx - 1 <= len - a1, a2, a3, a4 = philox4x32_10( - (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - n1, _ = boxmuller(T, - T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)), - T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))) - @inbounds A[len] = n1 - end +@inline function philox_to_normals(::Type{Float64}, a1, a2, a3, a4) + boxmuller(Float64, + u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)) +end +@inline function philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) + (boxmuller_complex(Float32, u01(Float32, a1), u01(Float32, a2)), + boxmuller_complex(Float32, u01(Float32, a3), u01(Float32, a4))) +end +@inline function philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) + (boxmuller_complex(Float64, + u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)),) end -@kernel function randn_complex_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T +@kernel function randn_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T gid = @index(Global, Linear) - idx = 2 * gid + N = vals_per_call(T) + idx = N * gid len = length(A) if idx <= len - a1, a2, a3, a4 = philox4x32_10( - (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - @inbounds A[idx - 1] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2))) - @inbounds A[idx] = boxmuller_complex(T, T(u01(Float32, a3)), T(u01(Float32, a4))) - elseif idx - 1 <= len - a1, a2, _, _ = philox4x32_10( + vals = philox_to_normals(T, philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - @inbounds A[len] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2))) - end -end - -@kernel function randn_complex_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T - gid = @index(Global, Linear) - if gid <= length(A) - a1, a2, a3, a4 = philox4x32_10( + philox_key(seed))...) + for j in 1:N + @inbounds A[idx - N + j] = vals[j] + end + elseif idx - N < len + vals = philox_to_normals(T, philox4x32_10( (gid % UInt32, UInt32(0), counter, UInt32(0)), - philox_key(seed)) - U1 = T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)) - U2 = T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)) - @inbounds A[gid] = boxmuller_complex(T, U1, U2) + philox_key(seed))...) + for j in 1:min(N, len - idx + N) + @inbounds A[idx - N + j] = vals[j] + end end end -function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: AbstractFloat - isempty(A) && return A - if sizeof(T) <= 4 - randn_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 4)) - else - randn_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2)) - end - advance_counter!(rng) - A -end +const BatchedRandnTypes = Union{Float16, Float32, Float64, + Complex{Float32}, Complex{Float64}} -function Random.randn!(rng::RNG, A::AnyGPUArray{Complex{T}}) where T <: AbstractFloat +function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes isempty(A) && return A - if sizeof(T) <= 4 - randn_complex_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2)) - else - randn_complex_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A)) - end + N = vals_per_call(T) + randn_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A; + ndrange=cld(length(A), N)) advance_counter!(rng) A end From 7251da1d86f9e7a4d1615d4a77da7978645a0924 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:12:20 +0200 Subject: [PATCH 06/19] Add Int128/UInt128 to the batched rand! path vals_per_call already returned 1 for 16-byte types; add the corresponding philox_to_vals method and extend the union so rand! dispatches to the batched kernel rather than the generic fallback. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/host/random.jl b/src/host/random.jl index 0f420246..007d9637 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -122,6 +122,10 @@ for T in (UInt64, Int64) ((UInt64(a1) | UInt64(a2) << 32) % $T, (UInt64(a3) | UInt64(a4) << 32) % $T) end +for T in (UInt128, Int128) + @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) = + ((UInt128(a1) | UInt128(a2) << 32 | UInt128(a3) << 64 | UInt128(a4) << 96) % $T,) +end @inline philox_to_vals(::Type{Complex{Float32}}, a1, a2, a3, a4) = (complex(u01(Float32, a1), u01(Float32, a2)), complex(u01(Float32, a3), u01(Float32, a4))) @@ -159,7 +163,8 @@ end # Types with specialized batched kernels const BatchedRandTypes = Union{ Float16, Float32, Float64, Complex{Float32}, Complex{Float64}, - Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64} + Bool, Int8, Int16, Int32, Int64, Int128, + UInt8, UInt16, UInt32, UInt64, UInt128} function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes isempty(A) && return A From a295992fe882080f0146fe567b9d8b0c3f10c21d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:14:35 +0200 Subject: [PATCH 07/19] Fold boxmuller_complex into boxmuller via type dispatch Both variants had the same (::Type{T}, u1, u2) signature with different output types; dispatch on Complex{T} vs T instead of using a separate name, and keep both methods next to each other at the top of the file. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 007d9637..7094815f 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -63,12 +63,20 @@ end using Base.FastMath -@inline function boxmuller(::Type{T}, u1::T, u2::T) where T +@inline function boxmuller(::Type{T}, u1::T, u2::T) where T <: AbstractFloat r = sqrt(T(-2) * FastMath.log_fast(u1)) s, c = sincospi(2 * u2) (r * s, r * c) end +# For complex normals each component has variance 1/2, so the radius is +# sqrt(-log(U)) rather than sqrt(-2*log(U)). +@inline function boxmuller(::Type{Complex{T}}, u1::T, u2::T) where T <: AbstractFloat + r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1))) + s, c = sincospi(2 * u2) + complex(r * s, r * c) +end + ## RNG type @@ -257,14 +265,6 @@ end # over the element type, with `philox_to_normals` producing N values per call # (matching `vals_per_call(T)`). -# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)), so each component -# has variance 1/2. -@inline function boxmuller_complex(::Type{T}, u1::T, u2::T) where T - r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1))) - s, c = sincospi(2 * u2) - complex(r * s, r * c) -end - # Convert Philox UInt32 outputs to N normally-distributed values of type T. for T in (Float16, Float32) @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4) @@ -279,11 +279,11 @@ end u01(Float64, UInt64(a3) | UInt64(a4) << 32)) end @inline function philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) - (boxmuller_complex(Float32, u01(Float32, a1), u01(Float32, a2)), - boxmuller_complex(Float32, u01(Float32, a3), u01(Float32, a4))) + (boxmuller(Complex{Float32}, u01(Float32, a1), u01(Float32, a2)), + boxmuller(Complex{Float32}, u01(Float32, a3), u01(Float32, a4))) end @inline function philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) - (boxmuller_complex(Float64, + (boxmuller(Complex{Float64}, u01(Float64, UInt64(a1) | UInt64(a2) << 32), u01(Float64, UInt64(a3) | UInt64(a4) << 32)),) end From 77b713c9e3383a5f0e926fc62b04c07ef0259bb7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:24:03 +0200 Subject: [PATCH 08/19] Drop CPU-RNG-on-GPU-array fallback Allowed any AbstractRNG to fill a GPUArray by generating values on the CPU and copying them over. That hides accidentally-slow code paths; let unsupported (rng, eltype) combinations fail explicitly via MethodError instead. Also drop the matching cpu_rng exercise from the test suite. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 7 ------- test/testsuite/random.jl | 11 +---------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 7094815f..b8b510b9 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -323,10 +323,3 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes end -## host API - -# allow use of CPU RNGs without scalar iteration -Random.rand!(rng::AbstractRNG, A::AnyGPUArray) = - copyto!(A, rand(rng, eltype(A), size(A)...)) -Random.randn!(rng::AbstractRNG, A::AnyGPUArray) = - copyto!(A, randn(rng, eltype(A), size(A)...)) diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl index b688dbde..17c3875e 100644 --- a/test/testsuite/random.jl +++ b/test/testsuite/random.jl @@ -4,7 +4,6 @@ else Random.default_rng() end - cpu_rng = Random.default_rng() @testset "rand" begin # uniform @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3)) @@ -13,10 +12,6 @@ rand!(rng, A) rand!(rng, B) @test Array(A) != Array(B) - - if rng != cpu_rng - rand!(cpu_rng, A) - end end # empty arrays @@ -42,10 +37,6 @@ randn!(rng, A) randn!(rng, B) @test Array(A) != Array(B) - - if rng != cpu_rng - randn!(cpu_rng, A) - end end # complex randn @@ -67,7 +58,7 @@ end @testset "seeding" begin - if rng != cpu_rng + if AT <: AbstractGPUArray # seeding should produce reproducible results for T in (Float32, Float64) if T in eltypes From a631a57ad057f2181a07bec784546d9c8a59142b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:27:45 +0200 Subject: [PATCH 09/19] Add generic randn! fallback via ElementRNG Mirrors the rand! structure: batched kernel for Float16/32/64 and their complex variants, generic ElementRNG-based kernel for every other AbstractFloat / Complex{<:AbstractFloat}. Non-batched types go through Random's generic polar Box-Muller fallback, which is GPU-safe (ziggurat is never reached because its signatures cover only Float16/32/64, which dispatch to the batched kernel). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/host/random.jl b/src/host/random.jl index b8b510b9..b8f04faa 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -323,3 +323,35 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes end +## Generic randn! fallback via ElementRNG +# +# For AbstractFloats outside BatchedRandnTypes (BFloat16, user-defined float +# types, etc.) we route through Random's `randn(rng, T)`. The reachable methods +# from there are: +# +# - `randn(rng, ::BitFloatType)` (Float16/32/64) — ziggurat, uses global `wi` +# /`ki`/`fi` tables that aren't device-accessible. Never reached here because +# those types dispatch to the batched kernel above. +# - `randn(rng, ::Type{Complex{T}})` — recurses into `randn(rng, T)`. +# - `randn(rng, ::Type{T}) where T<:AbstractFloat` — Marsaglia polar Box-Muller +# rejection loop. GPU-safe (only calls `rand(rng, T)`) but warp-divergent. + +@kernel function randn_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + if gid <= length(A) + subctr = Ref{UInt32}(0) + rng = ElementRNG(seed, counter, gid % UInt32, + Base.unsafe_convert(Ptr{UInt32}, subctr)) + @inbounds A[gid] = randn(rng, T) + end +end + +function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Union{AbstractFloat, + Complex{<:AbstractFloat}} + isempty(A) && return A + randn_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A)) + advance_counter!(rng) + A +end + + From 7a901d5cdfc4de32497759673813d71b38e62a08 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:31:25 +0200 Subject: [PATCH 10/19] Refresh random.jl comments - u01 no longer claims "(0, 1]"; the Float64 path is strictly (0, 1). - The generic rand! fallback comment listed types that have since been moved into BatchedRandTypes. - The randn! intro claimed randn! couldn't use ElementRNG; there's now a generic ElementRNG fallback too. Reframe the three-point rationale as "why the batched path exists" rather than "why there's no generic path". - Clarify the per-call batching table to cover Int128/UInt128 and Complex{Float32} explicitly. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 49 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index b8f04faa..229318f8 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -41,7 +41,8 @@ end end -## Float conversions: unsigned integer → uniform float in (0, 1] +## Float conversions: unsigned integer → uniform float, strictly positive +## (Float32 can round up to exactly 1.0; Float64 stays strictly below 1.0.) @inline function u01(::Type{Float32}, u::UInt32) fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33)) @@ -107,10 +108,10 @@ end ## Specialized rand! kernels for common types # Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch -# multiple values per work item to use all 4 outputs efficiently. -# Types that consume 1 UInt32 each get 4 values per call; types that need -# 2 UInt32s (UInt64/Float64) get 2 values per call; complex types that need -# 4 UInt32s get 1 value per call. +# multiple values per work item to use all 4 outputs efficiently: +# - ≤4-byte types (including Float16): 4 values per call +# - 8-byte types (Int64/UInt64/Float64/Complex{Float32}): 2 values per call +# - 16-byte types (Int128/UInt128/Complex{Float64}): 1 value per call # Convert Philox UInt32 outputs to N values of type T @inline philox_to_vals(::Type{Float16}, a1, a2, a3, a4) = @@ -186,12 +187,13 @@ end ## Generic rand! fallback via ElementRNG -# For types without specialized kernels (Float16, integers, Bool, UInt128, etc.), -# an immutable ElementRNG wraps a Philox4x32 call as an AbstractRNG. Julia's Random -# stdlib then handles all type conversions automatically via `rand(rng, T)`. +# For Number types outside BatchedRandTypes (BigFloat, FixedPointNumbers, +# user-defined types, ...), an immutable ElementRNG wraps a Philox4x32 call as +# an AbstractRNG, and Julia's Random stdlib handles the type conversion via +# `rand(rng, T)`. # -# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer to -# it. This avoids mutable struct heap allocation on GPU. +# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer +# to it. This avoids mutable struct heap allocation on GPU. struct ElementRNG <: AbstractRNG seed::UInt64 @@ -244,26 +246,23 @@ function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number end -## randn! kernel +## Specialized randn! kernel for common float types -# Unlike rand!, randn! can't use the generic ElementRNG approach: +# Float16/32/64 and their complex variants go through a batched kernel mirroring +# rand_batched_kernel!, rather than the generic ElementRNG path further down: # -# 1. Random's default `randn` uses the Ziggurat algorithm with table lookups -# (`ki`, `wi`, `fi` arrays) that aren't accessible on GPU without device -# overlays (which are backend-specific, not available in KernelAbstractions). +# 1. Random's `randn(rng, ::Float{16,32,64})` uses the Ziggurat algorithm with +# global table lookups (`ki`, `wi`, `fi`), which aren't device-accessible +# from a KernelAbstractions kernel. # -# 2. The generic fallback `randn(rng, T::AbstractFloat)` uses Marsaglia's polar -# Box-Muller variant with a rejection loop (`while true ... 0 < s < 1`), -# which causes severe warp divergence on GPU (~2x slower). +# 2. The polar Box-Muller fallback reachable for other AbstractFloat types has +# a rejection loop (`while true ... 0 < s < 1`), causing warp divergence +# (~2x slower). # # 3. Direct (non-rejection) Box-Muller naturally produces value *pairs* from -# sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) is key -# to staying memory-bandwidth-bound; a 1-per-work-item generic kernel -# discards half the output. -# -# Instead, reuse the batched kernel structure from rand!: one kernel parametric -# over the element type, with `philox_to_normals` producing N values per call -# (matching `vals_per_call(T)`). +# sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) keeps us +# memory-bandwidth-bound; a 1-per-work-item kernel would discard half the +# output. # Convert Philox UInt32 outputs to N normally-distributed values of type T. for T in (Float16, Float32) From 7279ed05194fbc1d121a76bb08551fb4f5e76b16 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:33:48 +0200 Subject: [PATCH 11/19] Pull ElementRNG into its own section ElementRNG and its Random stdlib dispatch methods are shared between the generic rand! and randn! fallbacks; hoist them above both kernels rather than living inside the "Generic rand! fallback" section. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 229318f8..1dc4b72e 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -185,15 +185,15 @@ function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes end -## Generic rand! fallback via ElementRNG +## ElementRNG: a device-side AbstractRNG for generic rand/randn fallbacks -# For Number types outside BatchedRandTypes (BigFloat, FixedPointNumbers, -# user-defined types, ...), an immutable ElementRNG wraps a Philox4x32 call as -# an AbstractRNG, and Julia's Random stdlib handles the type conversion via -# `rand(rng, T)`. +# For element types without a specialized batched kernel (BigFloat, +# FixedPointNumbers, user-defined types, ...), we create a per-work-item +# AbstractRNG and delegate to Random stdlib's `rand(rng, T)` / `randn(rng, T)`. # -# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer -# to it. This avoids mutable struct heap allocation on GPU. +# The struct is immutable (mutable structs get heap-allocated on GPU); the +# per-work-item subcounter state lives in a stack-allocated Ref that the +# struct holds a pointer to. struct ElementRNG <: AbstractRNG seed::UInt64 @@ -228,6 +228,9 @@ end @inline Random.rand(rng::ElementRNG, ::Random.SamplerType{Complex{T}}) where {T<:Real} = complex(rand(rng, T), rand(rng, T)) + +## Generic rand! fallback via ElementRNG + @kernel function rand_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T gid = @index(Global, Linear) if gid <= length(A) From 776b03030e7038932d190bc1024a7459c202cd59 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 07:40:51 +0200 Subject: [PATCH 12/19] Drop now-redundant ElementRNG rand overrides These @inline overrides of the generic `rand(::AbstractRNG, ::Type{T})` and `SamplerType{Complex{T}}` methods were needed when ElementRNG was a mutable struct, to keep the dispatch chain transparent enough for the Ref-to-ptr trick to stack-allocate. With ElementRNG immutable, empirical checks (LLVM inspection, BFloat16 rand and Complex{Float16} randn benchmarks) show identical generated code and identical performance whether the overrides are present or not. Base.Random defines both methods with identical bodies, so nothing is lost by removing them. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 1dc4b72e..265f03b5 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -221,13 +221,6 @@ end @inline Random.rand(rng::ElementRNG, ::Random.SamplerType{T}) where T <: Union{Bool,Base.BitInteger} = rand(rng, Random.SamplerType{UInt64}()) % T -# @inline overrides for Random stdlib entry points that aren't inlined by default. -# Without these, the dispatch chain stays opaque and the Ref won't be optimized. -@inline Random.rand(rng::ElementRNG, ::Type{T}) where {T} = - rand(rng, Random.Sampler(typeof(rng), T, Val(1))) -@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{Complex{T}}) where {T<:Real} = - complex(rand(rng, T), rand(rng, T)) - ## Generic rand! fallback via ElementRNG From f5e47b8b4e66ea31cfc24ac24ef1108196d21617 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 08:15:08 +0200 Subject: [PATCH 13/19] Use polynomial sincospi for Float16/Float32 randn! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Base.sincospi(::Float32) widens to Float64 internally (see base/special/trig.jl:744 `sinpi_kernel_wide`), which breaks Metal (no Float64 support) and forces FP64 emulation on other backends that hit that path. Vendor PhiloxRNG.jl's minimax polynomial: entirely Float32, consumes the Philox UInt32 output directly. Rewrite the ≤32-bit boxmuller methods to take UInt32 arguments and update philox_to_normals to match. Float64 keeps using Base.sincospi (backends with FP64 have intrinsics, and the polynomial alternative is ~8x slower on consumer GPUs — see the follow-up log commit for measurements). Note: Base.log(::Float32) has the same FP64-widening problem and is fixed in the next commit. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 74 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 265f03b5..074e43b8 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -57,22 +57,65 @@ end end -## Box-Muller transform using @fastmath log + sincospi +## Fast sincospi for Box-Muller +# +# Base.sincospi(::Float32) widens to Float64 internally (via `sinpi_kernel_wide`, +# see base/special/trig.jl:744), which breaks Metal — it has no Float64 support +# at all — and wastes cycles on other backends that must emulate FP64. +# +# Vendored from PhiloxRNG.jl (MIT): a minimax polynomial that stays entirely +# in Float32 and consumes the Philox UInt32 output directly. Bottom 3 bits +# select one of 8 octants (swap/negate), upper 29 bits give the reduced +# argument (+0.5-biased so y ≠ 0). +# +# Float64 keeps using Base.sincospi: backends that support FP64 all have +# intrinsics, and the polynomial alternative is ~8× slower on consumer GPUs +# with low FP64 throughput. + +const _SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0) +const _CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0) + +@inline function _fast_sincospi(::Type{Float32}, u::UInt32) + oct = (u % Int32) & Int32(7) + y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32)) + sp = y * evalpoly(y * y, _SP_F32) + cp = evalpoly(y * y, _CP_F32) + swap = !iszero(oct & Int32(1)) + sin_neg = !iszero(oct & Int32(2)) + cos_neg = !iszero(oct & Int32(4)) + s_raw = ifelse(swap, cp, sp) + c_raw = ifelse(swap, sp, cp) + (ifelse(sin_neg, -s_raw, s_raw), ifelse(cos_neg, -c_raw, c_raw)) +end + -# Each backend provides optimized intrinsics via @device_override: -# CUDA: __nv_fast_logf, __nv_sincospif, etc. +## Box-Muller transform using Base.FastMath -@inline function boxmuller(::Type{T}, u1::T, u2::T) where T <: AbstractFloat - r = sqrt(T(-2) * FastMath.log_fast(u1)) +# ≤32-bit float output: polynomial sincospi on the raw UInt32, log through +# FastMath (Base.log(::Float32) also widens to Float64 — fixed in a follow-up). +@inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} + r = sqrt(-2f0 * FastMath.log_fast(u01(Float32, u2))) + s, c = _fast_sincospi(Float32, u1) + (F(r * s), F(r * c)) +end + +# Float64: Base.sincospi has FP64 intrinsics on the backends that support it. +@inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64) + r = sqrt(-2.0 * FastMath.log_fast(u1)) s, c = sincospi(2 * u2) (r * s, r * c) end # For complex normals each component has variance 1/2, so the radius is -# sqrt(-log(U)) rather than sqrt(-2*log(U)). -@inline function boxmuller(::Type{Complex{T}}, u1::T, u2::T) where T <: AbstractFloat +# sqrt(-log(U)) rather than sqrt(-2·log(U)). +@inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} + r = sqrt(-FastMath.log_fast(u01(Float32, u2))) + s, c = _fast_sincospi(Float32, u1) + complex(F(r * s), F(r * c)) +end +@inline function boxmuller(::Type{Complex{Float64}}, u1::Float64, u2::Float64) r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1))) s, c = sincospi(2 * u2) complex(r * s, r * c) @@ -261,10 +304,13 @@ end # output. # Convert Philox UInt32 outputs to N normally-distributed values of type T. +# ≤32-bit float targets pass UInt32s to boxmuller directly (the polynomial +# sincospi extracts bits itself; log still goes through u01 for now). 64-bit +# targets assemble UInt64s and convert to Float64 on the way in. for T in (Float16, Float32) @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4) - n1, n2 = boxmuller($T, $T(u01(Float32, a1)), $T(u01(Float32, a2))) - n3, n4 = boxmuller($T, $T(u01(Float32, a3)), $T(u01(Float32, a4))) + n1, n2 = boxmuller($T, a1, a2) + n3, n4 = boxmuller($T, a3, a4) (n1, n2, n3, n4) end end @@ -273,15 +319,13 @@ end u01(Float64, UInt64(a1) | UInt64(a2) << 32), u01(Float64, UInt64(a3) | UInt64(a4) << 32)) end -@inline function philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) - (boxmuller(Complex{Float32}, u01(Float32, a1), u01(Float32, a2)), - boxmuller(Complex{Float32}, u01(Float32, a3), u01(Float32, a4))) -end -@inline function philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) +@inline philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) = + (boxmuller(Complex{Float32}, a1, a2), + boxmuller(Complex{Float32}, a3, a4)) +@inline philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) = (boxmuller(Complex{Float64}, u01(Float64, UInt64(a1) | UInt64(a2) << 32), u01(Float64, UInt64(a3) | UInt64(a4) << 32)),) -end @kernel function randn_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T gid = @index(Global, Linear) From e436f081346994080dbb95d068b5d8ed799ee665 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 08:15:57 +0200 Subject: [PATCH 14/19] Use polynomial log for Float16/Float32 randn! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same Metal problem as sincospi in the previous commit: Base.log(::Float32) widens to Float64 internally (see base/special/log.jl:242, `Float64(2f0*f)/(2.0+f)`), which breaks Metal and forces FP64 emulation on other backends that end up on that path. Vendor PhiloxRNG.jl's Float32 polynomial log (itself ported from fdlibm's e_logf.c). Fold the u01 conversion into the first fma so the ≤32-bit boxmuller never touches Float64. Float64 still uses Base.log_fast — backends with FP64 support have intrinsics for it, and the polynomial alternative is ~8x slower on consumer GPUs (benchmarked: randn! F64 drops from ~97 GB/s with intrinsics to ~12 GB/s with polynomials on an RTX 5080). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 074e43b8..54eebb9b 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -89,19 +89,54 @@ const _CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0) end +## Fast log for Box-Muller +# +# Base.log(::Float32) widens to Float64 internally (see base/special/log.jl:242 +# `Float64(2f0*f)/(2.0+f)`), same Metal / FP64-emulation problem as sincospi. +# +# Vendored from PhiloxRNG.jl (MIT), ported from fdlibm's e_logf.c: a Float32 +# minimax polynomial. Takes the raw Philox UInt32 output; the u01 conversion +# is folded into the first fma so there's no intermediate float. +# +# Same Float64-path reasoning as the sincospi block above. + +const _SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5))) +const _LOG_ODD_F32 = (reinterpret(Float32, Int32(0x3f2aaaaa)), + reinterpret(Float32, Int32(0x3e91e9ee))) +const _LOG_EVEN_F32 = (reinterpret(Float32, Int32(0x3eccce13)), + reinterpret(Float32, Int32(0x3e789e26))) + +@inline function _fast_log(::Type{Float32}, u::UInt32) + x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33)) + ix = reinterpret(Int32, x) - _SQRT_HALF_I32 + k = ix >> Int32(23) + f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + _SQRT_HALF_I32) - 1.0f0 + f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33)) + f = ifelse(k == Int32(0), f_comp, f_std) + s = f / (2.0f0 + f) + z = s * s; w = z * z + R = z * evalpoly(w, _LOG_ODD_F32) + w * evalpoly(w, _LOG_EVEN_F32) + hfsq = 0.5f0 * f * f + Float32(k) * reinterpret(Float32, Int32(0x3f317180)) - + ((hfsq - (s * (hfsq + R) + + Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f) +end + + ## Box-Muller transform using Base.FastMath -# ≤32-bit float output: polynomial sincospi on the raw UInt32, log through -# FastMath (Base.log(::Float32) also widens to Float64 — fixed in a follow-up). +# ≤32-bit float output: both log and sincospi go through the Float32 +# polynomials above. @inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} - r = sqrt(-2f0 * FastMath.log_fast(u01(Float32, u2))) + r = sqrt(-2f0 * _fast_log(Float32, u2)) s, c = _fast_sincospi(Float32, u1) (F(r * s), F(r * c)) end -# Float64: Base.sincospi has FP64 intrinsics on the backends that support it. +# Float64: Base.log_fast / Base.sincospi have FP64 intrinsics on the backends +# that support it. @inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64) r = sqrt(-2.0 * FastMath.log_fast(u1)) s, c = sincospi(2 * u2) @@ -111,7 +146,7 @@ end # For complex normals each component has variance 1/2, so the radius is # sqrt(-log(U)) rather than sqrt(-2·log(U)). @inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} - r = sqrt(-FastMath.log_fast(u01(Float32, u2))) + r = sqrt(-_fast_log(Float32, u2)) s, c = _fast_sincospi(Float32, u1) complex(F(r * s), F(r * c)) end From 80bd752e9ffef54fd4abeeef8b240d4598076671 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 08:52:50 +0200 Subject: [PATCH 15/19] Bypass Base's ziggurat randn in ElementRNG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Base's randn(rng, ::BitFloatType) is the ziggurat path that reads global wi/ki/fi tables, which aren't device-accessible — and the Float64 tables can't even be loaded on Metal. Direct randn!(rng, A) for Float16/32/64 dispatches to the batched kernel so this seemed dormant, but Base's randn(rng, Complex{T}) recurses into randn(rng, T): a randn!(rng, A::GPUArray{Complex{Float16}}) call therefore hit ziggurat via the generic path. Override randn(rng::ElementRNG, ::Type{Float{16,32,64}}) to use our Box-Muller directly. Covers any current or future caller that recurses through these types. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/host/random.jl | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index 54eebb9b..af171d8e 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -404,12 +404,25 @@ end # from there are: # # - `randn(rng, ::BitFloatType)` (Float16/32/64) — ziggurat, uses global `wi` -# /`ki`/`fi` tables that aren't device-accessible. Never reached here because -# those types dispatch to the batched kernel above. +# /`ki`/`fi` tables that aren't device-accessible. Overridden below to use +# our Box-Muller directly. The direct dispatch for these types goes through +# the batched kernel above, but `randn(rng, Complex{Float16})` recurses into +# `randn(rng, Float16)` which hits this path. # - `randn(rng, ::Type{Complex{T}})` — recurses into `randn(rng, T)`. # - `randn(rng, ::Type{T}) where T<:AbstractFloat` — Marsaglia polar Box-Muller # rejection loop. GPU-safe (only calls `rand(rng, T)`) but warp-divergent. +# Bypass Base's ziggurat-based randn(rng, Float{16,32,64}) — its `wi`/`ki`/`fi` +# tables aren't device-accessible, and on Metal the Float64 tables can't even +# be loaded. Reached via Base's Complex recursion when the element type is +# e.g. Complex{Float16}. +@inline Random.randn(rng::ElementRNG, ::Type{Float16}) = + first(boxmuller(Float16, rand(rng, UInt32), rand(rng, UInt32))) +@inline Random.randn(rng::ElementRNG, ::Type{Float32}) = + first(boxmuller(Float32, rand(rng, UInt32), rand(rng, UInt32))) +@inline Random.randn(rng::ElementRNG, ::Type{Float64}) = + first(boxmuller(Float64, u01(Float64, rand(rng, UInt64)), u01(Float64, rand(rng, UInt64)))) + @kernel function randn_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T gid = @index(Global, Linear) if gid <= length(A) From 01df466c20afa3244becddc88b6bebdb1984f142 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 08:59:02 +0200 Subject: [PATCH 16/19] Drop underscore prefix on fast log/sincospi helpers These are internal names but not meant as private underscore-prefixed Julia convention; the leading _ was just carried over from PhiloxRNG. Rename to fast_log / fast_sincospi / SP_F32 / CP_F32 / SQRT_HALF_I32 / LOG_ODD_F32 / LOG_EVEN_F32. Co-Authored-By: Claude Opus 4.6 (1M context) [ci skip] --- src/host/random.jl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index af171d8e..a4ff7814 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -72,14 +72,14 @@ end # intrinsics, and the polynomial alternative is ~8× slower on consumer GPUs # with low FP64 throughput. -const _SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0) -const _CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0) +const SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0) +const CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0) -@inline function _fast_sincospi(::Type{Float32}, u::UInt32) +@inline function fast_sincospi(::Type{Float32}, u::UInt32) oct = (u % Int32) & Int32(7) y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32)) - sp = y * evalpoly(y * y, _SP_F32) - cp = evalpoly(y * y, _CP_F32) + sp = y * evalpoly(y * y, SP_F32) + cp = evalpoly(y * y, CP_F32) swap = !iszero(oct & Int32(1)) sin_neg = !iszero(oct & Int32(2)) cos_neg = !iszero(oct & Int32(4)) @@ -100,22 +100,22 @@ end # # Same Float64-path reasoning as the sincospi block above. -const _SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5))) -const _LOG_ODD_F32 = (reinterpret(Float32, Int32(0x3f2aaaaa)), +const SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5))) +const LOG_ODD_F32 = (reinterpret(Float32, Int32(0x3f2aaaaa)), reinterpret(Float32, Int32(0x3e91e9ee))) -const _LOG_EVEN_F32 = (reinterpret(Float32, Int32(0x3eccce13)), +const LOG_EVEN_F32 = (reinterpret(Float32, Int32(0x3eccce13)), reinterpret(Float32, Int32(0x3e789e26))) -@inline function _fast_log(::Type{Float32}, u::UInt32) +@inline function fast_log(::Type{Float32}, u::UInt32) x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33)) - ix = reinterpret(Int32, x) - _SQRT_HALF_I32 + ix = reinterpret(Int32, x) - SQRT_HALF_I32 k = ix >> Int32(23) - f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + _SQRT_HALF_I32) - 1.0f0 + f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + SQRT_HALF_I32) - 1.0f0 f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33)) f = ifelse(k == Int32(0), f_comp, f_std) s = f / (2.0f0 + f) z = s * s; w = z * z - R = z * evalpoly(w, _LOG_ODD_F32) + w * evalpoly(w, _LOG_EVEN_F32) + R = z * evalpoly(w, LOG_ODD_F32) + w * evalpoly(w, LOG_EVEN_F32) hfsq = 0.5f0 * f * f Float32(k) * reinterpret(Float32, Int32(0x3f317180)) - ((hfsq - (s * (hfsq + R) + @@ -130,8 +130,8 @@ using Base.FastMath # ≤32-bit float output: both log and sincospi go through the Float32 # polynomials above. @inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} - r = sqrt(-2f0 * _fast_log(Float32, u2)) - s, c = _fast_sincospi(Float32, u1) + r = sqrt(-2f0 * fast_log(Float32, u2)) + s, c = fast_sincospi(Float32, u1) (F(r * s), F(r * c)) end @@ -146,8 +146,8 @@ end # For complex normals each component has variance 1/2, so the radius is # sqrt(-log(U)) rather than sqrt(-2·log(U)). @inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} - r = sqrt(-_fast_log(Float32, u2)) - s, c = _fast_sincospi(Float32, u1) + r = sqrt(-fast_log(Float32, u2)) + s, c = fast_sincospi(Float32, u1) complex(F(r * s), F(r * c)) end @inline function boxmuller(::Type{Complex{Float64}}, u1::Float64, u2::Float64) From dba131b9a734c165df17ef631f5aa5d55a3a4ee3 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 10:40:45 +0200 Subject: [PATCH 17/19] Parameterize RNG on the GPU array type. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `GPUArrays.RNG{AT}` now carries the destination array type, so out-of-place `rand(rng, T, dims)` / `randn(rng, T, dims)` allocate an `AT{T}` instead of falling back to Random stdlib's CPU `Array` constructor. The fill paths (`Random.rand!` / `Random.randn!` on `AnyGPUArray`) ignore `AT` and dispatch on the destination's KernelAbstractions backend, so a single RNG can fill any GPU array. A new CPU-array fallback uses `AT` to allocate a temporary, fill it on-device, then `copyto!` the destination — prevents Random's scalar-iter fallback when filling a `Vector` from a GPU RNG. Removes the `default_rng(::Type{<:AnyGPUArray})` interface: it was only called by the testsuite (now uses `GPUArrays.RNG{AT}()` directly) and by the per-backend shims (now redundant — the type parameter replaces them). JLArrays' GLOBAL_RNG ref + override go away. --- lib/JLArrays/src/JLArrays.jl | 8 ------ src/host/random.jl | 55 ++++++++++++++++++++++++++++++++---- test/testsuite/random.jl | 2 +- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 5903b444..3557b63c 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -539,14 +539,6 @@ end using Random -const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing) -function GPUArrays.default_rng(::Type{<:JLArray}) - if GLOBAL_RNG[] === nothing - GLOBAL_RNG[] = GPUArrays.RNG() - end - GLOBAL_RNG[] -end - ## GPUArrays interfaces diff --git a/src/host/random.jl b/src/host/random.jl index a4ff7814..bae6c688 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -158,17 +158,19 @@ end ## RNG type +# +# Parameterized on the GPU array type `AT` (e.g. `CuArray`, `ROCArray`, `JLArray`). +# `AT` is only consulted by the *out-of-place* `rand`/`randn` constructors and by +# the CPU-array fill fallback — the in-place `rand!`/`randn!` GPU paths dispatch +# on the destination's KernelAbstractions backend and ignore `AT`. -mutable struct RNG <: AbstractRNG +mutable struct RNG{AT} <: AbstractRNG seed::UInt64 counter::UInt32 end -RNG() = RNG(rand(Random.RandomDevice(), UInt64), UInt32(0)) -RNG(seed::Integer) = RNG(seed % UInt64, UInt32(0)) - -# return an instance of GPUArrays.RNG suitable for the requested array type -default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE +RNG{AT}() where {AT} = RNG{AT}(rand(Random.RandomDevice(), UInt64), UInt32(0)) +RNG{AT}(seed::Integer) where {AT} = RNG{AT}(seed % UInt64, UInt32(0)) Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt64); rng.counter = 0; rng) Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt64; rng.counter = 0; rng) @@ -442,3 +444,44 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Union{AbstractFlo end +## Non-GPU array fallback: generate on AT, copyto! the destination. +# +# Without this, `rand!(rng::RNG{CuArray}, ::Vector)` would hit Random's stdlib +# scalar path and silently iterate the GPU rng one element at a time. + +function Random.rand!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T} + isempty(A) && return A + B = AT{T}(undef, size(A)) + Random.rand!(rng, B) + copyto!(A, B) +end +function Random.randn!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T} + isempty(A) && return A + B = AT{T}(undef, size(A)) + Random.randn!(rng, B) + copyto!(A, B) +end + + +## Out-of-place rand / randn — construct an AT array and fill it. + +Random.rand(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T} = + Random.rand!(rng, AT{T}(undef, dims)) +Random.randn(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T<:Union{AbstractFloat, + Complex{<:AbstractFloat}}} = + Random.randn!(rng, AT{T}(undef, dims)) + +# untyped: default to Float32 (matches CUDA convention; better fit than Float64 +# on consumer GPUs) +Random.rand(rng::RNG{AT}, dims::Dims) where {AT} = Random.rand(rng, Float32, dims) +Random.randn(rng::RNG{AT}, dims::Dims) where {AT} = Random.randn(rng, Float32, dims) + +# variadic dim spellings +Random.rand(rng::RNG, dim1::Integer, dims::Integer...) = + Random.rand(rng, Dims((dim1, dims...))) +Random.randn(rng::RNG, dim1::Integer, dims::Integer...) = + Random.randn(rng, Dims((dim1, dims...))) +Random.rand(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T = + Random.rand(rng, T, Dims((dim1, dims...))) +Random.randn(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T = + Random.randn(rng, T, Dims((dim1, dims...))) diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl index 17c3875e..7ed2fd56 100644 --- a/test/testsuite/random.jl +++ b/test/testsuite/random.jl @@ -1,6 +1,6 @@ @testsuite "random" (AT, eltypes)->begin rng = if AT <: AbstractGPUArray - GPUArrays.default_rng(AT) + GPUArrays.RNG{AT}() else Random.default_rng() end From 529c2cbc89ee20a0ba48908eafb38e2c28b1856b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 10:50:04 +0200 Subject: [PATCH 18/19] Use similar(AT{T}, dims) instead of AT{T}(undef, dims). similar is the documented Base interface for type-driven array construction (base/abstractarray.jl:873), and its default implementation calls T(undef, dims). Lets array types that override similar(::Type{T}, ::Dims) hook in cleanly. --- src/host/random.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/host/random.jl b/src/host/random.jl index bae6c688..54752b6a 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -451,13 +451,13 @@ end function Random.rand!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T} isempty(A) && return A - B = AT{T}(undef, size(A)) + B = similar(AT{T}, size(A)) Random.rand!(rng, B) copyto!(A, B) end function Random.randn!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T} isempty(A) && return A - B = AT{T}(undef, size(A)) + B = similar(AT{T}, size(A)) Random.randn!(rng, B) copyto!(A, B) end @@ -466,10 +466,10 @@ end ## Out-of-place rand / randn — construct an AT array and fill it. Random.rand(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T} = - Random.rand!(rng, AT{T}(undef, dims)) + Random.rand!(rng, similar(AT{T}, dims)) Random.randn(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T<:Union{AbstractFloat, Complex{<:AbstractFloat}}} = - Random.randn!(rng, AT{T}(undef, dims)) + Random.randn!(rng, similar(AT{T}, dims)) # untyped: default to Float32 (matches CUDA convention; better fit than Float64 # on consumer GPUs) From 8c1f396833302a55059f743313ca6fd3ac939a7e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 15 Apr 2026 10:55:47 +0200 Subject: [PATCH 19/19] Keep GPUArrays.default_rng as an empty stub for downstream compat. Metal.jl and other backends still extend GPUArrays.default_rng even though the interface is no longer called. Without a function declaration in GPUArrays, those extension definitions fail to parse with UndefVarError. Empty function stub is enough for the transition. Also fix the RNG(state::AbstractGPUArray) deprecation: it used to call RNG() (zero-arg) which no longer exists. Now extracts AT from the state array's type and calls RNG{AT}(). --- src/deprecated.jl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index 62ef634f..f8927e23 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -2,11 +2,17 @@ # The new stateless Philox4x32 RNG doesn't need either. function RNG(state::AbstractGPUArray) - Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG() instead", :RNG) - RNG() + AT = Base.typename(typeof(state)).wrapper + Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG{$AT}() instead", :RNG) + RNG{AT}() end function Random.seed!(rng::RNG, seed::Vector{UInt32}) Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!) Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt64) : first(seed)) end + +# Stub kept so downstream packages that still extend `GPUArrays.default_rng` +# (Metal.jl, etc.) continue to load. The interface itself is gone — `RNG{AT}()` +# is now constructed directly — so any extension of this is dead code. +function default_rng end