diff --git a/Project.toml b/Project.toml index 3b7c2e69..ccaaad35 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "GPUArrays" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "11.4.1" +version = "11.5.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 7bba4b95..3557b63c 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -539,18 +539,6 @@ end using Random -const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing) -function GPUArrays.default_rng(::Type{<:JLArray}) - if GLOBAL_RNG[] === nothing - N = MAXTHREADS - state = JLArray{NTuple{4, UInt32}}(undef, N) - rng = GPUArrays.RNG(state) - Random.seed!(rng) - GLOBAL_RNG[] = rng - end - GLOBAL_RNG[] -end - ## GPUArrays interfaces diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index a35c1ff0..17a4d897 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -38,5 +38,6 @@ include("host/statistics.jl") include("host/sparse.jl") include("host/alloc_cache.jl") +include("deprecated.jl") end # module diff --git a/src/deprecated.jl b/src/deprecated.jl new file mode 100644 index 00000000..f8927e23 --- /dev/null +++ b/src/deprecated.jl @@ -0,0 +1,18 @@ +# RNG: the old RNG took a GPU state array and Vector{UInt32} seeds. +# The new stateless Philox4x32 RNG doesn't need either. + +function RNG(state::AbstractGPUArray) + AT = Base.typename(typeof(state)).wrapper + Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG{$AT}() instead", :RNG) + RNG{AT}() +end + +function Random.seed!(rng::RNG, seed::Vector{UInt32}) + Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!) + Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt64) : first(seed)) +end + +# Stub kept so downstream packages that still extend `GPUArrays.default_rng` +# (Metal.jl, etc.) continue to load. The interface itself is gone — `RNG{AT}()` +# is now constructed directly — so any extension of this is dead code. +function default_rng end diff --git a/src/host/random.jl b/src/host/random.jl index 7759e841..54752b6a 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -1,119 +1,487 @@ # integration with Random stdlib -## device interface +## Philox4x32-10 counter-based RNG -# hybrid Tausworthe and Linear Congruent generator from -# https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch37.html +# Stateless: (counter, key) → 4 UInt32 outputs. Each unique counter gives independent +# random values with no shared memory or global state needed. # -# only generates Float32 or Float64 numbers, conversion happens elsewhere +# Reference: Salmon et al., "Parallel Random Numbers: As Easy as 1, 2, 3" (2011) -function TausStep(z::Unsigned, S1::Integer, S2::Integer, S3::Integer, M::Unsigned) - b = (((z << S1) ⊻ z) >> S2) - return (((z & M) << S3) ⊻ b) +const PHILOX_M4x32_0 = 0xD2511F53 +const PHILOX_M4x32_1 = 0xCD9E8D57 +const PHILOX_W32_0 = 0x9E3779B9 +const PHILOX_W32_1 = 0xBB67AE85 + +@inline function philox4x32round(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32}) + mul0 = widemul(PHILOX_M4x32_0, ctr[1]) + mul1 = widemul(PHILOX_M4x32_1, ctr[3]) + hi0 = (mul0 >> 32) % UInt32 + hi1 = (mul1 >> 32) % UInt32 + lo0 = mul0 % UInt32 + lo1 = mul1 % UInt32 + (hi1 ⊻ ctr[2] ⊻ key[1], lo1, hi0 ⊻ ctr[4] ⊻ key[2], lo0) +end + +@inline function philox4x32bumpkey(key::NTuple{2,UInt32}) + (key[1] + PHILOX_W32_0, key[2] + PHILOX_W32_1) +end + +@inline function philox4x32_10(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32}) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key) + ctr = philox4x32round(ctr, key) + ctr +end + + +## Float conversions: unsigned integer → uniform float, strictly positive +## (Float32 can round up to exactly 1.0; Float64 stays strictly below 1.0.) + +@inline function u01(::Type{Float32}, u::UInt32) + fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33)) +end + +@inline function u01(::Type{Float64}, u::UInt64) + # Bit-pattern construction avoids the expensive Float64(::UInt64) conversion + # and fma on consumer GPUs (where FP64 throughput is as low as 1:64). The + # low bit of the mantissa is forced set so the result is strictly in (0, 1), + # which is required by Box-Muller's log(u). + reinterpret(Float64, ((u >> 12) | 0x1) | 0x3ff0000000000000) - 1.0 +end + + +## Fast sincospi for Box-Muller +# +# Base.sincospi(::Float32) widens to Float64 internally (via `sinpi_kernel_wide`, +# see base/special/trig.jl:744), which breaks Metal — it has no Float64 support +# at all — and wastes cycles on other backends that must emulate FP64. +# +# Vendored from PhiloxRNG.jl (MIT): a minimax polynomial that stays entirely +# in Float32 and consumes the Philox UInt32 output directly. Bottom 3 bits +# select one of 8 octants (swap/negate), upper 29 bits give the reduced +# argument (+0.5-biased so y ≠ 0). +# +# Float64 keeps using Base.sincospi: backends that support FP64 all have +# intrinsics, and the polynomial alternative is ~8× slower on consumer GPUs +# with low FP64 throughput. + +const SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0) +const CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0) + +@inline function fast_sincospi(::Type{Float32}, u::UInt32) + oct = (u % Int32) & Int32(7) + y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32)) + sp = y * evalpoly(y * y, SP_F32) + cp = evalpoly(y * y, CP_F32) + swap = !iszero(oct & Int32(1)) + sin_neg = !iszero(oct & Int32(2)) + cos_neg = !iszero(oct & Int32(4)) + s_raw = ifelse(swap, cp, sp) + c_raw = ifelse(swap, sp, cp) + (ifelse(sin_neg, -s_raw, s_raw), ifelse(cos_neg, -c_raw, c_raw)) end -LCGStep(z::Unsigned, A::Unsigned, C::Unsigned) = A * z + C -make_rand_num(::Type{Float64}, tmp) = 2.3283064365387e-10 * Float64(tmp) -make_rand_num(::Type{Float32}, tmp) = 2.3283064f-10 * Float32(tmp) -# NOTE: the rng state is often not representable as Float16, so perform this in Float32. -make_rand_num(::Type{Float16}, tmp) = Float16(2.3283064f-10 * Float32(tmp)) +## Fast log for Box-Muller +# +# Base.log(::Float32) widens to Float64 internally (see base/special/log.jl:242 +# `Float64(2f0*f)/(2.0+f)`), same Metal / FP64-emulation problem as sincospi. +# +# Vendored from PhiloxRNG.jl (MIT), ported from fdlibm's e_logf.c: a Float32 +# minimax polynomial. Takes the raw Philox UInt32 output; the u01 conversion +# is folded into the first fma so there's no intermediate float. +# +# Same Float64-path reasoning as the sincospi block above. + +const SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5))) +const LOG_ODD_F32 = (reinterpret(Float32, Int32(0x3f2aaaaa)), + reinterpret(Float32, Int32(0x3e91e9ee))) +const LOG_EVEN_F32 = (reinterpret(Float32, Int32(0x3eccce13)), + reinterpret(Float32, Int32(0x3e789e26))) -function next_rand(state::NTuple{4, T}) where {T <: Unsigned} - state = ( - TausStep(state[1], Cint(13), Cint(19), Cint(12), T(4294967294)), - TausStep(state[2], Cint(2), Cint(25), Cint(4), T(4294967288)), - TausStep(state[3], Cint(3), Cint(11), Cint(17), T(4294967280)), - LCGStep(state[4], T(1664525), T(1013904223)) - ) - tmp = (state[1] ⊻ state[2] ⊻ state[3] ⊻ state[4]) - return state, tmp +@inline function fast_log(::Type{Float32}, u::UInt32) + x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33)) + ix = reinterpret(Int32, x) - SQRT_HALF_I32 + k = ix >> Int32(23) + f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + SQRT_HALF_I32) - 1.0f0 + f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33)) + f = ifelse(k == Int32(0), f_comp, f_std) + s = f / (2.0f0 + f) + z = s * s; w = z * z + R = z * evalpoly(w, LOG_ODD_F32) + w * evalpoly(w, LOG_EVEN_F32) + hfsq = 0.5f0 * f * f + Float32(k) * reinterpret(Float32, Int32(0x3f317180)) - + ((hfsq - (s * (hfsq + R) + + Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f) end -function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T - stateful_rand = next_rand(randstate[threadid]) - randstate[threadid] = stateful_rand[1] - return make_rand_num(T, stateful_rand[2]) + +## Box-Muller transform + +using Base.FastMath + +# ≤32-bit float output: both log and sincospi go through the Float32 +# polynomials above. +@inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} + r = sqrt(-2f0 * fast_log(Float32, u2)) + s, c = fast_sincospi(Float32, u1) + (F(r * s), F(r * c)) end -function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer - result = zero(T) - if sizeof(T) >= 4 - for _ in 1:sizeof(T) >> 2 - randstate[threadid], y = next_rand(randstate[threadid]) - result = reinterpret(T, (|)(promote(result << 32, y)...)) +# Float64: Base.log_fast / Base.sincospi have FP64 intrinsics on the backends +# that support it. +@inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64) + r = sqrt(-2.0 * FastMath.log_fast(u1)) + s, c = sincospi(2 * u2) + (r * s, r * c) +end + +# For complex normals each component has variance 1/2, so the radius is +# sqrt(-log(U)) rather than sqrt(-2·log(U)). +@inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32} + r = sqrt(-fast_log(Float32, u2)) + s, c = fast_sincospi(Float32, u1) + complex(F(r * s), F(r * c)) +end +@inline function boxmuller(::Type{Complex{Float64}}, u1::Float64, u2::Float64) + r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1))) + s, c = sincospi(2 * u2) + complex(r * s, r * c) +end + + +## RNG type +# +# Parameterized on the GPU array type `AT` (e.g. `CuArray`, `ROCArray`, `JLArray`). +# `AT` is only consulted by the *out-of-place* `rand`/`randn` constructors and by +# the CPU-array fill fallback — the in-place `rand!`/`randn!` GPU paths dispatch +# on the destination's KernelAbstractions backend and ignore `AT`. + +mutable struct RNG{AT} <: AbstractRNG + seed::UInt64 + counter::UInt32 +end + +RNG{AT}() where {AT} = RNG{AT}(rand(Random.RandomDevice(), UInt64), UInt32(0)) +RNG{AT}(seed::Integer) where {AT} = RNG{AT}(seed % UInt64, UInt32(0)) + +Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt64); rng.counter = 0; rng) +Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt64; rng.counter = 0; rng) + +function advance_counter!(rng::RNG) + rng.counter += one(UInt32) + rng.counter == 0 && (rng.seed += one(UInt64)) + rng +end + +# Split the 64-bit seed into the two 32-bit lanes of the Philox key. +@inline philox_key(seed::UInt64) = (seed % UInt32, (seed >> 32) % UInt32) + + +## Specialized rand! kernels for common types + +# Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch +# multiple values per work item to use all 4 outputs efficiently: +# - ≤4-byte types (including Float16): 4 values per call +# - 8-byte types (Int64/UInt64/Float64/Complex{Float32}): 2 values per call +# - 16-byte types (Int128/UInt128/Complex{Float64}): 1 value per call + +# Convert Philox UInt32 outputs to N values of type T +@inline philox_to_vals(::Type{Float16}, a1, a2, a3, a4) = + (Float16(u01(Float32, a1)), Float16(u01(Float32, a2)), + Float16(u01(Float32, a3)), Float16(u01(Float32, a4))) +@inline philox_to_vals(::Type{Float32}, a1, a2, a3, a4) = + (u01(Float32, a1), u01(Float32, a2), u01(Float32, a3), u01(Float32, a4)) +for T in (UInt8, UInt16, UInt32, Int8, Int16, Int32, Bool) + @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) = + (a1 % $T, a2 % $T, a3 % $T, a4 % $T) +end +@inline philox_to_vals(::Type{Float64}, a1, a2, a3, a4) = + (u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)) +for T in (UInt64, Int64) + @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) = + ((UInt64(a1) | UInt64(a2) << 32) % $T, + (UInt64(a3) | UInt64(a4) << 32) % $T) +end +for T in (UInt128, Int128) + @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) = + ((UInt128(a1) | UInt128(a2) << 32 | UInt128(a3) << 64 | UInt128(a4) << 96) % $T,) +end +@inline philox_to_vals(::Type{Complex{Float32}}, a1, a2, a3, a4) = + (complex(u01(Float32, a1), u01(Float32, a2)), + complex(u01(Float32, a3), u01(Float32, a4))) +@inline philox_to_vals(::Type{Complex{Float64}}, a1, a2, a3, a4) = + (complex(u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)),) + +# Number of output values per Philox4x32 call +vals_per_call(::Type{T}) where T = sizeof(T) <= 4 ? 4 : sizeof(T) <= 8 ? 2 : 1 +vals_per_call(::Type{Complex{T}}) where T = sizeof(T) <= 4 ? 2 : 1 + +# Batched kernel: N values per work item from one Philox call +@kernel function rand_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + N = vals_per_call(T) + idx = N * gid + len = length(A) + if idx <= len + vals = philox_to_vals(T, philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + philox_key(seed))...) + for j in 1:N + @inbounds A[idx - N + j] = vals[j] + end + elseif idx - N < len + vals = philox_to_vals(T, philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + philox_key(seed))...) + for j in 1:min(N, len - idx + N) + @inbounds A[idx - N + j] = vals[j] end - else - randstate[threadid], y = next_rand(randstate[threadid]) - x = reinterpret(Int32, y) - result = convert(T, x & typemax(T)) end - result end -# support for complex numbers +# Types with specialized batched kernels +const BatchedRandTypes = Union{ + Float16, Float32, Float64, Complex{Float32}, Complex{Float64}, + Bool, Int8, Int16, Int32, Int64, Int128, + UInt8, UInt16, UInt32, UInt64, UInt128} -function gpu_rand(::Type{Complex{T}}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T - re = gpu_rand(T, threadid, randstate) - im = gpu_rand(T, threadid, randstate) - return complex(re, im) +function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes + isempty(A) && return A + N = vals_per_call(T) + rand_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A; + ndrange=cld(length(A), N)) + advance_counter!(rng) + A end -## host interface +## ElementRNG: a device-side AbstractRNG for generic rand/randn fallbacks + +# For element types without a specialized batched kernel (BigFloat, +# FixedPointNumbers, user-defined types, ...), we create a per-work-item +# AbstractRNG and delegate to Random stdlib's `rand(rng, T)` / `randn(rng, T)`. +# +# The struct is immutable (mutable structs get heap-allocated on GPU); the +# per-work-item subcounter state lives in a stack-allocated Ref that the +# struct holds a pointer to. -struct RNG <: AbstractRNG - state::AbstractGPUArray{NTuple{4,UInt32},1} +struct ElementRNG <: AbstractRNG + seed::UInt64 + counter::UInt32 + gid::UInt32 + subctr_ptr::Ptr{UInt32} end -# return an instance of GPUArrays.RNG suitable for the requested array type -default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE +@inline Random.rng_native_52(::ElementRNG) = UInt64 -make_seed(rng::RNG) = make_seed(rng, rand(UInt)) -function make_seed(rng::RNG, n::Integer) - rand(MersenneTwister(n), UInt32, sizeof(rng.state)÷sizeof(UInt32)) +@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt64}) + sc = unsafe_load(rng.subctr_ptr) + UInt32(1) + unsafe_store!(rng.subctr_ptr, sc) + a1, a2, _, _ = philox4x32_10( + (rng.gid, sc, rng.counter, UInt32(0)), + philox_key(rng.seed)) + UInt64(a1) | UInt64(a2) << 32 end -Random.seed!(rng::RNG) = Random.seed!(rng, make_seed(rng)) -Random.seed!(rng::RNG, seed::Integer) = Random.seed!(rng, make_seed(rng, seed)) -function Random.seed!(rng::RNG, seed::Vector{UInt32}) - copyto!(rng.state, collect(reinterpret(NTuple{4, UInt32}, seed))) - return +@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt128}) + UInt128(rand(rng, Random.SamplerType{UInt64}())) | + UInt128(rand(rng, Random.SamplerType{UInt64}())) << 64 +end + +@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{T}) where T <: Union{Bool,Base.BitInteger} = + rand(rng, Random.SamplerType{UInt64}()) % T + + +## Generic rand! fallback via ElementRNG + +@kernel function rand_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + if gid <= length(A) + subctr = Ref{UInt32}(0) + rng = ElementRNG(seed, counter, gid % UInt32, + Base.unsafe_convert(Ptr{UInt32}, subctr)) + @inbounds A[gid] = rand(rng, T) + end end function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number isempty(A) && return A - @kernel function rand!(a, randstate) - idx = @index(Global, Linear) - @inbounds a[idx] = gpu_rand(T, ((idx-1)%length(randstate)+1), randstate) - end - rand!(get_backend(A))(A, rng.state; ndrange = size(A)) + rand_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A)) + advance_counter!(rng) A end -function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number - isempty(A) && return A - threads = (length(A) - 1) ÷ 2 + 1 - @kernel function randn!(a, randstates) - i = @index(Global, Linear) - threadidx = @index(Local, Linear) - idx = 2*(i - 1) + 1 - U1 = gpu_rand(T, threadidx, randstates) - U2 = gpu_rand(T, threadidx, randstates) - Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2) - Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2) - @inbounds a[idx] = Z0 - if idx + 1 <= length(a) - @inbounds a[idx + 1] = Z1 + +## Specialized randn! kernel for common float types + +# Float16/32/64 and their complex variants go through a batched kernel mirroring +# rand_batched_kernel!, rather than the generic ElementRNG path further down: +# +# 1. Random's `randn(rng, ::Float{16,32,64})` uses the Ziggurat algorithm with +# global table lookups (`ki`, `wi`, `fi`), which aren't device-accessible +# from a KernelAbstractions kernel. +# +# 2. The polar Box-Muller fallback reachable for other AbstractFloat types has +# a rejection loop (`while true ... 0 < s < 1`), causing warp divergence +# (~2x slower). +# +# 3. Direct (non-rejection) Box-Muller naturally produces value *pairs* from +# sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) keeps us +# memory-bandwidth-bound; a 1-per-work-item kernel would discard half the +# output. + +# Convert Philox UInt32 outputs to N normally-distributed values of type T. +# ≤32-bit float targets pass UInt32s to boxmuller directly (the polynomial +# sincospi extracts bits itself; log still goes through u01 for now). 64-bit +# targets assemble UInt64s and convert to Float64 on the way in. +for T in (Float16, Float32) + @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4) + n1, n2 = boxmuller($T, a1, a2) + n3, n4 = boxmuller($T, a3, a4) + (n1, n2, n3, n4) + end +end +@inline function philox_to_normals(::Type{Float64}, a1, a2, a3, a4) + boxmuller(Float64, + u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)) +end +@inline philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) = + (boxmuller(Complex{Float32}, a1, a2), + boxmuller(Complex{Float32}, a3, a4)) +@inline philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) = + (boxmuller(Complex{Float64}, + u01(Float64, UInt64(a1) | UInt64(a2) << 32), + u01(Float64, UInt64(a3) | UInt64(a4) << 32)),) + +@kernel function randn_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + N = vals_per_call(T) + idx = N * gid + len = length(A) + if idx <= len + vals = philox_to_normals(T, philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + philox_key(seed))...) + for j in 1:N + @inbounds A[idx - N + j] = vals[j] + end + elseif idx - N < len + vals = philox_to_normals(T, philox4x32_10( + (gid % UInt32, UInt32(0), counter, UInt32(0)), + philox_key(seed))...) + for j in 1:min(N, len - idx + N) + @inbounds A[idx - N + j] = vals[j] end end - kernel = randn!(get_backend(A)) - kernel(A, rng.state; ndrange=threads) +end + +const BatchedRandnTypes = Union{Float16, Float32, Float64, + Complex{Float32}, Complex{Float64}} + +function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes + isempty(A) && return A + N = vals_per_call(T) + randn_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A; + ndrange=cld(length(A), N)) + advance_counter!(rng) + A +end + + +## Generic randn! fallback via ElementRNG +# +# For AbstractFloats outside BatchedRandnTypes (BFloat16, user-defined float +# types, etc.) we route through Random's `randn(rng, T)`. The reachable methods +# from there are: +# +# - `randn(rng, ::BitFloatType)` (Float16/32/64) — ziggurat, uses global `wi` +# /`ki`/`fi` tables that aren't device-accessible. Overridden below to use +# our Box-Muller directly. The direct dispatch for these types goes through +# the batched kernel above, but `randn(rng, Complex{Float16})` recurses into +# `randn(rng, Float16)` which hits this path. +# - `randn(rng, ::Type{Complex{T}})` — recurses into `randn(rng, T)`. +# - `randn(rng, ::Type{T}) where T<:AbstractFloat` — Marsaglia polar Box-Muller +# rejection loop. GPU-safe (only calls `rand(rng, T)`) but warp-divergent. + +# Bypass Base's ziggurat-based randn(rng, Float{16,32,64}) — its `wi`/`ki`/`fi` +# tables aren't device-accessible, and on Metal the Float64 tables can't even +# be loaded. Reached via Base's Complex recursion when the element type is +# e.g. Complex{Float16}. +@inline Random.randn(rng::ElementRNG, ::Type{Float16}) = + first(boxmuller(Float16, rand(rng, UInt32), rand(rng, UInt32))) +@inline Random.randn(rng::ElementRNG, ::Type{Float32}) = + first(boxmuller(Float32, rand(rng, UInt32), rand(rng, UInt32))) +@inline Random.randn(rng::ElementRNG, ::Type{Float64}) = + first(boxmuller(Float64, u01(Float64, rand(rng, UInt64)), u01(Float64, rand(rng, UInt64)))) + +@kernel function randn_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T + gid = @index(Global, Linear) + if gid <= length(A) + subctr = Ref{UInt32}(0) + rng = ElementRNG(seed, counter, gid % UInt32, + Base.unsafe_convert(Ptr{UInt32}, subctr)) + @inbounds A[gid] = randn(rng, T) + end +end + +function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Union{AbstractFloat, + Complex{<:AbstractFloat}} + isempty(A) && return A + randn_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A)) + advance_counter!(rng) A end -# allow use of CPU RNGs without scalar iteration -Random.rand!(rng::AbstractRNG, A::AnyGPUArray) = - copyto!(A, rand(rng, eltype(A), size(A)...)) -Random.randn!(rng::AbstractRNG, A::AnyGPUArray) = - copyto!(A, randn(rng, eltype(A), size(A)...)) + +## Non-GPU array fallback: generate on AT, copyto! the destination. +# +# Without this, `rand!(rng::RNG{CuArray}, ::Vector)` would hit Random's stdlib +# scalar path and silently iterate the GPU rng one element at a time. + +function Random.rand!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T} + isempty(A) && return A + B = similar(AT{T}, size(A)) + Random.rand!(rng, B) + copyto!(A, B) +end +function Random.randn!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T} + isempty(A) && return A + B = similar(AT{T}, size(A)) + Random.randn!(rng, B) + copyto!(A, B) +end + + +## Out-of-place rand / randn — construct an AT array and fill it. + +Random.rand(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T} = + Random.rand!(rng, similar(AT{T}, dims)) +Random.randn(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T<:Union{AbstractFloat, + Complex{<:AbstractFloat}}} = + Random.randn!(rng, similar(AT{T}, dims)) + +# untyped: default to Float32 (matches CUDA convention; better fit than Float64 +# on consumer GPUs) +Random.rand(rng::RNG{AT}, dims::Dims) where {AT} = Random.rand(rng, Float32, dims) +Random.randn(rng::RNG{AT}, dims::Dims) where {AT} = Random.randn(rng, Float32, dims) + +# variadic dim spellings +Random.rand(rng::RNG, dim1::Integer, dims::Integer...) = + Random.rand(rng, Dims((dim1, dims...))) +Random.randn(rng::RNG, dim1::Integer, dims::Integer...) = + Random.randn(rng, Dims((dim1, dims...))) +Random.rand(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T = + Random.rand(rng, T, Dims((dim1, dims...))) +Random.randn(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T = + Random.randn(rng, T, Dims((dim1, dims...))) diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl index 5e00682c..7ed2fd56 100644 --- a/test/testsuite/random.jl +++ b/test/testsuite/random.jl @@ -1,37 +1,25 @@ @testsuite "random" (AT, eltypes)->begin rng = if AT <: AbstractGPUArray - GPUArrays.default_rng(AT) + GPUArrays.RNG{AT}() else Random.default_rng() end - cpu_rng = Random.default_rng() - - SEEDING_BROKEN = (rng != cpu_rng) && !contains(string(AT), "JLArray") @testset "rand" begin # uniform - @testset "$d $T" for T in eltypes, d in (10, (10, 10), (1024, 1024)) + @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3)) A = AT{T}(undef, d) B = copy(A) rand!(rng, A) rand!(rng, B) @test Array(A) != Array(B) - - A = AT(rand(T, d)) - B = AT(rand(T, d)) - - Random.seed!(rng) - Random.seed!(rng, 1) - rand!(rng, A) - Random.seed!(rng, 1) - rand!(rng, B) - - @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > length(rng.state)) - - if rng != cpu_rng - rand!(cpu_rng, A) - end end + # empty arrays + B = AT{Float32}(undef, 0) + rand!(rng, B) + @test isempty(Array(B)) + + # Bool coverage A = AT{Bool}(undef, 1024) fill!(A, false) rand!(rng, A) @@ -39,43 +27,54 @@ fill!(A, true) rand!(rng, A) @test false in Array(A) - - # AT of length 0 - B = AT{Float32}(undef, 0) - fill!(B, 1f0) - rand!(rng, B) - @test isempty(Array(B)) end @testset "randn" begin # normally-distributed - # XXX: randn calls sqrt, and Base's sqrt(::Complex) performs - # checked type conversions that throw boxed numbers. - @testset "$d $T" for T in filter(isrealfloattype, eltypes), d in (2, (2, 2), (1024, 1024)) + @testset "$T $d" for T in filter(isrealfloattype, eltypes), + d in (2, (2,2), (2,2,2), 3, (3,3)) A = AT{T}(undef, d) B = copy(A) randn!(rng, A) randn!(rng, B) @test Array(A) != Array(B) + end - A = AT(rand(T, d)) - B = AT(rand(T, d)) - - Random.seed!(rng) - Random.seed!(rng, 1) + # complex randn + for T in filter(t -> t <: Complex && isrealfloattype(real(t)), eltypes) + A = AT{T}(undef, 8) randn!(rng, A) - Random.seed!(rng, 1) - randn!(rng, B) - @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > (2 * length(rng.state))) - - if rng != cpu_rng - randn!(cpu_rng, A) - end + @test !any(isnan, Array(A)) end - # AT of length 0 + # empty arrays A = AT{Float32}(undef, 0) - fill!(A, 1f0) randn!(rng, A) @test isempty(Array(A)) + + # Box-Muller should not produce infinities + if Float32 in eltypes + @test isfinite(maximum(randn!(rng, AT{Float32}(undef, 2^20)))) + end + end + + @testset "seeding" begin + if AT <: AbstractGPUArray + # seeding should produce reproducible results + for T in (Float32, Float64) + if T in eltypes + Random.seed!(rng, 1) + A = rand!(rng, AT{T}(undef, 100)) + Random.seed!(rng, 1) + B = rand!(rng, AT{T}(undef, 100)) + @test Array(A) == Array(B) + + Random.seed!(rng, 1) + A = randn!(rng, AT{T}(undef, 100)) + Random.seed!(rng, 1) + B = randn!(rng, AT{T}(undef, 100)) + @test Array(A) == Array(B) + end + end + end end end