From a9db6b6b2ee64efe9afa1bf016ca03a82da80c46 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 14 Apr 2026 19:42:04 +0200
Subject: [PATCH 01/19] Replace Tausworthe+LCG RNG with stateless Philox4x32-10

Replace the hybrid Tausworthe+LCG generator with a stateless Philox4x32-10
counter-based RNG using KernelAbstractions, portable across GPU backends.

rand! uses batched kernels for common types (Float16/32/64, integers, Bool,
Complex) that produce multiple values per Philox call. A generic ElementRNG
fallback handles all other types via Julia's Random stdlib dispatch.

randn! uses specialized Box-Muller kernels with @fastmath log + sincospi,
letting each backend provide optimized intrinsics via @device_override.
These can't be made generic like rand! because (1) Random's Ziggurat needs
table lookups requiring device overlays, (2) the generic Marsaglia polar
fallback has a rejection loop causing warp divergence, and (3) direct
Box-Muller produces value pairs that must be batched to stay bandwidth-bound.

The old RNG(state::AbstractGPUArray) constructor and seed!(rng, ::Vector)
are kept as deprecated methods for backwards compatibility with existing
backends.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Project.toml                 |   2 +-
 lib/JLArrays/src/JLArrays.jl |   6 +-
 src/GPUArrays.jl             |   1 +
 src/deprecated.jl            |  12 ++
 src/host/random.jl           | 395 ++++++++++++++++++++++++++++-------
 test/testsuite/random.jl     |  78 +++----
 6 files changed, 379 insertions(+), 115 deletions(-)
 create mode 100644 src/deprecated.jl

diff --git a/Project.toml b/Project.toml
index 3b7c2e69..ccaaad35 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "GPUArrays"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "11.4.1"
+version = "11.5.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 7bba4b95..5903b444 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -542,11 +542,7 @@ using Random
 const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing)
 function GPUArrays.default_rng(::Type{<:JLArray})
     if GLOBAL_RNG[] === nothing
-        N = MAXTHREADS
-        state = JLArray{NTuple{4, UInt32}}(undef, N)
-        rng = GPUArrays.RNG(state)
-        Random.seed!(rng)
-        GLOBAL_RNG[] = rng
+        GLOBAL_RNG[] = GPUArrays.RNG()
     end
     GLOBAL_RNG[]
 end
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index a35c1ff0..17a4d897 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -38,5 +38,6 @@ include("host/statistics.jl")
 include("host/sparse.jl")
 include("host/alloc_cache.jl")
 
+include("deprecated.jl")
 
 end # module
diff --git a/src/deprecated.jl b/src/deprecated.jl
new file mode 100644
index 00000000..a088025a
--- /dev/null
+++ b/src/deprecated.jl
@@ -0,0 +1,12 @@
+# RNG: the old RNG took a GPU state array and Vector{UInt32} seeds.
+# The new stateless Philox4x32 RNG doesn't need either.
+
+function RNG(state::AbstractGPUArray)
+    Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG() instead", :RNG)
+    RNG()
+end
+
+function Random.seed!(rng::RNG, seed::Vector{UInt32})
+    Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!)
+    Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt32) : first(seed))
+end
diff --git a/src/host/random.jl b/src/host/random.jl
index 7759e841..13045cbf 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -1,114 +1,361 @@
 # integration with Random stdlib
 
-## device interface
 
-# hybrid Tausworthe and Linear Congruent generator from
-# https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch37.html
+## Philox4x32-10 counter-based RNG
 #
-# only generates Float32 or Float64 numbers, conversion happens elsewhere
+# Stateless: (counter, key) → 4 UInt32 outputs. Each unique counter gives independent
+# random values with no shared memory or global state needed.
+#
+# Reference: Salmon et al., "Parallel Random Numbers: As Easy as 1, 2, 3" (2011)
+
+const PHILOX_M4x32_0 = 0xD2511F53
+const PHILOX_M4x32_1 = 0xCD9E8D57
+const PHILOX_W32_0   = 0x9E3779B9
+const PHILOX_W32_1   = 0xBB67AE85
+
+@inline function philox4x32round(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32})
+    mul0 = widemul(PHILOX_M4x32_0, ctr[1])
+    mul1 = widemul(PHILOX_M4x32_1, ctr[3])
+    hi0 = (mul0 >> 32) % UInt32
+    hi1 = (mul1 >> 32) % UInt32
+    lo0 = mul0 % UInt32
+    lo1 = mul1 % UInt32
+    (hi1 ⊻ ctr[2] ⊻ key[1], lo1, hi0 ⊻ ctr[4] ⊻ key[2], lo0)
+end
+
+@inline function philox4x32bumpkey(key::NTuple{2,UInt32})
+    (key[1] + PHILOX_W32_0, key[2] + PHILOX_W32_1)
+end
+
+@inline function philox4x32_10(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32})
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key)
+    ctr
+end
+
+
+## Float conversions: unsigned integer → uniform float in (0, 1]
+
+@inline function u01(::Type{Float32}, u::UInt32)
+    fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
+end
+
+@inline function u01(::Type{Float64}, u::UInt64)
+    fma(Float64(u), Float64(2)^(-64), Float64(2)^(-65))
+end
+
+
+## Box-Muller transform using @fastmath log + sincospi
+#
+# Each backend provides optimized intrinsics via @device_override:
+# CUDA: __nv_fast_logf, __nv_sincospif, etc.
+
+using Base.FastMath
 
-function TausStep(z::Unsigned, S1::Integer, S2::Integer, S3::Integer, M::Unsigned)
-    b = (((z << S1) ⊻ z) >> S2)
-    return (((z & M) << S3) ⊻ b)
+@inline function boxmuller(::Type{T}, u1::T, u2::T) where T
+    r = sqrt(T(-2) * FastMath.log_fast(u1))
+    s, c = sincospi(2 * u2)
+    (r * s, r * c)
 end
 
-LCGStep(z::Unsigned, A::Unsigned, C::Unsigned) = A * z + C
 
-make_rand_num(::Type{Float64}, tmp) = 2.3283064365387e-10 * Float64(tmp)
-make_rand_num(::Type{Float32}, tmp) = 2.3283064f-10 * Float32(tmp)
-# NOTE: the rng state is often not representable as Float16, so perform this in Float32.
-make_rand_num(::Type{Float16}, tmp) = Float16(2.3283064f-10 * Float32(tmp))
+## RNG type
 
-function next_rand(state::NTuple{4, T}) where {T <: Unsigned}
-    state = (
-        TausStep(state[1], Cint(13), Cint(19), Cint(12), T(4294967294)),
-        TausStep(state[2], Cint(2), Cint(25), Cint(4), T(4294967288)),
-        TausStep(state[3], Cint(3), Cint(11), Cint(17), T(4294967280)),
-        LCGStep(state[4], T(1664525), T(1013904223))
-    )
-    tmp = (state[1] ⊻ state[2] ⊻ state[3] ⊻ state[4])
-    return state, tmp
+mutable struct RNG <: AbstractRNG
+    seed::UInt32
+    counter::UInt32
 end
 
-function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T
-    stateful_rand = next_rand(randstate[threadid])
-    randstate[threadid] = stateful_rand[1]
-    return make_rand_num(T, stateful_rand[2])
+RNG() = RNG(rand(Random.RandomDevice(), UInt32), UInt32(0))
+RNG(seed::Integer) = RNG(seed % UInt32, UInt32(0))
+
+# return an instance of GPUArrays.RNG suitable for the requested array type
+default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE
+
+Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt32); rng.counter = 0; rng)
+Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt32; rng.counter = 0; rng)
+
+function advance_counter!(rng::RNG)
+    new_counter = Int64(rng.counter) + 1
+    overflow, remainder = fldmod(new_counter, typemax(UInt32))
+    rng.seed += overflow % UInt32
+    rng.counter = remainder % UInt32
+end
+
+
+## Specialized rand! kernels for common types
+#
+# Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch
+# multiple values per work item to use all 4 outputs efficiently.
+# Types that consume 1 UInt32 each get 4 values per call; types that need
+# 2 UInt32s (UInt64/Float64) get 2 values per call; complex types that need
+# 4 UInt32s get 1 value per call.
+
+# Convert Philox UInt32 outputs to N values of type T
+@inline philox_to_vals(::Type{Float16}, a1, a2, a3, a4) =
+    (Float16(u01(Float32, a1)), Float16(u01(Float32, a2)),
+     Float16(u01(Float32, a3)), Float16(u01(Float32, a4)))
+@inline philox_to_vals(::Type{Float32}, a1, a2, a3, a4) =
+    (u01(Float32, a1), u01(Float32, a2), u01(Float32, a3), u01(Float32, a4))
+for T in (UInt8, UInt16, UInt32, Int8, Int16, Int32, Bool)
+    @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) =
+        (a1 % $T, a2 % $T, a3 % $T, a4 % $T)
+end
+@inline philox_to_vals(::Type{Float64}, a1, a2, a3, a4) =
+    (u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+     u01(Float64, UInt64(a3) | UInt64(a4) << 32))
+for T in (UInt64, Int64)
+    @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) =
+        ((UInt64(a1) | UInt64(a2) << 32) % $T,
+         (UInt64(a3) | UInt64(a4) << 32) % $T)
 end
+@inline philox_to_vals(::Type{Complex{Float32}}, a1, a2, a3, a4) =
+    (complex(u01(Float32, a1), u01(Float32, a2)),
+     complex(u01(Float32, a3), u01(Float32, a4)))
+@inline philox_to_vals(::Type{Complex{Float64}}, a1, a2, a3, a4) =
+    (complex(u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+             u01(Float64, UInt64(a3) | UInt64(a4) << 32)),)
+
+# Number of output values per Philox4x32 call
+vals_per_call(::Type{T}) where T = sizeof(T) <= 4 ? 4 : sizeof(T) <= 8 ? 2 : 1
+vals_per_call(::Type{Complex{T}}) where T = sizeof(T) <= 4 ? 2 : 1
 
-function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer
-    result = zero(T)
-    if sizeof(T) >= 4
-        for _ in 1:sizeof(T) >> 2
-            randstate[threadid], y = next_rand(randstate[threadid])
-            result = reinterpret(T, (|)(promote(result << 32, y)...))
+# Batched kernel: N values per work item from one Philox call
+@kernel function rand_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    N = vals_per_call(T)
+    idx = N * gid
+    len = length(A)
+    if idx <= len
+        vals = philox_to_vals(T, philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))...)
+        for j in 1:N
+            @inbounds A[idx - N + j] = vals[j]
+        end
+    elseif idx - N < len
+        vals = philox_to_vals(T, philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))...)
+        for j in 1:min(N, len - idx + N)
+            @inbounds A[idx - N + j] = vals[j]
         end
-    else
-        randstate[threadid], y = next_rand(randstate[threadid])
-        x = reinterpret(Int32, y)
-        result = convert(T, x & typemax(T))
     end
-    result
 end
 
-# support for complex numbers
+# Types with specialized batched kernels
+const BatchedRandTypes = Union{
+    Float16, Float32, Float64, Complex{Float32}, Complex{Float64},
+    Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64}
 
-function gpu_rand(::Type{Complex{T}}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T
-    re = gpu_rand(T, threadid, randstate)
-    im = gpu_rand(T, threadid, randstate)
-    return complex(re, im)
+function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes
+    isempty(A) && return A
+    N = vals_per_call(T)
+    rand_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A;
+                                         ndrange=cld(length(A), N))
+    advance_counter!(rng)
+    A
 end
 
 
-## host interface
+## Generic rand! fallback via ElementRNG
+#
+# For types without specialized kernels (Float16, integers, Bool, UInt128, etc.),
+# an immutable ElementRNG wraps a Philox4x32 call as an AbstractRNG. Julia's Random
+# stdlib then handles all type conversions automatically via `rand(rng, T)`.
+#
+# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer to
+# it. This avoids mutable struct heap allocation on GPU.
 
-struct RNG <: AbstractRNG
-    state::AbstractGPUArray{NTuple{4,UInt32},1}
+struct ElementRNG <: AbstractRNG
+    seed::UInt32
+    counter::UInt32
+    gid::UInt32
+    subctr_ptr::Ptr{UInt32}
 end
 
-# return an instance of GPUArrays.RNG suitable for the requested array type
-default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE
+@inline Random.rng_native_52(::ElementRNG) = UInt64
 
-make_seed(rng::RNG) = make_seed(rng, rand(UInt))
-function make_seed(rng::RNG, n::Integer)
-    rand(MersenneTwister(n), UInt32, sizeof(rng.state)÷sizeof(UInt32))
+@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt64})
+    sc = unsafe_load(rng.subctr_ptr) + UInt32(1)
+    unsafe_store!(rng.subctr_ptr, sc)
+    a1, a2, _, _ = philox4x32_10(
+        (rng.gid, sc, rng.counter, UInt32(0)),
+        (rng.seed, UInt32(0)))
+    UInt64(a1) | UInt64(a2) << 32
 end
 
-Random.seed!(rng::RNG) = Random.seed!(rng, make_seed(rng))
-Random.seed!(rng::RNG, seed::Integer) = Random.seed!(rng, make_seed(rng, seed))
-function Random.seed!(rng::RNG, seed::Vector{UInt32})
-    copyto!(rng.state, collect(reinterpret(NTuple{4, UInt32}, seed)))
-    return
+@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt128})
+    UInt128(rand(rng, Random.SamplerType{UInt64}())) |
+    UInt128(rand(rng, Random.SamplerType{UInt64}())) << 64
+end
+
+@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{T}) where T <: Union{Bool,Base.BitInteger} =
+    rand(rng, Random.SamplerType{UInt64}()) % T
+
+# @inline overrides for Random stdlib entry points that aren't inlined by default.
+# Without these, the dispatch chain stays opaque and the Ref won't be optimized.
+@inline Random.rand(rng::ElementRNG, ::Type{T}) where {T} =
+    rand(rng, Random.Sampler(typeof(rng), T, Val(1)))
+@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{Complex{T}}) where {T<:Real} =
+    complex(rand(rng, T), rand(rng, T))
+
+@kernel function rand_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    if gid <= length(A)
+        subctr = Ref{UInt32}(0)
+        rng = ElementRNG(seed, counter, gid % UInt32,
+                          Base.unsafe_convert(Ptr{UInt32}, subctr))
+        @inbounds A[gid] = rand(rng, T)
+    end
 end
 
 function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
     isempty(A) && return A
-    @kernel function rand!(a, randstate)
-        idx = @index(Global, Linear)
-        @inbounds a[idx] = gpu_rand(T, ((idx-1)%length(randstate)+1), randstate)
+    rand_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A))
+    advance_counter!(rng)
+    A
+end
+
+
+## randn! kernels
+#
+# Unlike rand!, randn! uses specialized batched kernels instead of the generic
+# ElementRNG approach. This is because:
+#
+# 1. Random's default `randn` uses the Ziggurat algorithm with table lookups
+#    (`ki`, `wi`, `fi` arrays) that aren't accessible on GPU without device
+#    overlays (which are backend-specific, not available in KernelAbstractions).
+#
+# 2. The generic fallback `randn(rng, T::AbstractFloat)` uses Marsaglia's polar
+#    Box-Muller variant with a rejection loop (`while true ... 0 < s < 1`),
+#    which causes severe warp divergence on GPU (~2x slower).
+#
+# 3. Direct (non-rejection) Box-Muller naturally produces value *pairs* from
+#    sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) is key
+#    to staying memory-bandwidth-bound; a 1-per-work-item generic kernel
+#    discards half the output.
+#
+# Box-Muller: each Philox call produces 2 normal values from 2 uniform values.
+# For <=32-bit floats: 4 values per call (2 Box-Muller pairs from 4 UInt32).
+# For >32-bit floats: 2 values per call (1 Box-Muller pair from 2 UInt64).
+
+# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)),
+# so each component has variance 1/2
+@inline function boxmuller_complex(::Type{T}, u1::T, u2::T) where T
+    r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1)))
+    s, c = sincospi(2 * u2)
+    complex(r * s, r * c)
+end
+
+@kernel function randn_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    idx = 4 * gid
+    len = length(A)
+    if idx <= len
+        a1, a2, a3, a4 = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
+        n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
+        @inbounds A[idx - 3] = n1
+        @inbounds A[idx - 2] = n2
+        @inbounds A[idx - 1] = n3
+        @inbounds A[idx]     = n4
+    elseif idx - 3 <= len
+        a1, a2, a3, a4 = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
+        n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
+        vals = (n1, n2, n3, n4)
+        for j in 1:min(4, len - idx + 4)
+            @inbounds A[idx - 4 + j] = vals[j]
+        end
+    end
+end
+
+@kernel function randn_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    idx = 2 * gid
+    len = length(A)
+    if idx <= len
+        a1, a2, a3, a4 = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        n1, n2 = boxmuller(T,
+            T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)),
+            T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)))
+        @inbounds A[idx - 1] = n1
+        @inbounds A[idx]     = n2
+    elseif idx - 1 <= len
+        a1, a2, a3, a4 = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        n1, _ = boxmuller(T,
+            T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)),
+            T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)))
+        @inbounds A[len] = n1
+    end
+end
+
+@kernel function randn_complex_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T
+    gid = @index(Global, Linear)
+    idx = 2 * gid
+    len = length(A)
+    if idx <= len
+        a1, a2, a3, a4 = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        @inbounds A[idx - 1] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
+        @inbounds A[idx]     = boxmuller_complex(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
+    elseif idx - 1 <= len
+        a1, a2, _, _ = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        @inbounds A[len] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
+    end
+end
+
+@kernel function randn_complex_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T
+    gid = @index(Global, Linear)
+    if gid <= length(A)
+        a1, a2, a3, a4 = philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            (seed, UInt32(0)))
+        U1 = T(u01(Float64, UInt64(a1) | UInt64(a2) << 32))
+        U2 = T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))
+        @inbounds A[gid] = boxmuller_complex(T, U1, U2)
+    end
+end
+
+function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: AbstractFloat
+    isempty(A) && return A
+    if sizeof(T) <= 4
+        randn_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 4))
+    else
+        randn_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2))
     end
-    rand!(get_backend(A))(A, rng.state; ndrange = size(A))
+    advance_counter!(rng)
     A
 end
 
-function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
+function Random.randn!(rng::RNG, A::AnyGPUArray{Complex{T}}) where T <: AbstractFloat
     isempty(A) && return A
-    threads = (length(A) - 1) ÷ 2 + 1
-    @kernel function randn!(a, randstates)
-        i = @index(Global, Linear)
-        threadidx = @index(Local, Linear)
-        idx = 2*(i - 1) + 1
-        U1 = gpu_rand(T, threadidx, randstates)
-        U2 = gpu_rand(T, threadidx, randstates)
-        Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2)
-        Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2)
-        @inbounds a[idx] = Z0
-        if idx + 1 <= length(a)
-            @inbounds a[idx + 1] = Z1
-        end
+    if sizeof(T) <= 4
+        randn_complex_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2))
+    else
+        randn_complex_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A))
     end
-    kernel = randn!(get_backend(A))
-    kernel(A, rng.state; ndrange=threads)
+    advance_counter!(rng)
     A
 end
 
diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl
index 5e00682c..b688dbde 100644
--- a/test/testsuite/random.jl
+++ b/test/testsuite/random.jl
@@ -6,32 +6,25 @@
     end
     cpu_rng = Random.default_rng()
 
-    SEEDING_BROKEN = (rng != cpu_rng) && !contains(string(AT), "JLArray")
-
     @testset "rand" begin  # uniform
-        @testset "$d $T" for T in eltypes, d in (10, (10, 10), (1024, 1024))
+        @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3))
             A = AT{T}(undef, d)
             B = copy(A)
             rand!(rng, A)
             rand!(rng, B)
             @test Array(A) != Array(B)
 
-            A = AT(rand(T, d))
-            B = AT(rand(T, d))
-
-            Random.seed!(rng)
-            Random.seed!(rng, 1)
-            rand!(rng, A)
-            Random.seed!(rng, 1)
-            rand!(rng, B)
-
-            @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > length(rng.state))
-
             if rng != cpu_rng
                 rand!(cpu_rng, A)
             end
         end
 
+        # empty arrays
+        B = AT{Float32}(undef, 0)
+        rand!(rng, B)
+        @test isempty(Array(B))
+
+        # Bool coverage
         A = AT{Bool}(undef, 1024)
         fill!(A, false)
         rand!(rng, A)
@@ -39,43 +32,58 @@
         fill!(A, true)
         rand!(rng, A)
         @test false in Array(A)
-
-        # AT of length 0
-        B = AT{Float32}(undef, 0)
-        fill!(B, 1f0)
-        rand!(rng, B)
-        @test isempty(Array(B))
     end
 
     @testset "randn" begin  # normally-distributed
-        # XXX: randn calls sqrt, and Base's sqrt(::Complex) performs
-        #      checked type conversions that throw boxed numbers.
-        @testset "$d $T" for T in filter(isrealfloattype, eltypes), d in (2, (2, 2), (1024, 1024))
+        @testset "$T $d" for T in filter(isrealfloattype, eltypes),
+                              d in (2, (2,2), (2,2,2), 3, (3,3))
             A = AT{T}(undef, d)
             B = copy(A)
             randn!(rng, A)
             randn!(rng, B)
             @test Array(A) != Array(B)
 
-            A = AT(rand(T, d))
-            B = AT(rand(T, d))
-
-            Random.seed!(rng)
-            Random.seed!(rng, 1)
-            randn!(rng, A)
-            Random.seed!(rng, 1)
-            randn!(rng, B)
-            @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > (2 * length(rng.state)))
-
             if rng != cpu_rng
                 randn!(cpu_rng, A)
             end
         end
 
-        # AT of length 0
+        # complex randn
+        for T in filter(t -> t <: Complex && isrealfloattype(real(t)), eltypes)
+            A = AT{T}(undef, 8)
+            randn!(rng, A)
+            @test !any(isnan, Array(A))
+        end
+
+        # empty arrays
         A = AT{Float32}(undef, 0)
-        fill!(A, 1f0)
         randn!(rng, A)
         @test isempty(Array(A))
+
+        # Box-Muller should not produce infinities
+        if Float32 in eltypes
+            @test isfinite(maximum(randn!(rng, AT{Float32}(undef, 2^20))))
+        end
+    end
+
+    @testset "seeding" begin
+        if rng != cpu_rng
+            # seeding should produce reproducible results
+            for T in (Float32, Float64)
+                if T in eltypes
+                    Random.seed!(rng, 1)
+                    A = rand!(rng, AT{T}(undef, 100))
+                    Random.seed!(rng, 1)
+                    B = rand!(rng, AT{T}(undef, 100))
+                    @test Array(A) == Array(B)
+
+                    Random.seed!(rng, 1)
+                    A = randn!(rng, AT{T}(undef, 100))
+                    Random.seed!(rng, 1)
+                    B = randn!(rng, AT{T}(undef, 100))
+                    @test Array(A) == Array(B)
+                end
+            end
+        end
     end
 end

From 1546bc8731dce7759073d01cfe152851457ea134 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:07:31 +0200
Subject: [PATCH 02/19] Simplify advance_counter! to use native UInt32 wrap

Previously used fldmod with typemax(UInt32) = 2^32 - 1, which wraps the
counter to 1 (not 0) when it saturates. Not a correctness issue today
since we only advance by 1, but the logic was confusing. Replace with
a plain increment that lets UInt32 overflow naturally.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 13045cbf..d007cc12 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -1,8 +1,7 @@
 # integration with Random stdlib
 
-
 ## Philox4x32-10 counter-based RNG
-#
+
 # Stateless: (counter, key) → 4 UInt32 outputs. Each unique counter gives independent
 # random values with no shared memory or global state needed.
 #
@@ -54,7 +53,7 @@ end
 
 
 ## Box-Muller transform using @fastmath log + sincospi
-#
+
 # Each backend provides optimized intrinsics via @device_override:
 # CUDA: __nv_fast_logf, __nv_sincospif, etc.
 
@@ -84,15 +83,14 @@ Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt32); rng.co
 Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt32; rng.counter = 0; rng)
 
 function advance_counter!(rng::RNG)
-    new_counter = Int64(rng.counter) + 1
-    overflow, remainder = fldmod(new_counter, typemax(UInt32))
-    rng.seed += overflow % UInt32
-    rng.counter = remainder % UInt32
+    rng.counter += one(UInt32)
+    rng.counter == 0 && (rng.seed += one(UInt32))
+    rng
 end
 
 
 ## Specialized rand! kernels for common types
-#
+
 # Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch
 # multiple values per work item to use all 4 outputs efficiently.
 # Types that consume 1 UInt32 each get 4 values per call; types that need
@@ -167,7 +165,7 @@ end
 
 
 ## Generic rand! fallback via ElementRNG
-#
+
 # For types without specialized kernels (Float16, integers, Bool, UInt128, etc.),
 # an immutable ElementRNG wraps a Philox4x32 call as an AbstractRNG. Julia's Random
 # stdlib then handles all type conversions automatically via `rand(rng, T)`.
@@ -227,7 +225,7 @@ end
 
 
 ## randn! kernels
-#
+
 # Unlike rand!, randn! uses specialized batched kernels instead of the generic
 # ElementRNG approach. This is because:
 #
@@ -359,6 +357,9 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{Complex{T}}) where T <: Abstract
     A
 end
 
+
+## host API
+
 # allow use of CPU RNGs without scalar iteration
 Random.rand!(rng::AbstractRNG, A::AnyGPUArray) =
     copyto!(A, rand(rng, eltype(A), size(A)...))

From 71adaa2ad2e074d7ffd1ddc4a66c123111caac74 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:08:23 +0200
Subject: [PATCH 03/19] Widen RNG seed to UInt64 for full Philox key entropy

Previously the seed was UInt32 and the Philox key's upper lane was
always zero, so different 64-bit seeds colliding mod 2^32 produced
identical streams. Store the seed as UInt64 and split both halves
into the key's two lanes via a new philox_key helper.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/deprecated.jl  |  2 +-
 src/host/random.jl | 37 ++++++++++++++++++++-----------------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/deprecated.jl b/src/deprecated.jl
index a088025a..62ef634f 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -8,5 +8,5 @@ end
 
 function Random.seed!(rng::RNG, seed::Vector{UInt32})
     Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!)
-    Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt32) : first(seed))
+    Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt64) : first(seed))
 end
diff --git a/src/host/random.jl b/src/host/random.jl
index d007cc12..fe054fc4 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -69,25 +69,28 @@ end
 ## RNG type
 
 mutable struct RNG <: AbstractRNG
-    seed::UInt32
+    seed::UInt64
     counter::UInt32
 end
 
-RNG() = RNG(rand(Random.RandomDevice(), UInt32), UInt32(0))
-RNG(seed::Integer) = RNG(seed % UInt32, UInt32(0))
+RNG() = RNG(rand(Random.RandomDevice(), UInt64), UInt32(0))
+RNG(seed::Integer) = RNG(seed % UInt64, UInt32(0))
 
 # return an instance of GPUArrays.RNG suitable for the requested array type
 default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE
 
-Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt32); rng.counter = 0; rng)
-Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt32; rng.counter = 0; rng)
+Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt64); rng.counter = 0; rng)
+Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt64; rng.counter = 0; rng)
 
 function advance_counter!(rng::RNG)
     rng.counter += one(UInt32)
-    rng.counter == 0 && (rng.seed += one(UInt32))
+    rng.counter == 0 && (rng.seed += one(UInt64))
     rng
 end
 
+# Split the 64-bit seed into the two 32-bit lanes of the Philox key.
+@inline philox_key(seed::UInt64) = (seed % UInt32, (seed >> 32) % UInt32)
+
 
 ## Specialized rand! kernels for common types
 
@@ -135,14 +138,14 @@ vals_per_call(::Type{Complex{T}}) where T = sizeof(T) <= 4 ? 2 : 1
     if idx <= len
         vals = philox_to_vals(T, philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))...)
+            philox_key(seed))...)
         for j in 1:N
             @inbounds A[idx - N + j] = vals[j]
         end
     elseif idx - N < len
         vals = philox_to_vals(T, philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))...)
+            philox_key(seed))...)
         for j in 1:min(N, len - idx + N)
             @inbounds A[idx - N + j] = vals[j]
         end
@@ -174,7 +177,7 @@ end
 # it. This avoids mutable struct heap allocation on GPU.
 
 struct ElementRNG <: AbstractRNG
-    seed::UInt32
+    seed::UInt64
     counter::UInt32
     gid::UInt32
     subctr_ptr::Ptr{UInt32}
@@ -187,7 +190,7 @@ end
     unsafe_store!(rng.subctr_ptr, sc)
     a1, a2, _, _ = philox4x32_10(
         (rng.gid, sc, rng.counter, UInt32(0)),
-        (rng.seed, UInt32(0)))
+        philox_key(rng.seed))
     UInt64(a1) | UInt64(a2) << 32
 end
 
@@ -261,7 +264,7 @@ end
     if idx <= len
         a1, a2, a3, a4 = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
         n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
         @inbounds A[idx - 3] = n1
@@ -271,7 +274,7 @@ end
     elseif idx - 3 <= len
         a1, a2, a3, a4 = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
         n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
         vals = (n1, n2, n3, n4)
@@ -288,7 +291,7 @@ end
     if idx <= len
         a1, a2, a3, a4 = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         n1, n2 = boxmuller(T,
             T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)),
             T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)))
@@ -297,7 +300,7 @@ end
     elseif idx - 1 <= len
         a1, a2, a3, a4 = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         n1, _ = boxmuller(T,
             T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)),
             T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)))
@@ -312,13 +315,13 @@ end
     if idx <= len
         a1, a2, a3, a4 = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         @inbounds A[idx - 1] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
         @inbounds A[idx]     = boxmuller_complex(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
     elseif idx - 1 <= len
         a1, a2, _, _ = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         @inbounds A[len] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
     end
 end
@@ -328,7 +331,7 @@ end
     if gid <= length(A)
         a1, a2, a3, a4 = philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            (seed, UInt32(0)))
+            philox_key(seed))
         U1 = T(u01(Float64, UInt64(a1) | UInt64(a2) << 32))
         U2 = T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))
         @inbounds A[gid] = boxmuller_complex(T, U1, U2)

From 1affc0923bdde7a03dc64408d34074dc136a41bb Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:09:28 +0200
Subject: [PATCH 04/19] Use bit-pattern trick for u01(Float64, UInt64)

Construct the Float64 bit-pattern directly instead of calling the
expensive Float64(::UInt64) conversion and a subsequent fma. On consumer
GPUs with 1:64 FP64 throughput this is a ~40x speedup for rand!(F64) on
a 100M-element array (0.9 vs 38 ms on an RTX 5080, now memory-bound).

Force the low mantissa bit so the result is strictly in (0, 1), which is
required by Box-Muller's log(u).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index fe054fc4..dfa53760 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -48,7 +48,11 @@ end
 end
 
 @inline function u01(::Type{Float64}, u::UInt64)
-    fma(Float64(u), Float64(2)^(-64), Float64(2)^(-65))
+    # Bit-pattern construction avoids the expensive Float64(::UInt64) conversion
+    # and fma on consumer GPUs (where FP64 throughput is as low as 1:64). The
+    # low bit of the mantissa is forced set so the result is strictly in (0, 1),
+    # which is required by Box-Muller's log(u).
+    reinterpret(Float64, ((u >> 12) | 0x1) | 0x3ff0000000000000) - 1.0
 end
 
 

From f34d77687aa8e6d0e5def390d9f7a7f4bf6d1611 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:11:14 +0200
Subject: [PATCH 05/19] Unify randn! kernels via philox_to_normals

Replace the four randn_*_kernel!'s with a single randn_batched_kernel!
mirroring rand_batched_kernel!, parameterized on the element type via a
philox_to_normals helper (analogous to philox_to_vals). Same performance,
half the kernel code.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 139 +++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 93 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index dfa53760..0f420246 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -231,10 +231,9 @@ function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
 end
 
 
-## randn! kernels
+## randn! kernel
 
-# Unlike rand!, randn! uses specialized batched kernels instead of the generic
-# ElementRNG approach. This is because:
+# Unlike rand!, randn! can't use the generic ElementRNG approach:
 #
 # 1. Random's default `randn` uses the Ziggurat algorithm with table lookups
 #    (`ki`, `wi`, `fi` arrays) that aren't accessible on GPU without device
@@ -249,117 +248,71 @@ end
 #    to staying memory-bandwidth-bound; a 1-per-work-item generic kernel
 #    discards half the output.
 #
-# Box-Muller: each Philox call produces 2 normal values from 2 uniform values.
-# For <=32-bit floats: 4 values per call (2 Box-Muller pairs from 4 UInt32).
-# For >32-bit floats: 2 values per call (1 Box-Muller pair from 2 UInt64).
+# Instead, reuse the batched kernel structure from rand!: one kernel parametric
+# over the element type, with `philox_to_normals` producing N values per call
+# (matching `vals_per_call(T)`).
 
-# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)),
-# so each component has variance 1/2
+# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)), so each component
+# has variance 1/2.
 @inline function boxmuller_complex(::Type{T}, u1::T, u2::T) where T
     r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1)))
     s, c = sincospi(2 * u2)
     complex(r * s, r * c)
 end
 
-@kernel function randn_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
-    gid = @index(Global, Linear)
-    idx = 4 * gid
-    len = length(A)
-    if idx <= len
-        a1, a2, a3, a4 = philox4x32_10(
-            (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
-        n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
-        @inbounds A[idx - 3] = n1
-        @inbounds A[idx - 2] = n2
-        @inbounds A[idx - 1] = n3
-        @inbounds A[idx]     = n4
-    elseif idx - 3 <= len
-        a1, a2, a3, a4 = philox4x32_10(
-            (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        n1, n2 = boxmuller(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
-        n3, n4 = boxmuller(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
-        vals = (n1, n2, n3, n4)
-        for j in 1:min(4, len - idx + 4)
-            @inbounds A[idx - 4 + j] = vals[j]
-        end
+# Convert Philox UInt32 outputs to N normally-distributed values of type T.
+for T in (Float16, Float32)
+    @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4)
+        n1, n2 = boxmuller($T, $T(u01(Float32, a1)), $T(u01(Float32, a2)))
+        n3, n4 = boxmuller($T, $T(u01(Float32, a3)), $T(u01(Float32, a4)))
+        (n1, n2, n3, n4)
     end
 end
-
-@kernel function randn_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
-    gid = @index(Global, Linear)
-    idx = 2 * gid
-    len = length(A)
-    if idx <= len
-        a1, a2, a3, a4 = philox4x32_10(
-            (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        n1, n2 = boxmuller(T,
-            T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)),
-            T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)))
-        @inbounds A[idx - 1] = n1
-        @inbounds A[idx]     = n2
-    elseif idx - 1 <= len
-        a1, a2, a3, a4 = philox4x32_10(
-            (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        n1, _ = boxmuller(T,
-            T(u01(Float64, UInt64(a1) | UInt64(a2) << 32)),
-            T(u01(Float64, UInt64(a3) | UInt64(a4) << 32)))
-        @inbounds A[len] = n1
-    end
+@inline function philox_to_normals(::Type{Float64}, a1, a2, a3, a4)
+    boxmuller(Float64,
+        u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+        u01(Float64, UInt64(a3) | UInt64(a4) << 32))
+end
+@inline function philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4)
+    (boxmuller_complex(Float32, u01(Float32, a1), u01(Float32, a2)),
+     boxmuller_complex(Float32, u01(Float32, a3), u01(Float32, a4)))
+end
+@inline function philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4)
+    (boxmuller_complex(Float64,
+        u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+        u01(Float64, UInt64(a3) | UInt64(a4) << 32)),)
 end
 
-@kernel function randn_complex_small_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T
+@kernel function randn_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
     gid = @index(Global, Linear)
-    idx = 2 * gid
+    N = vals_per_call(T)
+    idx = N * gid
     len = length(A)
     if idx <= len
-        a1, a2, a3, a4 = philox4x32_10(
-            (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        @inbounds A[idx - 1] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
-        @inbounds A[idx]     = boxmuller_complex(T, T(u01(Float32, a3)), T(u01(Float32, a4)))
-    elseif idx - 1 <= len
-        a1, a2, _, _ = philox4x32_10(
+        vals = philox_to_normals(T, philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        @inbounds A[len] = boxmuller_complex(T, T(u01(Float32, a1)), T(u01(Float32, a2)))
-    end
-end
-
-@kernel function randn_complex_large_kernel!(@Const(seed), @Const(counter), A::AbstractArray{Complex{T}}) where T
-    gid = @index(Global, Linear)
-    if gid <= length(A)
-        a1, a2, a3, a4 = philox4x32_10(
+            philox_key(seed))...)
+        for j in 1:N
+            @inbounds A[idx - N + j] = vals[j]
+        end
+    elseif idx - N < len
+        vals = philox_to_normals(T, philox4x32_10(
             (gid % UInt32, UInt32(0), counter, UInt32(0)),
-            philox_key(seed))
-        U1 = T(u01(Float64, UInt64(a1) | UInt64(a2) << 32))
-        U2 = T(u01(Float64, UInt64(a3) | UInt64(a4) << 32))
-        @inbounds A[gid] = boxmuller_complex(T, U1, U2)
+            philox_key(seed))...)
+        for j in 1:min(N, len - idx + N)
+            @inbounds A[idx - N + j] = vals[j]
+        end
     end
 end
 
-function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: AbstractFloat
-    isempty(A) && return A
-    if sizeof(T) <= 4
-        randn_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 4))
-    else
-        randn_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2))
-    end
-    advance_counter!(rng)
-    A
-end
+const BatchedRandnTypes = Union{Float16, Float32, Float64,
+                                Complex{Float32}, Complex{Float64}}
 
-function Random.randn!(rng::RNG, A::AnyGPUArray{Complex{T}}) where T <: AbstractFloat
+function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes
     isempty(A) && return A
-    if sizeof(T) <= 4
-        randn_complex_small_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=cld(length(A), 2))
-    else
-        randn_complex_large_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A))
-    end
+    N = vals_per_call(T)
+    randn_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A;
+                                          ndrange=cld(length(A), N))
     advance_counter!(rng)
     A
 end

From 7251da1d86f9e7a4d1615d4a77da7978645a0924 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:12:20 +0200
Subject: [PATCH 06/19] Add Int128/UInt128 to the batched rand! path

vals_per_call already returned 1 for 16-byte types; add the
corresponding philox_to_vals method and extend the union so rand!
dispatches to the batched kernel rather than the generic fallback.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 0f420246..007d9637 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -122,6 +122,10 @@ for T in (UInt64, Int64)
         ((UInt64(a1) | UInt64(a2) << 32) % $T,
          (UInt64(a3) | UInt64(a4) << 32) % $T)
 end
+for T in (UInt128, Int128)
+    @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) =
+        ((UInt128(a1) | UInt128(a2) << 32 | UInt128(a3) << 64 | UInt128(a4) << 96) % $T,)
+end
 @inline philox_to_vals(::Type{Complex{Float32}}, a1, a2, a3, a4) =
     (complex(u01(Float32, a1), u01(Float32, a2)),
      complex(u01(Float32, a3), u01(Float32, a4)))
@@ -159,7 +163,8 @@ end
 # Types with specialized batched kernels
 const BatchedRandTypes = Union{
     Float16, Float32, Float64, Complex{Float32}, Complex{Float64},
-    Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64}
+    Bool, Int8, Int16, Int32, Int64, Int128,
+    UInt8, UInt16, UInt32, UInt64, UInt128}
 
 function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes
     isempty(A) && return A

From a295992fe882080f0146fe567b9d8b0c3f10c21d Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:14:35 +0200
Subject: [PATCH 07/19] Fold boxmuller_complex into boxmuller via type dispatch

Both variants had the same (::Type{T}, u1, u2) signature with different
output types; dispatch on Complex{T} vs T instead of using a separate
name, and keep both methods next to each other at the top of the file.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 007d9637..7094815f 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -63,12 +63,20 @@ end
 
 using Base.FastMath
 
-@inline function boxmuller(::Type{T}, u1::T, u2::T) where T
+@inline function boxmuller(::Type{T}, u1::T, u2::T) where T <: AbstractFloat
     r = sqrt(T(-2) * FastMath.log_fast(u1))
     s, c = sincospi(2 * u2)
     (r * s, r * c)
 end
 
+# For complex normals each component has variance 1/2, so the radius is
+# sqrt(-log(U)) rather than sqrt(-2*log(U)).
+@inline function boxmuller(::Type{Complex{T}}, u1::T, u2::T) where T <: AbstractFloat
+    r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1)))
+    s, c = sincospi(2 * u2)
+    complex(r * s, r * c)
+end
+
 
 ## RNG type
 
@@ -257,14 +265,6 @@ end
 # over the element type, with `philox_to_normals` producing N values per call
 # (matching `vals_per_call(T)`).
 
-# Box-Muller for complex: sqrt(-log(U)) not sqrt(-2*log(U)), so each component
-# has variance 1/2.
-@inline function boxmuller_complex(::Type{T}, u1::T, u2::T) where T
-    r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1)))
-    s, c = sincospi(2 * u2)
-    complex(r * s, r * c)
-end
-
 # Convert Philox UInt32 outputs to N normally-distributed values of type T.
 for T in (Float16, Float32)
     @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4)
@@ -279,11 +279,11 @@ end
         u01(Float64, UInt64(a3) | UInt64(a4) << 32))
 end
 @inline function philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4)
-    (boxmuller_complex(Float32, u01(Float32, a1), u01(Float32, a2)),
-     boxmuller_complex(Float32, u01(Float32, a3), u01(Float32, a4)))
+    (boxmuller(Complex{Float32}, u01(Float32, a1), u01(Float32, a2)),
+     boxmuller(Complex{Float32}, u01(Float32, a3), u01(Float32, a4)))
 end
 @inline function philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4)
-    (boxmuller_complex(Float64,
+    (boxmuller(Complex{Float64},
         u01(Float64, UInt64(a1) | UInt64(a2) << 32),
         u01(Float64, UInt64(a3) | UInt64(a4) << 32)),)
 end

From 77b713c9e3383a5f0e926fc62b04c07ef0259bb7 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:24:03 +0200
Subject: [PATCH 08/19] Drop CPU-RNG-on-GPU-array fallback

Allowed any AbstractRNG to fill a GPUArray by generating values on the
CPU and copying them over. That hides accidentally-slow code paths;
let unsupported (rng, eltype) combinations fail explicitly via
MethodError instead.

Also drop the matching cpu_rng exercise from the test suite.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl       |  7 -------
 test/testsuite/random.jl | 11 +----------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 7094815f..b8b510b9 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -323,10 +323,3 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes
 end
 
 
-## host API
-
-# allow use of CPU RNGs without scalar iteration
-Random.rand!(rng::AbstractRNG, A::AnyGPUArray) =
-    copyto!(A, rand(rng, eltype(A), size(A)...))
-Random.randn!(rng::AbstractRNG, A::AnyGPUArray) =
-    copyto!(A, randn(rng, eltype(A), size(A)...))
diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl
index b688dbde..17c3875e 100644
--- a/test/testsuite/random.jl
+++ b/test/testsuite/random.jl
@@ -4,7 +4,6 @@
     else
         Random.default_rng()
     end
-    cpu_rng = Random.default_rng()
 
     @testset "rand" begin  # uniform
         @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3))
@@ -13,10 +12,6 @@
             rand!(rng, A)
             rand!(rng, B)
             @test Array(A) != Array(B)
-
-            if rng != cpu_rng
-                rand!(cpu_rng, A)
-            end
         end
 
         # empty arrays
@@ -42,10 +37,6 @@
             randn!(rng, A)
             randn!(rng, B)
             @test Array(A) != Array(B)
-
-            if rng != cpu_rng
-                randn!(cpu_rng, A)
-            end
         end
 
         # complex randn
@@ -67,7 +58,7 @@
     end
 
     @testset "seeding" begin
-        if rng != cpu_rng
+        if AT <: AbstractGPUArray
             # seeding should produce reproducible results
             for T in (Float32, Float64)
                 if T in eltypes

From a631a57ad057f2181a07bec784546d9c8a59142b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:27:45 +0200
Subject: [PATCH 09/19] Add generic randn! fallback via ElementRNG

Mirrors the rand! structure: batched kernel for Float16/32/64 and their
complex variants, generic ElementRNG-based kernel for every other
AbstractFloat / Complex{<:AbstractFloat}. Non-batched types go through
Random's generic polar Box-Muller fallback, which is GPU-safe (ziggurat
is never reached because its signatures cover only Float16/32/64, which
dispatch to the batched kernel).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/host/random.jl b/src/host/random.jl
index b8b510b9..b8f04faa 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -323,3 +323,35 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes
 end
 
 
+## Generic randn! fallback via ElementRNG
+#
+# For AbstractFloats outside BatchedRandnTypes (BFloat16, user-defined float
+# types, etc.) we route through Random's `randn(rng, T)`. The reachable methods
+# from there are:
+#
+# - `randn(rng, ::BitFloatType)` (Float16/32/64) — ziggurat, uses global `wi`
+#   /`ki`/`fi` tables that aren't device-accessible. Never reached here because
+#   those types dispatch to the batched kernel above.
+# - `randn(rng, ::Type{Complex{T}})` — recurses into `randn(rng, T)`.
+# - `randn(rng, ::Type{T}) where T<:AbstractFloat` — Marsaglia polar Box-Muller
+#   rejection loop. GPU-safe (only calls `rand(rng, T)`) but warp-divergent.
+
+@kernel function randn_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    if gid <= length(A)
+        subctr = Ref{UInt32}(0)
+        rng = ElementRNG(seed, counter, gid % UInt32,
+                          Base.unsafe_convert(Ptr{UInt32}, subctr))
+        @inbounds A[gid] = randn(rng, T)
+    end
+end
+
+function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Union{AbstractFloat,
+                                                                     Complex{<:AbstractFloat}}
+    isempty(A) && return A
+    randn_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A))
+    advance_counter!(rng)
+    A
+end
+
+

From 7a901d5cdfc4de32497759673813d71b38e62a08 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:31:25 +0200
Subject: [PATCH 10/19] Refresh random.jl comments

- u01 no longer claims "(0, 1]"; the Float64 path is strictly (0, 1).
- The generic rand! fallback comment listed types that have since been
  moved into BatchedRandTypes.
- The randn! intro claimed randn! couldn't use ElementRNG; there's now a
  generic ElementRNG fallback too. Reframe the three-point rationale as
  "why the batched path exists" rather than "why there's no generic path".
- Clarify the per-call batching table to cover Int128/UInt128 and
  Complex{Float32} explicitly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 49 +++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index b8f04faa..229318f8 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -41,7 +41,8 @@ end
 end
 
 
-## Float conversions: unsigned integer → uniform float in (0, 1]
+## Float conversions: unsigned integer → uniform float, strictly positive
+## (Float32 can round up to exactly 1.0; Float64 stays strictly below 1.0.)
 
 @inline function u01(::Type{Float32}, u::UInt32)
     fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
@@ -107,10 +108,10 @@ end
 ## Specialized rand! kernels for common types
 
 # Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch
-# multiple values per work item to use all 4 outputs efficiently.
-# Types that consume 1 UInt32 each get 4 values per call; types that need
-# 2 UInt32s (UInt64/Float64) get 2 values per call; complex types that need
-# 4 UInt32s get 1 value per call.
+# multiple values per work item to use all 4 outputs efficiently:
+# - ≤4-byte types (including Float16): 4 values per call
+# - 8-byte types (Int64/UInt64/Float64/Complex{Float32}): 2 values per call
+# - 16-byte types (Int128/UInt128/Complex{Float64}):     1 value per call
 
 # Convert Philox UInt32 outputs to N values of type T
 @inline philox_to_vals(::Type{Float16}, a1, a2, a3, a4) =
@@ -186,12 +187,13 @@ end
 
 ## Generic rand! fallback via ElementRNG
 
-# For types without specialized kernels (Float16, integers, Bool, UInt128, etc.),
-# an immutable ElementRNG wraps a Philox4x32 call as an AbstractRNG. Julia's Random
-# stdlib then handles all type conversions automatically via `rand(rng, T)`.
+# For Number types outside BatchedRandTypes (BigFloat, FixedPointNumbers,
+# user-defined types, ...), an immutable ElementRNG wraps a Philox4x32 call as
+# an AbstractRNG, and Julia's Random stdlib handles the type conversion via
+# `rand(rng, T)`.
 #
-# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer to
-# it. This avoids mutable struct heap allocation on GPU.
+# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer
+# to it. This avoids mutable struct heap allocation on GPU.
 
 struct ElementRNG <: AbstractRNG
     seed::UInt64
@@ -244,26 +246,23 @@ function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
 end
 
 
-## randn! kernel
+## Specialized randn! kernel for common float types
 
-# Unlike rand!, randn! can't use the generic ElementRNG approach:
+# Float16/32/64 and their complex variants go through a batched kernel mirroring
+# rand_batched_kernel!, rather than the generic ElementRNG path further down:
 #
-# 1. Random's default `randn` uses the Ziggurat algorithm with table lookups
-#    (`ki`, `wi`, `fi` arrays) that aren't accessible on GPU without device
-#    overlays (which are backend-specific, not available in KernelAbstractions).
+# 1. Random's `randn(rng, ::Float{16,32,64})` uses the Ziggurat algorithm with
+#    global table lookups (`ki`, `wi`, `fi`), which aren't device-accessible
+#    from a KernelAbstractions kernel.
 #
-# 2. The generic fallback `randn(rng, T::AbstractFloat)` uses Marsaglia's polar
-#    Box-Muller variant with a rejection loop (`while true ... 0 < s < 1`),
-#    which causes severe warp divergence on GPU (~2x slower).
+# 2. The polar Box-Muller fallback reachable for other AbstractFloat types has
+#    a rejection loop (`while true ... 0 < s < 1`), causing warp divergence
+#    (~2x slower).
 #
 # 3. Direct (non-rejection) Box-Muller naturally produces value *pairs* from
-#    sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) is key
-#    to staying memory-bandwidth-bound; a 1-per-work-item generic kernel
-#    discards half the output.
-#
-# Instead, reuse the batched kernel structure from rand!: one kernel parametric
-# over the element type, with `philox_to_normals` producing N values per call
-# (matching `vals_per_call(T)`).
+#    sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) keeps us
+#    memory-bandwidth-bound; a 1-per-work-item kernel would discard half the
+#    output.
 
 # Convert Philox UInt32 outputs to N normally-distributed values of type T.
 for T in (Float16, Float32)

From 7279ed05194fbc1d121a76bb08551fb4f5e76b16 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:33:48 +0200
Subject: [PATCH 11/19] Pull ElementRNG into its own section

ElementRNG and its Random stdlib dispatch methods are shared between the
generic rand! and randn! fallbacks; hoist them above both kernels rather
than living inside the "Generic rand! fallback" section.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 229318f8..1dc4b72e 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -185,15 +185,15 @@ function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes
 end
 
 
-## Generic rand! fallback via ElementRNG
+## ElementRNG: a device-side AbstractRNG for generic rand/randn fallbacks
 
-# For Number types outside BatchedRandTypes (BigFloat, FixedPointNumbers,
-# user-defined types, ...), an immutable ElementRNG wraps a Philox4x32 call as
-# an AbstractRNG, and Julia's Random stdlib handles the type conversion via
-# `rand(rng, T)`.
+# For element types without a specialized batched kernel (BigFloat,
+# FixedPointNumbers, user-defined types, ...), we create a per-work-item
+# AbstractRNG and delegate to Random stdlib's `rand(rng, T)` / `randn(rng, T)`.
 #
-# State (subcounter) lives in a stack-allocated Ref; the struct holds a pointer
-# to it. This avoids mutable struct heap allocation on GPU.
+# The struct is immutable (mutable structs get heap-allocated on GPU); the
+# per-work-item subcounter state lives in a stack-allocated Ref that the
+# struct holds a pointer to.
 
 struct ElementRNG <: AbstractRNG
     seed::UInt64
@@ -228,6 +228,9 @@ end
 @inline Random.rand(rng::ElementRNG, ::Random.SamplerType{Complex{T}}) where {T<:Real} =
     complex(rand(rng, T), rand(rng, T))
 
+
+## Generic rand! fallback via ElementRNG
+
 @kernel function rand_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
     gid = @index(Global, Linear)
     if gid <= length(A)

From 776b03030e7038932d190bc1024a7459c202cd59 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 07:40:51 +0200
Subject: [PATCH 12/19] Drop now-redundant ElementRNG rand overrides

These @inline overrides of the generic `rand(::AbstractRNG, ::Type{T})`
and `SamplerType{Complex{T}}` methods were needed when ElementRNG was a
mutable struct, to keep the dispatch chain transparent enough for the
Ref-to-ptr trick to stack-allocate. With ElementRNG immutable, empirical
checks (LLVM inspection, BFloat16 rand and Complex{Float16} randn
benchmarks) show identical generated code and identical performance
whether the overrides are present or not. Base.Random defines both
methods with identical bodies, so nothing is lost by removing them.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 1dc4b72e..265f03b5 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -221,13 +221,6 @@ end
 @inline Random.rand(rng::ElementRNG, ::Random.SamplerType{T}) where T <: Union{Bool,Base.BitInteger} =
     rand(rng, Random.SamplerType{UInt64}()) % T
 
-# @inline overrides for Random stdlib entry points that aren't inlined by default.
-# Without these, the dispatch chain stays opaque and the Ref won't be optimized.
-@inline Random.rand(rng::ElementRNG, ::Type{T}) where {T} =
-    rand(rng, Random.Sampler(typeof(rng), T, Val(1)))
-@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{Complex{T}}) where {T<:Real} =
-    complex(rand(rng, T), rand(rng, T))
-
 
 ## Generic rand! fallback via ElementRNG
 

From f5e47b8b4e66ea31cfc24ac24ef1108196d21617 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 08:15:08 +0200
Subject: [PATCH 13/19] Use polynomial sincospi for Float16/Float32 randn!
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Base.sincospi(::Float32) widens to Float64 internally (see
base/special/trig.jl:744 `sinpi_kernel_wide`), which breaks Metal
(no Float64 support) and forces FP64 emulation on other backends that
hit that path.

Vendor PhiloxRNG.jl's minimax polynomial: entirely Float32, consumes
the Philox UInt32 output directly. Rewrite the ≤32-bit boxmuller
methods to take UInt32 arguments and update philox_to_normals to
match. Float64 keeps using Base.sincospi (backends with FP64 have
intrinsics, and the polynomial alternative is ~8x slower on
consumer GPUs — see the follow-up log commit for measurements).

Note: Base.log(::Float32) has the same FP64-widening problem and is
fixed in the next commit.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 74 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 265f03b5..074e43b8 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -57,22 +57,65 @@ end
 end
 
 
-## Box-Muller transform using @fastmath log + sincospi
+## Fast sincospi for Box-Muller
+#
+# Base.sincospi(::Float32) widens to Float64 internally (via `sinpi_kernel_wide`,
+# see base/special/trig.jl:744), which breaks Metal — it has no Float64 support
+# at all — and wastes cycles on other backends that must emulate FP64.
+#
+# Vendored from PhiloxRNG.jl (MIT): a minimax polynomial that stays entirely
+# in Float32 and consumes the Philox UInt32 output directly. Bottom 3 bits
+# select one of 8 octants (swap/negate), upper 29 bits give the reduced
+# argument (+0.5-biased so y ≠ 0).
+#
+# Float64 keeps using Base.sincospi: backends that support FP64 all have
+# intrinsics, and the polynomial alternative is ~8× slower on consumer GPUs
+# with low FP64 throughput.
+
+const _SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0)
+const _CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0)
+
+@inline function _fast_sincospi(::Type{Float32}, u::UInt32)
+    oct = (u % Int32) & Int32(7)
+    y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32))
+    sp = y * evalpoly(y * y, _SP_F32)
+    cp = evalpoly(y * y, _CP_F32)
+    swap    = !iszero(oct & Int32(1))
+    sin_neg = !iszero(oct & Int32(2))
+    cos_neg = !iszero(oct & Int32(4))
+    s_raw = ifelse(swap, cp, sp)
+    c_raw = ifelse(swap, sp, cp)
+    (ifelse(sin_neg, -s_raw, s_raw), ifelse(cos_neg, -c_raw, c_raw))
+end
+
 
-# Each backend provides optimized intrinsics via @device_override:
-# CUDA: __nv_fast_logf, __nv_sincospif, etc.
+## Box-Muller transform
 
 using Base.FastMath
 
-@inline function boxmuller(::Type{T}, u1::T, u2::T) where T <: AbstractFloat
-    r = sqrt(T(-2) * FastMath.log_fast(u1))
+# ≤32-bit float output: polynomial sincospi on the raw UInt32, log through
+# FastMath (Base.log(::Float32) also widens to Float64 — fixed in a follow-up).
+@inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
+    r = sqrt(-2f0 * FastMath.log_fast(u01(Float32, u2)))
+    s, c = _fast_sincospi(Float32, u1)
+    (F(r * s), F(r * c))
+end
+
+# Float64: Base.sincospi has FP64 intrinsics on the backends that support it.
+@inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64)
+    r = sqrt(-2.0 * FastMath.log_fast(u1))
     s, c = sincospi(2 * u2)
     (r * s, r * c)
 end
 
 # For complex normals each component has variance 1/2, so the radius is
-# sqrt(-log(U)) rather than sqrt(-2*log(U)).
-@inline function boxmuller(::Type{Complex{T}}, u1::T, u2::T) where T <: AbstractFloat
+# sqrt(-log(U)) rather than sqrt(-2·log(U)).
+@inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
+    r = sqrt(-FastMath.log_fast(u01(Float32, u2)))
+    s, c = _fast_sincospi(Float32, u1)
+    complex(F(r * s), F(r * c))
+end
+@inline function boxmuller(::Type{Complex{Float64}}, u1::Float64, u2::Float64)
     r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1)))
     s, c = sincospi(2 * u2)
     complex(r * s, r * c)
@@ -261,10 +304,13 @@ end
 #    output.
 
 # Convert Philox UInt32 outputs to N normally-distributed values of type T.
+# ≤32-bit float targets pass UInt32s to boxmuller directly (the polynomial
+# sincospi extracts bits itself; log still goes through u01 for now). 64-bit
+# targets assemble UInt64s and convert to Float64 on the way in.
 for T in (Float16, Float32)
     @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4)
-        n1, n2 = boxmuller($T, $T(u01(Float32, a1)), $T(u01(Float32, a2)))
-        n3, n4 = boxmuller($T, $T(u01(Float32, a3)), $T(u01(Float32, a4)))
+        n1, n2 = boxmuller($T, a1, a2)
+        n3, n4 = boxmuller($T, a3, a4)
         (n1, n2, n3, n4)
     end
 end
@@ -273,15 +319,13 @@ end
         u01(Float64, UInt64(a1) | UInt64(a2) << 32),
         u01(Float64, UInt64(a3) | UInt64(a4) << 32))
 end
-@inline function philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4)
-    (boxmuller(Complex{Float32}, u01(Float32, a1), u01(Float32, a2)),
-     boxmuller(Complex{Float32}, u01(Float32, a3), u01(Float32, a4)))
-end
-@inline function philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4)
+@inline philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) =
+    (boxmuller(Complex{Float32}, a1, a2),
+     boxmuller(Complex{Float32}, a3, a4))
+@inline philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) =
     (boxmuller(Complex{Float64},
         u01(Float64, UInt64(a1) | UInt64(a2) << 32),
         u01(Float64, UInt64(a3) | UInt64(a4) << 32)),)
-end
 
 @kernel function randn_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
     gid = @index(Global, Linear)

From e436f081346994080dbb95d068b5d8ed799ee665 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 08:15:57 +0200
Subject: [PATCH 14/19] Use polynomial log for Float16/Float32 randn!
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same Metal problem as sincospi in the previous commit:
Base.log(::Float32) widens to Float64 internally (see
base/special/log.jl:242, `Float64(2f0*f)/(2.0+f)`), which breaks Metal
and forces FP64 emulation on other backends that end up on that path.

Vendor PhiloxRNG.jl's Float32 polynomial log (itself ported from
fdlibm's e_logf.c). Fold the u01 conversion into the first fma so the
≤32-bit boxmuller never touches Float64.

Float64 still uses Base.log_fast — backends with FP64 support have
intrinsics for it, and the polynomial alternative is ~8x slower on
consumer GPUs (benchmarked: randn! F64 drops from ~97 GB/s with
intrinsics to ~12 GB/s with polynomials on an RTX 5080).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 074e43b8..54eebb9b 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -89,19 +89,54 @@ const _CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0)
 end
 
 
+## Fast log for Box-Muller
+#
+# Base.log(::Float32) widens to Float64 internally (see base/special/log.jl:242
+# `Float64(2f0*f)/(2.0+f)`), same Metal / FP64-emulation problem as sincospi.
+#
+# Vendored from PhiloxRNG.jl (MIT), ported from fdlibm's e_logf.c: a Float32
+# minimax polynomial. Takes the raw Philox UInt32 output; the u01 conversion
+# is folded into the first fma so there's no intermediate float.
+#
+# Same Float64-path reasoning as the sincospi block above.
+
+const _SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))
+const _LOG_ODD_F32   = (reinterpret(Float32, Int32(0x3f2aaaaa)),
+                        reinterpret(Float32, Int32(0x3e91e9ee)))
+const _LOG_EVEN_F32  = (reinterpret(Float32, Int32(0x3eccce13)),
+                        reinterpret(Float32, Int32(0x3e789e26)))
+
+@inline function _fast_log(::Type{Float32}, u::UInt32)
+    x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
+    ix = reinterpret(Int32, x) - _SQRT_HALF_I32
+    k = ix >> Int32(23)
+    f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + _SQRT_HALF_I32) - 1.0f0
+    f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33))
+    f = ifelse(k == Int32(0), f_comp, f_std)
+    s = f / (2.0f0 + f)
+    z = s * s; w = z * z
+    R = z * evalpoly(w, _LOG_ODD_F32) + w * evalpoly(w, _LOG_EVEN_F32)
+    hfsq = 0.5f0 * f * f
+    Float32(k) * reinterpret(Float32, Int32(0x3f317180)) -
+        ((hfsq - (s * (hfsq + R) +
+          Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f)
+end
+
+
 ## Box-Muller transform
 
 using Base.FastMath
 
-# ≤32-bit float output: polynomial sincospi on the raw UInt32, log through
-# FastMath (Base.log(::Float32) also widens to Float64 — fixed in a follow-up).
+# ≤32-bit float output: both log and sincospi go through the Float32
+# polynomials above.
 @inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
-    r = sqrt(-2f0 * FastMath.log_fast(u01(Float32, u2)))
+    r = sqrt(-2f0 * _fast_log(Float32, u2))
     s, c = _fast_sincospi(Float32, u1)
     (F(r * s), F(r * c))
 end
 
-# Float64: Base.sincospi has FP64 intrinsics on the backends that support it.
+# Float64: Base.log_fast / Base.sincospi have FP64 intrinsics on the backends
+# that support it.
 @inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64)
     r = sqrt(-2.0 * FastMath.log_fast(u1))
     s, c = sincospi(2 * u2)
@@ -111,7 +146,7 @@ end
 # For complex normals each component has variance 1/2, so the radius is
 # sqrt(-log(U)) rather than sqrt(-2·log(U)).
 @inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
-    r = sqrt(-FastMath.log_fast(u01(Float32, u2)))
+    r = sqrt(-_fast_log(Float32, u2))
     s, c = _fast_sincospi(Float32, u1)
     complex(F(r * s), F(r * c))
 end

From 80bd752e9ffef54fd4abeeef8b240d4598076671 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 08:52:50 +0200
Subject: [PATCH 15/19] Bypass Base's ziggurat randn in ElementRNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Base's randn(rng, ::BitFloatType) is the ziggurat path that reads
global wi/ki/fi tables, which aren't device-accessible — and the Float64
tables can't even be loaded on Metal. Direct randn!(rng, A) for
Float16/32/64 dispatches to the batched kernel so this seemed dormant,
but Base's randn(rng, Complex{T}) recurses into randn(rng, T): a
randn!(rng, A::GPUArray{Complex{Float16}}) call therefore hit ziggurat
via the generic path.

Override randn(rng::ElementRNG, ::Type{Float{16,32,64}}) to use our
Box-Muller directly. Covers any current or future caller that recurses
through these types.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/host/random.jl | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index 54eebb9b..af171d8e 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -404,12 +404,25 @@ end
 # from there are:
 #
 # - `randn(rng, ::BitFloatType)` (Float16/32/64) — ziggurat, uses global `wi`
-#   /`ki`/`fi` tables that aren't device-accessible. Never reached here because
-#   those types dispatch to the batched kernel above.
+#   /`ki`/`fi` tables that aren't device-accessible. Overridden below to use
+#   our Box-Muller directly. The direct dispatch for these types goes through
+#   the batched kernel above, but `randn(rng, Complex{Float16})` recurses into
+#   `randn(rng, Float16)` which hits this path.
 # - `randn(rng, ::Type{Complex{T}})` — recurses into `randn(rng, T)`.
 # - `randn(rng, ::Type{T}) where T<:AbstractFloat` — Marsaglia polar Box-Muller
 #   rejection loop. GPU-safe (only calls `rand(rng, T)`) but warp-divergent.
 
+# Bypass Base's ziggurat-based randn(rng, Float{16,32,64}) — its `wi`/`ki`/`fi`
+# tables aren't device-accessible, and on Metal the Float64 tables can't even
+# be loaded. Reached via Base's Complex recursion when the element type is
+# e.g. Complex{Float16}.
+@inline Random.randn(rng::ElementRNG, ::Type{Float16}) =
+    first(boxmuller(Float16, rand(rng, UInt32), rand(rng, UInt32)))
+@inline Random.randn(rng::ElementRNG, ::Type{Float32}) =
+    first(boxmuller(Float32, rand(rng, UInt32), rand(rng, UInt32)))
+@inline Random.randn(rng::ElementRNG, ::Type{Float64}) =
+    first(boxmuller(Float64, u01(Float64, rand(rng, UInt64)), u01(Float64, rand(rng, UInt64))))
+
 @kernel function randn_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
     gid = @index(Global, Linear)
     if gid <= length(A)

From 01df466c20afa3244becddc88b6bebdb1984f142 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 08:59:02 +0200
Subject: [PATCH 16/19] Drop underscore prefix on fast log/sincospi helpers

These are internal names but not meant as private underscore-prefixed
Julia convention; the leading _ was just carried over from PhiloxRNG.
Rename to fast_log / fast_sincospi / SP_F32 / CP_F32 / SQRT_HALF_I32 /
LOG_ODD_F32 / LOG_EVEN_F32.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

[ci skip]
---
 src/host/random.jl | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index af171d8e..a4ff7814 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -72,14 +72,14 @@ end
 # intrinsics, and the polynomial alternative is ~8× slower on consumer GPUs
 # with low FP64 throughput.
 
-const _SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0)
-const _CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0)
+const SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0)
+const CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0)
 
-@inline function _fast_sincospi(::Type{Float32}, u::UInt32)
+@inline function fast_sincospi(::Type{Float32}, u::UInt32)
     oct = (u % Int32) & Int32(7)
     y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32))
-    sp = y * evalpoly(y * y, _SP_F32)
-    cp = evalpoly(y * y, _CP_F32)
+    sp = y * evalpoly(y * y, SP_F32)
+    cp = evalpoly(y * y, CP_F32)
     swap    = !iszero(oct & Int32(1))
     sin_neg = !iszero(oct & Int32(2))
     cos_neg = !iszero(oct & Int32(4))
@@ -100,22 +100,22 @@ end
 #
 # Same Float64-path reasoning as the sincospi block above.
 
-const _SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))
-const _LOG_ODD_F32   = (reinterpret(Float32, Int32(0x3f2aaaaa)),
+const SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))
+const LOG_ODD_F32   = (reinterpret(Float32, Int32(0x3f2aaaaa)),
                         reinterpret(Float32, Int32(0x3e91e9ee)))
-const _LOG_EVEN_F32  = (reinterpret(Float32, Int32(0x3eccce13)),
+const LOG_EVEN_F32  = (reinterpret(Float32, Int32(0x3eccce13)),
                         reinterpret(Float32, Int32(0x3e789e26)))
 
-@inline function _fast_log(::Type{Float32}, u::UInt32)
+@inline function fast_log(::Type{Float32}, u::UInt32)
     x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
-    ix = reinterpret(Int32, x) - _SQRT_HALF_I32
+    ix = reinterpret(Int32, x) - SQRT_HALF_I32
     k = ix >> Int32(23)
-    f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + _SQRT_HALF_I32) - 1.0f0
+    f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + SQRT_HALF_I32) - 1.0f0
     f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33))
     f = ifelse(k == Int32(0), f_comp, f_std)
     s = f / (2.0f0 + f)
     z = s * s; w = z * z
-    R = z * evalpoly(w, _LOG_ODD_F32) + w * evalpoly(w, _LOG_EVEN_F32)
+    R = z * evalpoly(w, LOG_ODD_F32) + w * evalpoly(w, LOG_EVEN_F32)
     hfsq = 0.5f0 * f * f
     Float32(k) * reinterpret(Float32, Int32(0x3f317180)) -
         ((hfsq - (s * (hfsq + R) +
@@ -130,8 +130,8 @@ using Base.FastMath
 # ≤32-bit float output: both log and sincospi go through the Float32
 # polynomials above.
 @inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
-    r = sqrt(-2f0 * _fast_log(Float32, u2))
-    s, c = _fast_sincospi(Float32, u1)
+    r = sqrt(-2f0 * fast_log(Float32, u2))
+    s, c = fast_sincospi(Float32, u1)
     (F(r * s), F(r * c))
 end
 
@@ -146,8 +146,8 @@ end
 # For complex normals each component has variance 1/2, so the radius is
 # sqrt(-log(U)) rather than sqrt(-2·log(U)).
 @inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
-    r = sqrt(-_fast_log(Float32, u2))
-    s, c = _fast_sincospi(Float32, u1)
+    r = sqrt(-fast_log(Float32, u2))
+    s, c = fast_sincospi(Float32, u1)
     complex(F(r * s), F(r * c))
 end
 @inline function boxmuller(::Type{Complex{Float64}}, u1::Float64, u2::Float64)

From dba131b9a734c165df17ef631f5aa5d55a3a4ee3 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 10:40:45 +0200
Subject: [PATCH 17/19] Parameterize RNG on the GPU array type.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`GPUArrays.RNG{AT}` now carries the destination array type, so out-of-place
`rand(rng, T, dims)` / `randn(rng, T, dims)` allocate an `AT{T}` instead of
falling back to Random stdlib's CPU `Array` constructor.

The fill paths (`Random.rand!` / `Random.randn!` on `AnyGPUArray`) ignore
`AT` and dispatch on the destination's KernelAbstractions backend, so a
single RNG can fill any GPU array. A new CPU-array fallback uses `AT` to
allocate a temporary, fill it on-device, then `copyto!` the destination —
prevents Random's scalar-iter fallback when filling a `Vector` from a GPU
RNG.

Removes the `default_rng(::Type{<:AnyGPUArray})` interface: it was only
called by the testsuite (now uses `GPUArrays.RNG{AT}()` directly) and by
the per-backend shims (now redundant — the type parameter replaces them).
JLArrays' GLOBAL_RNG ref + override go away.
---
 lib/JLArrays/src/JLArrays.jl |  8 ------
 src/host/random.jl           | 55 ++++++++++++++++++++++++++++++++----
 test/testsuite/random.jl     |  2 +-
 3 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 5903b444..3557b63c 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -539,14 +539,6 @@ end
 
 using Random
 
-const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing)
-function GPUArrays.default_rng(::Type{<:JLArray})
-    if GLOBAL_RNG[] === nothing
-        GLOBAL_RNG[] = GPUArrays.RNG()
-    end
-    GLOBAL_RNG[]
-end
-
 
 ## GPUArrays interfaces
 
diff --git a/src/host/random.jl b/src/host/random.jl
index a4ff7814..bae6c688 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -158,17 +158,19 @@ end
 
 
 ## RNG type
+#
+# Parameterized on the GPU array type `AT` (e.g. `CuArray`, `ROCArray`, `JLArray`).
+# `AT` is only consulted by the *out-of-place* `rand`/`randn` constructors and by
+# the CPU-array fill fallback — the in-place `rand!`/`randn!` GPU paths dispatch
+# on the destination's KernelAbstractions backend and ignore `AT`.
 
-mutable struct RNG <: AbstractRNG
+mutable struct RNG{AT} <: AbstractRNG
     seed::UInt64
     counter::UInt32
 end
 
-RNG() = RNG(rand(Random.RandomDevice(), UInt64), UInt32(0))
-RNG(seed::Integer) = RNG(seed % UInt64, UInt32(0))
-
-# return an instance of GPUArrays.RNG suitable for the requested array type
-default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE
+RNG{AT}() where {AT} = RNG{AT}(rand(Random.RandomDevice(), UInt64), UInt32(0))
+RNG{AT}(seed::Integer) where {AT} = RNG{AT}(seed % UInt64, UInt32(0))
 
 Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt64); rng.counter = 0; rng)
 Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt64; rng.counter = 0; rng)
@@ -442,3 +444,44 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Union{AbstractFlo
 end
 
 
+## Non-GPU array fallback: generate on AT, copyto! the destination.
+#
+# Without this, `rand!(rng::RNG{CuArray}, ::Vector)` would hit Random's stdlib
+# scalar path and silently iterate the GPU rng one element at a time.
+
+function Random.rand!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T}
+    isempty(A) && return A
+    B = AT{T}(undef, size(A))
+    Random.rand!(rng, B)
+    copyto!(A, B)
+end
+function Random.randn!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T}
+    isempty(A) && return A
+    B = AT{T}(undef, size(A))
+    Random.randn!(rng, B)
+    copyto!(A, B)
+end
+
+
+## Out-of-place rand / randn — construct an AT array and fill it.
+
+Random.rand(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T} =
+    Random.rand!(rng, AT{T}(undef, dims))
+Random.randn(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T<:Union{AbstractFloat,
+                                                                       Complex{<:AbstractFloat}}} =
+    Random.randn!(rng, AT{T}(undef, dims))
+
+# untyped: default to Float32 (matches CUDA convention; better fit than Float64
+# on consumer GPUs)
+Random.rand(rng::RNG{AT}, dims::Dims) where {AT} = Random.rand(rng, Float32, dims)
+Random.randn(rng::RNG{AT}, dims::Dims) where {AT} = Random.randn(rng, Float32, dims)
+
+# variadic dim spellings
+Random.rand(rng::RNG, dim1::Integer, dims::Integer...) =
+    Random.rand(rng, Dims((dim1, dims...)))
+Random.randn(rng::RNG, dim1::Integer, dims::Integer...) =
+    Random.randn(rng, Dims((dim1, dims...)))
+Random.rand(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T =
+    Random.rand(rng, T, Dims((dim1, dims...)))
+Random.randn(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T =
+    Random.randn(rng, T, Dims((dim1, dims...)))
diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl
index 17c3875e..7ed2fd56 100644
--- a/test/testsuite/random.jl
+++ b/test/testsuite/random.jl
@@ -1,6 +1,6 @@
 @testsuite "random" (AT, eltypes)->begin
     rng = if AT <: AbstractGPUArray
-        GPUArrays.default_rng(AT)
+        GPUArrays.RNG{AT}()
     else
         Random.default_rng()
     end

From 529c2cbc89ee20a0ba48908eafb38e2c28b1856b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 10:50:04 +0200
Subject: [PATCH 18/19] Use similar(AT{T}, dims) instead of AT{T}(undef, dims).

similar is the documented Base interface for type-driven array construction
(base/abstractarray.jl:873), and its default implementation calls T(undef, dims).
Lets array types that override similar(::Type{T}, ::Dims) hook in cleanly.
---
 src/host/random.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/host/random.jl b/src/host/random.jl
index bae6c688..54752b6a 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -451,13 +451,13 @@ end
 
 function Random.rand!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T}
     isempty(A) && return A
-    B = AT{T}(undef, size(A))
+    B = similar(AT{T}, size(A))
     Random.rand!(rng, B)
     copyto!(A, B)
 end
 function Random.randn!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T}
     isempty(A) && return A
-    B = AT{T}(undef, size(A))
+    B = similar(AT{T}, size(A))
     Random.randn!(rng, B)
     copyto!(A, B)
 end
@@ -466,10 +466,10 @@ end
 ## Out-of-place rand / randn — construct an AT array and fill it.
 
 Random.rand(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T} =
-    Random.rand!(rng, AT{T}(undef, dims))
+    Random.rand!(rng, similar(AT{T}, dims))
 Random.randn(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T<:Union{AbstractFloat,
                                                                        Complex{<:AbstractFloat}}} =
-    Random.randn!(rng, AT{T}(undef, dims))
+    Random.randn!(rng, similar(AT{T}, dims))
 
 # untyped: default to Float32 (matches CUDA convention; better fit than Float64
 # on consumer GPUs)

From 8c1f396833302a55059f743313ca6fd3ac939a7e Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 15 Apr 2026 10:55:47 +0200
Subject: [PATCH 19/19] Keep GPUArrays.default_rng as an empty stub for
 downstream compat.

Metal.jl and other backends still extend GPUArrays.default_rng even
though the interface is no longer called. Without a function declaration
in GPUArrays, those extension definitions fail to parse with
UndefVarError. Empty function stub is enough for the transition.

Also fix the RNG(state::AbstractGPUArray) deprecation: it used to call
RNG() (zero-arg) which no longer exists. Now extracts AT from the state
array's type and calls RNG{AT}().
---
 src/deprecated.jl | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/deprecated.jl b/src/deprecated.jl
index 62ef634f..f8927e23 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -2,11 +2,17 @@
 # The new stateless Philox4x32 RNG doesn't need either.
 
 function RNG(state::AbstractGPUArray)
-    Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG() instead", :RNG)
-    RNG()
+    AT = Base.typename(typeof(state)).wrapper
+    Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG{$AT}() instead", :RNG)
+    RNG{AT}()
 end
 
 function Random.seed!(rng::RNG, seed::Vector{UInt32})
     Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!)
     Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt64) : first(seed))
 end
+
+# Stub kept so downstream packages that still extend `GPUArrays.default_rng`
+# (Metal.jl, etc.) continue to load. The interface itself is gone — `RNG{AT}()`
+# is now constructed directly — so any extension of this is dead code.
+function default_rng end