diff --git a/Project.toml b/Project.toml
index 3b7c2e69..ccaaad35 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "GPUArrays"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "11.4.1"
+version = "11.5.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 7bba4b95..3557b63c 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -539,18 +539,6 @@ end
 
 using Random
 
-const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing)
-function GPUArrays.default_rng(::Type{<:JLArray})
-    if GLOBAL_RNG[] === nothing
-        N = MAXTHREADS
-        state = JLArray{NTuple{4, UInt32}}(undef, N)
-        rng = GPUArrays.RNG(state)
-        Random.seed!(rng)
-        GLOBAL_RNG[] = rng
-    end
-    GLOBAL_RNG[]
-end
-
 
 ## GPUArrays interfaces
 
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index a35c1ff0..17a4d897 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -38,5 +38,6 @@ include("host/statistics.jl")
 include("host/sparse.jl")
 include("host/alloc_cache.jl")
 
+include("deprecated.jl")
 
 end # module
diff --git a/src/deprecated.jl b/src/deprecated.jl
new file mode 100644
index 00000000..f8927e23
--- /dev/null
+++ b/src/deprecated.jl
@@ -0,0 +1,18 @@
+# RNG: the old RNG took a GPU state array and Vector{UInt32} seeds.
+# The new stateless Philox4x32 RNG doesn't need either.
+
+function RNG(state::AbstractGPUArray)
+    AT = Base.typename(typeof(state)).wrapper
+    Base.depwarn("RNG(state::AbstractGPUArray) is deprecated, use RNG{$AT}() instead", :RNG)
+    RNG{AT}()
+end
+
+function Random.seed!(rng::RNG, seed::Vector{UInt32})
+    Base.depwarn("seed!(rng::RNG, seed::Vector{UInt32}) is deprecated, use seed!(rng, seed::Integer) instead", :seed!)
+    Random.seed!(rng, isempty(seed) ? rand(Random.RandomDevice(), UInt64) : first(seed))
+end
+
+# Stub kept so downstream packages that still extend `GPUArrays.default_rng`
+# (Metal.jl, etc.) continue to load. The interface itself is gone — `RNG{AT}()`
+# is now constructed directly — so any extension of this is dead code.
+function default_rng end
diff --git a/src/host/random.jl b/src/host/random.jl
index 7759e841..54752b6a 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -1,119 +1,487 @@
 # integration with Random stdlib
 
-## device interface
+## Philox4x32-10 counter-based RNG
 
-# hybrid Tausworthe and Linear Congruent generator from
-# https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch37.html
+# Stateless: (counter, key) → 4 UInt32 outputs. Each unique counter gives independent
+# random values with no shared memory or global state needed.
 #
-# only generates Float32 or Float64 numbers, conversion happens elsewhere
+# Reference: Salmon et al., "Parallel Random Numbers: As Easy as 1, 2, 3" (2011)
 
-function TausStep(z::Unsigned, S1::Integer, S2::Integer, S3::Integer, M::Unsigned)
-    b = (((z << S1) ⊻ z) >> S2)
-    return (((z & M) << S3) ⊻ b)
+const PHILOX_M4x32_0 = 0xD2511F53
+const PHILOX_M4x32_1 = 0xCD9E8D57
+const PHILOX_W32_0   = 0x9E3779B9
+const PHILOX_W32_1   = 0xBB67AE85
+
+@inline function philox4x32round(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32})
+    mul0 = widemul(PHILOX_M4x32_0, ctr[1])
+    mul1 = widemul(PHILOX_M4x32_1, ctr[3])
+    hi0 = (mul0 >> 32) % UInt32
+    hi1 = (mul1 >> 32) % UInt32
+    lo0 = mul0 % UInt32
+    lo1 = mul1 % UInt32
+    (hi1 ⊻ ctr[2] ⊻ key[1], lo1, hi0 ⊻ ctr[4] ⊻ key[2], lo0)
+end
+
+@inline function philox4x32bumpkey(key::NTuple{2,UInt32})
+    (key[1] + PHILOX_W32_0, key[2] + PHILOX_W32_1)
+end
+
+@inline function philox4x32_10(ctr::NTuple{4,UInt32}, key::NTuple{2,UInt32})
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key); key = philox4x32bumpkey(key)
+    ctr = philox4x32round(ctr, key)
+    ctr
+end
+
+
+## Float conversions: unsigned integer → uniform float, strictly positive
+## (Float32 can round up to exactly 1.0; Float64 stays strictly below 1.0.)
+
+@inline function u01(::Type{Float32}, u::UInt32)
+    fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
+end
+
+@inline function u01(::Type{Float64}, u::UInt64)
+    # Bit-pattern construction avoids the expensive Float64(::UInt64) conversion
+    # and fma on consumer GPUs (where FP64 throughput is as low as 1:64). The
+    # low bit of the mantissa is forced set so the result is strictly in (0, 1),
+    # which is required by Box-Muller's log(u).
+    reinterpret(Float64, ((u >> 12) | 0x1) | 0x3ff0000000000000) - 1.0
+end
+
+
+## Fast sincospi for Box-Muller
+#
+# Base.sincospi(::Float32) widens to Float64 internally (via `sinpi_kernel_wide`,
+# see base/special/trig.jl:744), which breaks Metal — it has no Float64 support
+# at all — and wastes cycles on other backends that must emulate FP64.
+#
+# Vendored from PhiloxRNG.jl (MIT): a minimax polynomial that stays entirely
+# in Float32 and consumes the Philox UInt32 output directly. Bottom 3 bits
+# select one of 8 octants (swap/negate), upper 29 bits give the reduced
+# argument (+0.5-biased so y ≠ 0).
+#
+# Float64 keeps using Base.sincospi: backends that support FP64 all have
+# intrinsics, and the polynomial alternative is ~8× slower on consumer GPUs
+# with low FP64 throughput.
+
+const SP_F32 = (3.1415927f0, -5.167708f0, 2.5497673f0, -0.58907866f0)
+const CP_F32 = (1.0f0, -4.934788f0, 4.057578f0, -1.3061346f0)
+
+@inline function fast_sincospi(::Type{Float32}, u::UInt32)
+    oct = (u % Int32) & Int32(7)
+    y = fma(Float32(u & ~UInt32(7)), Float32(2)^(-34), Float32(2)^(-32))
+    sp = y * evalpoly(y * y, SP_F32)
+    cp = evalpoly(y * y, CP_F32)
+    swap    = !iszero(oct & Int32(1))
+    sin_neg = !iszero(oct & Int32(2))
+    cos_neg = !iszero(oct & Int32(4))
+    s_raw = ifelse(swap, cp, sp)
+    c_raw = ifelse(swap, sp, cp)
+    (ifelse(sin_neg, -s_raw, s_raw), ifelse(cos_neg, -c_raw, c_raw))
 end
 
-LCGStep(z::Unsigned, A::Unsigned, C::Unsigned) = A * z + C
 
-make_rand_num(::Type{Float64}, tmp) = 2.3283064365387e-10 * Float64(tmp)
-make_rand_num(::Type{Float32}, tmp) = 2.3283064f-10 * Float32(tmp)
-# NOTE: the rng state is often not representable as Float16, so perform this in Float32.
-make_rand_num(::Type{Float16}, tmp) = Float16(2.3283064f-10 * Float32(tmp))
+## Fast log for Box-Muller
+#
+# Base.log(::Float32) widens to Float64 internally (see base/special/log.jl:242
+# `Float64(2f0*f)/(2.0+f)`), same Metal / FP64-emulation problem as sincospi.
+#
+# Vendored from PhiloxRNG.jl (MIT), ported from fdlibm's e_logf.c: a Float32
+# minimax polynomial. Takes the raw Philox UInt32 output; the u01 conversion
+# is folded into the first fma so there's no intermediate float.
+#
+# Same Float64-path reasoning as the sincospi block above.
+
+const SQRT_HALF_I32 = reinterpret(Int32, Float32(sqrt(0.5)))
+const LOG_ODD_F32   = (reinterpret(Float32, Int32(0x3f2aaaaa)),
+                        reinterpret(Float32, Int32(0x3e91e9ee)))
+const LOG_EVEN_F32  = (reinterpret(Float32, Int32(0x3eccce13)),
+                        reinterpret(Float32, Int32(0x3e789e26)))
 
-function next_rand(state::NTuple{4, T}) where {T <: Unsigned}
-    state = (
-        TausStep(state[1], Cint(13), Cint(19), Cint(12), T(4294967294)),
-        TausStep(state[2], Cint(2), Cint(25), Cint(4), T(4294967288)),
-        TausStep(state[3], Cint(3), Cint(11), Cint(17), T(4294967280)),
-        LCGStep(state[4], T(1664525), T(1013904223))
-    )
-    tmp = (state[1] ⊻ state[2] ⊻ state[3] ⊻ state[4])
-    return state, tmp
+@inline function fast_log(::Type{Float32}, u::UInt32)
+    x = fma(Float32(u), Float32(2)^(-32), Float32(2)^(-33))
+    ix = reinterpret(Int32, x) - SQRT_HALF_I32
+    k = ix >> Int32(23)
+    f_std = reinterpret(Float32, (ix & Int32(0x007fffff)) + SQRT_HALF_I32) - 1.0f0
+    f_comp = -fma(Float32(~u), Float32(2)^(-32), Float32(2)^(-33))
+    f = ifelse(k == Int32(0), f_comp, f_std)
+    s = f / (2.0f0 + f)
+    z = s * s; w = z * z
+    R = z * evalpoly(w, LOG_ODD_F32) + w * evalpoly(w, LOG_EVEN_F32)
+    hfsq = 0.5f0 * f * f
+    Float32(k) * reinterpret(Float32, Int32(0x3f317180)) -
+        ((hfsq - (s * (hfsq + R) +
+          Float32(k) * reinterpret(Float32, Int32(0x3717f7d1)))) - f)
 end
 
-function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T
-    stateful_rand = next_rand(randstate[threadid])
-    randstate[threadid] = stateful_rand[1]
-    return make_rand_num(T, stateful_rand[2])
+
+## Box-Muller transform
+
+using Base.FastMath
+
+# ≤32-bit float output: both log and sincospi go through the Float32
+# polynomials above.
+@inline function boxmuller(::Type{F}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
+    r = sqrt(-2f0 * fast_log(Float32, u2))
+    s, c = fast_sincospi(Float32, u1)
+    (F(r * s), F(r * c))
 end
 
-function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer
-    result = zero(T)
-    if sizeof(T) >= 4
-        for _ in 1:sizeof(T) >> 2
-            randstate[threadid], y = next_rand(randstate[threadid])
-            result = reinterpret(T, (|)(promote(result << 32, y)...))
+# Float64: Base.log_fast / Base.sincospi have FP64 intrinsics on the backends
+# that support it.
+@inline function boxmuller(::Type{Float64}, u1::Float64, u2::Float64)
+    r = sqrt(-2.0 * FastMath.log_fast(u1))
+    s, c = sincospi(2 * u2)
+    (r * s, r * c)
+end
+
+# For complex normals each component has variance 1/2, so the radius is
+# sqrt(-log(U)) rather than sqrt(-2·log(U)).
+@inline function boxmuller(::Type{Complex{F}}, u1::UInt32, u2::UInt32) where F <: Union{Float16,Float32}
+    r = sqrt(-fast_log(Float32, u2))
+    s, c = fast_sincospi(Float32, u1)
+    complex(F(r * s), F(r * c))
+end
+@inline function boxmuller(::Type{Complex{Float64}}, u1::Float64, u2::Float64)
+    r = sqrt(FastMath.neg_float_fast(FastMath.log_fast(u1)))
+    s, c = sincospi(2 * u2)
+    complex(r * s, r * c)
+end
+
+
+## RNG type
+#
+# Parameterized on the GPU array type `AT` (e.g. `CuArray`, `ROCArray`, `JLArray`).
+# `AT` is only consulted by the *out-of-place* `rand`/`randn` constructors and by
+# the CPU-array fill fallback — the in-place `rand!`/`randn!` GPU paths dispatch
+# on the destination's KernelAbstractions backend and ignore `AT`.
+
+mutable struct RNG{AT} <: AbstractRNG
+    seed::UInt64
+    counter::UInt32
+end
+
+RNG{AT}() where {AT} = RNG{AT}(rand(Random.RandomDevice(), UInt64), UInt32(0))
+RNG{AT}(seed::Integer) where {AT} = RNG{AT}(seed % UInt64, UInt32(0))
+
+Random.seed!(rng::RNG) = (rng.seed = rand(Random.RandomDevice(), UInt64); rng.counter = 0; rng)
+Random.seed!(rng::RNG, seed::Integer) = (rng.seed = seed % UInt64; rng.counter = 0; rng)
+
+function advance_counter!(rng::RNG)
+    rng.counter += one(UInt32)
+    rng.counter == 0 && (rng.seed += one(UInt64))
+    rng
+end
+
+# Split the 64-bit seed into the two 32-bit lanes of the Philox key.
+@inline philox_key(seed::UInt64) = (seed % UInt32, (seed >> 32) % UInt32)
+
+
+## Specialized rand! kernels for common types
+
+# Each Philox4x32 call produces 4 UInt32 outputs. Specialized kernels batch
+# multiple values per work item to use all 4 outputs efficiently:
+# - ≤4-byte types (including Float16): 4 values per call
+# - 8-byte types (Int64/UInt64/Float64/Complex{Float32}): 2 values per call
+# - 16-byte types (Int128/UInt128/Complex{Float64}):     1 value per call
+
+# Convert Philox UInt32 outputs to N values of type T
+@inline philox_to_vals(::Type{Float16}, a1, a2, a3, a4) =
+    (Float16(u01(Float32, a1)), Float16(u01(Float32, a2)),
+     Float16(u01(Float32, a3)), Float16(u01(Float32, a4)))
+@inline philox_to_vals(::Type{Float32}, a1, a2, a3, a4) =
+    (u01(Float32, a1), u01(Float32, a2), u01(Float32, a3), u01(Float32, a4))
+for T in (UInt8, UInt16, UInt32, Int8, Int16, Int32, Bool)
+    @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) =
+        (a1 % $T, a2 % $T, a3 % $T, a4 % $T)
+end
+@inline philox_to_vals(::Type{Float64}, a1, a2, a3, a4) =
+    (u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+     u01(Float64, UInt64(a3) | UInt64(a4) << 32))
+for T in (UInt64, Int64)
+    @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) =
+        ((UInt64(a1) | UInt64(a2) << 32) % $T,
+         (UInt64(a3) | UInt64(a4) << 32) % $T)
+end
+for T in (UInt128, Int128)
+    @eval @inline philox_to_vals(::Type{$T}, a1, a2, a3, a4) =
+        ((UInt128(a1) | UInt128(a2) << 32 | UInt128(a3) << 64 | UInt128(a4) << 96) % $T,)
+end
+@inline philox_to_vals(::Type{Complex{Float32}}, a1, a2, a3, a4) =
+    (complex(u01(Float32, a1), u01(Float32, a2)),
+     complex(u01(Float32, a3), u01(Float32, a4)))
+@inline philox_to_vals(::Type{Complex{Float64}}, a1, a2, a3, a4) =
+    (complex(u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+             u01(Float64, UInt64(a3) | UInt64(a4) << 32)),)
+
+# Number of output values per Philox4x32 call
+vals_per_call(::Type{T}) where T = sizeof(T) <= 4 ? 4 : sizeof(T) <= 8 ? 2 : 1
+vals_per_call(::Type{Complex{T}}) where T = sizeof(T) <= 4 ? 2 : 1
+
+# Batched kernel: N values per work item from one Philox call
+@kernel function rand_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    N = vals_per_call(T)
+    idx = N * gid
+    len = length(A)
+    if idx <= len
+        vals = philox_to_vals(T, philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            philox_key(seed))...)
+        for j in 1:N
+            @inbounds A[idx - N + j] = vals[j]
+        end
+    elseif idx - N < len
+        vals = philox_to_vals(T, philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            philox_key(seed))...)
+        for j in 1:min(N, len - idx + N)
+            @inbounds A[idx - N + j] = vals[j]
         end
-    else
-        randstate[threadid], y = next_rand(randstate[threadid])
-        x = reinterpret(Int32, y)
-        result = convert(T, x & typemax(T))
     end
-    result
 end
 
-# support for complex numbers
+# Types with specialized batched kernels
+const BatchedRandTypes = Union{
+    Float16, Float32, Float64, Complex{Float32}, Complex{Float64},
+    Bool, Int8, Int16, Int32, Int64, Int128,
+    UInt8, UInt16, UInt32, UInt64, UInt128}
 
-function gpu_rand(::Type{Complex{T}}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T
-    re = gpu_rand(T, threadid, randstate)
-    im = gpu_rand(T, threadid, randstate)
-    return complex(re, im)
+function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandTypes
+    isempty(A) && return A
+    N = vals_per_call(T)
+    rand_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A;
+                                         ndrange=cld(length(A), N))
+    advance_counter!(rng)
+    A
 end
 
 
-## host interface
+## ElementRNG: a device-side AbstractRNG for generic rand/randn fallbacks
+
+# For element types without a specialized batched kernel (BigFloat,
+# FixedPointNumbers, user-defined types, ...), we create a per-work-item
+# AbstractRNG and delegate to Random stdlib's `rand(rng, T)` / `randn(rng, T)`.
+#
+# The struct is immutable (mutable structs get heap-allocated on GPU); the
+# per-work-item subcounter state lives in a stack-allocated Ref that the
+# struct holds a pointer to.
 
-struct RNG <: AbstractRNG
-    state::AbstractGPUArray{NTuple{4,UInt32},1}
+struct ElementRNG <: AbstractRNG
+    seed::UInt64
+    counter::UInt32
+    gid::UInt32
+    subctr_ptr::Ptr{UInt32}
 end
 
-# return an instance of GPUArrays.RNG suitable for the requested array type
-default_rng(::Type{<:AnyGPUArray}) = error("Not implemented") # COV_EXCL_LINE
+@inline Random.rng_native_52(::ElementRNG) = UInt64
 
-make_seed(rng::RNG) = make_seed(rng, rand(UInt))
-function make_seed(rng::RNG, n::Integer)
-    rand(MersenneTwister(n), UInt32, sizeof(rng.state)÷sizeof(UInt32))
+@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt64})
+    sc = unsafe_load(rng.subctr_ptr) + UInt32(1)
+    unsafe_store!(rng.subctr_ptr, sc)
+    a1, a2, _, _ = philox4x32_10(
+        (rng.gid, sc, rng.counter, UInt32(0)),
+        philox_key(rng.seed))
+    UInt64(a1) | UInt64(a2) << 32
 end
 
-Random.seed!(rng::RNG) = Random.seed!(rng, make_seed(rng))
-Random.seed!(rng::RNG, seed::Integer) = Random.seed!(rng, make_seed(rng, seed))
-function Random.seed!(rng::RNG, seed::Vector{UInt32})
-    copyto!(rng.state, collect(reinterpret(NTuple{4, UInt32}, seed)))
-    return
+@inline function Random.rand(rng::ElementRNG, ::Random.SamplerType{UInt128})
+    UInt128(rand(rng, Random.SamplerType{UInt64}())) |
+    UInt128(rand(rng, Random.SamplerType{UInt64}())) << 64
+end
+
+@inline Random.rand(rng::ElementRNG, ::Random.SamplerType{T}) where T <: Union{Bool,Base.BitInteger} =
+    rand(rng, Random.SamplerType{UInt64}()) % T
+
+
+## Generic rand! fallback via ElementRNG
+
+@kernel function rand_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    if gid <= length(A)
+        subctr = Ref{UInt32}(0)
+        rng = ElementRNG(seed, counter, gid % UInt32,
+                          Base.unsafe_convert(Ptr{UInt32}, subctr))
+        @inbounds A[gid] = rand(rng, T)
+    end
 end
 
 function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
     isempty(A) && return A
-    @kernel function rand!(a, randstate)
-        idx = @index(Global, Linear)
-        @inbounds a[idx] = gpu_rand(T, ((idx-1)%length(randstate)+1), randstate)
-    end
-    rand!(get_backend(A))(A, rng.state; ndrange = size(A))
+    rand_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A))
+    advance_counter!(rng)
     A
 end
 
-function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
-    isempty(A) && return A
-    threads = (length(A) - 1) ÷ 2 + 1
-    @kernel function randn!(a, randstates)
-        i = @index(Global, Linear)
-        threadidx = @index(Local, Linear)
-        idx = 2*(i - 1) + 1
-        U1 = gpu_rand(T, threadidx, randstates)
-        U2 = gpu_rand(T, threadidx, randstates)
-        Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2)
-        Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2)
-        @inbounds a[idx] = Z0
-        if idx + 1 <= length(a)
-            @inbounds a[idx + 1] = Z1
+
+## Specialized randn! kernel for common float types
+
+# Float16/32/64 and their complex variants go through a batched kernel mirroring
+# rand_batched_kernel!, rather than the generic ElementRNG path further down:
+#
+# 1. Random's `randn(rng, ::Float{16,32,64})` uses the Ziggurat algorithm with
+#    global table lookups (`ki`, `wi`, `fi`), which aren't device-accessible
+#    from a KernelAbstractions kernel.
+#
+# 2. The polar Box-Muller fallback reachable for other AbstractFloat types has
+#    a rejection loop (`while true ... 0 < s < 1`), causing warp divergence
+#    (~2x slower).
+#
+# 3. Direct (non-rejection) Box-Muller naturally produces value *pairs* from
+#    sincospi. Batching these (4 Float32 / 2 Float64 per Philox call) keeps us
+#    memory-bandwidth-bound; a 1-per-work-item kernel would discard half the
+#    output.
+
+# Convert Philox UInt32 outputs to N normally-distributed values of type T.
+# ≤32-bit float targets pass UInt32s to boxmuller directly (the polynomial
+# sincospi extracts bits itself; log still goes through u01 for now). 64-bit
+# targets assemble UInt64s and convert to Float64 on the way in.
+for T in (Float16, Float32)
+    @eval @inline function philox_to_normals(::Type{$T}, a1, a2, a3, a4)
+        n1, n2 = boxmuller($T, a1, a2)
+        n3, n4 = boxmuller($T, a3, a4)
+        (n1, n2, n3, n4)
+    end
+end
+@inline function philox_to_normals(::Type{Float64}, a1, a2, a3, a4)
+    boxmuller(Float64,
+        u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+        u01(Float64, UInt64(a3) | UInt64(a4) << 32))
+end
+@inline philox_to_normals(::Type{Complex{Float32}}, a1, a2, a3, a4) =
+    (boxmuller(Complex{Float32}, a1, a2),
+     boxmuller(Complex{Float32}, a3, a4))
+@inline philox_to_normals(::Type{Complex{Float64}}, a1, a2, a3, a4) =
+    (boxmuller(Complex{Float64},
+        u01(Float64, UInt64(a1) | UInt64(a2) << 32),
+        u01(Float64, UInt64(a3) | UInt64(a4) << 32)),)
+
+@kernel function randn_batched_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    N = vals_per_call(T)
+    idx = N * gid
+    len = length(A)
+    if idx <= len
+        vals = philox_to_normals(T, philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            philox_key(seed))...)
+        for j in 1:N
+            @inbounds A[idx - N + j] = vals[j]
+        end
+    elseif idx - N < len
+        vals = philox_to_normals(T, philox4x32_10(
+            (gid % UInt32, UInt32(0), counter, UInt32(0)),
+            philox_key(seed))...)
+        for j in 1:min(N, len - idx + N)
+            @inbounds A[idx - N + j] = vals[j]
         end
     end
-    kernel = randn!(get_backend(A))
-    kernel(A, rng.state; ndrange=threads)
+end
+
+const BatchedRandnTypes = Union{Float16, Float32, Float64,
+                                Complex{Float32}, Complex{Float64}}
+
+function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: BatchedRandnTypes
+    isempty(A) && return A
+    N = vals_per_call(T)
+    randn_batched_kernel!(get_backend(A))(rng.seed, rng.counter, A;
+                                          ndrange=cld(length(A), N))
+    advance_counter!(rng)
+    A
+end
+
+
+## Generic randn! fallback via ElementRNG
+#
+# For AbstractFloats outside BatchedRandnTypes (BFloat16, user-defined float
+# types, etc.) we route through Random's `randn(rng, T)`. The reachable methods
+# from there are:
+#
+# - `randn(rng, ::BitFloatType)` (Float16/32/64) — ziggurat, uses global `wi`
+#   /`ki`/`fi` tables that aren't device-accessible. Overridden below to use
+#   our Box-Muller directly. The direct dispatch for these types goes through
+#   the batched kernel above, but `randn(rng, Complex{Float16})` recurses into
+#   `randn(rng, Float16)` which hits this path.
+# - `randn(rng, ::Type{Complex{T}})` — recurses into `randn(rng, T)`.
+# - `randn(rng, ::Type{T}) where T<:AbstractFloat` — Marsaglia polar Box-Muller
+#   rejection loop. GPU-safe (only calls `rand(rng, T)`) but warp-divergent.
+
+# Bypass Base's ziggurat-based randn(rng, Float{16,32,64}) — its `wi`/`ki`/`fi`
+# tables aren't device-accessible, and on Metal the Float64 tables can't even
+# be loaded. Reached via Base's Complex recursion when the element type is
+# e.g. Complex{Float16}.
+@inline Random.randn(rng::ElementRNG, ::Type{Float16}) =
+    first(boxmuller(Float16, rand(rng, UInt32), rand(rng, UInt32)))
+@inline Random.randn(rng::ElementRNG, ::Type{Float32}) =
+    first(boxmuller(Float32, rand(rng, UInt32), rand(rng, UInt32)))
+@inline Random.randn(rng::ElementRNG, ::Type{Float64}) =
+    first(boxmuller(Float64, u01(Float64, rand(rng, UInt64)), u01(Float64, rand(rng, UInt64))))
+
+@kernel function randn_generic_kernel!(@Const(seed), @Const(counter), A::AbstractArray{T}) where T
+    gid = @index(Global, Linear)
+    if gid <= length(A)
+        subctr = Ref{UInt32}(0)
+        rng = ElementRNG(seed, counter, gid % UInt32,
+                          Base.unsafe_convert(Ptr{UInt32}, subctr))
+        @inbounds A[gid] = randn(rng, T)
+    end
+end
+
+function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Union{AbstractFloat,
+                                                                     Complex{<:AbstractFloat}}
+    isempty(A) && return A
+    randn_generic_kernel!(get_backend(A))(rng.seed, rng.counter, A; ndrange=length(A))
+    advance_counter!(rng)
     A
 end
 
-# allow use of CPU RNGs without scalar iteration
-Random.rand!(rng::AbstractRNG, A::AnyGPUArray) =
-    copyto!(A, rand(rng, eltype(A), size(A)...))
-Random.randn!(rng::AbstractRNG, A::AnyGPUArray) =
-    copyto!(A, randn(rng, eltype(A), size(A)...))
+
+## Non-GPU array fallback: generate on AT, copyto! the destination.
+#
+# Without this, `rand!(rng::RNG{CuArray}, ::Vector)` would hit Random's stdlib
+# scalar path and silently iterate the GPU rng one element at a time.
+
+function Random.rand!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T}
+    isempty(A) && return A
+    B = similar(AT{T}, size(A))
+    Random.rand!(rng, B)
+    copyto!(A, B)
+end
+function Random.randn!(rng::RNG{AT}, A::AbstractArray{T}) where {AT, T}
+    isempty(A) && return A
+    B = similar(AT{T}, size(A))
+    Random.randn!(rng, B)
+    copyto!(A, B)
+end
+
+
+## Out-of-place rand / randn — construct an AT array and fill it.
+
+Random.rand(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T} =
+    Random.rand!(rng, similar(AT{T}, dims))
+Random.randn(rng::RNG{AT}, ::Type{T}, dims::Dims) where {AT, T<:Union{AbstractFloat,
+                                                                       Complex{<:AbstractFloat}}} =
+    Random.randn!(rng, similar(AT{T}, dims))
+
+# untyped: default to Float32 (matches CUDA convention; better fit than Float64
+# on consumer GPUs)
+Random.rand(rng::RNG{AT}, dims::Dims) where {AT} = Random.rand(rng, Float32, dims)
+Random.randn(rng::RNG{AT}, dims::Dims) where {AT} = Random.randn(rng, Float32, dims)
+
+# variadic dim spellings
+Random.rand(rng::RNG, dim1::Integer, dims::Integer...) =
+    Random.rand(rng, Dims((dim1, dims...)))
+Random.randn(rng::RNG, dim1::Integer, dims::Integer...) =
+    Random.randn(rng, Dims((dim1, dims...)))
+Random.rand(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T =
+    Random.rand(rng, T, Dims((dim1, dims...)))
+Random.randn(rng::RNG, ::Type{T}, dim1::Integer, dims::Integer...) where T =
+    Random.randn(rng, T, Dims((dim1, dims...)))
diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl
index 5e00682c..7ed2fd56 100644
--- a/test/testsuite/random.jl
+++ b/test/testsuite/random.jl
@@ -1,37 +1,25 @@
 @testsuite "random" (AT, eltypes)->begin
     rng = if AT <: AbstractGPUArray
-        GPUArrays.default_rng(AT)
+        GPUArrays.RNG{AT}()
     else
         Random.default_rng()
     end
-    cpu_rng = Random.default_rng()
-
-    SEEDING_BROKEN = (rng != cpu_rng) && !contains(string(AT), "JLArray")
 
     @testset "rand" begin  # uniform
-        @testset "$d $T" for T in eltypes, d in (10, (10, 10), (1024, 1024))
+        @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3))
             A = AT{T}(undef, d)
             B = copy(A)
             rand!(rng, A)
             rand!(rng, B)
             @test Array(A) != Array(B)
-
-            A = AT(rand(T, d))
-            B = AT(rand(T, d))
-
-            Random.seed!(rng)
-            Random.seed!(rng, 1)
-            rand!(rng, A)
-            Random.seed!(rng, 1)
-            rand!(rng, B)
-
-            @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > length(rng.state))
-
-            if rng != cpu_rng
-                rand!(cpu_rng, A)
-            end
         end
 
+        # empty arrays
+        B = AT{Float32}(undef, 0)
+        rand!(rng, B)
+        @test isempty(Array(B))
+
+        # Bool coverage
         A = AT{Bool}(undef, 1024)
         fill!(A, false)
         rand!(rng, A)
@@ -39,43 +27,54 @@
         fill!(A, true)
         rand!(rng, A)
         @test false in Array(A)
-
-        # AT of length 0
-        B = AT{Float32}(undef, 0)
-        fill!(B, 1f0)
-        rand!(rng, B)
-        @test isempty(Array(B))
     end
 
     @testset "randn" begin  # normally-distributed
-        # XXX: randn calls sqrt, and Base's sqrt(::Complex) performs
-        #      checked type conversions that throw boxed numbers.
-        @testset "$d $T" for T in filter(isrealfloattype, eltypes), d in (2, (2, 2), (1024, 1024))
+        @testset "$T $d" for T in filter(isrealfloattype, eltypes),
+                              d in (2, (2,2), (2,2,2), 3, (3,3))
             A = AT{T}(undef, d)
             B = copy(A)
             randn!(rng, A)
             randn!(rng, B)
             @test Array(A) != Array(B)
+        end
 
-            A = AT(rand(T, d))
-            B = AT(rand(T, d))
-
-            Random.seed!(rng)
-            Random.seed!(rng, 1)
+        # complex randn
+        for T in filter(t -> t <: Complex && isrealfloattype(real(t)), eltypes)
+            A = AT{T}(undef, 8)
             randn!(rng, A)
-            Random.seed!(rng, 1)
-            randn!(rng, B)
-            @test Array(A) == Array(B) skip=SEEDING_BROKEN && (prod(d) > (2 * length(rng.state)))
-
-            if rng != cpu_rng
-                randn!(cpu_rng, A)
-            end
+            @test !any(isnan, Array(A))
         end
 
-        # AT of length 0
+        # empty arrays
         A = AT{Float32}(undef, 0)
-        fill!(A, 1f0)
         randn!(rng, A)
         @test isempty(Array(A))
+
+        # Box-Muller should not produce infinities
+        if Float32 in eltypes
+            @test isfinite(maximum(randn!(rng, AT{Float32}(undef, 2^20))))
+        end
+    end
+
+    @testset "seeding" begin
+        if AT <: AbstractGPUArray
+            # seeding should produce reproducible results
+            for T in (Float32, Float64)
+                if T in eltypes
+                    Random.seed!(rng, 1)
+                    A = rand!(rng, AT{T}(undef, 100))
+                    Random.seed!(rng, 1)
+                    B = rand!(rng, AT{T}(undef, 100))
+                    @test Array(A) == Array(B)
+
+                    Random.seed!(rng, 1)
+                    A = randn!(rng, AT{T}(undef, 100))
+                    Random.seed!(rng, 1)
+                    B = randn!(rng, AT{T}(undef, 100))
+                    @test Array(A) == Array(B)
+                end
+            end
+        end
     end
 end