JuliaGPU · maleadt · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
@@ -615,7 +615,10 @@ end
 
 function (obj::Kernel{JLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
     ndrange, workgroupsize, _, _ = launch_config(obj, ndrange, workgroupsize)
-    device_args = jlconvert.(args)
+    # `map` over the args tuple yields a tuple equivalent to `jlconvert.(args)`
+    # but skips the broadcast machinery (broadcasted/materialize/ntuple)
+    # that would otherwise be specialized per unique arg-tuple type.
+    device_args = map(jlconvert, args)
     new_obj = convert_to_cpu(obj)
     new_obj(device_args...; ndrange, workgroupsize)
 end

diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl
@@ -121,8 +121,8 @@ unsafe_free!(x::AbstractGPUArray) = unsafe_free!(storage(x))
 
 using Serialization: AbstractSerializer, serialize_type
 
-function Serialization.serialize(s::AbstractSerializer, t::T) where T <: AbstractGPUArray
-    serialize_type(s, T)
+function Serialization.serialize(s::AbstractSerializer, @nospecialize(t::AbstractGPUArray))
+    serialize_type(s, typeof(t))  # tag with the array's runtime type; payload below is a host `Array`
     serialize(s, Array(t))
 end
 
@@ -136,16 +136,17 @@ end
 struct ToArray end
 Adapt.adapt_storage(::ToArray, xs::AbstractGPUArray) = convert(Array, xs)
 
-# display
-Base.print_array(io::IO, X::AnyGPUArray) =
+# display: each forwarder adapts its array via `ToArray` and re-dispatches to
+# Base on the result, so there is no need to specialize it per element type / wrapper.
+Base.print_array(io::IO, @nospecialize(X::AnyGPUArray)) =
     Base.print_array(io, adapt(ToArray(), X))
 
 # show
-Base._show_nonempty(io::IO, X::AnyGPUArray, prefix::String) =
+Base._show_nonempty(io::IO, @nospecialize(X::AnyGPUArray), prefix::String) =
     Base._show_nonempty(io, adapt(ToArray(), X), prefix)
-Base._show_empty(io::IO, X::AnyGPUArray) =
+Base._show_empty(io::IO, @nospecialize(X::AnyGPUArray)) =
     Base._show_empty(io, adapt(ToArray(), X))
-Base.show_vector(io::IO, v::AnyGPUArray, args...) =
+Base.show_vector(io::IO, @nospecialize(v::AnyGPUArray), args...) =
     Base.show_vector(io, adapt(ToArray(), v), args...)
 
 ## collect to CPU (discarding wrapper type)
@@ -324,7 +325,7 @@ end
 
 Base.copy(x::AbstractGPUArray) = error("Not implemented") # COV_EXCL_LINE
 
-Base.deepcopy_internal(x::AbstractGPUArray, ::IdDict) = copy(x)
+Base.deepcopy_internal(@nospecialize(x::AbstractGPUArray), seen::IdDict) = get!(() -> copy(x), seen, x)  # reuse an earlier copy so aliased arrays stay aliased
 
 
 # filtering
@@ -345,7 +346,7 @@ end
 
 # this is needed because copyto! of most GPU arrays
 # doesn't currently support Tuple sources
-function Base.append!(a::AbstractGPUVector, items::Tuple)
+function Base.append!(a::AbstractGPUVector, @nospecialize(items::Tuple))  # tuple is only `collect`ed below
     append!(a, collect(items))
     return a
 end
diff --git a/src/host/construction.jl b/src/host/construction.jl
@@ -88,7 +88,7 @@ function hasfieldcount(@nospecialize(dt))
 end
 
 # for finding specific element types, e.g., when Float64 is unsupported
-function contains_eltype(T, typ)
+function contains_eltype(@nospecialize(T), @nospecialize(typ))
     if T === typ
       return true
     elseif T isa Union

diff --git a/test/testsuite.jl b/test/testsuite.jl
@@ -16,26 +16,36 @@ using Test
 
 using Adapt
 
-test_result(a, b; kwargs...) = a == b
+test_result(@nospecialize(a), @nospecialize(b); kwargs...) = a == b  # fallback: exact equality, kwargs unused
 test_result(a::Number, b::Number; kwargs...) = ≈(a, b; kwargs...)
 test_result(a::Missing, b::Missing; kwargs...) = true
 test_result(a::Number, b::Missing; kwargs...) = false
 test_result(a::Missing, b::Number; kwargs...) = false
-function test_result(a::AbstractArray{T}, b::AbstractArray{T}; kwargs...) where {T<:Number}
-    ≈(collect(a), collect(b); kwargs...)
-end
-function test_result(a::AbstractArray{T}, b::AbstractArray{T};
-                     kwargs...) where {T<:NTuple{N,<:Number} where {N}}
-    ET = eltype(T)
-    ≈(reinterpret(ET, collect(a)), reinterpret(ET, collect(b)); kwargs...)
+# Select the comparison from the eltype at runtime rather than via `where T`:
+# one compiled method body then serves every (T, ndims) combination, whereas the
+# `where T` version would still instantiate per element type even under @nospecialize.
+function test_result(@nospecialize(a::AbstractArray), @nospecialize(b::AbstractArray); kwargs...)
+    T = eltype(a)
+    # Approximate comparison is only used when both eltypes are identical (as the
+    # former `where T<:…` methods required); anything else is compared with `==`.
+    eltype(b) === T || return a == b
+    if T <: Number
+        return ≈(collect(a), collect(b); kwargs...)
+    elseif T <: NTuple{N,<:Number} where {N}
+        # view the tuple elements as an array of their scalar type before comparing
+        ET = eltype(T)
+        return ≈(reinterpret(ET, collect(a)), reinterpret(ET, collect(b)); kwargs...)
+    end
+    return a == b
 end
-function test_result(as::NTuple{N,Any}, bs::NTuple{N,Any}; kwargs...) where {N}
+function test_result(@nospecialize(as::Tuple), @nospecialize(bs::Tuple); kwargs...)
+    length(as) == length(bs) || return false  # lengths must match: `zip` below stops at the shorter tuple
     all(zip(as, bs)) do (a, b)
         test_result(a, b; kwargs...)
     end
 end
 
-function compare(f, AT::Type{<:AbstractGPUArray}, xs...; kwargs...)
+function compare(@nospecialize(f), AT::Type{<:AbstractGPUArray}, @nospecialize(xs...); kwargs...)
     # copy on the CPU, adapt on the GPU, but keep Ref's
     cpu_in = map(x -> isa(x, Base.RefValue) ? x[] : deepcopy(x), xs)
     gpu_in = map(x -> isa(x, Base.RefValue) ? x[] : adapt(AT, x), xs)
@@ -46,7 +56,7 @@ function compare(f, AT::Type{<:AbstractGPUArray}, xs...; kwargs...)
     test_result(cpu_out, gpu_out; kwargs...)
 end
 
-function compare(f, AT::Type{<:Array}, xs...; kwargs...)
+function compare(@nospecialize(f), AT::Type{<:Array}, @nospecialize(xs...); kwargs...)
     # no need to actually run this tests: we have nothing to compare against,
     # and we'll run it on a CPU array anyhow when comparing to a GPU array.
     #

diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl
@@ -6,7 +6,7 @@
     end
 
     @testset "rand" begin  # uniform
-        @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3))
+        @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2))
             A = AT{T}(undef, d)
             B = copy(A)
             rand!(rng, A)
@@ -31,7 +31,7 @@
 
     @testset "randn" begin  # normally-distributed
         @testset "$T $d" for T in filter(isrealfloattype, eltypes),
-                              d in (2, (2,2), (2,2,2), 3, (3,3))
+                              d in (2, (2,2), (2,2,2))
             A = AT{T}(undef, d)
             B = copy(A)
             randn!(rng, A)

diff --git a/test/testsuite/reductions.jl b/test/testsuite/reductions.jl
@@ -56,14 +56,11 @@ end
         end
     end
     # Test more corner cases. Tests from AcceleraterKernels.jl
-    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]]
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    @test compare(A->mapreduce(x->x+x, +, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
-                end
-            end
-        end
+    # Cover empty (size 0), singleton (size 1) and larger (size 3) axes; the size-10
+    # loop above already covers the common non-edge shape.
+    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]],
+        isize in (0, 1, 3), jsize in (0, 1, 3), ksize in (0, 1, 3)
+        @test compare(A->mapreduce(x->x+x, +, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
     end
 end
 
@@ -84,35 +81,38 @@ end
         end
     end
     # Test more corner cases. Tests from AcceleraterKernels.jl
-    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]]
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    @test compare(A->reduce(+, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
-                end
-            end
-        end
+    # Cover empty (size 0), singleton (size 1) and larger (size 3) axes; the size-10
+    # loop above already covers the common non-edge shape.
+    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]],
+        isize in (0, 1, 3), jsize in (0, 1, 3), ksize in (0, 1, 3)
+        @test compare(A->reduce(+, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
     end
 end
 
 @testsuite "reductions/sum prod" (AT, eltypes)->begin
     @testset "$ET" for ET in eltypes
         range = ET <: Real ? (ET(1):ET(10)) : ET
-        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
-                            (10,)=>:, (10,10)=>:, (10,10,10)=>:,
-                            (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3],
-                            (0,)=>[1]]
+
+        # whole-array reductions: exercise each unique shape only once
+        for sz in ((10,), (10,10), (10,10,10), (0,))
             @test compare(A->sum(A), AT, rand(range, sz))
-            @test compare(A->sum(A; dims=dims), AT, rand(range, sz))
             @test compare(A->prod(A), AT, rand(range, sz))
-            @test compare(A->prod(A; dims=dims), AT, rand(range, sz))
             if typeof(abs(rand(range))) in eltypes
                 # abs(::Complex{Int}) promotes to Float64
                 @test compare(A->sum(abs, A), AT, rand(range, sz))
                 @test compare(A->prod(abs, A), AT, rand(range, sz))
             end
         end
 
+        # reductions along specific dims
+        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
+                            (10,)=>:, (10,10)=>:, (10,10,10)=>:,
+                            (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3],
+                            (0,)=>[1]]
+            @test compare(A->sum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->prod(A; dims=dims), AT, rand(range, sz))
+        end
+
         if ET in (Float32, Float64, Int64, ComplexF32, ComplexF64)
             # smaller-scale test to avoid very large values and roundoff issues
             for (sz,red) in [(2,)=>(1,), (2,2)=>(1,1), (2,2,2)=>(1,1,1), (2,2,2)=>(2,2,2),
@@ -126,30 +126,33 @@ end
 
 @testsuite "reductions/minimum maximum extrema" (AT, eltypes)->begin
     @testset "$ET" for ET in eltypes
+        ET <: Complex && continue  # none of the reductions below are tested for complex eltypes
         range = ET <: Real ? (ET(1):ET(10)) : ET
+
+        # whole-array reductions take no `dims`, so run them once per distinct shape
+        for sz in ((10,), (10,10), (10,10,10))
+            @test compare(A->minimum(A), AT, rand(range, sz))
+            @test compare(A->minimum(x->x*x, A), AT, rand(range, sz))
+            @test compare(A->maximum(A), AT, rand(range, sz))
+            @test compare(A->maximum(x->x*x, A), AT, rand(range, sz))
+            @test compare(A->extrema(A), AT, rand(range, sz))
+            @test compare(A->extrema(x->x*x, A), AT, rand(range, sz))
+        end
+
+        # reductions along specific dims (including `:` and an empty dims list)
         for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
                           (10,)=>:, (10,10)=>:, (10,10,10)=>:,
                           (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3]]
-            if !(ET <: Complex)
-                @test compare(A->minimum(A), AT, rand(range, sz))
-                @test compare(A->minimum(x->x*x, A), AT, rand(range, sz))
-                @test compare(A->minimum(A; dims=dims), AT, rand(range, sz))
-                @test compare(A->maximum(A), AT, rand(range, sz))
-                @test compare(A->maximum(x->x*x, A), AT, rand(range, sz))
-                @test compare(A->maximum(A; dims=dims), AT, rand(range, sz))
-                @test compare(A->extrema(A), AT, rand(range, sz))
-                @test compare(A->extrema(x->x*x, A), AT, rand(range, sz))
-                @test compare(A->extrema(A; dims=dims), AT, rand(range, sz))
-            end
+            @test compare(A->minimum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->maximum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->extrema(A; dims=dims), AT, rand(range, sz))
         end
 
         for (sz,red) in [(10,)=>(1,), (10,10)=>(1,1), (10,10,10)=>(1,1,1), (10,10,10)=>(10,10,10),
                          (10,10,10)=>(1,10,10), (10,10,10)=>(10,1,10), (10,10,10)=>(10,10,1)]
-            if !(ET <: Complex)
-                @test compare((A,R)->minimum!(R, A), AT, rand(range, sz), fill(typemax(ET), red))
-                @test compare((A,R)->maximum!(R, A), AT, rand(range, sz), fill(typemin(ET), red))
-                @test compare((A,R)->extrema!(R, A), AT, rand(range, sz), fill((typemax(ET),typemin(ET)), red))
-            end
+            @test compare((A,R)->minimum!(R, A), AT, rand(range, sz), fill(typemax(ET), red))
+            @test compare((A,R)->maximum!(R, A), AT, rand(range, sz), fill(typemin(ET), red))
+            @test compare((A,R)->extrema!(R, A), AT, rand(range, sz), fill((typemax(ET),typemin(ET)), red))
         end
     end
 end