# Patch summary (text before the first `diff --git` line is ignored by git-apply/patch).
#
# - lib/JLArrays, src/host: replace one broadcast with `map` and mark the arguments of
#   several display / serialization / utility methods `@nospecialize`.
# - test/testsuite.jl: fold the eltype-parameterised `test_result` array methods into a
#   single method that branches on `eltype` at runtime, and `@nospecialize` the
#   arguments of `compare`.
# - test/testsuite/random.jl, test/testsuite/reductions.jl: shrink the test matrices.
#
# Review notes:
# - random.jl keeps the length-3 case in both loops. Dropping `3` and `(3,3)` together
#   would leave only even element counts (2, 4, 8); an odd length is the only input that
#   reaches the tail guard of a kernel that writes more than one value per work-item.
#   NOTE(review): GPUArrays' generic `randn!` appears to emit values in pairs - confirm.
#   Because that `+` line was edited, the post-image blob id on the `index` line of
#   test/testsuite/random.jl is stale and should be regenerated.
# - The line structure below was rebuilt from a whitespace-collapsed copy. Hunk line
#   counts were checked against every `@@` header, but indentation and the position of
#   blank context lines are inferred (in src/host/abstractarray.jl hunk 2 the blank line
#   could sit before `# display` or before `# show`). Regenerate against the tree
#   before applying.
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 3557b63c..bf671ff2 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -615,7 +615,10 @@ end
 
 function (obj::Kernel{JLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
     ndrange, workgroupsize, _, _ = launch_config(obj, ndrange, workgroupsize)
-    device_args = jlconvert.(args)
+    # Use `map` rather than `jlconvert.(args)` to skip the broadcast
+    # machinery (broadcasted/materialize/ntuple) that would otherwise
+    # be specialized per unique arg-tuple type.
+    device_args = map(jlconvert, args)
     new_obj = convert_to_cpu(obj)
     new_obj(device_args...; ndrange, workgroupsize)
 end
diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl
index 0080935e..15543e31 100644
--- a/src/host/abstractarray.jl
+++ b/src/host/abstractarray.jl
@@ -121,8 +121,8 @@ unsafe_free!(x::AbstractGPUArray) = unsafe_free!(storage(x))
 
 using Serialization: AbstractSerializer, serialize_type
 
-function Serialization.serialize(s::AbstractSerializer, t::T) where T <: AbstractGPUArray
-    serialize_type(s, T)
+function Serialization.serialize(s::AbstractSerializer, @nospecialize(t::AbstractGPUArray))
+    serialize_type(s, typeof(t))
     serialize(s, Array(t))
 end
 
@@ -136,16 +136,17 @@ end
 struct ToArray end
 Adapt.adapt_storage(::ToArray, xs::AbstractGPUArray) = convert(Array, xs)
 
-# display
-Base.print_array(io::IO, X::AnyGPUArray) =
+# display: show is called on the materialised CPU copy, so no need to
+# specialize the forwarders per element type / wrapper.
+Base.print_array(io::IO, @nospecialize(X::AnyGPUArray)) =
     Base.print_array(io, adapt(ToArray(), X))
 # show
-Base._show_nonempty(io::IO, X::AnyGPUArray, prefix::String) =
+Base._show_nonempty(io::IO, @nospecialize(X::AnyGPUArray), prefix::String) =
     Base._show_nonempty(io, adapt(ToArray(), X), prefix)
-Base._show_empty(io::IO, X::AnyGPUArray) =
+Base._show_empty(io::IO, @nospecialize(X::AnyGPUArray)) =
     Base._show_empty(io, adapt(ToArray(), X))
-Base.show_vector(io::IO, v::AnyGPUArray, args...) =
+Base.show_vector(io::IO, @nospecialize(v::AnyGPUArray), args...) =
     Base.show_vector(io, adapt(ToArray(), v), args...)
 
 ## collect to CPU (discarding wrapper type)
 
@@ -324,7 +325,7 @@ end
 
 Base.copy(x::AbstractGPUArray) = error("Not implemented") # COV_EXCL_LINE
 
-Base.deepcopy_internal(x::AbstractGPUArray, ::IdDict) = copy(x)
+Base.deepcopy_internal(@nospecialize(x::AbstractGPUArray), ::IdDict) = copy(x)
 
 # filtering
 
@@ -345,7 +346,7 @@ end
 
 # this is needed because copyto! of most GPU arrays
 # doesn't currently support Tuple sources
-function Base.append!(a::AbstractGPUVector, items::Tuple)
+function Base.append!(a::AbstractGPUVector, @nospecialize(items::Tuple))
     append!(a, collect(items))
     return a
 end
diff --git a/src/host/construction.jl b/src/host/construction.jl
index 454ff3d9..2025a97b 100644
--- a/src/host/construction.jl
+++ b/src/host/construction.jl
@@ -88,7 +88,7 @@ function hasfieldcount(@nospecialize(dt))
 end
 
 # for finding specific element types, e.g., when Float64 is unsupported
-function contains_eltype(T, typ)
+function contains_eltype(@nospecialize(T), @nospecialize(typ))
     if T === typ
         return true
     elseif T isa Union
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 73db8703..e4aea4f7 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -16,26 +16,36 @@ using Test
 
 using Adapt
 
-test_result(a, b; kwargs...) = a == b
+test_result(@nospecialize(a), @nospecialize(b); kwargs...) = a == b
 test_result(a::Number, b::Number; kwargs...) = ≈(a, b; kwargs...)
 test_result(a::Missing, b::Missing; kwargs...) = true
 test_result(a::Number, b::Missing; kwargs...) = false
 test_result(a::Missing, b::Number; kwargs...) = false
-function test_result(a::AbstractArray{T}, b::AbstractArray{T}; kwargs...) where {T<:Number}
-    ≈(collect(a), collect(b); kwargs...)
-end
-function test_result(a::AbstractArray{T}, b::AbstractArray{T};
-                     kwargs...) where {T<:NTuple{N,<:Number} where {N}}
-    ET = eltype(T)
-    ≈(reinterpret(ET, collect(a)), reinterpret(ET, collect(b)); kwargs...)
+# Branch on eltype at runtime so one compiled method body handles every
+# (T, ndims) combination — the `where T` version would still instantiate
+# per element type even under @nospecialize.
+function test_result(@nospecialize(a::AbstractArray), @nospecialize(b::AbstractArray); kwargs...)
+    T = eltype(a)
+    # The original `where T<:…` methods required matching eltypes; preserve
+    # that by falling through to `a == b` when they diverge.
+    if eltype(b) === T
+        if T <: Number
+            return ≈(collect(a), collect(b); kwargs...)
+        elseif T <: NTuple{N,<:Number} where {N}
+            ET = eltype(T)
+            return ≈(reinterpret(ET, collect(a)), reinterpret(ET, collect(b)); kwargs...)
+        end
+    end
+    a == b
 end
-function test_result(as::NTuple{N,Any}, bs::NTuple{N,Any}; kwargs...) where {N}
+function test_result(@nospecialize(as::Tuple), @nospecialize(bs::Tuple); kwargs...)
+    length(as) == length(bs) || return false
     all(zip(as, bs)) do (a, b)
         test_result(a, b; kwargs...)
     end
 end
 
-function compare(f, AT::Type{<:AbstractGPUArray}, xs...; kwargs...)
+function compare(@nospecialize(f), AT::Type{<:AbstractGPUArray}, @nospecialize(xs...); kwargs...)
     # copy on the CPU, adapt on the GPU, but keep Ref's
     cpu_in = map(x -> isa(x, Base.RefValue) ? x[] : deepcopy(x), xs)
     gpu_in = map(x -> isa(x, Base.RefValue) ? x[] : adapt(AT, x), xs)
@@ -46,7 +56,7 @@ function compare(f, AT::Type{<:AbstractGPUArray}, xs...; kwargs...)
     test_result(cpu_out, gpu_out; kwargs...)
 end
 
-function compare(f, AT::Type{<:Array}, xs...; kwargs...)
+function compare(@nospecialize(f), AT::Type{<:Array}, @nospecialize(xs...); kwargs...)
     # no need to actually run this tests: we have nothing to compare against,
     # and we'll run it on a CPU array anyhow when comparing to a GPU array.
     #
diff --git a/test/testsuite/random.jl b/test/testsuite/random.jl
index fcda79b5..2cd65a3c 100644
--- a/test/testsuite/random.jl
+++ b/test/testsuite/random.jl
@@ -6,7 +6,7 @@
     end
 
     @testset "rand" begin # uniform
-        @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3, (3,3))
+        @testset "$T $d" for T in eltypes, d in (2, (2,2), (2,2,2), 3)
             A = AT{T}(undef, d)
             B = copy(A)
             rand!(rng, A)
@@ -31,7 +31,7 @@
 
     @testset "randn" begin # normally-distributed
         @testset "$T $d" for T in filter(isrealfloattype, eltypes),
-                             d in (2, (2,2), (2,2,2), 3, (3,3))
+                             d in (2, (2,2), (2,2,2), 3)
             A = AT{T}(undef, d)
             B = copy(A)
             randn!(rng, A)
diff --git a/test/testsuite/reductions.jl b/test/testsuite/reductions.jl
index 7f135df6..f663dbee 100644
--- a/test/testsuite/reductions.jl
+++ b/test/testsuite/reductions.jl
@@ -56,14 +56,11 @@ end
         end
     end
     # Test more corner cases. Tests from AcceleraterKernels.jl
-    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]]
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    @test compare(A->mapreduce(x->x+x, +, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
-                end
-            end
-        end
+    # Cover empty (size 0) and non-singleton (size 3) axes; the size-10 loop above
+    # already covers the common non-edge shape.
+    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]],
+        isize in (0, 3), jsize in (0, 3), ksize in (0, 3)
+        @test compare(A->mapreduce(x->x+x, +, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
     end
 end
 
@@ -84,28 +81,22 @@ end
         end
     end
     # Test more corner cases. Tests from AcceleraterKernels.jl
-    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]]
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    @test compare(A->reduce(+, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
-                end
-            end
-        end
+    # Cover empty (size 0) and non-singleton (size 3) axes; the size-10 loop above
+    # already covers the common non-edge shape.
+    for dims in [1,2,3,4,[1,2],[1,3],[1,4],[2,3],[2,4],[3,4],[1,2,3],[1,2,4],[1,3,4],[2,3,4],[1,2,3,4]],
+        isize in (0, 3), jsize in (0, 3), ksize in (0, 3)
+        @test compare(A->reduce(+, A; init=zero(Int32), dims), AT, rand(Int32(1):Int32(10), isize, jsize, ksize))
     end
 end
 
 @testsuite "reductions/sum prod" (AT, eltypes)->begin
     @testset "$ET" for ET in eltypes
         range = ET <: Real ? (ET(1):ET(10)) : ET
-        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
-                          (10,)=>:, (10,10)=>:, (10,10,10)=>:,
-                          (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3],
-                          (0,)=>[1]]
+
+        # whole-array reductions: exercise each unique shape only once
+        for sz in ((10,), (10,10), (10,10,10), (0,))
             @test compare(A->sum(A), AT, rand(range, sz))
-            @test compare(A->sum(A; dims=dims), AT, rand(range, sz))
             @test compare(A->prod(A), AT, rand(range, sz))
-            @test compare(A->prod(A; dims=dims), AT, rand(range, sz))
             if typeof(abs(rand(range))) in eltypes
                 # abs(::Complex{Int}) promotes to Float64
                 @test compare(A->sum(abs, A), AT, rand(range, sz))
@@ -113,6 +104,15 @@ end
             end
         end
 
+        # reductions along specific dims
+        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
+                          (10,)=>:, (10,10)=>:, (10,10,10)=>:,
+                          (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3],
+                          (0,)=>[1]]
+            @test compare(A->sum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->prod(A; dims=dims), AT, rand(range, sz))
+        end
+
         if ET in (Float32, Float64, Int64, ComplexF32, ComplexF64)
             # smaller-scale test to avoid very large values and roundoff issues
             for (sz,red) in [(2,)=>(1,), (2,2)=>(1,1), (2,2,2)=>(1,1,1), (2,2,2)=>(2,2,2),
@@ -126,30 +126,33 @@ end
 
 @testsuite "reductions/minimum maximum extrema" (AT, eltypes)->begin
     @testset "$ET" for ET in eltypes
+        ET <: Complex && continue
         range = ET <: Real ? (ET(1):ET(10)) : ET
+
+        # whole-array reductions: exercise each unique shape only once
+        for sz in ((10,), (10,10), (10,10,10))
+            @test compare(A->minimum(A), AT, rand(range, sz))
+            @test compare(A->minimum(x->x*x, A), AT, rand(range, sz))
+            @test compare(A->maximum(A), AT, rand(range, sz))
+            @test compare(A->maximum(x->x*x, A), AT, rand(range, sz))
+            @test compare(A->extrema(A), AT, rand(range, sz))
+            @test compare(A->extrema(x->x*x, A), AT, rand(range, sz))
+        end
+
+        # reductions along specific dims
         for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
                           (10,)=>:, (10,10)=>:, (10,10,10)=>:,
                           (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3]]
-            if !(ET <: Complex)
-                @test compare(A->minimum(A), AT, rand(range, sz))
-                @test compare(A->minimum(x->x*x, A), AT, rand(range, sz))
-                @test compare(A->minimum(A; dims=dims), AT, rand(range, sz))
-                @test compare(A->maximum(A), AT, rand(range, sz))
-                @test compare(A->maximum(x->x*x, A), AT, rand(range, sz))
-                @test compare(A->maximum(A; dims=dims), AT, rand(range, sz))
-                @test compare(A->extrema(A), AT, rand(range, sz))
-                @test compare(A->extrema(x->x*x, A), AT, rand(range, sz))
-                @test compare(A->extrema(A; dims=dims), AT, rand(range, sz))
-            end
+            @test compare(A->minimum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->maximum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->extrema(A; dims=dims), AT, rand(range, sz))
         end
 
         for (sz,red) in [(10,)=>(1,), (10,10)=>(1,1), (10,10,10)=>(1,1,1), (10,10,10)=>(10,10,10),
                          (10,10,10)=>(1,10,10), (10,10,10)=>(10,1,10), (10,10,10)=>(10,10,1)]
-            if !(ET <: Complex)
-                @test compare((A,R)->minimum!(R, A), AT, rand(range, sz), fill(typemax(ET), red))
-                @test compare((A,R)->maximum!(R, A), AT, rand(range, sz), fill(typemin(ET), red))
-                @test compare((A,R)->extrema!(R, A), AT, rand(range, sz), fill((typemax(ET),typemin(ET)), red))
-            end
+            @test compare((A,R)->minimum!(R, A), AT, rand(range, sz), fill(typemax(ET), red))
+            @test compare((A,R)->maximum!(R, A), AT, rand(range, sz), fill(typemin(ET), red))
+            @test compare((A,R)->extrema!(R, A), AT, rand(range, sz), fill((typemax(ET),typemin(ET)), red))
         end
     end
 end