[CUSPARSE] Fix wrong method for kron with Diagonal and add support to kron with FillArrays#2973
[CUSPARSE] Fix wrong method for kron with Diagonal and add support to kron with FillArrays#2973albertomercurio wants to merge 0 commit into
kron with Diagonal and add support to kron with FillArrays#2973Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/ext/FillArraysExt.jl b/ext/FillArraysExt.jl
index 865faebf5..9fbe77f1f 100644
--- a/ext/FillArraysExt.jl
+++ b/ext/FillArraysExt.jl
@@ -28,13 +28,13 @@ function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T
col = repeat(col, inner = nB)
data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = nB)
- row .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
- col .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
+ row .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
+ col .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
# Multiply by the fill value (already promoted type T)
data .*= fill_value
- CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+ return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end
# kron between Diagonal{T, AbstractFill} and CuSparseMatrixCOO
@@ -52,9 +52,9 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
end
- row = (0:nA-1) .* mB
+ row = (0:(nA - 1)) .* mB
row = CUDA.CuVector(repeat(row, inner = Bnnz))
- col = (0:nA-1) .* nB
+ col = (0:(nA - 1)) .* nB
col = CUDA.CuVector(repeat(col, inner = Bnnz))
data = CUDA.fill(T(fill_value), nA * Bnnz)
@@ -63,7 +63,7 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
data .*= repeat(B.nzVal, outer = nA)
- CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+ return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end
end # extension module
diff --git a/test/libraries/cusparse/linalg.jl b/test/libraries/cusparse/linalg.jl
index 6c6308003..153fbebaf 100644
--- a/test/libraries/cusparse/linalg.jl
+++ b/test/libraries/cusparse/linalg.jl
@@ -6,7 +6,7 @@ m = 10
A = sprand(T, m, m, 0.2)
B = sprand(T, m, m, 0.3)
ZA = spzeros(T, m, m)
- C = Diagonal(rand(T, div(m, 2)))
+ C = Diagonal(rand(T, div(m, 2)))
dC = adapt(CuArray, C)
@testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
@@ -38,45 +38,45 @@ m = 10
end
@testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
@test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
- @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
+ @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
@test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
end
end
-
+
# Test type promotion for kron with Diagonal
@testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64)]
A_T1 = sprand(T1, m, m, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
C_T2 = collect(dC_T2)
-
+
# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, dC_T2)
result_cpu = kron(A_T1, C_T2)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
-
+
# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(dC_T2, dA_T1)
result_cpu = kron(C_T2, A_T1)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
end
-
+
# Test kron with zero diagonal matrices
@testset "kron with zero Diagonal" begin
A = sprand(T, m, m, 0.2)
dA = CuSparseMatrixCSR(A)
dC_zeros = Diagonal(CUDA.zeros(T, div(m, 2)))
C_zeros = collect(dC_zeros)
-
+
# Test kron(sparse, zero_diagonal)
result_gpu = kron(dA, dC_zeros)
result_cpu = kron(A, C_zeros)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
-
+
# Test kron(zero_diagonal, sparse)
result_gpu = kron(dC_zeros, dA)
result_cpu = kron(C_zeros, A)
diff --git a/test/libraries/fillarrays.jl b/test/libraries/fillarrays.jl
index e4d97c0c8..9df35256f 100644
--- a/test/libraries/fillarrays.jl
+++ b/test/libraries/fillarrays.jl
@@ -6,91 +6,91 @@ using FillArrays
@testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
m, n = 100, 80
A = sprand(T, m, n, 0.2)
-
+
# Test with Ones
@testset "Ones - size $diag_size" for diag_size in [5, 10]
C_ones = Diagonal(Ones{T}(diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = collect(kron(dA, C_ones))
result_cpu = kron(A, collect(C_ones))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, sparse)
result_gpu = collect(kron(C_ones, dA))
result_cpu = kron(collect(C_ones), A)
@test result_gpu ≈ result_cpu
end
-
+
# Test with Fill (constant value)
@testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
C_fill = Diagonal(Fill(val, diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = collect(kron(dA, C_fill))
result_cpu = kron(A, collect(C_fill))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, sparse)
result_gpu = collect(kron(C_fill, dA))
result_cpu = kron(collect(C_fill), A)
@test result_gpu ≈ result_cpu
end
-
+
# Test with Zeros
@testset "Zeros - size $diag_size" for diag_size in [5, 10]
C_zeros = Diagonal(Zeros{T}(diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = kron(dA, C_zeros)
result_cpu = kron(A, collect(C_zeros))
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
-
+
# Test kron(diagonal, sparse)
result_gpu = kron(C_zeros, dA)
result_cpu = kron(collect(C_zeros), A)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
end
-
+
# Test with transpose and adjoint wrappers
@testset "Transpose/Adjoint with Fill" begin
diag_size = 5
val = T(3.0)
C_fill = Diagonal(Fill(val, diag_size))
-
+
@testset "opa = $opa" for opa in (identity, transpose, adjoint)
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(opa(sparse), diagonal)
result_gpu = collect(kron(opa(dA), C_fill))
result_cpu = kron(opa(A), collect(C_fill))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, opa(sparse))
result_gpu = collect(kron(C_fill, opa(dA)))
result_cpu = kron(collect(C_fill), opa(A))
@test result_gpu ≈ result_cpu
end
end
-
+
# Test type promotion
@testset "Type promotion" begin
@testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64), (ComplexF32, Float32)]
A_T1 = sprand(T1, m, n, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
C_fill_T2 = Diagonal(Fill(T2(2.5), 5))
-
+
# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, C_fill_T2)
result_cpu = kron(A_T1, collect(C_fill_T2))
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
-
+
# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(C_fill_T2, dA_T1)
result_cpu = kron(collect(C_fill_T2), A_T1) |
|
Replaces #2804? |
|
Oh I wasn't aware of that PR. I would say that both fix the bug. This PR also implements the |
|
Hello, any thoughts on this? |
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 5b6edc9 | Previous: 568d85c | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
100826 ns |
101147 ns |
1.00 |
array/accumulate/Float32/dims=1 |
76797 ns |
76185 ns |
1.01 |
array/accumulate/Float32/dims=1L |
1585287.5 ns |
1585279.5 ns |
1.00 |
array/accumulate/Float32/dims=2 |
143613 ns |
143463 ns |
1.00 |
array/accumulate/Float32/dims=2L |
658141.5 ns |
657524.5 ns |
1.00 |
array/accumulate/Int64/1d |
118353.5 ns |
118197 ns |
1.00 |
array/accumulate/Int64/dims=1 |
79968 ns |
79820 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1695128.5 ns |
1694383 ns |
1.00 |
array/accumulate/Int64/dims=2 |
156157.5 ns |
155940 ns |
1.00 |
array/accumulate/Int64/dims=2L |
961952 ns |
962037 ns |
1.00 |
array/broadcast |
20543 ns |
20304 ns |
1.01 |
array/construct |
1270.9 ns |
1284.1 ns |
0.99 |
array/copy |
17994 ns |
17902 ns |
1.01 |
array/copyto!/cpu_to_gpu |
215026.5 ns |
214122 ns |
1.00 |
array/copyto!/gpu_to_cpu |
282986.5 ns |
281971.5 ns |
1.00 |
array/copyto!/gpu_to_gpu |
10704 ns |
10903 ns |
0.98 |
array/iteration/findall/bool |
134623 ns |
134369 ns |
1.00 |
array/iteration/findall/int |
149954 ns |
149144.5 ns |
1.01 |
array/iteration/findfirst/bool |
81405 ns |
81346 ns |
1.00 |
array/iteration/findfirst/int |
83326 ns |
83243 ns |
1.00 |
array/iteration/findmin/1d |
86081 ns |
83202 ns |
1.03 |
array/iteration/findmin/2d |
116774 ns |
116961 ns |
1.00 |
array/iteration/logical |
200483 ns |
198675.5 ns |
1.01 |
array/iteration/scalar |
68373 ns |
67390 ns |
1.01 |
array/permutedims/2d |
52126 ns |
52028 ns |
1.00 |
array/permutedims/3d |
52546.5 ns |
52924.5 ns |
0.99 |
array/permutedims/4d |
51855 ns |
51300 ns |
1.01 |
array/random/rand/Float32 |
12812 ns |
13229 ns |
0.97 |
array/random/rand/Int64 |
25264 ns |
24844 ns |
1.02 |
array/random/rand!/Float32 |
8378.333333333334 ns |
8410.666666666666 ns |
1.00 |
array/random/rand!/Int64 |
21859 ns |
21476 ns |
1.02 |
array/random/randn/Float32 |
37740 ns |
37952.5 ns |
0.99 |
array/random/randn!/Float32 |
30728 ns |
30817 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
34799 ns |
34256 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=1 |
49190.5 ns |
39482 ns |
1.25 |
array/reductions/mapreduce/Float32/dims=1L |
51082 ns |
50992 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2 |
56325 ns |
56096 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2L |
68998 ns |
69150 ns |
1.00 |
array/reductions/mapreduce/Int64/1d |
42226 ns |
41902 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=1 |
42478 ns |
41940.5 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=1L |
87284 ns |
86987 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2 |
59544 ns |
59225 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2L |
84960 ns |
84376.5 ns |
1.01 |
array/reductions/reduce/Float32/1d |
35154 ns |
34351.5 ns |
1.02 |
array/reductions/reduce/Float32/dims=1 |
42585 ns |
40162 ns |
1.06 |
array/reductions/reduce/Float32/dims=1L |
51561 ns |
51325 ns |
1.00 |
array/reductions/reduce/Float32/dims=2 |
56923 ns |
56373.5 ns |
1.01 |
array/reductions/reduce/Float32/dims=2L |
70071 ns |
69359 ns |
1.01 |
array/reductions/reduce/Int64/1d |
42317 ns |
42066 ns |
1.01 |
array/reductions/reduce/Int64/dims=1 |
43081.5 ns |
42644.5 ns |
1.01 |
array/reductions/reduce/Int64/dims=1L |
87244 ns |
87115 ns |
1.00 |
array/reductions/reduce/Int64/dims=2 |
59650 ns |
59456.5 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
84799.5 ns |
84687 ns |
1.00 |
array/reverse/1d |
17797 ns |
17753 ns |
1.00 |
array/reverse/1dL |
68407 ns |
68387 ns |
1.00 |
array/reverse/1dL_inplace |
65654 ns |
65713 ns |
1.00 |
array/reverse/1d_inplace |
10028.166666666668 ns |
8493.333333333334 ns |
1.18 |
array/reverse/2d |
20720 ns |
20458 ns |
1.01 |
array/reverse/2dL |
72706 ns |
72592.5 ns |
1.00 |
array/reverse/2dL_inplace |
65750 ns |
65723 ns |
1.00 |
array/reverse/2d_inplace |
10562 ns |
9941 ns |
1.06 |
array/sorting/1d |
2735446 ns |
2735278.5 ns |
1.00 |
array/sorting/2d |
1072853.5 ns |
1075661 ns |
1.00 |
array/sorting/by |
3313841.5 ns |
3328229 ns |
1.00 |
cuda/synchronization/context/auto |
1157.85 ns |
1171.1 ns |
0.99 |
cuda/synchronization/context/blocking |
940.25 ns |
928.35 ns |
1.01 |
cuda/synchronization/context/nonblocking |
8242.3 ns |
7539.6 ns |
1.09 |
cuda/synchronization/stream/auto |
995.6875 ns |
992.5 ns |
1.00 |
cuda/synchronization/stream/blocking |
836.1341463414634 ns |
788.1632653061224 ns |
1.06 |
cuda/synchronization/stream/nonblocking |
7805.6 ns |
7284.700000000001 ns |
1.07 |
integration/byval/reference |
143705 ns |
143692 ns |
1.00 |
integration/byval/slices=1 |
145549 ns |
145653 ns |
1.00 |
integration/byval/slices=2 |
284179 ns |
284386 ns |
1.00 |
integration/byval/slices=3 |
422720 ns |
422791 ns |
1.00 |
integration/cudadevrt |
102338 ns |
102280 ns |
1.00 |
integration/volumerhs |
23488050 ns |
23484054 ns |
1.00 |
kernel/indexing |
13079 ns |
13019 ns |
1.00 |
kernel/indexing_checked |
13726 ns |
13745 ns |
1.00 |
kernel/launch |
2085.1111111111113 ns |
2114.3333333333335 ns |
0.99 |
kernel/occupancy |
672.85 ns |
700.7798742138365 ns |
0.96 |
kernel/rand |
17831 ns |
14137 ns |
1.26 |
latency/import |
3849382619 ns |
3831786424 ns |
1.00 |
latency/precompile |
4592526836 ns |
4592066898 ns |
1.00 |
latency/ttfp |
4417512078.5 ns |
4404545722 ns |
1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
|
These changes seem to cause test failures. |
|
Ok I should have fixed the errors. There are some errors on Julia v1.12 and nightly related to |
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## master #2973 +/- ##
==========================================
+ Coverage 89.43% 89.47% +0.03%
==========================================
Files 148 148
Lines 12991 13001 +10
==========================================
+ Hits 11619 11632 +13
+ Misses 1372 1369 -3 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
|
Ok by updating the branch all the tests pass |
The previous implementation of
kronbetween a CUSPARSE matrix and aDiagonalwas wrong. It was assuming the diagonal to be the identity, which is not always the case. I have extended it to any generalDiagonalmatrix embedding aCuVector.In order to support the
kronwith identity like matrices, I have added the extension to FillArrays.jl. Here the diagonal is filled by an unique value.