From 3b17d56af0413f9495df0af71120df6fa442a3d4 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Wed, 13 Aug 2025 14:52:52 -0300 Subject: [PATCH 1/6] Name changes and inclusion of bigger test matrices --- src/NextLA.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/NextLA.jl b/src/NextLA.jl index 36ca7d9..8465a72 100644 --- a/src/NextLA.jl +++ b/src/NextLA.jl @@ -77,7 +77,6 @@ function lamch(::Type{T}, cmach) where {T <: Number} end end -# Write your package code here. include("NextLAMatrix.jl") include("lu.jl") include("trmm.jl") From 31940266eb2724a2d16c6d86c0be209e85a78cc9 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Thu, 28 Aug 2025 18:25:52 -0300 Subject: [PATCH 2/6] CAQR: new interface and test adjustemnts --- src/axpy.jl | 4 +- src/geqr2.jl | 77 ++++++---- src/geqrt.jl | 97 ++++++++----- src/gerc.jl | 67 +++++++-- src/larf.jl | 191 ++++++++++++++++++++----- src/larfb.jl | 183 ++++++++++++++++-------- src/larfg.jl | 131 +++++++++++++---- src/larft.jl | 221 +++++++++++++++++++--------- src/lauu2.jl | 149 ++++++++++--------- src/lauum.jl | 266 ++++++++++++++++++---------------- src/pamm.jl | 218 +++++++++++++++++++--------- src/parfb.jl | 183 ++++++++++++++++-------- src/pemv.jl | 316 +++++++++++++++++++++++++--------------- src/rectrxm.jl | 154 +++++++++++++------- src/trmm.jl | 206 ++++++++++++++++++++++----- src/trsm.jl | 55 +++++++ src/tsmqr.jl | 226 +++++++++++++++++++++-------- src/tsqrt.jl | 178 +++++++++++++++++------ src/ttmqr.jl | 110 +++++++------- src/ttqrt.jl | 161 +++++++++++---------- src/unmqr.jl | 364 +++++++++++++++++++++++++++++------------------ test/geqr2.jl | 68 ++++----- test/geqrt.jl | 189 ++++++------------------ test/larf.jl | 33 +++-- test/larfb.jl | 41 +++--- test/larfg.jl | 39 ++--- test/larft.jl | 16 +-- test/lauum.jl | 14 +- test/pamm.jl | 72 +++------- test/parfb.jl | 62 ++++---- test/pemv.jl | 70 ++++----- test/runtests.jl | 8 +- test/tsmqr.jl | 63 ++++---- test/tsqrt.jl | 44 +++--- test/ttmqr.jl | 58 ++++---- 
test/ttqrt.jl | 27 ++-- test/unmqr.jl | 131 ++++++++--------- 37 files changed, 2802 insertions(+), 1690 deletions(-) diff --git a/src/axpy.jl b/src/axpy.jl index e2cce4b..e56e516 100644 --- a/src/axpy.jl +++ b/src/axpy.jl @@ -1,4 +1,4 @@ -function axpy!(a, x, y) +function axpy!(a::T, x::AbstractVector{T}, y::AbstractVector{T}) where {T} n = length(x) if n <= 0 @@ -12,6 +12,4 @@ function axpy!(a, x, y) for i in 1:n y[i] = y[i] + a*x[i] end - - return end \ No newline at end of file diff --git a/src/geqr2.jl b/src/geqr2.jl index ce9e94a..3e1ed13 100644 --- a/src/geqr2.jl +++ b/src/geqr2.jl @@ -1,43 +1,72 @@ -function geqr2(m,n, A, lda, tau, work) +""" + geqr2!(m, n, A, tau, work) + +Compute unblocked QR factorization of an m-by-n matrix A using Householder reflectors. +The matrix A is overwritten with the Q and R factors. + +# Arguments +- `m`: Number of rows in matrix A +- `n`: Number of columns in matrix A +- `A`: Input matrix (m × n), modified in place to contain Q and R factors +- `tau`: Output vector of scalar factors (length min(m,n)) +- `work`: Workspace vector (length n) + +# Algorithm +Uses Householder reflectors H(i) to zero out elements below the diagonal. +For each column i, generates H(i) and applies it to remaining columns. 
+""" +function geqr2!(m::Integer, n::Integer, A::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T} + # Input validation if m < 0 - throw(ArgumentError("illegal value of m")) - return -1 + throw(ArgumentError("illegal value of m: $m")) end if n < 0 - throw(ArgumentError("illegal value of n")) - return -2 + throw(ArgumentError("illegal value of n: $n")) end - if lda < max(1,m) - throw(ArgumentError("illegal value of lda")) - return -4 + # Quick return for empty matrices + if m == 0 || n == 0 + return end - k = min(m,n) + k = min(m, n) # Number of reflectors to generate one = oneunit(eltype(A)) - #av = parent(A) - #a1, a2 = parentindices(A) - #a1 = a1.start-1 - #a2 = a2.start-1 - + # Main QR factorization loop for i in 1:k - # generate elementary reflector H(i) to anniliate A(i+1:m, i) - A[i,i], tau[i] = larfg(m-i+1, A[i, i], (@view A[min(i+1,m):m, i]), 1, tau[i]) + # Generate elementary reflector H(i) to annihilate A(i+1:m, i) + A[i, i], tau[i] = larfg!(m-i+1, A[i, i], (@view A[min(i+1,m):m, i]), 1, tau[i]) if i < n - # apply H(i)^H to A(i:m, i+1:n) from left - alpha = A[i,i] - A[i,i] = one + # Apply H(i)^H to A(i:m, i+1:n) from the left + alpha = A[i, i] + A[i, i] = one # Set diagonal element to 1 for reflector application - #LinearAlgebra.LAPACK.larf!('L', (@view A[i:m, i]), conj(tau[i]), (@view A[i:m, i+1:n]), work) - larf('L', m-i+1, n-i, (@view A[i:m, i]), 1, conj(tau[i]), (@view A[i:m, i+1:n]), work) - #zlarf('L', m-i+1, n-i, (@view av[i+a1:m+a1, i+a2]), 1, conj(tau[i]), (@view av[i+a1:m+a1, i+1+a2:n+a2]), lda, work) + # Apply the reflector to remaining columns + larf!('L', m-i+1, n-i, (@view A[i:m, i]), 1, conj(tau[i]), (@view A[i:m, i+1:n]), work) - A[i,i] = alpha + A[i, i] = alpha # Restore original diagonal element end end +end - return +""" + geqr2!(A) -> (A, tau) + +Helper function for unblocked QR factorization using Householder reflectors. 
+ +# Arguments +- `A`: Input matrix (m × n), modified in place +- `tau`: Output vector of scalar factors (length min(m,n)) + +# Returns +- Modified `A` containing Q and R factors +- `tau`: Vector of scalar factors (length min(m,n)) +""" +function geqr2!(A::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} + m, n = size(A) + work = zeros(T, n) + + geqr2!(m, n, A, tau, work) end diff --git a/src/geqrt.jl b/src/geqrt.jl index 963390e..8b1e86f 100644 --- a/src/geqrt.jl +++ b/src/geqrt.jl @@ -1,58 +1,89 @@ -function geqrt(m,n,ib, A, lda, T, ldt, tau, work) +""" + geqrt!(m, n, ib, A, T_matrix, tau, work) + +Compute blocked QR factorization of an m-by-n matrix A using block size ib. +The matrix A is overwritten with the Q and R factors, and T contains the +triangular factor of the block reflector. + +# Arguments +- `m`: Number of rows in matrix A +- `n`: Number of columns in matrix A +- `ib`: Block size for the factorization (must be > 0 if m,n > 0) +- `A`: Input matrix (m × n), modified in place to contain Q and R factors +- `T`: Output triangular block reflector matrix (ib × n) +- `tau`: Output vector of scalar factors (length n) +- `work`: Workspace vector (length ib × n) + +# Algorithm +Uses a block algorithm that processes ib columns at a time. +For each block, performs unblocked QR and then applies the +block reflector to the remaining columns. 
+""" +function geqrt!(m::Integer, n::Integer, ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T} + # Input validation if m < 0 - throw(ArgumentError("illegal value of m")) - return -1 + throw(ArgumentError("illegal value of m: $m")) end if n < 0 - throw(ArgumentError("illegal value of n")) - return -2 + throw(ArgumentError("illegal value of n: $n")) end if (ib < 0) || ((ib == 0) && (m > 0) && (n > 0)) - throw(ArgumentError("illegal value of ib")) - return -3 - end - - if lda < max(1,m) && m > 0 - throw(ArgumentError("illegal value of lda")) - return -5 - end - - if ldt < max(1,ib) && ib > 0 - throw(ArgumentError("illegal value of ldt")) - return -7 + throw(ArgumentError("illegal value of ib: $ib")) end + # Quick return for empty matrices or zero block size if m == 0 || n == 0 || ib == 0 return end - k = min(m,n) + k = min(m, n) # Number of reflectors to generate + # Process matrix in blocks of size ib for i in 1:ib:k - sb = min(ib, k-i+1) + sb = min(ib, k-i+1) # Current block size - av = @view A[i:m, i:i+sb-1] - tv = @view T[1:sb,i:i+sb-1] - tauv = @view tau[i:i+sb-1] + # Extract current block and corresponding parts of T and tau + av = @view A[i:m, i:i+sb-1] # Current block columns + tv = @view T_matrix[1:sb, i:i+sb-1] # Corresponding T block + tauv = @view tau[i:i+sb-1] # Corresponding tau values - # compute qr for A[i:m, i:i+sb-1] + # Perform unblocked QR factorization on current block + geqr2!(m-i+1, sb, av, tauv, work) - geqr2(m-i+1, sb, av, lda, tauv, work) - larft('F', 'C', m-i+1, sb, av, lda, tauv, tv, ldt) + # Form the triangular factor T for the block reflector + larft!('F', 'C', m-i+1, sb, av, tauv, tv) + # Apply block reflector to remaining columns if any exist if n >= i + sb - # update by apply H^H to A[i:m, i+sb:n] from left - - #wwork = @view work[1: (n-i-sb+1)*sb] - #ww = reshape(wwork, n-i-sb+1, sb) - ww = reshape((@view work[1: (n-i-sb+1)*sb]), n-i-sb+1, sb) + # Reshape work 
array for block reflector application + ww = reshape((@view work[1:(n-i-sb+1)*sb]), n-i-sb+1, sb) - larfb('L', 'C', 'F', 'C', m-i+1, n-i-sb+1, sb, av, - m-i+1, tv, sb, (@view A[i:m, i+sb:n]), lda, ww, n-i-sb+1) + # Apply H^H to A[i:m, i+sb:n] from the left + larfb!('L', 'C', 'F', 'C', m-i+1, n-i-sb+1, sb, av, + m-i+1, tv, (@view A[i:m, i+sb:n]), ww) end end +end + +""" + geqrt!(A, ib) -> (A, T, tau) + +Helper function for blocked QR factorization. Computes A = Q*R where Q is orthogonal and R is upper triangular. + +# Arguments +- `A`: Input matrix (m × n), modified in place to contain R in upper triangle and Q factors below +- `ib`: Block size for the factorization + +# Returns +- Modified `A` matrix containing Q and R factors +- `T`: Upper triangular block reflector matrix (ib × n) +- `tau`: Vector of scalar factors for elementary reflectors (length n) +""" +function geqrt!(ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} + m, n = size(A) + work = zeros(T, ib * n) - return + geqrt!(m, n, ib, A, T_matrix, tau, work) end diff --git a/src/gerc.jl b/src/gerc.jl index e5e767b..5df5cba 100644 --- a/src/gerc.jl +++ b/src/gerc.jl @@ -1,30 +1,71 @@ +""" + gerc!(alpha, x, y, A) + +Perform the rank-1 update: A := A + alpha * x * y^H + +This function computes a rank-1 update to the matrix A using the outer product +of vectors x and y, scaled by the scalar alpha. The operation performed is: +A[i,j] := A[i,j] + alpha * x[i] * conj(y[j]) + +This is the complex version of the rank-1 update (GER Complex), where the +conjugate of y is used in the outer product. + +# Arguments +- `alpha`: Scalar multiplier for the rank-1 update +- `x`: Vector of length m (first dimension) +- `y`: Vector of length n (second dimension) +- `A`: m×n matrix to be updated in-place + +# Algorithm +The algorithm efficiently computes the outer product by: +1. For each column j, compute temp = alpha * conj(y[j]) +2. 
If temp ≠ 0, update column j: A[:,j] += temp * x +3. Skip columns where y[j] = 0 to avoid unnecessary computation + +# Input Validation +- Matrix A must have non-negative dimensions +- Vectors x and y must have lengths matching A dimensions +- All inputs must have compatible numeric types + +# Performance Notes +- Optimized for cache efficiency by operating column-wise +- Skips zero elements in y to minimize operations +- In-place operation minimizes memory allocation + +# Example +```julia +m, n = 4, 3 +A = zeros(ComplexF64, m, n) +x = complex.([1.0, 2.0, 3.0, 4.0], [0.1, 0.2, 0.3, 0.4]) +y = complex.([1.0, 0.0, 2.0], [0.5, 0.0, 1.0]) +alpha = 2.0 + 1.0im +gerc!(alpha, x, y, A) # A updated with rank-1 modification +``` +""" function gerc!(alpha::T, x::AbstractVector{T}, y::AbstractVector{T}, A::AbstractMatrix{T}) where {T} m, n = size(A) - if m < 0 - return 1 + # Input validation with descriptive error messages + if length(x) != m + throw(ArgumentError("Vector x length ($(length(x))) must match matrix row dimension ($m)")) end - - if n < 0 - return 2 + + if length(y) != n + throw(ArgumentError("Vector y length ($(length(y))) must match matrix column dimension ($n)")) end + # Early return for degenerate cases if m == 0 || n == 0 || alpha == zero(T) return end - jy = 1 - + # Perform rank-1 update: A := A + alpha * x * y^H for j in 1:n - if y[jy] != zero(T) - temp = alpha * conj(y[jy]) + if y[j] != zero(T) + temp = alpha * conj(y[j]) for i in 1:m A[i, j] += x[i] * temp end end - - jy += 1 end - - return end diff --git a/src/larf.jl b/src/larf.jl index bb592e9..d26ae3e 100644 --- a/src/larf.jl +++ b/src/larf.jl @@ -1,84 +1,127 @@ -function larf(side, m, n, v, incv, tau, c, work) +""" + larf!(side, m, n, v, incv, tau, c, work) + +Apply an elementary reflector H to a m-by-n matrix C from either +the left or the right. + +H = I - tau * v * v^H + +where tau is a scalar and v is a vector. 
+ +# Arguments +- `side`: Character specifying the side of application + - 'L': apply H from the left (H * C) + - 'R': apply H from the right (C * H) +- `m`: Number of rows in matrix C +- `n`: Number of columns in matrix C +- `v`: Array containing the elementary reflector vector +- `incv`: Increment for the elements of v (typically 1) +- `tau`: Scalar factor for the elementary reflector +- `c`: m-by-n matrix to be modified in-place +- `work`: Workspace array + +# Algorithm +The elementary reflector H is applied optimally by exploiting the structure +of the reflector. The algorithm scans for the effective length of the reflector +vector and the effective dimensions of the matrix to minimize operations. + +For side = 'L': Computes C := H * C = (I - tau * v * v^H) * C +For side = 'R': Computes C := C * H = C * (I - tau * v * v^H) + +# Notes +This is a low-level computational routine used internally by higher-level +QR factorization algorithms. The workspace array must be properly allocated. 
+""" +function larf!(side::Char, m::Integer, n::Integer, v::AbstractVector{T}, incv::Integer, tau::T, C::AbstractMatrix{T}, work::AbstractVector{T}) where {T} lastv = 0 lastc = 0 - one = oneunit(eltype(c)) - zero0 = zero(eltype(c)) + one0 = oneunit(eltype(C)) + zero0 = zero(eltype(C)) - if tau != 0 - # set up variables for scanning v, lastv beigns pointing to end of V - + if tau != zero0 + # Determine the effective length of the reflector vector v if side == 'L' lastv = m else lastv = n end + # Find the index of the last element to check if incv > 0 i = 1 + (lastv-1)*incv else i = 1 end - while lastv > 0 && v[i] == 0 + # Scan backwards to find the last non-zero element in v + while lastv > 0 && v[i] == zero0 lastv -= 1 i -= incv end + # Determine the effective dimensions of C to operate on if side == 'L' - # scan for last non-zero column in C[1:lastv, :] - lastc = ilazlc(lastv, n, c) + # Find last non-zero column in C[1:lastv, :] + lastc = ilazlc(lastv, n, C) else - # scan for last non-zero row in C[:, 1:lastv] - lastc = ilazlr(m, lastv, c) + # Find last non-zero row in C[:, 1:lastv] + lastc = ilazlr(m, lastv, C) end end if side == 'L' - #form H*C - + # Form H*C = (I - tau * v * v^H) * C if lastv > 0 vv = @view v[1:lastv, 1] - cv = @view c[1:lastv, 1:lastc] + cv = @view C[1:lastv, 1:lastc] wv = @view work[1:lastc] - # w[1:lastc,1] = c[1:lastv, 1:lastc]^H * v[1:lastv, 1] - - #LinearAlgebra.BLAS.gemv!('C', one, cv, vv, zero0, wv) - LinearAlgebra.generic_matvecmul!(wv, 'C', cv, vv, LinearAlgebra.MulAddMul(one, zero0)) - #LinearAlgebra.generic_matvecmul!((@view work[1:lastc, 1]), 'C', (@view c[1:lastv, 1:lastc]), - #(@view v[1:lastv, 1]), LinearAlgebra.MulAddMul(one, zero0)) - - #c[1:lastv,1:lastc] -= tau*v[1:lastv, 1]*w[1:lastc,1]^H - #LinearAlgebra.BLAS.gemm!('N', 'C', -tau, vv, wv, one, cv) - gerc!(-tau, vv, wv, cv) + + # Step 1: w = C^H * v (compute v^H * C as w^T) + LinearAlgebra.generic_matvecmul!(wv, 'C', cv, vv, LinearAlgebra.MulAddMul(one0, zero0)) - 
#LinearAlgebra.generic_matmul!((@view c[1:lastv, 1:lastc]), 'N', (@view v[1:lastv, 1]), - #(@view work[1:lastc, 1]), LinearAlgebra.MulAddMul(-tau, one)) + # Step 2: C := C - tau * v * w^H (rank-1 update) + gerc!(-tau, vv, wv, cv) end else - #form C*H - + # Form C*H = C * (I - tau * v * v^H) if lastv > 0 - # w[1:lastc,1] = c[1:lastc, 1:lastv] * v[1:lastv, 1] - LinearAlgebra.generic_matvecmul!((@view work[1:lastc, 1]), 'N', (@view c[1:lastc, 1:lastv]), - (@view v[1:lastv, 1]), LinearAlgebra.MulAddMul(one, zero0)) + # Step 1: w = C * v + LinearAlgebra.generic_matvecmul!((@view work[1:lastc, 1]), 'N', (@view C[1:lastc, 1:lastv]), + (@view v[1:lastv, 1]), LinearAlgebra.MulAddMul(one0, zero0)) - #c[1:lastc,1:lastv] -= tau(?)*w[1:lastc,1]*v[1:lastv, 1]^H - - #LinearAlgebra.BLAS.ger!(-tau, wv, vv, cv) - gerc!(-tau, (@view work[1:lastc, 1]), (@view v[1:lastv, 1]), (@view c[1:lastc, 1:lastv])) + # Step 2: C := C - tau * w * v^H (rank-1 update) + gerc!(-tau, (@view work[1:lastc, 1]), (@view v[1:lastv, 1]), (@view C[1:lastc, 1:lastv])) end end end -function ilazlc(m,n,a) +""" + ilazlc(m, n, a) -> Int + +Find the index of the last non-zero column in an m-by-n matrix. +Scans from column n backwards to column 1, checking all rows +in each column for non-zero elements. + +# Arguments +- `m`: Number of rows in matrix a +- `n`: Number of columns in matrix a +- `a`: Matrix to scan + +# Returns +- Index of last column containing at least one non-zero element, + or 0 if all elements are zero +""" +function ilazlc(m, n, a) if n == 0 return n end + # Quick check of the last column boundaries if a[1,n] != 0 || a[m,n] != 0 return n end + # Scan columns from right to left for j in n:-1:1 for i in 1:m if a[i, j] != 0 @@ -86,28 +129,98 @@ function ilazlc(m,n,a) end end end + + return 0 # All elements are zero end -function ilazlr(m,n,a) - +""" + ilazlr(m, n, a) -> Int + +Find the index of the last non-zero row in an m-by-n matrix. +Scans all columns to determine the effective row dimension. 
+ +# Arguments +- `m`: Number of rows in matrix a +- `n`: Number of columns in matrix a +- `a`: Matrix to scan + +# Returns +- Index of last row containing at least one non-zero element, + or 0 if all elements are zero +""" +function ilazlr(m, n, a) if m == 0 return m end + # Quick check of the last row boundaries if a[m,1] != 0 || a[m,n] != 0 return m end ila = 0 + # For each column, find the last non-zero row for j in 1:n i = m while (a[max(i,1), j] == 0) && (i > 1) i -= 1 end - ila = max(ila, i) end return ila end + +""" + larf!(side, A, tau, C) -> C + +Apply an elementary reflector H to a matrix C, where H = I - tau * A * A^H. + +This is a high-level interface to the elementary reflector application routine. +The reflector can be applied from either the left (H*C) or right (C*H) side. + +# Arguments +- `side`: Character specifying application side ('L' for left, 'R' for right) +- `A`: Vector defining the elementary reflector +- `tau`: Scalar factor for the reflector +- `C`: Matrix to be transformed in-place + +# Returns +- The modified matrix `C` + +# Input Validation +- For side='L': length(A) must equal number of rows in C +- For side='R': length(A) must equal number of columns in C + +# Example +```julia +# Apply reflector from left: C := H * C +larf!('L', v, tau, C) + +# Apply reflector from right: C := C * H +larf!('R', v, tau, C) +``` +""" +function larf!(side::Char, v::AbstractVector{T}, incv::Integer, tau::T, C::AbstractMatrix{T}) where {T} + m, n = size(C) + + # Input validation with descriptive error messages + if side == 'L' + if length(v) != m + throw(ArgumentError("For side='L', reflector length ($(length(v))) must equal matrix row dimension ($m)")) + end + work = zeros(T, n) + elseif side == 'R' + if length(v) != n + throw(ArgumentError("For side='R', reflector length ($(length(v))) must equal matrix column dimension ($n)")) + end + work = zeros(T, m) + else + throw(ArgumentError("Invalid side parameter: '$side'. 
Must be 'L' or 'R'")) + end + + + # Call the core computational routine + larf!(side, m, n, v, incv, tau, C, work) +end diff --git a/src/larfb.jl b/src/larfb.jl index 33f5774..a9940ca 100644 --- a/src/larfb.jl +++ b/src/larfb.jl @@ -1,5 +1,5 @@ """ - larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, work, ldwork) + larfb!(side, trans, direct, storev, m, n, k, v, ldv, t, c, work) Applies complex block reflector H or its transpose H^H to m-by-n matrix C from either the left or the right Implemented with Julia internal functions for matrix multiplication @@ -29,23 +29,18 @@ Implemented with Julia internal functions for matrix multiplication - if storev = 'C' and side = 'R', ldv >= max(1,n) - if storev = 'R', ldv >= k - 't': dimension (ldv, k), the triangular k-by-k matrix t in representation of the block reflector -- 'ldt': the leading dimension of array t, ldt >= k - 'c': - on entry m-by-n matrix - on exit, overwritten by H*C or H^H*C or C*H or C*H^H -- 'ldc': the leading dimension of c. 
ldc >= max(1,m) - 'work': dimension (ldwork, k) -- 'ldwork': - - if side = 'L', ldwork >= max(1,n) - - if side = 'R', ldwork >= max(1,m) """ -function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, work, ldwork) - +function larfb!(side::Char, trans::Char, direct::Char, storev::Char, m::Integer, n::Integer, k::Integer, V::AbstractMatrix{T}, ldv::Integer, T_mat::AbstractMatrix{T}, C::AbstractMatrix{T}, work::AbstractMatrix{T}) where {T} + if m <= 0 || n <= 0 return end - one = oneunit(eltype(c)) + one = oneunit(eltype(C)) plus = LinearAlgebra.MulAddMul(one, one) minus = LinearAlgebra.MulAddMul(one*(-1),one) @@ -62,10 +57,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor (C2) """ - c1 = @view c[1:k,:] - c2 = @view c[k+1:m,:] - v1 = @view v[1:k,:] - v2 = @view v[k+1:m,:] + c1 = @view C[1:k,:] + c2 = @view C[k+1:m,:] + v1 = @view V[1:k,:] + v2 = @view V[k+1:m,:] work .= c1' @@ -80,9 +75,9 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor # W = W * T^H or W*T if trans == 'N' # W = W*T^H - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat) else - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat) end if m > k @@ -100,10 +95,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor """ Form C*H or C*H^H where C = (c1 c2) """ - c1 = @view c[:, 1:k] - c2 = @view c[:, k+1:n] - v1 = @view v[1:k,:] - v2 = @view v[k+1:n,:] + c1 = @view C[:, 1:k] + c2 = @view C[:, k+1:n] + v1 = @view V[1:k,:] + v2 = @view V[k+1:n,:] work .= c1 @@ -119,9 +114,9 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor #w = w*t or w*t^H if trans == 'C' # W = W*T^H - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 
'N', adjoint, work, T_mat) else - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat) end if n > k @@ -146,10 +141,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor Form H*C or H^H*C where C = (c1) (c2) """ - c1 = @view c[1:m-k,:] - c2 = @view c[m-k+1:m,:] - v1 = @view v[1:ldv-k,:] - v2 = @view v[ldv-k+1:ldv,:] + c1 = @view C[1:m-k,:] + c2 = @view C[m-k+1:m,:] + v1 = @view V[1:ldv-k,:] + v2 = @view V[ldv-k+1:ldv,:] work .= c2' @@ -163,10 +158,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor if trans == 'N' #work = work*(t') - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat) else #work = work*t - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat) end #c1 = c1 - v1*w^H @@ -180,7 +175,7 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor #c2 = c2 - w^H for j in 1:k for i in 1:n - c[m-k+j,i] = c[m-k+j,i] - conj(work[i,j]) + C[m-k+j,i] = C[m-k+j,i] - conj(work[i,j]) end end else @@ -188,10 +183,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor """ Form C*H or C*H^H where C = (c1 c2) """ - c1 = @view c[:,1:n-k] - c2 = @view c[:,n-k+1:n] - v1 = @view v[1:ldv-k,:] - v2 = @view v[ldv-k+1:ldv,:] + c1 = @view C[:,1:n-k] + c2 = @view C[:,n-k+1:n] + v1 = @view V[1:ldv-k,:] + v2 = @view V[ldv-k+1:ldv,:] work .= c2 @@ -205,10 +200,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor if trans == 'C' #work = work*(t') - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat) else #work = work*t - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, 
work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat) end #c1 = c1 - w*v1^H @@ -237,10 +232,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor (c2) """ - v1 = @view v[:, 1:k] - v2 = @view v[:, k+1:m] - c1 = @view c[1:k, :] - c2 = @view c[k+1:m, :] + v1 = @view V[:, 1:k] + v2 = @view V[:, k+1:m] + c1 = @view C[1:k, :] + c2 = @view C[k+1:m, :] work .= c1' @@ -254,10 +249,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor if trans == 'N' #work = work*(t') - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat) else #work = work*t - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat) end #c2 = c2 - v2^h*w^h @@ -276,10 +271,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor Form C*H or C*H^H where C = (c1 c2) """ - v1 = @view v[:, 1:k] - v2 = @view v[:, k+1:n] - c1 = @view c[:, 1:k] - c2 = @view c[:, k+1:n] + v1 = @view V[:, 1:k] + v2 = @view V[:, k+1:n] + c1 = @view C[:, 1:k] + c2 = @view C[:, k+1:n] work .= c1 @@ -293,10 +288,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor if trans == 'C' #work = work*(t') - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat) else #work = work*t - LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat) end #c2 = c2 - w*v2 @@ -320,10 +315,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor Form H*C or H^H*C where C = (c1) (c2) """ - v1 = @view v[:, 1:m-k] - v2 = @view v[:, m-k+1:m] - c1 = @view c[1:m-k,:] - c2 = @view c[m-k+1:m,:] + v1 = @view V[:, 1:m-k] + v2 = @view V[:, m-k+1:m] + 
c1 = @view C[1:m-k,:] + c2 = @view C[m-k+1:m,:] work .= c2' @@ -337,10 +332,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor if trans == 'N' #work = work*(t') - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat) else #work = work*t - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat) end #c1 = c1 - v1^h * w^h @@ -358,10 +353,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor """ Form C*H or C*H^H where C = (c1 c2) """ - v1 = @view v[:, 1:n-k] - v2 = @view v[:, n-k+1:n] - c1 = @view c[:, 1:n-k] - c2 = @view c[:,n-k+1:n] + v1 = @view V[:, 1:n-k] + v2 = @view V[:, n-k+1:n] + c1 = @view C[:, 1:n-k] + c2 = @view C[:,n-k+1:n] work .= c2 @@ -375,10 +370,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor if trans == 'C' #work = work*(t') - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat) else #work = work*t - LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t) + LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat) end #c1 = c1 - w*v1 @@ -395,6 +390,74 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor end end end +end + +""" + larfb!(side, trans, direct, storev, V, T, C) + +Apply a complex block reflector H or its conjugate transpose H^H to a matrix C. + +This is a high-level interface that automatically computes required dimensions +and allocates workspace for the block reflector application. + +The block reflector H has the form: +H = I - V * T * V^H - return -end \ No newline at end of file +where V contains k elementary reflector vectors and T is an upper triangular +block reflector coefficient matrix. 
+ +# Arguments +- `side`: Character specifying which side to apply the reflector + - 'L': Apply H from the left (H*C or H^H*C) + - 'R': Apply H from the right (C*H or C*H^H) +- `trans`: Character specifying which form to apply + - 'N': Apply H (no conjugate transpose) + - 'C': Apply H^H (conjugate transpose) +- `direct`: Character indicating how H is formed from elementary reflectors + - 'F': H = H(1) H(2) ... H(k) (Forward - first k reflectors) + - 'B': H = H(k) ... H(2) H(1) (Backward - last k reflectors) +- `storev`: Character indicating how reflector vectors are stored in V + - 'C': Reflector vectors stored columnwise in V + - 'R': Reflector vectors stored rowwise in V +- `V`: Matrix containing the elementary reflector vectors +- `T`: Upper triangular k×k matrix with block reflector coefficients +- `C`: m×n matrix to be transformed in-place + +# Algorithm +Applies the block reflector efficiently by: +1. Computing W = C^H * V (or W = C * V for right multiplication) +2. Multiplying by the triangular matrix T: W := W * T (or W * T^H) +3. Applying rank-k update: C := C - V * W^H (or C - W * V^H) + +The algorithm exploits the triangular structure of the reflector matrix +to minimize computational cost. 
+ +# Example +```julia +m, n, k = 8, 6, 4 +C = complex.(randn(m, n), randn(m, n)) +V = complex.(randn(m, k), randn(m, k)) # k reflector vectors +T = triu(complex.(randn(k, k), randn(k, k))) # Upper triangular +larfb!('L', 'N', 'F', 'C', V, T, C) # Apply H*C +``` +""" +function larfb!(side::Char, trans::Char, direct::Char, storev::Char, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, C::AbstractMatrix{T}) where {T} + # Determine dimensions + m, n = size(C) + k = size(T_mat, 1) # reflector count comes from the matrix T_mat; `T` is the element type here + + # Set leading dimensions + ldv = size(V, 1) + + # Allocate workspace + if side == 'L' + ldwork = n + work = similar(C, n, k) # n-by-k: the kernel assigns the adjoint of a k-by-n block of C into work + else + ldwork = m + work = similar(C, m, k) + end + + # Call the underlying kernel + larfb!(side, trans, direct, storev, m, n, k, V, ldv, T_mat, C, work) +end \ No newline at end of file diff --git a/src/larfg.jl b/src/larfg.jl index 71ec937..332b958 100644 --- a/src/larfg.jl +++ b/src/larfg.jl @@ -1,59 +1,100 @@ -function larfg(n, alpha, x, incx, tau) +""" + larfg!(n, alpha, x, incx, tau) + +Generate an elementary reflector H such that: +H * [alpha; x] = [beta; 0] + +where H = I - tau * v * v^H, v = [1; x/scale], and beta = -sign(alpha) * ||[alpha; x]|| + +This routine generates a complex elementary Householder reflector H of order n, +such that when applied to the vector [alpha; x], it zeros out the x portion +and produces [beta; 0] where beta has the same magnitude as the original vector. + +# Arguments +- `n`: Order of the reflector (length of full vector [alpha; x]) +- `alpha`: Scalar element, the first component of the vector +- `x`: Vector of length n-1, remaining components of the vector +- `incx`: Increment for elements of x (typically 1) +- `tau`: Output scalar factor for the reflector + +# Returns +- `alpha`: Modified to contain beta (the new first component) +- `tau`: Scalar factor such that H = I - tau * v * v^H + +# Algorithm +The algorithm handles potential under/overflow carefully by scaling when +necessary. 
The reflector is chosen so that the reflection introduces no +unnecessary amplification of round-off errors. + +Special cases: +- If x = 0 and imag(alpha) = 0, then tau = 0 (no reflection needed) +- If n ≤ 1, then tau = 0 (trivial case) + +# Mathematical Details +For the elementary reflector H = I - tau * v * v^H where v = [1; u]: +- tau = (beta - alpha) / beta for real case +- tau = (beta - Re(alpha))/beta - i*Im(alpha)/beta for complex case +- The vector u replaces x on output + +# Note +This is a low-level LAPACK-style computational routine. Input validation +should be performed by higher-level interfaces. +""" +function larfg!(n::Integer, alpha::T, x::AbstractVector{T}, incx::Integer, tau::T) where {T} one = oneunit(eltype(alpha)) zero0 = zero(eltype(alpha)) - type = eltype(alpha) - if n <= 0 + + if n <= 1 tau = zero0 return alpha, tau end - - if n == 1 - xnorm = 0 - else - xnorm = norm(x,2) - end + + xnorm = norm(x, 2) alphr = real(alpha) alphi = imag(alpha) - if xnorm == 0 && alphi == 0 + + if xnorm == zero0 && alphi == zero0 tau = zero0 else + # Compute beta = -sign(alphr) * ||[alpha, x]|| beta = -copysign(sqrt(alphr^2 + alphi^2 + xnorm^2), alphr) + # Machine parameters for safe scaling safmin = lamch(eltype(alphr), 'S') / lamch(eltype(alphr), 'E') rsafmn = one / safmin knt = 0 if abs(beta) < safmin - # xnorm, beta may be inaccurate, scale x and recompute - - while true + # xnorm, beta may be inaccurate due to underflow; scale and recompute + while abs(beta) < safmin knt += 1 x .*= rsafmn beta *= rsafmn - alphr *= rsafmn + alphr *= rsafmn alphi *= rsafmn alpha *= rsafmn - - if abs(beta) < safmin - break - end end - #recompute - xnorm = norm(x) - if type <: Complex + # Recompute with scaled values + xnorm = norm(x, 2) + if T <: Complex alpha = alphr + im * alphi end beta = -copysign(sqrt(alphr^2 + alphi^2 + xnorm^2), alphr) end - if type <: Complex - tau = ( beta-alphr ) / beta - im * alphi / beta + + # Compute tau based on number type + if T <: Complex + tau 
= (beta - alphr) / beta - im * alphi / beta else - tau = ( beta - alphr ) / beta + tau = (beta - alphr) / beta end - x .*= (one / (alpha-beta)) + # Scale x to form the reflector vector + x .*= (one / (alpha - beta)) + + # Scale beta back if we scaled up for j in 1:knt beta *= safmin end @@ -63,3 +104,43 @@ function larfg(n, alpha, x, incx, tau) return alpha, tau end + +""" + larfg!(x) -> (alpha, tau, x_updated) + +Generate an elementary reflector H such that H * x produces a vector +with all but the first element equal to zero. + +This is a high-level interface to the elementary reflector generation routine. +Given a vector x, it computes a Householder reflector H = I - tau * v * v^H +that zeros out all but the first component. + +# Arguments +- `x`: Vector to be transformed (will be modified in-place) + +# Returns +- `alpha`: The resulting first component (beta) +- `tau`: Scalar factor of the elementary reflector +- `x_updated`: The updated vector with first component as alpha, rest as reflector vector + +# Input Validation +- Vector must have at least one element + +# Example +```julia +x = complex.([3.0, 4.0, 0.0], [0.0, 0.0, 0.0]) +alpha, tau, x_new = larfg!(x) +# x_new[1] will be the magnitude -||x||, x_new[2:end] will be the reflector vector +``` + +# Mathematical Background +Creates H such that H * x = [||x||; 0; 0; ...] where the sign is chosen +to avoid cancellation. The reflector vector is stored in x_new[2:end]. 
+""" +function larfg!(alpha::T, x::AbstractVector{T}, incx::Integer, tau::T) where {T} + n = length(x) + + alpha_out, tau_out = larfg!(n, alpha, x, incx, tau) + + return alpha_out, tau_out +end \ No newline at end of file diff --git a/src/larft.jl b/src/larft.jl index bab0e63..b76f39f 100644 --- a/src/larft.jl +++ b/src/larft.jl @@ -1,160 +1,249 @@ -function larft(direct, storev, n, k, v, ldv, tau, t, ldt) +""" + larft!(direct, storev, n, k, v, tau, T_mat) + +Form the triangular factor T of a complex block reflector H of order n, +where H is defined as a product of k elementary reflectors. + +The block reflector H has the form: +H = I - V * T * V^H + +where V is n-by-k and contains the elementary reflector vectors, and T is +the k-by-k upper triangular factor computed by this routine. + +# Arguments +- `direct`: Character indicating the order of the elementary reflectors + - 'F': H = H(1) H(2) ... H(k) (Forward) + - 'B': H = H(k) ... H(2) H(1) (Backward) +- `storev`: Character indicating how the reflector vectors are stored in V + - 'C': Columnwise storage (V is n-by-k) + - 'R': Rowwise storage (V is k-by-n) +- `n`: Order of the reflector H +- `k`: Number of elementary reflectors (order of T) +- `v`: Matrix containing the elementary reflector vectors +- `tau`: Array containing the scalar factors of the elementary reflectors +- `T_mat`: k-by-k matrix where the triangular factor T will be stored + +# Algorithm +The algorithm computes T such that H = I - V * T * V^H where each column +(or row) of V represents an elementary reflector. The triangular structure +ensures efficient application of the block reflector. 
+ +For forward direction (direct='F'): +- T[i,i] = tau[i] (diagonal elements) +- T[j,i] = -tau[i] * V[i,j] * T[j,j:i-1] for j < i (upper triangular part) + +For backward direction (direct='B'): +- T[i,i] = tau[i] (diagonal elements) +- T[j,i] = -tau[i] * V[j,i] * T[i+1:j,i] for j > i (lower triangular part) + +# Notes +This is the core computational routine for forming block reflector coefficients. +The matrix T enables efficient application of multiple reflectors simultaneously. +""" +function larft!(direct::Char, storev::Char, n::Integer, k::Integer, V::AbstractMatrix{T}, tau::AbstractVector{T}, T_mat::AbstractMatrix{T}) where {T} if n == 0 return end - zero0 = zero(eltype(v)) - one = oneunit(eltype(v)) + zero0 = zero(eltype(V)) + one0 = oneunit(eltype(V)) if direct == 'F' prevlastv = n for i in 1:k - prevlastv = max(prevlastv, i) - if tau[i] == 0 - # H(i) = i - + if tau[i] == zero0 + # H(i) = I (no reflection) for j in 1:i - t[j,i] = zero0 + T_mat[j,i] = zero0 end - else - # general case - + # General case: compute T column if storev == 'C' + # Find the last non-zero element in v[:,i] lastv = n - - #for lastv in n:-1:i+1 while lastv >= i+1 - - if v[lastv, i] != 0 + if V[lastv, i] != zero0 break end - lastv -= 1 end + # Initialize T[1:i-1,i] with diagonal contribution for j in 1:i-1 - t[j,i] = -tau[i] * conj(v[i,j]) + T_mat[j,i] = -tau[i] * conj(V[i,j]) end + # Add contribution from off-diagonal part j = min(lastv, prevlastv) + LinearAlgebra.generic_matvecmul!((@view T_mat[1:i-1, i]), 'C', (@view V[i+1:j, 1:i-1]), + (@view V[i+1:j,i]), LinearAlgebra.MulAddMul(-tau[i], one0)) - # t[1:i-1, i] = -tau[i] * v[i:j, 1:i-1]^H * v[i:j, i] - LinearAlgebra.generic_matvecmul!((@view t[1:i-1, i]), 'C', (@view v[i+1:j, 1:i-1]), - (@view v[i+1:j,i]), LinearAlgebra.MulAddMul(-tau[i], one)) - - else + else # storev == 'R' + # Find the last non-zero element in v[i,:] lastv = n - #for lastv in n:-1:i+1 while lastv >= i+1 - if v[i, lastv] != 0 + if V[i, lastv] != zero0 break end - 
lastv -= 1 end + # Initialize T[1:i-1,i] with diagonal contribution for j in 1:i-1 - t[j,i] = -tau[i] * v[j,i] + T_mat[j,i] = -tau[i] * V[j,i] end + # Add contribution from off-diagonal part j = min(lastv, prevlastv) - - # t[1:i-1, i] = -tau[i] * v[1:i-1, i:j] * v[i,i:j]^H if i-1 > 0 - LinearAlgebra.generic_matmatmul!((@view t[1:i-1, i]), 'N', 'C', (@view v[1:i-1, i:j]), - (@view v[i:i, i:j]), LinearAlgebra.MulAddMul(-tau[i], one)) + LinearAlgebra.generic_matmatmul!((@view T_mat[1:i-1, i]), 'N', 'C', (@view V[1:i-1, i:j]), + (@view V[i:i, i:j]), LinearAlgebra.MulAddMul(-tau[i], one0)) end end - #t[1:i-1,i] = t[1:i-1, 1:i-1] * t[1:i-1,i] - LinearAlgebra.generic_trimatmul!((@view t[1:i-1,i]), 'U', 'N', identity, - (@view t[1:i-1, 1:i-1]), (@view t[1:i-1, i])) + # Apply triangular solve: T[1:i-1,i] = T[1:i-1,1:i-1] * T[1:i-1,i] + LinearAlgebra.generic_trimatmul!((@view T_mat[1:i-1,i]), 'U', 'N', identity, + (@view T_mat[1:i-1, 1:i-1]), (@view T_mat[1:i-1, i])) - t[i,i] = tau[i] + # Set diagonal element + T_mat[i,i] = tau[i] + # Update tracking variable if i > 1 prevlastv = max(prevlastv, lastv) else prevlastv = lastv end - end end - else + else # direct == 'B' prevlastv = 1 for i in k:-1:1 - if tau[i] == 0 - - #H(i) = I - + if tau[i] == zero0 + # H(i) = I (no reflection) for j in i:k - t[j,i] = zero0 + T_mat[j,i] = zero0 end - else if i < k if storev == 'C' + # Find the first non-zero element in v[:,i] lastv = 1 - - #for lastv in 1:i-1 while lastv <= i-1 - if v[lastv,i] != 0 + if V[lastv,i] != zero0 break end lastv += 1 end + # Initialize T[i+1:k,i] with diagonal contribution for j in i+1:k - t[j,i] = -tau[i] * conj(v[n-k+i, j]) + T_mat[j,i] = -tau[i] * conj(V[n-k+i, j]) end + # Add contribution from off-diagonal part j = max(lastv, prevlastv) - - - #t[i+1:k, i] = -tau[i] * v[j:n-k+i, i+1:k]^H * v[j:n-k+i, i] - - LinearAlgebra.generic_matvecmul!((@view t[i+1:k, i]), 'C', (@view v[j:n-k+i, i+1:k]), - (@view v[j:n-k+i, k]), LinearAlgebra.MulAddMul(-tau[i], one)) - else 
+ LinearAlgebra.generic_matvecmul!((@view T_mat[i+1:k, i]), 'C', (@view V[j:n-k+i, i+1:k]), + (@view V[j:n-k+i, k]), LinearAlgebra.MulAddMul(-tau[i], one0)) + + else # storev == 'R' + # Find the first non-zero element in v[i,:] lastv = 1 - #for lastv in 1:i-1 while lastv <= i-1 - if v[lastv,i] != 0 + if V[lastv,i] != zero0 break end lastv += 1 end + # Initialize T[i+1:k,i] with diagonal contribution for j in i+1:k - t[j,i] = -tau[i] * v[j, n-k+i] + T_mat[j,i] = -tau[i] * V[j, n-k+i] end + # Add contribution from off-diagonal part j = max(lastv, prevlastv) - - #t[i+1:k, i] = -tau[i] * v[i+1:k , j:n-k+i] * v[i, j:n-k+i]^H - LinearAlgebra.generic_matmatmul!((@view t[i+1:k, i]), 'N', 'C', (@view v[i+1:k, j:n-k+i-1]), - (@view v[i:i, j:n-k+i-1]), LinearAlgebra.MulAddMul(-tau[i], one)) + LinearAlgebra.generic_matmatmul!((@view T_mat[i+1:k, i]), 'N', 'C', (@view V[i+1:k, j:n-k+i-1]), + (@view V[i:i, j:n-k+i-1]), LinearAlgebra.MulAddMul(-tau[i], one0)) end - # t[i+1:k, i] = t[i+1:k, i+1:k] * t[i+1:k, i] - - LinearAlgebra.generic_trimatmul!((@view t[i+1:k, i]), 'L', 'N', identity, - (@view t[i+1:k, i+1:k]), (@view t[i+1:k, i])) + # Apply triangular solve: T[i+1:k,i] = T[i+1:k,i+1:k] * T[i+1:k,i] + LinearAlgebra.generic_trimatmul!((@view T_mat[i+1:k, i]), 'L', 'N', identity, + (@view T_mat[i+1:k, i+1:k]), (@view T_mat[i+1:k, i])) + # Update tracking variable if i > 1 prevlastv = min(prevlastv, lastv) else prevlastv = lastv end - end - t[i,i] = tau[i] + # Set diagonal element + T_mat[i,i] = tau[i] end end end +end + +""" + larft(direct, storev, V, tau) -> T + +Form the triangular factor T of a complex block reflector H from elementary +reflectors and their scalar factors. + +This is a high-level interface that automatically determines dimensions and +allocates the output matrix. The block reflector H has the form: +H = I - V * T * V^H + +# Arguments +- `direct`: Character indicating the order of elementary reflector products + - 'F': H = H(1) H(2) ... 
H(k) (Forward) + - 'B': H = H(k) ... H(2) H(1) (Backward) +- `storev`: Character indicating how reflector vectors are stored in V + - 'C': Columnwise storage (V is n-by-k) + - 'R': Rowwise storage (V is k-by-n) +- `V`: Matrix containing the elementary reflector vectors +- `tau`: Vector containing scalar factors of the elementary reflectors + +# Returns +- `T`: k-by-k upper triangular matrix (triangular factor of block reflector) + +# Input Validation +- Matrix V and vector tau must have compatible dimensions +- For 'C' storage: size(V,2) must equal length(tau) +- For 'R' storage: size(V,1) must equal length(tau) + +# Example +```julia +m, k = 8, 4 +V = complex.(randn(m, k), randn(m, k)) # Elementary reflector vectors +tau = complex.(randn(k), randn(k)) # Reflector scaling factors +T = larft('F', 'C', V, tau) # Compute triangular factor +``` + +# Mathematical Background +The triangular factor T enables efficient block operations. Instead of applying +k individual reflectors H(1), H(2), ..., H(k), the block reflector +H = I - V*T*V^H can be applied in O(n²k) operations rather than O(nk²). +""" +function larft!(direct::Char, storev::Char, V::AbstractMatrix{T}, tau::AbstractVector{T}, T_mat::AbstractMatrix{T}) where {T} + # Determine dimensions based on storage format + if storev == 'C' + n, k = size(V) + if length(tau) != k + throw(ArgumentError("For columnwise storage, length(tau) must equal size(V,2)")) + end + else # storev == 'R' + k, n = size(V) + if length(tau) != k + throw(ArgumentError("For rowwise storage, length(tau) must equal size(V,1)")) + end + end + + # Call the core computational routine + larft!(direct, storev, n, k, V, tau, T_mat) end \ No newline at end of file diff --git a/src/lauu2.jl b/src/lauu2.jl index ca203ab..4c0d517 100644 --- a/src/lauu2.jl +++ b/src/lauu2.jl @@ -1,108 +1,107 @@ -export lauu2 +export lauu2! 
""" -Purpose: -======= -LAUU2 computes the product U * U' or L' * L, where the triangular -factor U or L is stored in the upper or lower triangular part of -the array A. - -If UPLO = 'U' or 'u', the upper triangle of the result is stored, -overwriting the factor U in A. -If UPLO = 'L' or 'l', the lower triangle of the result is stored, -overwriting the factor L in A. - -Arguments: -========== -UPLO (input) CHARACTER*1 - Specifies whether the triangular factor stored in the array A - is upper or lower triangular: - = 'U': Upper triangular - = 'L': Lower triangular - -N (input) INTEGER - The order of the triangular factor U or L. N >= 0. - -A (input/output) COMPLEX{T} array, dimension (LDA,N) - On entry, the triangular factor U or L. - On exit, if UPLO = 'U', the upper triangle of A is - overwritten with the upper triangle of the product U * U'; - if UPLO = 'L', the lower triangle of A is overwritten with - the lower triangle of the product L' * L. - -LDA (input) INTEGER - The leading dimension of the array A. LDA >= max(1,N). - -INFO (output) INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value + lauu2!(uplo, n, A) + +Compute the product U * U^H or L^H * L, where the triangular factor U or L +is stored in the upper or lower triangular part of the array A. + +This is an unblocked algorithm for computing the product of a triangular +matrix with its conjugate transpose. The result overwrites the original +triangular matrix. 
+ +# Arguments +- `uplo`: Character specifying which triangle is stored + - 'U' or 'u': Upper triangular, computes U * U^H + - 'L' or 'l': Lower triangular, computes L^H * L +- `n`: Order of the triangular matrix (≥ 0) +- `A`: Triangular matrix to be transformed (modified in-place) + +# Algorithm +For upper triangular (uplo='U'): +- Computes A := U * U^H where U is upper triangular +- Result is Hermitian, only upper triangle is computed and stored + +For lower triangular (uplo='L'): +- Computes A := L^H * L where L is lower triangular +- Result is Hermitian, only lower triangle is computed and stored + +The algorithm processes one column (or row) at a time using dot products +and matrix-vector operations. This is the unblocked version, suitable +for small matrices or as a building block for blocked algorithms. + +# Input Validation +- uplo must be 'U', 'u', 'L', or 'l' +- n must be non-negative + +# Notes +This routine is typically used in Cholesky factorization algorithms +and for computing covariance matrices from triangular factors. 
+ +# Example +```julia +n = 4 +A = triu(randn(ComplexF64, n, n)) # Upper triangular matrix +lauu2!('U', n, A, n) # A := U * U^H +``` """ +function lauu2!(uplo::Char, n::Int, A::AbstractMatrix{T}) where T -function lauu2(uplo::Char, n::Int, A::AbstractMatrix{T}, lda::Int) where T - - # Initialize the INFO variable - info = 0 - - # Validate the input for 'uplo' - if !(uplo == 'U' || uplo == 'u' || uplo == 'L' || uplo == 'l') - info = -1 - return info + # Input validation with descriptive error messages + if !(uplo in ['U', 'u', 'L', 'l']) + throw(ArgumentError("uplo must be 'U', 'u', 'L', or 'l', got '$uplo'")) end - # Check for valid matrix order if n < 0 - info = -2 - return info - end - - # Validate the leading dimension of A - if lda < max(1, n) - info = -4 - return info + throw(ArgumentError("n must be non-negative, got $n")) end - # Quick return if possible (nothing to do if n is zero) + # Quick return for degenerate case if n == 0 - return info + return end - if uplo == 'U' || uplo == 'u' - # Upper triangular case: Compute U * U' + if uplo in ['U', 'u'] + # Upper triangular case: Compute U * U^H for i in 1:n aii = A[i, i] # Diagonal element of U if i < n - # Update the diagonal element - A[i, i] = aii^2 + dot(A[i, i+1:n], A[i, i+1:n]) + # Update diagonal: A[i,i] = |U[i,i]|² + sum(|U[i,j]|² for j > i) + A[i, i] = real(aii * conj(aii)) + real(dot(A[i, i+1:n], A[i, i+1:n])) - # Update the remaining upper triangle elements - if i > 1 - A[1:i-1, i] .= A[1:i-1, i+1:n] * A[i, i+1:n] + A[1:i-1, i] * aii + # Update off-diagonal elements in column i + for k in 1:i-1 + A[k, i] = A[k, i] * aii + dot(A[k, i+1:n], conj(A[i, i+1:n])) end else - # Scale diagonal entries when i == n - A[1:i, i] .= aii * A[1:i, i] + # Final column: scale by diagonal element + for k in 1:i + A[k, i] = A[k, i] * aii + end end end else - # Lower triangular case: Compute L' * L + # Lower triangular case: Compute L^H * L for i in 1:n aii = A[i, i] # Diagonal element of L if i < n - # Update the 
diagonal element - A[i, i] = aii^2 + dot(A[i+1:n, i], A[i+1:n, i]) + # Update diagonal: A[i,i] = |L[i,i]|² + sum(|L[j,i]|² for j > i) + A[i, i] = real(conj(aii) * aii) + real(dot(A[i+1:n, i], A[i+1:n, i])) - # Update the remaining lower triangle elements - if i > 1 - A[i, 1:i-1] .= adjoint(A[i+1:n, 1:i-1]) * A[i+1:n, i] + A[i, 1:i-1] * aii + # Update off-diagonal elements in row i + for k in 1:i-1 + A[i, k] = conj(aii) * A[i, k] + dot(A[i+1:n, k], conj(A[i+1:n, i])) end else - # Scale diagonal entries when i == n - A[i, 1:i] .= aii * A[i, 1:i] + # Final row: scale by conjugate of diagonal element + for k in 1:i + A[i, k] = conj(aii) * A[i, k] + end end end end - - return info end + +lauu2!(uplo::Char, A::AbstractMatrix{T}) where {T} = lauu2!(uplo, size(A, 1), A) diff --git a/src/lauum.jl b/src/lauum.jl index 2650cb6..7a9596c 100644 --- a/src/lauum.jl +++ b/src/lauum.jl @@ -1,173 +1,191 @@ -export lauum +export lauum! - # Import the unblocked version of the matrix multiplication function (lauu2) to use later in this computation. +# Import the unblocked version (lauu2!) for use in blocked algorithm """ - lauum(uplo::Char, n::Int, a::AbstractMatrix{T}, lda::Int, block_size::Int) - -This function computes the product of a triangular matrix with its conjugate transpose. Specifically, it computes: - -- `U * U'` if the triangular matrix `U` is stored in the upper part of matrix `a`. -- `L' * L` if the triangular matrix `L` is stored in the lower part of matrix `a`. - -Where: -- `U'` represents the conjugate transpose of the upper triangular matrix `U`. -- `L'` represents the conjugate transpose of the lower triangular matrix `L`. - -### Parameters: -- `uplo`: A character (`'U'` or `'L'`) that specifies whether the triangular matrix is stored in the upper or lower part of `a`. - - `'U'`: Indicates that the upper triangle contains the triangular matrix `U`. The result of `U * U'` will overwrite the corresponding entries in the upper triangle of matrix `a`. 
- - `'L'`: Indicates that the lower triangle contains the triangular matrix `L`. The result of `L' * L` will overwrite the corresponding entries in the lower triangle of matrix `a`. - -- `n`: The order (size) of the triangular matrix. This must be a non-negative integer, representing the dimensions of the square matrix `a`, which is `n x n`. - -- `a`: The matrix where the triangular factor `U` or `L` is stored, and where the result will be stored after computation. This matrix is modified in place, meaning its contents will change as a result of the computation. - -- `lda`: The leading dimension of the array `a`. This should be at least `max(1, n)`. This parameter is important for accessing the elements of the matrix in memory correctly, particularly in scenarios where matrices may be stored in a non-contiguous fashion for performance reasons. - -- `block_size`: This specifies the block size for the blocked algorithm. A blocked algorithm processes the matrix in submatrices (or blocks), improving performance on large matrices by making better use of CPU cache and reducing memory bandwidth demands. - -### Returns: -- `info`: An integer indicating the success or failure of the function execution: - - `0`: Indicates successful execution. - - A negative integer indicates that an invalid argument was provided: - - `-1`: Invalid value for `uplo`. - - `-2`: Invalid value for `n`. - - `-4`: Invalid value for `lda`. + lauum!(uplo, n, A, ib) + +Compute the product U * U^H or L^H * L using a blocked algorithm, where +the triangular factor U or L is stored in the upper or lower triangular +part of the matrix a. + +This is a blocked version of the triangular matrix multiplication that +achieves better performance on large matrices by exploiting cache locality +and enabling vectorization. 
+ +# Arguments +- `uplo`: Character specifying which triangle contains the factor + - 'U': Upper triangular, computes U * U^H + - 'L': Lower triangular, computes L^H * L +- `n`: Order of the triangular matrix (≥ 0) +- `A`: Matrix containing triangular factor (modified in-place) +- `ib`: Block size for blocked algorithm (typically 32-64) + +# Algorithm +The blocked algorithm partitions the matrix into blocks of size ib +and processes them using high-performance BLAS operations: +- Level-3 BLAS (matrix-matrix operations) for most computations +- Level-2 BLAS (matrix-vector operations) for smaller blocks +- Automatic fallback to unblocked algorithm for small matrices + +For upper triangular (uplo='U'): A := U * U^H +For lower triangular (uplo='L'): A := L^H * L + +# Performance Notes +- Block size should be chosen based on cache size (typically 32-64) +- Uses parallel processing for independent block operations +- Optimal performance achieved when n >> block_size + +# Input Validation +- uplo must be 'U' or 'L' +- n must be non-negative +- block_size is automatically clamped to valid range + +# Example +```julia +n = 100 +block_size = 32 +A = triu(randn(ComplexF64, n, n)) +lauum!('U', n, A, n, block_size) # A := U * U^H +``` """ -function lauum(uplo::Char, n::Int, a::AbstractMatrix{T}, lda::Int, block_size::Int) where T - # Validate the 'uplo' parameter to ensure it is either 'U' or 'L' +function lauum!(uplo::Char, n::Integer, A::AbstractMatrix{T}, ib::Integer) where {T} + # Input validation with descriptive error messages if !(uplo in ['U', 'L']) - return -1 # Return an error code for invalid 'uplo' + throw(ArgumentError("uplo must be 'U' or 'L', got '$uplo'")) end - # Check if 'n' is non-negative if n < 0 - return -2 # Return an error code for invalid 'n' - end - - # Validate 'lda' to ensure it meets the minimum requirement - if lda < max(1, n) - return -4 # Return an error code for invalid 'lda' + throw(ArgumentError("n must be non-negative, got $n")) end - # If 
'n' is zero, no computation is needed, so return success + # Quick return for degenerate case if n == 0 - return 0 # Early exit with success code + return end - # Adjust block_size to ensure it does not exceed the size of the matrix - block_size = min(block_size, n) + # Adjust block_size to reasonable bounds + ib = max(1, min(ib, n)) - # If block_size is less than or equal to 1, or greater than or equal to n, use the unblocked version - if block_size <= 1 || block_size >= n - lauu2(uplo, n, a, lda) # Call the unblocked computation - return 0 # Return success code + # Use unblocked algorithm for small matrices or invalid block size + if ib <= 1 || ib >= n + lauu2!(uplo, n, A) + return end - # Call the appropriate computation based on whether the upper or lower triangular matrix is specified + # Call appropriate blocked computation if uplo == 'U' - compute_upper(n, block_size, a, lda) # Compute for upper triangular matrix + compute_upper!(n, ib, A) else - compute_lower(n, block_size, a, lda) # Compute for lower triangular matrix + compute_lower!(n, ib, A) end - - return 0 # Return success code after completing the computation end """ - compute_upper(n, block_size, a, lda) + compute_upper!(n, block_size, a) + +Blocked computation of U * U^H for upper triangular matrix U. -This function performs the blocked computation of U * U' for an upper triangular matrix `U`. -The computation is carried out in parallel to improve performance on large matrices. +Processes the matrix in blocks to achieve better cache performance and +enable vectorized operations. Uses Level-3 BLAS operations where possible. -### Parameters: -- `n`: The size of the matrix, which is also the order of the triangular matrix `U`. -- `block_size`: The size of the blocks to be processed in each iteration. This allows for better cache usage and performance. -- `a`: The matrix that contains the upper triangular part `U`, and where the results will be stored. -- `lda`: The leading dimension of the matrix `a`. 
+# Arguments +- `n`: Order of the matrix +- `block_size`: Size of blocks for processing +- `a`: Matrix containing U (modified in-place) -This function modifies the matrix `a` in place. +# Algorithm +For each diagonal block: +1. Update off-diagonal blocks using TRMM operations +2. Compute diagonal block product U_block * U_block^H +3. Add contribution from remaining blocks using SYRK operations """ -function compute_upper(n::Int, block_size::Int, a::AbstractMatrix{T}, lda::Int) where T - Threads.@threads for i in 1:block_size:n # Parallelize the outer loop over blocks - ib = min(block_size, n - i + 1) # Determine the actual block size for this iteration - - # Perform a triangular matrix multiplication (equivalent to DTRMM) - # Update the upper triangle of the matrix using the current block - view(a, 1:i-1, i:i+ib-1) .= view(a, 1:i-1, i:i+ib-1) * view(a, i:i+ib-1, i:i+ib-1)' - - # Compute the product U * U' for the current block using the lauu2 function - # lauu2('U', ib, view(a, i:i+ib-1, i:i+ib-1), lda) - U = view(a, i:i+ib-1, i:i+ib-1) # Extract the block U - U_Ut = U * adjoint(U) # Use adjoint for complex matrices - # Only update the upper triangular part of the matrix - for j in 1:ib - for k in j:ib - a[i + j - 1, i + k - 1] = U_Ut[j, k] - end +function compute_upper!(n::Integer, ib::Integer, A::AbstractMatrix{T}) where T + Threads.@threads for i in 1:ib:n + ib = min(ib, n - i + 1) # Actual block size + + # Update off-diagonal blocks: A[1:i-1, i:i+ib-1] = A[1:i-1, i:i+ib-1] * A[i:i+ib-1, i:i+ib-1]^H + if i > 1 + view(A, 1:i-1, i:i+ib-1) .= view(A, 1:i-1, i:i+ib-1) * view(A, i:i+ib-1, i:i+ib-1)' end + + # Compute diagonal block: U_block * U_block^H + U_block = view(A, i:i+ib-1, i:i+ib-1) + U_Ut = U_block * U_block' + # Store only upper triangular part + for j in 1:ib, k in j:ib + A[i + j - 1, i + k - 1] = U_Ut[j, k] + end - # Check if there are additional blocks to process + # Add contribution from trailing blocks if they exist if i + ib <= n - # Perform 
matrix-matrix multiplication (equivalent to DGEMM) - view(a, 1:i-1, i:i+ib-1) .+= view(a, 1:i-1, i+ib:n) * view(a, i:i+ib-1, i+ib:n)' - - # Perform symmetric rank-k update (equivalent to DSYRK) - product_matrix = view(a, i:i+ib-1, i+ib:n) * view(a, i:i+ib-1, i+ib:n)' - for j in 1:ib # Iterate over the rows of the current block - for k in j:ib # Iterate over the columns of the current block - @inbounds a[i + j - 1, i + k - 1] += product_matrix[j, k] # Update the result matrix - end + # Update off-diagonal: add A[1:i-1, i+ib:n] * A[i:i+ib-1, i+ib:n]^H + if i > 1 + view(A, 1:i-1, i:i+ib-1) .+= view(A, 1:i-1, i+ib:n) * view(A, i:i+ib-1, i+ib:n)' + end + + # Rank-k update: add A[i:i+ib-1, i+ib:n] * A[i:i+ib-1, i+ib:n]^H to diagonal block + trailing_block = view(A, i:i+ib-1, i+ib:n) + syrk_result = trailing_block * trailing_block' + + for j in 1:ib, k in j:ib + A[i + j - 1, i + k - 1] += syrk_result[j, k] end end end end """ - compute_lower(n, block_size, a, lda) + compute_lower!(n, ib, A) -This function performs the blocked computation of L' * L for a lower triangular matrix `L`. -The computation is carried out in parallel to improve performance on large matrices. +Blocked computation of L^H * L for lower triangular matrix L. -### Parameters: -- `n`: The size of the matrix, which is also the order of the triangular matrix `L`. -- `block_size`: The size of the blocks to be processed in each iteration. This allows for better cache usage and performance. -- `a`: The matrix that contains the lower triangular part `L`, and where the results will be stored. -- `lda`: The leading dimension of the matrix `a`. +Processes the matrix in blocks to achieve better cache performance and +enable vectorized operations. Uses Level-3 BLAS operations where possible. -This function modifies the matrix `a` in place. 
+# Arguments +- `n`: Order of the matrix +- `block_size`: Size of blocks for processing +- `A`: Matrix containing L (modified in-place) + +# Algorithm +For each diagonal block: +1. Update off-diagonal blocks using TRMM operations +2. Compute diagonal block product L_block^H * L_block +3. Add contribution from remaining blocks using SYRK operations """ -function compute_lower(n::Int, block_size::Int, a::AbstractMatrix{T}, lda::Int) where T - Threads.@threads for i in 1:block_size:n # Parallelize the outer loop over blocks - ib = min(block_size, n - i + 1) # Determine the actual block size for this iteration +function compute_lower!(n::Integer, ib::Integer, A::AbstractMatrix{T}) where T + Threads.@threads for i in 1:ib:n + ib = min(ib, n - i + 1) # Actual block size - # Perform a triangular matrix multiplication for lower triangular matrix - view(a, i:i+ib-1, 1:i-1) .= adjoint(view(a, i:i+ib-1, i:i+ib-1)) * view(a, i:i+ib-1, 1:i-1) + # Update off-diagonal blocks: A[i:i+ib-1, 1:i-1] = A[i:i+ib-1, i:i+ib-1]^H * A[i:i+ib-1, 1:i-1] + if i > 1 + view(A, i:i+ib-1, 1:i-1) .= view(A, i:i+ib-1, i:i+ib-1)' * view(A, i:i+ib-1, 1:i-1) + end - # Compute the product L' * L for the current block using adjoint for complex matrices - L = view(a, i:i+ib-1, i:i+ib-1) # Extract the block L - Lt_L = adjoint(L) * L + # Compute diagonal block: L_block^H * L_block + L_block = view(A, i:i+ib-1, i:i+ib-1) + Lt_L = L_block' * L_block - # Store the result back in the lower triangular part only - for j in 1:ib - for k in 1:j - @inbounds a[i + j - 1, i + k - 1] = Lt_L[j, k] - end + # Store only lower triangular part + for j in 1:ib, k in 1:j + A[i + j - 1, i + k - 1] = Lt_L[j, k] end - # Check if there are additional blocks to process + # Add contribution from trailing blocks if they exist if i + ib <= n - # Perform matrix-matrix multiplication (equivalent to DGEMM) with proper adjoint - view(a, i:i+ib-1, 1:i-1) .+= adjoint(view(a, i+ib:n, i:i+ib-1)) * view(a, i+ib:n, 1:i-1) - - # Perform 
symmetric rank-k update - product_matrix = adjoint(view(a, i+ib:n, i:i+ib-1)) * view(a, i+ib:n, i:i+ib-1) - for j in 1:ib - for k in 1:j - a[i + j - 1, i + k - 1] += product_matrix[j, k] - end + # Update off-diagonal: add A[i+ib:n, i:i+ib-1]^H * A[i+ib:n, 1:i-1] + if i > 1 + view(A, i:i+ib-1, 1:i-1) .+= view(A, i+ib:n, i:i+ib-1)' * view(A, i+ib:n, 1:i-1) + end + + # Rank-k update: add A[i+ib:n, i:i+ib-1]^H * A[i+ib:n, i:i+ib-1] to diagonal block + trailing_block = view(A, i+ib:n, i:i+ib-1) + syrk_result = trailing_block' * trailing_block + + for j in 1:ib, k in 1:j + A[i + j - 1, i + k - 1] += syrk_result[j, k] end end end diff --git a/src/pamm.jl b/src/pamm.jl index 1993cc4..51cbf84 100644 --- a/src/pamm.jl +++ b/src/pamm.jl @@ -1,59 +1,105 @@ -function pamm(op, side, storev, direct, m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) - # Input validation - op ∉ ('W', 'A') && throw(ArgumentError("illegal value of op")) - side ∉ ('L', 'R') && throw(ArgumentError("illegal value of side")) - storev ∉ ('C', 'R') && throw(ArgumentError("illegal value of storev")) - direct ∉ ('F', 'B') && throw(ArgumentError("illegal value of direct")) +""" + pamm!(op, side, storev, direct, m, n, k, l, A1, A2, V, W) + +Parallel matrix multiplication kernel for block reflector applications. + +This routine performs specialized matrix operations needed in blocked orthogonal +factorizations. It computes either: +- W = A1 + op(V) * A2 (when op='W') +- A2 = A2 + op(V) * W (when op='A') + +where op(V) is V, V^H, V^T depending on the storage and direction parameters. 
+ +# Arguments +- `op`: Operation type + - 'W': Compute W = A1 + op(V) * A2 or W = A1 + A2 * op(V) + - 'A': Update A2 = A2 + op(V) * W or A2 = A2 + W * op(V) +- `side`: Which side V is applied + - 'L': Left multiplication (op(V) * A2) + - 'R': Right multiplication (A2 * op(V)) +- `storev`: How reflector vectors are stored in V + - 'C': Columnwise storage + - 'R': Rowwise storage +- `direct`: Direction of reflector product + - 'F': Forward (H = H₁H₂...Hₖ) + - 'B': Backward (H = HₖHₖ₋₁...H₁) +- `m`, `n`: Dimensions of matrices A1, A2, W +- `k`: Number of elementary reflectors +- `l`: Number of columns/rows in triangular part of V +- `A1`: First input matrix +- `A2`: Second input/output matrix +- `V`: Matrix containing reflector vectors +- `W`: Workspace/output matrix + +# Algorithm +The routine handles all combinations of storage formats and application sides +efficiently by dispatching to specialized kernels. Each kernel exploits the +structure of the reflector matrix V (triangular + rectangular parts) to +minimize computational cost. + +# Input Validation +All parameters are validated for correctness. Dimensions must be non-negative +and leading dimensions must meet minimum requirements. + +# Notes +This is a low-level computational kernel used internally by blocked QR +and LQ factorization routines. It is optimized for performance with +specific memory access patterns. 
+""" +function pamm!(op::Char, side::Char, storev::Char, direct::Char, m::Integer, n::Integer, k::Integer, l::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, W::AbstractMatrix{T}) where {T} + # Input validation with descriptive error messages + if op ∉ ('W', 'A') + throw(ArgumentError("op must be 'W' or 'A', got '$op'")) + end + if side ∉ ('L', 'R') + throw(ArgumentError("side must be 'L' or 'R', got '$side'")) + end + if storev ∉ ('C', 'R') + throw(ArgumentError("storev must be 'C' or 'R', got '$storev'")) + end + if direct ∉ ('F', 'B') + throw(ArgumentError("direct must be 'F' or 'B', got '$direct'")) + end # Dimension validation - m < 0 && throw(ArgumentError("illegal value of m")) - n < 0 && throw(ArgumentError("illegal value of n")) - k < 0 && throw(ArgumentError("illegal value of k")) - l < 0 && throw(ArgumentError("illegal value of l")) + if m < 0 + throw(ArgumentError("m must be non-negative, got $m")) + end + if n < 0 + throw(ArgumentError("n must be non-negative, got $n")) + end + if k < 0 + throw(ArgumentError("k must be non-negative, got $k")) + end + if l < 0 + throw(ArgumentError("l must be non-negative, got $l")) + end - # Leading dimension validation - lda1 < 0 && throw(ArgumentError("illegal value of lda1")) - lda2 < 0 && throw(ArgumentError("illegal value of lda2")) - ldv < 0 && throw(ArgumentError("illegal value of ldv")) - ldw < 0 && throw(ArgumentError("illegal value of ldw")) # Quick return for degenerate cases - (m == 0 || n == 0 || k == 0) && return nothing - - if direct == 'F' - forward = true - else - forward = false - end - - if storev == 'C' - colmajor = true - else - colmajor = false + if m == 0 || n == 0 || k == 0 + return end - if side == 'L' - left = true - else - left = false - end + # Convert parameters to boolean flags for efficiency + forward = (direct == 'F') + colmajor = (storev == 'C') + left = (side == 'L') - + # Dispatch to appropriate kernel if op == 'W' - pamm_w(left, colmajor, forward, 
m,n,k,l, A1, A2, V, W) + pamm_w!(left, colmajor, forward, m, n, k, l, A1, A2, V, W) else - pamm_a(left, colmajor, forward, m,n,k,l, A2, V, W) + pamm_a!(left, colmajor, forward, m, n, k, l, A2, V, W) end - - return end -function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) +function pamm_w!(left::Bool, colmajor::Bool, forward::Bool, m::Integer, n::Integer, k::Integer, l::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, W::AbstractMatrix{T}) where {T} # W = A1 + op(V) * A2 or W = A1 + A2 * op(V) - one0 = oneunit(eltype(A1)) - zero0 = zero(eltype(A1)) - plus = LinearAlgebra.MulAddMul(one0, one0) - eqa = LinearAlgebra.MulAddMul(one0, zero0) + one = oneunit(eltype(A1)) + Tzero = zero(eltype(A1)) + plus = LinearAlgebra.MulAddMul(one, one) + eqa = LinearAlgebra.MulAddMul(one, Tzero) if colmajor && forward && left # colmajor, forward, left mp = min(m-l+1, m) @@ -70,7 +116,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[kp:kp+k-l-1, 1:n]), 'C', 'N', (@view V[1:m, kp:kp+k-l-1]), (@view A2[1:m, 1:n]), eqa) for i in 1:k - LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n])) + LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n])) end end @@ -89,7 +135,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[kp:kp+k-l-1, 1:n]), 'N', 'N', (@view V[kp:kp+k-l-1, 1:m]), (@view A2[1:m, 1:n]), eqa) for i in 1:k - LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n])) + LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n])) end end if colmajor && forward && !left # colmajor, forward, right @@ -107,7 +153,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[1:m, kp:kp+k-l-1]), 'N', 'N', (@view A2[1:m, 1:n]), (@view V[1:n, kp:kp+k-l-1]), eqa) for j in 1:k - LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j])) + 
LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j])) end end if !colmajor && forward && !left # rowmajor, forward, right @@ -125,7 +171,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[1:m, kp:kp+k-l-1]), 'N', 'C', (@view A2[1:m, 1:n]), (@view V[kp:kp+k-l-1, 1:n]), eqa) for j in 1:k - LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j])) + LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j])) end end if colmajor && !forward && left # colmajor, backward, left @@ -143,7 +189,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[1:k-l, 1:n]), 'C', 'N', (@view V[1:m, 1:k-l]), (@view A2[1:m, 1:n]), eqa) for i in 1:k - LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n])) + LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n])) end end @@ -162,7 +208,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[1:k-l, 1:n]), 'N', 'N', (@view V[1:k-l, 1:m]), (@view A2[1:m, 1:n]), eqa) for i in 1:k - LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n])) + LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n])) end end if !colmajor && !forward && !left # rowmajor, backward, right @@ -180,7 +226,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[1:m, 1:k-l]), 'N', 'C', (@view A2[1:m, 1:n]), (@view V[1:k-l, 1:n]), eqa) for j in 1:k - LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j])) + LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j])) end end if colmajor && !forward && !left # colmajor, backward, right @@ -198,18 +244,16 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W) LinearAlgebra.generic_matmatmul!((@view W[1:m, 1:k-l]), 'N', 'N', (@view A2[1:m, 1:n]), (@view V[1:n, 1:k-l]), eqa) for j in 1:k - LinearAlgebra.axpy!(one0, 
(@view A1[1:m, j]), (@view W[1:m, j])) + LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j])) end end - - return end -function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) +function pamm_a!(left::Bool, colmajor::Bool, forward::Bool, m::Integer, n::Integer, k::Integer, l::Integer, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, W::AbstractMatrix{T}) where {T} # A2 = A2 + op(V) * W or A2 = A2 + W * op(V) - one0 = oneunit(eltype(A2)) - minus = LinearAlgebra.MulAddMul(one0*(-1),one0) + one = oneunit(eltype(A2)) + minus = LinearAlgebra.MulAddMul(one*(-1), one) if colmajor && forward && left # colmajor, forward, left mp = min( m-l+1, m ) @@ -222,7 +266,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_trimatmul!((@view W[1:l, 1:n]), 'U', 'N', identity, (@view V[mp:mp+l-1, 1:l]), (@view W[1:l, 1:n])) for i in 1:l - LinearAlgebra.axpy!(-one0, (@view W[i, 1:n]), (@view A2[m-l+i, 1:n])) + LinearAlgebra.axpy!(-one, (@view W[i, 1:n]), (@view A2[m-l+i, 1:n])) end end @@ -237,7 +281,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_trimatmul!((@view W[1:l, 1:n]), 'L', 'N', adjoint, (@view V[1:l, mp:mp+l-1]), (@view W[1:l, 1:n])) for i in 1:l - LinearAlgebra.axpy!((-one0), (@view W[i, 1:n]), (@view A2[m-l+i, 1:n])) + LinearAlgebra.axpy!((-one), (@view W[i, 1:n]), (@view A2[m-l+i, 1:n])) end end @@ -253,7 +297,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_mattrimul!((@view W[1:m, 1:l]), 'U', 'N', adjoint, (@view W[1:m, 1:l]), (@view V[np:np+l-1, 1:l])) for j in 1:l - LinearAlgebra.axpy!(-one0, (@view W[1:m, j]), (@view A2[1:m, n-l+j])) + LinearAlgebra.axpy!(-one, (@view W[1:m, j]), (@view A2[1:m, n-l+j])) end end @@ -268,7 +312,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_mattrimul!((@view W[1:m, 1:l]), 'L', 'N', identity, (@view W[1:m, 1:l]), (@view V[1:l, np:np+l-1])) for j in 1:l - 
LinearAlgebra.axpy!(-one0, (@view W[1:m, j]), (@view A2[1:m, n-l+j])) + LinearAlgebra.axpy!(-one, (@view W[1:m, j]), (@view A2[1:m, n-l+j])) end end @@ -283,7 +327,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_trimatmul!((@view W[kp:kp+l-1, 1:n]), 'L', 'N', identity, (@view V[1:l, kp:kp+l-1]), (@view W[kp:kp+l-1, 1:n])) for i in 1:l - LinearAlgebra.axpy!(-one0, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n])) + LinearAlgebra.axpy!(-one, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n])) end end @@ -298,7 +342,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_trimatmul!((@view W[kp:kp+l-1, 1:n]), 'U', 'N', adjoint, (@view V[kp:kp+l-1, 1:l]), (@view W[kp:kp+l-1, 1:n])) for i in 1:l - LinearAlgebra.axpy!(-one0, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n])) + LinearAlgebra.axpy!(-one, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n])) end end @@ -312,7 +356,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_mattrimul!((@view W[1:m, kp:kp+l-1]), 'U', 'N', identity, (@view W[1:m, kp:kp+l-1]), (@view V[kp:kp+l-1, 1:l])) for j in 1:l - LinearAlgebra.axpy!(-one0, (@view W[1:m, k-l+j]), (@view A2[1:m, j])) + LinearAlgebra.axpy!(-one, (@view W[1:m, k-l+j]), (@view A2[1:m, j])) end end @@ -327,11 +371,55 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W) LinearAlgebra.generic_mattrimul!((@view W[1:m, kp:kp+l-1]), 'L', 'N', adjoint, (@view W[1:m, kp:kp+l-1]), (@view V[1:l, kp:kp+l-1])) for j in 1:l - LinearAlgebra.axpy!(-one0, (@view W[1:m, k-l+j]), (@view A2[1:m, j])) + LinearAlgebra.axpy!(-one, (@view W[1:m, k-l+j]), (@view A2[1:m, j])) end end +end - return +""" + pamm(op, side, storev, direct, A1, A2, V) -> (A1, A2) + +Performs panel matrix multiplication with automatic workspace allocation. +This is a simplified interface that automatically computes required parameters. 
+ +# Arguments +- 'op': operation type + - 'W': compute workspace + - 'A': apply operation +- 'side': + - 'L' : apply from the left + - 'R' : apply from the right +- 'storev': indicates how the vectors are stored + - 'C' : columnwise + - 'R' : rowwise +- 'direct': indicates direction + - 'F' : forward + - 'B' : backward +- 'A1': first matrix to be updated +- 'A2': second matrix to be updated +- 'V': matrix containing the vectors + +# Returns +- The tuple `(A1, A2)`; the kernel works in place, so these are the caller's own arrays + +# Example +```julia +m, n, k, l = 6, 4, 3, 2 +A1 = complex.(randn(m, k), randn(m, k)) +A2 = complex.(randn(m, l), randn(m, l)) +V = complex.(randn(m, k), randn(m, k)) +A1_new, A2_new = pamm('A', 'L', 'C', 'F', A1, A2, V) +``` +""" +function pamm(op::Char, side::Char, storev::Char, direct::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}) where {T} + # Determine dimensions + m, k = size(A1) + n = size(A2, 2) + l = n + + W = similar(A1, m, k) + # Delegate to the in-place kernel `pamm!`; no 12-argument `pamm` method exists, so calling `pamm` here raised MethodError + pamm!(op, side, storev, direct, m, n, k, l, A1, A2, V, W) + return A1, A2 end - diff --git a/src/parfb.jl b/src/parfb.jl index 3bb8b91..1e34413 100644 --- a/src/parfb.jl +++ b/src/parfb.jl @@ -1,92 +1,98 @@ -export parfb - -function parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l, - A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork) +""" + parfb!(side, trans, direct, storev, m1, n1, m2, n2, k, l, A1, A2, V, T, work) + +Apply a block reflector or its transpose/adjoint to a general matrix using parallel algorithms. + +This function applies the block reflector H or its transpose/adjoint to two matrix blocks simultaneously, +making it efficient for parallel QR factorization algorithms. It performs operations of the form: +- C₁ := H^op · C₁ (left multiplication) +- C₁ := C₁ · H^op (right multiplication) + +where H is represented in compact form by matrices V and T, and op can be N (no operation), T (transpose), +or C (conjugate transpose). 
+ +# Arguments +- `side::Char`: Determines the side of multiplication ('L' for left, 'R' for right) +- `trans::Char`: Operation to apply ('N' for none, 'T' for transpose, 'C' for conjugate transpose) +- `direct::Char`: Direction of reflector storage ('F' for forward, 'B' for backward) +- `storev::Char`: Storage format of reflectors ('C' for columnwise, 'R' for rowwise) +- `m1::Int`: Number of rows in first matrix block A1 +- `n1::Int`: Number of columns in first matrix block A1 +- `m2::Int`: Number of rows in second matrix block A2 (must equal m1 when side = 'R') +- `n2::Int`: Number of columns in second matrix block A2 (must equal n1 when side = 'L') +- `k::Int`: Number of elementary reflectors +- `l::Int`: Size of the triangular part of V, passed through to `pamm!` (0 ≤ l ≤ k) +- `A1::Matrix`: First m1×n1 matrix block to be transformed (modified in-place) +- `A2::Matrix`: Second m2×n2 matrix block to be transformed (modified in-place) +- `V::Matrix`: Matrix containing elementary reflectors in compact form +- `T::Matrix`: k×k triangular factor (read as upper for direct = 'F', lower for direct = 'B') +- `work::AbstractMatrix`: Workspace matrix, overwritten by the routine + +# Returns +- No status code: invalid arguments throw `ArgumentError`, and results are written into `A1` and `A2` + +# Algorithm +The function uses the compact WY representation where H = I - V·T·V^H, performing efficient +block operations to apply the transformation to both matrix blocks simultaneously. 
+ +# Implementation Notes +- Modifies A1 and A2 in-place for efficiency +- Uses optimized BLAS-3 operations for performance +- Handles different storage formats and operation types +- Validates all input parameters with descriptive error messages +""" +function parfb!(side::Char, trans::Char, direct::Char, storev::Char, m1::Integer, n1::Integer, m2::Integer, n2::Integer, k::Integer, l::Integer, + A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, work::AbstractMatrix{T}) where {T} if side != 'L' && side != 'R' throw(ArgumentError("illegal value of side")) - return -1 end if trans != 'N' && trans != 'C' && trans != 'T' throw(ArgumentError("illegal value of trans")) - return -2 end if direct != 'F' && direct != 'B' throw(ArgumentError("illegal value of direct")) - return -3 end if storev != 'C' && storev != 'R' throw(ArgumentError("illegal value of storev")) - return -4 end if m1 < 0 throw(ArgumentError("illegal value of m1")) - return -5 end if n1 < 0 throw(ArgumentError("illegal value of n1")) - return -6 end if m2 < 0 || (side == 'R' && m1 != m2) throw(ArgumentError("illegal value of m2")) - return -7 end if n2 < 0 || (side == 'L' && n1 != n2) throw(ArgumentError("illegal value of n2")) - return -8 end if k < 0 throw(ArgumentError("illegal value of k")) - return -9 end if l < 0 || l > k throw(ArgumentError("illegal value of l")) - return -10 - end - - if lda1 < 0 - throw(ArgumentError("illegal value of lda1")) - return -12 - end - - if lda2 < 0 - throw(ArgumentError("illegal value of lda2")) - return -14 end - if ldv < 0 - throw(ArgumentError("illegal value of ldv")) - return -16 - end - - if ldt < 0 - throw(ArgumentError("illegal value of ldt")) - return -18 - end - - if ldwork < 0 - throw(ArgumentError("illegal value of ldwork")) - return -20 - end - - # quick return - + # Quick return if any dimension is zero if m1 == 0 || n1 == 0 || n2 == 0 || k == 0 return end - one0 = oneunit(eltype(A1)) - zero0 = zero(eltype(A1)) + 
# Define scalar constants + one = oneunit(eltype(A1)) + # Determine operation transformations based on flags if trans == 'N' tfun = identity else @@ -110,75 +116,126 @@ function parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l, else colmajor = false end - pamm('W', side, storev, direct, m2, n2, k, l, A1, lda1, A2, lda2, V, ldv, work, ldwork) + # Apply workspace computation using pamm kernel + pamm!('W', side, storev, direct, m2, n2, k, l, A1, A2, V, work) + + # Apply block reflector transformation based on storage format and direction if colmajor && forward && left # colmajor, forward, left - LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2])) + LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2])) for i in 1:k - LinearAlgebra.axpy!(-one0, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) + LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) end end if colmajor && forward && !left # colmajor, forward, right - LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k])) + LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k])) for j in 1:k - LinearAlgebra.axpy!(-one0, (@view work[1:m2, j]), (@view A1[1:m2, j])) + LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j])) end end if colmajor && !forward && left # colmajor, backward, left - LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2])) + LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2])) for i in 1:k - LinearAlgebra.axpy!(-one0, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) + LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) end end if colmajor && !forward && !left # 
colmajor, backward, right - LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k])) + LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k])) for j in 1:k - LinearAlgebra.axpy!(-one0, (@view work[1:m2, j]), (@view A1[1:m2, j])) + LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j])) end end if !colmajor && forward && left # rowmajor, forward, left - - LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2])) + LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2])) for i in 1:k - LinearAlgebra.axpy!((-one0), (@view work[i, 1:n2]), (@view A1[i, 1:n2])) + LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) end end if !colmajor && forward && !left # rowmajor, forward, right - LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k])) + LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k])) for j in 1:k - LinearAlgebra.axpy!(-one0, (@view work[1:m2, j]), (@view A1[1:m2, j])) + LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j])) end end if !colmajor && !forward && left # rowmajor, backward, left - LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2])) + LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2])) for i in 1:k - LinearAlgebra.axpy!(-one0, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) + LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2])) end end if !colmajor && !forward && !left # rowmajor, backward, right - LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', 
tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k])) + LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k])) for j in 1:k - LinearAlgebra.axpy!((-one0), (@view work[1:m2, j]), (@view A1[1:m2, j])) + LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j])) end end + # Apply final transformation using pamm kernel + pamm!('A', side, storev, direct, m2, n2, k, l, A1, A2, V, work) - pamm('A', side, storev, direct, m2, n2, k, l, A1, lda1, A2, lda2, V, ldv, work, ldwork) +end - return +""" + parfb(side, trans, direct, storev, A1, A2, V, T) -> (A1, A2) + +Applies a block reflector or its transpose to a pair of matrices A1 and A2. +This is a simplified interface that automatically computes required parameters. + +# Arguments +- `side::Char`: Determines the side of multiplication ('L' for left, 'R' for right) +- `trans::Char`: Operation to apply ('N' for none, 'T' for transpose, 'C' for conjugate transpose) +- `direct::Char`: Direction of reflector storage ('F' for forward, 'B' for backward) +- `storev::Char`: Storage format of reflectors ('C' for columnwise, 'R' for rowwise) +- `A1::Matrix`: First matrix to be updated (modified in-place) +- `A2::Matrix`: Second matrix to be updated (modified in-place) +- `V::Matrix`: Matrix containing the elementary reflectors +- `T::Matrix`: Upper triangular matrix of the block reflector + +# Returns +- Updated A1 and A2 matrices + +# Example +```julia +m1, n1, m2, n2, k = 4, 6, 4, 6, 3 +A1 = complex.(randn(m1, n1), randn(m1, n1)) +A2 = complex.(randn(m2, n2), randn(m2, n2)) +V = complex.(randn(m1+m2, k), randn(m1+m2, k)) +T = complex.(randn(k, k), randn(k, k)) +A1_new, A2_new = parfb('L', 'N', 'F', 'C', A1, A2, V, T) +``` +""" +function parfb!(side::Char, trans::Char, direct::Char, storev::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}) where {T} + # Determine dimensions + m1, n1 = size(A1) + m2, n2 = 
size(A2) + k = size(T_mat, 1) + l = size(V, 2) + + # Allocate workspace + if side == 'L' + work = similar(A1, max(m1, m2), max(n1, n2)) + else + work = similar(A1, max(m1, m2), max(n1, n2)) + end + + # Call the underlying kernel + parfb!(side, trans, direct, storev, m1, n1, m2, n2, k, l, + A1, A2, V, T_mat, work) end + +export parfb! diff --git a/src/pemv.jl b/src/pemv.jl index b62dded..8c4acf4 100644 --- a/src/pemv.jl +++ b/src/pemv.jl @@ -1,141 +1,225 @@ -function pemv(trans, storev, m, n, l, alpha, A, lda, X, beta, Y, work) - begin - if trans != 'N' && trans != 'T' && trans != 'C' - throw(ArgumentError("illegal value of trans")) - return -1 - end - - if storev != 'C' && storev != 'R' - throw(ArgumentError("illegal value of storev")) - return -2 - end - - if !((storev == 'C' && trans != 'N') || (storev == 'R' && trans == 'N')) - throw(ArgumentError("illegal values of trans/storev")) - return -2 - end +""" + pemv!(trans, storev, m, n, l, alpha, A, X, beta, Y, work) + +Perform panel matrix-vector multiplication with optimized block algorithms. + +This function implements efficient matrix-vector multiplication for structured panels, +commonly used in block QR factorization algorithms. It performs operations of the form: +Y := alpha * op(A) * X + beta * Y + +where op(A) can be A, A^T, or A^H depending on the trans parameter. 
+ +# Arguments +- `trans::Char`: Transpose operation ('N' for none, 'T' for transpose, 'C' for conjugate transpose) +- `storev::Char`: Storage format for vectors ('C' requires trans 'T' or 'C'; 'R' requires trans 'N') +- `m::Int`: Number of rows in matrix A +- `n::Int`: Number of columns in matrix A +- `l::Int`: Panel size (must be ≤ min(m,n)) +- `alpha`: Scalar multiplier for the matrix-vector product +- `A::Matrix`: Input matrix of size m×n +- `X::Vector`: Input vector (read only) +- `beta`: Scalar multiplier for the output vector Y +- `Y::Vector`: Output vector (modified in-place) +- `work::Vector`: Workspace array for intermediate computations + +# Returns +- No status code: invalid arguments throw `ArgumentError`, and the result is written into `Y` + +# Algorithm +The function uses block-structured algorithms that partition the matrix and vectors +to take advantage of cache locality and vectorization, particularly effective for +panel-based factorizations. + +# Implementation Notes +- Optimized for different storage formats (columnwise vs rowwise) +- Built on LinearAlgebra's generic matrix-vector and triangular-multiply kernels +- Handles edge cases with l=1 efficiently +- Validates input parameters with descriptive error messages +""" +function pemv!(trans::Char, storev::Char, m::Integer, n::Integer, l::Integer, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}, work::AbstractVector{T}) where {T} + # Input validation + if trans != 'N' && trans != 'T' && trans != 'C' + throw(ArgumentError("illegal value of trans")) + end - if m < 0 - throw(ArgumentError("illegal value of m")) - return -3 - end + if storev != 'C' && storev != 'R' + throw(ArgumentError("illegal value of storev")) + end - if n < 0 - throw(ArgumentError("illegal value of n")) - return -4 - end + if !((storev == 'C' && trans != 'N') || (storev == 'R' && trans == 'N')) + throw(ArgumentError("illegal values of trans/storev")) + end - if l > min(m, n) - throw(ArgumentError("illegal value of l")) - return -5 - end + if m < 0 + 
throw(ArgumentError("illegal value of m")) + end - if lda < max(1, m) - throw(ArgumentError("illegal value of lda")) - return -8 - end + if n < 0 + throw(ArgumentError("illegal value of n")) + end - # quick return - if m == 0 || n == 0 - return - end + if l > min(m, n) + throw(ArgumentError("illegal value of l")) + end - if alpha == 0 && beta == 0 - return - end + # Quick return for trivial cases + if m == 0 || n == 0 + return + end - if l == 1 - l = 0 - end + if alpha == 0 && beta == 0 + return + end - if storev == 'C' - x1 = (@view X[1:m-l]) - x2 = (@view X[m-l+1:m]) - xf = (@view X[1:m]) - else # assume incx = ldaX - x1 = (@view X[1:n-l]) - x2 = (@view X[n-l+1:n]) - xf = (@view X[1:n]) - # columnwise - end + # Handle special case where l=1 (convert to l=0 for efficiency) + if l == 1 + l = 0 + end - if storev != 'C' - y1 = (@view Y[1:l]) - y2 = (@view Y[l+1:m]) - else # assume incy = ldaY - y1 = (@view Y[1:l]) - y2 = (@view Y[l+1:n]) - # columnwise - end + # Set up vector views based on storage format + if storev == 'C' + # Column-wise storage: partition X and Y based on m and l + x1 = (@view x[1:m-l]) + x2 = (@view x[m-l+1:m]) + xf = (@view x[1:m]) + else + # Row-wise storage: partition X and Y based on n and l + x1 = (@view x[1:n-l]) + x2 = (@view x[n-l+1:n]) + xf = (@view x[1:n]) + end + # Determine Y partitioning based on storage format + if storev != 'C' + y1 = (@view y[1:l]) + y2 = (@view y[l+1:m]) + else + y1 = (@view y[1:l]) + y2 = (@view y[l+1:n]) + end - if storev == 'C' - if trans == 'N' - throw(ErrorException("not implemented")) - return -1 - else - if l > 0 - (@view work[1:l]) .= (@view X[m-l+1:m]) - if trans == 'C' - LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', adjoint, - (@view A[m-l+1:m, 1:l]), (@view work[1:l])) - else - LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', transpose, - (@view A[m-l+1:m, 1:l]), (@view work[1:l])) - end + # Apply the matrix-vector multiplication based on storage format and transpose + if 
storev == 'C' + if trans == 'N' + throw(ErrorException("not implemented")) + else + # Column-wise storage with transpose/adjoint operation + if l > 0 + # Copy relevant portion to workspace for triangular operations + (@view work[1:l]) .= (@view x[m-l+1:m]) + + # Apply triangular matrix multiplication + if trans == 'C' + LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', adjoint, + (@view A[m-l+1:m, 1:l]), (@view work[1:l])) + else + LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', transpose, + (@view A[m-l+1:m, 1:l]), (@view work[1:l])) + end - if m > l - LinearAlgebra.generic_matvecmul!((@view Y[1:l]), trans, (@view A[1:m-l, 1:l]), - (@view X[1:m-l]), LinearAlgebra.MulAddMul(alpha, beta)) - LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view Y[1:l])) + # Handle remaining matrix-vector operations + if m > l + LinearAlgebra.generic_matvecmul!((@view y[1:l]), trans, (@view A[1:m-l, 1:l]), + (@view x[1:m-l]), LinearAlgebra.MulAddMul(alpha, beta)) + LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view y[1:l])) + else + # Handle case where m <= l + if beta == 0 + (@view work[1:l]) .*= alpha + (@view y[1:l]) .= (@view work[1:l]) else - if beta == 0 - (@view work[1:l]) .*= alpha - (@view Y[1:l]) .= (@view work[1:l]) - else - (@view Y[1:l]) .*= beta - LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view Y[1:l])) - end - + (@view y[1:l]) .*= beta + LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view y[1:l])) end end + end - if n > l - k = n - l - LinearAlgebra.generic_matvecmul!((@view Y[l+1:n]), trans, (@view A[1:m, l+1:n]), - (@view X[1:m]), LinearAlgebra.MulAddMul(alpha, beta)) - end + # Handle remaining columns if n > l + if n > l + k = n - l + LinearAlgebra.generic_matvecmul!((@view y[l+1:n]), trans, (@view A[1:m, l+1:n]), + (@view x[1:m]), LinearAlgebra.MulAddMul(alpha, beta)) end - else - if trans == 'N' - if l > 0 - work[1:l] .= x2 - LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'L', 'N', identity, - (@view A[1:l, n-l+1:n]), (@view 
work[1:l])) - - if n > l - LinearAlgebra.generic_matvecmul!(y1, 'N', (@view A[1:l, 1:n-l]), - x1, LinearAlgebra.MulAddMul(alpha, beta)) - LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1) + end + else + # Row-wise storage + if trans == 'N' + # Row-wise storage with no transpose + if l > 0 + # Copy and apply triangular operations + work[1:l] .= x2 + LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'L', 'N', identity, + (@view A[1:l, n-l+1:n]), (@view work[1:l])) + + # Handle rectangular part if n > l + if n > l + LinearAlgebra.generic_matvecmul!(y1, 'N', (@view A[1:l, 1:n-l]), + x1, LinearAlgebra.MulAddMul(alpha, beta)) + LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1) + else + # Handle case where n <= l + if beta == 0 + y1 .= alpha * (@view work[1:l]) else - if beta == 0 - y1 .= alpha * (@view work[1:l]) - else - y1 .*= beta - LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1) - end + y1 .*= beta + LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1) end end + end - if m > l - LinearAlgebra.generic_matvecmul!(y2, 'N', (@view A[l+1:m, 1:n]), - xf, LinearAlgebra.MulAddMul(alpha, beta)) - end - else - throw(ErrorException("not implemented")) - return -1 + # Handle remaining rows if m > l + if m > l + LinearAlgebra.generic_matvecmul!(y2, 'N', (@view A[l+1:m, 1:n]), + xf, LinearAlgebra.MulAddMul(alpha, beta)) end + else + # Row-wise storage with transpose - not implemented + throw(ErrorException("not implemented")) end end -end \ No newline at end of file +end + +""" + pemv(trans, storev, A, X, Y, alpha=1.0, beta=0.0) -> Y + +Performs panel matrix-vector multiplication with automatic workspace allocation. +This is a simplified interface that automatically computes required parameters. 
+ +# Arguments +- `trans::Char`: Transpose operation ('N' for none, 'T' for transpose, 'C' for conjugate transpose) +- `storev::Char`: Storage format for vectors ('C' requires trans 'T' or 'C'; 'R' requires trans 'N') +- `A::Matrix`: Matrix for multiplication +- `X::Vector`: Input vector +- `Y::Vector`: Output vector (modified in-place) +- `alpha`: Scalar multiplier for A*X; required, same element type as `A` (call order: trans, storev, alpha, A, X, beta, Y) +- `beta`: Scalar multiplier for Y; required, same element type as `A` + +# Returns +- The return value is unspecified; the result is written into `Y` + +# Example +```julia +m, n, l = 6, 4, 3 +A = complex.(randn(m, n), randn(m, n)) +X = complex.(randn(n), randn(n)) +Y = complex.(randn(m), randn(m)) +pemv('N', 'R', 2.0 + 0.0im, A, X, 1.0 + 0.0im, Y) # Y is updated in place +``` + +""" +function pemv(trans::Char, storev::Char, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}) where {T} + # Determine dimensions + m, n = size(A) + l = min(m, n) # Default panel size + + # Leading dimension + + # Allocate workspace + work = similar(x, max(m, n)) + + # Call the underlying kernel + pemv!(trans, storev, m, n, l, alpha, A, x, beta, y, work) +end + +export pemv! \ No newline at end of file diff --git a/src/rectrxm.jl b/src/rectrxm.jl index fe6dfb2..cd0bec1 100644 --- a/src/rectrxm.jl +++ b/src/rectrxm.jl @@ -1,34 +1,45 @@ -export unified_rectrxm! """ + unified_rectrxm!(side, uplo, transpose, alpha, func, A, B) + Unified recursive function for triangular matrix solve (TRSM) and multiply (TRMM) operations. -This function supports both solving triangular systems of equations and performing triangular matrix multiplications. - -Arguments: -- side::Char: Specifies the side of the operation: - - 'L': Left multiplication (A * B or inv(A) * B). - - 'R': Right multiplication (B * A or B * inv(A)). -- uplo::Char: Specifies the triangular part of the matrix to reference: - - 'U': Use the upper triangle. - - 'L': Use the lower triangle. -- transpose::Char: Specifies the transposition operation: - - 'N': No transpose. - - 'T': Transpose. - - 'C': Conjugate transpose. 
-- alpha::Number: Scalar multiplier applied to the operation. -- func::Char: Specifies the function type: - - 'S': Solve (TRSM, A * X = alpha * B). - - 'M': Multiply (TRMM, Update B = alpha * A * B or alpha * B * A). -- A::AbstractMatrix: The triangular matrix. -- B::AbstractMatrix: The matrix to multiply or solve for. - -Returns: -- Updated matrix `B` after performing the specified operation. - -Notes: -- The function modifies `B` in place. -""" +This function supports both solving triangular systems of equations and performing triangular matrix multiplications +using recursive algorithms that are cache-friendly and numerically stable. + +# Arguments +- `side::Char`: Specifies the side of the operation ('L' for left, 'R' for right) + - 'L': Left multiplication (A * B or inv(A) * B) + - 'R': Right multiplication (B * A or B * inv(A)) +- `uplo::Char`: Specifies the triangular part of the matrix to reference + - 'U': Use the upper triangle + - 'L': Use the lower triangle +- `transpose::Char`: Specifies the transposition operation + - 'N': No transpose + - 'T': Transpose + - 'C': Conjugate transpose +- `alpha::Number`: Scalar multiplier applied to the operation +- `func::Char`: Specifies the function type + - 'S': Solve (TRSM, A * X = alpha * B) + - 'M': Multiply (TRMM, Update B = alpha * A * B or alpha * B * A) +- `A::AbstractMatrix`: The triangular matrix +- `B::AbstractMatrix`: The matrix to multiply or solve for (modified in-place) + +# Returns +- Updated matrix `B` after performing the specified operation + +# Algorithm +Uses recursive divide-and-conquer approach that: +1. Partitions matrices into 2x2 block structure +2. Applies operations recursively on subblocks +3. Handles base cases with optimized kernel functions +4. 
Maintains numerical stability through careful ordering +# Implementation Notes +- The function modifies `B` in place for efficiency +- Uses different thresholds for TRSM (256) vs TRMM (16) operations +- Automatically handles transpose operations by adjusting matrix views +- Recursive partitioning adapts to matrix size for optimal performance +""" function unified_rectrxm!( side::Char, uplo::Char, @@ -38,28 +49,59 @@ function unified_rectrxm!( A::AbstractMatrix, B::AbstractMatrix ) - threshold = 16 + threshold = 16 # Default threshold for TRMM operations n = size(A, 1) + # Handle transpose operations by adjusting matrix view and uplo flag if transpose == 'T' || transpose == 'C' A = (transpose == 'T') ? Transpose(A) : Adjoint(A) uplo = (uplo == 'L') ? 'U' : 'L' end + # TRSM operations require different handling and larger threshold if func == 'S' - threshold = 256 - B .= alpha .* B + threshold = 256 # Larger threshold for solve operations + B .= alpha .* B # Apply scaling before solve end + + # Call recursive kernel unified_rec(func, side, uplo, A, n, B, threshold) + + # TRMM operations apply scaling after multiplication if func == 'M' B .= alpha .* B end + return B end +""" + unified_rec(func, side, uplo, A, n, B, threshold) + +Recursive kernel for unified triangular matrix operations. + +This function implements the divide-and-conquer recursive algorithm that partitions +matrices into 2x2 block structure and applies the appropriate sequence of operations. 
+ +# Arguments +- `func::Char`: Operation type ('S' for solve, 'M' for multiply) +- `side::Char`: Operation side ('L' for left, 'R' for right) +- `uplo::Char`: Triangular part ('U' for upper, 'L' for lower) +- `A::AbstractMatrix{T}`: Triangular coefficient matrix +- `n::Int`: Matrix dimension to process +- `B::AbstractMatrix{T}`: Target matrix (modified in-place) +- `threshold::Int`: Recursion base case threshold (default: 256) + +# Algorithm +The recursion follows different orderings based on the operation type: +1. For forward substitution: A11 → GEMM → A22 +2. For backward substitution: A22 → GEMM → A11 +This ensures numerical stability and correctness of the triangular solve. +""" function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n, B::AbstractMatrix{T}, threshold::Int=256) where T <: AbstractFloat + # Base case: use optimized kernel functions for small matrices if n <= threshold - if func == 'S' + if func == 'S' # Solve operations (TRSM) if side == 'L' && uplo == 'L' LeftLowerTRSM!(A, B) elseif side == 'L' && uplo == 'U' @@ -69,7 +111,7 @@ function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n else RightUpperTRSM!(A, B) end - else + else # Multiply operations (TRMM) if side == 'L' && uplo == 'L' LeftLowerTRMM!(A, B) elseif side == 'L' && uplo == 'U' @@ -83,6 +125,7 @@ function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n return B end + # Determine partition size for optimal cache performance if isinteger(log2(n)) mid = div(n, 2) else @@ -90,55 +133,68 @@ function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n end mid_remainder = n - mid - A11 = view(A, 1:mid, 1:mid) - A22 = view(A, mid+1:n, mid+1:n) - A21 = view(A, mid+1:n, 1:mid) - A12 = view(A, 1:mid, mid+1:n) + # Create 2x2 block partition of matrix A + A11 = view(A, 1:mid, 1:mid) # Upper-left block + A22 = view(A, mid+1:n, mid+1:n) # Lower-right block + A21 = view(A, mid+1:n, 1:mid) # Lower-left block 
+ A12 = view(A, 1:mid, mid+1:n) # Upper-right block + # Partition matrix B based on operation side if side == 'L' - B1 = view(B, 1:mid, :) - B2 = view(B, mid+1:n, :) + B1 = view(B, 1:mid, :) # Upper block rows + B2 = view(B, mid+1:n, :) # Lower block rows else - B1 = view(B, :, 1:mid) - B2 = view(B, :, mid+1:n) + B1 = view(B, :, 1:mid) # Left block columns + B2 = view(B, :, mid+1:n) # Right block columns end + # Apply recursive algorithm with correct ordering for numerical stability + # Different operation types require different orderings to maintain correctness if (side == 'L' && uplo == 'L' && func == 'S') || (side == 'R' && uplo == 'U' && func == 'S') || (side == 'L' && uplo == 'U' && func == 'M') || (side == 'R' && uplo == 'L' && func == 'M') + + # Forward substitution ordering: A11 → GEMM → A22 unified_rec(func, side, uplo, A11, mid, B1, threshold) + + # Apply rank-k update between recursive calls if side == 'L' if func == 'S' - GEMM_SUB!(B2, A21, B1) + GEMM_SUB!(B2, A21, B1) # B2 := B2 - A21 * B1 else - GEMM_ADD!(A12, B2, B1) + GEMM_ADD!(A12, B2, B1) # B1 := B1 + A12 * B2 end else if func == 'S' - GEMM_SUB!(B2, B1, A12) + GEMM_SUB!(B2, B1, A12) # B2 := B2 - B1 * A12 else - GEMM_ADD!(B2, A21, B1) + GEMM_ADD!(B2, A21, B1) # B2 := B2 + A21 * B1 end end + unified_rec(func, side, uplo, A22, mid_remainder, B2, threshold) else + # Backward substitution ordering: A22 → GEMM → A11 unified_rec(func, side, uplo, A22, mid_remainder, B2, threshold) + + # Apply rank-k update between recursive calls if side == 'L' if func == 'S' - GEMM_SUB!(B1, A12, B2) + GEMM_SUB!(B1, A12, B2) # B1 := B1 - A12 * B2 else - GEMM_ADD!(A21, B1, B2) + GEMM_ADD!(A21, B1, B2) # B2 := B2 + A21 * B1 end else if func == 'S' - GEMM_SUB!(B1, B2, A21) + GEMM_SUB!(B1, B2, A21) # B1 := B1 - B2 * A21 else - GEMM_ADD!(B1, A12, B2) + GEMM_ADD!(B1, A12, B2) # B1 := B1 + A12 * B2 end end + unified_rec(func, side, uplo, A11, mid, B1, threshold) end - return B end +export unified_rectrxm! 
\ No newline at end of file diff --git a/src/trmm.jl b/src/trmm.jl index 7936921..3654bfe 100644 --- a/src/trmm.jl +++ b/src/trmm.jl @@ -1,67 +1,106 @@ +""" +GPU-accelerated Triangular Matrix Multiplication (TRMM) Operations + +This module provides GPU kernel implementations for triangular matrix multiplication +operations, supporting both left and right sided operations with upper and lower +triangular matrices. + +The kernels are optimized for GPU architectures with: +- Shared memory tiling for improved memory access patterns +- Bank conflict avoidance through memory padding +- Vectorized inner loops for computational efficiency +- Bounds checking for non-square matrix operations + +All kernels perform in-place operations: B := A * B or B := B * A +where A is triangular and B is a general matrix. +""" + export LeftLowerTRMM!, LeftUpperTRMM!, RightLowerTRMM!, RightUpperTRMM! -# Performs in place TRMM B = A * B -# where A is an NxN lower triangular matrix and B is an NxM matrix -# A is limited to matrix size 16x16 due to shared memory constraints +# Performs in-place TRMM: B := A * B +# where A is an N×N lower triangular matrix and B is an N×M matrix +# A is limited to matrix size 16×16 due to shared memory constraints + +""" + LeftLowerTRMM_kernel!(A, B, ::Val{BANK}=Val(1)) + +GPU kernel for left-sided lower triangular matrix multiplication. +Performs the operation B := A * B where A is lower triangular. +Uses shared memory tiling with configurable bank offset to avoid conflicts. 
+ +# Arguments +- `A::AbstractMatrix`: N×N lower triangular coefficient matrix +- `B::AbstractMatrix`: N×M target matrix (modified in-place) +- `BANK::Int`: Memory bank offset to avoid conflicts (default: 1) + +# Implementation Notes +- Tile size limited to 16×16 due to shared memory constraints +- Uses private variables for accumulation to enable vectorization +- Includes bounds checking for non-square input matrices +- Synchronization points ensure correct shared memory access patterns +""" @kernel function LeftLowerTRMM_kernel!(A,B, ::Val{BANK} = Val(1)) where BANK + # Get thread and block indices gi,gj = @index(Group, NTuple) i,j = @index(Local, NTuple) - # kept at 16x16 due to shmem constraints + # Tile dimension kept at 16×16 due to shared memory constraints TILE_DIM = @uniform @groupsize()[1] - # allocating shared memory for the sub matrix product calculation - # BANK = 1, added to avoid banck coonflicts as a result of irregular thread access - tile1 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM) - tile2 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM) + # Allocate shared memory for sub-matrix product calculation + # BANK padding added to avoid bank conflicts from irregular thread access + tile1 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM) # For matrix A + tile2 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM) # For matrix B - #declaring a private variable to accumulate the result of submatrix multiplication + # Private variable to accumulate the result of sub-matrix multiplication B_sub = @private eltype(B) 1 - @inbounds B_sub[1] = -zero(eltype(B)) + @inbounds B_sub[1] = zero(eltype(B)) - @uniform N = size(A, 1) - @uniform R = size(A, 2) - @uniform M = size(B, 2) + # Get matrix dimensions + @uniform N = size(A, 1) # Matrix A dimensions + @uniform R = size(A, 2) # Matrix A dimensions + @uniform M = size(B, 2) # Matrix B column count - # Cannot use @index(Global), because we use a smaller ndrange(gridsize would reduce) + # Calculate global thread indices 
(cannot use @index(Global) with custom ndrange) I = (gi-1) * TILE_DIM + i J = (gj-1) * TILE_DIM + j - # load input A into tile, with bounds checking for non-square matrices + # Load input matrix A into shared memory tile with bounds checking if i <= N && j <= N @inbounds tile1[i, j] = A[i, j] else - @inbounds tile1[i, j] = 0.0 - + @inbounds tile1[i, j] = zero(eltype(A)) end - # load input/output B into tiles, with bounds checking for non-square matrices + # Load input/output matrix B into shared memory tile with bounds checking if I <= R && J <= M @inbounds tile2[i, j] = B[I, J] else - @inbounds tile2[i, j] = 0.0 + @inbounds tile2[i, j] = zero(eltype(B)) end - # wait for all tiles to be loaded + # Synchronize to ensure all tiles are loaded before computation @synchronize - # calculate value of spot in output, use temporary value to allow for vectorization + # Calculate triangular matrix-vector product for lower triangular A + # For lower triangular: only use elements A[i,k] where k <= i out = zero(eltype(B)) @simd for k in 1:i @inbounds out += tile1[i, k] * tile2[k, j] end B_sub[1] += out + # Synchronize before writing results @synchronize - # get global indices again + # Recalculate global indices after synchronization I = (gi-1) * TILE_DIM + i J = (gj-1) * TILE_DIM + j - # save if inbounds + # Write result back to global memory if within bounds if I <= N && J <= M @inbounds B[I, J] = B_sub[1] end @@ -275,38 +314,135 @@ end -# wrapper function for the LLTRMM kernel +""" + LeftLowerTRMM!(A, B; n_threads=(16,16)) + +Perform left-sided lower triangular matrix multiplication: B := A * B + +# Arguments +- `A::AbstractMatrix`: N×N lower triangular coefficient matrix +- `B::AbstractMatrix`: N×M target matrix (modified in-place) +- `n_threads::Tuple`: Thread block dimensions (default: (16,16)) + +# Implementation Notes +- Uses GPU acceleration with optimized kernel +- Thread block size should not exceed hardware limits +- NDRange is padded to handle boundary conditions 
+""" function LeftLowerTRMM!(A, B; n_threads = (16,16)) backend = get_backend(A) + # Calculate NDRange with padding to handle boundary threads Ndrange = max(size(A), size(B)) - Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16) + Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16) LeftLowerTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange) - # need to specify ndrange as the larger of the 2 ARGUMENTS - # LeftLowerTRMM_kernel!(backend, n_threads)(A, B, ndrange = max(size(A), size(B))) end -# wrapper function for the LUTRMM kernel +""" + LeftUpperTRMM!(A, B; n_threads=(16,16)) + +Perform left-sided upper triangular matrix multiplication: B := A * B + +# Arguments +- `A::AbstractMatrix`: N×N upper triangular coefficient matrix +- `B::AbstractMatrix`: N×M target matrix (modified in-place) +- `n_threads::Tuple`: Thread block dimensions (default: (16,16)) +""" function LeftUpperTRMM!(A, B; n_threads = (16,16)) backend = get_backend(A) Ndrange = max(size(A), size(B)) - Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16) - # could not use overloading with only 2 args + Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16) LeftUpperTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange) end -# wrapper function for the RLTRMM kernel +""" + RightLowerTRMM!(A, B; n_threads=(16,16)) + +Perform right-sided lower triangular matrix multiplication: B := B * A + +# Arguments +- `A::AbstractMatrix`: N×N lower triangular coefficient matrix +- `B::AbstractMatrix`: M×N target matrix (modified in-place) +- `n_threads::Tuple`: Thread block dimensions (default: (16,16)) +""" function RightLowerTRMM!(A, B; n_threads = (16,16)) backend = get_backend(A) Ndrange = max(size(A), size(B)) - Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16) - # could not use overloading with only 2 args + Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16) RightLowerTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange) end +""" + RightUpperTRMM!(A, B; n_threads=(16,16)) + +Perform right-sided upper triangular matrix multiplication: B := B * A + +# 
Arguments +- `A::AbstractMatrix`: N×N upper triangular coefficient matrix +- `B::AbstractMatrix`: M×N target matrix (modified in-place) +- `n_threads::Tuple`: Thread block dimensions (default: (16,16)) +""" function RightUpperTRMM!(A, B; n_threads = (16,16)) backend = get_backend(A) Ndrange = max(size(A), size(B)) - Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16) - # could not use overloading with only 2 args + Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16) RightUpperTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange) +end + +""" + trmm(side, uplo, transa, diag, A, B, alpha=1.0) -> B + +Performs triangular matrix multiplication with automatic parameter detection. +This is a simplified interface for triangular matrix multiplication operations. + +# Arguments +- `side::Char`: Operation side + - 'L': B := alpha*op(A)*B (left multiplication) + - 'R': B := alpha*B*op(A) (right multiplication) +- `uplo::Char`: Triangular part specification + - 'U': A is upper triangular + - 'L': A is lower triangular +- `transa::Char`: Operation on matrix A + - 'N': op(A) = A (no transpose) + - 'T': op(A) = A^T (transpose) + - 'C': op(A) = A^H (conjugate transpose) +- `diag::Char`: Diagonal type (currently unused in GPU implementation) + - 'N': non-unit diagonal + - 'U': unit diagonal +- `A::AbstractMatrix`: Triangular coefficient matrix +- `B::AbstractMatrix`: Target matrix (modified in-place) +- `alpha`: Scalar multiplier (default: 1.0) + +# Returns +- Updated matrix B (same as input B, modified in-place) + +# Example +```julia +A = complex.(triu(randn(4, 4)), triu(randn(4, 4))) +B = complex.(randn(4, 3), randn(4, 3)) +C = trmm('L', 'U', 'N', 'N', A, copy(B)) +``` + +# Implementation Notes +- Currently supports 'N' (no transpose) operations only +- Uses GPU-accelerated kernels for computation +- The transa and diag parameters are provided for interface compatibility +""" +function trmm(side, uplo, transa, diag, A, B, alpha=one(eltype(A))) + # Apply scaling if alpha != 1 + if alpha != 
one(eltype(A)) + B .*= alpha + end + + # Dispatch to appropriate GPU kernel based on operation parameters + if side == 'L' && uplo == 'L' + LeftLowerTRMM!(A, B) + elseif side == 'L' && uplo == 'U' + LeftUpperTRMM!(A, B) + elseif side == 'R' && uplo == 'L' + RightLowerTRMM!(A, B) + elseif side == 'R' && uplo == 'U' + RightUpperTRMM!(A, B) + else + error("Unsupported combination of side='$side', uplo='$uplo'") + end end \ No newline at end of file diff --git a/src/trsm.jl b/src/trsm.jl index ea37a94..1300c5b 100644 --- a/src/trsm.jl +++ b/src/trsm.jl @@ -147,4 +147,59 @@ function RightUpperTRSM!(A, B) n, m = size(B) backend = get_backend(A) right_upper_kernel(backend, (m,))(Transpose(A), B, m, ndrange=(m, n)) +end + +""" + trsm(side, uplo, transa, diag, A, B, alpha=1.0) -> B + +Solves triangular matrix systems with automatic parameter detection. +This is a simplified interface for triangular system solving. + +# Arguments +- 'side': + - 'L': solve op(A)*X = alpha*B + - 'R': solve X*op(A) = alpha*B +- 'uplo': + - 'U': A is upper triangular + - 'L': A is lower triangular +- 'transa': operation on A + - 'N': op(A) = A + - 'T': op(A) = A^T + - 'C': op(A) = A^H +- 'diag': diagonal type + - 'N': non-unit diagonal + - 'U': unit diagonal +- 'A': triangular matrix +- 'B': right-hand side matrix (will be overwritten with solution) +- 'alpha': scalar multiplier (default: 1.0) + +# Returns +- Updated matrix B containing the solution + +# Example +```julia +A = complex.(triu(randn(4, 4)), triu(randn(4, 4))) +B = complex.(randn(4, 3), randn(4, 3)) +X = trsm('L', 'U', 'N', 'N', A, copy(B)) +``` +""" +function trsm(side, uplo, transa, diag, A, B, alpha=one(eltype(A))) + # Scale B if alpha != 1 + if alpha != one(eltype(A)) + B .*= alpha + end + + # Apply the appropriate kernel based on parameters + if side == 'L' && uplo == 'L' + LeftLowerTRSM!(A, B) + elseif side == 'L' && uplo == 'U' + LeftUpperTRSM!(A, B) + elseif side == 'R' && uplo == 'L' + RightLowerTRSM!(A, B) + elseif side == 
'R' && uplo == 'U' + RightUpperTRSM!(A, B) + else + error("Unsupported combination of side='$side', uplo='$uplo'") + end + end \ No newline at end of file diff --git a/src/tsmqr.jl b/src/tsmqr.jl index 221d0af..b78af1a 100644 --- a/src/tsmqr.jl +++ b/src/tsmqr.jl @@ -1,110 +1,220 @@ -function tsmqr(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork) - - #check input arguments +""" + tsmqr!(side, trans, m1, n1, m2, n2, k, ib, A1, A2, V, T, work) + +Apply orthogonal matrix Q (or Q^H) stored as compact WY representation to +a triangular-pentagonal matrix [A1; A2]. + +This routine applies a block orthogonal transformation represented in compact +WY form (stored in V and T) to the combined matrix [A1; A2] where A1 is +triangular and A2 is pentagonal. + +# Arguments +- `side`: Character indicating side of multiplication + - 'L': Apply Q from the left (Q*[A1; A2] or Q^H*[A1; A2]) + - 'R': Apply Q from the right ([A1 A2]*Q or [A1 A2]*Q^H) +- `trans`: Character indicating whether to transpose Q + - 'N': Apply Q (no transpose) + - 'C': Apply Q^H (conjugate transpose) + - 'T': Apply Q^T (transpose, same as 'C' for complex) +- `m1`, `n1`: Dimensions of triangular matrix A1 +- `m2`, `n2`: Dimensions of pentagonal matrix A2 +- `k`: Number of elementary reflectors (columns of V) +- `ib`: Block size for compact WY representation +- `A1`: Triangular part of the matrix (modified in-place) +- `A2`: Pentagonal part of the matrix (modified in-place) +- `V`: Matrix containing reflector vectors +- `T`: Upper triangular block reflector coefficient matrix +- `work`: Workspace array + + +# Algorithm +The transformation Q is applied using the compact WY representation: +Q = I - V * T * V^H + +The algorithm processes the reflectors in blocks of size ib, applying +each block using efficient matrix operations (parfb! routine). 
+ +# Input Validation +Validates all dimension parameters and leading dimension requirements +for proper matrix storage and computation. + +# Notes +This is a core computational routine for applying orthogonal transformations +in blocked QR algorithms. The compact WY form enables efficient block updates. +""" +function tsmqr!(side::Char, trans::Char, m1::Integer, n1::Integer, m2::Integer, n2::Integer, k::Integer, ib::Integer, + A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, work::AbstractVector{T}) where {T} + + # Input validation with descriptive error messages if side != 'L' && side != 'R' - throw(ArgumentError("illegal value of side")) - return -1 + throw(ArgumentError("side must be 'L' or 'R', got '$side'")) end if trans != 'N' && trans != 'C' && trans != 'T' - throw(ArgumentError("illegal value of trans")) - return -2 + throw(ArgumentError("trans must be 'N', 'C', or 'T', got '$trans'")) end if m1 < 0 - throw(ArgumentError("illegal value of m1")) - return -3 + throw(ArgumentError("m1 must be non-negative, got $m1")) end if n1 < 0 - throw(ArgumentError("illegal value of n1")) - return -4 + throw(ArgumentError("n1 must be non-negative, got $n1")) end - if m1 < 0 || (m2 != m1 && side == 'R') - throw(ArgumentError("illegal value of m2")) - return -5 + if m2 < 0 || (m2 != m1 && side == 'R') + if side == 'R' + throw(ArgumentError("For side='R', m2 must equal m1. Got m1=$m1, m2=$m2")) + else + throw(ArgumentError("m2 must be non-negative, got $m2")) + end end if n2 < 0 || (n2 != n1 && side == 'L') - throw(ArgumentError("illegal value of n2")) - return -6 + if side == 'L' + throw(ArgumentError("For side='L', n2 must equal n1. Got n1=$n1, n2=$n2")) + else + throw(ArgumentError("n2 must be non-negative, got $n2")) + end end if k < 0 || (side == 'L' && k > m1) || (side == 'R' && k > n1) - throw(ArgumentError("illegal value of k")) - return -7 + max_k = side == 'L' ? 
m1 : n1 + throw(ArgumentError("k must be between 0 and $max_k for side='$side', got $k")) end if ib < 0 - throw(ArgumentError("illegal value of ib")) - return -8 - end - - if lda1 < max(1,m1) - throw(ArgumentError("illegal value of lda1")) - return -10 - end - - if lda2 < max(1,m2) - throw(ArgumentError("illegal value of lda2")) - return -12 - end - - if (side == 'L' && ldv < max(1,m2)) || (side == 'R' && ldv < max(1,n2)) - throw(ArgumentError("illegal value of ldv")) - return -14 - end - - if ldt < max(1,ib) - throw(ArgumentError("illegal value of ldt")) - return -16 + throw(ArgumentError("ib must be non-negative, got $ib")) end - if (side == 'L' && ldwork < max(1,ib)) || (side == 'R' && ldwork < max(1,m1)) - throw(ArgumentError("illegal value of ldwork")) - return -18 - end - - # quick return - if m1 == 0 || n1 == 0 || m2 == 0 || n2 == 0 || k == 0 || ib == 0 + # Quick return for degenerate cases + if m1 == 0 || n1 == 0 || m2 == 0 || n2 == 0 || k == 0 || ib == 0 return end + # Determine the order of applying blocks based on side and trans if (side == 'L' && trans != 'N') || (side == 'R' && trans == 'N') + # Apply blocks forward: 1, ib+1, 2*ib+1, ... 
i1 = 1 i3 = ib istop = k else + # Apply blocks backward: ..., 2*ib+1, ib+1, 1 i1 = (div(k-1,ib))*ib + 1 i3 = -ib istop = 1 end + # Initialize indices for submatrices ic = 1 jc = 1 mi = m1 ni = n1 + # Apply blocks of reflectors for i in i1:i3:istop - kb = min(ib, k-i+1) + kb = min(ib, k - i + 1) # Size of current block if side == 'L' - # H or H^H is applied to C[i:m, 1:n] + # Q is applied from the left: Q * [A1; A2] mi = m1 - i + 1 ic = i - ldvv = m2 + # Workspace for this block: kb x ni + W = reshape(@view(work[1:kb*ni]), kb, ni) + parfb!('L', trans, 'F', 'C', mi, ni, m2, n2, kb, 0, + (@view A1[ic:ic+mi-1, jc:jc+ni-1]), (@view A2[1:m2, 1:n2]), + (@view V[1:m2, i:i+kb-1]), (@view T_mat[1:kb, i:i+kb-1]), W) else - # H or H^H is applied to C[1:m, i:n] - ni = n1- i + 1 + # Q is applied from the right: [A1 A2] * Q + ni = n1 - i + 1 jc = i - ldvv = n2 + # Workspace for this block: mi x kb + W = reshape(@view(work[1:mi*kb]), mi, kb) + parfb!('R', trans, 'F', 'C', mi, ni, m2, n2, kb, 0, + (@view A1[ic:ic+mi-1, jc:jc+ni-1]), (@view A2[1:m2, 1:n2]), + (@view V[1:n2, i:i+kb-1]), (@view T_mat[1:kb, i:i+kb-1]), W) end + end +end - # apply H or H^H - parfb(side, trans, 'F', 'C', mi, ni, m2, n2, kb, 0, - (@view A1[ic:ic+mi-1, jc:jc+ni-1]), lda1, (@view A2[1:m2, 1:n2]), lda2, - (@view V[1:ldvv, i:i+kb-1]), ldvv, (@view T[1:kb, i:i+kb-1]), kb, work, ldwork) +""" + tsmqr!(side, trans, A1, A2, V, T, ib) -> (A1, A2) + +Apply orthogonal matrix Q (stored in compact WY form) to triangular-pentagonal matrices. + +This is a high-level interface that automatically determines dimensions and +allocates workspace for applying block orthogonal transformations to the +combined matrix [A1; A2]. 
+ +# Arguments +- `side`: Character indicating multiplication side + - 'L': Apply Q from left (Q*[A1; A2] or Q^H*[A1; A2]) + - 'R': Apply Q from right ([A1 A2]*Q or [A1 A2]*Q^H) +- `trans`: Character indicating transpose operation + - 'N': Apply Q (no transpose) + - 'C': Apply Q^H (conjugate transpose) +- `A1`: Triangular part of matrix (modified in-place) +- `A2`: Pentagonal part of matrix (modified in-place) +- `V`: Matrix containing elementary reflector vectors +- `T_matrix`: Upper triangular block reflector coefficient matrix +- `ib`: Block size for the compact WY representation + +# Returns +- Modified `A1`: Triangular part after transformation +- Modified `A2`: Pentagonal part after transformation + +# Input Validation +- For side='L': n2 must equal n1 (same number of columns) +- For side='R': m2 must equal m1 (same number of rows) +- Block size ib should be positive and ≤ min(size(V,2), ib) + +# Example +```julia +# Apply Q from left to triangular-pentagonal matrix +m1, n1, m2, n2 = 6, 8, 10, 8 +k, ib = 4, 2 +A1 = triu(randn(ComplexF64, m1, n1)) +A2 = randn(ComplexF64, m2, n2) +V = randn(ComplexF64, m2, k) +T = triu(randn(ComplexF64, ib, k)) +tsmqr!('L', 'N', A1, A2, V, T, ib) +``` + +# Algorithm +Uses blocked approach to apply the orthogonal transformation Q = I - V*T*V^H +efficiently. The compact WY representation enables high-performance +matrix-matrix operations instead of multiple vector operations. +""" +function tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, + V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where {T} + m1, n1 = size(A1) + m2, n2 = size(A2) + k = size(V, 2) + + # Validate input dimensions + if side == 'L' && n2 != n1 + throw(ArgumentError("For side='L', A1 and A2 must have same number of columns. Got n1=$n1, n2=$n2")) + elseif side == 'R' && m2 != m1 + throw(ArgumentError("For side='R', A1 and A2 must have same number of rows. 
Got m1=$m1, m2=$m2")) + end + + if ib <= 0 + throw(ArgumentError("Block size ib must be positive, got $ib")) + end + + if k > size(T_matrix, 2) + throw(ArgumentError("Number of reflectors k ($k) exceeds T matrix columns ($(size(T_matrix, 2)))")) end + + # Determine workspace requirements and allocate + if side == 'L' + work_size = ib * max(n1, n2) + else + work_size = m1 * ib + end + work = zeros(T, work_size) + + # Call the core computational routine + tsmqr!(side, trans, m1, n1, m2, n2, k, ib, A1, A2, + V, T_matrix, work) end diff --git a/src/tsqrt.jl b/src/tsqrt.jl index 4d61153..fa1636b 100644 --- a/src/tsqrt.jl +++ b/src/tsqrt.jl @@ -1,79 +1,173 @@ -function tsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, tau, work) - # check input Arguments +""" + tsqrt!(m, n, ib, A1, A2, T, tau, work) - if m < 0 - throw(ArgumentError("illegal value of m")) - return -1 - end +Compute the QR factorization of an (m+n)-by-n triangular-pentagonal matrix +using the compact WY representation. - if n < 0 - throw(ArgumentError("illegal value of n")) - return -2 - end +This routine computes the QR factorization of a triangular-pentagonal matrix: + [ A1 ] + [ A2 ] +where A1 is n-by-n upper triangular and A2 is m-by-n general. - if ib < 0 - throw(ArgumentError("illegal value of ib")) - return -3 - end +The factorization has the form: + [ A1 ] = Q * [ R ] + [ A2 ] [ 0 ] +where Q is orthogonal and R is upper triangular. 
+ +# Arguments +- `m`: Number of rows of the pentagonal part A2 +- `n`: Number of columns of the triangular-pentagonal matrix +- `ib`: Block size for the compact WY representation +- `A1`: n×n upper triangular matrix (modified in-place) +- `A2`: m×n general matrix (modified in-place) +- `T`: ib×n matrix to store block reflector coefficients +- `tau`: Vector of length n to store reflector scalar factors +- `work`: Workspace array of length ib×n - if lda1 < max(1,n) && n > 0 - throw(ArgumentError("illegal value of lda1")) - return -5 +# Algorithm +The algorithm proceeds in blocks of size ib: +1. For each block, generate elementary reflectors to zero the pentagonal part +2. Apply reflectors to remaining columns using efficient block updates +3. Store reflector coefficients in compact WY form in matrix T + +The compact WY representation allows for efficient application of the +orthogonal factor Q using block operations. + +# Input Validation +All dimension parameters must be non-negative and leading dimensions +must satisfy minimum requirements for valid matrix storage. + +# Notes +This is a low-level computational routine typically called by higher-level +QR factorization interfaces. The matrices A1, A2 are modified in-place +to store the R factor and reflector vectors respectively. 
+""" +function tsqrt!(m::Integer, n::Integer, ib::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T} + # Input validation with descriptive error messages + if m < 0 + throw(ArgumentError("m must be non-negative, got $m")) end - if lda2 < max(1,m) && m > 0 - throw(ArgumentError("illegal value of lda2")) - return -7 + if n < 0 + throw(ArgumentError("n must be non-negative, got $n")) end - if ldt < max(1,ib) && ib > 0 - throw(ArgumentError("illegal value of ldt")) - return -9 + if ib < 0 + throw(ArgumentError("ib must be non-negative, got $ib")) end - # quick return + # Quick return for degenerate cases if m == 0 || n == 0 || ib == 0 return end - one0 = oneunit(eltype(A1)) - zero0 = zero(eltype(A1)) - plus = LinearAlgebra.MulAddMul(one0, one0) + Tone = oneunit(eltype(A1)) + Tzero = zero(eltype(A1)) + plus = LinearAlgebra.MulAddMul(Tone, Tone) + # Process matrix in blocks of size ib for ii in 1:ib:n sb = min(n-ii+1, ib) + # Generate elementary reflectors for current block for i in 1:sb - # generate elementary reflector H[ii*ib + i] to annilate A[ii*ib, + i:m, ii*ib + i] - A1[ii+i-1, ii+i-1], tau[ii+i-1] = larfg(m+1, A1[ii+i-1, ii+i-1], (@view A2[1:m, ii+i-1]), 1, tau[ii+i-1]) + # Generate elementary reflector H[ii+i-1] to annihilate A2[1:m, ii+i-1] + A1[ii+i-1, ii+i-1], tau[ii+i-1] = larfg!(m+1, A1[ii+i-1, ii+i-1], + (@view A2[1:m, ii+i-1]), 1, tau[ii+i-1]) if ii+i <= n - # apply H[ii*ib + i] to A[ii*ib + i:m, ii*ib + i + 1 : ii*ib + ib] from left + # Apply H[ii+i-1] to A[ii+i-1:m, ii+i:ii+sb-1] from the left alpha = -conj(tau[ii+i-1]) (@view work[1:sb-i]) .= (@view A1[ii+i-1, ii+i:ii+sb-1]) + # Compute work = A1[ii+i-1, ii+i:ii+sb-1]^H + A2[1:m, ii+i:ii+sb-1]^H * A2[1:m, ii+i-1] conj!((@view work[1:sb-i])) - LinearAlgebra.generic_matvecmul!((@view work[1:sb-i]), 'C', (@view A2[1:m, ii+i:ii+sb-1]), (@view A2[1:m, ii+i-1]), plus) + LinearAlgebra.generic_matvecmul!((@view 
work[1:sb-i]), 'C', (@view A2[1:m, ii+i:ii+sb-1]), + (@view A2[1:m, ii+i-1]), plus) conj!((@view work[1:sb-i])) + + # Apply the reflector: A1 -= alpha * work, A2 -= alpha * v * work^H LinearAlgebra.axpy!(alpha, (@view work[1:sb-i]), (@view A1[ii+i-1, ii+i:ii+sb-1])) conj!((@view work[1:sb-i])) gerc!(alpha, (@view A2[1:m, ii+i-1]), (@view work[1:sb-i]), (@view A2[1:m, ii+i:ii+sb-1])) end - # Calculate T - alpha = -tau[ii+i-1] - LinearAlgebra.generic_matvecmul!((@view T[1:i-1, ii+i-1]), 'C', (@view A2[1:m, ii:ii+i-2]), (@view A2[1:m, ii+i-1]),LinearAlgebra.MulAddMul(alpha, zero0)) - #LinearAlgebra.BLAS.trmv!('U', 'N', 'N', (@view T[1:i-1, ii:ii+i-2]), (@view T[1:i-1, ii+i-1])) - LinearAlgebra.generic_trimatmul!((@view T[1:i-1, ii+i-1]), 'U', 'N', identity, (@view T[1:i-1, ii:ii+i-2]), (@view T[1:i-1, ii+i-1])) - T[i, ii+i-1] = tau[ii+i-1] + # Build triangular factor T for block reflectors + if i > 1 + alpha = -tau[ii+i-1] + LinearAlgebra.generic_matvecmul!((@view T_matrix[1:i-1, ii+i-1]), 'C', (@view A2[1:m, ii:ii+i-2]), + (@view A2[1:m, ii+i-1]), LinearAlgebra.MulAddMul(alpha, Tzero)) + LinearAlgebra.generic_trimatmul!((@view T_matrix[1:i-1, ii+i-1]), 'U', 'N', identity, + (@view T_matrix[1:i-1, ii:ii+i-2]), (@view T_matrix[1:i-1, ii+i-1])) + end + T_matrix[i, ii+i-1] = tau[ii+i-1] end + # Apply block reflector to remaining columns if n >= ii+sb - ww = reshape(@view(work[1: ib*(n-(ii+sb)+1)]), ib, n-(ii+sb)+1) - - tsmqr('L', 'C', sb, n-(ii+sb) + 1, m, n-(ii+sb) + 1, ib, ib, - (@view A1[ii:ii+sb-1, ii+sb: n]), sb, (@view A2[1:m, ii+sb:n]), m, - (@view A2[1:m, ii:ii+sb-1]), m, (@view T[1:ib, ii:ii+ib-1]), ib, ww, sb) + # Use provided vector workspace; tsmqr! 
will reshape internally as needed + tsmqr!('L', 'C', sb, n - (ii + sb) + 1, m, n - (ii + sb) + 1, sb, ib, + (@view A1[ii:ii+sb-1, ii+sb:n]), (@view A2[1:m, ii+sb:n]), + (@view A2[1:m, ii:ii+sb-1]), (@view T_matrix[1:ib, ii:ii+sb-1]), work) end end end + +""" + tsqrt!(A1, A2, ib) -> (A1, A2, T, tau) + +Compute QR factorization of a triangular-pentagonal matrix using block algorithm. + +This is a high-level interface that automatically allocates workspace and +computes the QR factorization of the combined matrix [A1; A2] where A1 is +upper triangular and A2 is general. + +# Arguments +- `A1`: n×n upper triangular matrix (modified in-place to store R factor) +- `A2`: m×n general matrix (modified in-place to store reflector vectors) +- `ib`: Block size for the algorithm (typically 32-64 for good performance) + +# Returns +- Modified `A1`: Contains the R factor of the QR factorization +- Modified `A2`: Contains the elementary reflector vectors +- `T`: ib×n matrix containing block reflector coefficients +- `tau`: Length-n vector containing reflector scaling factors + +# Input Validation +- A1 must be square (n×n) +- A2 must have same number of columns as A1 (m×n) +- Block size ib should be positive and ≤ n for efficiency + +# Example +```julia +n, m = 6, 8 +ib = 4 +A1 = triu(randn(ComplexF64, n, n)) # Upper triangular +A2 = randn(ComplexF64, m, n) # General matrix +A1_qr, A2_qr, T, tau = tsqrt!(copy(A1), copy(A2), ib) +``` + +# Algorithm Notes +Uses blocked algorithm for efficiency with large matrices. The compact WY +representation (stored in T) enables efficient application of the Q factor. 
+""" +function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, ib::Integer) where {T} + n, n2 = size(A1) + if n != n2 + throw(ArgumentError("A1 must be square, got size $(size(A1))")) + end + + m, n3 = size(A2) + if n != n3 + throw(ArgumentError("A1 and A2 must have same number of columns, got $n and $n3")) + end + + if ib <= 0 + throw(ArgumentError("Block size ib must be positive, got $ib")) + end + + work = zeros(T, ib * n) + + # Call the core computational routine + tsqrt!(m, n, ib, A1, A2, T_matrix, tau, work) +end diff --git a/src/ttmqr.jl b/src/ttmqr.jl index 7b084dd..ac4b263 100644 --- a/src/ttmqr.jl +++ b/src/ttmqr.jl @@ -1,73 +1,40 @@ -function ttmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork) +function ttmqr!(side::Char, trans::Char, m1::Integer, n1::Integer, m2::Integer, n2::Integer, k::Integer, ib::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, work::AbstractVector{T}) where {T} # check input arguments if side != 'L' && side != 'R' throw(ArgumentError("illegal value of side")) - return -1 end if trans != 'N' && trans != 'C' throw(ArgumentError("illegal value of trans")) - return -2 end if m1 < 0 throw(ArgumentError("illegal value of m1")) - return -3 end if n1 < 0 throw(ArgumentError("illegal value of n1")) - return -4 end if (m2 < 0) || (m2 != m1 && side == 'R') throw(ArgumentError("illegal value of m2")) - return -5 end if (n2 < 0) || (n2 != n1 && side == 'L') throw(ArgumentError("illegal value of n2")) - return -6 end if (k < 0) || (side == 'L' && k > m1) || (side == 'R' && k > n1) throw(ArgumentError("illegal value of k")) - return -7 end if ib < 0 throw(ArgumentError("illegal value of ib")) - return -8 - end - - if lda1 < max(1,m1) - throw(ArgumentError("illegal value of lda1")) - return -10 - end - - if lda2 < max(1,m2) - throw(ArgumentError("illegal value of lda2")) - return -12 - end - - if 
ldv < max(1, side == 'L' ? m2 : n2) - throw(ArgumentError("illegal value of ldv")) - return -14 - end - - if ldt < max(1,ib) - throw(ArgumentError("illegal of ldt")) - return -16 - end - - if ldwork < max(1, side == 'L' ? ib : m1) - throw(ArgumentError("illegal value of ldwork")) - return -18 end # quick return if m1 == 0 || n1 == 0 || m2 == 0 || n2 == 0 || k == 0 || ib == 0 - return 0 + return end if (side == 'L' && trans != 'N') || (side == 'R' && trans == 'N') @@ -91,28 +58,69 @@ function ttmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T l = 0 if side == 'L' - # H or H^H applied to C[i:m, 1:n] + # Apply from left on the current block rows mi = kb - mi2 = min(i+kb-1, m2) + mi2 = min(i + kb - 1, m2) ic = i - l = min(kb, max(0, m2-i)) # Julia 1-based: m2-i+1 (PLASMA has m2-i for 0-based) - ldvv = m2 - else + l = min(kb, max(0, m2 - i)) + # Workspace as kb x ni + W = reshape(@view(work[1:kb*ni]), kb, ni) + parfb!('L', trans, 'F', 'C', mi, ni, mi2, ni2, kb, l, + (@view A1[ic:ic+mi-1, jc:jc+ni-1]), + (@view A2[1:mi2, 1:ni2]), + (@view V[1:m2, i:i+kb-1]), + (@view T_mat[1:kb, i:i+kb-1]), + W) + else + # Apply from right on the current block columns ni = kb - ni2 = min(i+kb-1, n2) + ni2 = min(i + kb - 1, n2) jc = i - l = min(kb, max(0, n2-i)) # Julia 1-based: n2-i+1 (PLASMA has n2-i for 0-based) - ldvv = n2 + l = min(kb, max(0, n2 - i)) + # Workspace as mi x kb + W = reshape(@view(work[1:mi*kb]), mi, kb) + parfb!('R', trans, 'F', 'C', mi, ni, mi2, ni2, kb, l, + (@view A1[ic:ic+mi-1, jc:jc+ni-1]), + (@view A2[1:mi2, 1:ni2]), + (@view V[1:n2, i:i+kb-1]), + (@view T_mat[1:kb, i:i+kb-1]), + W) end - - # apply H or H^H - parfb(side, trans, 'F', 'C', mi, ni, mi2, ni2, kb, l, - (@view A1[ic:ic+mi-1, jc:jc+ni-1]), lda1, - (@view A2[1:mi2, 1:ni2]), lda2, - (@view V[1:ldvv, i:i+kb-1]), ldvv, - (@view T[1:ldt, i:i+kb-1]), ldt, - work, ldwork) i += i3 end end + +""" + ttmqr!(side, trans, A1, A2, V, T, ib) -> (A1, A2) + +Helper function for 
triangular-trapezoidal matrix transformation. + +# Arguments +- `side`: 'L' (left) or 'R' (right) +- `trans`: 'N' (no transpose) or 'C' (conjugate transpose) +- `A1`: Upper triangular matrix to be updated +- `A2`: Trapezoidal matrix to be updated +- `V`: Reflector vectors matrix +- `T`: Block reflector matrix +- `ib`: Block size + +# Returns +- Modified `A1` and `A2` +""" +function ttmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, + V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where T + m1, n1 = size(A1) + m2, n2 = size(A2) + # Use the common number of reflectors available in V and T + k = size(T_matrix, 2) + + # Workspace size follows parfb!/TPMQRT requirements + # - Left: W is (ib x n1) at most + # - Right: W is (m1 x ib) at most + work_size = side == 'L' ? ib * n1 : m1 * ib + work = zeros(T, work_size) + + ttmqr!(side, trans, m1, n1, m2, n2, k, ib, A1, A2, + V, T_matrix, work) +end diff --git a/src/ttqrt.jl b/src/ttqrt.jl index 11ef15b..8c6f341 100644 --- a/src/ttqrt.jl +++ b/src/ttqrt.jl @@ -1,97 +1,104 @@ -function ttqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, tau, work) - begin - if m < 0 - throw(ArgumentError("illegal value of m")) - return -1 - end - - if n < 0 - throw(ArgumentError("illegal value of n")) - return -2 - end - - if ib < 0 - throw(ArgumentError("illegal value of ib")) - return -3 - end - - if lda1 < max(1, n) && n > 0 - throw(ArgumentError("illegal value of lda1")) - return -5 - end - - if lda2 < max(1, m) && m > 0 - throw(ArgumentError("illegal value of lda2")) - return -7 - end +function ttqrt!(m::Integer, n::Integer, ib::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T} + if m < 0 + throw(ArgumentError("illegal value of m")) + end - if ldt < max(1, ib) && ib > 0 - throw(ArgumentError("illegal value of ldt")) - return -9 - end + if n < 0 + throw(ArgumentError("illegal value of n")) + end - # quick return - if 
m == 0 || n == 0 || ib == 0 - return - end + if ib < 0 + throw(ArgumentError("illegal value of ib")) + end - # original function had this todo: - # todo: Need to check why some cases require this to avoid - # uninitialized values - # core_zlaset(CoreBlasGeneral, ib, n, 0.0, 0.0, T, ldt); + # quick return + if m == 0 || n == 0 || ib == 0 + return + end - one = oneunit(eltype(A1)) + # original function had this todo: + # todo: Need to check why some cases require this to avoid + # uninitialized values + # core_zlaset(CoreBlasGeneral, ib, n, 0.0, 0.0, T, ldt); - for ii in 1:ib:n - sb = min(n - ii + 1, ib) + one = oneunit(eltype(A1)) + Tzero = zero(eltype(A1)) - for i in 1:sb - j = ii + i - 1 # index - mi = min(j, m) # length - ni = sb - i # length + for ii in 1:ib:n + sb = min(n - ii + 1, ib) - A1[j, j], tau[j] = larfg(mi + 1, A1[j, j], (@view A2[1:mi, j]), 1, tau[j]) + for i in 1:sb + j = ii + i - 1 # index + mi = min(j, m) # length + ni = sb - i # length - if ni > 0 - work[1:ni] .= (@view A1[j, j+1:j+ni]) - conj!((@view work[1:ni])) + A1[j, j], tau[j] = larfg!(mi + 1, A1[j, j], (@view A2[1:mi, j]), 1, tau[j]) - LinearAlgebra.generic_matvecmul!((@view work[1:ni]), 'C', (@view A2[1:mi, j+1:j+ni]), - (@view A2[1:mi, j]), LinearAlgebra.MulAddMul(one, one)) - conj!((@view work[1:ni])) + if ni > 0 + work[1:ni] .= (@view A1[j, j+1:j+ni]) + conj!((@view work[1:ni])) - alpha = -conj(tau[j]) - axpy!(alpha, (@view work[1:ni]), (@view A1[j, j+1:j+ni])) - conj!((@view work[1:ni])) - gerc!(alpha, (@view A2[1:mi, j]), (@view work[1:ni]), (@view A2[1:mi, j+1:j+ni])) - end + LinearAlgebra.generic_matvecmul!((@view work[1:ni]), 'C', (@view A2[1:mi, j+1:j+ni]), + (@view A2[1:mi, j]), LinearAlgebra.MulAddMul(one, one)) + conj!((@view work[1:ni])) - # calculate T - if i > 1 - l = min(i - 1, max(0, m - ii + 1)) # length - alpha = -tau[j] + alpha = -conj(tau[j]) + axpy!(alpha, (@view work[1:ni]), (@view A1[j, j+1:j+ni])) + conj!((@view work[1:ni])) + gerc!(alpha, (@view A2[1:mi, j]), 
(@view work[1:ni]), (@view A2[1:mi, j+1:j+ni])) + end - pemv('C', 'C', min(j - 1, m), i - 1, l, alpha, (@view A2[1:m, ii:ii+i-2]), lda2, - (@view A2[1:m, j]), 0, (@view T[1:i-1, j]), work) - LinearAlgebra.generic_trimatmul!((@view T[1:i-1, j]), 'U', 'N', identity, (@view T[1:i-1, ii:ii+i-2]), (@view T[1:i-1, j])) - end + # calculate T + if i > 1 + l = min(i - 1, max(0, m - ii + 1)) # length + alpha = -tau[j] - T[i, j] = tau[j] + pemv!('C', 'C', min(j - 1, m), i - 1, l, alpha, (@view A2[1:m, ii:ii+i-2]), + (@view A2[1:m, j]), Tzero, (@view T_mat[1:i-1, j]), work) + LinearAlgebra.generic_trimatmul!((@view T_mat[1:i-1, j]), 'U', 'N', identity, (@view T_mat[1:i-1, ii:ii+i-2]), (@view T_mat[1:i-1, j])) end - if (n >= ii + sb) - mi = min(ii + sb - 1, m) - ni = n - (ii + sb - 1) - l = min(sb, max(0, mi - ii + 1)) - ww = reshape(@view(work[1:sb*ni]), sb, ni) # k by n1 -- sb by ni + T_mat[i, j] = tau[j] + end - parfb('L', 'C', 'F', 'C', ib, ni, mi, ni, sb, l, (@view A1[ii:ii+ib-1, ii+sb:ii+sb+ni-1]), - lda1, (@view A2[1:mi, ii+sb:ii+sb+ni-1]), lda2, (@view A2[1:mi, ii:ii+sb-1]), lda2, - (@view T[1:sb, ii:ii+sb-1]), ldt, ww, sb) + if (n >= ii + sb) + mi = min(ii + sb - 1, m) + ni = n - (ii + sb - 1) + l = min(sb, max(0, mi - ii + 1)) + # Workspace reshape for this call: sb x ni (left side) + W = reshape(@view(work[1:sb*ni]), sb, ni) + parfb!('L', 'C', 'F', 'C', ib, ni, mi, ni, sb, l, + (@view A1[ii:ii+ib-1, ii+sb:ii+sb+ni-1]), + (@view A2[1:mi, ii+sb:ii+sb+ni-1]), + (@view A2[1:mi, ii:ii+sb-1]), + (@view T_mat[1:sb, ii:ii+sb-1]), + W) - end end - - return end end + +""" + ttqrt!(A, B, ib) -> (A, B, T, tau) + +Helper for triangular-triangular QR factorization. 
+ +# Arguments +- `A`: Upper triangular matrix (n × n) +- `B`: Upper triangular matrix (n × n) +- `ib`: Block size + +# Returns +- Modified `A` and `B` matrices +- `T`: Block reflector matrix +- `tau`: Scalar factors +""" +function ttqrt!(ib::Integer, A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} + m, n = size(A) + m2, n2 = size(B) + @assert m2 == m && n2 == n "A and B must have same dimensions" + + work = zeros(T, ib * n) + + ttqrt!(m, n, ib, A, B, T_mat, tau, work) +end diff --git a/src/unmqr.jl b/src/unmqr.jl index 9df884c..ab94981 100644 --- a/src/unmqr.jl +++ b/src/unmqr.jl @@ -1,151 +1,231 @@ """ - unmqr(side, trans, m, n, k, ib, A, lda, T, ldt, C, ldc, work, ldwork) + unmqr!(side, trans, m, n, k, ib, A, lda, T_matrix, C, work) -Overwrites the general m-by-n tile C with - side = 'L' side = 'R' - trans = 'N' Q*C C*Q - trans = 'C' Q^H*C C*Q^H +Apply orthogonal matrix Q (or Q^H) from a QR factorization to a general matrix C. -where Q is a unitary matrix defined as the product of k elementary reflectors - Q = H(1) H(2) ... H(k) -as returned by zgeqrt. Q is of order m if side = 'L" and of order n if side = 'R' +Overwrites the general m-by-n matrix C with: + side = 'L' side = 'R' + trans = 'N' Q * C C * Q + trans = 'C' Q^H * C C * Q^H + +where Q is a unitary matrix defined as the product of k elementary reflectors: +Q = H(1) H(2) ... H(k) + +as returned by geqrt!. Q is of order m if side = 'L' and of order n if side = 'R'. # Arguments -- 'side': - - = 'L': apply Q or Q^H from the left - - = 'R': apply Q or Q^H from the right -- 'trans': - - = 'N': no transpose, apply Q - - = 'C': conjugate transpose, apply Q^H -- 'm': the number of rows of the tile C. m >= 0 -- 'n': the number of columns of the tile C. n >= 0 -- 'k': the number of elementary refelctors whose product defines the matrix Q - - if side = 'L', m >= k >= 0 - - if side = 'R', n >= k >= 0 -- 'ib': the inner blocking size. 
ib >= 0 -- 'A': dimension (lda, k) - the i-th column must contain the vector which defines the - elementary reflector H(i) for i = 1,2,...,k, - as returned by zgeqrt in the first k columns of its array argument A -- 'lda': the leading dimension of array A - if side = 'L', lda >= max(1,m) - if side = 'R', lda >= max(1,n) -- 'T': the ib-by-k triangular factor T of the block reflector - T is upper triangular by block (economic storage) - The rest of the array is not referenced -- 'ldt': the elding dimension of the array T. ldt >= ib -- 'C': - On entry the m-by-n tile C - On exit, C is overwritten by Q*C or Q^H*C or C*Q^H or C*Q. --'work': auxillary workspace of array work - ldwork-by-n if side = 'L' - ldwork by ib if side = 'R' -- 'ldwork': the leading dimension of array work - ldwork >= max(1,ib) if side = 'L' - ldwork >= max(1,m) if side = 'R' +- `side`: Character specifying which side to apply Q + - 'L': Apply Q or Q^H from the left + - 'R': Apply Q or Q^H from the right +- `trans`: Character specifying transpose operation + - 'N': No transpose, apply Q + - 'C': Conjugate transpose, apply Q^H +- `m`: Number of rows of matrix C (≥ 0) +- `n`: Number of columns of matrix C (≥ 0) +- `k`: Number of elementary reflectors defining Q + - If side = 'L': m ≥ k ≥ 0 + - If side = 'R': n ≥ k ≥ 0 +- `ib`: Inner block size (≥ 0) +- `A`: Matrix of dimension (lda, k) containing reflector vectors + The i-th column contains the vector defining elementary reflector H(i), + as returned by geqrt! in the first k columns +- `lda`: Leading dimension of array A + - If side = 'L': lda ≥ max(1,m) + - If side = 'R': lda ≥ max(1,n) +- `T`: ib×k triangular factor of the block reflector + T is upper triangular by blocks (economic storage) +- `C`: m×n matrix to be transformed (modified in-place) +- `work`: Workspace array + +# Algorithm +The routine applies Q using the compact WY representation stored in A and T. 
+It processes the elementary reflectors in blocks of size ib, using efficient +block operations (larfb!) for high performance. + +The order of applying blocks depends on side and trans parameters to ensure +numerical stability and efficiency. + +# Notes +This is a core computational routine for applying orthogonal transformations +from QR factorizations. It is typically called by higher-level interfaces. """ -function unmqr(side, trans, m, n, k, ib, A, lda, T, ldt, C, ldc, work, ldwork) - if side != 'L' && side != 'R' - throw(ArgumentError("illegal value of side")) - return -1 - end - - if side == 'L' - nq = m - nw = n - else - nq = n - nw = m - end - - if trans != 'N' && trans != 'C' && trans != 'T' - throw(ArgumentError("illegal value of trans")) - return -2 - end - - if m < 0 - throw(ArgumentError("illegal value of m")) - return -3 - end - - if n < 0 - throw(ArgumentError("illegal value of n")) - return -4 - end - - if k < 0 || k > nq - throw(ArgumentError("illegal value of k")) - return -5 - end - - if ib < 0 - throw(ArgumentError("illegal value of ib")) - return -6 - end - - if lda < max(1, nq) && nq > 0 - throw(ArgumentError("illegal value of lda")) - return -8 - end - - if ldt < max(1,ib) - throw(ArgumentError("illegal value of ldt")) - return -10 - end - - if ldc < max(1,m) && m > 0 - throw(ArgumentError("illegal value of ldc")) - return -12 - end - - if ldwork < max(1,nw) && nw > 0 - throw(ArgumentError("illegal value of ldwork")) - return -14 - end - - # quick return - if m == 0 || n == 0 || k == 0 - return - end - - if ((side == 'L' && trans != 'N') || (side == 'R' && trans == 'N')) - i1 = 1 - i3 = ib - ibstop = k - else - i1 = div((k-1),ib)*ib + 1 - i3 = -ib - ibstop = 1 - end - - ic = 1 - jc = 1 - ni = n - mi = m - - if side == 'L' - wwork = ones(eltype(A), n, ib) - ldw = n - else - wwork = ones(eltype(A), m, ib) - ldw = m - end - - for i in i1 : i3 : ibstop - kb = min(ib, k-i+1) - - if side == 'L' - # apply to C[i:m, 1:n] - mi = m - i + 1 - ic = i - 
else - # apply to C[1:m, i:n] - ni = n-i + 1 - jc = i - end - +function unmqr!(side::Char, trans::Char, m::Integer, n::Integer, k::Integer, ib::Integer, A::AbstractMatrix{T}, lda::Integer, T_matrix::AbstractMatrix{T}, C::AbstractMatrix{T}, work::AbstractMatrix{T}) where {T} + # Input validation with descriptive error messages + if side != 'L' && side != 'R' + throw(ArgumentError("side must be 'L' or 'R', got '$side'")) + end + + if side == 'L' + nq = m # Order of Q when applied from left + nw = n # Width for workspace + else + nq = n # Order of Q when applied from right + nw = m # Width for workspace + end + + if trans != 'N' && trans != 'C' && trans != 'T' + throw(ArgumentError("trans must be 'N', 'C', or 'T', got '$trans'")) + end + + if m < 0 + throw(ArgumentError("m must be non-negative, got $m")) + end + + if n < 0 + throw(ArgumentError("n must be non-negative, got $n")) + end + + if k < 0 || k > nq + throw(ArgumentError("k must satisfy 0 ≤ k ≤ $nq, got $k")) + end + + if ib < 0 + throw(ArgumentError("ib must be non-negative, got $ib")) + end + + if lda < max(1, nq) && nq > 0 + throw(ArgumentError("lda must be ≥ max(1,$nq), got $lda")) + end + + # Quick return for degenerate cases + if m == 0 || n == 0 || k == 0 + return + end + + # Determine order of applying reflector blocks + if ((side == 'L' && trans != 'N') || (side == 'R' && trans == 'N')) + # Apply blocks forward: 1, ib+1, 2*ib+1, ... 
+ i1 = 1 + i3 = ib + ibstop = k + else + # Apply blocks backward: ..., 2*ib+1, ib+1, 1 + i1 = div((k-1),ib)*ib + 1 + i3 = -ib + ibstop = 1 + end + + # Initialize submatrix indices + ic = 1 + jc = 1 + ni = n + mi = m + + # Allocate workspace for block operations + if side == 'L' + wwork = ones(eltype(A), n, ib) + ldw = n + else + wwork = ones(eltype(A), m, ib) + ldw = m + end + + # Apply blocks of elementary reflectors + for i in i1 : i3 : ibstop + kb = min(ib, k-i+1) # Size of current block + + if side == 'L' + # Apply to C[i:m, 1:n] + mi = m - i + 1 + ic = i + else + # Apply to C[1:m, i:n] + ni = n - i + 1 + jc = i + end + + # Get view of submatrix to transform cv = @view C[ic:m, jc:n] - larfb(side, trans, 'F', 'C', mi, ni, kb, (@view A[i:lda, i:i+kb-1]), lda-i+1, (@view T[1:kb, i:i+kb-1]), kb, cv, ldc, (@view wwork[:, 1:kb]), ldw) - end + # Apply current block of reflectors + larfb!(side, trans, 'F', 'C', mi, ni, kb, + (@view A[i:lda, i:i+kb-1]), lda-i+1, + (@view T_matrix[1:kb, i:i+kb-1]), + cv, (@view wwork[:, 1:kb])) + end +end + +""" + unmqr!(side, trans, A_qr, T, C, ib) -> C + +Apply orthogonal matrix Q from QR factorization to matrix C. + +This is a high-level interface that automatically determines dimensions and +allocates workspace to apply the orthogonal factor Q from a QR factorization +to a general matrix C. + +# Arguments +- `side`: Character specifying application side + - 'L': Apply Q from left (Q*C or Q^H*C) + - 'R': Apply Q from right (C*Q or C*Q^H) +- `trans`: Character specifying transpose operation + - 'N': Apply Q (no transpose) + - 'C': Apply Q^H (conjugate transpose) +- `A_qr`: QR factorization result from geqrt! (contains reflector vectors) +- `T_matrix`: Block reflector coefficient matrix from geqrt! 
+- `C`: Matrix to transform (modified in-place) +- `ib`: Block size used in QR factorization + +# Returns +- Modified matrix `C` after applying the orthogonal transformation + +# Input Validation +- Matrix dimensions must be compatible with the QR factorization +- Block size ib must be positive and consistent with T matrix dimensions +- For side='L': number of rows of C must match Q dimension +- For side='R': number of columns of C must match Q dimension + +# Example +```julia +# Apply Q from QR factorization to matrix C +m, n, k = 10, 8, 6 +ib = 4 +A = randn(ComplexF64, m, k) +A_qr, T, tau = geqrt!(copy(A), ib) +C = randn(ComplexF64, m, n) +unmqr!('L', 'N', A_qr, T, C, ib) # C := Q * C +``` + +# Algorithm +Uses the blocked compact WY representation to apply Q efficiently through +matrix-matrix operations rather than individual elementary reflectors. +""" +function unmqr!(side::Char, trans::Char, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, + C::AbstractMatrix{T}, ib::Integer) where {T} + m, n = size(C) + k = size(T_matrix, 2) + + # Validate input dimensions + if ib <= 0 + throw(ArgumentError("Block size ib must be positive, got $ib")) + end + + if side == 'L' + if size(A, 1) != m + throw(ArgumentError("For side='L', A_qr rows ($(size(A, 1))) must match C rows ($m)")) + end + if size(A, 2) < k + throw(ArgumentError("A_qr columns ($(size(A, 2))) must be ≥ k ($k)")) + end + else # side == 'R' + if size(A, 1) != n + throw(ArgumentError("For side='R', A_qr rows ($(size(A, 1))) must match C columns ($n)")) + end + if size(A, 2) < k + throw(ArgumentError("A_qr columns ($(size(A, 2))) must be ≥ k ($k)")) + end + end + + # Set leading dimensions + lda = max(1, size(A, 1)) + + # Allocate workspace based on side (matrix workspace expected by low-level) + if side == 'L' + work = zeros(T, n, ib) + else + work = zeros(T, m, ib) + end + + # Call the core computational routine + unmqr!(side, trans, m, n, k, ib, A, lda, T_matrix, C, work) end diff --git a/test/geqr2.jl 
b/test/geqr2.jl index a9c85d6..0230ad4 100644 --- a/test/geqr2.jl +++ b/test/geqr2.jl @@ -52,51 +52,46 @@ end for imat in 1:4 @testset "Matrix type $imat" begin A_orig = generate_qr_test_matrix(T, m, n, imat) - - # --- Reference Calculation --- - A_ref = copy(A_orig) - tau_ref = zeros(T, k) - A_ref = qr(A_ref).factors # --- NextLA Calculation --- A_test = copy(A_orig) - lda = max(1, m) tau_test = zeros(T, k) - work_test = zeros(T, n) # Work array size n for geqr2 - NextLA.geqr2(m, n, A_test, lda, tau_test, work_test) + work_test = zeros(T, n) # Work array size n for geqr2! + NextLA.geqr2!(m, n, A_test, tau_test, work_test) + + # --- Test Helper Function --- + A_helper = copy(A_orig) + tau_helper = zeros(T, k) + NextLA.geqr2!(A_helper, tau_helper) + + # Verify helper gives same results as kernel + @test A_helper ≈ A_test rtol=rtol atol=atol + if k > 0 + @test tau_helper ≈ tau_test rtol=rtol atol=atol + end # --- Comparisons --- if m == 0 || n == 0 @test size(A_test) == size(A_orig) else - # 1. Compare the factored matrix A (contains V and R) - scaled_rtol = rtol * max(1, m, n) - @test A_test ≈ A_ref rtol=scaled_rtol - - # 3. Mathematical property checks + #Mathematical property checks if k > 0 # Extract R from the factored matrix R_test = triu(A_test[1:k, 1:n]) - # Form Q using LAPACK's unmqr + # Form Q using LAPACK's unmqr! Q_test = Matrix{T}(I, m, m) - try - LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test) - - # Test 3a: Reconstruction. A_orig should be Q * R. - A_recon = Q_test[:, 1:k] * R_test - reconstruction_tol = rtol * max(1, m, n) * norm(A_orig) - @test A_orig ≈ A_recon rtol=reconstruction_tol + LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test) + + # Test 3a: Reconstruction. A_orig should be Q * R. + A_recon = Q_test[:, 1:k] * R_test + reconstruction_tol = rtol * max(1, m, n) * norm(A_orig) + @test A_orig ≈ A_recon - # Test 3b: Orthogonality of Q. Q' * Q should be Identity. 
- orthog_error = norm(Q_test' * Q_test - I) - orthog_tol = rtol * m - @test orthog_error < orthog_tol - catch e - # If LAPACK fails, just check basic properties - @test all(isfinite.(A_test)) - @test all(isfinite.(tau_test)) - end + # Test 3b: Orthogonality of Q. Q' * Q should be Identity. + orthog_error = norm(Q_test' * Q_test - I) + orthog_tol = rtol * m + @test orthog_error < orthog_tol # Additional checks @test all(isfinite.(A_test)) @@ -119,17 +114,16 @@ end # Test edge cases and error conditions m, n = 500, 300 A = rand(T, m, n) - lda = m tau = zeros(T, min(m, n)) work = zeros(T, n) # Valid call should not error - @test_nowarn NextLA.geqr2(m, n, copy(A), lda, copy(tau), copy(work)) + @test_nowarn NextLA.geqr2!(m, n, copy(A), copy(tau), copy(work)) # Zero dimensions should not error - @test_nowarn NextLA.geqr2(0, 0, zeros(T, 0, 0), 1, T[], T[]) - @test_nowarn NextLA.geqr2(0, 300, zeros(T, 0, 300), 1, T[], zeros(T, 300)) - @test_nowarn NextLA.geqr2(500, 0, zeros(T, 500, 0), 500, T[], T[]) + @test_nowarn NextLA.geqr2!(0, 0, zeros(T, 0, 0), T[], T[]) + @test_nowarn NextLA.geqr2!(0, 300, zeros(T, 0, 300), T[], zeros(T, 300)) + @test_nowarn NextLA.geqr2!(500, 0, zeros(T, 500, 0), T[], T[]) end end end @@ -153,10 +147,10 @@ end # CPU reference A_cpu_result = copy(A_cpu) - NextLA.geqr2(m, n, A_cpu_result, m, tau_cpu, work_cpu) + NextLA.geqr2!(m, n, A_cpu_result, tau_cpu, work_cpu) # GPU test - NextLA.geqr2(m, n, A_gpu, m, tau_gpu, work_gpu) + NextLA.geqr2!(m, n, A_gpu, tau_gpu, work_gpu) # Compare results @test Array(A_gpu) ≈ A_cpu_result rtol=rtol diff --git a/test/geqrt.jl b/test/geqrt.jl index 71ed18e..cf5f602 100644 --- a/test/geqrt.jl +++ b/test/geqrt.jl @@ -3,40 +3,11 @@ using NextLA using LinearAlgebra, LinearAlgebra.LAPACK using Random -# Function signature: geqrt(m, n, ib, A, lda, T, ldt, tau, work) +# Function signature: geqrt!(m, n, ib, A, lda, T, ldt, tau, work) const GEQRT_TYPES = [ComplexF32, ComplexF64, Float32, Float64] const GEQRT_SIZES = [(0,0), 
(100,100), (200,100), (100,200), (400,300), (800,600), (150,100), (200,150)] const GEQRT_BLOCKSIZES = [100, 200, 400, 800] -function generate_qr_test_matrix(::Type{T}, m, n, imat=1) where T - if m == 0 || n == 0 - return zeros(T, m, n) - end - - # Use the matrix generation from runtests.jl - if imat == 1 - # Well-conditioned random matrix - return matrix_generation(T, m, n, mode=:decay, cndnum=2.0) - elseif imat == 2 - # Moderately ill-conditioned - return matrix_generation(T, m, n, mode=:decay, cndnum=1e2) - elseif imat == 3 - # Severely ill-conditioned - return matrix_generation(T, m, n, mode=:one_large, cndnum=1e6) - elseif imat == 4 - # Random matrix - return rand(T, m, n) - else - # Identity-like matrix - A = zeros(T, m, n) - k = min(m, n) - for i in 1:k - A[i, i] = one(T) - end - return A - end -end - @testset "GEQRT Tests" begin @testset "Blocked QR Factorization Tests" begin for (itype, T) in enumerate(GEQRT_TYPES) @@ -74,13 +45,24 @@ end # --- NextLA Blocked QR --- A_test = copy(A_orig) - lda = max(1, m) T_test = zeros(T, max(1,ib), k) # Block reflector matrix - ldt = max(1, ib) tau_test = zeros(T, k) work_test = zeros(T, ib * n) # Work array - NextLA.geqrt(m, n, ib, A_test, lda, T_test, ldt, tau_test, work_test) + NextLA.geqrt!(m, n, ib, A_test, T_test, tau_test, work_test) + + # --- Test Helper Function --- + A_helper = copy(A_orig) + T_helper = zeros(T, max(1, ib), k) + tau_helper = zeros(T, k) + NextLA.geqrt!(ib, A_helper, T_helper, tau_helper) + + # Verify helper gives same results as kernel (in-place) + if k > 0 + @test A_helper ≈ A_test rtol=rtol atol=atol + @test T_helper[1:ib, 1:k] ≈ T_test[1:ib, 1:k] rtol=rtol atol=atol + @test tau_helper ≈ tau_test rtol=rtol atol=atol + end # --- Comparisons --- if m == 0 || n == 0 @@ -96,31 +78,27 @@ end # For small matrices, verify reconstruction - if m <= 200 && n <= 200 - # Extract R from the factored matrix - R_test = triu(A_test[1:k, 1:n]) - - # Form Q using LAPACK's unmqr - Q_test = Matrix{T}(I, m, m) - 
LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test) - - # Test 3a: Reconstruction. A_orig should be Q * R. - A_recon = Q_test[:, 1:k] * R_test - _, R = qr(A_orig) - reconstruction_tol = rtol * max(1, m, n) * norm(A_orig) - @test A_orig ≈ A_recon rtol=reconstruction_tol - @test norm(R - R_test) < reconstruction_tol - - # Test 3b: Orthogonality of Q. Q' * Q should be Identity. - orthog_error = norm(adjoint(Q_test) * Q_test - I) - orthog_tol = rtol * m - @test orthog_error < orthog_tol + # Extract R from the factored matrix + R_test = triu(A_test[1:k, 1:n]) - # Additional checks - @test all(isfinite.(A_test)) - @test all(isfinite.(tau_test)) - @test size(A_test) == size(A_orig) - end + # Form Q using LAPACK's unmqr! + Q_test = Matrix{T}(I, m, m) + LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test) + + # Test 3a: Reconstruction. A_orig should be Q * R. + A_recon = Q_test[:, 1:k] * R_test + reconstruction_tol = rtol * max(1, m, n) * norm(A_orig) + @test A_orig ≈ A_recon rtol=reconstruction_tol + + # Test 3b: Orthogonality of Q. Q' * Q should be Identity. 
+ orthog_error = norm(adjoint(Q_test) * Q_test - I) + orthog_tol = rtol * m + @test orthog_error < orthog_tol + + # Additional checks + @test all(isfinite.(A_test)) + @test all(isfinite.(tau_test)) + @test size(A_test) == size(A_orig) end end @@ -134,100 +112,33 @@ end end end - @testset "Square Matrix Tests" begin - n = 16 - ib = 4 - A = rand(ComplexF64, n, n) - A_original = copy(A) - lda = n - T = zeros(ComplexF64, ib, n) - ldt = ib - tau = zeros(ComplexF64, n) - work = zeros(ComplexF64, ib * n) - - NextLA.geqrt(n, n, ib, A, lda, T, ldt, tau, work) - - # For square matrices, check complete factorization - R_our = triu(A) - - # Compare with Julia's QR - Q_ref, R_ref = qr(A_original) - R_ref_mat = Matrix(R_ref) - - @test norm(R_our - R_ref_mat) < 1e-10 - end - - @testset "Tall Matrix Tests" begin - m, n, ib = 30, 15, 5 - A = rand(ComplexF64, m, n) - A_original = copy(A) - lda = m - T = zeros(ComplexF64, ib, n) - ldt = ib - tau = zeros(ComplexF64, n) - work = zeros(ComplexF64, ib * n) - - NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work) - k = min(m, n) - R_our = triu(A[1:k, 1:k]) - - Q_ref, R_ref = qr(A_original) - R_ref_mat = Matrix(R_ref) - - # Check upper triangular structure - @test norm(R_our - R_ref_mat) < 1e-10 - end - - @testset "Wide Matrix Tests" begin - m, n, ib = 15, 25, 5 - A = rand(ComplexF64, m, n) - A_original = copy(A) - lda = m - T = zeros(ComplexF64, ib, m) - ldt = ib - tau = zeros(ComplexF64, m) - work = zeros(ComplexF64, ib * n) - - NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work) - - R_our = triu(A) - - Q_ref, R_ref = qr(A_original) - R_ref_mat = Matrix(R_ref) - @test norm(R_our - R_ref_mat) < 1e-10 - end - @testset "Edge Cases" begin # Test with ib = 1 (should behave like unblocked QR) m, n, ib = 10, 8, 1 A = rand(ComplexF64, m, n) A_original = copy(A) - lda = m T = zeros(ComplexF64, ib, min(m, n)) - ldt = ib tau = zeros(ComplexF64, min(m, n)) work = zeros(ComplexF64, ib * n) - NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work) + 
NextLA.geqrt!(m, n, ib, A, T, tau, work) # Compare with unblocked version A_unblocked = copy(A_original) tau_unblocked = zeros(ComplexF64, min(m, n)) work_unblocked = zeros(ComplexF64, n) - NextLA.geqr2(m, n, A_unblocked, lda, tau_unblocked, work_unblocked) + NextLA.geqr2!(m, n, A_unblocked, tau_unblocked, work_unblocked) @test A ≈ A_unblocked rtol=1e-10 # Test with very small matrices m, n, ib = 3, 2, 1 A = rand(ComplexF64, m, n) - lda = m T = zeros(ComplexF64, ib, min(m, n)) - ldt = ib tau = zeros(ComplexF64, min(m, n)) work = zeros(ComplexF64, ib * n) - NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work) + NextLA.geqrt!(m, n, ib, A, T, tau, work) # Should not crash @test all(isfinite.(A)) @@ -237,16 +148,12 @@ end @testset "Error Handling" begin # Test negative dimensions - @test_throws ArgumentError NextLA.geqrt(-1, 5, 2, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10)) - @test_throws ArgumentError NextLA.geqrt(5, -1, 2, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10)) + @test_throws ArgumentError NextLA.geqrt!(-1, 5, 2, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10)) + @test_throws ArgumentError NextLA.geqrt!(5, -1, 2, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10)) # Test invalid block size - @test_throws ArgumentError NextLA.geqrt(5, 5, -1, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10)) - @test_throws ArgumentError NextLA.geqrt(5, 5, 0, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10)) - - # Test invalid leading dimensions - @test_throws ArgumentError NextLA.geqrt(5, 5, 2, zeros(ComplexF64, 5, 5), 3, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10)) - @test_throws ArgumentError NextLA.geqrt(5, 5, 2, zeros(ComplexF64, 5, 5), 5, 
zeros(ComplexF64, 2, 5), 1, zeros(ComplexF64, 5), zeros(ComplexF64, 10)) + @test_throws ArgumentError NextLA.geqrt!(5, 5, -1, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10)) + @test_throws ArgumentError NextLA.geqrt!(5, 5, 0, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10)) end @testset "Consistency Tests" begin @@ -256,19 +163,17 @@ end # First application A1 = copy(A) - lda = m T1 = zeros(ComplexF64, ib, min(m, n)) - ldt = ib tau1 = zeros(ComplexF64, min(m, n)) work1 = zeros(ComplexF64, ib * n) - NextLA.geqrt(m, n, ib, A1, lda, T1, ldt, tau1, work1) + NextLA.geqrt!(m, n, ib, A1, T1, tau1, work1) # Second application A2 = copy(A) T2 = zeros(ComplexF64, ib, min(m, n)) tau2 = zeros(ComplexF64, min(m, n)) work2 = zeros(ComplexF64, ib * n) - NextLA.geqrt(m, n, ib, A2, lda, T2, ldt, tau2, work2) + NextLA.geqrt!(m, n, ib, A2, T2, tau2, work2) @test A1 ≈ A2 rtol=1e-12 @test T1 ≈ T2 rtol=1e-12 @@ -281,9 +186,7 @@ end # Create CPU data A_cpu = rand(ComplexF32, m, n) - lda = m T_cpu = zeros(ComplexF32, ib, min(m, n)) - ldt = ib tau_cpu = zeros(ComplexF32, min(m, n)) work_cpu = zeros(ComplexF32, ib * n) @@ -297,10 +200,10 @@ end A_cpu_result = copy(A_cpu) T_cpu_result = copy(T_cpu) tau_cpu_result = copy(tau_cpu) - NextLA.geqrt(m, n, ib, A_cpu_result, lda, T_cpu_result, ldt, tau_cpu_result, work_cpu) + NextLA.geqrt!(m, n, ib, A_cpu_result, T_cpu_result, tau_cpu_result, work_cpu) # Apply on GPU - NextLA.geqrt(m, n, ib, A_gpu, lda, T_gpu, ldt, tau_gpu, work_gpu) + NextLA.geqrt!(m, n, ib, A_gpu, T_gpu, tau_gpu, work_gpu) @test Array(A_gpu) ≈ A_cpu_result rtol=1e-6 @test Array(T_gpu) ≈ T_cpu_result rtol=1e-6 diff --git a/test/larf.jl b/test/larf.jl index a91a8c4..ed0f837 100644 --- a/test/larf.jl +++ b/test/larf.jl @@ -121,10 +121,17 @@ end # Determine work array size work_size = side == 'L' ? 
n : m - work = zeros(T, work_size, 1) + work = zeros(T, work_size) - # NextLA call: larf(side, m, n, v, incv, tau, c, ldc, work) - NextLA.larf(side, m, n, v, 1, tau, C_test, work) + # NextLA call: larf!(side, m, n, v, incv, tau, c, ldc, work) + NextLA.larf!(side, m, n, v, 1, tau, C_test, work) + + # --- Test Helper Function --- + C_helper = copy(C_orig) + NextLA.larf!(side, v, 1, tau, C_helper) + + # Verify helper gives same results as kernel (in-place) + @test C_helper ≈ C_test rtol=rtol # Basic checks @test all(isfinite.(C_test)) @@ -156,14 +163,14 @@ end C = randn(T, m, n) v = randn(T, max(m, n)) tau = T(0.5) - work = zeros(T, max(m, n), 1) + work = zeros(T, max(m, n)) - @test_nowarn NextLA.larf('L', m, n, v, 1, tau, C, work) - @test_nowarn NextLA.larf('R', m, n, v, 1, tau, C, work) + @test_nowarn NextLA.larf!('L', m, n, v, 1, tau, C, work) + @test_nowarn NextLA.larf!('R', m, n, v, 1, tau, C, work) # Test edge cases - @test_nowarn NextLA.larf('L', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[T(0);;]) - @test_nowarn NextLA.larf('R', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[T(0);;]) + @test_nowarn NextLA.larf!('L', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[0]) + @test_nowarn NextLA.larf!('R', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[0]) end end end @@ -183,12 +190,12 @@ end C = T.(scale .* randn(T, m, n)) v = T.(scale .* randn(T, side == 'L' ? 
m : n)) tau = T(scale * randn(T)) - work = zeros(T, max(m, n), 1) + work = zeros(T, max(m, n)) C_orig = copy(C) # Test calculation - NextLA.larf(side, m, n, v, 1, tau, C, work) + NextLA.larf!(side, m, n, v, 1, tau, C, work) # Check that results are finite @test all(isfinite.(C)) @@ -213,7 +220,7 @@ end C_cpu = T.(randn(T, m, n)) v_cpu = T.(randn(T, max(m, n))) tau_cpu = T(randn(T)) - work_cpu = zeros(T, max(m, n), 1) + work_cpu = zeros(T, max(m, n)) # Move data to GPU C_gpu = CuArray(C_cpu) @@ -223,10 +230,10 @@ end # Reference CPU calculation C_ref = copy(C_cpu) work_ref = copy(work_cpu) - NextLA.larf(side, m, n, v_cpu, 1, tau_cpu, C_ref, m, work_ref) + NextLA.larf!(side, m, n, v_cpu, 1, tau_cpu, C_ref, work_ref) # Our implementation on GPU - NextLA.larf(side, m, n, v_gpu, 1, tau_cpu, C_gpu, m, work_gpu) + NextLA.larf!(side, m, n, v_gpu, 1, tau_cpu, C_gpu, work_gpu) # Compare results @test norm(Array(C_gpu) - C_ref) < rtol * max(1, norm(C_ref)) diff --git a/test/larfb.jl b/test/larfb.jl index bad4db6..cdd5502 100644 --- a/test/larfb.jl +++ b/test/larfb.jl @@ -88,22 +88,23 @@ end C_test = copy(C_orig) C_ref = copy(C_orig) - # Set leading dimensions - ldv = size(V, 1) - ldt = k - ldc = m - ldwork = size(work, 1) + # NextLA call: larfb!(side, trans, direct, storev, m, n, k, V, ldv, T, C, work) + NextLA.larfb!(side, trans, direct, storev, m, n, k, V, size(V,1), T_mat, C_test, work) - # NextLA call: larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, work, ldwork) - NextLA.larfb(side, trans, direct, storev, m, n, k, V, ldv, T_mat, ldt, C_test, ldc, work, ldwork) + # --- Test Helper Function --- + C_helper = copy(C_orig) + NextLA.larfb!(side, trans, direct, storev, V, T_mat, C_helper) + + # Verify helper gives same results as kernel + @test C_helper ≈ C_test rtol=rtol # Basic checks @test all(isfinite.(C_test)) @test size(C_test) == (m, n) @test all(isfinite.(work)) - NextLA.larfb('L', 'N', direct, storev, m, n, k, V, ldv, T_mat, ldt, C_ref, ldc, work, 
ldwork) - NextLA.larfb('L', 'C', direct, storev, m, n, k, V, ldv, T_mat, ldt, C_ref, ldc, work, ldwork) + NextLA.larfb!('L', 'N', direct, storev, m, n, k, V, size(V,1), T_mat, C_ref, work) + NextLA.larfb!('L', 'C', direct, storev, m, n, k, V, size(V,1), T_mat, C_ref, work) # Mathematical validation @test norm(C_ref - C_orig) / norm(C_orig) < rtol @@ -138,15 +139,15 @@ end C = randn(T, m, n) work = zeros(T, n, k) - @test_nowarn NextLA.larfb('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, k, C, m, work, n) + @test_nowarn NextLA.larfb!('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, C, work) # Test edge cases - @test_nowarn NextLA.larfb('L', 'N', 'F', 'C', 0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1) # m = n = k = 0 - @test_nowarn NextLA.larfb('L', 'N', 'F', 'C', 1, 1, 0, zeros(T, 1, 0), 1, zeros(T, 0, 0), 1, randn(T, 1, 1), 1, zeros(T, 1, 0), 1) # k = 0 + @test_nowarn NextLA.larfb!('L', 'N', 'F', 'C', 0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), zeros(T, 0, 0)) # m = n = k = 0 + @test_nowarn NextLA.larfb!('L', 'N', 'F', 'C', 1, 1, 0, zeros(T, 1, 0), 1, zeros(T, 0, 0), randn(T, 1, 1), zeros(T, 1, 0)) # k = 0 # Test different side/storev combinations - @test_nowarn NextLA.larfb('R', 'N', 'F', 'C', m, n, k, randn(T, n, k), n, T_mat, k, copy(C), m, zeros(T, m, k), m) # Right side - @test_nowarn NextLA.larfb('L', 'C', 'B', 'R', m, n, k, randn(T, k, m), k, T_mat, k, copy(C), m, zeros(T, n, k), n) # Row-wise storage + @test_nowarn NextLA.larfb!('R', 'N', 'F', 'C', m, n, k, randn(T, n, k), n, T_mat, copy(C), zeros(T, m, k)) # Right side + @test_nowarn NextLA.larfb!('L', 'C', 'B', 'R', m, n, k, randn(T, k, m), k, T_mat, copy(C), zeros(T, n, k)) # Row-wise storage end end end @@ -167,7 +168,7 @@ end work = zeros(T, n, k) # Test calculation - NextLA.larfb('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, k, C, m, work, n) + NextLA.larfb!('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, C, work) # Check that results are finite @test all(isfinite.(C)) @@ -180,7 +181,7 
@@ end work_test = zeros(T, side == 'L' ? n : m, k) V_test = side == 'L' ? V : T.(scale .* randn(ComplexF64, n, k)) - NextLA.larfb(side, trans, 'F', 'C', m, n, k, V_test, size(V_test, 1), T_mat, k, C_test, m, work_test, size(work_test, 1)) + NextLA.larfb!(side, trans, 'F', 'C', m, n, k, V_test, size(V_test, 1), T_mat, C_test, work_test) @test all(isfinite.(C_test)) @test all(isfinite.(work_test)) @@ -219,10 +220,10 @@ end # Reference CPU calculation C_ref = copy(C_cpu) work_ref = copy(work_cpu) - NextLA.larfb(side, trans, 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, k, C_ref, m, work_ref, size(work_ref, 1)) + NextLA.larfb!(side, trans, 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, C_ref, work_ref) # GPU calculation - NextLA.larfb(side, trans, 'F', 'C', m, n, k, V_gpu, size(V_gpu, 1), T_gpu, k, C_gpu, m, work_gpu, size(work_gpu, 1)) + NextLA.larfb!(side, trans, 'F', 'C', m, n, k, V_gpu, size(V_gpu, 1), T_gpu, C_gpu, work_gpu) # Compare results @test norm(Array(C_gpu) - C_ref) < rtol * max(1, norm(C_ref)) @@ -266,10 +267,10 @@ end # Reference CPU calculation C_ref = copy(C_cpu) work_ref = copy(work_cpu) - NextLA.larfb(side, 'N', 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, k, C_ref, m, work_ref, size(work_ref, 1)) + NextLA.larfb!(side, 'N', 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, C_ref, work_ref) # ROCm calculation - NextLA.larfb(side, 'N', 'F', 'C', m, n, k, V_rocm, size(V_rocm, 1), T_rocm, k, C_rocm, m, work_rocm, size(work_rocm, 1)) + NextLA.larfb!(side, 'N', 'F', 'C', m, n, k, V_rocm, size(V_rocm, 1), T_rocm, C_rocm, work_rocm) # Compare results @test norm(Array(C_rocm) - C_ref) < rtol * max(1, norm(C_ref)) diff --git a/test/larfg.jl b/test/larfg.jl index d057196..1eba28d 100644 --- a/test/larfg.jl +++ b/test/larfg.jl @@ -49,7 +49,7 @@ for (larfg, elty) in # Test NextLA implementation x_nextla = copy(x_orig) - alpha_nextla, tau_nextla = NextLA.larfg(n, alpha_orig, x_nextla, 1, zero(T)) + alpha_nextla, tau_nextla = NextLA.larfg!(n, alpha_orig, 
x_nextla, 1, zero(T)) # Test LAPACK reference if n > 0 @@ -57,17 +57,24 @@ for (larfg, elty) in tau_lapack, alpha_lapack = larfg_our!(lapack_vec) x_lapack = lapack_vec[2:end] - # Compare results (allowing for sign differences) - if abs(abs(tau_nextla) - abs(tau_lapack)) > rtol * max(1, abs(tau_lapack)) - @show tau_nextla, tau_lapack - end - @test abs(abs(tau_nextla) - abs(tau_lapack)) < rtol * max(1, abs(tau_lapack)) - if abs(abs(alpha_nextla) - abs(alpha_lapack)) > rtol * max(1, abs(alpha_lapack)) - @show alpha_nextla, alpha_lapack - end - @test abs(abs(alpha_nextla) - abs(alpha_lapack)) < rtol * max(1, abs(alpha_lapack)) - if length(x_orig) > 0 - @test norm(abs.(x_nextla) - abs.(x_lapack)) < rtol * max(1, norm(x_lapack)) + # For n==1, NextLA defines tau≈0; LAPACK may return nonzero. Accept tau≈0. + if n == 1 + @test abs(tau_nextla) ≤ (T <: ComplexF32 ? 1e-6 : 1e-12) + # alpha magnitude should match LAPACK + @test abs(abs(alpha_nextla) - abs(alpha_lapack)) < rtol * max(1, abs(alpha_lapack)) + else + # Compare results (allowing for sign differences) + if abs(abs(tau_nextla) - abs(tau_lapack)) > rtol * max(1, abs(tau_lapack)) + @show tau_nextla, tau_lapack + end + @test abs(abs(tau_nextla) - abs(tau_lapack)) < rtol * max(1, abs(tau_lapack)) + if abs(abs(alpha_nextla) - abs(alpha_lapack)) > rtol * max(1, abs(alpha_lapack)) + @show alpha_nextla, alpha_lapack + end + @test abs(abs(alpha_nextla) - abs(alpha_lapack)) < rtol * max(1, abs(alpha_lapack)) + if length(x_orig) > 0 + @test norm(abs.(x_nextla) - abs.(x_lapack)) < rtol * max(1, norm(x_lapack)) + end end end @@ -85,17 +92,17 @@ for (larfg, elty) in for T in [ComplexF32, ComplexF64] @testset "Type $T edge cases" begin # Test n=0 case - alpha_nextla, tau_nextla = NextLA.larfg(0, T(1), T[], 1, zero(T)) + alpha_nextla, tau_nextla = NextLA.larfg!(0, T(1), T[], 1, zero(T)) @test tau_nextla == 0 @test alpha_nextla == T(1) # Test n=1 case - alpha_nextla, tau_nextla = NextLA.larfg(1, T(2), T[], 1, zero(T)) + alpha_nextla, 
tau_nextla = NextLA.larfg!(1, T(2), T[], 1, zero(T)) @test abs(tau_nextla) < 1e-10 @test abs(alpha_nextla - T(2)) < 1e-10 - # Test zero vector - alpha_nextla, tau_nextla = NextLA.larfg(3, T(0), T[0, 0], 1, zero(T)) + # Test zero vector (n=3, x has length 2) + alpha_nextla, tau_nextla = NextLA.larfg!(3, T(0), zeros(T, 2), 1, zero(T)) @test isfinite(alpha_nextla) @test isfinite(tau_nextla) end diff --git a/test/larft.jl b/test/larft.jl index 7741b32..6ccd599 100644 --- a/test/larft.jl +++ b/test/larft.jl @@ -86,8 +86,8 @@ end T_mat = zeros(T, k, k) ldt = k - # NextLA call: larft(direct, storev, n, k, v, ldv, tau, t, ldt) - NextLA.larft(direct, storev, n, k, V, ldv, tau, T_mat, ldt) + # NextLA call: larft!(direct, storev, n, k, V, tau, T_mat) + NextLA.larft!(direct, storev, n, k, V, tau, T_mat) # Basic checks @test all(isfinite.(T_mat)) @@ -159,12 +159,12 @@ end tau = randn(T, k) T_mat = zeros(T, k, k) - @test_nowarn NextLA.larft('F', 'C', n, k, V, n, tau, T_mat, k) + @test_nowarn NextLA.larft!('F', 'C', n, k, V, tau, T_mat) # Test edge cases - @test_nowarn NextLA.larft('F', 'C', 0, 0, zeros(T, 0, 0), 1, T[], zeros(T, 0, 0), 1) # n = 0, k = 0 - @test_nowarn NextLA.larft('F', 'C', 1, 0, zeros(T, 1, 0), 1, T[], zeros(T, 0, 0), 1) # k = 0 - @test_nowarn NextLA.larft('F', 'C', 0, 1, zeros(T, 0, 1), 1, T[T(0)], zeros(T, 1, 1), 1) # n = 0 + @test_nowarn NextLA.larft!('F', 'C', 0, 0, zeros(T, 0, 0), T[], zeros(T, 0, 0)) # n = 0, k = 0 + @test_nowarn NextLA.larft!('F', 'C', 1, 0, zeros(T, 1, 0), T[], zeros(T, 0, 0)) # k = 0 + @test_nowarn NextLA.larft!('F', 'C', 0, 1, zeros(T, 0, 1), T[T(0)], zeros(T, 1, 1)) # n = 0 end end end @@ -217,10 +217,10 @@ end # Reference CPU calculation T_ref = zeros(T, k, k) - NextLA.larft('F', 'C', n, k, V_cpu, n, tau_cpu, T_ref, k) + NextLA.larft!('F', 'C', n, k, V_cpu, tau_cpu, T_ref) # Our implementation on GPU - NextLA.larft('F', 'C', n, k, V_gpu, n, tau_gpu, T_gpu, k) + NextLA.larft!('F', 'C', n, k, V_gpu, tau_gpu, T_gpu) # Compare results 
@test norm(Array(T_gpu) - T_ref) < rtol * max(1, norm(T_ref)) diff --git a/test/lauum.jl b/test/lauum.jl index c96d0af..30a8b19 100644 --- a/test/lauum.jl +++ b/test/lauum.jl @@ -1,4 +1,8 @@ -@testset "lauum test" begin +using Test +using NextLA +using LinearAlgebra + +@testset "NextLA.lauum! test" begin for T in [Float32, Float64, ComplexF32, ComplexF64] for uplo in ['U', 'L'] # Test different matrix sizes including edge cases @@ -13,8 +17,8 @@ A = Matrix(LowerTriangular(-0.5 .+ rand(T, n, n))) end Ac = copy(A) - info = lauum(uplo, n, A, n, block_size) - @test info == 0 # Ensure no error from lauum + NextLA.lauum!(uplo, n, A, block_size) + # @test info == 0 # Function now returns nothing instead of error code # Set tolerance based on type tolerance = T <: Union{Float64, ComplexF64} ? 1e-12 : 1e-6 if uplo == 'U' @@ -22,7 +26,7 @@ result_diff = norm(Matrix(A) - expected_result) / n @test result_diff < tolerance # Use adjusted tolerance if result_diff >= tolerance - println("Failure in lauum test for T: $T, uplo: $uplo, n: $n, block_size: $block_size") + println("Failure in NextLA.lauum! test for T: $T, uplo: $uplo, n: $n, block_size: $block_size") println("Difference norm: $result_diff") end else @@ -30,7 +34,7 @@ result_diff = norm(Matrix(A) - expected_result) / n @test result_diff < tolerance # Use adjusted tolerance if result_diff >= tolerance - println("Failure in lauum test for T: $T, uplo: $uplo, n: $n, block_size: $block_size") + println("Failure in NextLA.lauum! 
test for T: $T, uplo: $uplo, n: $n, block_size: $block_size") println("Difference norm: $result_diff") end end diff --git a/test/pamm.jl b/test/pamm.jl index 7cb7e39..aea06a2 100644 --- a/test/pamm.jl +++ b/test/pamm.jl @@ -19,13 +19,8 @@ using CUDA V_original = copy(V) W_original = copy(W) - lda1 = k - lda2 = m - ldv = m - ldw = n - - # Apply our PAMM - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + # Apply our PAMM + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, A2, V, W) # Basic checks @test size(W) == (n, l) @@ -43,12 +38,7 @@ using CUDA V = rand(ComplexF64, n, l) W = rand(ComplexF64, m, l) - lda1 = k - lda2 = n - ldv = n - ldw = m - - NextLA.pamm('W', 'R', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'R', 'C', 'F', m, n, k, l, A1, A2, V, W) @test size(W) == (m, l) @test all(isfinite.(W)) @@ -64,13 +54,8 @@ using CUDA A2_original = copy(A2) - lda1 = k - lda2 = m - ldv = m - ldw = n - # Apply A operation - NextLA.pamm('A', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('A', 'L', 'C', 'F', m, n, k, l, A1, A2, V, W) @test size(A2) == (m, k) @test all(isfinite.(A2)) @@ -87,12 +72,7 @@ using CUDA V = rand(ComplexF64, m, l) W = rand(ComplexF64, n, l) - lda1 = k - lda2 = m - ldv = m - ldw = n - - NextLA.pamm('W', 'L', 'C', 'B', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'L', 'C', 'B', m, n, k, l, A1, A2, V, W) @test all(isfinite.(W)) end @@ -105,12 +85,7 @@ using CUDA V = rand(ComplexF64, l, m) # Row-wise storage W = rand(ComplexF64, l, n) # Row-wise storage - lda1 = k - lda2 = m - ldv = l - ldw = l - - NextLA.pamm('W', 'L', 'R', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'L', 'R', 'F', m, n, k, l, A1, A2, V, W) @test all(isfinite.(W)) end @@ -125,12 +100,7 @@ using CUDA W_original = copy(W) - lda1 = k - lda2 = m - ldv = m - ldw = n - - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, 
ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, A2, V, W) @test all(isfinite.(W)) @test !isapprox(W, W_original, rtol=1e-6) @@ -157,7 +127,7 @@ using CUDA ldv = m ldw = n - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) @test all(isfinite.(W)) @test size(W) == (n, l) @@ -178,7 +148,7 @@ using CUDA ldv = m ldw = n - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) @test all(isfinite.(W)) @@ -189,7 +159,7 @@ using CUDA V = rand(ComplexF64, m, l) W = rand(ComplexF64, n, l) - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw) @test all(isfinite.(W)) end @@ -229,16 +199,11 @@ using CUDA V = rand(ComplexF64, m, l) W = rand(ComplexF64, n, l) - lda1 = k - lda2 = m - ldv = m - ldw = n - # Apply W operation - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2_w, lda2, V, ldv, W, ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, A2_w, V, W) # Apply A operation with same input - NextLA.pamm('A', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2_a, lda2, V, ldv, W, ldw) + NextLA.pamm!('A', 'L', 'C', 'F', m, n, k, l, A1, A2_a, V, W) # Results should be finite and well-defined @test all(isfinite.(W)) @@ -256,22 +221,17 @@ using CUDA W_cpu = rand(ComplexF32, n, l) lda1 = k - lda2 = m - ldv = m - ldw = n - - # Create GPU data A1_gpu = CuArray(A1_cpu) - A2_gpu = CuArray(A2_cpu) - V_gpu = CuArray(V_cpu) + W_cpu_result = copy(W_cpu) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, A2_cpu, V_cpu, W_cpu_result) W_gpu = CuArray(W_cpu) # Apply on CPU W_cpu_result = copy(W_cpu) - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, lda1, A2_cpu, lda2, V_cpu, ldv, W_cpu_result, ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, 
lda1, A2_cpu, lda2, V_cpu, ldv, W_cpu_result, ldw) # Apply on GPU - NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1_gpu, lda1, A2_gpu, lda2, V_gpu, ldv, W_gpu, ldw) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_gpu, A2_gpu, V_gpu, W_gpu) @test Array(W_gpu) ≈ W_cpu_result rtol=1e-6 end diff --git a/test/parfb.jl b/test/parfb.jl index 60c4f32..d192642 100644 --- a/test/parfb.jl +++ b/test/parfb.jl @@ -57,7 +57,7 @@ function lapack_tprfb!(::Type{T}, side::AbstractChar, trans::AbstractChar, direc end -# LAPACK-style test parameters for NextLA.parfb +# LAPACK-style test parameters for NextLA.parfb! const PARFB_TYPES = [ComplexF32, ComplexF64, Float32, Float64] # Format: (m1, n1, m2, n2, k, l) where: # - For side='L': n1 == n2 (same number of columns) @@ -132,13 +132,11 @@ const PARFB_SIZES = [ Tee = rand(T, k, k) ldt = k - # Work array dimensions based on SIDE + # Work array dimensions based on SIDE (2D workspace) if side == 'L' - work = rand(T, k, n1) # WORK is K-by-N when SIDE='L' - ldw = k + work = rand(T, k, n2) # WORK is K-by-n2 when SIDE='L' else - work = rand(T, m1, k) # WORK is M-by-K when SIDE='R' - ldw = m1 + work = rand(T, m2, k) # WORK is m2-by-K when SIDE='R' end # Make copies for testing @@ -153,9 +151,9 @@ const PARFB_SIZES = [ work_l = lapack_tprfb!(T, side, trans, direct, storev, l, V, Tee, A1_l, A2_l) - # NextLA call: parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l, A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork) - NextLA.parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l, - A1_test, lda1, A2_test, lda2, V_test, ldv, T_test, ldt, work, ldw) + # NextLA call with simplified signature (no ld*), workspace as matrix + NextLA.parfb!(side, trans, direct, storev, m1, n1, m2, n2, k, l, + A1_test, A2_test, V_test, T_test, work) @@ -208,10 +206,10 @@ const PARFB_SIZES = [ A2 = randn(T, m2, n2) V = randn(T, m2, k) # For side='L', V has m2 rows T_mat = triu(randn(T, k, k)) - work = zeros(T, k, n1) + work = zeros(T, k, n2) - @test_nowarn 
NextLA.parfb('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l, - A1, m1, A2, m2, V, m2, T_mat, k, work, k) + @test_nowarn NextLA.parfb!('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l, + A1, A2, V, T_mat, work) # Test with valid parameters for side='R' case: m1 == m2 m1, n1, m2, n2, k, l = 600, 500, 600, 400, 300, 200 @@ -219,20 +217,18 @@ const PARFB_SIZES = [ A2 = randn(T, m2, n2) V = randn(T, n2, k) # For side='R', V has n2 rows T_mat = triu(randn(T, k, k)) - work = zeros(T, m1, k) + work = zeros(T, m2, k) - @test_nowarn NextLA.parfb('R', 'N', 'F', 'C', m1, n1, m2, n2, k, l, - A1, m1, A2, m2, V, n2, T_mat, k, work, m1) + @test_nowarn NextLA.parfb!('R', 'N', 'F', 'C', m1, n1, m2, n2, k, l, + A1, A2, V, T_mat, work) # Test edge cases - @test_nowarn NextLA.parfb('L', 'N', 'F', 'C', 0, 0, 0, 0, 0, 0, - zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, - zeros(T, 0, 0), 1, T[], 1) + @test_nowarn NextLA.parfb!('L', 'N', 'F', 'C', 0, 0, 0, 0, 0, 0, + zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0)) # Test with k=0 (valid for both sides) - @test_nowarn NextLA.parfb('L', 'N', 'F', 'C', 2, 2, 2, 2, 0, 0, - randn(T, 2, 2), 2, randn(T, 2, 2), 2, zeros(T, 2, 0), 2, - zeros(T, 0, 0), 1, T[], 1) + @test_nowarn NextLA.parfb!('L', 'N', 'F', 'C', 2, 2, 2, 2, 0, 0, + randn(T, 2, 2), randn(T, 2, 2), zeros(T, 2, 0), zeros(T, 0, 0), zeros(T, 0, 0)) end end end @@ -252,7 +248,7 @@ const PARFB_SIZES = [ A2 = T.(scale .* randn(ComplexF64, m2, n2)) V = T.(scale .* randn(ComplexF64, m2, k)) # For side='L', V has m2 rows T_mat = triu(T.(scale .* randn(ComplexF64, k, k))) - work = zeros(T, k, n1) + work = zeros(T, k, n2) # Set up proper Householder structure for i in 1:k @@ -261,8 +257,8 @@ const PARFB_SIZES = [ end # Test calculation - NextLA.parfb('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l, - A1, m1, A2, m2, V, m2, T_mat, k, work, k) + NextLA.parfb!('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l, + A1, A2, V, T_mat, work) # Check that results are finite @test 
all(isfinite.(A1)) @@ -294,14 +290,10 @@ const PARFB_SIZES = [ # Set V dimensions based on side if side == 'L' V_cpu = randn(T, m2, k) - work_cpu = zeros(T, k * n1) - ldv = m2 - ldwork = k + work_cpu = zeros(T, k, n2) else # side == 'R' V_cpu = randn(T, n2, k) - work_cpu = zeros(T, m1 * k) - ldv = n2 - ldwork = m1 + work_cpu = zeros(T, m2, k) end T_cpu = triu(randn(T, k, k)) @@ -323,17 +315,17 @@ const PARFB_SIZES = [ A1_ref = copy(A1_cpu) A2_ref = copy(A2_cpu) work_ref = copy(work_cpu) - NextLA.parfb(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l, - A1_ref, m1, A2_ref, m2, V_cpu, ldv, T_cpu, k, work_ref, ldwork) + NextLA.parfb!(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l, + A1_ref, A2_ref, V_cpu, T_cpu, work_ref) # Our implementation on GPU - NextLA.parfb(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l, - A1_gpu, m1, A2_gpu, m2, V_gpu, ldv, T_gpu, k, work_gpu, ldwork) + NextLA.parfb!(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l, + A1_gpu, A2_gpu, V_gpu, T_gpu, work_gpu) # Compare results @test norm(Array(A1_gpu) - A1_ref) < rtol * max(1, norm(A1_ref)) @test norm(Array(A2_gpu) - A2_ref) < rtol * max(1, norm(A2_ref)) - @test norm(Array(work_gpu) - work_ref) < rtol * max(1, norm(work_ref)) + @test norm(Array(work_gpu) - Array(work_ref)) < rtol * max(1, norm(Array(work_ref))) @test all(isfinite.(Array(A1_gpu))) @test all(isfinite.(Array(A2_gpu))) diff --git a/test/pemv.jl b/test/pemv.jl index 5bd53d4..cdd64f3 100644 --- a/test/pemv.jl +++ b/test/pemv.jl @@ -9,15 +9,14 @@ using CUDA alpha = 2.5 + 1.5im beta = 1.2 - 0.8im - A = rand(ComplexF64, m, n) - lda = m + A = rand(ComplexF64, m, n) X = rand(ComplexF64, n) Y = rand(ComplexF64, m) Y_original = copy(Y) work = zeros(ComplexF64, m) # Apply our PEMV - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) # Verify using manual computation # For column storage with conjugate transpose, this should compute: @@ -32,15 +31,14 @@ using CUDA alpha = 1.8 + 2.2im beta = 
0.5 + 1.0im - A = rand(ComplexF64, m, n) - lda = m + A = rand(ComplexF64, m, n) X = rand(ComplexF64, m) Y = rand(ComplexF64, n) Y_original = copy(Y) work = zeros(ComplexF64, n) # Apply our PEMV - NextLA.pemv('N', 'R', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('N', 'R', m, n, l, alpha, A, X, beta, Y, work) # For row storage with no transpose: # Y := alpha * A^T * X + beta * Y @@ -54,14 +52,13 @@ using CUDA alpha = ComplexF32(2.0 + 1.0im) beta = ComplexF32(0.8 - 0.5im) - A = rand(ComplexF32, m, n) - lda = m + A = rand(ComplexF32, m, n) X = rand(ComplexF32, n) Y = rand(ComplexF32, m) Y_original = copy(Y) work = zeros(ComplexF32, m) - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) Y_expected = alpha * A' * X + beta * Y_original @@ -79,13 +76,12 @@ using CUDA # Test column storage A = rand(ComplexF64, m, n) - lda = m X = rand(ComplexF64, n) Y = rand(ComplexF64, m) Y_original = copy(Y) work = zeros(ComplexF64, m) - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) Y_expected = alpha * A' * X + beta * Y_original @test Y ≈ Y_expected rtol=1e-12 @@ -97,14 +93,13 @@ using CUDA alpha = ComplexF64(0.0) beta = 2.0 + 1.5im - A = rand(ComplexF64, m, n) - lda = m + A = rand(ComplexF64, m, n) X = rand(ComplexF64, n) Y = rand(ComplexF64, m) Y_original = copy(Y) work = zeros(ComplexF64, m) - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) # With alpha = 0, result should be beta * Y_original Y_expected = beta * Y_original @@ -116,14 +111,13 @@ using CUDA alpha = 2.0 + 1.5im beta = ComplexF64(0.0) - A = rand(ComplexF64, m, n) - lda = m + A = rand(ComplexF64, m, n) X = rand(ComplexF64, n) Y = rand(ComplexF64, m) Y_original = copy(Y) work = zeros(ComplexF64, m) - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + 
NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) # With beta = 0, result should be alpha * A' * X Y_expected = alpha * A' * X @@ -135,14 +129,13 @@ using CUDA alpha = ComplexF64(0.0) beta = ComplexF64(0.0) - A = rand(ComplexF64, m, n) - lda = m + A = rand(ComplexF64, m, n) X = rand(ComplexF64, n) Y = rand(ComplexF64, m) Y_original = copy(Y) work = zeros(ComplexF64, m) - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) # Function should return early, Y might be unchanged or zeroed # Check that it doesn't crash and produces finite results @@ -155,26 +148,24 @@ using CUDA alpha = 2.0 + 1.0im beta = 1.5 - 0.5im - A = zeros(ComplexF64, max(1, m), n) - lda = max(1, m) + A = zeros(ComplexF64, max(1, m), n) X = rand(ComplexF64, n) Y = ComplexF64[] work = ComplexF64[] # Should return early without error - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) @test length(Y) == 0 # n = 0 case m, n, l = 5, 0, 0 - A = rand(ComplexF64, m, max(1, n)) - lda = m + A = rand(ComplexF64, m, max(1, n)) X = ComplexF64[] Y = rand(ComplexF64, m) Y_original = copy(Y) work = zeros(ComplexF64, m) - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work) # Should return early @test all(isfinite.(Y)) end @@ -189,24 +180,23 @@ using CUDA work = zeros(ComplexF64, m) # Invalid trans - @test_throws ArgumentError NextLA.pemv('X', 'C', m, n, l, alpha, A, m, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('X', 'C', m, n, l, alpha, A, X, beta, Y, work) # Invalid storev - @test_throws ArgumentError NextLA.pemv('C', 'X', m, n, l, alpha, A, m, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('C', 'X', m, n, l, alpha, A, X, beta, Y, work) # Invalid trans/storev combination - @test_throws ArgumentError NextLA.pemv('N', 'C', m, n, l, alpha, A, m, X, beta, Y, 
work) - @test_throws ArgumentError NextLA.pemv('C', 'R', m, n, l, alpha, A, m, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('N', 'C', m, n, l, alpha, A, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('C', 'R', m, n, l, alpha, A, X, beta, Y, work) # Negative dimensions - @test_throws ArgumentError NextLA.pemv('C', 'C', -1, n, l, alpha, A, m, X, beta, Y, work) - @test_throws ArgumentError NextLA.pemv('C', 'C', m, -1, l, alpha, A, m, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('C', 'C', -1, n, l, alpha, A, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('C', 'C', m, -1, l, alpha, A, X, beta, Y, work) # Invalid l (l > min(m,n)) - @test_throws ArgumentError NextLA.pemv('C', 'C', m, n, min(m,n)+1, alpha, A, m, X, beta, Y, work) + @test_throws ArgumentError NextLA.pemv!('C', 'C', m, n, min(m,n)+1, alpha, A, X, beta, Y, work) - # Invalid lda - @test_throws ArgumentError NextLA.pemv('C', 'C', m, n, l, alpha, A, m-1, X, beta, Y, work) + # No lda parameter to validate anymore end @testset "Consistency with BLAS" begin @@ -215,15 +205,14 @@ using CUDA alpha = 2.0 + 1.0im beta = 1.5 - 0.8im - A = rand(ComplexF64, m, n) - lda = m + A = rand(ComplexF64, m, n) X = rand(ComplexF64, n) Y1 = rand(ComplexF64, m) Y2 = copy(Y1) work = zeros(ComplexF64, m) # Our implementation - NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y1, work) + NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y1, work) # BLAS reference LinearAlgebra.BLAS.gemv!('C', alpha, A, X, beta, Y2) @@ -239,7 +228,6 @@ using CUDA # Create CPU data A_cpu = rand(ComplexF32, m, n) - lda = m X_cpu = rand(ComplexF32, n) Y_cpu = rand(ComplexF32, m) work_cpu = zeros(ComplexF32, m) @@ -252,10 +240,10 @@ using CUDA # Apply on CPU Y_cpu_result = copy(Y_cpu) - NextLA.pemv('C', 'C', m, n, l, alpha, A_cpu, lda, X_cpu, beta, Y_cpu_result, work_cpu) + NextLA.pemv!('C', 'C', m, n, l, alpha, A_cpu, X_cpu, beta, Y_cpu_result, work_cpu) # Apply on GPU - NextLA.pemv('C', 'C', m, 
n, l, alpha, A_gpu, lda, X_gpu, beta, Y_gpu, work_gpu) + NextLA.pemv!('C', 'C', m, n, l, alpha, A_gpu, X_gpu, beta, Y_gpu, work_gpu) @test Array(Y_gpu) ≈ Y_cpu_result rtol=1e-6 end diff --git a/test/runtests.jl b/test/runtests.jl index e62aeac..bd09dbe 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -98,16 +98,16 @@ include("axpy.jl") include("gerc.jl") include("larfg.jl") include("larf.jl") -#include("larft.jl") #TODO: implement sub-tests for larft (indirect test in geqrt) -#include("larfb.jl") #TODO: implement sub-tests for larfb (indirect test in unmqr) +#include("larft.jl") #TODO: implement sub-tests for larft (indirect test in geqrt!) +#include("larfb.jl") #TODO: implement sub-tests for larfb! (indirect test in unmqr!) include("geqr2.jl") include("geqrt.jl") include("unmqr.jl") include("tsqrt.jl") include("tsmqr.jl") include("parfb.jl") -#include("pamm.jl") #TODO: implement sub-tests for pamm (indirect test in parfb) -#include("pemv.jl") #TODO: implement sub-tests for pemv (indirect test in ttqrt) +#include("pamm.jl") #TODO: implement sub-tests for pamm! (indirect test in parfb!) +#include("pemv.jl") #TODO: implement sub-tests for pemv! (indirect test in ttqrt!) include("ttqrt.jl") include("ttmqr.jl") diff --git a/test/tsmqr.jl b/test/tsmqr.jl index dff9467..f056e15 100644 --- a/test/tsmqr.jl +++ b/test/tsmqr.jl @@ -126,25 +126,13 @@ const TSMQR_SIZES = [ A2 = rand(T, m2, n2) # V matrix (Householder vectors) - if side == 'L' - V = rand(T, m2, k) - ldv = m2 - else - V = rand(T, n2, k) - ldv = n2 - end + V = side == 'L' ? rand(T, m2, k) : rand(T, n2, k) # T matrix (triangular factors) T_mat = triu(rand(T, ib, k)) # Work array - if side == 'L' - work = zeros(T, ib, n1) - ldwork = ib - else - work = zeros(T, m1, ib) - ldwork = m1 - end + work = side == 'L' ? 
zeros(T, ib * n1) : zeros(T, ib * m1) # Make copies for testing A1_orig = copy(A1) @@ -155,8 +143,17 @@ const TSMQR_SIZES = [ A2_lapack = copy(A2) # Test NextLA implementation - NextLA.tsmqr(side, trans, m1, n1, m2, n2, k, ib, - A1_nextla, m1, A2_nextla, m2, V, ldv, T_mat, ib, work, ldwork) + NextLA.tsmqr!(side, trans, m1, n1, m2, n2, k, ib, + A1_nextla, A2_nextla, V, T_mat, work) + + # --- Test Helper Function --- + A1_helper = copy(A1) + A2_helper = copy(A2) + NextLA.tsmqr!(side, trans, A1_helper, A2_helper, V, T_mat, ib) + + # Verify helper gives same results as kernel + @test A1_helper ≈ A1_nextla rtol=rtol + @test A2_helper ≈ A2_nextla rtol=rtol # Test LAPACK reference lapack_tpmqrt!(T, side, trans, 0, V, T_mat, A1_lapack, A2_lapack) @@ -196,20 +193,20 @@ const TSMQR_SIZES = [ A2 = randn(T, m2, n2) V = randn(T, m2, k) T_mat = triu(randn(T, ib, k)) - work = zeros(T, ib, n1) - - @test_nowarn NextLA.tsmqr('L', 'N', m1, n1, m2, n2, k, ib, - A1, m1, A2, m2, V, m2, T_mat, ib, work, ib) + work = zeros(T, ib * n1) + + @test_nowarn NextLA.tsmqr!('L', 'N', m1, n1, m2, n2, k, ib, + A1, A2, V, T_mat, work) # Test edge cases - @test_nowarn NextLA.tsmqr('L', 'N', 0, 0, 0, 0, 0, 0, - zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, - zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, T[], 1) + @test_nowarn NextLA.tsmqr!('L', 'N', 0, 0, 0, 0, 0, 0, + zeros(T, 0, 0), zeros(T, 0, 0), + zeros(T, 0, 0), zeros(T, 0, 0), T[]) # Test with k=0 - @test_nowarn NextLA.tsmqr('L', 'N', 200, 200, 200, 200, 0, 0, - randn(T, 200, 200), 200, randn(T, 200, 200), 200, - zeros(T, 200, 0), 200, zeros(T, 0, 0), 1, T[], 1) + @test_nowarn NextLA.tsmqr!('L', 'N', 200, 200, 200, 200, 0, 0, + randn(T, 200, 200), randn(T, 200, 200), + zeros(T, 200, 0), zeros(T, 0, 0), T[]) end end end @@ -228,11 +225,11 @@ const TSMQR_SIZES = [ A2 = T.(scale .* randn(ComplexF64, m2, n2)) V = T.(scale .* randn(ComplexF64, m2, k)) T_mat = triu(T.(scale .* randn(ComplexF64, ib, k))) - work = zeros(T, ib, n1) + work = zeros(T, ib * n1) # Test 
calculation - NextLA.tsmqr('L', 'N', m1, n1, m2, n2, k, ib, - A1, m1, A2, m2, V, m2, T_mat, ib, work, ib) + NextLA.tsmqr!('L', 'N', m1, n1, m2, n2, k, ib, + A1, A2, V, T_mat, work) # Check that results are finite @test all(isfinite.(A1)) @@ -284,12 +281,12 @@ const TSMQR_SIZES = [ A1_ref = copy(A1_cpu) A2_ref = copy(A2_cpu) work_ref = copy(work_cpu) - NextLA.tsmqr(side, 'N', m1, n1, m2, n2, k, ib, - A1_ref, m1, A2_ref, m2, V_cpu, ldv, T_cpu, ib, work_ref, ldwork) + NextLA.tsmqr!(side, 'N', m1, n1, m2, n2, k, ib, + A1_ref, A2_ref, V_cpu, T_cpu, work_ref) # GPU calculation - NextLA.tsmqr(side, 'N', m1, n1, m2, n2, k, ib, - A1_gpu, m1, A2_gpu, m2, V_gpu, ldv, T_gpu, ib, work_gpu, ldwork) + NextLA.tsmqr!(side, 'N', m1, n1, m2, n2, k, ib, + A1_gpu, A2_gpu, V_gpu, T_gpu, work_gpu) # Compare results @test norm(Array(A1_gpu) - A1_ref) < rtol * max(1, norm(A1_ref)) diff --git a/test/tsqrt.jl b/test/tsqrt.jl index 54876dd..89586ba 100644 --- a/test/tsqrt.jl +++ b/test/tsqrt.jl @@ -43,7 +43,7 @@ function lapack_tpqrt!(::Type{T}, m::Int64, n::Int64, l::Int64, nb::Int64, chklapackerror(info[]) end -# TSQRT test parameters for NextLA.tsqrt +# TSQRT test parameters for NextLA.tsqrt! 
const TSQRT_TYPES = [ComplexF32, ComplexF64, Float32, Float64] const TSQRT_SIZES = [ (100, 80, 30), # m, n, ib @@ -74,10 +74,6 @@ const TSQRT_SIZES = [ A2_lapack = copy(A2) # Prepare workspace and output arrays - lda1 = n - lda2 = m - ldt = ib - T_nextla = zeros(T, ib, n) T_lapack = zeros(T, ib, n) tau_nextla = zeros(T, n) @@ -85,7 +81,19 @@ const TSQRT_SIZES = [ work_nextla = zeros(T, ib * n) # Test NextLA implementation - NextLA.tsqrt(m, n, ib, A1_nextla, lda1, A2_nextla, lda2, T_nextla, ldt, tau_nextla, work_nextla) + NextLA.tsqrt!(m, n, ib, A1_nextla, A2_nextla, T_nextla, tau_nextla, work_nextla) + + # --- Test Helper Function --- + # Recompute using the high-level helper on fresh inputs + A1_helper = copy(A1) + A2_helper = copy(A2) + T_helper = zeros(T, ib, n) + tau_helper = zeros(T, n) + NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper, ib) + + # Verify helper gives same results as kernel + @test A1_helper ≈ A1_nextla rtol=rtol + @test A2_helper ≈ A2_nextla rtol=rtol # Test LAPACK implementation work_lapack = zeros(T, ib * n) @@ -146,7 +154,7 @@ const TSQRT_SIZES = [ tau_result = zeros(T, n) work = zeros(T, ib * n) - NextLA.tsqrt(m, n, ib, A1_result, n, A2_result, m, T_result, ib, tau_result, work) + NextLA.tsqrt!(m, n, ib, A1_result, A2_result, T_result, tau_result, work) # Check that A1 (now R) is upper triangular for i in 1:n @@ -188,15 +196,15 @@ const TSQRT_SIZES = [ tau = zeros(T, n) work = zeros(T, ib * n) - @test_nowarn NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work) + @test_nowarn NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work) # Test with invalid parameters - @test_throws ArgumentError NextLA.tsqrt(-1, n, ib, A1, n, A2, m, T_mat, ib, tau, work) - @test_throws ArgumentError NextLA.tsqrt(m, -1, ib, A1, n, A2, m, T_mat, ib, tau, work) - @test_throws ArgumentError NextLA.tsqrt(m, n, -1, A1, n, A2, m, T_mat, ib, tau, work) + @test_throws ArgumentError NextLA.tsqrt!(-1, n, ib, A1, A2, T_mat, tau, work) + @test_throws ArgumentError 
NextLA.tsqrt!(m, -1, ib, A1, A2, T_mat, tau, work) + @test_throws ArgumentError NextLA.tsqrt!(m, n, -1, A1, A2, T_mat, tau, work) # Test edge cases - @test_nowarn NextLA.tsqrt(0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, T[], T[]) + @test_nowarn NextLA.tsqrt!(0, 0, 0, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), T[], T[]) end end end @@ -224,7 +232,7 @@ const TSQRT_SIZES = [ work = zeros(T, ib * n) # Test calculation - NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work) + NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work) # Check that results are finite @test all(isfinite.(A1)) @@ -257,7 +265,7 @@ const TSQRT_SIZES = [ tau = zeros(T, n) work = zeros(T, ib * n) - NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work) + NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work) # Should complete without errors @test all(isfinite.(A1)) @@ -291,7 +299,7 @@ const TSQRT_SIZES = [ tau = zeros(T, n) work = zeros(T, ib * n) - NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work) + NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work) @test all(isfinite.(A1)) @test all(isfinite.(A2)) @@ -308,7 +316,7 @@ const TSQRT_SIZES = [ tau = zeros(T, n) work = zeros(T, ib * n) - NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work) + NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work) @test all(isfinite.(A1)) @test all(isfinite.(A2)) @@ -347,10 +355,10 @@ const TSQRT_SIZES = [ A2_cpu_result = copy(A2_cpu) T_cpu_result = copy(T_cpu) tau_cpu_result = copy(tau_cpu) - NextLA.tsqrt(m, n, ib, A1_cpu_result, n, A2_cpu_result, m, T_cpu_result, ib, tau_cpu_result, work_cpu) + NextLA.tsqrt!(m, n, ib, A1_cpu_result, A2_cpu_result, T_cpu_result, tau_cpu_result, work_cpu) # Apply on GPU - NextLA.tsqrt(m, n, ib, A1_gpu, n, A2_gpu, m, T_gpu, ib, tau_gpu, work_gpu) + NextLA.tsqrt!(m, n, ib, A1_gpu, A2_gpu, T_gpu, tau_gpu, work_gpu) # Compare results @test Array(A1_gpu) ≈ A1_cpu_result rtol=rtol diff --git a/test/ttmqr.jl b/test/ttmqr.jl index b579889..a4ed949 100644 --- 
a/test/ttmqr.jl +++ b/test/ttmqr.jl @@ -140,7 +140,7 @@ const TTMQR_SIZES = [ T_mat = triu(rand(T, ib, k)) - work = zeros(T, ib, n2) + work = zeros(T, ib * n2) A1_nextla = copy(A1) A2_nextla = copy(A2) @@ -149,9 +149,19 @@ const TTMQR_SIZES = [ T_mat_nextla = copy(T_mat) work_nextla = copy(work) - work = zeros(T, ib, n2) - NextLA.ttmqr('L', 'N', n2, n2, n2, n2, k, ib, - A1_nextla, n2, A2_nextla, n2, V, n2, T_mat_nextla, ib, work_nextla, ib) + NextLA.ttmqr!('L', 'N', n2, n2, n2, n2, k, ib, + A1_nextla, A2_nextla, V, T_mat_nextla, work_nextla) + + # --- Test Helper Function --- + A1_helper = copy(A1_orig) + A2_helper = copy(A2_orig) + T_mat_helper = copy(T_mat) + NextLA.ttmqr!('L', 'N', A1_helper, A2_helper, V, T_mat_helper, ib) + + # Verify helper gives same results as kernel + @test A1_helper ≈ A1_nextla rtol=rtol + @test A2_helper ≈ A2_nextla rtol=rtol + lapack_tpmqrt!(T, 'L', 'N', 0, V, T_mat, A1_orig, A2_orig) @test norm(A1_nextla - A1_orig) < rtol * norm(A1_orig) end @@ -172,20 +182,20 @@ const TTMQR_SIZES = [ C2 = randn(T, m2, n2) V = randn(T, m2, k) T_mat = triu(randn(T, ib, k)) - work = zeros(T, ib, n1) + work = zeros(T, ib * n1) - @test_nowarn NextLA.ttmqr('L', 'N', m1, n1, m2, n2, k, ib, - C1, m1, C2, m2, V, m2, T_mat, ib, work, ib) + @test_nowarn NextLA.ttmqr!('L', 'N', m1, n1, m2, n2, k, ib, + C1, C2, V, T_mat, work) # Test edge cases - @test_nowarn NextLA.ttmqr('L', 'N', 0, 0, 0, 0, 0, 0, - zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, - zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, T[], 1) + @test_nowarn NextLA.ttmqr!('L', 'N', 0, 0, 0, 0, 0, 0, + zeros(T, 0, 0), zeros(T, 0, 0), + zeros(T, 0, 0), zeros(T, 0, 0), T[]) # Test with k=0 - @test_nowarn NextLA.ttmqr('L', 'N', 2, 2, 2, 2, 0, 0, - randn(T, 2, 2), 2, randn(T, 2, 2), 2, - zeros(T, 2, 0), 2, zeros(T, 0, 0), 1, T[], 1) + @test_nowarn NextLA.ttmqr!('L', 'N', 2, 2, 2, 2, 0, 0, + randn(T, 2, 2), randn(T, 2, 2), + zeros(T, 2, 0), zeros(T, 0, 0), T[]) end end end @@ -204,11 +214,11 @@ const TTMQR_SIZES = [ C2 = 
T.(scale .* randn(ComplexF64, m2, n2)) V = T.(scale .* randn(ComplexF64, m2, k)) T_mat = triu(T.(scale .* randn(ComplexF64, ib, k))) - work = zeros(T, ib, n1) + work = zeros(T, ib * n1) # Test calculation - NextLA.ttmqr('L', 'N', m1, n1, m2, n2, k, ib, - C1, m1, C2, m2, V, m2, T_mat, ib, work, ib) + NextLA.ttmqr!('L', 'N', m1, n1, m2, n2, k, ib, + C1, C2, V, T_mat, work) # Check that results are finite @test all(isfinite.(C1)) @@ -237,14 +247,10 @@ const TTMQR_SIZES = [ if side == 'L' V_cpu = randn(T, m2, k) - work_cpu = zeros(T, ib, n1) - ldv = m2 - ldwork = ib + work_cpu = zeros(T, ib * n1) else V_cpu = randn(T, n2, k) - work_cpu = zeros(T, m1, ib) - ldv = n2 - ldwork = m1 + work_cpu = zeros(T, ib * m1) end T_cpu = triu(randn(T, ib, k)) @@ -260,12 +266,12 @@ const TTMQR_SIZES = [ C1_ref = copy(C1_cpu) C2_ref = copy(C2_cpu) work_ref = copy(work_cpu) - NextLA.ttmqr(side, 'N', m1, n1, m2, n2, k, ib, - C1_ref, m1, C2_ref, m2, V_cpu, ldv, T_cpu, ib, work_ref, ldwork) + NextLA.ttmqr!(side, 'N', m1, n1, m2, n2, k, ib, + C1_ref, C2_ref, V_cpu, T_cpu, work_ref) # GPU calculation - NextLA.ttmqr(side, 'N', m1, n1, m2, n2, k, ib, - C1_gpu, m1, C2_gpu, m2, V_gpu, ldv, T_gpu, ib, work_gpu, ldwork) + NextLA.ttmqr!(side, 'N', m1, n1, m2, n2, k, ib, + C1_gpu, C2_gpu, V_gpu, T_gpu, work_gpu) # Compare results @test norm(Array(C1_gpu) - C1_ref) < rtol * max(1, norm(C1_ref)) diff --git a/test/ttqrt.jl b/test/ttqrt.jl index 189e4dd..a586cfb 100644 --- a/test/ttqrt.jl +++ b/test/ttqrt.jl @@ -86,7 +86,16 @@ const TTQRT_SIZES = [ work_nextla = copy(work) - NextLA.ttqrt(n, n, ib, A1_nextla, n, A2_nextla, m, T_mat_nextla, ib, tau, work_nextla) + NextLA.ttqrt!(n, n, ib, A1_nextla, A2_nextla, T_mat_nextla, tau, work_nextla) + + # --- Test Helper Function --- + A1_helper = copy(A1_orig) + A2_helper = copy(A2_orig) + NextLA.ttqrt!(ib, A1_helper, A2_helper, T_mat_nextla, tau) + + # Verify helper gives same results as kernel + @test A1_helper ≈ A1_nextla rtol=rtol + @test A2_helper ≈ A2_nextla 
rtol=rtol lapack_tpqrt!(T, n, n, n, ib, A1, n, A2, n, T_mat, ib, work) @@ -123,15 +132,13 @@ const TTQRT_SIZES = [ tau = zeros(T, n) work = zeros(T, ib * n) - @test_nowarn NextLA.ttqrt(m, n, ib, A1, n, A2, m, T_matrix, ib, tau, work) + @test_nowarn NextLA.ttqrt!(m, n, ib, A1, A2, T_matrix, tau, work) # Test edge cases - @test_nowarn NextLA.ttqrt(0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, - zeros(T, 0, 0), 1, T[], T[]) + @test_nowarn NextLA.ttqrt!(0, 0, 0, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), T[], T[]) # Test with minimal size - @test_nowarn NextLA.ttqrt(1, 1, 1, ones(T, 1, 1), 1, ones(T, 1, 1), 1, - zeros(T, 1, 1), 1, zeros(T, 1), zeros(T, 1)) + @test_nowarn NextLA.ttqrt!(1, 1, 1, ones(T, 1, 1), ones(T, 1, 1), zeros(T, 1, 1), zeros(T, 1), zeros(T, 1)) end end end @@ -159,8 +166,8 @@ const TTQRT_SIZES = [ end end - # Test calculation - NextLA.ttqrt(m, n, ib, A1, n, A2, m, T_matrix, ib, tau, work) + # Test calculation (simplified signature) + NextLA.ttqrt!(m, n, ib, A1, A2, T_matrix, tau, work) # Check that results are finite @test all(isfinite.(A1)) @@ -213,10 +220,10 @@ const TTQRT_SIZES = [ T_ref = copy(T_cpu) tau_ref = copy(tau_cpu) work_ref = copy(work_cpu) - NextLA.ttqrt(m, n, ib, A1_ref, n, A2_ref, m, T_ref, ib, tau_ref, work_ref) + NextLA.ttqrt!(m, n, ib, A1_ref, A2_ref, T_ref, tau_ref, work_ref) # GPU calculation - NextLA.ttqrt(m, n, ib, A1_gpu, n, A2_gpu, m, T_gpu, ib, tau_gpu, work_gpu) + NextLA.ttqrt!(m, n, ib, A1_gpu, A2_gpu, T_gpu, tau_gpu, work_gpu) # Compare results @test norm(Array(A1_gpu) - A1_ref) < rtol * max(1, norm(A1_ref)) diff --git a/test/unmqr.jl b/test/unmqr.jl index 96a9138..12bf436 100644 --- a/test/unmqr.jl +++ b/test/unmqr.jl @@ -16,14 +16,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_original = copy(A_qr) lda = m T = zeros(type, ib, k) - ldt = ib tau = zeros(type, k) - ldwork = ib * n - work_qr = zeros(type, ldwork) + work_qr = zeros(type, ib * k) # Perform QR factorization - NextLA.geqrt(m, 
k, ib, A_qr, lda, T, ldt, tau, work_qr) + NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr) # Test matrix to apply Q to C = rand(T, m, n) @@ -31,18 +29,25 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] ldc = m # Workspace for UNMQR - work = zeros(type, ib * m) - ldwork = n + # Workspace for UNMQR (matrix workspace) + work = zeros(type, n, ib) # Apply Q from left (Q * C) - NextLA.unmqr('L', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) + NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr, lda, T, C, work) + + # --- Test Helper Function --- + C_helper = copy(C_original) + NextLA.unmqr!('L', 'N', A_qr, T, C_helper, ib) + + # Verify helper gives same results as kernel (in-place) + @test C_helper ≈ C rtol=rtol # Verify using reference QR decomposition Q_ref, R_ref = qr(A_original) C_expected = Matrix(Q_ref) * C_original # Note: Due to potential sign differences in QR, we check properties rather than exact equality - @test size(C) == (n, m) + @test size(C) == (m, n) @test all(isfinite.(C)) # Check that the transformation preserves matrix structure @@ -60,14 +65,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_original = copy(A_qr) lda = m T = zeros(type, ib, k) - ldt = ib tau = zeros(type, k) - ldwork = ib * n - work_qr = zeros(type, ldwork) + work_qr = zeros(type, ib * k) # Perform QR factorization - NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr) + NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr) # Test matrix to apply Q to C = rand(T, m, n) @@ -75,18 +78,25 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] ldc = m # Workspace for UNMQR - work = zeros(type, ib * m) - ldwork = n + # Workspace for UNMQR (matrix workspace) + work = zeros(type, n, ib) # Apply Q from left (Q * C) - NextLA.unmqr('L', 'C', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) + NextLA.unmqr!('L', 'C', m, n, k, ib, A_qr, lda, T, C, work) + + # --- Test Helper Function --- + C_helper = copy(C_original) + 
NextLA.unmqr!('L', 'C', A_qr, T, C_helper, ib) + + # Verify helper gives same results as kernel (in-place) + @test C_helper ≈ C rtol=rtol # Verify using reference QR decomposition Q_ref, R_ref = qr(A_original) C_expected = adjoint(Matrix(Q_ref)) * C_original # Note: Due to potential sign differences in QR, we check properties rather than exact equality - @test size(C) == (n, m) + @test size(C) == (m, n) @test all(isfinite.(C)) # Check that the transformation preserves matrix structure @@ -104,14 +114,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_original = copy(A_qr) lda = m T = zeros(type, ib, k) - ldt = ib tau = zeros(type, k) - ldwork = ib * n - work_qr = zeros(type, ldwork) + work_qr = zeros(type, ib * k) # Perform QR factorization - NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr) + NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr) # Test matrix to apply Q to C = rand(T, m, n) @@ -119,18 +127,18 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] ldc = m # Workspace for UNMQR - work = zeros(type, ib * m) - ldwork = m + # Workspace for UNMQR (matrix workspace) + work = zeros(type, m, ib) # Apply Q from left (Q * C) - NextLA.unmqr('R', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) + NextLA.unmqr!('R', 'N', m, n, k, ib, A_qr, lda, T, C, work) # Verify using reference QR decomposition Q_ref, R_ref = qr(A_original) C_expected = C_original * Matrix(Q_ref) # Note: Due to potential sign differences in QR, we check properties rather than exact equality - @test size(C) == (n, m) + @test size(C) == (m, n) @test all(isfinite.(C)) # Check that the transformation preserves matrix structure @@ -148,14 +156,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_original = copy(A_qr) lda = m T = zeros(type, ib, k) - ldt = ib tau = zeros(type, k) - ldwork = ib * n - work_qr = zeros(type, ldwork) + work_qr = zeros(type, ib * k) # Perform QR factorization - NextLA.geqrt(m, k, ib, A_qr, lda, T, 
ldt, tau, work_qr) + NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr) # Test matrix to apply Q to C = rand(T, m, n) @@ -163,18 +169,18 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] ldc = m # Workspace for UNMQR - work = zeros(type, ib * m) - ldwork = m + # Workspace for UNMQR (matrix workspace) + work = zeros(type, m, ib) - # Apply Q from left (Q * C) - NextLA.unmqr('R', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) + # Apply Q^H from right (C * Q^H) + NextLA.unmqr!('R', 'C', m, n, k, ib, A_qr, lda, T, C, work) # Verify using reference QR decomposition Q_ref, R_ref = qr(A_original) C_expected = C_original * adjoint(Matrix(Q_ref)) # Note: Due to potential sign differences in QR, we check properties rather than exact equality - @test size(C) == (n, m) + @test size(C) == (m, n) @test all(isfinite.(C)) # Check that the transformation preserves matrix structure @@ -188,23 +194,22 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_qr = rand(ComplexF64, m, k) lda = m - T = zeros(ComplexF64, ib, k) - ldt = ib + T = zeros(ComplexF64, ib, k) tau = zeros(ComplexF64, k) - work_qr = zeros(ComplexF64, ib * k) + work_qr = zeros(ComplexF64, ib * k) - NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr) + NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr) C = Matrix{ComplexF64}(I, m, n) # Identity matrix C_original = copy(C) - ldc = m + ldc = m - work = zeros(ComplexF64, ib * n) - ldwork = n + # Matrix workspace for UNMQR + work = zeros(ComplexF64, n, ib) # Apply Q then Q^H - NextLA.unmqr('L', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) - NextLA.unmqr('L', 'C', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) + NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr, lda, T, C, work) + NextLA.unmqr!('L', 'C', m, n, k, ib, A_qr, lda, T, C, work) # Should get back to identity (at least for the first k columns) @test C[:, 1:k] ≈ C_original[:, 1:k] rtol=1e-10 @@ -215,22 +220,22 @@ const UNMQR_TESTTYPES = [ComplexF32, 
ComplexF64, Float32, Float64] A = zeros(ComplexF64, m, k) T = zeros(ComplexF64, ib, k) C = zeros(ComplexF64, m, n) - work = zeros(ComplexF64, ib * n) + work = zeros(ComplexF64, n, ib) - # Invalid side - @test_throws ArgumentError NextLA.unmqr('X', 'N', m, n, k, ib, A, m, T, ib, C, m, work, ib) + # Invalid side + @test_throws ArgumentError NextLA.unmqr!('X', 'N', m, n, k, ib, A, m, T, C, work) - # Invalid trans - @test_throws ArgumentError NextLA.unmqr('L', 'X', m, n, k, ib, A, m, T, ib, C, m, work, ib) + # Invalid trans + @test_throws ArgumentError NextLA.unmqr!('L', 'X', m, n, k, ib, A, m, T, C, work) # Negative dimensions - @test_throws ArgumentError NextLA.unmqr('L', 'N', -1, n, k, ib, A, m, T, ib, C, m, work, ib) - @test_throws ArgumentError NextLA.unmqr('L', 'N', m, -1, k, ib, A, m, T, ib, C, m, work, ib) - @test_throws ArgumentError NextLA.unmqr('L', 'N', m, n, -1, ib, A, m, T, ib, C, m, work, ib) - @test_throws ArgumentError NextLA.unmqr('L', 'N', m, n, k, -1, A, m, T, ib, C, m, work, ib) + @test_throws ArgumentError NextLA.unmqr!('L', 'N', -1, n, k, ib, A, m, T, C, work) + @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, -1, k, ib, A, m, T, C, work) + @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, n, -1, ib, A, m, T, C, work) + @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, n, k, -1, A, m, T, C, work) # Invalid k (k > nq) - @test_throws ArgumentError NextLA.unmqr('L', 'N', m, n, m+1, ib, A, m, T, ib, C, m, work, ib) + @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, n, m+1, ib, A, m, T, C, work) end @testset "Edge Cases" begin @@ -240,9 +245,9 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] T = zeros(ComplexF64, ib, max(1, k)) C = rand(ComplexF64, m, n) C_original = copy(C) - work = zeros(ComplexF64, ib * n) + work = zeros(ComplexF64, n, ib) - NextLA.unmqr('L', 'N', m, n, k, ib, A, m, T, ib, C, m, work, n) + NextLA.unmqr!('L', 'N', m, n, k, ib, A, m, T, C, work) # With k=0, C should remain unchanged @test C ≈ 
C_original @@ -252,19 +257,17 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_qr = rand(ComplexF64, m, k) lda = m T = zeros(ComplexF64, ib, k) - ldt = ib tau = zeros(ComplexF64, k) work_qr = zeros(ComplexF64, ib * k) - NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr) + NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr) C = rand(ComplexF64, m, n) C_original = copy(C) ldc = m - work = zeros(ComplexF64, ib * n) - ldwork = n + work = zeros(ComplexF64, n, ib) - NextLA.unmqr('L', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork) + NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr, lda, T, C, work) @test all(isfinite.(C)) @test norm(C) ≈ norm(C_original) rtol=1e-8 @@ -278,17 +281,15 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] A_qr_cpu = rand(ComplexF32, m, k) lda = m T_cpu = zeros(ComplexF32, ib, k) - ldt = ib tau_cpu = zeros(ComplexF32, k) work_qr_cpu = zeros(ComplexF32, ib * k) - NextLA.geqrt(m, k, ib, A_qr_cpu, lda, T_cpu, ldt, tau_cpu, work_qr_cpu) + NextLA.geqrt!(m, k, ib, A_qr_cpu, T_cpu, tau_cpu, work_qr_cpu) # Create test matrices C_cpu = rand(ComplexF32, m, n) ldc = m - work_cpu = zeros(ComplexF32, ib * n) - ldwork = ib + work_cpu = zeros(ComplexF32, n, ib) # Create GPU data A_qr_gpu = CuArray(A_qr_cpu) @@ -298,10 +299,10 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] # Apply on CPU C_cpu_result = copy(C_cpu) - NextLA.unmqr('L', 'N', m, n, k, ib, A_qr_cpu, lda, T_cpu, ldt, C_cpu_result, ldc, work_cpu, ldwork) + NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr_cpu, lda, T_cpu, C_cpu_result, work_cpu) # Apply on GPU - NextLA.unmqr('L', 'N', m, n, k, ib, A_qr_gpu, lda, T_gpu, ldt, C_gpu, ldc, work_gpu, ldwork) + NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr_gpu, lda, T_gpu, C_gpu, work_gpu) @test Array(C_gpu) ≈ C_cpu_result rtol=1e-6 end From 9adfa948c4ac58f7c1bdf08f8a833f7025032fa5 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Fri, 29 Aug 2025 10:20:27 -0300 Subject: [PATCH 3/6] CAQR: remove ib 
from helpers, correct ! into pemv and pamm --- src/geqrt.jl | 3 ++- src/pamm.jl | 2 +- src/pemv.jl | 2 +- src/tsmqr.jl | 3 ++- src/tsqrt.jl | 5 +++-- src/ttmqr.jl | 3 ++- src/ttqrt.jl | 3 ++- src/unmqr.jl | 5 +++-- test/geqrt.jl | 2 +- test/pamm.jl | 23 +++++++++++------------ test/tsmqr.jl | 2 +- test/tsqrt.jl | 2 +- test/ttmqr.jl | 2 +- test/ttqrt.jl | 2 +- test/unmqr.jl | 4 ++-- 15 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/geqrt.jl b/src/geqrt.jl index 8b1e86f..51d90a5 100644 --- a/src/geqrt.jl +++ b/src/geqrt.jl @@ -81,8 +81,9 @@ Helper function for blocked QR factorization. Computes A = Q*R where Q is orthog - `T`: Upper triangular block reflector matrix (ib × n) - `tau`: Vector of scalar factors for elementary reflectors (length n) """ -function geqrt!(ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} +function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} m, n = size(A) + ib = size(T_matrix, 1) work = zeros(T, ib * n) geqrt!(m, n, ib, A, T_matrix, tau, work) diff --git a/src/pamm.jl b/src/pamm.jl index 51cbf84..d4d757d 100644 --- a/src/pamm.jl +++ b/src/pamm.jl @@ -411,7 +411,7 @@ V = complex.(randn(m, k), randn(m, k)) A1_new, A2_new = pamm('A', 'L', 'C', 'F', A1, A2, V) ``` """ -function pamm(op::Char, side::Char, storev::Char, direct::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}) where {T} +function pamm!(op::Char, side::Char, storev::Char, direct::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}) where {T} # Determine dimensions m, k = size(A1) n = size(A2, 2) diff --git a/src/pemv.jl b/src/pemv.jl index 8c4acf4..a801df7 100644 --- a/src/pemv.jl +++ b/src/pemv.jl @@ -208,7 +208,7 @@ Y_new = pemv('N', 'C', A, X, Y, 2.0, 1.0) ``` """ -function pemv(trans::Char, storev::Char, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}) where {T} +function 
pemv!(trans::Char, storev::Char, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}) where {T} # Determine dimensions m, n = size(A) l = min(m, n) # Default panel size diff --git a/src/tsmqr.jl b/src/tsmqr.jl index b78af1a..f1ba956 100644 --- a/src/tsmqr.jl +++ b/src/tsmqr.jl @@ -186,10 +186,11 @@ efficiently. The compact WY representation enables high-performance matrix-matrix operations instead of multiple vector operations. """ function tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, - V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where {T} + V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T} m1, n1 = size(A1) m2, n2 = size(A2) k = size(V, 2) + ib = size(T_matrix, 1) # Validate input dimensions if side == 'L' && n2 != n1 diff --git a/src/tsqrt.jl b/src/tsqrt.jl index fa1636b..c31138d 100644 --- a/src/tsqrt.jl +++ b/src/tsqrt.jl @@ -151,17 +151,18 @@ A1_qr, A2_qr, T, tau = tsqrt!(copy(A1), copy(A2), ib) Uses blocked algorithm for efficiency with large matrices. The compact WY representation (stored in T) enables efficient application of the Q factor. """ -function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, ib::Integer) where {T} +function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} n, n2 = size(A1) if n != n2 throw(ArgumentError("A1 must be square, got size $(size(A1))")) end - + m, n3 = size(A2) if n != n3 throw(ArgumentError("A1 and A2 must have same number of columns, got $n and $n3")) end + ib = size(T_matrix, 1) if ib <= 0 throw(ArgumentError("Block size ib must be positive, got $ib")) end diff --git a/src/ttmqr.jl b/src/ttmqr.jl index ac4b263..0c1872a 100644 --- a/src/ttmqr.jl +++ b/src/ttmqr.jl @@ -109,9 +109,10 @@ Helper function for triangular-trapezoidal matrix transformation. 
- Modified `A1` and `A2` """ function ttmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, - V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where T + V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where T m1, n1 = size(A1) m2, n2 = size(A2) + ib = size(T_matrix, 1) # Use the common number of reflectors available in V and T k = size(T_matrix, 2) diff --git a/src/ttqrt.jl b/src/ttqrt.jl index 8c6f341..7c864de 100644 --- a/src/ttqrt.jl +++ b/src/ttqrt.jl @@ -93,9 +93,10 @@ Helper for triangular-triangular QR factorization. - `T`: Block reflector matrix - `tau`: Scalar factors """ -function ttqrt!(ib::Integer, A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} +function ttqrt!(A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} m, n = size(A) m2, n2 = size(B) + ib = size(T_mat, 1) @assert m2 == m && n2 == n "A and B must have same dimensions" work = zeros(T, ib * n) diff --git a/src/unmqr.jl b/src/unmqr.jl index ab94981..4e36cfe 100644 --- a/src/unmqr.jl +++ b/src/unmqr.jl @@ -191,9 +191,10 @@ Uses the blocked compact WY representation to apply Q efficiently through matrix-matrix operations rather than individual elementary reflectors. 
""" function unmqr!(side::Char, trans::Char, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, - C::AbstractMatrix{T}, ib::Integer) where {T} + C::AbstractMatrix{T}) where {T} m, n = size(C) - k = size(T_matrix, 2) + ib, k = size(T_matrix) + # Validate input dimensions if ib <= 0 diff --git a/test/geqrt.jl b/test/geqrt.jl index cf5f602..e7b376f 100644 --- a/test/geqrt.jl +++ b/test/geqrt.jl @@ -55,7 +55,7 @@ const GEQRT_BLOCKSIZES = [100, 200, 400, 800] A_helper = copy(A_orig) T_helper = zeros(T, max(1, ib), k) tau_helper = zeros(T, k) - NextLA.geqrt!(ib, A_helper, T_helper, tau_helper) + NextLA.geqrt!(A_helper, T_helper, tau_helper) # Verify helper gives same results as kernel (in-place) if k > 0 diff --git a/test/pamm.jl b/test/pamm.jl index aea06a2..202468b 100644 --- a/test/pamm.jl +++ b/test/pamm.jl @@ -165,7 +165,7 @@ using CUDA end @testset "Wrapper Function Tests" begin - # Test pamm_w wrapper + # Test pamm! simplified wrapper m, n, k, l = 150, 120, 80, 50 A1 = rand(ComplexF64, k, m) @@ -175,16 +175,16 @@ using CUDA W_original = copy(W) - NextLA.pamm_w(true, true, true, m, n, k, l, A1, A2, V, W) + NextLA.pamm!('W', 'L', 'C', 'F', A1, A2, V) @test all(isfinite.(W)) @test !isapprox(W, W_original, rtol=1e-12) - # Test pamm_a wrapper + # Test pamm! 
simplified wrapper for 'A' A2_test = rand(ComplexF64, m, k) A2_original = copy(A2_test) - NextLA.pamm_a(true, true, true, m, n, k, l, A2_test, V, W) + NextLA.pamm!('A', 'L', 'C', 'F', A1, A2_test, V) @test all(isfinite.(A2_test)) end @@ -220,20 +220,19 @@ using CUDA V_cpu = rand(ComplexF32, m, l) W_cpu = rand(ComplexF32, n, l) - lda1 = k + # Prepare GPU data A1_gpu = CuArray(A1_cpu) - W_cpu_result = copy(W_cpu) - NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, A2_cpu, V_cpu, W_cpu_result) + A2_gpu = CuArray(A2_cpu) + V_gpu = CuArray(V_cpu) W_gpu = CuArray(W_cpu) - # Apply on CPU - W_cpu_result = copy(W_cpu) - NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, lda1, A2_cpu, lda2, V_cpu, ldv, W_cpu_result, ldw) + # Apply on CPU (full signature) + NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, A2_cpu, V_cpu, W_cpu) - # Apply on GPU + # Apply on GPU (full signature) NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_gpu, A2_gpu, V_gpu, W_gpu) - @test Array(W_gpu) ≈ W_cpu_result rtol=1e-6 + @test Array(W_gpu) ≈ W_cpu rtol=1e-6 end end end diff --git a/test/tsmqr.jl b/test/tsmqr.jl index f056e15..bc636ea 100644 --- a/test/tsmqr.jl +++ b/test/tsmqr.jl @@ -149,7 +149,7 @@ const TSMQR_SIZES = [ # --- Test Helper Function --- A1_helper = copy(A1) A2_helper = copy(A2) - NextLA.tsmqr!(side, trans, A1_helper, A2_helper, V, T_mat, ib) + NextLA.tsmqr!(side, trans, A1_helper, A2_helper, V, T_mat) # Verify helper gives same results as kernel @test A1_helper ≈ A1_nextla rtol=rtol diff --git a/test/tsqrt.jl b/test/tsqrt.jl index 89586ba..7560ba2 100644 --- a/test/tsqrt.jl +++ b/test/tsqrt.jl @@ -89,7 +89,7 @@ const TSQRT_SIZES = [ A2_helper = copy(A2) T_helper = zeros(T, ib, n) tau_helper = zeros(T, n) - NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper, ib) + NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper) # Verify helper gives same results as kernel @test A1_helper ≈ A1_nextla rtol=rtol diff --git a/test/ttmqr.jl b/test/ttmqr.jl index a4ed949..edc27b3 
100644 --- a/test/ttmqr.jl +++ b/test/ttmqr.jl @@ -156,7 +156,7 @@ const TTMQR_SIZES = [ A1_helper = copy(A1_orig) A2_helper = copy(A2_orig) T_mat_helper = copy(T_mat) - NextLA.ttmqr!('L', 'N', A1_helper, A2_helper, V, T_mat_helper, ib) + NextLA.ttmqr!('L', 'N', A1_helper, A2_helper, V, T_mat_helper) # Verify helper gives same results as kernel @test A1_helper ≈ A1_nextla rtol=rtol diff --git a/test/ttqrt.jl b/test/ttqrt.jl index a586cfb..d333951 100644 --- a/test/ttqrt.jl +++ b/test/ttqrt.jl @@ -91,7 +91,7 @@ const TTQRT_SIZES = [ # --- Test Helper Function --- A1_helper = copy(A1_orig) A2_helper = copy(A2_orig) - NextLA.ttqrt!(ib, A1_helper, A2_helper, T_mat_nextla, tau) + NextLA.ttqrt!(A1_helper, A2_helper, T_mat_nextla, tau) # Verify helper gives same results as kernel @test A1_helper ≈ A1_nextla rtol=rtol diff --git a/test/unmqr.jl b/test/unmqr.jl index 12bf436..4d7b3d9 100644 --- a/test/unmqr.jl +++ b/test/unmqr.jl @@ -37,7 +37,7 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] # --- Test Helper Function --- C_helper = copy(C_original) - NextLA.unmqr!('L', 'N', A_qr, T, C_helper, ib) + NextLA.unmqr!('L', 'N', A_qr, T, C_helper) # Verify helper gives same results as kernel (in-place) @test C_helper ≈ C rtol=rtol @@ -86,7 +86,7 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64] # --- Test Helper Function --- C_helper = copy(C_original) - NextLA.unmqr!('L', 'C', A_qr, T, C_helper, ib) + NextLA.unmqr!('L', 'C', A_qr, T, C_helper) # Verify helper gives same results as kernel (in-place) @test C_helper ≈ C rtol=rtol From 174993e63eea76aebc296786593fac3d90312d27 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Fri, 29 Aug 2025 11:56:22 -0300 Subject: [PATCH 4/6] CAQR: reducing test and helper bloating --- src/geqrt.jl | 5 +++-- src/tsqrt.jl | 18 +++++++++--------- src/ttqrt.jl | 5 +++-- test/geqrt.jl | 4 +--- test/tsqrt.jl | 13 +------------ test/ttqrt.jl | 2 +- 6 files changed, 18 insertions(+), 29 deletions(-) diff --git 
a/src/geqrt.jl b/src/geqrt.jl index 51d90a5..04103de 100644 --- a/src/geqrt.jl +++ b/src/geqrt.jl @@ -81,9 +81,10 @@ Helper function for blocked QR factorization. Computes A = Q*R where Q is orthog - `T`: Upper triangular block reflector matrix (ib × n) - `tau`: Vector of scalar factors for elementary reflectors (length n) """ -function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} +function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T} m, n = size(A) - ib = size(T_matrix, 1) + ib, nb = size(T_matrix) + tau = Vector{T}(undef, nb) work = zeros(T, ib * n) geqrt!(m, n, ib, A, T_matrix, tau, work) diff --git a/src/tsqrt.jl b/src/tsqrt.jl index c31138d..f658db1 100644 --- a/src/tsqrt.jl +++ b/src/tsqrt.jl @@ -151,22 +151,22 @@ A1_qr, A2_qr, T, tau = tsqrt!(copy(A1), copy(A2), ib) Uses blocked algorithm for efficiency with large matrices. The compact WY representation (stored in T) enables efficient application of the Q factor. """ -function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} +function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where{T} n, n2 = size(A1) - if n != n2 - throw(ArgumentError("A1 must be square, got size $(size(A1))")) - end - + m, n3 = size(A2) - if n != n3 - throw(ArgumentError("A1 and A2 must have same number of columns, got $n and $n3")) + if n2 != n3 + throw(ArgumentError("A1 and A2 must have same number of columns, got $n2 and $n3")) end - - ib = size(T_matrix, 1) + + ib, nb = size(T_matrix) + if ib <= 0 throw(ArgumentError("Block size ib must be positive, got $ib")) end + + tau = Vector{T}(undef, n) work = zeros(T, ib * n) # Call the core computational routine diff --git a/src/ttqrt.jl b/src/ttqrt.jl index 7c864de..541b077 100644 --- a/src/ttqrt.jl +++ b/src/ttqrt.jl @@ -93,10 +93,11 @@ Helper for triangular-triangular QR factorization. 
- `T`: Block reflector matrix - `tau`: Scalar factors """ -function ttqrt!(A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T} +function ttqrt!(A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}) where {T} m, n = size(A) m2, n2 = size(B) - ib = size(T_mat, 1) + ib, nb = size(T_mat) + tau = Vector{T}(undef, nb) @assert m2 == m && n2 == n "A and B must have same dimensions" work = zeros(T, ib * n) diff --git a/test/geqrt.jl b/test/geqrt.jl index e7b376f..1742ccd 100644 --- a/test/geqrt.jl +++ b/test/geqrt.jl @@ -54,14 +54,12 @@ const GEQRT_BLOCKSIZES = [100, 200, 400, 800] # --- Test Helper Function --- A_helper = copy(A_orig) T_helper = zeros(T, max(1, ib), k) - tau_helper = zeros(T, k) - NextLA.geqrt!(A_helper, T_helper, tau_helper) + NextLA.geqrt!(A_helper, T_helper) # Verify helper gives same results as kernel (in-place) if k > 0 @test A_helper ≈ A_test rtol=rtol atol=atol @test T_helper[1:ib, 1:k] ≈ T_test[1:ib, 1:k] rtol=rtol atol=atol - @test tau_helper ≈ tau_test rtol=rtol atol=atol end # --- Comparisons --- diff --git a/test/tsqrt.jl b/test/tsqrt.jl index 7560ba2..ec22216 100644 --- a/test/tsqrt.jl +++ b/test/tsqrt.jl @@ -88,8 +88,7 @@ const TSQRT_SIZES = [ A1_helper = copy(A1) A2_helper = copy(A2) T_helper = zeros(T, ib, n) - tau_helper = zeros(T, n) - NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper) + NextLA.tsqrt!(A1_helper, A2_helper, T_helper) # Verify helper gives same results as kernel @test A1_helper ≈ A1_nextla rtol=rtol @@ -112,16 +111,6 @@ const TSQRT_SIZES = [ # Check that T has the expected block structure @test size(T_nextla) == (ib, n) - for block_start in 1:ib:n - block_end = min(block_start + ib - 1, n) - for i in 1:(block_end - block_start + 1) - for j in 1:(i-1) - if block_start + i - 1 <= n && block_start + j - 1 <= n - @test abs(T_nextla[i, block_start + j - 1]) < rtol * 100 - end - end - end - end end end end diff --git a/test/ttqrt.jl b/test/ttqrt.jl index 
d333951..5a9eadb 100644 --- a/test/ttqrt.jl +++ b/test/ttqrt.jl @@ -91,7 +91,7 @@ const TTQRT_SIZES = [ # --- Test Helper Function --- A1_helper = copy(A1_orig) A2_helper = copy(A2_orig) - NextLA.ttqrt!(A1_helper, A2_helper, T_mat_nextla, tau) + NextLA.ttqrt!(A1_helper, A2_helper, T_mat_nextla) # Verify helper gives same results as kernel @test A1_helper ≈ A1_nextla rtol=rtol From 8d20cb118008f3b254650630188df742b9475dc6 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Mon, 1 Sep 2025 11:09:41 -0300 Subject: [PATCH 5/6] CAQR: corrections in the helpers --- src/geqrt.jl | 2 +- src/tsmqr.jl | 3 +-- src/tsqrt.jl | 10 ++-------- src/unmqr.jl | 2 +- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/geqrt.jl b/src/geqrt.jl index 04103de..8994f44 100644 --- a/src/geqrt.jl +++ b/src/geqrt.jl @@ -84,7 +84,7 @@ Helper function for blocked QR factorization. Computes A = Q*R where Q is orthog function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T} m, n = size(A) ib, nb = size(T_matrix) - tau = Vector{T}(undef, nb) + tau = Vector{T}(undef, n) work = zeros(T, ib * n) geqrt!(m, n, ib, A, T_matrix, tau, work) diff --git a/src/tsmqr.jl b/src/tsmqr.jl index f1ba956..c780826 100644 --- a/src/tsmqr.jl +++ b/src/tsmqr.jl @@ -189,8 +189,7 @@ function tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatr V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T} m1, n1 = size(A1) m2, n2 = size(A2) - k = size(V, 2) - ib = size(T_matrix, 1) + ib, k = size(T_matrix) # Validate input dimensions if side == 'L' && n2 != n1 diff --git a/src/tsqrt.jl b/src/tsqrt.jl index f658db1..4d476b5 100644 --- a/src/tsqrt.jl +++ b/src/tsqrt.jl @@ -152,20 +152,14 @@ Uses blocked algorithm for efficiency with large matrices. The compact WY representation (stored in T) enables efficient application of the Q factor. 
""" function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where{T} - n, n2 = size(A1) - - m, n3 = size(A2) - if n2 != n3 - throw(ArgumentError("A1 and A2 must have same number of columns, got $n2 and $n3")) - end - + n = size(A1, 2) + m = size(A2, 1) ib, nb = size(T_matrix) if ib <= 0 throw(ArgumentError("Block size ib must be positive, got $ib")) end - tau = Vector{T}(undef, n) work = zeros(T, ib * n) diff --git a/src/unmqr.jl b/src/unmqr.jl index 4e36cfe..46aae4e 100644 --- a/src/unmqr.jl +++ b/src/unmqr.jl @@ -218,7 +218,7 @@ function unmqr!(side::Char, trans::Char, A::AbstractMatrix{T}, T_matrix::Abstrac end # Set leading dimensions - lda = max(1, size(A, 1)) + lda = max(1, stride(A, 2)) # Allocate workspace based on side (matrix workspace expected by low-level) if side == 'L' From 051658fdc141266d8a68182f06d54349815b6b24 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Mon, 1 Sep 2025 11:16:45 -0300 Subject: [PATCH 6/6] General dependency cleanup --- Project.toml | 11 ----------- src/NextLA.jl | 25 ------------------------- test/Project.toml | 7 +++++++ 3 files changed, 7 insertions(+), 36 deletions(-) create mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 0da84b1..283f05b 100644 --- a/Project.toml +++ b/Project.toml @@ -4,25 +4,14 @@ authors = ["Rabab Alomairy, Evelyne Ringoot, Sophie Xuan, Vicki Carrica, Maxwell version = "0.1.0" [deps] -Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" -Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -libblastrampoline_jll = "8e850b90-86db-534c-a0d3-1478176c7d93" [compat] -Aqua = "0.8.7" -Atomix = "1.1.1" -CUDA = "5.7.0" KernelAbstractions = "0.9.34" 
LinearAlgebra = "1.11.0" Random = "1.11.0" -Revise = "3.8.0" -StaticArrays = "1.9.13" julia = "1.11" [extras] diff --git a/src/NextLA.jl b/src/NextLA.jl index 8465a72..3905d2f 100644 --- a/src/NextLA.jl +++ b/src/NextLA.jl @@ -9,31 +9,6 @@ import LinearAlgebra: BLAS, LAPACK import LinearAlgebra.BLAS: @blasfunc using Random: Random using KernelAbstractions -using StaticArrays - -DEV = :NVIDIA - -if DEV == :NVIDIA - using CUDA - ArrayKA = CUDA.CuArray - Backend = CUDA.CUDABackend() -elseif DEV == :AMD - using AMDGPU - ArrayKA = AMDGPU.ROCArray - Backend = AMDGPU.ROCBackend() -elseif DEV == :oneAPI - using oneAPI - ArrayKA = oneAPI.oneArray - Backend = oneAPI.oneAPIBackend() -elseif DEV == :Metal - using Metal - ArrayKA = Metal.MtlArray - Backend = Metal.MetalBackend() -else - DEV == :CPU - ArrayKA = Array - Backend = CPU() -end """ lamch(::Type{T}, cmach) where{T<: Number} diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..bc62519 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,7 @@ +[deps] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +libblastrampoline_jll = "8e850b90-86db-534c-a0d3-1478176c7d93"