NextLinearAlgebra · fda-tome · Oct 6, 2025 · Aug 13, 2025 · Aug 28, 2025 · Aug 29, 2025
diff --git a/Project.toml b/Project.toml
@@ -4,25 +4,14 @@ authors = ["Rabab Alomairy, Evelyne Ringoot, Sophie Xuan, Vicki Carrica, Maxwell
 version = "0.1.0"
 
 [deps]
-Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
-StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-libblastrampoline_jll = "8e850b90-86db-534c-a0d3-1478176c7d93"
 
 [compat]
-Aqua = "0.8.7"
-Atomix = "1.1.1"
-CUDA = "5.7.0"
 KernelAbstractions = "0.9.34"
 LinearAlgebra = "1.11.0"
 Random = "1.11.0"
-Revise = "3.8.0"
-StaticArrays = "1.9.13"
 julia = "1.11"
 
 [extras]

diff --git a/src/NextLA.jl b/src/NextLA.jl
@@ -9,31 +9,6 @@ import LinearAlgebra: BLAS, LAPACK
 import LinearAlgebra.BLAS: @blasfunc
 using Random: Random
 using KernelAbstractions
-using StaticArrays
-
-DEV = :NVIDIA
-
-if DEV == :NVIDIA
-	using CUDA
-	ArrayKA = CUDA.CuArray
-	Backend = CUDA.CUDABackend()
-elseif DEV == :AMD
-	using AMDGPU
-	ArrayKA = AMDGPU.ROCArray
-	Backend = AMDGPU.ROCBackend()
-elseif DEV == :oneAPI
-	using oneAPI
-	ArrayKA = oneAPI.oneArray
-	Backend = oneAPI.oneAPIBackend()
-elseif DEV == :Metal
-	using Metal
-	ArrayKA = Metal.MtlArray
-	Backend = Metal.MetalBackend()
-else
-	DEV == :CPU
-	ArrayKA = Array
-	Backend = CPU()
-end
 
 """
 	lamch(::Type{T}, cmach) where{T<: Number}
@@ -77,7 +52,6 @@ function lamch(::Type{T}, cmach) where {T <: Number}
 	end
 end
 
-# Write your package code here.
 include("NextLAMatrix.jl")
 include("lu.jl")
 include("trmm.jl")

diff --git a/src/axpy.jl b/src/axpy.jl
@@ -1,4 +1,4 @@
-function axpy!(a, x, y)
+function axpy!(a::T, x::AbstractVector{T}, y::AbstractVector{T}) where {T}
     n = length(x)
 
     if n <= 0
@@ -12,6 +12,4 @@ function axpy!(a, x, y)
     for i in 1:n
         y[i] = y[i] + a*x[i]
     end
-
-    return
 end
diff --git a/src/geqr2.jl b/src/geqr2.jl
@@ -1,43 +1,72 @@
-function geqr2(m,n, A, lda, tau, work)
+"""
+    geqr2!(m, n, A, lda, tau, work)
+
+Compute unblocked QR factorization of an m-by-n matrix A using Householder reflectors.
+The matrix A is overwritten with the Q and R factors.
+
+# Arguments
+- `m`: Number of rows in matrix A
+- `n`: Number of columns in matrix A  
+- `A`: Input matrix (m × n), modified in place to contain Q and R factors
+- `tau`: Output vector of scalar factors (length min(m,n))
+- `work`: Workspace vector (length n)
+
+# Algorithm
+Uses Householder reflectors H(i) to zero out elements below the diagonal.
+For each column i, generates H(i) and applies it to remaining columns.
+"""
+function geqr2!(m::Integer, n::Integer, A::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    # Input validation
     if m < 0
-        throw(ArgumentError("illegal value of m"))
-        return -1
+        throw(ArgumentError("illegal value of m: $m"))
     end
 
     if n < 0
-        throw(ArgumentError("illegal value of n"))
-        return -2
+        throw(ArgumentError("illegal value of n: $n"))
     end
 
-    if lda < max(1,m)
-        throw(ArgumentError("illegal value of lda"))
-        return -4
+    # Quick return for empty matrices
+    if m == 0 || n == 0
+        return
     end
 
-    k = min(m,n)
+    k = min(m, n)  # Number of reflectors to generate
     one = oneunit(eltype(A))
 
-    #av = parent(A)
-    #a1, a2 = parentindices(A)
-    #a1 = a1.start-1
-    #a2 = a2.start-1
-
+    # Main QR factorization loop
     for i in 1:k
-        # generate elementary reflector H(i) to anniliate A(i+1:m, i)
-        A[i,i], tau[i] = larfg(m-i+1, A[i, i], (@view A[min(i+1,m):m, i]), 1, tau[i])
+        # Generate elementary reflector H(i) to annihilate A(i+1:m, i)
+        A[i, i], tau[i] = larfg!(m-i+1, A[i, i], (@view A[min(i+1,m):m, i]), 1, tau[i])
 
         if i < n
-            # apply H(i)^H to A(i:m, i+1:n) from left
-            alpha = A[i,i]
-            A[i,i] = one
+            # Apply H(i)^H to A(i:m, i+1:n) from the left
+            alpha = A[i, i]
+            A[i, i] = one  # Set diagonal element to 1 for reflector application
 
-            #LinearAlgebra.LAPACK.larf!('L', (@view A[i:m, i]), conj(tau[i]), (@view A[i:m, i+1:n]), work)
-            larf('L', m-i+1, n-i, (@view A[i:m, i]), 1, conj(tau[i]), (@view A[i:m, i+1:n]), work)
-            #zlarf('L', m-i+1, n-i, (@view av[i+a1:m+a1, i+a2]), 1, conj(tau[i]), (@view av[i+a1:m+a1, i+1+a2:n+a2]), lda, work)
+            # Apply the reflector to remaining columns
+            larf!('L', m-i+1, n-i, (@view A[i:m, i]), 1, conj(tau[i]), (@view A[i:m, i+1:n]), work)
 
-            A[i,i] = alpha
+            A[i, i] = alpha  # Restore original diagonal element
         end
     end
+end
 
-    return   
+"""
+    geqr2!(A) -> (A, tau)
+
+Helper function for unblocked QR factorization using Householder reflectors.
+
+# Arguments  
+- `A`: Input matrix (m × n), modified in place
+- `tau`: Output vector of scalar factors (length min(m,n))
+
+# Returns
+- Modified `A` containing Q and R factors
+- `tau`: Vector of scalar factors (length min(m,n))
+"""
+function geqr2!(A::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+    m, n = size(A)
+    work = zeros(T, n)
+
+    geqr2!(m, n, A, tau, work)
 end
diff --git a/src/geqrt.jl b/src/geqrt.jl
@@ -1,58 +1,91 @@
-function geqrt(m,n,ib, A, lda, T, ldt, tau, work)
+"""
+    geqrt!(m, n, ib, A, T_matrix, tau, work)
+
+Compute blocked QR factorization of an m-by-n matrix A using block size ib.
+The matrix A is overwritten with the Q and R factors, and T contains the 
+triangular factor of the block reflector.
+
+# Arguments
+- `m`: Number of rows in matrix A
+- `n`: Number of columns in matrix A
+- `ib`: Block size for the factorization (must be > 0 if m,n > 0)
+- `A`: Input matrix (m × n), modified in place to contain Q and R factors
+- `T`: Output triangular block reflector matrix (ib × n)
+- `tau`: Output vector of scalar factors (length n)
+- `work`: Workspace vector (length ib × n)
+
+# Algorithm
+Uses a block algorithm that processes ib columns at a time.
+For each block, performs unblocked QR and then applies the 
+block reflector to the remaining columns.
+"""
+function geqrt!(m::Integer, n::Integer, ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    # Input validation
     if m < 0
-        throw(ArgumentError("illegal value of m"))
-        return -1
+        throw(ArgumentError("illegal value of m: $m"))
     end
 
     if n < 0
-        throw(ArgumentError("illegal value of n"))
-        return -2
+        throw(ArgumentError("illegal value of n: $n"))
     end
 
     if (ib < 0) || ((ib == 0) && (m > 0) && (n > 0))
-        throw(ArgumentError("illegal value of ib"))
-        return -3
-    end
-
-    if lda < max(1,m) && m > 0
-        throw(ArgumentError("illegal value of lda"))
-        return -5
-    end
-
-    if ldt < max(1,ib) && ib > 0
-        throw(ArgumentError("illegal value of ldt"))
-        return -7
+        throw(ArgumentError("illegal value of ib: $ib"))
     end
 
+    # Quick return for empty matrices or zero block size
     if m == 0 || n == 0 || ib == 0
         return 
     end
 
-    k = min(m,n)
+    k = min(m, n)  # Number of reflectors to generate
 
+    # Process matrix in blocks of size ib
     for i in 1:ib:k
-        sb = min(ib, k-i+1)
+        sb = min(ib, k-i+1)  # Current block size
 
-        av = @view A[i:m, i:i+sb-1]
-        tv = @view T[1:sb,i:i+sb-1]
-        tauv = @view tau[i:i+sb-1]
+        # Extract current block and corresponding parts of T and tau
+    av = @view A[i:m, i:i+sb-1]           # Current block columns
+    tv = @view T_matrix[1:sb, i:i+sb-1]          # Corresponding T block
+        tauv = @view tau[i:i+sb-1]            # Corresponding tau values
 
-        # compute qr for A[i:m, i:i+sb-1]
+    # Perform unblocked QR factorization on current block
+    geqr2!(m-i+1, sb, av, tauv, work)
 
-        geqr2(m-i+1, sb, av, lda, tauv, work)
-        larft('F', 'C', m-i+1, sb, av, lda, tauv, tv, ldt)
+    # Form the triangular factor T for the block reflector
+    larft!('F', 'C', m-i+1, sb, av, tauv, tv)
 
+        # Apply block reflector to remaining columns if any exist
         if n >= i + sb
-            # update by apply H^H to A[i:m, i+sb:n] from left
-
-            #wwork = @view work[1: (n-i-sb+1)*sb]
-            #ww = reshape(wwork, n-i-sb+1, sb)
-            ww = reshape((@view work[1: (n-i-sb+1)*sb]), n-i-sb+1, sb)
+            # Reshape work array for block reflector application
+            ww = reshape((@view work[1:(n-i-sb+1)*sb]), n-i-sb+1, sb)
 
-            larfb('L', 'C', 'F', 'C', m-i+1, n-i-sb+1, sb, av, 
-                m-i+1, tv, sb, (@view A[i:m, i+sb:n]), lda, ww, n-i-sb+1)
+            # Apply H^H to A[i:m, i+sb:n] from the left
+            larfb!('L', 'C', 'F', 'C', m-i+1, n-i-sb+1, sb, av, 
+                m-i+1, tv, (@view A[i:m, i+sb:n]), ww)
         end
     end
+end
+
+"""
+    geqrt!(A, ib) -> (A, T, tau)
+
+Helper function for blocked QR factorization. Computes A = Q*R where Q is orthogonal and R is upper triangular.
+
+# Arguments
+- `A`: Input matrix (m × n), modified in place to contain R in upper triangle and Q factors below
+- `ib`: Block size for the factorization
+
+# Returns
+- Modified `A` matrix containing Q and R factors  
+- `T`: Upper triangular block reflector matrix (ib × n)
+- `tau`: Vector of scalar factors for elementary reflectors (length n)
+"""
+function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T}
+    m, n = size(A)
+    ib, nb = size(T_matrix)
+    tau = Vector{T}(undef, n)
+    work = zeros(T, ib * n)
 
-    return
+    geqrt!(m, n, ib, A, T_matrix, tau, work)
 end
diff --git a/src/gerc.jl b/src/gerc.jl
@@ -1,30 +1,71 @@
+"""
+    gerc!(alpha, x, y, A)
+
+Perform the rank-1 update: A := A + alpha * x * y^H
+
+This function computes a rank-1 update to the matrix A using the outer product
+of vectors x and y, scaled by the scalar alpha. The operation performed is:
+A[i,j] := A[i,j] + alpha * x[i] * conj(y[j])
+
+This is the complex version of the rank-1 update (GER Complex), where the 
+conjugate of y is used in the outer product.
+
+# Arguments
+- `alpha`: Scalar multiplier for the rank-1 update
+- `x`: Vector of length m (first dimension)
+- `y`: Vector of length n (second dimension)  
+- `A`: m×n matrix to be updated in-place
+
+# Algorithm
+The algorithm efficiently computes the outer product by:
+1. For each column j, compute temp = alpha * conj(y[j])
+2. If temp ≠ 0, update column j: A[:,j] += temp * x
+3. Skip columns where y[j] = 0 to avoid unnecessary computation
+
+# Input Validation
+- Matrix A must have non-negative dimensions
+- Vectors x and y must have lengths matching A dimensions
+- All inputs must have compatible numeric types
+
+# Performance Notes
+- Optimized for cache efficiency by operating column-wise
+- Skips zero elements in y to minimize operations
+- In-place operation minimizes memory allocation
+
+# Example
+```julia
+m, n = 4, 3
+A = zeros(ComplexF64, m, n)
+x = complex.([1.0, 2.0, 3.0, 4.0], [0.1, 0.2, 0.3, 0.4])
+y = complex.([1.0, 0.0, 2.0], [0.5, 0.0, 1.0])
+alpha = 2.0 + 1.0im
+gerc!(alpha, x, y, A)  # A updated with rank-1 modification
+```
+"""
 function gerc!(alpha::T, x::AbstractVector{T}, y::AbstractVector{T}, A::AbstractMatrix{T}) where {T}
     m, n = size(A)
 
-    if m < 0
-        return 1
+    # Input validation with descriptive error messages
+    if length(x) != m
+        throw(ArgumentError("Vector x length ($(length(x))) must match matrix row dimension ($m)"))
     end
-
-    if n < 0
-        return 2
+    
+    if length(y) != n
+        throw(ArgumentError("Vector y length ($(length(y))) must match matrix column dimension ($n)"))
     end
 
+    # Early return for degenerate cases
     if m == 0 || n == 0 || alpha == zero(T)
         return
     end
 
-    jy = 1
-
+    # Perform rank-1 update: A := A + alpha * x * y^H
     for j in 1:n
-        if y[jy] != zero(T)
-            temp = alpha * conj(y[jy])
+        if y[j] != zero(T)
+            temp = alpha * conj(y[j])
             for i in 1:m
                 A[i, j] += x[i] * temp
             end
         end
-
-        jy += 1
     end
-
-    return
 end